genesis
This commit is contained in:
198
similarity_search.c
Normal file
198
similarity_search.c
Normal file
@@ -0,0 +1,198 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <ctype.h>
|
||||
#include "similarity_search.h"
|
||||
|
||||
// Compare two NUL-terminated strings while ignoring letter case.
//
// Both strings are walked in lockstep and each byte is lowercased with
// tolower() before it is compared. The result is 0 when the strings match,
// otherwise the difference between the first pair of lowercased bytes that
// differ (negative when s1 orders first, positive when s2 does). A string
// that is a proper prefix of the other orders first.
int str_case_cmp(const char *s1, const char *s2) {
    const unsigned char *left = (const unsigned char *)s1;
    const unsigned char *right = (const unsigned char *)s2;

    for (;; left++, right++) {
        int lower_left = tolower(*left);
        int lower_right = tolower(*right);

        // Stop on the first mismatch, or as soon as either string ends.
        if (lower_left != lower_right || *left == '\0' || *right == '\0') {
            return lower_left - lower_right;
        }
    }
}
|
||||
|
||||
// Split a string into words
|
||||
int split_into_words(const char *string, char *words[MAX_WORDS]) {
|
||||
char temp[MAX_STRING_LEN];
|
||||
strcpy(temp, string);
|
||||
|
||||
int word_count = 0;
|
||||
char *token = strtok(temp, " \t\n");
|
||||
|
||||
while (token != NULL && word_count < MAX_WORDS) {
|
||||
words[word_count] = strdup(token);
|
||||
word_count++;
|
||||
token = strtok(NULL, " \t\n");
|
||||
}
|
||||
|
||||
return word_count;
|
||||
}
|
||||
|
||||
// Free the heap-allocated words stored in words[0 .. word_count-1].
//
// Every slot is reset to NULL after its word is freed, so the array holds no
// dangling pointers afterwards and calling this function a second time on the
// same array is harmless. A NULL array or a non-positive word_count is a
// no-op. The array itself is not freed.
void free_words(char *words[], int word_count) {
    if (words == NULL) {
        return;
    }

    for (int i = 0; i < word_count; i++) {
        free(words[i]);
        words[i] = NULL;
    }
}
|
||||
|
||||
// Calculate similarity between query and target string
|
||||
float calculate_similarity(const char *query, const char *target, float cutoff) {
|
||||
// Split strings into words
|
||||
char *query_words[MAX_WORDS] = {0};
|
||||
char *target_words[MAX_WORDS] = {0};
|
||||
|
||||
int query_word_count = split_into_words(query, query_words);
|
||||
int target_word_count = split_into_words(target, target_words);
|
||||
|
||||
if (query_word_count == 0 || target_word_count == 0) {
|
||||
free_words(query_words, query_word_count);
|
||||
free_words(target_words, target_word_count);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
// Count matches
|
||||
int matches = 0;
|
||||
for (int i = 0; i < query_word_count; i++) {
|
||||
for (int j = 0; j < target_word_count; j++) {
|
||||
if (str_case_cmp(query_words[i], target_words[j]) == 0) {
|
||||
matches++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate Jaccard similarity (intersection over union)
|
||||
float similarity = (float)matches / (query_word_count + target_word_count - matches);
|
||||
|
||||
free_words(query_words, query_word_count);
|
||||
free_words(target_words, target_word_count);
|
||||
|
||||
return similarity;
|
||||
}
|
||||
|
||||
// Compare function for qsort to sort results by similarity (descending)
|
||||
int compare_results(const void *a, const void *b) {
|
||||
const SearchResult *result_a = (const SearchResult *)a;
|
||||
const SearchResult *result_b = (const SearchResult *)b;
|
||||
|
||||
if (result_b->similarity > result_a->similarity) return 1;
|
||||
if (result_b->similarity < result_a->similarity) return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Fill word with a random lowercase word ('a'..'z') and NUL-terminate it.
//
// The length is drawn with rand() from 3..10 and then clamped to max_len, so
// at most max_len letters are written followed by the terminator; word must
// therefore have room for max_len + 1 bytes. With max_len >= 10 the result
// is always 3 to 10 letters long. With max_len == 0 the result is the empty
// string. A NULL word or a negative max_len leaves the buffer untouched.
void generate_random_word(char *word, int max_len) {
    if (word == NULL || max_len < 0) {
        return;
    }

    int len = 3 + rand() % 8;  // random length between 3 and 10
    if (len > max_len) {
        len = max_len;  // never write more letters than the caller allows
    }

    for (int i = 0; i < len; i++) {
        word[i] = (char)('a' + (rand() % 26));
    }
    word[len] = '\0';
}
|
||||
|
||||
// Fill string with 2 to 6 random words separated by single spaces.
//
// max_len is the size of the destination buffer in bytes, terminator
// included. Words come from generate_random_word() and are appended while
// they fit; generation stops at the first word that does not fit, so the
// result may hold fewer words than were drawn (possibly none, leaving the
// empty string). The result is always NUL-terminated.
//
// A NULL string or a max_len <= 0 leaves the buffer untouched: there is no
// room even for the terminator, and a negative size must not be turned into
// a huge unsigned limit.
void generate_random_string(char *string, int max_len) {
    enum { MAX_WORD_CHARS = 10, WORD_BUF_SIZE = 20 };

    if (string == NULL || max_len <= 0) {
        return;
    }

    const size_t capacity = (size_t)max_len;
    size_t used = 0;  // characters written so far, terminator excluded

    int num_words = 2 + rand() % 5;  // random number of words between 2 and 6
    string[0] = '\0';

    for (int i = 0; i < num_words; i++) {
        char word[WORD_BUF_SIZE];
        generate_random_word(word, MAX_WORD_CHARS);
        size_t word_len = strlen(word);

        // Need room for a separating space, the word and the terminator.
        if (used + word_len + 1 >= capacity) {
            break;
        }

        if (i > 0) {
            string[used] = ' ';
            used++;
        }
        memcpy(string + used, word, word_len + 1);  // copies the terminator too
        used += word_len;
    }
}
|
||||
|
||||
// Create a new search index
|
||||
SearchIndex* create_search_index(int capacity) {
|
||||
SearchIndex* index = (SearchIndex*)malloc(sizeof(SearchIndex));
|
||||
if (!index) return NULL;
|
||||
|
||||
index->strings = (char**)malloc(capacity * sizeof(char*));
|
||||
if (!index->strings) {
|
||||
free(index);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
index->num_strings = 0;
|
||||
return index;
|
||||
}
|
||||
|
||||
// Add a string to the index
|
||||
int add_string_to_index(SearchIndex* index, const char* string) {
|
||||
if (!index || !string) return -1;
|
||||
|
||||
index->strings[index->num_strings] = strdup(string);
|
||||
if (!index->strings[index->num_strings]) return -1;
|
||||
|
||||
index->num_strings++;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Free the search index and all associated memory
|
||||
void free_search_index(SearchIndex* index) {
|
||||
if (!index) return;
|
||||
|
||||
for (int i = 0; i < index->num_strings; i++) {
|
||||
free(index->strings[i]);
|
||||
}
|
||||
|
||||
free(index->strings);
|
||||
free(index);
|
||||
}
|
||||
|
||||
// Search the index with the given query and similarity cutoff
|
||||
SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results) {
|
||||
if (!index || !query || !num_results) return NULL;
|
||||
|
||||
// Allocate temporary array for results
|
||||
SearchResult* temp_results = (SearchResult*)malloc(index->num_strings * sizeof(SearchResult));
|
||||
if (!temp_results) return NULL;
|
||||
|
||||
*num_results = 0;
|
||||
|
||||
// Search through all strings in the index
|
||||
for (int i = 0; i < index->num_strings; i++) {
|
||||
float similarity = calculate_similarity(query, index->strings[i], cutoff);
|
||||
|
||||
if (similarity >= cutoff) {
|
||||
temp_results[*num_results].string = index->strings[i];
|
||||
temp_results[*num_results].similarity = similarity;
|
||||
(*num_results)++;
|
||||
}
|
||||
}
|
||||
|
||||
// Sort results by similarity
|
||||
qsort(temp_results, *num_results, sizeof(SearchResult), compare_results);
|
||||
|
||||
// Allocate final result array with exact size
|
||||
SearchResult* results = (SearchResult*)malloc(*num_results * sizeof(SearchResult));
|
||||
if (!results) {
|
||||
free(temp_results);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Copy results to final array
|
||||
memcpy(results, temp_results, *num_results * sizeof(SearchResult));
|
||||
free(temp_results);
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
// Free the search results
|
||||
void free_search_results(SearchResult* results, int num_results) {
|
||||
free(results);
|
||||
}
|
||||
Reference in New Issue
Block a user