#include #include #include #include #include #include "similarity_search.h" // Case insensitive string comparison int str_case_cmp(const char *s1, const char *s2) { while (*s1 && *s2) { int c1 = tolower((unsigned char)*s1); int c2 = tolower((unsigned char)*s2); if (c1 != c2) { return c1 - c2; } s1++; s2++; } return tolower((unsigned char)*s1) - tolower((unsigned char)*s2); } // Split a string into words int split_into_words(const char *string, char *words[MAX_WORDS]) { if (!string || strlen(string) >= MAX_STRING_LEN) { return 0; } char temp[MAX_STRING_LEN]; strncpy(temp, string, MAX_STRING_LEN - 1); temp[MAX_STRING_LEN - 1] = '\0'; int word_count = 0; char *token = strtok(temp, " \t\n"); while (token != NULL && word_count < MAX_WORDS) { words[word_count] = strdup(token); if (!words[word_count]) { // Free any already allocated words on error for (int i = 0; i < word_count; i++) { free(words[i]); } return 0; } word_count++; token = strtok(NULL, " \t\n"); } return word_count; } // Free memory allocated for words void free_words(char *words[], int word_count) { for (int i = 0; i < word_count; i++) { free(words[i]); } } // Calculate similarity between two words based on character matching float word_similarity(const char *word1, const char *word2) { int len1 = strlen(word1); int len2 = strlen(word2); // For very short words (3 chars or less), require exact match if (len1 <= 3 || len2 <= 3) { return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f; } int max_len = len1 > len2 ? len1 : len2; if (max_len == 0) return 0.0f; // Count matching characters (case insensitive) int matches = 0; int i = 0, j = 0; while (i < len1 && j < len2) { if (tolower((unsigned char)word1[i]) == tolower((unsigned char)word2[j])) { matches++; i++; j++; } else if (tolower((unsigned char)word1[i]) < tolower((unsigned char)word2[j])) { i++; } else { j++; } } // Calculate similarity based on matching characters float char_similarity = (float)matches / max_len; // Require higher similarity for shorter words float min_similarity = 0.7f; // Default minimum if (len1 <= 5 || len2 <= 5) { min_similarity = 0.9f; // Higher requirement for short words } // If similarity is below minimum, return 0 if (char_similarity < min_similarity) { return 0.0f; } // If words are the same length, give a small boost if (len1 == len2) { char_similarity = char_similarity * 1.1f; // 10% boost for same length if (char_similarity > 1.0f) char_similarity = 1.0f; } return char_similarity; } // Calculate similarity between query and target string float calculate_similarity(const char *query, const char *target, float cutoff) { // Split strings into words char *query_words[MAX_WORDS] = {0}; char *target_words[MAX_WORDS] = {0}; int query_word_count = split_into_words(query, query_words); int target_word_count = split_into_words(target, target_words); if (query_word_count == 0 || target_word_count == 0) { free_words(query_words, query_word_count); free_words(target_words, target_word_count); return 0.0; } // Track best matches for each query word float best_word_similarities[MAX_WORDS] = {0.0f}; int query_words_found = 0; // For each query word, find its best match in target words for (int i = 0; i < query_word_count; i++) { float best_similarity = 0.0f; for (int j = 0; j < target_word_count; j++) { float similarity = word_similarity(query_words[i], target_words[j]); if (similarity > best_similarity) { best_similarity = similarity; } } best_word_similarities[i] = best_similarity; if (best_similarity >= 0.8f) { // Consider it a match if similarity is high enough query_words_found++; } } // Calculate overall similarity float word_match_score = (float)query_words_found / query_word_count; // Calculate average of best word similarities float avg_word_similarity = 0.0f; for (int i = 0; i < query_word_count; i++) { avg_word_similarity += best_word_similarities[i]; } avg_word_similarity /= query_word_count; // Combine scores: 70% weight on word matches, 30% on character similarity float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f); // If all query words were found with high similarity, ensure high overall score if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) { similarity = 1.0f; // Perfect match } free_words(query_words, query_word_count); free_words(target_words, target_word_count); return similarity; } // Compare function for qsort to sort results by similarity (descending) int compare_results(const void *a, const void *b) { const SearchResult *result_a = (const SearchResult *)a; const SearchResult *result_b = (const SearchResult *)b; if (result_b->similarity > result_a->similarity) return 1; if (result_b->similarity < result_a->similarity) return -1; return 0; } // Generate a random word void generate_random_word(char *word, int max_len) { int len = 3 + rand() % 8; // Random length between 3 and 10 for (int i = 0; i < len; i++) { word[i] = 'a' + (rand() % 26); } word[len] = '\0'; } // Generate a random string consisting of multiple words void generate_random_string(char *string, int max_len) { if (!string || max_len <= 0) { return; } int num_words = 2 + rand() % 5; // Random number of words between 2 and 6 string[0] = '\0'; size_t current_len = 0; for (int i = 0; i < num_words; i++) { char word[20]; generate_random_word(word, sizeof(word) - 1); size_t word_len = strlen(word); size_t space_needed = word_len + (i > 0 ? 1 : 0); // +1 for space if not first word if (current_len + space_needed < (size_t)max_len - 1) { if (i > 0) { strncat(string, " ", max_len - current_len - 1); current_len++; } strncat(string, word, max_len - current_len - 1); current_len += word_len; } else { break; } } } // Create a new search index SearchIndex* create_search_index(int capacity) { if (capacity <= 0) return NULL; SearchIndex* index = (SearchIndex*)malloc(sizeof(SearchIndex)); if (!index) return NULL; index->strings = (char**)malloc(capacity * sizeof(char*)); if (!index->strings) { free(index); return NULL; } index->num_strings = 0; index->capacity = capacity; return index; } // Add a string to the index int add_string_to_index(SearchIndex* index, const char* string) { if (!index || !string) return -1; // Check if we've reached capacity if (index->num_strings >= index->capacity) { return -1; } // Check if string is too long if (strlen(string) >= MAX_STRING_LEN) { return -1; } index->strings[index->num_strings] = strdup(string); if (!index->strings[index->num_strings]) return -1; index->num_strings++; return 0; } // Free the search index and all associated memory void free_search_index(SearchIndex* index) { if (!index) return; for (int i = 0; i < index->num_strings; i++) { free(index->strings[i]); } free(index->strings); free(index); } // Search the index with the given query and similarity cutoff SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results) { if (!index || !query || !num_results) return NULL; // Validate input string length if (strlen(query) >= MAX_STRING_LEN) { *num_results = 0; return NULL; } // Validate cutoff if (cutoff < 0.0f || cutoff > 1.0f) { *num_results = 0; return NULL; } // Allocate temporary array for results SearchResult* temp_results = (SearchResult*)malloc(index->num_strings * sizeof(SearchResult)); if (!temp_results) return NULL; *num_results = 0; // Search through all strings in the index for (int i = 0; i < index->num_strings; i++) { float similarity = calculate_similarity(query, index->strings[i], cutoff); if (similarity >= cutoff) { // Store a copy of the string in the result temp_results[*num_results].string = strdup(index->strings[i]); if (!temp_results[*num_results].string) { // Free any already allocated strings on error for (int j = 0; j < *num_results; j++) { free(temp_results[j].string); } free(temp_results); return NULL; } temp_results[*num_results].similarity = similarity; (*num_results)++; } } // Sort results by similarity qsort(temp_results, *num_results, sizeof(SearchResult), compare_results); // Allocate final result array with exact size SearchResult* results = (SearchResult*)malloc(*num_results * sizeof(SearchResult)); if (!results) { // Free all strings in temp_results for (int i = 0; i < *num_results; i++) { free(temp_results[i].string); } free(temp_results); return NULL; } // Copy results to final array for (int i = 0; i < *num_results; i++) { results[i].string = temp_results[i].string; results[i].similarity = temp_results[i].similarity; } free(temp_results); return results; } // Free the search results void free_search_results(SearchResult* results, int num_results) { if (!results) return; // Free all strings in the results for (int i = 0; i < num_results; i++) { free(results[i].string); } free(results); }