From e2aacaf54b5c8c1f1929a1c6551a4dd1964270ad Mon Sep 17 00:00:00 2001
From: seb
Date: Fri, 18 Apr 2025 18:55:37 +0200
Subject: [PATCH] Refactor similarity_search.c to improve memory management
 and word splitting logic. Simplify split_into_words function to use a single
 allocation and update free_words to handle memory more efficiently. Enhance
 levenshtein_distance calculation with dynamic memory allocation and optimize
 similarity scoring in calculate_similarity function for better accuracy and
 performance.

---
Review notes for this revision (insertion/deletion counts are unchanged):

 * split_into_words() now stores NULL in *storage before any early return,
   and calculate_similarity() initialises query_buf/target_buf to NULL.
   Previously free_words() could be handed an indeterminate pointer when
   the input was NULL, too long, or strdup() failed.
 * levenshtein_distance() casts to unsigned char (as the removed code did)
   before calling tolower(); a plain (unsigned) cast sign-extends negative
   char values to out-of-range arguments.
 * Comments: the pow(similarity, 3.0f) step is described as cubing rather
   than "exponential decay"; non-ASCII hyphens replaced with '-'.
 * NOTE(review): line breaks and indentation were restored from a
   whitespace-flattened copy. Hunk sizes were checked against the @@
   headers, but the indent width and the position of blank context lines
   are assumptions - confirm against the tree (or apply with
   --ignore-whitespace). The post-image blob id on the index line predates
   these fixes.

 similarity_search.c | 150 ++++++++++++++++++--------------------------
 1 file changed, 62 insertions(+), 88 deletions(-)

diff --git a/similarity_search.c b/similarity_search.c
index 57850ba..98de023 100644
--- a/similarity_search.c
+++ b/similarity_search.c
@@ -22,76 +22,54 @@ int str_case_cmp(const char *s1, const char *s2) {
 }
 
 // Split a string into words
-int split_into_words(const char *string, char *words[MAX_WORDS]) {
-    if (!string || strlen(string) >= MAX_STRING_LEN) {
-        return 0;
+int split_into_words(const char *s,
+                     char *words[MAX_WORDS],
+                     char **storage)          /* NEW OUT PARAM */
+{
+    *storage = NULL;                 /* so the caller can free() on every path */
+    if (!s || strlen(s) >= MAX_STRING_LEN) return 0;
+
+    char *buf = strdup(s);           /* one single allocation */
+    if (!buf) return 0;
+    *storage = buf;                  /* hand ownership to caller */
+    int n = 0;
+    for (char *tok = strtok(buf, " \t\n"); tok && n < MAX_WORDS;
+         tok = strtok(NULL, " \t\n"))
+    {
+        words[n++] = tok;            /* pointers into buf */
     }
-
-    char temp[MAX_STRING_LEN];
-    strncpy(temp, string, MAX_STRING_LEN - 1);
-    temp[MAX_STRING_LEN - 1] = '\0';
-
-    int word_count = 0;
-    char *token = strtok(temp, " \t\n");
-
-    while (token != NULL && word_count < MAX_WORDS) {
-        words[word_count] = strdup(token);
-        if (!words[word_count]) {
-            // Free any already allocated words on error
-            for (int i = 0; i < word_count; i++) {
-                free(words[i]);
-            }
-            return 0;
-        }
-        word_count++;
-        token = strtok(NULL, " \t\n");
-    }
-
-    return word_count;
+    return n;
 }
 
 // Free memory allocated for words
-void free_words(char *words[], int word_count) {
-    for (int i = 0; i < word_count; i++) {
-        free(words[i]);
-    }
+void free_words(char *storage) {     /* simplified */
+    free(storage);                   /* single free, if any */
 }
 
 // Calculate Levenshtein distance between two strings
-int levenshtein_distance(const char *s1, const char *s2) {
-    int len1 = strlen(s1);
-    int len2 = strlen(s2);
-
-    // Convert to lowercase for comparison
-    char s1_lower[MAX_STRING_LEN];
-    char s2_lower[MAX_STRING_LEN];
-    for (int i = 0; i < len1; i++) s1_lower[i] = tolower((unsigned char)s1[i]);
-    for (int i = 0; i < len2; i++) s2_lower[i] = tolower((unsigned char)s2[i]);
-    s1_lower[len1] = '\0';
-    s2_lower[len2] = '\0';
-
-    // Create distance matrix
-    int matrix[len1 + 1][len2 + 1];
-
-    // Initialize first row and column
-    for (int i = 0; i <= len1; i++) matrix[i][0] = i;
-    for (int j = 0; j <= len2; j++) matrix[0][j] = j;
-
-    // Fill in the rest of the matrix
-    for (int i = 1; i <= len1; i++) {
-        for (int j = 1; j <= len2; j++) {
-            if (s1_lower[i-1] == s2_lower[j-1]) {
-                matrix[i][j] = matrix[i-1][j-1];
-            } else {
-                int min = matrix[i-1][j-1]; // substitution
-                if (matrix[i-1][j] < min) min = matrix[i-1][j]; // deletion
-                if (matrix[i][j-1] < min) min = matrix[i][j-1]; // insertion
-                matrix[i][j] = min + 1;
-            }
+int levenshtein_distance(const char *a, const char *b)
+{
+    size_t m = strlen(a), n = strlen(b);
+    if (m < n) { const char *t=a; a=b; b=t; size_t tmp=m; m=n; n=tmp; }
+
+    int *row0 = alloca((n + 1) * sizeof(int));
+    int *row1 = alloca((n + 1) * sizeof(int));
+
+    for (size_t j = 0; j <= n; ++j) row0[j] = j;
+    for (size_t i = 1; i <= m; ++i) {
+        row1[0] = i;
+        for (size_t j = 1; j <= n; ++j) {
+            int cost = (tolower((unsigned char)a[i-1]) ==
+                        tolower((unsigned char)b[j-1])) ? 0 : 1;
+            int del = row0[j] + 1;
+            int ins = row1[j-1] + 1;
+            int sub = row0[j-1] + cost;
+            row1[j] = (del < ins ? (del < sub ? del : sub)
+                                 : (ins < sub ? ins : sub));
         }
+        int *tmp = row0; row0 = row1; row1 = tmp;
     }
-
-    return matrix[len1][len2];
+    return row0[n];
 }
 
 // Calculate similarity between two words based on Levenshtein distance
@@ -121,9 +99,12 @@ float word_similarity(const char *word1, const char *word2) {
     int distance = levenshtein_distance(word1, word2);
     int max_len = len1 > len2 ? len1 : len2;
 
-    // Calculate similarity based on edit distance
+    // Calculate similarity based on edit distance (sharpened by cubing below)
     float similarity = 1.0f - (float)distance / max_len;
 
+    // Cube the score to make it more sensitive to differences
+    similarity = pow(similarity, 3.0f);
+
     // Adjust similarity based on word lengths
     if (len1 != len2) {
         float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
@@ -135,26 +116,21 @@ float word_similarity(const char *word1, const char *word2) {
         return 0.0f;
     }
 
-    // Never return perfect similarity for non-identical words
-    if (distance > 0) {
-        similarity = fmin(similarity, 0.9f);
-    }
-
     return similarity;
 }
 
 // Calculate similarity between query and target string
 float calculate_similarity(const char *query, const char *target, float cutoff) {
     // Split strings into words
-    char *query_words[MAX_WORDS] = {0};
-    char *target_words[MAX_WORDS] = {0};
+    char *query_buf = NULL, *target_buf = NULL;   /* always safe to free */
+    char *query_words[MAX_WORDS], *target_words[MAX_WORDS];
 
-    int query_word_count = split_into_words(query, query_words);
-    int target_word_count = split_into_words(target, target_words);
+    int query_word_count = split_into_words(query, query_words, &query_buf);
+    int target_word_count = split_into_words(target, target_words, &target_buf);
 
     if (query_word_count == 0 || target_word_count == 0) {
-        free_words(query_words, query_word_count);
-        free_words(target_words, target_word_count);
+        free_words(query_buf);
+        free_words(target_buf);
         return 0.0;
     }
 
@@ -167,9 +143,14 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
         float best_similarity = 0.0f;
 
         for (int j = 0; j < target_word_count; j++) {
+            /* quick length-difference filter (early-exit #4) */
+            int l1 = strlen(query_words[i]), l2 = strlen(target_words[j]);
+            if (l1 < l2 * 0.5f || l2 < l1 * 0.5f) continue;
+
             float similarity = word_similarity(query_words[i], target_words[j]);
             if (similarity > best_similarity) {
                 best_similarity = similarity;
+                if (best_similarity >= 0.90f) break; /* early exit #4 */
             }
         }
 
@@ -189,24 +170,17 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
     }
     avg_word_similarity /= query_word_count;
 
-    // Combine scores: 70% weight on word matches, 30% on character similarity
-    float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
+    // Combine scores: 60% weight on word matches, 40% on character similarity
+    // This gives more weight to finding all words, regardless of order
+    float similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f);
 
-    // Never return perfect similarity unless all words are exact matches
-    bool all_exact_matches = true;
-    for (int i = 0; i < query_word_count; i++) {
-        if (best_word_similarities[i] < 1.0f) {
-            all_exact_matches = false;
-            break;
-        }
+    // If all words are found, boost the score
+    if (query_words_found == query_word_count) {
+        similarity = 0.7f + (similarity * 0.3f);
     }
 
-    if (!all_exact_matches) {
-        similarity = fmin(similarity, 0.9f);
-    }
-
-    free_words(query_words, query_word_count);
-    free_words(target_words, target_word_count);
+    free_words(query_buf);
+    free_words(target_buf);
 
     return similarity;
 }