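Refactor similarity_search.c to improve memory management and word-splitting logic. Simplify split_into_words to use a single strdup allocation, handed to the caller through a new `storage` out-parameter, and reduce free_words to a single free of that buffer. Rewrite levenshtein_distance to use two rolling rows allocated with alloca instead of a full distance matrix. Retune similarity scoring, aiming for better accuracy and performance: word_similarity now cubes the edit-distance similarity and no longer caps non-identical words at 0.9, while calculate_similarity adds a length-difference filter and an early exit at 0.90, changes the weights to 60/40, drops the 0.9 cap, and boosts the score when all query words are found.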

2025-04-18 18:55:37 +02:00
parent 0dd17b794f
commit e2aacaf54b
1 changed files with 62 additions and 88 deletions
--- a/similarity_search.c
+++ b/similarity_search.c
@@ -22,76 +22,54 @@ int str_case_cmp(const char *s1, const char *s2) {
 }

 // Split a string into words
-int split_into_words(const char *string, char *words[MAX_WORDS]) {
-    if (!string || strlen(string) >= MAX_STRING_LEN) {
-        return 0;
-    }
+int split_into_words(const char *s,
+                     char  *words[MAX_WORDS],
+                     char **storage)          /* NEW OUT PARAM            */
+{
+    if (!s || strlen(s) >= MAX_STRING_LEN) return 0;

-    char temp[MAX_STRING_LEN];
-    strncpy(temp, string, MAX_STRING_LEN - 1);
-    temp[MAX_STRING_LEN - 1] = '\0';
+    char *buf = strdup(s);                    /* one single allocation    */
+    if (!buf) return 0;
+    *storage = buf;                           /* hand ownership to caller */

-    int word_count = 0;
-    char *token = strtok(temp, " \t\n");
-    
-    while (token != NULL && word_count < MAX_WORDS) {
-        words[word_count] = strdup(token);
-        if (!words[word_count]) {
-            // Free any already allocated words on error
-            for (int i = 0; i < word_count; i++) {
-                free(words[i]);
+    int n = 0;
+    for (char *tok = strtok(buf, " \t\n"); tok && n < MAX_WORDS;
+         tok = strtok(NULL, " \t\n"))
+    {
+        words[n++] = tok;                     /* pointers into buf        */
    }
-            return 0;
-        }
-        word_count++;
-        token = strtok(NULL, " \t\n");
-    }
-    
-    return word_count;
+    return n;
 }

 // Free memory allocated for words
-void free_words(char *words[], int word_count) {
-    for (int i = 0; i < word_count; i++) {
-        free(words[i]);
-    }
+void free_words(char *storage) {              /* simplified               */
+    free(storage);                            /* single free, if any      */
 }

 // Calculate Levenshtein distance between two strings
-int levenshtein_distance(const char *s1, const char *s2) {
-    int len1 = strlen(s1);
-    int len2 = strlen(s2);
+int levenshtein_distance(const char *a, const char *b)
+{
+    size_t m = strlen(a), n = strlen(b);
+    if (m < n) { const char *t=a; a=b; b=t; size_t tmp=m; m=n; n=tmp; }

-    // Convert to lowercase for comparison
-    char s1_lower[MAX_STRING_LEN];
-    char s2_lower[MAX_STRING_LEN];
-    for (int i = 0; i < len1; i++) s1_lower[i] = tolower((unsigned char)s1[i]);
-    for (int i = 0; i < len2; i++) s2_lower[i] = tolower((unsigned char)s2[i]);
-    s1_lower[len1] = '\0';
-    s2_lower[len2] = '\0';
+    int *row0 = alloca((n + 1) * sizeof(int));
+    int *row1 = alloca((n + 1) * sizeof(int));

-    // Create distance matrix
-    int matrix[len1 + 1][len2 + 1];
-    
-    // Initialize first row and column
-    for (int i = 0; i <= len1; i++) matrix[i][0] = i;
-    for (int j = 0; j <= len2; j++) matrix[0][j] = j;
-    
-    // Fill in the rest of the matrix
-    for (int i = 1; i <= len1; i++) {
-        for (int j = 1; j <= len2; j++) {
-            if (s1_lower[i-1] == s2_lower[j-1]) {
-                matrix[i][j] = matrix[i-1][j-1];
-            } else {
-                int min = matrix[i-1][j-1]; // substitution
-                if (matrix[i-1][j] < min) min = matrix[i-1][j]; // deletion
-                if (matrix[i][j-1] < min) min = matrix[i][j-1]; // insertion
-                matrix[i][j] = min + 1;
+    for (size_t j = 0; j <= n; ++j) row0[j] = j;
+    for (size_t i = 1; i <= m; ++i) {
+        row1[0] = i;
+        for (size_t j = 1; j <= n; ++j) {
+            int cost = (tolower((unsigned)a[i-1]) ==
+                        tolower((unsigned)b[j-1])) ? 0 : 1;
+            int del  = row0[j]   + 1;
+            int ins  = row1[j-1] + 1;
+            int sub  = row0[j-1] + cost;
+            row1[j] = (del < ins ? (del < sub ? del : sub)
+                                 : (ins < sub ? ins : sub));
        }
+        int *tmp = row0; row0 = row1; row1 = tmp;
    }
-    }
-    
-    return matrix[len1][len2];
+    return row0[n];
 }

 // Calculate similarity between two words based on Levenshtein distance
@@ -121,9 +99,12 @@ float word_similarity(const char *word1, const char *word2) {
    int distance = levenshtein_distance(word1, word2);
    int max_len = len1 > len2 ? len1 : len2;
    
-    // Calculate similarity based on edit distance
+    // Calculate the base similarity from the normalized edit distance
    float similarity = 1.0f - (float)distance / max_len;
    
+    // Cube the similarity (a power curve, not exponential decay) to make it more sensitive to differences
+    similarity = pow(similarity, 3.0f);
+    
    // Adjust similarity based on word lengths
    if (len1 != len2) {
        float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
@@ -135,26 +116,21 @@ float word_similarity(const char *word1, const char *word2) {
        return 0.0f;
    }
    
-    // Never return perfect similarity for non-identical words
-    if (distance > 0) {
-        similarity = fmin(similarity, 0.9f);
-    }
-    
    return similarity;
 }

 // Calculate similarity between query and target string
 float calculate_similarity(const char *query, const char *target, float cutoff) {
    // Split strings into words
-    char *query_words[MAX_WORDS] = {0};
-    char *target_words[MAX_WORDS] = {0};
+    char *query_buf, *target_buf;
+    char *query_words[MAX_WORDS], *target_words[MAX_WORDS];
    
-    int query_word_count = split_into_words(query, query_words);
-    int target_word_count = split_into_words(target, target_words);
+    int query_word_count = split_into_words(query,  query_words,  &query_buf);
+    int target_word_count = split_into_words(target, target_words, &target_buf);
    
    if (query_word_count == 0 || target_word_count == 0) {
-        free_words(query_words, query_word_count);
-        free_words(target_words, target_word_count);
+        free_words(query_buf);
+        free_words(target_buf);
        return 0.0;
    }
    
@@ -167,9 +143,14 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
        float best_similarity = 0.0f;
        
        for (int j = 0; j < target_word_count; j++) {
+            /* quick length‑difference filter (early‑exit #4)            */
+            int l1 = strlen(query_words[i]), l2 = strlen(target_words[j]);
+            if (l1 < l2 * 0.5f || l2 < l1 * 0.5f) continue;
+
            float similarity = word_similarity(query_words[i], target_words[j]);
            if (similarity > best_similarity) {
                best_similarity = similarity;
+                if (best_similarity >= 0.90f) break;   /* early exit #4 */
            }
        }
        
@@ -189,24 +170,17 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
    }
    avg_word_similarity /= query_word_count;
    
-    // Combine scores: 70% weight on word matches, 30% on character similarity
-    float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
+    // Combine scores: 60% weight on word matches, 40% on character similarity
+    // This gives more weight to finding all words, regardless of order
+    float similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f);
    
-    // Never return perfect similarity unless all words are exact matches
-    bool all_exact_matches = true;
-    for (int i = 0; i < query_word_count; i++) {
-        if (best_word_similarities[i] < 1.0f) {
-            all_exact_matches = false;
-            break;
-        }
+    // If all words are found, boost the score
+    if (query_words_found == query_word_count) {
+        similarity = 0.7f + (similarity * 0.3f);
    }
    
-    if (!all_exact_matches) {
-        similarity = fmin(similarity, 0.9f);
-    }
-    
-    free_words(query_words, query_word_count);
-    free_words(target_words, target_word_count);
+    free_words(query_buf);
+    free_words(target_buf);
    
    return similarity;
 }