Refactor the word similarity calculation in similarity_search.c to simplify the scoring logic. Remove the prefix-matching special case for words of very different lengths, so that those pairs are now scored by Levenshtein distance as well, for improved accuracy, and adjust similarity scoring to boost results for small differences (an edit distance of at most 1). Update the overall similarity calculation to take the mean of the word match ratio and the average word similarity, for better performance.

2025-04-18 19:01:22 +02:00
parent e2aacaf54b
commit a9a9247773
1 changed files with 14 additions and 42 deletions
--- a/similarity_search.c
+++ b/similarity_search.c
@@ -82,38 +82,16 @@ float word_similarity(const char *word1, const char *word2) {
        return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
    }
    
-    // If one word is significantly shorter than the other, it must be a prefix
-    if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
-        // Check if the shorter word is a prefix of the longer word
-        const char *longer = len1 > len2 ? word1 : word2;
-        const char *shorter = len1 > len2 ? word2 : word1;
-        int shorter_len = len1 > len2 ? len2 : len1;
-        
-        if (strncasecmp(longer, shorter, shorter_len) == 0) {
-            return 0.8f; // Good prefix match
-        }
-        return 0.0f; // Not a prefix match
-    }
-    
-    // For words of similar length, calculate similarity
+    // Calculate Levenshtein distance
    int distance = levenshtein_distance(word1, word2);
    int max_len = len1 > len2 ? len1 : len2;
    
-    // Calculate similarity based on edit distance with exponential decay
+    // Simple linear scoring: 1.0 for exact match, 0.9 for one char difference, etc.
    float similarity = 1.0f - (float)distance / max_len;
    
-    // Apply exponential decay to make it more sensitive to differences
-    similarity = pow(similarity, 3.0f);
-    
-    // Adjust similarity based on word lengths
-    if (len1 != len2) {
-        float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
-        similarity *= length_ratio;
-    }
-    
-    // For words of similar length, require reasonable similarity
-    if (similarity < 0.4f) {
-        return 0.0f;
+    // Boost similarity for small differences
+    if (distance <= 1) {
+        similarity = 0.9f + (similarity * 0.1f);
    }
    
    return similarity;
@@ -143,40 +121,34 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
        float best_similarity = 0.0f;
        
        for (int j = 0; j < target_word_count; j++) {
-            /* quick length‑difference filter (early‑exit #4)            */
-            int l1 = strlen(query_words[i]), l2 = strlen(target_words[j]);
-            if (l1 < l2 * 0.5f || l2 < l1 * 0.5f) continue;
-
            float similarity = word_similarity(query_words[i], target_words[j]);
            if (similarity > best_similarity) {
                best_similarity = similarity;
-                if (best_similarity >= 0.90f) break;   /* early exit #4 */
            }
        }
        
        best_word_similarities[i] = best_similarity;
-        if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable
+        if (best_similarity >= 0.4f) {
            query_words_found++;
        }
    }
    
-    // Calculate overall similarity
-    float word_match_score = (float)query_words_found / query_word_count;
-    
-    // Calculate average of best word similarities
+    // Calculate average word similarity
    float avg_word_similarity = 0.0f;
    for (int i = 0; i < query_word_count; i++) {
        avg_word_similarity += best_word_similarities[i];
    }
    avg_word_similarity /= query_word_count;
    
-    // Combine scores: 60% weight on word matches, 40% on character similarity
-    // This gives more weight to finding all words, regardless of order
-    float similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f);
+    // Calculate word match ratio
+    float word_match_ratio = (float)query_words_found / query_word_count;
    
-    // If all words are found, boost the score
+    // Final score is the average of word match ratio and average word similarity
+    float similarity = (word_match_ratio + avg_word_similarity) / 2.0f;
+    
+    // Boost score if all words are found
    if (query_words_found == query_word_count) {
-        similarity = 0.7f + (similarity * 0.3f);
+        similarity = 0.8f + (similarity * 0.2f);
    }
    
    free_words(query_buf);