Refactor the word similarity calculation in similarity_search.c to simplify the scoring logic. Replace prefix matching with Levenshtein distance for improved accuracy, and adjust the similarity scoring to boost results for small differences (an edit distance of at most 1). Update the overall similarity calculation to be the mean of the word match ratio and the average word similarity, for better performance.

2025-04-18 19:01:22 +02:00
parent e2aacaf54b
commit a9a9247773
1 changed file with 14 additions and 42 deletions
--- a/similarity_search.c
+++ b/similarity_search.c
@@ -82,38 +82,16 @@ float word_similarity(const char *word1, const char *word2) {
        return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
    }
-    // If one word is significantly shorter than the other, it must be a prefix
+    // Calculate Levenshtein distance
    if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
        // Check if the shorter word is a prefix of the longer word
        const char *longer = len1 > len2 ? word1 : word2;
        const char *shorter = len1 > len2 ? word2 : word1;
        int shorter_len = len1 > len2 ? len2 : len1;
        if (strncasecmp(longer, shorter, shorter_len) == 0) {
            return 0.8f; // Good prefix match
        }
        return 0.0f; // Not a prefix match
    }
    // For words of similar length, calculate similarity
    int distance = levenshtein_distance(word1, word2);
    int max_len = len1 > len2 ? len1 : len2;
-    // Calculate similarity based on edit distance with exponential decay
+    // Simple linear scoring: 1.0 for exact match, 0.9 for one char difference, etc.
    float similarity = 1.0f - (float)distance / max_len;
-    // Apply exponential decay to make it more sensitive to differences
+    // Boost similarity for small differences
-    similarity = pow(similarity, 3.0f);
+    if (distance <= 1) {
-    
+        similarity = 0.9f + (similarity * 0.1f);
    // Adjust similarity based on word lengths
    if (len1 != len2) {
        float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
        similarity *= length_ratio;
    }
    // For words of similar length, require reasonable similarity
    if (similarity < 0.4f) {
        return 0.0f;
    }
    return similarity;
@@ -143,40 +121,34 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
        float best_similarity = 0.0f;
        for (int j = 0; j < target_word_count; j++) {
            /* quick length‑difference filter (early‑exit #4)            */
            int l1 = strlen(query_words[i]), l2 = strlen(target_words[j]);
            if (l1 < l2 * 0.5f || l2 < l1 * 0.5f) continue;
            float similarity = word_similarity(query_words[i], target_words[j]);
            if (similarity > best_similarity) {
                best_similarity = similarity;
                if (best_similarity >= 0.90f) break;   /* early exit #4 */
            }
        }
        best_word_similarities[i] = best_similarity;
-        if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable
+        if (best_similarity >= 0.4f) {
            query_words_found++;
        }
    }
-    // Calculate overall similarity
+    // Calculate average word similarity
    float word_match_score = (float)query_words_found / query_word_count;
    // Calculate average of best word similarities
    float avg_word_similarity = 0.0f;
    for (int i = 0; i < query_word_count; i++) {
        avg_word_similarity += best_word_similarities[i];
    }
    avg_word_similarity /= query_word_count;
-    // Combine scores: 60% weight on word matches, 40% on character similarity
+    // Calculate word match ratio
-    // This gives more weight to finding all words, regardless of order
+    float word_match_ratio = (float)query_words_found / query_word_count;
    float similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f);
-    // If all words are found, boost the score
+    // Final score is the average of word match ratio and average word similarity
    float similarity = (word_match_ratio + avg_word_similarity) / 2.0f;
    // Boost score if all words are found
    if (query_words_found == query_word_count) {
-        similarity = 0.7f + (similarity * 0.3f);
+        similarity = 0.8f + (similarity * 0.2f);
    }
    free_words(query_buf);