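Implement Levenshtein distance calculation for improved word similarity in similarity_search.c. Adjust similarity thresholds and scoring logic to improve accuracy, particularly for prefix matches and words of differing lengths: the per-word match threshold drops from 0.8 to 0.4, a word more than 30% shorter than the other scores a fixed 0.8 when it is a prefix (otherwise 0), and any non-exact match is capped at 0.9. Update test.js to cover new search scenarios with a lower similarity threshold (0.1 instead of 0.3).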

2025-04-18 09:47:58 +02:00
parent e94c034927
commit 92a7bad2b6
2 changed files with 89 additions and 37 deletions
--- a/similarity_search.c
+++ b/similarity_search.c
@@ -3,6 +3,8 @@
 #include <string.h>
 #include <time.h>
 #include <ctype.h>
+#include <math.h>
+#include <stdbool.h>
 #include "similarity_search.h"

 // Case insensitive string comparison
@@ -55,7 +57,44 @@ void free_words(char *words[], int word_count) {
    }
 }

-// Calculate similarity between two words based on character matching
+// Calculate Levenshtein distance between two strings
+int levenshtein_distance(const char *s1, const char *s2) {
+    int len1 = strlen(s1);
+    int len2 = strlen(s2);
+    
+    // Convert to lowercase for comparison
+    char s1_lower[MAX_STRING_LEN];
+    char s2_lower[MAX_STRING_LEN];
+    for (int i = 0; i < len1; i++) s1_lower[i] = tolower((unsigned char)s1[i]);
+    for (int i = 0; i < len2; i++) s2_lower[i] = tolower((unsigned char)s2[i]);
+    s1_lower[len1] = '\0';
+    s2_lower[len2] = '\0';
+    
+    // Create distance matrix
+    int matrix[len1 + 1][len2 + 1];
+    
+    // Initialize first row and column
+    for (int i = 0; i <= len1; i++) matrix[i][0] = i;
+    for (int j = 0; j <= len2; j++) matrix[0][j] = j;
+    
+    // Fill in the rest of the matrix
+    for (int i = 1; i <= len1; i++) {
+        for (int j = 1; j <= len2; j++) {
+            if (s1_lower[i-1] == s2_lower[j-1]) {
+                matrix[i][j] = matrix[i-1][j-1];
+            } else {
+                int min = matrix[i-1][j-1]; // substitution
+                if (matrix[i-1][j] < min) min = matrix[i-1][j]; // deletion
+                if (matrix[i][j-1] < min) min = matrix[i][j-1]; // insertion
+                matrix[i][j] = min + 1;
+            }
+        }
+    }
+    
+    return matrix[len1][len2];
+}
+
+// Calculate similarity between two words based on Levenshtein distance
 float word_similarity(const char *word1, const char *word2) {
    int len1 = strlen(word1);
    int len2 = strlen(word2);
@@ -65,46 +104,43 @@ float word_similarity(const char *word1, const char *word2) {
        return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
    }
    
-    int max_len = len1 > len2 ? len1 : len2;
-    if (max_len == 0) return 0.0f;
-    
-    // Count matching characters (case insensitive)
-    int matches = 0;
-    int i = 0, j = 0;
-    
-    while (i < len1 && j < len2) {
-        if (tolower((unsigned char)word1[i]) == tolower((unsigned char)word2[j])) {
-            matches++;
-            i++;
-            j++;
-        } else if (tolower((unsigned char)word1[i]) < tolower((unsigned char)word2[j])) {
-            i++;
-        } else {
-            j++;
+    // If one word is significantly shorter than the other, it must be a prefix
+    if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
+        // Check if the shorter word is a prefix of the longer word
+        const char *longer = len1 > len2 ? word1 : word2;
+        const char *shorter = len1 > len2 ? word2 : word1;
+        int shorter_len = len1 > len2 ? len2 : len1;
+        
+        if (strncasecmp(longer, shorter, shorter_len) == 0) {
+            return 0.8f; // Good prefix match
        }
+        return 0.0f; // Not a prefix match
    }
    
-    // Calculate similarity based on matching characters
-    float char_similarity = (float)matches / max_len;
+    // For words of similar length, calculate similarity
+    int distance = levenshtein_distance(word1, word2);
+    int max_len = len1 > len2 ? len1 : len2;
    
-    // Require higher similarity for shorter words
-    float min_similarity = 0.7f; // Default minimum
-    if (len1 <= 5 || len2 <= 5) {
-        min_similarity = 0.9f; // Higher requirement for short words
+    // Calculate similarity based on edit distance
+    float similarity = 1.0f - (float)distance / max_len;
+    
+    // Adjust similarity based on word lengths
+    if (len1 != len2) {
+        float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
+        similarity *= length_ratio;
    }
    
-    // If similarity is below minimum, return 0
-    if (char_similarity < min_similarity) {
+    // For words of similar length, require reasonable similarity
+    if (similarity < 0.4f) {
        return 0.0f;
    }
    
-    // If words are the same length, give a small boost
-    if (len1 == len2) {
-        char_similarity = char_similarity * 1.1f; // 10% boost for same length
-        if (char_similarity > 1.0f) char_similarity = 1.0f;
+    // Never return perfect similarity for non-identical words
+    if (distance > 0) {
+        similarity = fmin(similarity, 0.9f);
    }
    
-    return char_similarity;
+    return similarity;
 }

 // Calculate similarity between query and target string
@@ -138,7 +174,7 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
        }
        
        best_word_similarities[i] = best_similarity;
-        if (best_similarity >= 0.8f) { // Consider it a match if similarity is high enough
+        if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable
            query_words_found++;
        }
    }
@@ -156,9 +192,17 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
    // Combine scores: 70% weight on word matches, 30% on character similarity
    float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
    
-    // If all query words were found with high similarity, ensure high overall score
-    if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) {
-        similarity = 1.0f; // Perfect match
+    // Never return perfect similarity unless all words are exact matches
+    bool all_exact_matches = true;
+    for (int i = 0; i < query_word_count; i++) {
+        if (best_word_similarities[i] < 1.0f) {
+            all_exact_matches = false;
+            break;
+        }
+    }
+    
+    if (!all_exact_matches) {
+        similarity = fmin(similarity, 0.9f);
    }
    
    free_words(query_words, query_word_count);
--- a/test.js
+++ b/test.js
@@ -43,14 +43,22 @@ customIndex.addString('bizz bio mix light');
 // Add multiple strings at once
 customIndex.addStrings([
  'plant growth bio formula',
-  'garden soil substrate'
+  'garden soil substrate',
+  'plagron light mix',
+  'Anesia Seeds Imperium X Auto 10',
+  'anesi'
 ]);

 console.log(`Custom index created with ${customIndex.size()} strings`);

 // Search with a higher similarity threshold
-console.log('\nSearching with higher similarity threshold (0.3):');
-const results = customIndex.search('bio bizz', 0.3);
+console.log('\nSearching with higher similarity threshold (0.1) for "amnesia":');
+const results = customIndex.search('amnesia haze', 0.1);
 results.forEach(match => {
  console.log(`  ${match.similarity.toFixed(2)}: ${match.string}`);
+}); 
+console.log('\nSearching with higher similarity threshold (0.1) for "lightmix":');
+const results2 = customIndex.search('lightmix', 0.1);
+results2.forEach(match => {
+  console.log(`  ${match.similarity.toFixed(2)}: ${match.string}`);
 });