Implement Levenshtein distance calculation for improved word similarity in similarity_search.c. Adjust similarity thresholds and scoring logic to enhance accuracy, particularly for prefix matches and varying word lengths. Update test.js to reflect new search scenarios with lower similarity thresholds.

2025-04-18 09:47:58 +02:00
parent e94c034927
commit 92a7bad2b6
2 changed files with 89 additions and 37 deletions
--- a/similarity_search.c
+++ b/similarity_search.c
@@ -3,6 +3,8 @@
 #include <string.h>
 #include <time.h>
 #include <ctype.h>
 #include <math.h>
 #include <stdbool.h>
 #include "similarity_search.h"
 // Case insensitive string comparison
@@ -55,7 +57,44 @@ void free_words(char *words[], int word_count) {
    }
 }
-// Calculate similarity between two words based on character matching
+// Calculate Levenshtein distance between two strings
 /*
  * Compute the case-insensitive Levenshtein (edit) distance between two
  * strings: the minimum number of single-character insertions, deletions
  * and substitutions needed to turn one into the other.
  *
  * Parameters:
  *   s1, s2 - NUL-terminated strings; a NULL pointer is treated as "".
  *
  * Returns:
  *   The edit distance (0 when the strings are equal ignoring case).
  *
  * Characters are lowercased on the fly instead of being copied into
  * fixed-size buffers, so inputs of any length are safe. Only one row of
  * the dynamic-programming table is kept, sized by the shorter string,
  * which bounds stack use to O(min(len1, len2)) ints.
  */
 int levenshtein_distance(const char *s1, const char *s2) {
    if (s1 == NULL) s1 = "";
    if (s2 == NULL) s2 = "";

    int len1 = (int)strlen(s1);
    int len2 = (int)strlen(s2);

    // The distance is symmetric, so make s2 the shorter string: the
    // rolling row below is indexed by positions in s2.
    if (len2 > len1) {
        const char *tmp_str = s1;
        s1 = s2;
        s2 = tmp_str;
        int tmp_len = len1;
        len1 = len2;
        len2 = tmp_len;
    }

    // row[j] holds the distance between the first i chars of s1 and the
    // first j chars of s2 for the row currently being processed.
    int row[len2 + 1];
    for (int j = 0; j <= len2; j++) row[j] = j;

    for (int i = 1; i <= len1; i++) {
        int diag = row[0]; // value of cell (i-1, j-1)
        row[0] = i;        // distance from i chars of s1 to the empty string
        int c1 = tolower((unsigned char)s1[i-1]);

        for (int j = 1; j <= len2; j++) {
            int up = row[j]; // value of cell (i-1, j), before overwrite
            if (c1 == tolower((unsigned char)s2[j-1])) {
                row[j] = diag; // characters match: no extra cost
            } else {
                int min = diag;                       // substitution
                if (up < min) min = up;               // deletion
                if (row[j-1] < min) min = row[j-1];   // insertion
                row[j] = min + 1;
            }
            diag = up;
        }
    }
    return row[len2];
 }
 // Calculate similarity between two words based on Levenshtein distance
 float word_similarity(const char *word1, const char *word2) {
    int len1 = strlen(word1);
    int len2 = strlen(word2);
@@ -65,46 +104,43 @@ float word_similarity(const char *word1, const char *word2) {
        return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
    }
-    int max_len = len1 > len2 ? len1 : len2;
+    // If one word is significantly shorter than the other, it must be a prefix
-    if (max_len == 0) return 0.0f;
+    if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
-    
+        // Check if the shorter word is a prefix of the longer word
-    // Count matching characters (case insensitive)
+        const char *longer = len1 > len2 ? word1 : word2;
-    int matches = 0;
+        const char *shorter = len1 > len2 ? word2 : word1;
-    int i = 0, j = 0;
+        int shorter_len = len1 > len2 ? len2 : len1;
-    
+        
-    while (i < len1 && j < len2) {
+        if (strncasecmp(longer, shorter, shorter_len) == 0) {
-        if (tolower((unsigned char)word1[i]) == tolower((unsigned char)word2[j])) {
+            return 0.8f; // Good prefix match
            matches++;
            i++;
            j++;
        } else if (tolower((unsigned char)word1[i]) < tolower((unsigned char)word2[j])) {
            i++;
        } else {
            j++;
        }
        return 0.0f; // Not a prefix match
    }
-    // Calculate similarity based on matching characters
+    // For words of similar length, calculate similarity
-    float char_similarity = (float)matches / max_len;
+    int distance = levenshtein_distance(word1, word2);
    int max_len = len1 > len2 ? len1 : len2;
-    // Require higher similarity for shorter words
+    // Calculate similarity based on edit distance
-    float min_similarity = 0.7f; // Default minimum
+    float similarity = 1.0f - (float)distance / max_len;
-    if (len1 <= 5 || len2 <= 5) {
+    
-        min_similarity = 0.9f; // Higher requirement for short words
+    // Adjust similarity based on word lengths
    if (len1 != len2) {
        float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
        similarity *= length_ratio;
    }
-    // If similarity is below minimum, return 0
+    // For words of similar length, require reasonable similarity
-    if (char_similarity < min_similarity) {
+    if (similarity < 0.4f) {
        return 0.0f;
    }
-    // If words are the same length, give a small boost
+    // Never return perfect similarity for non-identical words
-    if (len1 == len2) {
+    if (distance > 0) {
-        char_similarity = char_similarity * 1.1f; // 10% boost for same length
+        similarity = fmin(similarity, 0.9f);
        if (char_similarity > 1.0f) char_similarity = 1.0f;
    }
-    return char_similarity;
+    return similarity;
 }
 // Calculate similarity between query and target string
@@ -138,7 +174,7 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
        }
        best_word_similarities[i] = best_similarity;
-        if (best_similarity >= 0.8f) { // Consider it a match if similarity is high enough
+        if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable
            query_words_found++;
        }
    }
@@ -156,9 +192,17 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
    // Combine scores: 70% weight on word matches, 30% on character similarity
    float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
-    // If all query words were found with high similarity, ensure high overall score
+    // Never return perfect similarity unless all words are exact matches
-    if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) {
+    bool all_exact_matches = true;
-        similarity = 1.0f; // Perfect match
+    for (int i = 0; i < query_word_count; i++) {
        if (best_word_similarities[i] < 1.0f) {
            all_exact_matches = false;
            break;
        }
    }
    if (!all_exact_matches) {
        similarity = fmin(similarity, 0.9f);
    }
    free_words(query_words, query_word_count);
--- a/test.js
+++ b/test.js
@@ -43,14 +43,22 @@ customIndex.addString('bizz bio mix light');
 // Add multiple strings at once
 customIndex.addStrings([
  'plant growth bio formula',
-  'garden soil substrate'
+  'garden soil substrate',
  'plagron light mix',
  'Anesia Seeds Imperium X Auto 10',
  'anesi'
 ]);
 console.log(`Custom index created with ${customIndex.size()} strings`);
 // Search with a low similarity threshold (0.1) so that weak matches are listed too
-console.log('\nSearching with higher similarity threshold (0.3):');
+console.log('\nSearching with higher similarity threshold (0.1) for "amnesia":');
-const results = customIndex.search('bio bizz', 0.3);
+const results = customIndex.search('amnesia haze', 0.1);
 results.forEach(match => {
  console.log(`  ${match.similarity.toFixed(2)}: ${match.string}`);
 }); 
 console.log('\nSearching with higher similarity threshold (0.1) for "lightmix":');
 const results2 = customIndex.search('lightmix', 0.1);
 results2.forEach(match => {
  console.log(`  ${match.similarity.toFixed(2)}: ${match.string}`);
 });