From 92a7bad2b669ad149a229e361693a058598493d8 Mon Sep 17 00:00:00 2001 From: seb Date: Fri, 18 Apr 2025 09:47:58 +0200 Subject: [PATCH] Implement Levenshtein distance calculation for improved word similarity in similarity_search.c. Adjust similarity thresholds and scoring logic to enhance accuracy, particularly for prefix matches and varying word lengths. Update test.js to reflect new search scenarios with lower similarity thresholds. --- similarity_search.c | 112 ++++++++++++++++++++++++++++++-------------- test.js | 14 ++++-- 2 files changed, 89 insertions(+), 37 deletions(-) diff --git a/similarity_search.c b/similarity_search.c index ac61580..57850ba 100644 --- a/similarity_search.c +++ b/similarity_search.c @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include "similarity_search.h" // Case insensitive string comparison @@ -55,7 +57,44 @@ void free_words(char *words[], int word_count) { } } -// Calculate similarity between two words based on character matching +// Calculate Levenshtein distance between two strings +int levenshtein_distance(const char *s1, const char *s2) { + int len1 = strlen(s1); + int len2 = strlen(s2); + + // Convert to lowercase for comparison + char s1_lower[MAX_STRING_LEN]; + char s2_lower[MAX_STRING_LEN]; + for (int i = 0; i < len1; i++) s1_lower[i] = tolower((unsigned char)s1[i]); + for (int i = 0; i < len2; i++) s2_lower[i] = tolower((unsigned char)s2[i]); + s1_lower[len1] = '\0'; + s2_lower[len2] = '\0'; + + // Create distance matrix + int matrix[len1 + 1][len2 + 1]; + + // Initialize first row and column + for (int i = 0; i <= len1; i++) matrix[i][0] = i; + for (int j = 0; j <= len2; j++) matrix[0][j] = j; + + // Fill in the rest of the matrix + for (int i = 1; i <= len1; i++) { + for (int j = 1; j <= len2; j++) { + if (s1_lower[i-1] == s2_lower[j-1]) { + matrix[i][j] = matrix[i-1][j-1]; + } else { + int min = matrix[i-1][j-1]; // substitution + if (matrix[i-1][j] < min) min = matrix[i-1][j]; // deletion + if (matrix[i][j-1] < min) min = matrix[i][j-1]; // insertion + matrix[i][j] = min + 1; + } + } + } + + return matrix[len1][len2]; +} + +// Calculate similarity between two words based on Levenshtein distance float word_similarity(const char *word1, const char *word2) { int len1 = strlen(word1); int len2 = strlen(word2); @@ -65,46 +104,43 @@ float word_similarity(const char *word1, const char *word2) { return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f; } - int max_len = len1 > len2 ? len1 : len2; - if (max_len == 0) return 0.0f; - - // Count matching characters (case insensitive) - int matches = 0; - int i = 0, j = 0; - - while (i < len1 && j < len2) { - if (tolower((unsigned char)word1[i]) == tolower((unsigned char)word2[j])) { - matches++; - i++; - j++; - } else if (tolower((unsigned char)word1[i]) < tolower((unsigned char)word2[j])) { - i++; - } else { - j++; + // If one word is significantly shorter than the other, it must be a prefix + if (len1 < len2 * 0.7 || len2 < len1 * 0.7) { + // Check if the shorter word is a prefix of the longer word + const char *longer = len1 > len2 ? word1 : word2; + const char *shorter = len1 > len2 ? word2 : word1; + int shorter_len = len1 > len2 ? len2 : len1; + + if (strncasecmp(longer, shorter, shorter_len) == 0) { + return 0.8f; // Good prefix match } + return 0.0f; // Not a prefix match } - // Calculate similarity based on matching characters - float char_similarity = (float)matches / max_len; + // For words of similar length, calculate similarity + int distance = levenshtein_distance(word1, word2); + int max_len = len1 > len2 ? len1 : len2; - // Require higher similarity for shorter words - float min_similarity = 0.7f; // Default minimum - if (len1 <= 5 || len2 <= 5) { - min_similarity = 0.9f; // Higher requirement for short words + // Calculate similarity based on edit distance + float similarity = 1.0f - (float)distance / max_len; + + // Adjust similarity based on word lengths + if (len1 != len2) { + float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2); + similarity *= length_ratio; } - // If similarity is below minimum, return 0 - if (char_similarity < min_similarity) { + // For words of similar length, require reasonable similarity + if (similarity < 0.4f) { return 0.0f; } - // If words are the same length, give a small boost - if (len1 == len2) { - char_similarity = char_similarity * 1.1f; // 10% boost for same length - if (char_similarity > 1.0f) char_similarity = 1.0f; + // Never return perfect similarity for non-identical words + if (distance > 0) { + similarity = fmin(similarity, 0.9f); } - return char_similarity; + return similarity; } // Calculate similarity between query and target string @@ -138,7 +174,7 @@ float calculate_similarity(const char *query, const char *target, float cutoff) } best_word_similarities[i] = best_similarity; - if (best_similarity >= 0.8f) { // Consider it a match if similarity is high enough + if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable query_words_found++; } } @@ -156,9 +192,17 @@ float calculate_similarity(const char *query, const char *target, float cutoff) // Combine scores: 70% weight on word matches, 30% on character similarity float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f); - // If all query words were found with high similarity, ensure high overall score - if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) { - similarity = 1.0f; // Perfect match + // Never return perfect similarity unless all words are exact matches + bool all_exact_matches = true; + for (int i = 0; i < query_word_count; i++) { + if (best_word_similarities[i] < 1.0f) { + all_exact_matches = false; + break; + } + } + + if (!all_exact_matches) { + similarity = fmin(similarity, 0.9f); } free_words(query_words, query_word_count); diff --git a/test.js b/test.js index dafd369..d652b0d 100644 --- a/test.js +++ b/test.js @@ -43,14 +43,22 @@ customIndex.addString('bizz bio mix light'); // Add multiple strings at once customIndex.addStrings([ 'plant growth bio formula', - 'garden soil substrate' + 'garden soil substrate', + 'plagron light mix', + 'Anesia Seeds Imperium X Auto 10', + 'anesi' ]); console.log(`Custom index created with ${customIndex.size()} strings`); // Search with a higher similarity threshold -console.log('\nSearching with higher similarity threshold (0.3):'); -const results = customIndex.search('bio bizz', 0.3); +console.log('\nSearching with higher similarity threshold (0.1) for "amnesia":'); +const results = customIndex.search('amnesia haze', 0.1); results.forEach(match => { console.log(` ${match.similarity.toFixed(2)}: ${match.string}`); +}); +console.log('\nSearching with higher similarity threshold (0.1) for "lightmix":'); +const results2 = customIndex.search('lightmix', 0.1); +results2.forEach(match => { + console.log(` ${match.similarity.toFixed(2)}: ${match.string}`); }); \ No newline at end of file