Implement Levenshtein distance calculation for improved word similarity in similarity_search.c. Adjust similarity thresholds and scoring logic to enhance accuracy, particularly for prefix matches and varying word lengths. Update test.js to reflect new search scenarios with lower similarity thresholds.

This commit is contained in:
seb
2025-04-18 09:47:58 +02:00
parent e94c034927
commit 92a7bad2b6
2 changed files with 89 additions and 37 deletions

View File

@@ -3,6 +3,8 @@
#include <string.h> #include <string.h>
#include <time.h> #include <time.h>
#include <ctype.h> #include <ctype.h>
#include <math.h>
#include <stdbool.h>
#include "similarity_search.h" #include "similarity_search.h"
// Case insensitive string comparison // Case insensitive string comparison
@@ -55,7 +57,44 @@ void free_words(char *words[], int word_count) {
} }
} }
// Calculate similarity between two words based on character matching // Calculate Levenshtein distance between two strings
/*
 * Compute the case-insensitive Levenshtein (edit) distance between two
 * NUL-terminated strings.
 *
 * Parameters:
 *   s1, s2 - strings to compare; must be non-NULL.
 *
 * Returns:
 *   The minimum number of single-character insertions, deletions and
 *   substitutions needed to turn s1 into s2, ignoring ASCII case.
 *
 * Implementation notes:
 *   - Characters are lowercased on the fly instead of being copied into
 *     fixed-size buffers, so inputs of any length are handled without
 *     overflowing a MAX_STRING_LEN-sized array.
 *   - Only one row of the dynamic-programming table is kept, so stack usage
 *     is O(len2) rather than O(len1 * len2).
 */
int levenshtein_distance(const char *s1, const char *s2) {
int len1 = (int)strlen(s1);
int len2 = (int)strlen(s2);
// Distance to or from an empty string is the length of the other string
if (len1 == 0) return len2;
if (len2 == 0) return len1;
// row[j] holds the distance between the first i chars of s1 and the
// first j chars of s2; it starts out as the i == 0 row
int row[len2 + 1];
for (int j = 0; j <= len2; j++) row[j] = j;
for (int i = 1; i <= len1; i++) {
int c1 = tolower((unsigned char)s1[i - 1]);
// diag carries the value of the previous row at column j-1, which is
// overwritten in row[] before it is needed
int diag = row[0];
row[0] = i;
for (int j = 1; j <= len2; j++) {
int above = row[j]; // previous row, same column
if (c1 == tolower((unsigned char)s2[j - 1])) {
row[j] = diag; // characters match: no extra cost
} else {
int min = diag; // substitution
if (above < min) min = above; // deletion
if (row[j - 1] < min) min = row[j - 1]; // insertion
row[j] = min + 1;
}
diag = above;
}
}
return row[len2];
}
// Calculate similarity between two words based on Levenshtein distance
float word_similarity(const char *word1, const char *word2) { float word_similarity(const char *word1, const char *word2) {
int len1 = strlen(word1); int len1 = strlen(word1);
int len2 = strlen(word2); int len2 = strlen(word2);
@@ -65,46 +104,43 @@ float word_similarity(const char *word1, const char *word2) {
return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f; return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
} }
int max_len = len1 > len2 ? len1 : len2; // If one word is significantly shorter than the other, it must be a prefix
if (max_len == 0) return 0.0f; if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
// Check if the shorter word is a prefix of the longer word
// Count matching characters (case insensitive) const char *longer = len1 > len2 ? word1 : word2;
int matches = 0; const char *shorter = len1 > len2 ? word2 : word1;
int i = 0, j = 0; int shorter_len = len1 > len2 ? len2 : len1;
while (i < len1 && j < len2) { if (strncasecmp(longer, shorter, shorter_len) == 0) {
if (tolower((unsigned char)word1[i]) == tolower((unsigned char)word2[j])) { return 0.8f; // Good prefix match
matches++;
i++;
j++;
} else if (tolower((unsigned char)word1[i]) < tolower((unsigned char)word2[j])) {
i++;
} else {
j++;
} }
return 0.0f; // Not a prefix match
} }
// Calculate similarity based on matching characters // For words of similar length, calculate similarity
float char_similarity = (float)matches / max_len; int distance = levenshtein_distance(word1, word2);
int max_len = len1 > len2 ? len1 : len2;
// Require higher similarity for shorter words // Calculate similarity based on edit distance
float min_similarity = 0.7f; // Default minimum float similarity = 1.0f - (float)distance / max_len;
if (len1 <= 5 || len2 <= 5) {
min_similarity = 0.9f; // Higher requirement for short words // Adjust similarity based on word lengths
if (len1 != len2) {
float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
similarity *= length_ratio;
} }
// If similarity is below minimum, return 0 // For words of similar length, require reasonable similarity
if (char_similarity < min_similarity) { if (similarity < 0.4f) {
return 0.0f; return 0.0f;
} }
// If words are the same length, give a small boost // Never return perfect similarity for non-identical words
if (len1 == len2) { if (distance > 0) {
char_similarity = char_similarity * 1.1f; // 10% boost for same length similarity = fmin(similarity, 0.9f);
if (char_similarity > 1.0f) char_similarity = 1.0f;
} }
return char_similarity; return similarity;
} }
// Calculate similarity between query and target string // Calculate similarity between query and target string
@@ -138,7 +174,7 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
} }
best_word_similarities[i] = best_similarity; best_word_similarities[i] = best_similarity;
if (best_similarity >= 0.8f) { // Consider it a match if similarity is high enough if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable
query_words_found++; query_words_found++;
} }
} }
@@ -156,9 +192,17 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
// Combine scores: 70% weight on word matches, 30% on character similarity // Combine scores: 70% weight on word matches, 30% on character similarity
float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f); float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
// If all query words were found with high similarity, ensure high overall score // Never return perfect similarity unless all words are exact matches
if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) { bool all_exact_matches = true;
similarity = 1.0f; // Perfect match for (int i = 0; i < query_word_count; i++) {
if (best_word_similarities[i] < 1.0f) {
all_exact_matches = false;
break;
}
}
if (!all_exact_matches) {
similarity = fmin(similarity, 0.9f);
} }
free_words(query_words, query_word_count); free_words(query_words, query_word_count);

14
test.js
View File

@@ -43,14 +43,22 @@ customIndex.addString('bizz bio mix light');
// Add multiple strings at once // Add multiple strings at once
customIndex.addStrings([ customIndex.addStrings([
'plant growth bio formula', 'plant growth bio formula',
'garden soil substrate' 'garden soil substrate',
'plagron light mix',
'Anesia Seeds Imperium X Auto 10',
'anesi'
]); ]);
console.log(`Custom index created with ${customIndex.size()} strings`); console.log(`Custom index created with ${customIndex.size()} strings`);
// Search with a higher similarity threshold // Search with a higher similarity threshold
console.log('\nSearching with higher similarity threshold (0.3):'); console.log('\nSearching with higher similarity threshold (0.1) for "amnesia":');
const results = customIndex.search('bio bizz', 0.3); const results = customIndex.search('amnesia haze', 0.1);
results.forEach(match => { results.forEach(match => {
console.log(` ${match.similarity.toFixed(2)}: ${match.string}`); console.log(` ${match.similarity.toFixed(2)}: ${match.string}`);
});
console.log('\nSearching with higher similarity threshold (0.1) for "lightmix":');
const results2 = customIndex.search('lightmix', 0.1);
results2.forEach(match => {
console.log(` ${match.similarity.toFixed(2)}: ${match.string}`);
}); });