Implement Levenshtein distance calculation for improved word similarity in similarity_search.c. Adjust similarity thresholds and scoring logic to enhance accuracy, particularly for prefix matches and varying word lengths. Update test.js to reflect new search scenarios with lower similarity thresholds.

This commit is contained in:
seb
2025-04-18 09:47:58 +02:00
parent e94c034927
commit 92a7bad2b6
2 changed files with 89 additions and 37 deletions

View File

@@ -3,6 +3,8 @@
#include <string.h> #include <string.h>
#include <time.h> #include <time.h>
#include <ctype.h> #include <ctype.h>
#include <math.h>
#include <stdbool.h>
#include "similarity_search.h" #include "similarity_search.h"
// Case insensitive string comparison // Case insensitive string comparison
@@ -55,7 +57,44 @@ void free_words(char *words[], int word_count) {
} }
} }
// Calculate similarity between two words based on character matching // Calculate Levenshtein distance between two strings
// Compute the case-insensitive Levenshtein (edit) distance between two strings.
//
// The distance is the minimum number of single-character insertions,
// deletions and substitutions needed to turn one string into the other.
// Characters are compared after tolower(), so "ABC" and "abc" have distance 0.
//
// Parameters:
//   s1, s2 - NUL-terminated strings; a NULL pointer is treated as "".
// Returns:
//   The edit distance (0 when the strings are equal ignoring case).
//
// Implementation notes:
//   - Only one row of the dynamic-programming table is kept, so the stack
//     usage is proportional to the shorter string instead of len1 * len2.
//   - Characters are lowercased on the fly, so no fixed-size scratch buffers
//     are needed and arbitrarily long inputs cannot overflow them.
int levenshtein_distance(const char *s1, const char *s2) {
    if (s1 == NULL) s1 = "";
    if (s2 == NULL) s2 = "";

    size_t len1 = strlen(s1);
    size_t len2 = strlen(s2);

    // The distance is symmetric; keep the shorter string on the column axis
    // so the row buffer below is as small as possible.
    if (len2 > len1) {
        const char *tmp_str = s1;
        s1 = s2;
        s2 = tmp_str;
        size_t tmp_len = len1;
        len1 = len2;
        len2 = tmp_len;
    }

    // Transforming to/from an empty string costs one edit per character.
    if (len2 == 0) return (int)len1;

    // row[j] holds the distance between the first i characters of s1 and the
    // first j characters of s2 for the row currently being processed.
    int row[len2 + 1];
    for (size_t j = 0; j <= len2; j++) row[j] = (int)j;

    for (size_t i = 1; i <= len1; i++) {
        int diagonal = row[0];  // value of cell (i-1, j-1) for j == 1
        row[0] = (int)i;        // cell (i, 0): delete all i characters
        int c1 = tolower((unsigned char)s1[i - 1]);

        for (size_t j = 1; j <= len2; j++) {
            int above = row[j];  // cell (i-1, j), about to be overwritten
            int c2 = tolower((unsigned char)s2[j - 1]);

            if (c1 == c2) {
                // Matching characters cost nothing extra.
                row[j] = diagonal;
            } else {
                int best = diagonal;                      // substitution
                if (above < best) best = above;           // deletion
                if (row[j - 1] < best) best = row[j - 1]; // insertion
                row[j] = best + 1;
            }

            diagonal = above;
        }
    }

    return row[len2];
}
// Calculate similarity between two words based on Levenshtein distance
float word_similarity(const char *word1, const char *word2) { float word_similarity(const char *word1, const char *word2) {
int len1 = strlen(word1); int len1 = strlen(word1);
int len2 = strlen(word2); int len2 = strlen(word2);
@@ -65,46 +104,43 @@ float word_similarity(const char *word1, const char *word2) {
return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f; return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
} }
// If one word is significantly shorter than the other, it must be a prefix
if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
// Check if the shorter word is a prefix of the longer word
const char *longer = len1 > len2 ? word1 : word2;
const char *shorter = len1 > len2 ? word2 : word1;
int shorter_len = len1 > len2 ? len2 : len1;
if (strncasecmp(longer, shorter, shorter_len) == 0) {
return 0.8f; // Good prefix match
}
return 0.0f; // Not a prefix match
}
// For words of similar length, calculate similarity
int distance = levenshtein_distance(word1, word2);
int max_len = len1 > len2 ? len1 : len2; int max_len = len1 > len2 ? len1 : len2;
if (max_len == 0) return 0.0f;
// Count matching characters (case insensitive) // Calculate similarity based on edit distance
int matches = 0; float similarity = 1.0f - (float)distance / max_len;
int i = 0, j = 0;
while (i < len1 && j < len2) { // Adjust similarity based on word lengths
if (tolower((unsigned char)word1[i]) == tolower((unsigned char)word2[j])) { if (len1 != len2) {
matches++; float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
i++; similarity *= length_ratio;
j++;
} else if (tolower((unsigned char)word1[i]) < tolower((unsigned char)word2[j])) {
i++;
} else {
j++;
}
} }
// Calculate similarity based on matching characters // For words of similar length, require reasonable similarity
float char_similarity = (float)matches / max_len; if (similarity < 0.4f) {
// Require higher similarity for shorter words
float min_similarity = 0.7f; // Default minimum
if (len1 <= 5 || len2 <= 5) {
min_similarity = 0.9f; // Higher requirement for short words
}
// If similarity is below minimum, return 0
if (char_similarity < min_similarity) {
return 0.0f; return 0.0f;
} }
// If words are the same length, give a small boost // Never return perfect similarity for non-identical words
if (len1 == len2) { if (distance > 0) {
char_similarity = char_similarity * 1.1f; // 10% boost for same length similarity = fmin(similarity, 0.9f);
if (char_similarity > 1.0f) char_similarity = 1.0f;
} }
return char_similarity; return similarity;
} }
// Calculate similarity between query and target string // Calculate similarity between query and target string
@@ -138,7 +174,7 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
} }
best_word_similarities[i] = best_similarity; best_word_similarities[i] = best_similarity;
if (best_similarity >= 0.8f) { // Consider it a match if similarity is high enough if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable
query_words_found++; query_words_found++;
} }
} }
@@ -156,9 +192,17 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
// Combine scores: 70% weight on word matches, 30% on character similarity // Combine scores: 70% weight on word matches, 30% on character similarity
float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f); float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
// If all query words were found with high similarity, ensure high overall score // Never return perfect similarity unless all words are exact matches
if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) { bool all_exact_matches = true;
similarity = 1.0f; // Perfect match for (int i = 0; i < query_word_count; i++) {
if (best_word_similarities[i] < 1.0f) {
all_exact_matches = false;
break;
}
}
if (!all_exact_matches) {
similarity = fmin(similarity, 0.9f);
} }
free_words(query_words, query_word_count); free_words(query_words, query_word_count);

14
test.js
View File

@@ -43,14 +43,22 @@ customIndex.addString('bizz bio mix light');
// Add multiple strings at once // Add multiple strings at once
customIndex.addStrings([ customIndex.addStrings([
'plant growth bio formula', 'plant growth bio formula',
'garden soil substrate' 'garden soil substrate',
'plagron light mix',
'Anesia Seeds Imperium X Auto 10',
'anesi'
]); ]);
console.log(`Custom index created with ${customIndex.size()} strings`); console.log(`Custom index created with ${customIndex.size()} strings`);
// Search with a higher similarity threshold // Search with a higher similarity threshold
console.log('\nSearching with higher similarity threshold (0.3):'); console.log('\nSearching with higher similarity threshold (0.1) for "amnesia":');
const results = customIndex.search('bio bizz', 0.3); const results = customIndex.search('amnesia haze', 0.1);
results.forEach(match => { results.forEach(match => {
console.log(` ${match.similarity.toFixed(2)}: ${match.string}`); console.log(` ${match.similarity.toFixed(2)}: ${match.string}`);
}); });
console.log('\nSearching with higher similarity threshold (0.1) for "lightmix":');
const results2 = customIndex.search('lightmix', 0.1);
results2.forEach(match => {
console.log(` ${match.similarity.toFixed(2)}: ${match.string}`);
});