Implement Levenshtein distance calculation for improved word similarity in similarity_search.c. Adjust similarity thresholds and scoring logic to enhance accuracy, particularly for prefix matches and varying word lengths. Update test.js to reflect new search scenarios with lower similarity thresholds.

This commit is contained in:
seb
2025-04-18 09:47:58 +02:00
parent e94c034927
commit 92a7bad2b6
2 changed files with 89 additions and 37 deletions

View File

@@ -3,6 +3,8 @@
#include <string.h>
#include <time.h>
#include <ctype.h>
#include <math.h>
#include <stdbool.h>
#include "similarity_search.h"
// Case insensitive string comparison
@@ -55,7 +57,44 @@ void free_words(char *words[], int word_count) {
}
}
// Calculate similarity between two words based on character matching
// Calculate the case-insensitive Levenshtein (edit) distance between two strings.
//
// Parameters:
//   s1, s2 - NUL-terminated strings to compare (must not be NULL).
// Returns:
//   The minimum number of single-character insertions, deletions and
//   substitutions needed to turn one string into the other, ignoring case.
//
// Characters are lower-cased on the fly instead of being copied into
// fixed-size buffers, so inputs of any length are handled without overflow.
// Only two rows of the DP table are kept, so stack usage is proportional to
// the shorter string rather than to len1 * len2.
int levenshtein_distance(const char *s1, const char *s2) {
    int len1 = (int)strlen(s1);
    int len2 = (int)strlen(s2);

    // The distance is symmetric; keep the shorter string on the column axis
    // so the two working rows are as small as possible.
    if (len2 > len1) {
        const char *tmp_str = s1;
        s1 = s2;
        s2 = tmp_str;
        int tmp_len = len1;
        len1 = len2;
        len2 = tmp_len;
    }

    int prev[len2 + 1];  // distances for row i-1
    int curr[len2 + 1];  // distances for row i

    // Row 0: transforming the empty prefix of s1 into s2[0..j) costs j insertions
    for (int j = 0; j <= len2; j++) prev[j] = j;

    for (int i = 1; i <= len1; i++) {
        int c1 = tolower((unsigned char)s1[i - 1]);

        // Column 0: transforming s1[0..i) into the empty string costs i deletions
        curr[0] = i;

        for (int j = 1; j <= len2; j++) {
            int c2 = tolower((unsigned char)s2[j - 1]);
            if (c1 == c2) {
                curr[j] = prev[j - 1];
            } else {
                int min = prev[j - 1];                    // substitution
                if (prev[j] < min) min = prev[j];         // deletion
                if (curr[j - 1] < min) min = curr[j - 1]; // insertion
                curr[j] = min + 1;
            }
        }

        // The finished row becomes the "previous" row for the next iteration
        memcpy(prev, curr, sizeof(int) * (size_t)(len2 + 1));
    }

    return prev[len2];
}
// Calculate similarity between two words based on Levenshtein distance
float word_similarity(const char *word1, const char *word2) {
int len1 = strlen(word1);
int len2 = strlen(word2);
@@ -65,46 +104,43 @@ float word_similarity(const char *word1, const char *word2) {
return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
}
// If one word is significantly shorter than the other, it must be a prefix
if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
// Check if the shorter word is a prefix of the longer word
const char *longer = len1 > len2 ? word1 : word2;
const char *shorter = len1 > len2 ? word2 : word1;
int shorter_len = len1 > len2 ? len2 : len1;
if (strncasecmp(longer, shorter, shorter_len) == 0) {
return 0.8f; // Good prefix match
}
return 0.0f; // Not a prefix match
}
// For words of similar length, calculate similarity
int distance = levenshtein_distance(word1, word2);
int max_len = len1 > len2 ? len1 : len2;
if (max_len == 0) return 0.0f;
// Count matching characters (case insensitive)
int matches = 0;
int i = 0, j = 0;
// Calculate similarity based on edit distance
float similarity = 1.0f - (float)distance / max_len;
while (i < len1 && j < len2) {
if (tolower((unsigned char)word1[i]) == tolower((unsigned char)word2[j])) {
matches++;
i++;
j++;
} else if (tolower((unsigned char)word1[i]) < tolower((unsigned char)word2[j])) {
i++;
} else {
j++;
}
// Adjust similarity based on word lengths
if (len1 != len2) {
float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
similarity *= length_ratio;
}
// Calculate similarity based on matching characters
float char_similarity = (float)matches / max_len;
// Require higher similarity for shorter words
float min_similarity = 0.7f; // Default minimum
if (len1 <= 5 || len2 <= 5) {
min_similarity = 0.9f; // Higher requirement for short words
}
// If similarity is below minimum, return 0
if (char_similarity < min_similarity) {
// For words of similar length, require reasonable similarity
if (similarity < 0.4f) {
return 0.0f;
}
// If words are the same length, give a small boost
if (len1 == len2) {
char_similarity = char_similarity * 1.1f; // 10% boost for same length
if (char_similarity > 1.0f) char_similarity = 1.0f;
// Never return perfect similarity for non-identical words
if (distance > 0) {
similarity = fmin(similarity, 0.9f);
}
return char_similarity;
return similarity;
}
// Calculate similarity between query and target string
@@ -138,7 +174,7 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
}
best_word_similarities[i] = best_similarity;
if (best_similarity >= 0.8f) { // Consider it a match if similarity is high enough
if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable
query_words_found++;
}
}
@@ -156,9 +192,17 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
// Combine scores: 70% weight on word matches, 30% on character similarity
float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
// If all query words were found with high similarity, ensure high overall score
if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) {
similarity = 1.0f; // Perfect match
// Never return perfect similarity unless all words are exact matches
bool all_exact_matches = true;
for (int i = 0; i < query_word_count; i++) {
if (best_word_similarities[i] < 1.0f) {
all_exact_matches = false;
break;
}
}
if (!all_exact_matches) {
similarity = fmin(similarity, 0.9f);
}
free_words(query_words, query_word_count);

14
test.js
View File

@@ -43,14 +43,22 @@ customIndex.addString('bizz bio mix light');
// Add multiple strings at once
customIndex.addStrings([
'plant growth bio formula',
'garden soil substrate'
'garden soil substrate',
'plagron light mix',
'Anesia Seeds Imperium X Auto 10',
'anesi'
]);
console.log(`Custom index created with ${customIndex.size()} strings`);
// Search with a higher similarity threshold
console.log('\nSearching with higher similarity threshold (0.3):');
const results = customIndex.search('bio bizz', 0.3);
// Fuzzy multi-word query: the 0.1 threshold is low on purpose so that partial
// and misspelled matches are listed together with their scores.
console.log('\nSearching with lower similarity threshold (0.1) for "amnesia haze":');
const results = customIndex.search('amnesia haze', 0.1);
results.forEach((match) => {
  console.log(` ${match.similarity.toFixed(2)}: ${match.string}`);
});

// Single-token query written without a space, searched with the same low threshold.
console.log('\nSearching with lower similarity threshold (0.1) for "lightmix":');
const results2 = customIndex.search('lightmix', 0.1);
results2.forEach((match) => {
  console.log(` ${match.similarity.toFixed(2)}: ${match.string}`);
});