Implement Levenshtein distance calculation for improved word similarity in similarity_search.c. Adjust similarity thresholds and scoring logic to enhance accuracy, particularly for prefix matches and varying word lengths. Update test.js to reflect new search scenarios with lower similarity thresholds.
This commit is contained in:
@@ -3,6 +3,8 @@
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <ctype.h>
|
||||
#include <math.h>
|
||||
#include <stdbool.h>
|
||||
#include "similarity_search.h"
|
||||
|
||||
// Case insensitive string comparison
|
||||
@@ -55,7 +57,44 @@ void free_words(char *words[], int word_count) {
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate similarity between two words based on character matching
|
||||
// Calculate the case-insensitive Levenshtein (edit) distance between two strings.
//
// Returns the minimum number of single-character insertions, deletions and
// substitutions needed to turn s1 into s2, where characters are compared
// after tolower(). A NULL argument is treated as an empty string.
//
// Characters are lowercased on the fly instead of being copied into
// fixed-size scratch buffers, so inputs of any length are handled without
// overflowing a buffer. Only a single row of the distance table is kept
// (sized by the shorter string) because each cell depends solely on the
// previous row and the cell to its left.
int levenshtein_distance(const char *s1, const char *s2) {
    if (s1 == NULL) s1 = "";
    if (s2 == NULL) s2 = "";

    int len1 = strlen(s1);
    int len2 = strlen(s2);

    // The distance is symmetric, so put the shorter string on the column
    // axis to keep the row buffer as small as possible.
    const char *longer = s1;
    const char *shorter = s2;
    int longer_len = len1;
    int shorter_len = len2;
    if (len2 > len1) {
        longer = s2;
        shorter = s1;
        longer_len = len2;
        shorter_len = len1;
    }

    // row[j] holds the distance between the first i characters of `longer`
    // and the first j characters of `shorter`; it starts as row i == 0.
    int row[shorter_len + 1];
    for (int j = 0; j <= shorter_len; j++) row[j] = j;

    for (int i = 1; i <= longer_len; i++) {
        int current = tolower((unsigned char)longer[i - 1]);
        int diagonal = row[0]; // value of cell (i-1, j-1) for j == 1
        row[0] = i;            // deleting i characters yields the empty prefix

        for (int j = 1; j <= shorter_len; j++) {
            int above = row[j]; // cell (i-1, j), saved before it is overwritten

            if (current == tolower((unsigned char)shorter[j - 1])) {
                row[j] = diagonal; // characters match: no extra cost
            } else {
                int min = diagonal;                     // substitution
                if (above < min) min = above;           // deletion
                if (row[j - 1] < min) min = row[j - 1]; // insertion
                row[j] = min + 1;
            }

            diagonal = above; // becomes cell (i-1, j-1) for the next column
        }
    }

    return row[shorter_len];
}
|
||||
|
||||
// Calculate similarity between two words based on Levenshtein distance
|
||||
float word_similarity(const char *word1, const char *word2) {
|
||||
int len1 = strlen(word1);
|
||||
int len2 = strlen(word2);
|
||||
@@ -65,46 +104,43 @@ float word_similarity(const char *word1, const char *word2) {
|
||||
return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
|
||||
}
|
||||
|
||||
int max_len = len1 > len2 ? len1 : len2;
|
||||
if (max_len == 0) return 0.0f;
|
||||
|
||||
// Count matching characters (case insensitive)
|
||||
int matches = 0;
|
||||
int i = 0, j = 0;
|
||||
|
||||
while (i < len1 && j < len2) {
|
||||
if (tolower((unsigned char)word1[i]) == tolower((unsigned char)word2[j])) {
|
||||
matches++;
|
||||
i++;
|
||||
j++;
|
||||
} else if (tolower((unsigned char)word1[i]) < tolower((unsigned char)word2[j])) {
|
||||
i++;
|
||||
} else {
|
||||
j++;
|
||||
// If one word is significantly shorter than the other, it must be a prefix
|
||||
if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
|
||||
// Check if the shorter word is a prefix of the longer word
|
||||
const char *longer = len1 > len2 ? word1 : word2;
|
||||
const char *shorter = len1 > len2 ? word2 : word1;
|
||||
int shorter_len = len1 > len2 ? len2 : len1;
|
||||
|
||||
if (strncasecmp(longer, shorter, shorter_len) == 0) {
|
||||
return 0.8f; // Good prefix match
|
||||
}
|
||||
return 0.0f; // Not a prefix match
|
||||
}
|
||||
|
||||
// Calculate similarity based on matching characters
|
||||
float char_similarity = (float)matches / max_len;
|
||||
// For words of similar length, calculate similarity
|
||||
int distance = levenshtein_distance(word1, word2);
|
||||
int max_len = len1 > len2 ? len1 : len2;
|
||||
|
||||
// Require higher similarity for shorter words
|
||||
float min_similarity = 0.7f; // Default minimum
|
||||
if (len1 <= 5 || len2 <= 5) {
|
||||
min_similarity = 0.9f; // Higher requirement for short words
|
||||
// Calculate similarity based on edit distance
|
||||
float similarity = 1.0f - (float)distance / max_len;
|
||||
|
||||
// Adjust similarity based on word lengths
|
||||
if (len1 != len2) {
|
||||
float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
|
||||
similarity *= length_ratio;
|
||||
}
|
||||
|
||||
// If similarity is below minimum, return 0
|
||||
if (char_similarity < min_similarity) {
|
||||
// For words of similar length, require reasonable similarity
|
||||
if (similarity < 0.4f) {
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
// If words are the same length, give a small boost
|
||||
if (len1 == len2) {
|
||||
char_similarity = char_similarity * 1.1f; // 10% boost for same length
|
||||
if (char_similarity > 1.0f) char_similarity = 1.0f;
|
||||
// Never return perfect similarity for non-identical words
|
||||
if (distance > 0) {
|
||||
similarity = fmin(similarity, 0.9f);
|
||||
}
|
||||
|
||||
return char_similarity;
|
||||
return similarity;
|
||||
}
|
||||
|
||||
// Calculate similarity between query and target string
|
||||
@@ -138,7 +174,7 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
|
||||
}
|
||||
|
||||
best_word_similarities[i] = best_similarity;
|
||||
if (best_similarity >= 0.8f) { // Consider it a match if similarity is high enough
|
||||
if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable
|
||||
query_words_found++;
|
||||
}
|
||||
}
|
||||
@@ -156,9 +192,17 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
|
||||
// Combine scores: 70% weight on word matches, 30% on character similarity
|
||||
float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
|
||||
|
||||
// If all query words were found with high similarity, ensure high overall score
|
||||
if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) {
|
||||
similarity = 1.0f; // Perfect match
|
||||
// Never return perfect similarity unless all words are exact matches
|
||||
bool all_exact_matches = true;
|
||||
for (int i = 0; i < query_word_count; i++) {
|
||||
if (best_word_similarities[i] < 1.0f) {
|
||||
all_exact_matches = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!all_exact_matches) {
|
||||
similarity = fmin(similarity, 0.9f);
|
||||
}
|
||||
|
||||
free_words(query_words, query_word_count);
|
||||
|
||||
14
test.js
14
test.js
@@ -43,14 +43,22 @@ customIndex.addString('bizz bio mix light');
|
||||
// Add multiple strings at once
|
||||
customIndex.addStrings([
|
||||
'plant growth bio formula',
|
||||
'garden soil substrate'
|
||||
'garden soil substrate',
|
||||
'plagron light mix',
|
||||
'Anesia Seeds Imperium X Auto 10',
|
||||
'anesi'
|
||||
]);
|
||||
|
||||
console.log(`Custom index created with ${customIndex.size()} strings`);
|
||||
|
||||
// Search with a higher similarity threshold
|
||||
console.log('\nSearching with higher similarity threshold (0.3):');
|
||||
const results = customIndex.search('bio bizz', 0.3);
|
||||
console.log('\nSearching with higher similarity threshold (0.1) for "amnesia":');
|
||||
const results = customIndex.search('amnesia haze', 0.1);
|
||||
results.forEach(match => {
|
||||
console.log(` ${match.similarity.toFixed(2)}: ${match.string}`);
|
||||
});
|
||||
console.log('\nSearching with higher similarity threshold (0.1) for "lightmix":');
|
||||
const results2 = customIndex.search('lightmix', 0.1);
|
||||
results2.forEach(match => {
|
||||
console.log(` ${match.similarity.toFixed(2)}: ${match.string}`);
|
||||
});
|
||||
Reference in New Issue
Block a user