Implement Levenshtein distance calculation for improved word similarity in similarity_search.c. Adjust similarity thresholds and scoring logic to enhance accuracy, particularly for prefix matches and varying word lengths. Update test.js to reflect new search scenarios with lower similarity thresholds.
This commit is contained in:
@@ -3,6 +3,8 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdbool.h>
|
||||||
#include "similarity_search.h"
|
#include "similarity_search.h"
|
||||||
|
|
||||||
// Case insensitive string comparison
|
// Case insensitive string comparison
|
||||||
@@ -55,7 +57,44 @@ void free_words(char *words[], int word_count) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate similarity between two words based on character matching
|
// Calculate Levenshtein distance between two strings
|
||||||
|
// Compute the case-insensitive Levenshtein (edit) distance between two strings.
//
// The distance is the minimum number of single-character insertions,
// deletions and substitutions needed to turn one string into the other.
// Characters are compared after tolower(), so "Hello" and "hELLO" are at
// distance 0.
//
// Parameters:
//   s1, s2 - NUL-terminated strings; a NULL pointer is treated as "".
//
// Returns:
//   The edit distance (0 when the strings are equal ignoring case).
//
// Implementation notes:
//   - Characters are lowercased on the fly rather than copied into
//     fixed-size scratch buffers, so inputs of any length are safe.
//   - Only one row of the dynamic-programming table is kept, so the extra
//     memory is proportional to the shorter string rather than to
//     len1 * len2.
int levenshtein_distance(const char *s1, const char *s2) {
    if (s1 == NULL) s1 = "";
    if (s2 == NULL) s2 = "";

    size_t len1 = strlen(s1);
    size_t len2 = strlen(s2);

    // Edit distance is symmetric; keep the shorter string in s2 so the
    // working row is as small as possible.
    if (len1 < len2) {
        const char *tmp_str = s1;
        s1 = s2;
        s2 = tmp_str;

        size_t tmp_len = len1;
        len1 = len2;
        len2 = tmp_len;
    }

    // Turning a string into the empty string takes one deletion per character.
    if (len2 == 0) return (int)len1;

    // row[j] holds the distance between the first i characters of s1 and
    // the first j characters of s2 for the row currently being processed.
    int row[len2 + 1];
    for (size_t j = 0; j <= len2; j++) row[j] = (int)j;

    for (size_t i = 1; i <= len1; i++) {
        int c1 = tolower((unsigned char)s1[i - 1]);

        // diag tracks the value of cell (i-1, j-1) before it is overwritten.
        int diag = row[0];
        row[0] = (int)i;

        for (size_t j = 1; j <= len2; j++) {
            int up = row[j]; // cell (i-1, j), i.e. the deletion source

            if (c1 == tolower((unsigned char)s2[j - 1])) {
                row[j] = diag; // characters match: no extra edit
            } else {
                int best = diag;                        // substitution
                if (up < best) best = up;               // deletion
                if (row[j - 1] < best) best = row[j - 1]; // insertion
                row[j] = best + 1;
            }

            diag = up;
        }
    }

    return row[len2];
}
|
||||||
|
|
||||||
|
// Calculate similarity between two words based on Levenshtein distance
|
||||||
float word_similarity(const char *word1, const char *word2) {
|
float word_similarity(const char *word1, const char *word2) {
|
||||||
int len1 = strlen(word1);
|
int len1 = strlen(word1);
|
||||||
int len2 = strlen(word2);
|
int len2 = strlen(word2);
|
||||||
@@ -65,46 +104,43 @@ float word_similarity(const char *word1, const char *word2) {
|
|||||||
return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
|
return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
|
||||||
}
|
}
|
||||||
|
|
||||||
int max_len = len1 > len2 ? len1 : len2;
|
// If one word is significantly shorter than the other, it must be a prefix
|
||||||
if (max_len == 0) return 0.0f;
|
if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
|
||||||
|
// Check if the shorter word is a prefix of the longer word
|
||||||
// Count matching characters (case insensitive)
|
const char *longer = len1 > len2 ? word1 : word2;
|
||||||
int matches = 0;
|
const char *shorter = len1 > len2 ? word2 : word1;
|
||||||
int i = 0, j = 0;
|
int shorter_len = len1 > len2 ? len2 : len1;
|
||||||
|
|
||||||
while (i < len1 && j < len2) {
|
if (strncasecmp(longer, shorter, shorter_len) == 0) {
|
||||||
if (tolower((unsigned char)word1[i]) == tolower((unsigned char)word2[j])) {
|
return 0.8f; // Good prefix match
|
||||||
matches++;
|
|
||||||
i++;
|
|
||||||
j++;
|
|
||||||
} else if (tolower((unsigned char)word1[i]) < tolower((unsigned char)word2[j])) {
|
|
||||||
i++;
|
|
||||||
} else {
|
|
||||||
j++;
|
|
||||||
}
|
}
|
||||||
|
return 0.0f; // Not a prefix match
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate similarity based on matching characters
|
// For words of similar length, calculate similarity
|
||||||
float char_similarity = (float)matches / max_len;
|
int distance = levenshtein_distance(word1, word2);
|
||||||
|
int max_len = len1 > len2 ? len1 : len2;
|
||||||
|
|
||||||
// Require higher similarity for shorter words
|
// Calculate similarity based on edit distance
|
||||||
float min_similarity = 0.7f; // Default minimum
|
float similarity = 1.0f - (float)distance / max_len;
|
||||||
if (len1 <= 5 || len2 <= 5) {
|
|
||||||
min_similarity = 0.9f; // Higher requirement for short words
|
// Adjust similarity based on word lengths
|
||||||
|
if (len1 != len2) {
|
||||||
|
float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
|
||||||
|
similarity *= length_ratio;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If similarity is below minimum, return 0
|
// For words of similar length, require reasonable similarity
|
||||||
if (char_similarity < min_similarity) {
|
if (similarity < 0.4f) {
|
||||||
return 0.0f;
|
return 0.0f;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If words are the same length, give a small boost
|
// Never return perfect similarity for non-identical words
|
||||||
if (len1 == len2) {
|
if (distance > 0) {
|
||||||
char_similarity = char_similarity * 1.1f; // 10% boost for same length
|
similarity = fmin(similarity, 0.9f);
|
||||||
if (char_similarity > 1.0f) char_similarity = 1.0f;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return char_similarity;
|
return similarity;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate similarity between query and target string
|
// Calculate similarity between query and target string
|
||||||
@@ -138,7 +174,7 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
|
|||||||
}
|
}
|
||||||
|
|
||||||
best_word_similarities[i] = best_similarity;
|
best_word_similarities[i] = best_similarity;
|
||||||
if (best_similarity >= 0.8f) { // Consider it a match if similarity is high enough
|
if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable
|
||||||
query_words_found++;
|
query_words_found++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -156,9 +192,17 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
|
|||||||
// Combine scores: 70% weight on word matches, 30% on character similarity
|
// Combine scores: 70% weight on word matches, 30% on character similarity
|
||||||
float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
|
float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
|
||||||
|
|
||||||
// If all query words were found with high similarity, ensure high overall score
|
// Never return perfect similarity unless all words are exact matches
|
||||||
if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) {
|
bool all_exact_matches = true;
|
||||||
similarity = 1.0f; // Perfect match
|
for (int i = 0; i < query_word_count; i++) {
|
||||||
|
if (best_word_similarities[i] < 1.0f) {
|
||||||
|
all_exact_matches = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!all_exact_matches) {
|
||||||
|
similarity = fmin(similarity, 0.9f);
|
||||||
}
|
}
|
||||||
|
|
||||||
free_words(query_words, query_word_count);
|
free_words(query_words, query_word_count);
|
||||||
|
|||||||
14
test.js
14
test.js
@@ -43,14 +43,22 @@ customIndex.addString('bizz bio mix light');
|
|||||||
// Add multiple strings at once
|
// Add multiple strings at once
|
||||||
customIndex.addStrings([
|
customIndex.addStrings([
|
||||||
'plant growth bio formula',
|
'plant growth bio formula',
|
||||||
'garden soil substrate'
|
'garden soil substrate',
|
||||||
|
'plagron light mix',
|
||||||
|
'Anesia Seeds Imperium X Auto 10',
|
||||||
|
'anesi'
|
||||||
]);
|
]);
|
||||||
|
|
||||||
console.log(`Custom index created with ${customIndex.size()} strings`);
|
console.log(`Custom index created with ${customIndex.size()} strings`);
|
||||||
|
|
||||||
// Search with a higher similarity threshold
|
// Search with a higher similarity threshold
|
||||||
console.log('\nSearching with higher similarity threshold (0.3):');
|
console.log('\nSearching with higher similarity threshold (0.1) for "amnesia":');
|
||||||
const results = customIndex.search('bio bizz', 0.3);
|
const results = customIndex.search('amnesia haze', 0.1);
|
||||||
results.forEach(match => {
|
results.forEach(match => {
|
||||||
console.log(` ${match.similarity.toFixed(2)}: ${match.string}`);
|
console.log(` ${match.similarity.toFixed(2)}: ${match.string}`);
|
||||||
|
});
|
||||||
|
console.log('\nSearching with higher similarity threshold (0.1) for "lightmix":');
|
||||||
|
const results2 = customIndex.search('lightmix', 0.1);
|
||||||
|
results2.forEach(match => {
|
||||||
|
console.log(` ${match.similarity.toFixed(2)}: ${match.string}`);
|
||||||
});
|
});
|
||||||
Reference in New Issue
Block a user