Refactor the word similarity calculation in similarity_search.c to simplify the scoring logic. Replace prefix matching with Levenshtein distance for improved accuracy, and adjust the similarity scoring to boost results for small differences. Update the overall similarity calculation to take the mean of the word match ratio and the average word similarity for better performance.

This commit is contained in:
seb
2025-04-18 19:01:22 +02:00
parent e2aacaf54b
commit a9a9247773

View File

@@ -82,38 +82,16 @@ float word_similarity(const char *word1, const char *word2) {
return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
}
// If one word is significantly shorter than the other, it must be a prefix
if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
// Check if the shorter word is a prefix of the longer word
const char *longer = len1 > len2 ? word1 : word2;
const char *shorter = len1 > len2 ? word2 : word1;
int shorter_len = len1 > len2 ? len2 : len1;
if (strncasecmp(longer, shorter, shorter_len) == 0) {
return 0.8f; // Good prefix match
}
return 0.0f; // Not a prefix match
}
// For words of similar length, calculate similarity
// Calculate Levenshtein distance
int distance = levenshtein_distance(word1, word2);
int max_len = len1 > len2 ? len1 : len2;
// Calculate similarity based on edit distance with exponential decay
// Simple linear scoring: 1.0 for exact match, 0.9 for one char difference, etc.
float similarity = 1.0f - (float)distance / max_len;
// Apply exponential decay to make it more sensitive to differences
similarity = pow(similarity, 3.0f);
// Adjust similarity based on word lengths
if (len1 != len2) {
float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
similarity *= length_ratio;
}
// For words of similar length, require reasonable similarity
if (similarity < 0.4f) {
return 0.0f;
// Boost similarity for small differences
if (distance <= 1) {
similarity = 0.9f + (similarity * 0.1f);
}
return similarity;
@@ -143,40 +121,34 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
float best_similarity = 0.0f;
for (int j = 0; j < target_word_count; j++) {
/* quick length-difference filter (early exit #4) */
int l1 = strlen(query_words[i]), l2 = strlen(target_words[j]);
if (l1 < l2 * 0.5f || l2 < l1 * 0.5f) continue;
float similarity = word_similarity(query_words[i], target_words[j]);
if (similarity > best_similarity) {
best_similarity = similarity;
if (best_similarity >= 0.90f) break; /* early exit #4 */
}
}
best_word_similarities[i] = best_similarity;
if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable
if (best_similarity >= 0.4f) {
query_words_found++;
}
}
// Calculate overall similarity
float word_match_score = (float)query_words_found / query_word_count;
// Calculate average of best word similarities
// Calculate average word similarity
float avg_word_similarity = 0.0f;
for (int i = 0; i < query_word_count; i++) {
avg_word_similarity += best_word_similarities[i];
}
avg_word_similarity /= query_word_count;
// Combine scores: 60% weight on word matches, 40% on character similarity
// This gives more weight to finding all words, regardless of order
float similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f);
// Calculate word match ratio
float word_match_ratio = (float)query_words_found / query_word_count;
// If all words are found, boost the score
// Final score is the average of word match ratio and average word similarity
float similarity = (word_match_ratio + avg_word_similarity) / 2.0f;
// Boost score if all words are found
if (query_words_found == query_word_count) {
similarity = 0.7f + (similarity * 0.3f);
similarity = 0.8f + (similarity * 0.2f);
}
free_words(query_buf);