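Refactor the word similarity calculation in similarity_search.c to simplify the scoring logic. Remove the prefix-matching special case for words of very different lengths so that every word pair is scored by Levenshtein distance for improved accuracy, drop the exponential decay, length-ratio adjustment, and 0.4 cutoff from word_similarity, and boost the similarity score when the edit distance is at most one. Update the overall similarity calculation to the plain average of the word match ratio and the average word similarity (replacing the previous 60/40 weighting) for better performance, remove the length-difference filter and the early exit from the word-matching loop, and change the all-words-found boost from 0.7 + 0.3 × score to 0.8 + 0.2 × score.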

This commit is contained in:
seb
2025-04-18 19:01:22 +02:00
parent e2aacaf54b
commit a9a9247773

View File

@@ -82,38 +82,16 @@ float word_similarity(const char *word1, const char *word2) {
return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f; return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
} }
// If one word is significantly shorter than the other, it must be a prefix // Calculate Levenshtein distance
if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
// Check if the shorter word is a prefix of the longer word
const char *longer = len1 > len2 ? word1 : word2;
const char *shorter = len1 > len2 ? word2 : word1;
int shorter_len = len1 > len2 ? len2 : len1;
if (strncasecmp(longer, shorter, shorter_len) == 0) {
return 0.8f; // Good prefix match
}
return 0.0f; // Not a prefix match
}
// For words of similar length, calculate similarity
int distance = levenshtein_distance(word1, word2); int distance = levenshtein_distance(word1, word2);
int max_len = len1 > len2 ? len1 : len2; int max_len = len1 > len2 ? len1 : len2;
// Calculate similarity based on edit distance with exponential decay // Simple linear scoring: 1.0 for exact match, 0.9 for one char difference, etc.
float similarity = 1.0f - (float)distance / max_len; float similarity = 1.0f - (float)distance / max_len;
// Apply exponential decay to make it more sensitive to differences // Boost similarity for small differences
similarity = pow(similarity, 3.0f); if (distance <= 1) {
similarity = 0.9f + (similarity * 0.1f);
// Adjust similarity based on word lengths
if (len1 != len2) {
float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
similarity *= length_ratio;
}
// For words of similar length, require reasonable similarity
if (similarity < 0.4f) {
return 0.0f;
} }
return similarity; return similarity;
@@ -143,40 +121,34 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
float best_similarity = 0.0f; float best_similarity = 0.0f;
for (int j = 0; j < target_word_count; j++) { for (int j = 0; j < target_word_count; j++) {
/* quick length-difference filter (early-exit #4) */
int l1 = strlen(query_words[i]), l2 = strlen(target_words[j]);
if (l1 < l2 * 0.5f || l2 < l1 * 0.5f) continue;
float similarity = word_similarity(query_words[i], target_words[j]); float similarity = word_similarity(query_words[i], target_words[j]);
if (similarity > best_similarity) { if (similarity > best_similarity) {
best_similarity = similarity; best_similarity = similarity;
if (best_similarity >= 0.90f) break; /* early exit #4 */
} }
} }
best_word_similarities[i] = best_similarity; best_word_similarities[i] = best_similarity;
if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable if (best_similarity >= 0.4f) {
query_words_found++; query_words_found++;
} }
} }
// Calculate overall similarity // Calculate average word similarity
float word_match_score = (float)query_words_found / query_word_count;
// Calculate average of best word similarities
float avg_word_similarity = 0.0f; float avg_word_similarity = 0.0f;
for (int i = 0; i < query_word_count; i++) { for (int i = 0; i < query_word_count; i++) {
avg_word_similarity += best_word_similarities[i]; avg_word_similarity += best_word_similarities[i];
} }
avg_word_similarity /= query_word_count; avg_word_similarity /= query_word_count;
// Combine scores: 60% weight on word matches, 40% on character similarity // Calculate word match ratio
// This gives more weight to finding all words, regardless of order float word_match_ratio = (float)query_words_found / query_word_count;
float similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f);
// If all words are found, boost the score // Final score is the average of word match ratio and average word similarity
float similarity = (word_match_ratio + avg_word_similarity) / 2.0f;
// Boost score if all words are found
if (query_words_found == query_word_count) { if (query_words_found == query_word_count) {
similarity = 0.7f + (similarity * 0.3f); similarity = 0.8f + (similarity * 0.2f);
} }
free_words(query_buf); free_words(query_buf);