Refine word similarity calculation in similarity_search.c by enforcing exact matches for short words and adjusting similarity thresholds. Increase weight of word matches in overall similarity score calculation.

This commit is contained in:
seb
2025-04-18 09:32:44 +02:00
parent 53da84fbcf
commit e94c034927

View File

@@ -59,13 +59,19 @@ void free_words(char *words[], int word_count) {
float word_similarity(const char *word1, const char *word2) { float word_similarity(const char *word1, const char *word2) {
int len1 = strlen(word1); int len1 = strlen(word1);
int len2 = strlen(word2); int len2 = strlen(word2);
// If either word is very short (3 characters or fewer), require an exact match via str_case_cmp
if (len1 <= 3 || len2 <= 3) {
return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
}
int max_len = len1 > len2 ? len1 : len2; int max_len = len1 > len2 ? len1 : len2;
if (max_len == 0) return 0.0f; if (max_len == 0) return 0.0f;
// Count matching characters (case insensitive)
int matches = 0; int matches = 0;
int i = 0, j = 0; int i = 0, j = 0;
// Count matching characters (case insensitive)
while (i < len1 && j < len2) { while (i < len1 && j < len2) {
if (tolower((unsigned char)word1[i]) == tolower((unsigned char)word2[j])) { if (tolower((unsigned char)word1[i]) == tolower((unsigned char)word2[j])) {
matches++; matches++;
@@ -81,9 +87,20 @@ float word_similarity(const char *word1, const char *word2) {
// Calculate similarity based on matching characters // Calculate similarity based on matching characters
float char_similarity = (float)matches / max_len; float char_similarity = (float)matches / max_len;
// If words are the same length, boost similarity // Require higher similarity for shorter words
float min_similarity = 0.7f; // Default minimum
if (len1 <= 5 || len2 <= 5) {
min_similarity = 0.9f; // Higher requirement for short words
}
// If similarity is below minimum, return 0
if (char_similarity < min_similarity) {
return 0.0f;
}
// If words are the same length, give a small boost
if (len1 == len2) { if (len1 == len2) {
char_similarity = char_similarity * 1.2f; // 20% boost for same length char_similarity = char_similarity * 1.1f; // 10% boost for same length
if (char_similarity > 1.0f) char_similarity = 1.0f; if (char_similarity > 1.0f) char_similarity = 1.0f;
} }
@@ -136,8 +153,8 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
} }
avg_word_similarity /= query_word_count; avg_word_similarity /= query_word_count;
// Combine scores: 60% weight on word matches, 40% on character similarity // Combine scores: 70% weight on word matches, 30% on character similarity
float similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f); float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
// If all query words were found with high similarity, ensure high overall score // If all query words were found with high similarity, ensure high overall score
if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) { if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) {