From e94c0349270dbf64909b0f62a212a4e481324bbc Mon Sep 17 00:00:00 2001 From: seb Date: Fri, 18 Apr 2025 09:32:44 +0200 Subject: [PATCH] Refine word similarity calculation in similarity_search.c by enforcing exact matches for short words and adjusting similarity thresholds. Increase weight of word matches in overall similarity score calculation. --- similarity_search.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/similarity_search.c b/similarity_search.c index 87a5329..ac61580 100644 --- a/similarity_search.c +++ b/similarity_search.c @@ -59,13 +59,19 @@ void free_words(char *words[], int word_count) { float word_similarity(const char *word1, const char *word2) { int len1 = strlen(word1); int len2 = strlen(word2); + + // For very short words (3 chars or less), require exact match + if (len1 <= 3 || len2 <= 3) { + return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f; + } + int max_len = len1 > len2 ? len1 : len2; if (max_len == 0) return 0.0f; + // Count matching characters (case insensitive) int matches = 0; int i = 0, j = 0; - // Count matching characters (case insensitive) while (i < len1 && j < len2) { if (tolower((unsigned char)word1[i]) == tolower((unsigned char)word2[j])) { matches++; @@ -81,9 +87,20 @@ float word_similarity(const char *word1, const char *word2) { // Calculate similarity based on matching characters float char_similarity = (float)matches / max_len; - // If words are the same length, boost similarity + // Require higher similarity for shorter words + float min_similarity = 0.7f; // Default minimum + if (len1 <= 5 || len2 <= 5) { + min_similarity = 0.9f; // Higher requirement for short words + } + + // If similarity is below minimum, return 0 + if (char_similarity < min_similarity) { + return 0.0f; + } + + // If words are the same length, give a small boost if (len1 == len2) { - char_similarity = char_similarity * 1.2f; // 20% boost for same length + char_similarity = char_similarity * 1.1f; // 10% boost for same length if (char_similarity > 1.0f) char_similarity = 1.0f; } @@ -136,8 +153,8 @@ float calculate_similarity(const char *query, const char *target, float cutoff) } avg_word_similarity /= query_word_count; - // Combine scores: 60% weight on word matches, 40% on character similarity - float similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f); + // Combine scores: 70% weight on word matches, 30% on character similarity + float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f); // If all query words were found with high similarity, ensure high overall score if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) {