Refine word similarity calculation in similarity_search.c by enforcing exact matches for short words and adjusting similarity thresholds. Increase weight of word matches in overall similarity score calculation.

2025-04-18 09:32:44 +02:00
parent 53da84fbcf
commit e94c034927
1 changed file with 22 additions and 5 deletions
--- a/similarity_search.c
+++ b/similarity_search.c
@@ -59,13 +59,19 @@ void free_words(char *words[], int word_count) {
 float word_similarity(const char *word1, const char *word2) {
    int len1 = strlen(word1);
    int len2 = strlen(word2);
    // For very short words (3 chars or less), require exact match
    if (len1 <= 3 || len2 <= 3) {
        return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
    }
    int max_len = len1 > len2 ? len1 : len2;
    if (max_len == 0) return 0.0f;
    // Count matching characters (case insensitive)
    int matches = 0;
    int i = 0, j = 0;
    // Count matching characters (case insensitive)
    while (i < len1 && j < len2) {
        if (tolower((unsigned char)word1[i]) == tolower((unsigned char)word2[j])) {
            matches++;
@@ -81,9 +87,20 @@ float word_similarity(const char *word1, const char *word2) {
    // Calculate similarity based on matching characters
    float char_similarity = (float)matches / max_len;
-    // If words are the same length, boost similarity
+    // Require higher similarity for shorter words
    float min_similarity = 0.7f; // Default minimum
    if (len1 <= 5 || len2 <= 5) {
        min_similarity = 0.9f; // Higher requirement for short words
    }
    // If similarity is below minimum, return 0
    if (char_similarity < min_similarity) {
        return 0.0f;
    }
    // If words are the same length, give a small boost
    if (len1 == len2) {
-        char_similarity = char_similarity * 1.2f; // 20% boost for same length
+        char_similarity = char_similarity * 1.1f; // 10% boost for same length
        if (char_similarity > 1.0f) char_similarity = 1.0f;
    }
@@ -136,8 +153,8 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
    }
    avg_word_similarity /= query_word_count;
-    // Combine scores: 60% weight on word matches, 40% on character similarity
+    // Combine scores: 70% weight on word matches, 30% on character similarity
-    float similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f);
+    float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
    // If all query words were found with high similarity, ensure high overall score
    if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) {