diff --git a/similarity_search.c b/similarity_search.c index 98de023..68cc9b7 100644 --- a/similarity_search.c +++ b/similarity_search.c @@ -82,38 +82,16 @@ float word_similarity(const char *word1, const char *word2) { return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f; } - // If one word is significantly shorter than the other, it must be a prefix - if (len1 < len2 * 0.7 || len2 < len1 * 0.7) { - // Check if the shorter word is a prefix of the longer word - const char *longer = len1 > len2 ? word1 : word2; - const char *shorter = len1 > len2 ? word2 : word1; - int shorter_len = len1 > len2 ? len2 : len1; - - if (strncasecmp(longer, shorter, shorter_len) == 0) { - return 0.8f; // Good prefix match - } - return 0.0f; // Not a prefix match - } - - // For words of similar length, calculate similarity + // Calculate Levenshtein distance int distance = levenshtein_distance(word1, word2); int max_len = len1 > len2 ? len1 : len2; - // Calculate similarity based on edit distance with exponential decay + // Simple linear scoring: 1.0 for exact match, 0.9 for one char difference, etc. float similarity = 1.0f - (float)distance / max_len; - // Apply exponential decay to make it more sensitive to differences - similarity = pow(similarity, 3.0f); - - // Adjust similarity based on word lengths - if (len1 != len2) { - float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2); - similarity *= length_ratio; - } - - // For words of similar length, require reasonable similarity - if (similarity < 0.4f) { - return 0.0f; + // Boost similarity for small differences + if (distance <= 1) { + similarity = 0.9f + (similarity * 0.1f); } return similarity; @@ -143,40 +121,34 @@ float calculate_similarity(const char *query, const char *target, float cutoff) float best_similarity = 0.0f; for (int j = 0; j < target_word_count; j++) { - /* quick length‑difference filter (early‑exit #4) */ - int l1 = strlen(query_words[i]), l2 = strlen(target_words[j]); - if (l1 < l2 * 0.5f || l2 < l1 * 0.5f) continue; - float similarity = word_similarity(query_words[i], target_words[j]); if (similarity > best_similarity) { best_similarity = similarity; - if (best_similarity >= 0.90f) break; /* early exit #4 */ } } best_word_similarities[i] = best_similarity; - if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable + if (best_similarity >= 0.4f) { query_words_found++; } } - // Calculate overall similarity - float word_match_score = (float)query_words_found / query_word_count; - - // Calculate average of best word similarities + // Calculate average word similarity float avg_word_similarity = 0.0f; for (int i = 0; i < query_word_count; i++) { avg_word_similarity += best_word_similarities[i]; } avg_word_similarity /= query_word_count; - // Combine scores: 60% weight on word matches, 40% on character similarity - // This gives more weight to finding all words, regardless of order - float similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f); + // Calculate word match ratio + float word_match_ratio = (float)query_words_found / query_word_count; - // If all words are found, boost the score + // Final score is the average of word match ratio and average word similarity + float similarity = (word_match_ratio + avg_word_similarity) / 2.0f; + + // Boost score if all words are found if (query_words_found == query_word_count) { - similarity = 0.7f + (similarity * 0.3f); + similarity = 0.8f + (similarity * 0.2f); } free_words(query_buf);