// Calculate similarity between two words based on character matching.
//
// The score is the number of characters the two words share *in order*
// (length of their longest common subsequence, compared case-insensitively)
// divided by the length of the longer word. Words of equal length receive a
// 20% boost, capped at 1.0.
//
// Parameters:
//   word1, word2 - NUL-terminated words to compare; NULL is treated as
//                  "nothing to compare" and yields 0.0.
//
// Returns a value in [0.0, 1.0]. Two empty words yield 0.0. The result is
// symmetric: word_similarity(a, b) == word_similarity(b, a).
//
// Limitation: the working buffers live on the stack, so only the first
// LCS_ROW_CAP characters of the shorter word take part in matching. The
// divisor is still the full length of the longer word.
float word_similarity(const char *word1, const char *word2) {
    enum { LCS_ROW_CAP = 256 };
    const float same_length_boost = 1.2f;

    if (word1 == NULL || word2 == NULL) {
        return 0.0f;
    }

    size_t len1 = strlen(word1);
    size_t len2 = strlen(word2);
    size_t max_len = len1 > len2 ? len1 : len2;
    if (max_len == 0) {
        return 0.0f;
    }

    // Index the DP row by the shorter word so the fixed buffer goes further.
    const char *shorter = word1;
    const char *longer = word2;
    size_t short_len = len1;
    size_t long_len = len2;
    if (len1 > len2) {
        shorter = word2;
        longer = word1;
        short_len = len2;
        long_len = len1;
    }
    if (short_len > LCS_ROW_CAP) {
        short_len = LCS_ROW_CAP;
    }

    // Case-fold the shorter word once instead of on every comparison.
    // The unsigned char cast keeps tolower() defined for bytes >= 0x80.
    unsigned char folded[LCS_ROW_CAP];
    for (size_t j = 0; j < short_len; j++) {
        folded[j] = (unsigned char)tolower((unsigned char)shorter[j]);
    }

    // Single-row longest-common-subsequence table:
    // row[j + 1] holds the LCS length of longer[0..i] and shorter[0..j].
    // A greedy merge-style walk is not usable here because the characters of
    // a word are not sorted; it would drop matches depending on alphabetical
    // order (e.g. "apple" vs "zapple" would score 0).
    size_t row[LCS_ROW_CAP + 1] = {0};
    for (size_t i = 0; i < long_len; i++) {
        unsigned char c = (unsigned char)tolower((unsigned char)longer[i]);
        size_t diag = 0; // value of the previous row at column j
        for (size_t j = 0; j < short_len; j++) {
            size_t above = row[j + 1]; // previous row, same column
            if (c == folded[j]) {
                row[j + 1] = diag + 1;
            } else if (row[j] > above) {
                row[j + 1] = row[j];
            }
            diag = above;
        }
    }
    size_t matches = row[short_len];

    // Calculate similarity based on matching characters
    float char_similarity = (float)matches / (float)max_len;

    // If words are the same length, boost similarity (capped at 1.0)
    if (len1 == len2) {
        char_similarity *= same_length_boost;
        if (char_similarity > 1.0f) {
            char_similarity = 1.0f;
        }
    }

    return char_similarity;
}
(!found_query_words[i]) { - found_query_words[i] = 1; - query_words_found++; - } - break; + float similarity = word_similarity(query_words[i], target_words[j]); + if (similarity > best_similarity) { + best_similarity = similarity; } } + + best_word_similarities[i] = best_similarity; + if (best_similarity >= 0.8f) { // Consider it a match if similarity is high enough + query_words_found++; + } } - // Calculate base similarity (intersection over union) - float base_similarity = (float)matches / (query_word_count + target_word_count - matches); + // Calculate overall similarity + float word_match_score = (float)query_words_found / query_word_count; - // If all query words were found, boost the similarity - float similarity = base_similarity; - if (query_words_found == query_word_count) { - // If all query words were found, similarity should be at least 0.8 - similarity = base_similarity > 0.8f ? base_similarity : 0.8f; + // Calculate average of best word similarities + float avg_word_similarity = 0.0f; + for (int i = 0; i < query_word_count; i++) { + avg_word_similarity += best_word_similarities[i]; + } + avg_word_similarity /= query_word_count; + + // Combine scores: 60% weight on word matches, 40% on character similarity + float similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f); + + // If all query words were found with high similarity, ensure high overall score + if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) { + similarity = 1.0f; // Perfect match } free_words(query_words, query_word_count);