Add word similarity calculation to enhance overall similarity scoring in calculate_similarity function. Implement character matching logic and boost score for same-length words.

This commit is contained in:
seb
2025-04-18 09:20:44 +02:00
parent 6091cc0b80
commit cd41ca2f52

View File

@@ -55,6 +55,41 @@ void free_words(char *words[], int word_count) {
}
}
/**
 * Calculate the similarity between two words from the letters they share.
 *
 * The score is the size of the case-insensitive multiset intersection of the
 * two words' characters, divided by the length of the longer word. Character
 * order is ignored, so equal-length anagrams score 1.0.
 *
 * Words of identical length get a 20% boost, capped at 1.0.
 *
 * @param word1 First NUL-terminated word (NULL is treated as no match).
 * @param word2 Second NUL-terminated word (NULL is treated as no match).
 * @return Similarity in the range [0.0, 1.0]; 0.0 when both words are empty
 *         or either pointer is NULL.
 */
float word_similarity(const char *word1, const char *word2) {
    /* One bucket per possible byte value (assumes 8-bit char). */
    enum { BYTE_VALUES = 256 };
    const float same_length_boost = 1.2f;

    if (word1 == NULL || word2 == NULL) {
        return 0.0f;
    }

    size_t len1 = strlen(word1);
    size_t len2 = strlen(word2);
    size_t max_len = len1 > len2 ? len1 : len2;
    if (max_len == 0) {
        return 0.0f;
    }

    /* Tally how often each lower-cased character occurs in word1. */
    size_t available[BYTE_VALUES] = {0};
    for (size_t i = 0; i < len1; i++) {
        unsigned char key = (unsigned char)tolower((unsigned char)word1[i]);
        available[key]++;
    }

    /*
     * Consume one tally entry per character of word2. Each character of
     * word1 can be matched at most once, which yields the multiset
     * intersection regardless of the order the letters appear in.
     */
    size_t matches = 0;
    for (size_t j = 0; j < len2; j++) {
        unsigned char key = (unsigned char)tolower((unsigned char)word2[j]);
        if (available[key] > 0) {
            available[key]--;
            matches++;
        }
    }

    float char_similarity = (float)matches / (float)max_len;

    /* Same-length words are more likely to be variants of each other. */
    if (len1 == len2) {
        char_similarity *= same_length_boost;
        if (char_similarity > 1.0f) {
            char_similarity = 1.0f;
        }
    }

    return char_similarity;
}
// Calculate similarity between query and target string
float calculate_similarity(const char *query, const char *target, float cutoff) {
// Split strings into words
@@ -70,32 +105,43 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
return 0.0;
}
// Count matches and track which query words were found
int matches = 0;
// Track best matches for each query word
float best_word_similarities[MAX_WORDS] = {0.0f};
int query_words_found = 0;
int found_query_words[MAX_WORDS] = {0}; // Track which query words were found
// For each query word, find its best match in target words
for (int i = 0; i < query_word_count; i++) {
float best_similarity = 0.0f;
for (int j = 0; j < target_word_count; j++) {
if (str_case_cmp(query_words[i], target_words[j]) == 0) {
matches++;
if (!found_query_words[i]) {
found_query_words[i] = 1;
query_words_found++;
}
break;
float similarity = word_similarity(query_words[i], target_words[j]);
if (similarity > best_similarity) {
best_similarity = similarity;
}
}
best_word_similarities[i] = best_similarity;
if (best_similarity >= 0.8f) { // Consider it a match if similarity is high enough
query_words_found++;
}
}
// Calculate base similarity (intersection over union)
float base_similarity = (float)matches / (query_word_count + target_word_count - matches);
// Calculate overall similarity
float word_match_score = (float)query_words_found / query_word_count;
// If all query words were found, boost the similarity
float similarity = base_similarity;
if (query_words_found == query_word_count) {
// If all query words were found, similarity should be at least 0.8
similarity = base_similarity > 0.8f ? base_similarity : 0.8f;
// Calculate average of best word similarities
float avg_word_similarity = 0.0f;
for (int i = 0; i < query_word_count; i++) {
avg_word_similarity += best_word_similarities[i];
}
avg_word_similarity /= query_word_count;
// Combine scores: 60% weight on word matches, 40% on character similarity
float similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f);
// If all query words were found with high similarity, ensure high overall score
if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) {
similarity = 1.0f; // Perfect match
}
free_words(query_words, query_word_count);