Add word similarity calculation to enhance overall similarity scoring in calculate_similarity function. Implement character matching logic and boost score for same-length words.
This commit is contained in:
@@ -55,6 +55,41 @@ void free_words(char *words[], int word_count) {
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate similarity between two words based on shared characters.
//
// The score is the number of characters the two words have in common
// (case-insensitive, counted with multiplicity) divided by the length of
// the longer word. Words of equal length get a 20% boost, capped at 1.0.
// Character order is not taken into account, so anagrams score 1.0.
//
// Parameters:
//   word1, word2 - NUL-terminated strings to compare; NULL is treated as
//                  "no similarity".
// Returns:
//   A value in [0.0, 1.0]; 0.0 when either pointer is NULL or both words
//   are empty.
float word_similarity(const char *word1, const char *word2) {
    enum { ALPHABET_SIZE = 256 };  // one bucket per byte value (8-bit char)
    const float same_length_boost = 1.2f;

    if (word1 == NULL || word2 == NULL) {
        return 0.0f;
    }

    size_t len1 = strlen(word1);
    size_t len2 = strlen(word2);
    size_t max_len = len1 > len2 ? len1 : len2;
    if (max_len == 0) {
        return 0.0f;
    }

    // Histogram of the case-folded characters of word1. A lock-step walk
    // that advances the "smaller" character only works on sorted input, so
    // the shared characters are counted independently of their position.
    size_t counts[ALPHABET_SIZE] = {0};
    for (size_t i = 0; i < len1; i++) {
        counts[(unsigned char)tolower((unsigned char)word1[i])]++;
    }

    // Each character of word2 consumes at most one occurrence from word1,
    // which yields the size of the multiset intersection.
    size_t matches = 0;
    for (size_t j = 0; j < len2; j++) {
        unsigned char c = (unsigned char)tolower((unsigned char)word2[j]);
        if (counts[c] > 0) {
            counts[c]--;
            matches++;
        }
    }

    float char_similarity = (float)matches / (float)max_len;

    // Same-length words are more likely to be the same word with small
    // differences, so boost them while keeping the score within [0, 1].
    if (len1 == len2) {
        char_similarity *= same_length_boost;
        if (char_similarity > 1.0f) {
            char_similarity = 1.0f;
        }
    }

    return char_similarity;
}
|
||||
|
||||
// Calculate similarity between query and target string
|
||||
float calculate_similarity(const char *query, const char *target, float cutoff) {
|
||||
// Split strings into words
|
||||
@@ -70,32 +105,43 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
// Count matches and track which query words were found
|
||||
int matches = 0;
|
||||
// Track best matches for each query word
|
||||
float best_word_similarities[MAX_WORDS] = {0.0f};
|
||||
int query_words_found = 0;
|
||||
int found_query_words[MAX_WORDS] = {0}; // Track which query words were found
|
||||
|
||||
// For each query word, find its best match in target words
|
||||
for (int i = 0; i < query_word_count; i++) {
|
||||
float best_similarity = 0.0f;
|
||||
|
||||
for (int j = 0; j < target_word_count; j++) {
|
||||
if (str_case_cmp(query_words[i], target_words[j]) == 0) {
|
||||
matches++;
|
||||
if (!found_query_words[i]) {
|
||||
found_query_words[i] = 1;
|
||||
float similarity = word_similarity(query_words[i], target_words[j]);
|
||||
if (similarity > best_similarity) {
|
||||
best_similarity = similarity;
|
||||
}
|
||||
}
|
||||
|
||||
best_word_similarities[i] = best_similarity;
|
||||
if (best_similarity >= 0.8f) { // Consider it a match if similarity is high enough
|
||||
query_words_found++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate base similarity (intersection over union)
|
||||
float base_similarity = (float)matches / (query_word_count + target_word_count - matches);
|
||||
// Calculate overall similarity
|
||||
float word_match_score = (float)query_words_found / query_word_count;
|
||||
|
||||
// If all query words were found, boost the similarity
|
||||
float similarity = base_similarity;
|
||||
if (query_words_found == query_word_count) {
|
||||
// If all query words were found, similarity should be at least 0.8
|
||||
similarity = base_similarity > 0.8f ? base_similarity : 0.8f;
|
||||
// Calculate average of best word similarities
|
||||
float avg_word_similarity = 0.0f;
|
||||
for (int i = 0; i < query_word_count; i++) {
|
||||
avg_word_similarity += best_word_similarities[i];
|
||||
}
|
||||
avg_word_similarity /= query_word_count;
|
||||
|
||||
// Combine scores: 60% weight on word matches, 40% on character similarity
|
||||
float similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f);
|
||||
|
||||
// If all query words were found with high similarity, ensure high overall score
|
||||
if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) {
|
||||
similarity = 1.0f; // Perfect match
|
||||
}
|
||||
|
||||
free_words(query_words, query_word_count);
|
||||
|
||||
Reference in New Issue
Block a user