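Refine word similarity calculation in similarity_search.c: require an exact match (via str_case_cmp) when either word is 3 characters or shorter, return 0 when character similarity falls below a minimum threshold (0.9 if either word is 5 characters or shorter, 0.7 otherwise), and reduce the same-length boost from 20% to 10%. In the overall similarity score, increase the weight of word matches from 60% to 70% and lower the weight of average word similarity from 40% to 30%.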
This commit is contained in:
@@ -59,13 +59,19 @@ void free_words(char *words[], int word_count) {
|
|||||||
float word_similarity(const char *word1, const char *word2) {
|
float word_similarity(const char *word1, const char *word2) {
|
||||||
int len1 = strlen(word1);
|
int len1 = strlen(word1);
|
||||||
int len2 = strlen(word2);
|
int len2 = strlen(word2);
|
||||||
|
|
||||||
|
// If either word is very short (3 chars or fewer), require an exact match via str_case_cmp
|
||||||
|
if (len1 <= 3 || len2 <= 3) {
|
||||||
|
return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
|
||||||
|
}
|
||||||
|
|
||||||
int max_len = len1 > len2 ? len1 : len2;
|
int max_len = len1 > len2 ? len1 : len2;
|
||||||
if (max_len == 0) return 0.0f;
|
if (max_len == 0) return 0.0f;
|
||||||
|
|
||||||
|
// Count matching characters (case insensitive)
|
||||||
int matches = 0;
|
int matches = 0;
|
||||||
int i = 0, j = 0;
|
int i = 0, j = 0;
|
||||||
|
|
||||||
// Count matching characters (case insensitive)
|
|
||||||
while (i < len1 && j < len2) {
|
while (i < len1 && j < len2) {
|
||||||
if (tolower((unsigned char)word1[i]) == tolower((unsigned char)word2[j])) {
|
if (tolower((unsigned char)word1[i]) == tolower((unsigned char)word2[j])) {
|
||||||
matches++;
|
matches++;
|
||||||
@@ -81,9 +87,20 @@ float word_similarity(const char *word1, const char *word2) {
|
|||||||
// Calculate similarity based on matching characters
|
// Calculate similarity based on matching characters
|
||||||
float char_similarity = (float)matches / max_len;
|
float char_similarity = (float)matches / max_len;
|
||||||
|
|
||||||
// If words are the same length, boost similarity
|
// Require higher similarity for shorter words
|
||||||
|
float min_similarity = 0.7f; // Default minimum
|
||||||
|
if (len1 <= 5 || len2 <= 5) {
|
||||||
|
min_similarity = 0.9f; // Higher requirement for short words
|
||||||
|
}
|
||||||
|
|
||||||
|
// If similarity is below minimum, return 0
|
||||||
|
if (char_similarity < min_similarity) {
|
||||||
|
return 0.0f;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If words are the same length, give a small boost
|
||||||
if (len1 == len2) {
|
if (len1 == len2) {
|
||||||
char_similarity = char_similarity * 1.2f; // 20% boost for same length
|
char_similarity = char_similarity * 1.1f; // 10% boost for same length
|
||||||
if (char_similarity > 1.0f) char_similarity = 1.0f;
|
if (char_similarity > 1.0f) char_similarity = 1.0f;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -136,8 +153,8 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
|
|||||||
}
|
}
|
||||||
avg_word_similarity /= query_word_count;
|
avg_word_similarity /= query_word_count;
|
||||||
|
|
||||||
// Combine scores: 60% weight on word matches, 40% on character similarity
|
// Combine scores: 70% weight on word matches, 30% on character similarity
|
||||||
float similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f);
|
float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
|
||||||
|
|
||||||
// If all query words were found with high similarity, ensure high overall score
|
// If all query words were found with high similarity, ensure high overall score
|
||||||
if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) {
|
if (query_words_found == query_word_count && avg_word_similarity >= 0.8f) {
|
||||||
|
|||||||
Reference in New Issue
Block a user