Compare commits

...

2 Commits

View File

@@ -22,76 +22,54 @@ int str_case_cmp(const char *s1, const char *s2) {
} }
// Split a string into words
//
// Copies `s` into one heap buffer, cuts that copy at runs of space, tab and
// newline, and stores a pointer to the start of each word in `words`.
//
// Parameters:
//   s       - NUL-terminated input. NULL, or a length of MAX_STRING_LEN or
//             more, yields 0 words.
//   words   - receives at most MAX_WORDS pointers into the buffer; any
//             further words are dropped.
//   storage - out parameter. Always written when non-NULL: the buffer that
//             backs every words[i], or NULL when no buffer was allocated.
//
// Returns the number of words stored. A return of 0 can still leave a buffer
// in *storage (input made only of separators), so the caller must release
// *storage on every path; the words[] pointers are invalid after that.
int split_into_words(const char *s,
                     char *words[MAX_WORDS],
                     char **storage)
{
    static const char separators[] = " \t\n";

    if (!storage) {
        return 0;  // nowhere to hand the buffer to; allocating would leak it
    }
    *storage = NULL;  // defined value on every early return below

    if (!s || !words) {
        return 0;
    }

    size_t len = strlen(s);
    if (len >= MAX_STRING_LEN) {
        return 0;
    }

    char *buf = malloc(len + 1);  // one single allocation
    if (!buf) {
        return 0;
    }
    memcpy(buf, s, len + 1);  // includes the terminating NUL
    *storage = buf;           // hand ownership to caller

    // Scan with strspn/strcspn instead of strtok: same separators and same
    // tokens, but no hidden static state shared with other tokenizers.
    int n = 0;
    char *p = buf;
    while (n < MAX_WORDS) {
        p += strspn(p, separators);  // skip leading separators
        if (*p == '\0') {
            break;
        }
        words[n++] = p;              // pointer into buf
        p += strcspn(p, separators); // advance to the end of this word
        if (*p == '\0') {
            break;
        }
        *p++ = '\0';                 // terminate the word in place
    }
    return n;
}
// Free memory allocated for words
//
// Releases the single buffer that split_into_words() returns through its
// `storage` out parameter. Every word pointer produced by that call points
// into this buffer, so all of them are invalid once this returns.
// Passing NULL is allowed and does nothing.
void free_words(char *storage)
{
    free(storage);  // free(NULL) is a no-op, so no guard is needed
}
// Calculate Levenshtein distance between two strings
//
// Case-insensitive edit distance where each insertion, deletion and
// substitution costs 1. Only two rows of the distance table are kept, sized
// by the shorter of the two strings.
//
// A NULL argument is treated as the empty string.
//
// Returns the distance. If the row buffer cannot be allocated, returns the
// length of the longer string, which is the largest distance possible.
int levenshtein_distance(const char *a, const char *b)
{
    // Both rows fit on the stack while the shorter string has < 64 chars.
    enum { STACK_CELLS = 128 };

    if (!a) {
        a = "";
    }
    if (!b) {
        b = "";
    }

    size_t m = strlen(a);
    size_t n = strlen(b);
    if (m < n) {  // keep `b` as the shorter string so the rows stay small
        const char *swap_str = a;
        a = b;
        b = swap_str;
        size_t swap_len = m;
        m = n;
        n = swap_len;
    }

    size_t width = n + 1;
    int stack_cells[STACK_CELLS];
    int *heap_cells = NULL;
    int *cells = stack_cells;
    if (width > STACK_CELLS / 2) {
        // Heap fallback for long inputs; guard the size computation first.
        if (width > (size_t)-1 / (2 * sizeof *heap_cells)) {
            return (int)m;
        }
        heap_cells = malloc(2 * width * sizeof *heap_cells);
        if (!heap_cells) {
            return (int)m;
        }
        cells = heap_cells;
    }

    int *prev = cells;          // distances for the previous character of a
    int *curr = cells + width;  // distances being filled in for this one

    for (size_t j = 0; j <= n; ++j) {
        prev[j] = (int)j;
    }

    for (size_t i = 1; i <= m; ++i) {
        // <ctype.h> needs a value representable as unsigned char; casting a
        // negative char to plain unsigned would pass an out-of-range value.
        int ca = tolower((unsigned char)a[i - 1]);

        curr[0] = (int)i;
        for (size_t j = 1; j <= n; ++j) {
            int cost = (ca == tolower((unsigned char)b[j - 1])) ? 0 : 1;
            int del = prev[j] + 1;
            int ins = curr[j - 1] + 1;
            int sub = prev[j - 1] + cost;
            int best = del < ins ? del : ins;
            curr[j] = best < sub ? best : sub;
        }

        int *swap_row = prev;
        prev = curr;
        curr = swap_row;
    }

    int distance = prev[n];  // after the final swap, prev is the last row
    free(heap_cells);
    return distance;
}
// Calculate similarity between two words based on Levenshtein distance // Calculate similarity between two words based on Levenshtein distance
@@ -104,40 +82,16 @@ float word_similarity(const char *word1, const char *word2) {
return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f; return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
} }
// If one word is significantly shorter than the other, it must be a prefix // Calculate Levenshtein distance
if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
// Check if the shorter word is a prefix of the longer word
const char *longer = len1 > len2 ? word1 : word2;
const char *shorter = len1 > len2 ? word2 : word1;
int shorter_len = len1 > len2 ? len2 : len1;
if (strncasecmp(longer, shorter, shorter_len) == 0) {
return 0.8f; // Good prefix match
}
return 0.0f; // Not a prefix match
}
// For words of similar length, calculate similarity
int distance = levenshtein_distance(word1, word2); int distance = levenshtein_distance(word1, word2);
int max_len = len1 > len2 ? len1 : len2; int max_len = len1 > len2 ? len1 : len2;
// Calculate similarity based on edit distance // Simple linear scoring: 1.0 for exact match, 0.9 for one char difference, etc.
float similarity = 1.0f - (float)distance / max_len; float similarity = 1.0f - (float)distance / max_len;
// Adjust similarity based on word lengths // Boost similarity for small differences
if (len1 != len2) { if (distance <= 1) {
float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2); similarity = 0.9f + (similarity * 0.1f);
similarity *= length_ratio;
}
// For words of similar length, require reasonable similarity
if (similarity < 0.4f) {
return 0.0f;
}
// Never return perfect similarity for non-identical words
if (distance > 0) {
similarity = fmin(similarity, 0.9f);
} }
return similarity; return similarity;
@@ -146,15 +100,15 @@ float word_similarity(const char *word1, const char *word2) {
// Calculate similarity between query and target string // Calculate similarity between query and target string
float calculate_similarity(const char *query, const char *target, float cutoff) { float calculate_similarity(const char *query, const char *target, float cutoff) {
// Split strings into words // Split strings into words
char *query_words[MAX_WORDS] = {0}; char *query_buf, *target_buf;
char *target_words[MAX_WORDS] = {0}; char *query_words[MAX_WORDS], *target_words[MAX_WORDS];
int query_word_count = split_into_words(query, query_words); int query_word_count = split_into_words(query, query_words, &query_buf);
int target_word_count = split_into_words(target, target_words); int target_word_count = split_into_words(target, target_words, &target_buf);
if (query_word_count == 0 || target_word_count == 0) { if (query_word_count == 0 || target_word_count == 0) {
free_words(query_words, query_word_count); free_words(query_buf);
free_words(target_words, target_word_count); free_words(target_buf);
return 0.0; return 0.0;
} }
@@ -174,39 +128,31 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
} }
best_word_similarities[i] = best_similarity; best_word_similarities[i] = best_similarity;
if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable if (best_similarity >= 0.4f) {
query_words_found++; query_words_found++;
} }
} }
// Calculate overall similarity // Calculate average word similarity
float word_match_score = (float)query_words_found / query_word_count;
// Calculate average of best word similarities
float avg_word_similarity = 0.0f; float avg_word_similarity = 0.0f;
for (int i = 0; i < query_word_count; i++) { for (int i = 0; i < query_word_count; i++) {
avg_word_similarity += best_word_similarities[i]; avg_word_similarity += best_word_similarities[i];
} }
avg_word_similarity /= query_word_count; avg_word_similarity /= query_word_count;
// Combine scores: 70% weight on word matches, 30% on character similarity // Calculate word match ratio
float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f); float word_match_ratio = (float)query_words_found / query_word_count;
// Never return perfect similarity unless all words are exact matches // Final score is the average of word match ratio and average word similarity
bool all_exact_matches = true; float similarity = (word_match_ratio + avg_word_similarity) / 2.0f;
for (int i = 0; i < query_word_count; i++) {
if (best_word_similarities[i] < 1.0f) { // Boost score if all words are found
all_exact_matches = false; if (query_words_found == query_word_count) {
break; similarity = 0.8f + (similarity * 0.2f);
}
} }
if (!all_exact_matches) { free_words(query_buf);
similarity = fmin(similarity, 0.9f); free_words(target_buf);
}
free_words(query_words, query_word_count);
free_words(target_words, target_word_count);
return similarity; return similarity;
} }