Compare commits

...

2 Commits

View File

@@ -22,76 +22,54 @@ int str_case_cmp(const char *s1, const char *s2) {
}
// Split a string into whitespace-separated words (space, tab, newline).
//
// The input is copied once into a single heap buffer and cut in place; every
// pointer written to `words` points into that buffer, so the words are valid
// only until the buffer is released.
//
// Parameters:
//   s       - NUL-terminated input. NULL, or a length >= MAX_STRING_LEN, is
//             rejected.
//   words   - receives up to MAX_WORDS pointers to the individual words.
//   storage - out parameter. Always written: NULL when nothing was allocated,
//             otherwise the backing buffer, which the caller owns and must
//             release (free_words / free). Because it is always written, the
//             caller may release it unconditionally, even when 0 is returned.
//
// Returns the number of words stored in `words`; 0 on rejected input,
// allocation failure, or when the input contains no words.
int split_into_words(const char *s,
                     char *words[MAX_WORDS],
                     char **storage)
{
    static const char delims[] = " \t\n";

    if (!storage) {
        return 0;                    /* nowhere to hand the buffer to */
    }
    *storage = NULL;                 /* defined value on every exit path */

    if (!s || !words) {
        return 0;
    }

    size_t len = strlen(s);
    if (len >= MAX_STRING_LEN) {
        return 0;
    }

    char *buf = malloc(len + 1);     /* one single allocation */
    if (!buf) {
        return 0;
    }
    memcpy(buf, s, len + 1);         /* includes the terminating NUL */
    *storage = buf;                  /* hand ownership to the caller */

    // Tokenize with strspn/strcspn instead of strtok: same delimiter
    // semantics, but no hidden static state, so the function is reentrant.
    int n = 0;
    char *p = buf;
    while (n < MAX_WORDS) {
        p += strspn(p, delims);      /* skip leading delimiters */
        if (*p == '\0') {
            break;
        }
        words[n++] = p;              /* pointer into buf */
        p += strcspn(p, delims);     /* advance to the end of this word */
        if (*p == '\0') {
            break;
        }
        *p++ = '\0';                 /* terminate the word in place */
    }
    return n;
}
// Release the backing buffer produced by split_into_words.
//
// All word pointers returned by one split_into_words call point into a single
// allocation, so one free() releases every word at once; the word pointers
// must not be used afterwards.
//
// Parameters:
//   storage - the buffer received through split_into_words' `storage` out
//             parameter. NULL is accepted and ignored (free(NULL) is a no-op).
void free_words(char *storage)
{
    free(storage);
}
// Calculate the case-insensitive Levenshtein (edit) distance between two
// strings.
//
// Uses the rolling single-row form of the dynamic-programming table, so the
// working memory is proportional to the length of the shorter string.
//
// Parameters:
//   a, b - NUL-terminated strings; a NULL pointer is treated as "".
//
// Returns the minimum number of single-character insertions, deletions and
// substitutions that turn one string into the other, comparing characters
// with tolower(). If the row buffer cannot be allocated, the length of the
// longer string is returned: that is the largest distance possible, so
// callers see the strings as maximally different instead of a bogus value.
int levenshtein_distance(const char *a, const char *b)
{
    if (!a) a = "";
    if (!b) b = "";

    size_t m = strlen(a);
    size_t n = strlen(b);

    // The distance is symmetric; keep the shorter string on the column axis
    // so the row buffer is as small as possible.
    if (m < n) {
        const char *swap_str = a;
        a = b;
        b = swap_str;
        size_t swap_len = m;
        m = n;
        n = swap_len;
    }

    if (n == 0) {
        return (int)m;               /* only insertions/deletions remain */
    }

    int *row = malloc((n + 1) * sizeof *row);
    if (!row) {
        return (int)m;               /* worst-case distance, see header */
    }

    for (size_t j = 0; j <= n; ++j) {
        row[j] = (int)j;             /* distance from "" to b[0..j) */
    }

    for (size_t i = 1; i <= m; ++i) {
        int diag = row[0];           /* D[i-1][0] */
        row[0] = (int)i;             /* D[i][0] */

        // <ctype.h> functions need a value representable as unsigned char;
        // casting a negative char straight to unsigned would be out of range.
        int ca = tolower((unsigned char)a[i - 1]);

        for (size_t j = 1; j <= n; ++j) {
            int up   = row[j];       /* D[i-1][j], about to be overwritten */
            int cost = (ca == tolower((unsigned char)b[j - 1])) ? 0 : 1;

            int best = up + 1;                   /* deletion */
            if (row[j - 1] + 1 < best) {
                best = row[j - 1] + 1;           /* insertion */
            }
            if (diag + cost < best) {
                best = diag + cost;              /* substitution / match */
            }

            row[j] = best;
            diag = up;               /* becomes D[i-1][j-1] for next column */
        }
    }

    int distance = row[n];
    free(row);
    return distance;
}
// Calculate similarity between two words based on Levenshtein distance
@@ -104,40 +82,16 @@ float word_similarity(const char *word1, const char *word2) {
return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
}
// If one word is significantly shorter than the other, it must be a prefix
if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
// Check if the shorter word is a prefix of the longer word
const char *longer = len1 > len2 ? word1 : word2;
const char *shorter = len1 > len2 ? word2 : word1;
int shorter_len = len1 > len2 ? len2 : len1;
if (strncasecmp(longer, shorter, shorter_len) == 0) {
return 0.8f; // Good prefix match
}
return 0.0f; // Not a prefix match
}
// For words of similar length, calculate similarity
// Calculate Levenshtein distance
int distance = levenshtein_distance(word1, word2);
int max_len = len1 > len2 ? len1 : len2;
// Calculate similarity based on edit distance
// Simple linear scoring: 1.0 for exact match, 0.9 for one char difference, etc.
float similarity = 1.0f - (float)distance / max_len;
// Adjust similarity based on word lengths
if (len1 != len2) {
float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
similarity *= length_ratio;
}
// For words of similar length, require reasonable similarity
if (similarity < 0.4f) {
return 0.0f;
}
// Never return perfect similarity for non-identical words
if (distance > 0) {
similarity = fmin(similarity, 0.9f);
// Boost similarity for small differences
if (distance <= 1) {
similarity = 0.9f + (similarity * 0.1f);
}
return similarity;
@@ -146,15 +100,15 @@ float word_similarity(const char *word1, const char *word2) {
// Calculate similarity between query and target string
float calculate_similarity(const char *query, const char *target, float cutoff) {
// Split strings into words
char *query_words[MAX_WORDS] = {0};
char *target_words[MAX_WORDS] = {0};
char *query_buf, *target_buf;
char *query_words[MAX_WORDS], *target_words[MAX_WORDS];
int query_word_count = split_into_words(query, query_words);
int target_word_count = split_into_words(target, target_words);
int query_word_count = split_into_words(query, query_words, &query_buf);
int target_word_count = split_into_words(target, target_words, &target_buf);
if (query_word_count == 0 || target_word_count == 0) {
free_words(query_words, query_word_count);
free_words(target_words, target_word_count);
free_words(query_buf);
free_words(target_buf);
return 0.0;
}
@@ -174,39 +128,31 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
}
best_word_similarities[i] = best_similarity;
if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable
if (best_similarity >= 0.4f) {
query_words_found++;
}
}
// Calculate overall similarity
float word_match_score = (float)query_words_found / query_word_count;
// Calculate average of best word similarities
// Calculate average word similarity
float avg_word_similarity = 0.0f;
for (int i = 0; i < query_word_count; i++) {
avg_word_similarity += best_word_similarities[i];
}
avg_word_similarity /= query_word_count;
// Combine scores: 70% weight on word matches, 30% on character similarity
float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
// Calculate word match ratio
float word_match_ratio = (float)query_words_found / query_word_count;
// Never return perfect similarity unless all words are exact matches
bool all_exact_matches = true;
for (int i = 0; i < query_word_count; i++) {
if (best_word_similarities[i] < 1.0f) {
all_exact_matches = false;
break;
}
// Final score is the average of word match ratio and average word similarity
float similarity = (word_match_ratio + avg_word_similarity) / 2.0f;
// Boost score if all words are found
if (query_words_found == query_word_count) {
similarity = 0.8f + (similarity * 0.2f);
}
if (!all_exact_matches) {
similarity = fmin(similarity, 0.9f);
}
free_words(query_words, query_word_count);
free_words(target_words, target_word_count);
free_words(query_buf);
free_words(target_buf);
return similarity;
}