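Refactor similarity_search.c to simplify memory management and word splitting. split_into_words now makes a single strdup allocation, stores pointers into that buffer in words[], and hands the buffer back to the caller through a new `storage` out-parameter; free_words takes that buffer and releases it with a single free. levenshtein_distance replaces the full distance matrix and the lowercase copies with two rolling rows allocated via alloca (stack allocation, not heap) and lowercases characters as it compares them. calculate_similarity adds a length-difference filter and an early exit to the word-matching loop and adjusts the score weighting, for better accuracy and performance.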
This commit is contained in:
@@ -22,76 +22,54 @@ int str_case_cmp(const char *s1, const char *s2) {
|
||||
}
|
||||
|
||||
// Split a string into words
|
||||
int split_into_words(const char *string, char *words[MAX_WORDS]) {
|
||||
if (!string || strlen(string) >= MAX_STRING_LEN) {
|
||||
return 0;
|
||||
}
|
||||
int split_into_words(const char *s,
|
||||
char *words[MAX_WORDS],
|
||||
char **storage) /* NEW OUT PARAM */
|
||||
{
|
||||
if (!s || strlen(s) >= MAX_STRING_LEN) return 0;
|
||||
|
||||
char temp[MAX_STRING_LEN];
|
||||
strncpy(temp, string, MAX_STRING_LEN - 1);
|
||||
temp[MAX_STRING_LEN - 1] = '\0';
|
||||
char *buf = strdup(s); /* one single allocation */
|
||||
if (!buf) return 0;
|
||||
*storage = buf; /* hand ownership to caller */
|
||||
|
||||
int word_count = 0;
|
||||
char *token = strtok(temp, " \t\n");
|
||||
|
||||
while (token != NULL && word_count < MAX_WORDS) {
|
||||
words[word_count] = strdup(token);
|
||||
if (!words[word_count]) {
|
||||
// Free any already allocated words on error
|
||||
for (int i = 0; i < word_count; i++) {
|
||||
free(words[i]);
|
||||
int n = 0;
|
||||
for (char *tok = strtok(buf, " \t\n"); tok && n < MAX_WORDS;
|
||||
tok = strtok(NULL, " \t\n"))
|
||||
{
|
||||
words[n++] = tok; /* pointers into buf */
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
word_count++;
|
||||
token = strtok(NULL, " \t\n");
|
||||
}
|
||||
|
||||
return word_count;
|
||||
return n;
|
||||
}
|
||||
|
||||
// Free memory allocated for words
|
||||
void free_words(char *words[], int word_count) {
|
||||
for (int i = 0; i < word_count; i++) {
|
||||
free(words[i]);
|
||||
}
|
||||
void free_words(char *storage) { /* simplified */
|
||||
free(storage); /* single free, if any */
|
||||
}
|
||||
|
||||
// Calculate Levenshtein distance between two strings
|
||||
int levenshtein_distance(const char *s1, const char *s2) {
|
||||
int len1 = strlen(s1);
|
||||
int len2 = strlen(s2);
|
||||
int levenshtein_distance(const char *a, const char *b)
|
||||
{
|
||||
size_t m = strlen(a), n = strlen(b);
|
||||
if (m < n) { const char *t=a; a=b; b=t; size_t tmp=m; m=n; n=tmp; }
|
||||
|
||||
// Convert to lowercase for comparison
|
||||
char s1_lower[MAX_STRING_LEN];
|
||||
char s2_lower[MAX_STRING_LEN];
|
||||
for (int i = 0; i < len1; i++) s1_lower[i] = tolower((unsigned char)s1[i]);
|
||||
for (int i = 0; i < len2; i++) s2_lower[i] = tolower((unsigned char)s2[i]);
|
||||
s1_lower[len1] = '\0';
|
||||
s2_lower[len2] = '\0';
|
||||
int *row0 = alloca((n + 1) * sizeof(int));
|
||||
int *row1 = alloca((n + 1) * sizeof(int));
|
||||
|
||||
// Create distance matrix
|
||||
int matrix[len1 + 1][len2 + 1];
|
||||
|
||||
// Initialize first row and column
|
||||
for (int i = 0; i <= len1; i++) matrix[i][0] = i;
|
||||
for (int j = 0; j <= len2; j++) matrix[0][j] = j;
|
||||
|
||||
// Fill in the rest of the matrix
|
||||
for (int i = 1; i <= len1; i++) {
|
||||
for (int j = 1; j <= len2; j++) {
|
||||
if (s1_lower[i-1] == s2_lower[j-1]) {
|
||||
matrix[i][j] = matrix[i-1][j-1];
|
||||
} else {
|
||||
int min = matrix[i-1][j-1]; // substitution
|
||||
if (matrix[i-1][j] < min) min = matrix[i-1][j]; // deletion
|
||||
if (matrix[i][j-1] < min) min = matrix[i][j-1]; // insertion
|
||||
matrix[i][j] = min + 1;
|
||||
for (size_t j = 0; j <= n; ++j) row0[j] = j;
|
||||
for (size_t i = 1; i <= m; ++i) {
|
||||
row1[0] = i;
|
||||
for (size_t j = 1; j <= n; ++j) {
|
||||
int cost = (tolower((unsigned)a[i-1]) ==
|
||||
tolower((unsigned)b[j-1])) ? 0 : 1;
|
||||
int del = row0[j] + 1;
|
||||
int ins = row1[j-1] + 1;
|
||||
int sub = row0[j-1] + cost;
|
||||
row1[j] = (del < ins ? (del < sub ? del : sub)
|
||||
: (ins < sub ? ins : sub));
|
||||
}
|
||||
int *tmp = row0; row0 = row1; row1 = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
return matrix[len1][len2];
|
||||
return row0[n];
|
||||
}
|
||||
|
||||
// Calculate similarity between two words based on Levenshtein distance
|
||||
@@ -121,9 +99,12 @@ float word_similarity(const char *word1, const char *word2) {
|
||||
int distance = levenshtein_distance(word1, word2);
|
||||
int max_len = len1 > len2 ? len1 : len2;
|
||||
|
||||
// Calculate similarity based on edit distance
|
||||
// Calculate similarity as 1 - distance / max_len (linear in the edit distance)
|
||||
float similarity = 1.0f - (float)distance / max_len;
|
||||
|
||||
// Cube the score (a power curve, not an exponential) so it is more sensitive to differences
|
||||
similarity = pow(similarity, 3.0f);
|
||||
|
||||
// Adjust similarity based on word lengths
|
||||
if (len1 != len2) {
|
||||
float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
|
||||
@@ -135,26 +116,21 @@ float word_similarity(const char *word1, const char *word2) {
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
// Never return perfect similarity for non-identical words
|
||||
if (distance > 0) {
|
||||
similarity = fmin(similarity, 0.9f);
|
||||
}
|
||||
|
||||
return similarity;
|
||||
}
|
||||
|
||||
// Calculate similarity between query and target string
|
||||
float calculate_similarity(const char *query, const char *target, float cutoff) {
|
||||
// Split strings into words
|
||||
char *query_words[MAX_WORDS] = {0};
|
||||
char *target_words[MAX_WORDS] = {0};
|
||||
char *query_buf, *target_buf;
|
||||
char *query_words[MAX_WORDS], *target_words[MAX_WORDS];
|
||||
|
||||
int query_word_count = split_into_words(query, query_words);
|
||||
int target_word_count = split_into_words(target, target_words);
|
||||
int query_word_count = split_into_words(query, query_words, &query_buf);
|
||||
int target_word_count = split_into_words(target, target_words, &target_buf);
|
||||
|
||||
if (query_word_count == 0 || target_word_count == 0) {
|
||||
free_words(query_words, query_word_count);
|
||||
free_words(target_words, target_word_count);
|
||||
free_words(query_buf);
|
||||
free_words(target_buf);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
@@ -167,9 +143,14 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
|
||||
float best_similarity = 0.0f;
|
||||
|
||||
for (int j = 0; j < target_word_count; j++) {
|
||||
/* quick length‑difference filter (early‑exit #4) */
|
||||
int l1 = strlen(query_words[i]), l2 = strlen(target_words[j]);
|
||||
if (l1 < l2 * 0.5f || l2 < l1 * 0.5f) continue;
|
||||
|
||||
float similarity = word_similarity(query_words[i], target_words[j]);
|
||||
if (similarity > best_similarity) {
|
||||
best_similarity = similarity;
|
||||
if (best_similarity >= 0.90f) break; /* early exit #4 */
|
||||
}
|
||||
}
|
||||
|
||||
@@ -189,24 +170,17 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
|
||||
}
|
||||
avg_word_similarity /= query_word_count;
|
||||
|
||||
// Combine scores: 70% weight on word matches, 30% on character similarity
|
||||
float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
|
||||
// Combine scores: 60% weight on word matches, 40% on character similarity
|
||||
// This gives more weight to finding all words, regardless of order
|
||||
float similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f);
|
||||
|
||||
// Never return perfect similarity unless all words are exact matches
|
||||
bool all_exact_matches = true;
|
||||
for (int i = 0; i < query_word_count; i++) {
|
||||
if (best_word_similarities[i] < 1.0f) {
|
||||
all_exact_matches = false;
|
||||
break;
|
||||
}
|
||||
// If all words are found, boost the score
|
||||
if (query_words_found == query_word_count) {
|
||||
similarity = 0.7f + (similarity * 0.3f);
|
||||
}
|
||||
|
||||
if (!all_exact_matches) {
|
||||
similarity = fmin(similarity, 0.9f);
|
||||
}
|
||||
|
||||
free_words(query_words, query_word_count);
|
||||
free_words(target_words, target_word_count);
|
||||
free_words(query_buf);
|
||||
free_words(target_buf);
|
||||
|
||||
return similarity;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user