Refactor word similarity calculation in similarity_search.c to simplify scoring logic. Replace prefix matching with Levenshtein distance for improved accuracy, and adjust similarity scoring to boost results for small differences. Update overall similarity calculation to average word match ratio and average word similarity for better performance.

Refactor similarity_search.c to improve memory management and word splitting logic. Simplify split_into_words function to use a single allocation and update free_words to handle memory more efficiently. Enhance levenshtein_distance calculation with dynamic memory allocation and optimize similarity scoring in calculate_similarity function for better accuracy and performance.
2025-04-18 19:01:22 +02:00 · 2025-04-18 18:55:37 +02:00
1 changed file with 62 additions and 116 deletions
--- a/similarity_search.c
+++ b/similarity_search.c
@@ -22,76 +22,54 @@ int str_case_cmp(const char *s1, const char *s2) {
 }
 // Split a string into words
-int split_into_words(const char *string, char *words[MAX_WORDS]) {
+int split_into_words(const char *s,
-    if (!string || strlen(string) >= MAX_STRING_LEN) {
+                     char  *words[MAX_WORDS],
-        return 0;
+                     char **storage)          /* NEW OUT PARAM            */
-    }
+{
    if (!s || strlen(s) >= MAX_STRING_LEN) return 0;
-    char temp[MAX_STRING_LEN];
+    char *buf = strdup(s);                    /* one single allocation    */
-    strncpy(temp, string, MAX_STRING_LEN - 1);
+    if (!buf) return 0;
-    temp[MAX_STRING_LEN - 1] = '\0';
+    *storage = buf;                           /* hand ownership to caller */
-    int word_count = 0;
+    int n = 0;
-    char *token = strtok(temp, " \t\n");
+    for (char *tok = strtok(buf, " \t\n"); tok && n < MAX_WORDS;
-    
+         tok = strtok(NULL, " \t\n"))
-    while (token != NULL && word_count < MAX_WORDS) {
+    {
-        words[word_count] = strdup(token);
+        words[n++] = tok;                     /* pointers into buf        */
        if (!words[word_count]) {
            // Free any already allocated words on error
            for (int i = 0; i < word_count; i++) {
                free(words[i]);
    }
-            return 0;
+    return n;
        }
        word_count++;
        token = strtok(NULL, " \t\n");
    }
    return word_count;
 }
 // Free memory allocated for words
-void free_words(char *words[], int word_count) {
+void free_words(char *storage) {              /* simplified               */
-    for (int i = 0; i < word_count; i++) {
+    free(storage);                            /* single free, if any      */
        free(words[i]);
    }
 }
 // Calculate Levenshtein distance between two strings
-int levenshtein_distance(const char *s1, const char *s2) {
+int levenshtein_distance(const char *a, const char *b)
-    int len1 = strlen(s1);
+{
-    int len2 = strlen(s2);
+    size_t m = strlen(a), n = strlen(b);
    if (m < n) { const char *t=a; a=b; b=t; size_t tmp=m; m=n; n=tmp; }
-    // Convert to lowercase for comparison
+    int *row0 = alloca((n + 1) * sizeof(int));
-    char s1_lower[MAX_STRING_LEN];
+    int *row1 = alloca((n + 1) * sizeof(int));
    char s2_lower[MAX_STRING_LEN];
    for (int i = 0; i < len1; i++) s1_lower[i] = tolower((unsigned char)s1[i]);
    for (int i = 0; i < len2; i++) s2_lower[i] = tolower((unsigned char)s2[i]);
    s1_lower[len1] = '\0';
    s2_lower[len2] = '\0';
-    // Create distance matrix
+    for (size_t j = 0; j <= n; ++j) row0[j] = j;
-    int matrix[len1 + 1][len2 + 1];
+    for (size_t i = 1; i <= m; ++i) {
-    
+        row1[0] = i;
-    // Initialize first row and column
+        for (size_t j = 1; j <= n; ++j) {
-    for (int i = 0; i <= len1; i++) matrix[i][0] = i;
+            int cost = (tolower((unsigned)a[i-1]) ==
-    for (int j = 0; j <= len2; j++) matrix[0][j] = j;
+                        tolower((unsigned)b[j-1])) ? 0 : 1;
-    
+            int del  = row0[j]   + 1;
-    // Fill in the rest of the matrix
+            int ins  = row1[j-1] + 1;
-    for (int i = 1; i <= len1; i++) {
+            int sub  = row0[j-1] + cost;
-        for (int j = 1; j <= len2; j++) {
+            row1[j] = (del < ins ? (del < sub ? del : sub)
-            if (s1_lower[i-1] == s2_lower[j-1]) {
+                                 : (ins < sub ? ins : sub));
                matrix[i][j] = matrix[i-1][j-1];
            } else {
                int min = matrix[i-1][j-1]; // substitution
                if (matrix[i-1][j] < min) min = matrix[i-1][j]; // deletion
                if (matrix[i][j-1] < min) min = matrix[i][j-1]; // insertion
                matrix[i][j] = min + 1;
        }
        int *tmp = row0; row0 = row1; row1 = tmp;
    }
-    }
+    return row0[n];
    return matrix[len1][len2];
 }
 // Calculate similarity between two words based on Levenshtein distance
@@ -104,40 +82,16 @@ float word_similarity(const char *word1, const char *word2) {
        return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
    }
-    // If one word is significantly shorter than the other, it must be a prefix
+    // Calculate Levenshtein distance
    if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
        // Check if the shorter word is a prefix of the longer word
        const char *longer = len1 > len2 ? word1 : word2;
        const char *shorter = len1 > len2 ? word2 : word1;
        int shorter_len = len1 > len2 ? len2 : len1;
        if (strncasecmp(longer, shorter, shorter_len) == 0) {
            return 0.8f; // Good prefix match
        }
        return 0.0f; // Not a prefix match
    }
    // For words of similar length, calculate similarity
    int distance = levenshtein_distance(word1, word2);
    int max_len = len1 > len2 ? len1 : len2;
-    // Calculate similarity based on edit distance
+    // Simple linear scoring: 1.0 for exact match, 0.9 for one char difference, etc.
    float similarity = 1.0f - (float)distance / max_len;
-    // Adjust similarity based on word lengths
+    // Boost similarity for small differences
-    if (len1 != len2) {
+    if (distance <= 1) {
-        float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
+        similarity = 0.9f + (similarity * 0.1f);
        similarity *= length_ratio;
    }
    // For words of similar length, require reasonable similarity
    if (similarity < 0.4f) {
        return 0.0f;
    }
    // Never return perfect similarity for non-identical words
    if (distance > 0) {
        similarity = fmin(similarity, 0.9f);
    }
    return similarity;
@@ -146,15 +100,15 @@ float word_similarity(const char *word1, const char *word2) {
 // Calculate similarity between query and target string
 float calculate_similarity(const char *query, const char *target, float cutoff) {
    // Split strings into words
-    char *query_words[MAX_WORDS] = {0};
+    char *query_buf, *target_buf;
-    char *target_words[MAX_WORDS] = {0};
+    char *query_words[MAX_WORDS], *target_words[MAX_WORDS];
-    int query_word_count = split_into_words(query, query_words);
+    int query_word_count = split_into_words(query,  query_words,  &query_buf);
-    int target_word_count = split_into_words(target, target_words);
+    int target_word_count = split_into_words(target, target_words, &target_buf);
    if (query_word_count == 0 || target_word_count == 0) {
-        free_words(query_words, query_word_count);
+        free_words(query_buf);
-        free_words(target_words, target_word_count);
+        free_words(target_buf);
        return 0.0;
    }
@@ -174,39 +128,31 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
        }
        best_word_similarities[i] = best_similarity;
-        if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable
+        if (best_similarity >= 0.4f) {
            query_words_found++;
        }
    }
-    // Calculate overall similarity
+    // Calculate average word similarity
    float word_match_score = (float)query_words_found / query_word_count;
    // Calculate average of best word similarities
    float avg_word_similarity = 0.0f;
    for (int i = 0; i < query_word_count; i++) {
        avg_word_similarity += best_word_similarities[i];
    }
    avg_word_similarity /= query_word_count;
-    // Combine scores: 70% weight on word matches, 30% on character similarity
+    // Calculate word match ratio
-    float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
+    float word_match_ratio = (float)query_words_found / query_word_count;
-    // Never return perfect similarity unless all words are exact matches
+    // Final score is the average of word match ratio and average word similarity
-    bool all_exact_matches = true;
+    float similarity = (word_match_ratio + avg_word_similarity) / 2.0f;
-    for (int i = 0; i < query_word_count; i++) {
+    
-        if (best_word_similarities[i] < 1.0f) {
+    // Boost score if all words are found
-            all_exact_matches = false;
+    if (query_words_found == query_word_count) {
-            break;
+        similarity = 0.8f + (similarity * 0.2f);
        }
    }
-    if (!all_exact_matches) {
+    free_words(query_buf);
-        similarity = fmin(similarity, 0.9f);
+    free_words(target_buf);
    }
    free_words(query_words, query_word_count);
    free_words(target_words, target_word_count);
    return similarity;
 }
Author	SHA1	Message	Date
seb	a9a9247773	Refactor word similarity calculation in similarity_search.c to simplify scoring logic. Replace prefix matching with Levenshtein distance for improved accuracy, and adjust similarity scoring to boost results for small differences. Update overall similarity calculation to average word match ratio and average word similarity for better performance.	2025-04-18 19:01:22 +02:00
seb	e2aacaf54b	Refactor similarity_search.c to improve memory management and word splitting logic. Simplify split_into_words function to use a single allocation and update free_words to handle memory more efficiently. Enhance levenshtein_distance calculation with dynamic memory allocation and optimize similarity scoring in calculate_similarity function for better accuracy and performance.	2025-04-18 18:55:37 +02:00