From 6091cc0b805bdd7b083612f05fe6098dcdbe6c37 Mon Sep 17 00:00:00 2001 From: seb Date: Fri, 18 Apr 2025 09:16:26 +0200 Subject: [PATCH] Increase default capacity in SearchIndexWrapper and enhance similarity calculation in calculate_similarity function to boost similarity score when all query words are found. Update MAX_WORDS and MAX_STRING_LEN definitions for improved handling. --- similarity_search.c | 25 +++++++++++++++++++++---- similarity_search.h | 6 +++--- similarity_search_addon.cc | 2 +- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/similarity_search.c b/similarity_search.c index 9cb8b33..0c1e0ee 100644 --- a/similarity_search.c +++ b/similarity_search.c @@ -70,19 +70,33 @@ float calculate_similarity(const char *query, const char *target, float cutoff) return 0.0; } - // Count matches + // Count matches and track which query words were found int matches = 0; + int query_words_found = 0; + int found_query_words[MAX_WORDS] = {0}; // Track which query words were found + for (int i = 0; i < query_word_count; i++) { for (int j = 0; j < target_word_count; j++) { if (str_case_cmp(query_words[i], target_words[j]) == 0) { matches++; + if (!found_query_words[i]) { + found_query_words[i] = 1; + query_words_found++; + } break; } } } - // Calculate Jaccard similarity (intersection over union) - float similarity = (float)matches / (query_word_count + target_word_count - matches); + // Calculate base similarity (intersection over union) + float base_similarity = (float)matches / (query_word_count + target_word_count - matches); + + // If all query words were found, boost the similarity + float similarity = base_similarity; + if (query_words_found == query_word_count) { + // If all query words were found, similarity should be at least 0.8 + similarity = base_similarity > 0.8f ? 
base_similarity : 0.8f; + } free_words(query_words, query_word_count); free_words(target_words, target_word_count); @@ -247,7 +261,10 @@ SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, } // Copy results to final array - memcpy(results, temp_results, *num_results * sizeof(SearchResult)); + for (int i = 0; i < *num_results; i++) { + results[i].string = temp_results[i].string; + results[i].similarity = temp_results[i].similarity; + } free(temp_results); return results; diff --git a/similarity_search.h b/similarity_search.h index 613dd99..c93405b 100644 --- a/similarity_search.h +++ b/similarity_search.h @@ -5,8 +5,8 @@ extern "C" { #endif -#define MAX_STRING_LEN 100 -#define MAX_WORDS 20 +#define MAX_STRING_LEN 1000 +#define MAX_WORDS 100 // Public API @@ -19,7 +19,7 @@ typedef struct { // Structure to hold a search result typedef struct { - const char *string; + char *string; float similarity; } SearchResult; diff --git a/similarity_search_addon.cc b/similarity_search_addon.cc index f065134..40aa2f4 100644 --- a/similarity_search_addon.cc +++ b/similarity_search_addon.cc @@ -41,7 +41,7 @@ SearchIndexWrapper::SearchIndexWrapper(const Napi::CallbackInfo& info) Napi::Env env = info.Env(); Napi::HandleScope scope(env); - int capacity = 500; // Default capacity + int capacity = 10000; // Increased default capacity from 500 to 10000 if (info.Length() > 0 && info[0].IsNumber()) { capacity = info[0].As<Napi::Number>().Int32Value(); }