Compare commits

...

10 Commits

Author SHA1 Message Date
seb
fb8f61f868 Fix node-gyp compatibility: move to dependencies for proper version control
- Move node-gyp from devDependencies to dependencies
- Ensures v11.2.0 is used when installed as dependency
- Fixes Visual Studio detection issues in consuming projects
- Resolves shopApi build failures with old node-gyp v8.4.1
2025-06-27 03:37:33 +02:00
seb
091c258d41 Update package.json to include Node.js engine requirement and enhance build scripts. Add new keywords for better package discoverability, define repository information, and specify supported operating systems and CPU architectures. Introduce devDependencies for node-gyp to streamline native module compilation. 2025-06-27 03:32:24 +02:00
seb
8474c77163 Enhance search index handling by returning an empty array for no results instead of throwing an error. Improve memory management in free_words function by checking for NULL before freeing. Update search_index to properly return NULL when no results are found. 2025-06-27 03:15:45 +02:00
seb
21f527ba46 Update .gitignore to ignore npm lock file and tidy entry
Add package-lock.json to prevent accidental commits of npm’s lock
file. While here, remove the stray trailing space from the Thumbs.db
entry for a cleaner diff.
2025-06-23 07:09:30 +02:00
seb
462041654d Optimize search result finalization by reallocating in place
Replace the malloc/copy/free sequence with a single realloc that
shrinks temp_results to its exact size and returns it directly.  This

* eliminates an extra allocation and memory copy
* simplifies cleanup logic
* retains correct failure handling (temp_results unchanged on realloc
  failure)

Also drop the superfluous trailing space at EOF and add package-lock.json
to version control to lock Node.js dependencies.
2025-06-23 04:16:18 +02:00
seb
60d609dd6a Fix Windows compilation issue by adding malloc.h include 2025-06-21 21:14:20 +02:00
seb
ccbd833361 package.json updated 2025-04-22 04:39:00 +00:00
seb
24895fc1bc similarity_search.c updated
3 -> 2 (min word length)
2025-04-22 04:35:04 +00:00
seb
a9a9247773 Refactor word similarity calculation in similarity_search.c to simplify scoring logic. Replace prefix matching with Levenshtein distance for improved accuracy, and adjust similarity scoring to boost results for small differences. Update overall similarity calculation to average word match ratio and average word similarity for better performance. 2025-04-18 19:01:22 +02:00
seb
e2aacaf54b Refactor similarity_search.c to improve memory management and word splitting logic. Simplify split_into_words function to use a single allocation and update free_words to handle memory more efficiently. Enhance levenshtein_distance calculation with dynamic memory allocation and optimize similarity scoring in calculate_similarity function for better accuracy and performance. 2025-04-18 18:55:37 +02:00
4 changed files with 128 additions and 138 deletions

4
.gitignore vendored
View File

@@ -22,4 +22,6 @@ build/
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
Thumbs.db
package-lock.json

View File

@@ -1,23 +1,59 @@
{
"name": "similarity-search",
"version": "1.0.1",
"version": "1.0.2",
"description": "A Node.js module for word order independent string similarity search",
"main": "index.js",
"engines": {
"node": ">=14.0.0"
},
"scripts": {
"install": "node-gyp rebuild",
"test": "node test.js"
"rebuild": "node-gyp rebuild",
"build": "node-gyp rebuild",
"clean": "node-gyp clean",
"configure": "node-gyp configure",
"test": "node test.js",
"pretest": "npm run build"
},
"keywords": [
"search",
"similarity",
"string",
"fuzzy"
"fuzzy",
"native",
"addon",
"c++",
"performance"
],
"author": "",
"license": "MIT",
"dependencies": {
"nan": "^2.22.2",
"node-addon-api": "^6.0.0"
"node-addon-api": "^6.0.0",
"node-gyp": "^11.2.0"
},
"gypfile": true
"devDependencies": {
},
"repository": {
"type": "git",
"url": ""
},
"files": [
"index.js",
"binding.gyp",
"similarity_search.c",
"similarity_search.h",
"similarity_search_addon.cc",
"README.md"
],
"gypfile": true,
"os": [
"win32",
"darwin",
"linux"
],
"cpu": [
"x64",
"arm64"
]
}

View File

@@ -5,6 +5,11 @@
#include <ctype.h>
#include <math.h>
#include <stdbool.h>
#ifdef _WIN32
#include <malloc.h> // For alloca on Windows
#else
#include <alloca.h> // For alloca on Unix-like systems
#endif
#include "similarity_search.h"
// Case insensitive string comparison
@@ -22,76 +27,56 @@ int str_case_cmp(const char *s1, const char *s2) {
}
// Split a string into words
int split_into_words(const char *string, char *words[MAX_WORDS]) {
if (!string || strlen(string) >= MAX_STRING_LEN) {
return 0;
int split_into_words(const char *s,
char *words[MAX_WORDS],
char **storage) /* NEW OUT PARAM */
{
if (!s || strlen(s) >= MAX_STRING_LEN) return 0;
char *buf = strdup(s); /* one single allocation */
if (!buf) return 0;
*storage = buf; /* hand ownership to caller */
int n = 0;
for (char *tok = strtok(buf, " \t\n"); tok && n < MAX_WORDS;
tok = strtok(NULL, " \t\n"))
{
words[n++] = tok; /* pointers into buf */
}
char temp[MAX_STRING_LEN];
strncpy(temp, string, MAX_STRING_LEN - 1);
temp[MAX_STRING_LEN - 1] = '\0';
int word_count = 0;
char *token = strtok(temp, " \t\n");
while (token != NULL && word_count < MAX_WORDS) {
words[word_count] = strdup(token);
if (!words[word_count]) {
// Free any already allocated words on error
for (int i = 0; i < word_count; i++) {
free(words[i]);
}
return 0;
}
word_count++;
token = strtok(NULL, " \t\n");
}
return word_count;
return n;
}
// Free memory allocated for words
void free_words(char *words[], int word_count) {
for (int i = 0; i < word_count; i++) {
free(words[i]);
void free_words(char *storage) { /* simplified */
if (storage) { /* check for NULL */
free(storage); /* single free, if any */
}
}
// Calculate Levenshtein distance between two strings
int levenshtein_distance(const char *s1, const char *s2) {
int len1 = strlen(s1);
int len2 = strlen(s2);
// Convert to lowercase for comparison
char s1_lower[MAX_STRING_LEN];
char s2_lower[MAX_STRING_LEN];
for (int i = 0; i < len1; i++) s1_lower[i] = tolower((unsigned char)s1[i]);
for (int i = 0; i < len2; i++) s2_lower[i] = tolower((unsigned char)s2[i]);
s1_lower[len1] = '\0';
s2_lower[len2] = '\0';
// Create distance matrix
int matrix[len1 + 1][len2 + 1];
// Initialize first row and column
for (int i = 0; i <= len1; i++) matrix[i][0] = i;
for (int j = 0; j <= len2; j++) matrix[0][j] = j;
// Fill in the rest of the matrix
for (int i = 1; i <= len1; i++) {
for (int j = 1; j <= len2; j++) {
if (s1_lower[i-1] == s2_lower[j-1]) {
matrix[i][j] = matrix[i-1][j-1];
} else {
int min = matrix[i-1][j-1]; // substitution
if (matrix[i-1][j] < min) min = matrix[i-1][j]; // deletion
if (matrix[i][j-1] < min) min = matrix[i][j-1]; // insertion
matrix[i][j] = min + 1;
}
int levenshtein_distance(const char *a, const char *b)
{
size_t m = strlen(a), n = strlen(b);
if (m < n) { const char *t=a; a=b; b=t; size_t tmp=m; m=n; n=tmp; }
int *row0 = alloca((n + 1) * sizeof(int));
int *row1 = alloca((n + 1) * sizeof(int));
for (size_t j = 0; j <= n; ++j) row0[j] = j;
for (size_t i = 1; i <= m; ++i) {
row1[0] = i;
for (size_t j = 1; j <= n; ++j) {
int cost = (tolower((unsigned)a[i-1]) ==
tolower((unsigned)b[j-1])) ? 0 : 1;
int del = row0[j] + 1;
int ins = row1[j-1] + 1;
int sub = row0[j-1] + cost;
row1[j] = (del < ins ? (del < sub ? del : sub)
: (ins < sub ? ins : sub));
}
int *tmp = row0; row0 = row1; row1 = tmp;
}
return matrix[len1][len2];
return row0[n];
}
// Calculate similarity between two words based on Levenshtein distance
@@ -99,45 +84,21 @@ float word_similarity(const char *word1, const char *word2) {
int len1 = strlen(word1);
int len2 = strlen(word2);
// For very short words (3 chars or less), require exact match
if (len1 <= 3 || len2 <= 3) {
// For very short words (2 chars or less), require exact match
if (len1 <= 2 || len2 <= 2) {
return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
}
// If one word is significantly shorter than the other, it must be a prefix
if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
// Check if the shorter word is a prefix of the longer word
const char *longer = len1 > len2 ? word1 : word2;
const char *shorter = len1 > len2 ? word2 : word1;
int shorter_len = len1 > len2 ? len2 : len1;
if (strncasecmp(longer, shorter, shorter_len) == 0) {
return 0.8f; // Good prefix match
}
return 0.0f; // Not a prefix match
}
// For words of similar length, calculate similarity
// Calculate Levenshtein distance
int distance = levenshtein_distance(word1, word2);
int max_len = len1 > len2 ? len1 : len2;
// Calculate similarity based on edit distance
// Simple linear scoring: 1.0 for exact match, 0.9 for one char difference, etc.
float similarity = 1.0f - (float)distance / max_len;
// Adjust similarity based on word lengths
if (len1 != len2) {
float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
similarity *= length_ratio;
}
// For words of similar length, require reasonable similarity
if (similarity < 0.4f) {
return 0.0f;
}
// Never return perfect similarity for non-identical words
if (distance > 0) {
similarity = fmin(similarity, 0.9f);
// Boost similarity for small differences
if (distance <= 1) {
similarity = 0.9f + (similarity * 0.1f);
}
return similarity;
@@ -146,15 +107,15 @@ float word_similarity(const char *word1, const char *word2) {
// Calculate similarity between query and target string
float calculate_similarity(const char *query, const char *target, float cutoff) {
// Split strings into words
char *query_words[MAX_WORDS] = {0};
char *target_words[MAX_WORDS] = {0};
char *query_buf = NULL, *target_buf = NULL;
char *query_words[MAX_WORDS], *target_words[MAX_WORDS];
int query_word_count = split_into_words(query, query_words);
int target_word_count = split_into_words(target, target_words);
int query_word_count = split_into_words(query, query_words, &query_buf);
int target_word_count = split_into_words(target, target_words, &target_buf);
if (query_word_count == 0 || target_word_count == 0) {
free_words(query_words, query_word_count);
free_words(target_words, target_word_count);
free_words(query_buf);
free_words(target_buf);
return 0.0;
}
@@ -174,39 +135,31 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
}
best_word_similarities[i] = best_similarity;
if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable
if (best_similarity >= 0.4f) {
query_words_found++;
}
}
// Calculate overall similarity
float word_match_score = (float)query_words_found / query_word_count;
// Calculate average of best word similarities
// Calculate average word similarity
float avg_word_similarity = 0.0f;
for (int i = 0; i < query_word_count; i++) {
avg_word_similarity += best_word_similarities[i];
}
avg_word_similarity /= query_word_count;
// Combine scores: 70% weight on word matches, 30% on character similarity
float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
// Calculate word match ratio
float word_match_ratio = (float)query_words_found / query_word_count;
// Never return perfect similarity unless all words are exact matches
bool all_exact_matches = true;
for (int i = 0; i < query_word_count; i++) {
if (best_word_similarities[i] < 1.0f) {
all_exact_matches = false;
break;
}
// Final score is the average of word match ratio and average word similarity
float similarity = (word_match_ratio + avg_word_similarity) / 2.0f;
// Boost score if all words are found
if (query_words_found == query_word_count) {
similarity = 0.8f + (similarity * 0.2f);
}
if (!all_exact_matches) {
similarity = fmin(similarity, 0.9f);
}
free_words(query_words, query_word_count);
free_words(target_words, target_word_count);
free_words(query_buf);
free_words(target_buf);
return similarity;
}
@@ -353,27 +306,26 @@ SearchResult* search_index(SearchIndex* index, const char* query, float cutoff,
}
}
// If no results found, return NULL properly
if (*num_results == 0) {
free(temp_results);
return NULL;
}
// Sort results by similarity
qsort(temp_results, *num_results, sizeof(SearchResult), compare_results);
// Allocate final result array with exact size
SearchResult* results = (SearchResult*)malloc(*num_results * sizeof(SearchResult));
// Shrink temp_results to exact size and return it directly
SearchResult* results = (SearchResult*)realloc(
temp_results, *num_results * sizeof(SearchResult));
if (!results) {
// Free all strings in temp_results
// realloc failed: temp_results is left unchanged, so clean it up here
for (int i = 0; i < *num_results; i++) {
free(temp_results[i].string);
}
free(temp_results);
return NULL;
}
// Copy results to final array
for (int i = 0; i < *num_results; i++) {
results[i].string = temp_results[i].string;
results[i].similarity = temp_results[i].similarity;
}
free(temp_results);
return results;
}
@@ -386,4 +338,4 @@ void free_search_results(SearchResult* results, int num_results) {
free(results[i].string);
}
free(results);
}
}

View File

@@ -129,9 +129,9 @@ Napi::Value SearchIndexWrapper::Search(const Napi::CallbackInfo& info) {
int num_results = 0;
SearchResult* results = search_index(this->index_, query.c_str(), cutoff, &num_results);
if (!results) {
Napi::Error::New(env, "Search failed").ThrowAsJavaScriptException();
return env.Null();
// If no results found, return empty array instead of throwing error
if (!results || num_results == 0) {
return Napi::Array::New(env, 0);
}
Napi::Array result_array = Napi::Array::New(env, num_results);