Compare commits
10 Commits
0dd17b794f
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fb8f61f868 | ||
|
|
091c258d41 | ||
|
|
8474c77163 | ||
|
|
21f527ba46 | ||
|
|
462041654d | ||
|
|
60d609dd6a | ||
| ccbd833361 | |||
| 24895fc1bc | |||
|
|
a9a9247773 | ||
|
|
e2aacaf54b |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -23,3 +23,5 @@ build/
|
|||||||
.Trashes
|
.Trashes
|
||||||
ehthumbs.db
|
ehthumbs.db
|
||||||
Thumbs.db
|
Thumbs.db
|
||||||
|
|
||||||
|
package-lock.json
|
||||||
46
package.json
46
package.json
@@ -1,23 +1,59 @@
|
|||||||
{
|
{
|
||||||
"name": "similarity-search",
|
"name": "similarity-search",
|
||||||
"version": "1.0.1",
|
"version": "1.0.2",
|
||||||
"description": "A Node.js module for word order independent string similarity search",
|
"description": "A Node.js module for word order independent string similarity search",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=14.0.0"
|
||||||
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"install": "node-gyp rebuild",
|
"install": "node-gyp rebuild",
|
||||||
"test": "node test.js"
|
"rebuild": "node-gyp rebuild",
|
||||||
|
"build": "node-gyp rebuild",
|
||||||
|
"clean": "node-gyp clean",
|
||||||
|
"configure": "node-gyp configure",
|
||||||
|
"test": "node test.js",
|
||||||
|
"pretest": "npm run build"
|
||||||
},
|
},
|
||||||
"keywords": [
|
"keywords": [
|
||||||
"search",
|
"search",
|
||||||
"similarity",
|
"similarity",
|
||||||
"string",
|
"string",
|
||||||
"fuzzy"
|
"fuzzy",
|
||||||
|
"native",
|
||||||
|
"addon",
|
||||||
|
"c++",
|
||||||
|
"performance"
|
||||||
],
|
],
|
||||||
"author": "",
|
"author": "",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"nan": "^2.22.2",
|
"nan": "^2.22.2",
|
||||||
"node-addon-api": "^6.0.0"
|
"node-addon-api": "^6.0.0",
|
||||||
|
"node-gyp": "^11.2.0"
|
||||||
},
|
},
|
||||||
"gypfile": true
|
"devDependencies": {
|
||||||
|
},
|
||||||
|
"repository": {
|
||||||
|
"type": "git",
|
||||||
|
"url": ""
|
||||||
|
},
|
||||||
|
"files": [
|
||||||
|
"index.js",
|
||||||
|
"binding.gyp",
|
||||||
|
"similarity_search.c",
|
||||||
|
"similarity_search.h",
|
||||||
|
"similarity_search_addon.cc",
|
||||||
|
"README.md"
|
||||||
|
],
|
||||||
|
"gypfile": true,
|
||||||
|
"os": [
|
||||||
|
"win32",
|
||||||
|
"darwin",
|
||||||
|
"linux"
|
||||||
|
],
|
||||||
|
"cpu": [
|
||||||
|
"x64",
|
||||||
|
"arm64"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,6 +5,11 @@
|
|||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
#ifdef _WIN32
|
||||||
|
#include <malloc.h> // For alloca on Windows
|
||||||
|
#else
|
||||||
|
#include <alloca.h> // For alloca on Unix-like systems
|
||||||
|
#endif
|
||||||
#include "similarity_search.h"
|
#include "similarity_search.h"
|
||||||
|
|
||||||
// Case insensitive string comparison
|
// Case insensitive string comparison
|
||||||
@@ -22,76 +27,56 @@ int str_case_cmp(const char *s1, const char *s2) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Split a string into words
|
// Split a string into words
|
||||||
int split_into_words(const char *string, char *words[MAX_WORDS]) {
|
int split_into_words(const char *s,
|
||||||
if (!string || strlen(string) >= MAX_STRING_LEN) {
|
char *words[MAX_WORDS],
|
||||||
return 0;
|
char **storage) /* NEW OUT PARAM */
|
||||||
}
|
{
|
||||||
|
if (!s || strlen(s) >= MAX_STRING_LEN) return 0;
|
||||||
|
|
||||||
char temp[MAX_STRING_LEN];
|
char *buf = strdup(s); /* one single allocation */
|
||||||
strncpy(temp, string, MAX_STRING_LEN - 1);
|
if (!buf) return 0;
|
||||||
temp[MAX_STRING_LEN - 1] = '\0';
|
*storage = buf; /* hand ownership to caller */
|
||||||
|
|
||||||
int word_count = 0;
|
int n = 0;
|
||||||
char *token = strtok(temp, " \t\n");
|
for (char *tok = strtok(buf, " \t\n"); tok && n < MAX_WORDS;
|
||||||
|
tok = strtok(NULL, " \t\n"))
|
||||||
while (token != NULL && word_count < MAX_WORDS) {
|
{
|
||||||
words[word_count] = strdup(token);
|
words[n++] = tok; /* pointers into buf */
|
||||||
if (!words[word_count]) {
|
|
||||||
// Free any already allocated words on error
|
|
||||||
for (int i = 0; i < word_count; i++) {
|
|
||||||
free(words[i]);
|
|
||||||
}
|
}
|
||||||
return 0;
|
return n;
|
||||||
}
|
|
||||||
word_count++;
|
|
||||||
token = strtok(NULL, " \t\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
return word_count;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Free memory allocated for words
|
// Free memory allocated for words
|
||||||
void free_words(char *words[], int word_count) {
|
void free_words(char *storage) { /* simplified */
|
||||||
for (int i = 0; i < word_count; i++) {
|
if (storage) { /* check for NULL */
|
||||||
free(words[i]);
|
free(storage); /* single free, if any */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate Levenshtein distance between two strings
|
// Calculate Levenshtein distance between two strings
|
||||||
int levenshtein_distance(const char *s1, const char *s2) {
|
int levenshtein_distance(const char *a, const char *b)
|
||||||
int len1 = strlen(s1);
|
{
|
||||||
int len2 = strlen(s2);
|
size_t m = strlen(a), n = strlen(b);
|
||||||
|
if (m < n) { const char *t=a; a=b; b=t; size_t tmp=m; m=n; n=tmp; }
|
||||||
|
|
||||||
// Convert to lowercase for comparison
|
int *row0 = alloca((n + 1) * sizeof(int));
|
||||||
char s1_lower[MAX_STRING_LEN];
|
int *row1 = alloca((n + 1) * sizeof(int));
|
||||||
char s2_lower[MAX_STRING_LEN];
|
|
||||||
for (int i = 0; i < len1; i++) s1_lower[i] = tolower((unsigned char)s1[i]);
|
|
||||||
for (int i = 0; i < len2; i++) s2_lower[i] = tolower((unsigned char)s2[i]);
|
|
||||||
s1_lower[len1] = '\0';
|
|
||||||
s2_lower[len2] = '\0';
|
|
||||||
|
|
||||||
// Create distance matrix
|
for (size_t j = 0; j <= n; ++j) row0[j] = j;
|
||||||
int matrix[len1 + 1][len2 + 1];
|
for (size_t i = 1; i <= m; ++i) {
|
||||||
|
row1[0] = i;
|
||||||
// Initialize first row and column
|
for (size_t j = 1; j <= n; ++j) {
|
||||||
for (int i = 0; i <= len1; i++) matrix[i][0] = i;
|
int cost = (tolower((unsigned)a[i-1]) ==
|
||||||
for (int j = 0; j <= len2; j++) matrix[0][j] = j;
|
tolower((unsigned)b[j-1])) ? 0 : 1;
|
||||||
|
int del = row0[j] + 1;
|
||||||
// Fill in the rest of the matrix
|
int ins = row1[j-1] + 1;
|
||||||
for (int i = 1; i <= len1; i++) {
|
int sub = row0[j-1] + cost;
|
||||||
for (int j = 1; j <= len2; j++) {
|
row1[j] = (del < ins ? (del < sub ? del : sub)
|
||||||
if (s1_lower[i-1] == s2_lower[j-1]) {
|
: (ins < sub ? ins : sub));
|
||||||
matrix[i][j] = matrix[i-1][j-1];
|
|
||||||
} else {
|
|
||||||
int min = matrix[i-1][j-1]; // substitution
|
|
||||||
if (matrix[i-1][j] < min) min = matrix[i-1][j]; // deletion
|
|
||||||
if (matrix[i][j-1] < min) min = matrix[i][j-1]; // insertion
|
|
||||||
matrix[i][j] = min + 1;
|
|
||||||
}
|
}
|
||||||
|
int *tmp = row0; row0 = row1; row1 = tmp;
|
||||||
}
|
}
|
||||||
}
|
return row0[n];
|
||||||
|
|
||||||
return matrix[len1][len2];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate similarity between two words based on Levenshtein distance
|
// Calculate similarity between two words based on Levenshtein distance
|
||||||
@@ -99,45 +84,21 @@ float word_similarity(const char *word1, const char *word2) {
|
|||||||
int len1 = strlen(word1);
|
int len1 = strlen(word1);
|
||||||
int len2 = strlen(word2);
|
int len2 = strlen(word2);
|
||||||
|
|
||||||
// For very short words (3 chars or less), require exact match
|
// For very short words (2 chars or less), require exact match
|
||||||
if (len1 <= 3 || len2 <= 3) {
|
if (len1 <= 2 || len2 <= 2) {
|
||||||
return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
|
return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If one word is significantly shorter than the other, it must be a prefix
|
// Calculate Levenshtein distance
|
||||||
if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
|
|
||||||
// Check if the shorter word is a prefix of the longer word
|
|
||||||
const char *longer = len1 > len2 ? word1 : word2;
|
|
||||||
const char *shorter = len1 > len2 ? word2 : word1;
|
|
||||||
int shorter_len = len1 > len2 ? len2 : len1;
|
|
||||||
|
|
||||||
if (strncasecmp(longer, shorter, shorter_len) == 0) {
|
|
||||||
return 0.8f; // Good prefix match
|
|
||||||
}
|
|
||||||
return 0.0f; // Not a prefix match
|
|
||||||
}
|
|
||||||
|
|
||||||
// For words of similar length, calculate similarity
|
|
||||||
int distance = levenshtein_distance(word1, word2);
|
int distance = levenshtein_distance(word1, word2);
|
||||||
int max_len = len1 > len2 ? len1 : len2;
|
int max_len = len1 > len2 ? len1 : len2;
|
||||||
|
|
||||||
// Calculate similarity based on edit distance
|
// Simple linear scoring: 1.0 for exact match, 0.9 for one char difference, etc.
|
||||||
float similarity = 1.0f - (float)distance / max_len;
|
float similarity = 1.0f - (float)distance / max_len;
|
||||||
|
|
||||||
// Adjust similarity based on word lengths
|
// Boost similarity for small differences
|
||||||
if (len1 != len2) {
|
if (distance <= 1) {
|
||||||
float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
|
similarity = 0.9f + (similarity * 0.1f);
|
||||||
similarity *= length_ratio;
|
|
||||||
}
|
|
||||||
|
|
||||||
// For words of similar length, require reasonable similarity
|
|
||||||
if (similarity < 0.4f) {
|
|
||||||
return 0.0f;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Never return perfect similarity for non-identical words
|
|
||||||
if (distance > 0) {
|
|
||||||
similarity = fmin(similarity, 0.9f);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return similarity;
|
return similarity;
|
||||||
@@ -146,15 +107,15 @@ float word_similarity(const char *word1, const char *word2) {
|
|||||||
// Calculate similarity between query and target string
|
// Calculate similarity between query and target string
|
||||||
float calculate_similarity(const char *query, const char *target, float cutoff) {
|
float calculate_similarity(const char *query, const char *target, float cutoff) {
|
||||||
// Split strings into words
|
// Split strings into words
|
||||||
char *query_words[MAX_WORDS] = {0};
|
char *query_buf = NULL, *target_buf = NULL;
|
||||||
char *target_words[MAX_WORDS] = {0};
|
char *query_words[MAX_WORDS], *target_words[MAX_WORDS];
|
||||||
|
|
||||||
int query_word_count = split_into_words(query, query_words);
|
int query_word_count = split_into_words(query, query_words, &query_buf);
|
||||||
int target_word_count = split_into_words(target, target_words);
|
int target_word_count = split_into_words(target, target_words, &target_buf);
|
||||||
|
|
||||||
if (query_word_count == 0 || target_word_count == 0) {
|
if (query_word_count == 0 || target_word_count == 0) {
|
||||||
free_words(query_words, query_word_count);
|
free_words(query_buf);
|
||||||
free_words(target_words, target_word_count);
|
free_words(target_buf);
|
||||||
return 0.0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -174,39 +135,31 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
|
|||||||
}
|
}
|
||||||
|
|
||||||
best_word_similarities[i] = best_similarity;
|
best_word_similarities[i] = best_similarity;
|
||||||
if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable
|
if (best_similarity >= 0.4f) {
|
||||||
query_words_found++;
|
query_words_found++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate overall similarity
|
// Calculate average word similarity
|
||||||
float word_match_score = (float)query_words_found / query_word_count;
|
|
||||||
|
|
||||||
// Calculate average of best word similarities
|
|
||||||
float avg_word_similarity = 0.0f;
|
float avg_word_similarity = 0.0f;
|
||||||
for (int i = 0; i < query_word_count; i++) {
|
for (int i = 0; i < query_word_count; i++) {
|
||||||
avg_word_similarity += best_word_similarities[i];
|
avg_word_similarity += best_word_similarities[i];
|
||||||
}
|
}
|
||||||
avg_word_similarity /= query_word_count;
|
avg_word_similarity /= query_word_count;
|
||||||
|
|
||||||
// Combine scores: 70% weight on word matches, 30% on character similarity
|
// Calculate word match ratio
|
||||||
float similarity = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);
|
float word_match_ratio = (float)query_words_found / query_word_count;
|
||||||
|
|
||||||
// Never return perfect similarity unless all words are exact matches
|
// Final score is the average of word match ratio and average word similarity
|
||||||
bool all_exact_matches = true;
|
float similarity = (word_match_ratio + avg_word_similarity) / 2.0f;
|
||||||
for (int i = 0; i < query_word_count; i++) {
|
|
||||||
if (best_word_similarities[i] < 1.0f) {
|
// Boost score if all words are found
|
||||||
all_exact_matches = false;
|
if (query_words_found == query_word_count) {
|
||||||
break;
|
similarity = 0.8f + (similarity * 0.2f);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!all_exact_matches) {
|
free_words(query_buf);
|
||||||
similarity = fmin(similarity, 0.9f);
|
free_words(target_buf);
|
||||||
}
|
|
||||||
|
|
||||||
free_words(query_words, query_word_count);
|
|
||||||
free_words(target_words, target_word_count);
|
|
||||||
|
|
||||||
return similarity;
|
return similarity;
|
||||||
}
|
}
|
||||||
@@ -353,27 +306,26 @@ SearchResult* search_index(SearchIndex* index, const char* query, float cutoff,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If no results found, return NULL properly
|
||||||
|
if (*num_results == 0) {
|
||||||
|
free(temp_results);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
// Sort results by similarity
|
// Sort results by similarity
|
||||||
qsort(temp_results, *num_results, sizeof(SearchResult), compare_results);
|
qsort(temp_results, *num_results, sizeof(SearchResult), compare_results);
|
||||||
|
|
||||||
// Allocate final result array with exact size
|
// Shrink temp_results to exact size and return it directly
|
||||||
SearchResult* results = (SearchResult*)malloc(*num_results * sizeof(SearchResult));
|
SearchResult* results = (SearchResult*)realloc(
|
||||||
|
temp_results, *num_results * sizeof(SearchResult));
|
||||||
if (!results) {
|
if (!results) {
|
||||||
// Free all strings in temp_results
|
// realloc failure – temp_results unchanged, clean up
|
||||||
for (int i = 0; i < *num_results; i++) {
|
for (int i = 0; i < *num_results; i++) {
|
||||||
free(temp_results[i].string);
|
free(temp_results[i].string);
|
||||||
}
|
}
|
||||||
free(temp_results);
|
free(temp_results);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copy results to final array
|
|
||||||
for (int i = 0; i < *num_results; i++) {
|
|
||||||
results[i].string = temp_results[i].string;
|
|
||||||
results[i].similarity = temp_results[i].similarity;
|
|
||||||
}
|
|
||||||
free(temp_results);
|
|
||||||
|
|
||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -129,9 +129,9 @@ Napi::Value SearchIndexWrapper::Search(const Napi::CallbackInfo& info) {
|
|||||||
int num_results = 0;
|
int num_results = 0;
|
||||||
SearchResult* results = search_index(this->index_, query.c_str(), cutoff, &num_results);
|
SearchResult* results = search_index(this->index_, query.c_str(), cutoff, &num_results);
|
||||||
|
|
||||||
if (!results) {
|
// If no results found, return empty array instead of throwing error
|
||||||
Napi::Error::New(env, "Search failed").ThrowAsJavaScriptException();
|
if (!results || num_results == 0) {
|
||||||
return env.Null();
|
return Napi::Array::New(env, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
Napi::Array result_array = Napi::Array::New(env, num_results);
|
Napi::Array result_array = Napi::Array::New(env, num_results);
|
||||||
|
|||||||
Reference in New Issue
Block a user