This commit is contained in:
seb
2025-04-18 08:22:35 +02:00
commit 51a3cc6c2d
13 changed files with 794 additions and 0 deletions

25
.gitignore vendored Normal file
View File

@@ -0,0 +1,25 @@
# Node.js dependencies
node_modules/
npm-debug.log
yarn-debug.log
yarn-error.log
# Build outputs
build/
*.node
# Editor directories and files
.vscode/*
!.vscode/c_cpp_properties.json
.idea/
*.swp
*.swo
# OS generated files
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

18
.vscode/c_cpp_properties.json vendored Normal file
View File

@@ -0,0 +1,18 @@
{
"configurations": [
{
"name": "Linux",
"includePath": [
"${workspaceFolder}/**",
"${workspaceFolder}/node_modules/node-addon-api",
"${workspaceFolder}/node_modules/nan"
],
"defines": [],
"compilerPath": "/usr/bin/gcc",
"cStandard": "c11",
"cppStandard": "c++14",
"intelliSenseMode": "linux-gcc-x64"
}
],
"version": 4
}

119
README.md Normal file
View File

@@ -0,0 +1,119 @@
# Similarity Search
A Node.js module that performs word-order-independent similarity search on strings.
This module is built as a native addon that uses C code for fast similarity computations. It uses Jaccard similarity between word sets to find matches regardless of word order.
## Installation
```bash
npm install
```
## Usage
```javascript
const SimilaritySearch = require('./index');
// Create a new search index with default capacity (500)
const index = new SimilaritySearch();
// Add strings to the index
index.addString('bio bizz');
index.addString('lightmix bizz btio substrate');
index.addString('bizz bio mix light');
// Add multiple strings at once
index.addStrings([
'plant growth bio formula',
'garden soil substrate'
]);
// Search the index with a query and similarity cutoff
const results = index.search('bio bizz', 0.2);
// Display results
results.forEach(match => {
console.log(`${match.similarity.toFixed(2)}: ${match.string}`);
});
```
## API
### `new SimilaritySearch([capacity])`
Creates a new search index.
- `capacity` (optional): Initial capacity for the index. Default: 500.
### `addString(str)`
Adds a string to the index.
- `str`: The string to add.
- Returns: Boolean indicating success.
### `addStrings(strings)`
Adds multiple strings to the index.
- `strings`: Array of strings to add.
- Returns: Boolean indicating if all adds were successful.
### `search(query, [cutoff])`
Searches the index for strings similar to the query.
- `query`: The search query.
- `cutoff` (optional): Similarity threshold between 0.0 and 1.0. Default: 0.2.
- Returns: Array of `{ string, similarity }` objects, sorted by similarity (descending).
### `size()`
Gets the number of strings in the index.
- Returns: Number of strings in the index.
## Helper Functions
### `SimilaritySearch.createTestIndex([size])`
Creates a test index with random data.
- `size` (optional): Number of strings to generate. Default: 500.
- Returns: A new SimilaritySearch instance with random data.
### `SimilaritySearch.benchmark(index, queries, [cutoff])`
Benchmarks the search performance.
- `index`: The index to benchmark.
- `queries`: Array of search queries.
- `cutoff` (optional): Similarity threshold. Default: 0.2.
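- Returns: Array with one `{ query, matches, timeMs, topResults }` entry per query, where `matches` is the number of hits, `timeMs` is the search time in milliseconds, and `topResults` holds up to the five best matches.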
## How It Works
The similarity search uses Jaccard similarity between word sets:
```
similarity = (number of matching words) / (total unique words)
```
This means word order doesn't matter - "bio bizz" will match with "bizz bio" with 100% similarity.
## Building
To rebuild the native addon:
```bash
npm install
```
## Testing
Run the test script:
```bash
npm test
```

BIN
a.out Executable file

Binary file not shown.

25
binding.gyp Normal file
View File

@@ -0,0 +1,25 @@
{
"targets": [
{
"target_name": "similarity_search_addon",
"sources": [
"similarity_search.c",
"similarity_search_addon.cc"
],
"include_dirs": [
"<!@(node -p \"require('node-addon-api').include\")",
"<!(node -p \"require('node-addon-api').include_dir\")",
"<!(node -e \"require('nan')\")"
],
"dependencies": [
"<!(node -p \"require('node-addon-api').gyp\")"
],
"cflags!": [ "-fno-exceptions" ],
"cflags_cc!": [ "-fno-exceptions" ],
"defines": [ "NAPI_DISABLE_CPP_EXCEPTIONS" ],
"xcode_settings": {
"GCC_ENABLE_CPP_EXCEPTIONS": "YES"
}
}
]
}

137
index.js Normal file
View File

@@ -0,0 +1,137 @@
const addon = require('./build/Release/similarity_search_addon');
/**
 * Word-order-independent string similarity index.
 *
 * Thin JavaScript facade over the native `SearchIndex` object exposed by the
 * addon; every method forwards to the native instance held in `this.index`.
 */
class SimilaritySearch {
  /**
   * Build an index backed by a fresh native `SearchIndex`.
   *
   * @param {number} [capacity=500] - Initial capacity handed to the native index
   */
  constructor(capacity = 500) {
    this.index = new addon.SearchIndex(capacity);
  }

  /**
   * Insert one string into the index.
   *
   * @param {string} str - The string to add
   * @returns {boolean} - True when the native call reports status 0
   */
  addString(str) {
    const status = this.index.addString(str);
    return status === 0;
  }

  /**
   * Insert several strings into the index. Every string is attempted, even
   * after an earlier one has failed.
   *
   * @param {string[]} strings - Strings to add
   * @returns {boolean} - True only if every insert reported status 0
   */
  addStrings(strings) {
    let failures = 0;
    for (const entry of strings) {
      const status = this.index.addString(entry);
      if (status !== 0) {
        failures += 1;
      }
    }
    return failures === 0;
  }

  /**
   * Look up strings similar to the query.
   *
   * @param {string} query - The search query
   * @param {number} [cutoff=0.2] - Similarity threshold (0.0 to 1.0)
   * @returns {Array<{string: string, similarity: number}>} - Matches as returned by the native search
   */
  search(query, cutoff = 0.2) {
    const matches = this.index.search(query, cutoff);
    return matches;
  }

  /**
   * Report how many strings the index currently holds.
   *
   * @returns {number} - Number of strings in the index
   */
  size() {
    const count = this.index.size();
    return count;
  }
}
// Add some functions for convenience
/**
 * Generate an index pre-populated with test data.
 *
 * The first entries are a fixed set of known strings (so that searches for
 * e.g. "bio bizz" have predictable hits); the remainder are random lowercase
 * strings of 2-6 words with 3-10 letters each. At most `size` strings are
 * added, so the index is never filled beyond the capacity it was created with.
 *
 * @param {number} [size=500] - Number of strings to generate
 * @returns {SimilaritySearch} - A new SimilaritySearch instance with random data
 */
SimilaritySearch.createTestIndex = function(size = 500) {
  const seedStrings = [
    "bio bizz",
    "lightmix bizz btio substrate",
    "bizz bio mix light",
    "plant growth bio formula",
    "garden soil substrate",
  ];
  const alphabet = 'abcdefghijklmnopqrstuvwxyz';

  // Random word made of `len` lowercase letters
  function randomWord(len) {
    let word = '';
    for (let i = 0; i < len; i++) {
      word += alphabet.charAt(Math.floor(Math.random() * alphabet.length));
    }
    return word;
  }

  // Random string of 2-6 space-separated words, each 3-10 characters long
  function randomString() {
    const numWords = 2 + Math.floor(Math.random() * 5);
    const words = [];
    for (let i = 0; i < numWords; i++) {
      words.push(randomWord(3 + Math.floor(Math.random() * 8)));
    }
    return words.join(' ');
  }

  const index = new SimilaritySearch(size);

  // The index is created with room for `size` strings, so never insert more
  // than that: for size < 5 only the first `size` fixed strings are used.
  const total = Math.max(0, Math.floor(size));
  const seeds = seedStrings.slice(0, total);
  for (const seed of seeds) {
    index.addString(seed);
  }

  // Fill the remaining slots with random strings
  for (let i = seeds.length; i < total; i++) {
    index.addString(randomString());
  }
  return index;
};
/**
 * Time each query against an index.
 *
 * Queries are run one after another; each is timed with the high-resolution
 * nanosecond clock and reported in milliseconds.
 *
 * @param {SimilaritySearch} index - The index to benchmark
 * @param {string[]} queries - Array of search queries
 * @param {number} [cutoff=0.2] - Similarity threshold to use
 * @returns {Object} - One entry per query: the query, the number of matches,
 *   the elapsed time in ms, and the first five matches
 */
SimilaritySearch.benchmark = function(index, queries, cutoff = 0.2) {
  const NS_PER_MS = 1000000;
  const report = [];

  for (const query of queries) {
    const startedAt = process.hrtime.bigint();
    const matches = index.search(query, cutoff);
    const elapsedNs = process.hrtime.bigint() - startedAt;

    report.push({
      query,
      matches: matches.length,
      timeMs: Number(elapsedNs) / NS_PER_MS,
      topResults: matches.slice(0, 5),
    });
  }

  return report;
};
module.exports = SimilaritySearch;

28
package-lock.json generated Normal file
View File

@@ -0,0 +1,28 @@
{
"name": "similarity-search",
"version": "1.0.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "similarity-search",
"version": "1.0.0",
"hasInstallScript": true,
"license": "MIT",
"dependencies": {
"nan": "^2.22.2",
"node-addon-api": "^6.0.0"
}
},
"node_modules/nan": {
"version": "2.22.2",
"resolved": "https://registry.npmjs.org/nan/-/nan-2.22.2.tgz",
"integrity": "sha512-DANghxFkS1plDdRsX0X9pm0Z6SJNN6gBdtXfanwoZ8hooC5gosGFSBGRYHUVPz1asKA/kMRqDRdHrluZ61SpBQ=="
},
"node_modules/node-addon-api": {
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.1.0.tgz",
"integrity": "sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA=="
}
}
}

23
package.json Normal file
View File

@@ -0,0 +1,23 @@
{
"name": "similarity-search",
"version": "1.0.0",
"description": "A Node.js module for word order independent string similarity search",
"main": "index.js",
"scripts": {
"install": "node-gyp rebuild",
"test": "node test.js"
},
"keywords": [
"search",
"similarity",
"string",
"fuzzy"
],
"author": "",
"license": "MIT",
"dependencies": {
"nan": "^2.22.2",
"node-addon-api": "^6.0.0"
},
"gypfile": true
}

BIN
similarity_search Executable file

Binary file not shown.

198
similarity_search.c Normal file
View File

@@ -0,0 +1,198 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <ctype.h>
#include "similarity_search.h"
// Compare two NUL-terminated strings ignoring case.
// Returns 0 when equal, otherwise the difference between the first pair of
// lower-cased characters that differ (negative if s1 sorts first).
int str_case_cmp(const char *s1, const char *s2) {
    const unsigned char *a = (const unsigned char *)s1;
    const unsigned char *b = (const unsigned char *)s2;

    for (;; a++, b++) {
        int diff = tolower(*a) - tolower(*b);
        // Stop at the first difference, or when both strings end together
        if (diff != 0 || *a == '\0') {
            return diff;
        }
    }
}
// Split a string into words separated by spaces, tabs or newlines.
//
// Each word is copied into a freshly allocated buffer and stored in `words`;
// the caller releases them with free_words(). At most MAX_WORDS words are
// extracted; any further words are ignored. The input is scanned in place, so
// strings of any length are handled (no fixed-size scratch copy) and the
// function does not touch strtok()'s hidden state.
//
// Returns the number of words stored. If an allocation fails, splitting stops
// and the words collected so far are returned.
int split_into_words(const char *string, char *words[MAX_WORDS]) {
    static const char delims[] = " \t\n";
    int word_count = 0;

    if (!string) return 0;

    const char *cursor = string;
    while (word_count < MAX_WORDS) {
        // Skip leading delimiters; runs of delimiters never produce empty words
        cursor += strspn(cursor, delims);
        if (*cursor == '\0') break;

        size_t len = strcspn(cursor, delims);
        char *word = (char *)malloc(len + 1);
        if (!word) break;

        memcpy(word, cursor, len);
        word[len] = '\0';
        words[word_count++] = word;
        cursor += len;
    }
    return word_count;
}
// Release the first `word_count` heap-allocated words of the array.
// The array itself is not freed and its entries are left unchanged.
void free_words(char *words[], int word_count) {
    char **slot = words;
    for (int remaining = word_count; remaining > 0; remaining--) {
        free(*slot);
        slot++;
    }
}
// Remove case-insensitive duplicates from words[0..count) in place, freeing
// the dropped copies. Returns the number of unique words, which are left at
// the front of the array; the vacated tail slots are set to NULL.
static int dedupe_words(char *words[], int count) {
    int unique = 0;
    for (int i = 0; i < count; i++) {
        int seen = 0;
        for (int j = 0; j < unique && !seen; j++) {
            seen = (str_case_cmp(words[i], words[j]) == 0);
        }
        if (seen) {
            free(words[i]);
        } else {
            words[unique++] = words[i];
        }
    }
    for (int k = unique; k < count; k++) {
        words[k] = NULL;
    }
    return unique;
}

// Calculate the Jaccard similarity between the word sets of query and target.
//
// Both strings are split into words and compared case-insensitively. Each
// string is treated as a *set* of words: repeated words are collapsed first,
// so the result is always within [0.0, 1.0] (without this, "bio bio" against
// "bio" would score 2.0).
//
// `cutoff` is accepted for interface compatibility but is not used here; the
// threshold is applied by the caller.
//
// Returns |intersection| / |union|, or 0.0 if either string has no words.
float calculate_similarity(const char *query, const char *target, float cutoff) {
    (void)cutoff;

    char *query_words[MAX_WORDS] = {0};
    char *target_words[MAX_WORDS] = {0};
    int query_count = dedupe_words(query_words, split_into_words(query, query_words));
    int target_count = dedupe_words(target_words, split_into_words(target, target_words));

    float similarity = 0.0f;
    if (query_count > 0 && target_count > 0) {
        // Count words present in both sets
        int shared = 0;
        for (int i = 0; i < query_count; i++) {
            for (int j = 0; j < target_count; j++) {
                if (str_case_cmp(query_words[i], target_words[j]) == 0) {
                    shared++;
                    break;
                }
            }
        }
        // |A ∪ B| = |A| + |B| - |A ∩ B|; never zero because both sets are non-empty
        similarity = (float)shared / (float)(query_count + target_count - shared);
    }

    free_words(query_words, query_count);
    free_words(target_words, target_count);
    return similarity;
}
// qsort comparator ordering SearchResult entries by similarity, highest first.
// Returns 1 when b ranks above a, -1 when a ranks above b, 0 otherwise.
int compare_results(const void *a, const void *b) {
    float lhs = ((const SearchResult *)a)->similarity;
    float rhs = ((const SearchResult *)b)->similarity;
    return (rhs > lhs) - (rhs < lhs);
}
// Generate a random word of lowercase letters.
//
// The word length is drawn between 3 and 10 and then capped at `max_len`, so
// `word` must have room for at least max_len + 1 bytes (including the
// terminating NUL). Does nothing if `word` is NULL or `max_len` is negative.
void generate_random_word(char *word, int max_len) {
    if (!word || max_len < 0) return;

    int len = 3 + rand() % 8; // Random length between 3 and 10
    if (len > max_len) {
        // Honour the caller's limit instead of writing past the buffer
        len = max_len;
    }
    for (int i = 0; i < len; i++) {
        word[i] = (char)('a' + (rand() % 26));
    }
    word[len] = '\0';
}
// Fill `string` with 2 to 6 random words separated by single spaces.
// Words are appended only while the result (plus separator and terminator)
// still fits within `max_len` bytes; generation stops at the first word that
// does not fit.
void generate_random_string(char *string, int max_len) {
    const int word_total = 2 + rand() % 5; // Between 2 and 6 words
    size_t used = 0;                       // Characters written so far

    string[0] = '\0';
    for (int n = 0; n < word_total; n++) {
        char piece[20];
        generate_random_word(piece, 10);
        size_t piece_len = strlen(piece);

        // Same space check as before: current length + word + 1 must stay below max_len
        if (used + piece_len + 1 >= (size_t)max_len) {
            return;
        }
        if (n > 0) {
            string[used++] = ' ';
        }
        memcpy(string + used, piece, piece_len + 1); // copies the NUL too
        used += piece_len;
    }
}
// Create a new, empty search index with room for `capacity` strings.
//
// A capacity of zero or less is treated as 1: malloc(0) is allowed to return
// NULL, which would be misreported as an allocation failure, and a negative
// value would wrap around to an enormous allocation size.
//
// Returns the new index, or NULL if memory could not be allocated. Release it
// with free_search_index().
SearchIndex* create_search_index(int capacity) {
    size_t slots = capacity > 0 ? (size_t)capacity : 1;

    SearchIndex *index = (SearchIndex *)malloc(sizeof *index);
    if (!index) return NULL;

    // calloc checks slots * element size for overflow and zeroes the slots
    index->strings = (char **)calloc(slots, sizeof *index->strings);
    if (!index->strings) {
        free(index);
        return NULL;
    }
    index->num_strings = 0;
    return index;
}
// Add a copy of `string` to the index.
//
// SearchIndex does not record how many slots were allocated for it, so the
// storage size is derived from num_strings instead: whenever the current count
// is 0 or a power of two, the array is resized to hold twice that many
// entries (1 for an empty index). This keeps inserts amortised O(1) and
// guarantees that a write never lands past the end of the array, regardless
// of the capacity the index was created with.
//
// Returns 0 on success, -1 on invalid arguments or allocation failure (the
// index is left unchanged in that case).
int add_string_to_index(SearchIndex* index, const char* string) {
    if (!index || !index->strings || !string) return -1;
    if (index->num_strings < 0) return -1;

    size_t count = (size_t)index->num_strings;

    // (count & (count - 1)) == 0 is true exactly for 0 and powers of two
    if ((count & (count - 1)) == 0) {
        size_t new_slots = count ? count * 2 : 1;
        char **grown = (char **)realloc(index->strings, new_slots * sizeof *grown);
        if (!grown) return -1; // original array is still valid
        index->strings = grown;
    }

    char *copy = strdup(string);
    if (!copy) return -1;

    index->strings[count] = copy;
    index->num_strings++;
    return 0;
}
// Destroy an index: releases every stored string, the string array and the
// index structure itself. Passing NULL is a no-op.
void free_search_index(SearchIndex* index) {
    if (index == NULL) {
        return;
    }

    int remaining = index->num_strings;
    while (remaining-- > 0) {
        free(index->strings[remaining]);
    }
    free(index->strings);
    free(index);
}
// Search the index with the given query and similarity cutoff.
//
// Every string in the index is scored against `query`; those whose similarity
// is >= `cutoff` are returned, sorted by descending similarity. The number of
// matches is written to *num_results.
//
// Returns a heap-allocated array that the caller must release with
// free_search_results(). The `string` field of each entry points at the
// index's own copy of the string, not a duplicate. The return value is
// non-NULL on success even when nothing matched (with *num_results == 0);
// NULL is returned only for invalid arguments or allocation failure.
SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results) {
    if (!index || !query || !num_results) return NULL;
    *num_results = 0;

    // Always request at least one slot: malloc(0) may return NULL, which
    // callers would mistake for a failed search on an empty index.
    size_t slots = index->num_strings > 0 ? (size_t)index->num_strings : 1;
    SearchResult *results = (SearchResult *)malloc(slots * sizeof *results);
    if (!results) return NULL;

    // Score every string and keep those at or above the cutoff
    int found = 0;
    for (int i = 0; i < index->num_strings; i++) {
        float similarity = calculate_similarity(query, index->strings[i], cutoff);
        if (similarity >= cutoff) {
            results[found].string = index->strings[i];
            results[found].similarity = similarity;
            found++;
        }
    }

    // Sort results by similarity
    if (found > 1) {
        qsort(results, (size_t)found, sizeof *results, compare_results);
    }

    // Give back the unused tail. If shrinking fails the original, larger
    // buffer is still valid, so it is simply kept.
    size_t keep = found > 0 ? (size_t)found : 1;
    if (keep < slots) {
        SearchResult *shrunk = (SearchResult *)realloc(results, keep * sizeof *results);
        if (shrunk) results = shrunk;
    }

    *num_results = found;
    return results;
}
// Release a result array returned by search_index().
// Only the array is freed; `num_results` is not needed for that and is ignored.
void free_search_results(SearchResult* results, int num_results) {
    (void)num_results;
    free(results);
}

45
similarity_search.h Normal file
View File

@@ -0,0 +1,45 @@
#ifndef SIMILARITY_SEARCH_H
#define SIMILARITY_SEARCH_H
#ifdef __cplusplus
extern "C" {
#endif
#define MAX_STRING_LEN 100
#define MAX_WORDS 20
// Public API
// Structure representing the search index
typedef struct {
// Heap-allocated array of string pointers. Each entry is a private copy made
// by add_string_to_index() and released by free_search_index().
char **strings;
// Number of entries of `strings` currently in use
int num_strings;
} SearchIndex;
// Structure to hold a search result
typedef struct {
// Matched string. Points at the index's own copy (it is not duplicated) and
// is not freed by free_search_results(), so it is only valid while the
// index it came from is alive.
const char *string;
// Similarity score of this string for the query
float similarity;
} SearchResult;
// Create a new search index with `capacity` string slots allocated up front.
// Returns NULL if memory allocation fails.
SearchIndex* create_search_index(int capacity);
// Add a copy of `string` to the index.
// Returns 0 on success, -1 if an argument is NULL or the copy cannot be allocated.
int add_string_to_index(SearchIndex* index, const char* string);
// Free the search index and all associated memory. Passing NULL is a no-op.
void free_search_index(SearchIndex* index);
// Search the index with the given query and similarity cutoff.
// Returns a heap-allocated array of SearchResult structs, sorted by descending
// similarity, and stores the number of entries in *num_results. The array must
// be released by the caller with free_search_results().
// Returns NULL if an argument is NULL or memory allocation fails.
SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results);
// Free a result array returned by search_index(). The strings the results
// point to are left untouched; `num_results` is currently unused.
void free_search_results(SearchResult* results, int num_results);
#ifdef __cplusplus
}
#endif
#endif /* SIMILARITY_SEARCH_H */

123
similarity_search_addon.cc Normal file
View File

@@ -0,0 +1,123 @@
#include <napi.h>
#include <string>
#include "similarity_search.h"
// JavaScript-visible wrapper around the C SearchIndex.
// Each JS `SearchIndex` object owns one native index, created in the
// constructor and released in the destructor.
class SearchIndexWrapper : public Napi::ObjectWrap<SearchIndexWrapper> {
public:
// Defines the JS class and attaches it to `exports` as "SearchIndex".
static Napi::Object Init(Napi::Env env, Napi::Object exports);
// JS constructor; accepts an optional numeric capacity (default 500).
SearchIndexWrapper(const Napi::CallbackInfo& info);
// Frees the wrapped native index.
~SearchIndexWrapper();
private:
// Persistent reference to the JS constructor function created by Init().
static Napi::FunctionReference constructor;
// addString(str): returns the native status code as a number.
Napi::Value AddString(const Napi::CallbackInfo& info);
// search(query[, cutoff]): returns an array of { string, similarity } objects.
Napi::Value Search(const Napi::CallbackInfo& info);
// size(): returns the number of strings stored in the index.
Napi::Value GetSize(const Napi::CallbackInfo& info);
// The wrapped native index.
SearchIndex* index_;
};
// Storage for the persistent reference to the JS constructor.
Napi::FunctionReference SearchIndexWrapper::constructor;

// Define the JS class with its three instance methods, keep a persistent
// reference to its constructor, and publish it on `exports`.
Napi::Object SearchIndexWrapper::Init(Napi::Env env, Napi::Object exports) {
    Napi::HandleScope scope(env);

    static const char kClassName[] = "SearchIndex";
    Napi::Function ctor = DefineClass(env, kClassName, {
        InstanceMethod("addString", &SearchIndexWrapper::AddString),
        InstanceMethod("search", &SearchIndexWrapper::Search),
        InstanceMethod("size", &SearchIndexWrapper::GetSize)
    });

    constructor = Napi::Persistent(ctor);
    constructor.SuppressDestruct();

    exports.Set(kClassName, ctor);
    return exports;
}
// JS constructor: new SearchIndex([capacity]).
// Uses the first argument as capacity when it is a number, otherwise 500, and
// raises a JS Error if the native index cannot be created.
SearchIndexWrapper::SearchIndexWrapper(const Napi::CallbackInfo& info)
    : Napi::ObjectWrap<SearchIndexWrapper>(info), index_(nullptr) {
    Napi::Env env = info.Env();
    Napi::HandleScope scope(env);

    const bool has_capacity = info.Length() > 0 && info[0].IsNumber();
    const int capacity = has_capacity
        ? info[0].As<Napi::Number>().Int32Value()
        : 500; // Default capacity

    index_ = create_search_index(capacity);
    if (index_ == nullptr) {
        Napi::Error::New(env, "Failed to create search index").ThrowAsJavaScriptException();
    }
}
// Release the native index owned by this wrapper.
SearchIndexWrapper::~SearchIndexWrapper() {
    free_search_index(index_);
}
// JS method addString(str).
// Throws a TypeError (and returns null) unless the first argument is a
// string; otherwise adds it to the native index and returns the native
// status code as a number.
Napi::Value SearchIndexWrapper::AddString(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();
    Napi::HandleScope scope(env);

    const bool has_string = info.Length() >= 1 && info[0].IsString();
    if (!has_string) {
        Napi::TypeError::New(env, "String expected").ThrowAsJavaScriptException();
        return env.Null();
    }

    const std::string text = info[0].As<Napi::String>().Utf8Value();
    const int status = add_string_to_index(index_, text.c_str());
    return Napi::Number::New(env, status);
}
// JS method search(query[, cutoff]).
// Throws a TypeError unless the first argument is a string. The cutoff is
// taken from the second argument when it is a number, otherwise 0.2.
// Returns a JS array of { string, similarity } objects in the order produced
// by the native search, or throws an Error if the native search fails.
Napi::Value SearchIndexWrapper::Search(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();
    Napi::HandleScope scope(env);

    const bool has_query = info.Length() >= 1 && info[0].IsString();
    if (!has_query) {
        Napi::TypeError::New(env, "Query string expected").ThrowAsJavaScriptException();
        return env.Null();
    }
    const std::string query = info[0].As<Napi::String>().Utf8Value();

    const bool has_cutoff = info.Length() > 1 && info[1].IsNumber();
    const float cutoff = has_cutoff
        ? info[1].As<Napi::Number>().FloatValue()
        : 0.2f; // Default cutoff

    int match_count = 0;
    SearchResult* matches = search_index(index_, query.c_str(), cutoff, &match_count);
    if (matches == nullptr) {
        Napi::Error::New(env, "Search failed").ThrowAsJavaScriptException();
        return env.Null();
    }

    // Copy the native results into JS objects before releasing the native array
    Napi::Array out = Napi::Array::New(env, match_count);
    for (int pos = 0; pos < match_count; ++pos) {
        const SearchResult& hit = matches[pos];
        Napi::Object entry = Napi::Object::New(env);
        entry.Set("string", Napi::String::New(env, hit.string));
        entry.Set("similarity", Napi::Number::New(env, hit.similarity));
        out[pos] = entry;
    }

    free_search_results(matches, match_count);
    return out;
}
// JS method size(): number of strings currently stored in the native index.
Napi::Value SearchIndexWrapper::GetSize(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();
    Napi::HandleScope scope(env);

    const int count = index_->num_strings;
    return Napi::Number::New(env, count);
}
// Module entry point: registers the SearchIndex class on the addon's exports.
Napi::Object Init(Napi::Env env, Napi::Object exports) {
return SearchIndexWrapper::Init(env, exports);
}
// Registers Init as the initializer of the "similarity_search_addon" module.
NODE_API_MODULE(similarity_search_addon, Init)

53
test.js Normal file
View File

@@ -0,0 +1,53 @@
const SimilaritySearch = require('./index');
// Print each match on its own line as "<similarity>: <string>"
const printMatches = (matches) => {
  for (const { similarity, string } of matches) {
    console.log(` ${similarity.toFixed(2)}: ${string}`);
  }
};

// --- Part 1: benchmark a generated index of 500 strings ---
console.log('Creating test index with 500 strings...');
const index = SimilaritySearch.createTestIndex(500);
console.log(`Index created with ${index.size()} strings`);

// Queries exercised by the benchmark
const queries = [
  'bio bizz',
  'substrate light',
  'plant growth',
  'garden mix',
  'random query',
];

console.log('\nRunning benchmark...');
const benchmarkResults = SimilaritySearch.benchmark(index, queries);

// Report the hit count, timing and top matches for every query
console.log(`\nSearch results with cutoff: 0.2\n`);
for (const { query, matches, timeMs, topResults } of benchmarkResults) {
  console.log(`Query: "${query}"`);
  console.log(`Found ${matches} matches in ${timeMs.toFixed(2)} ms`);
  printMatches(topResults);
  console.log('');
}

// --- Part 2: build a small index by hand ---
console.log('Creating a custom index...');
const customIndex = new SimilaritySearch();
const singleEntries = [
  'bio bizz',
  'lightmix bizz btio substrate',
  'bizz bio mix light',
];
for (const entry of singleEntries) {
  customIndex.addString(entry);
}
// The remaining entries go in through the bulk API
customIndex.addStrings([
  'plant growth bio formula',
  'garden soil substrate',
]);
console.log(`Custom index created with ${customIndex.size()} strings`);

// Query it with a stricter threshold than the default
console.log('\nSearching with higher similarity threshold (0.3):');
const results = customIndex.search('bio bizz', 0.3);
printMatches(results);