commit 51a3cc6c2df250624a05051cae74eeefbf0e1d23 Author: seb Date: Fri Apr 18 08:22:35 2025 +0200 genesis diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..13bcd74 --- /dev/null +++ b/.gitignore @@ -0,0 +1,25 @@ +# Node.js dependencies +node_modules/ +npm-debug.log +yarn-debug.log +yarn-error.log + +# Build outputs +build/ +*.node + +# Editor directories and files +.vscode/* +!.vscode/c_cpp_properties.json +.idea/ +*.swp +*.swo + +# OS generated files +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db \ No newline at end of file diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..a55b628 --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,18 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**", + "${workspaceFolder}/node_modules/node-addon-api", + "${workspaceFolder}/node_modules/nan" + ], + "defines": [], + "compilerPath": "/usr/bin/gcc", + "cStandard": "c11", + "cppStandard": "c++14", + "intelliSenseMode": "linux-gcc-x64" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..10309d2 --- /dev/null +++ b/README.md @@ -0,0 +1,119 @@ +# Similarity Search + +A Node.js module that performs word order independent similarity search on strings. + +This module is built as a native addon that uses C code for fast similarity computations. It uses Jaccard similarity between word sets to find matches regardless of word order. 
+ +## Installation + +```bash +npm install +``` + +## Usage + +```javascript +const SimilaritySearch = require('./index'); + +// Create a new search index with default capacity (500) +const index = new SimilaritySearch(); + +// Add strings to the index +index.addString('bio bizz'); +index.addString('lightmix bizz btio substrate'); +index.addString('bizz bio mix light'); + +// Add multiple strings at once +index.addStrings([ + 'plant growth bio formula', + 'garden soil substrate' +]); + +// Search the index with a query and similarity cutoff +const results = index.search('bio bizz', 0.2); + +// Display results +results.forEach(match => { + console.log(`${match.similarity.toFixed(2)}: ${match.string}`); +}); +``` + +## API + +### `new SimilaritySearch([capacity])` + +Creates a new search index. + +- `capacity` (optional): Initial capacity for the index. Default: 500. + +### `addString(str)` + +Adds a string to the index. + +- `str`: The string to add. +- Returns: Boolean indicating success. + +### `addStrings(strings)` + +Adds multiple strings to the index. + +- `strings`: Array of strings to add. +- Returns: Boolean indicating if all adds were successful. + +### `search(query, [cutoff])` + +Searches the index for strings similar to the query. + +- `query`: The search query. +- `cutoff` (optional): Similarity threshold between 0.0 and 1.0. Default: 0.2. +- Returns: Array of matching results, sorted by similarity (descending). + +### `size()` + +Gets the number of strings in the index. + +- Returns: Number of strings in the index. + +## Helper Functions + +### `SimilaritySearch.createTestIndex([size])` + +Creates a test index with random data. + +- `size` (optional): Number of strings to generate. Default: 500. +- Returns: A new SimilaritySearch instance with random data. + +### `SimilaritySearch.benchmark(index, queries, [cutoff])` + +Benchmarks the search performance. + +- `index`: The index to benchmark. +- `queries`: Array of search queries. 
/**
 * Generate an index filled with test data.
 *
 * The index starts with a fixed list of seed strings (useful as known
 * search targets) and is topped up with random strings until it holds
 * exactly `size` entries. Each random string has 2-6 lowercase words of
 * 3-10 letters.
 *
 * The index is never created with a capacity below the number of seed
 * strings. Previously a `size` smaller than the seed list produced an index
 * whose capacity was smaller than the number of strings added to it.
 *
 * @param {number} [size=500] - Total number of strings to put in the index.
 *   Must be a non-negative integer.
 * @returns {SimilaritySearch} - A new SimilaritySearch instance with test data
 * @throws {RangeError} If `size` is not a non-negative integer.
 */
SimilaritySearch.createTestIndex = function(size = 500) {
  if (!Number.isInteger(size) || size < 0) {
    throw new RangeError(`size must be a non-negative integer, got ${size}`);
  }

  // Fixed strings that make handy, predictable search targets.
  const seedStrings = [
    'bio bizz',
    'lightmix bizz btio substrate',
    'bizz bio mix light',
    'plant growth bio formula',
    'garden soil substrate',
  ];
  const alphabet = 'abcdefghijklmnopqrstuvwxyz';

  // Uniform integer in [min, max], both ends inclusive.
  const randomInt = (min, max) => min + Math.floor(Math.random() * (max - min + 1));

  const randomWord = () =>
    Array.from(
      { length: randomInt(3, 10) },
      () => alphabet.charAt(randomInt(0, alphabet.length - 1)),
    ).join('');

  const randomString = () =>
    Array.from({ length: randomInt(2, 6) }, () => randomWord()).join(' ');

  const index = new SimilaritySearch(Math.max(size, seedStrings.length));

  // Honour `size` even when it is smaller than the seed list.
  for (const seed of seedStrings.slice(0, size)) {
    index.addString(seed);
  }

  // Fill whatever room is left with random strings.
  for (let i = seedStrings.length; i < size; i++) {
    index.addString(randomString());
  }

  return index;
};
"6.1.0", + "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.1.0.tgz", + "integrity": "sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA==" + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..cf194dd --- /dev/null +++ b/package.json @@ -0,0 +1,23 @@ +{ + "name": "similarity-search", + "version": "1.0.0", + "description": "A Node.js module for word order independent string similarity search", + "main": "index.js", + "scripts": { + "install": "node-gyp rebuild", + "test": "node test.js" + }, + "keywords": [ + "search", + "similarity", + "string", + "fuzzy" + ], + "author": "", + "license": "MIT", + "dependencies": { + "nan": "^2.22.2", + "node-addon-api": "^6.0.0" + }, + "gypfile": true +} diff --git a/similarity_search b/similarity_search new file mode 100755 index 0000000..e4ebf0b Binary files /dev/null and b/similarity_search differ diff --git a/similarity_search.c b/similarity_search.c new file mode 100644 index 0000000..04d035a --- /dev/null +++ b/similarity_search.c @@ -0,0 +1,198 @@ +#include +#include +#include +#include +#include +#include "similarity_search.h" + +// Case insensitive string comparison +int str_case_cmp(const char *s1, const char *s2) { + while (*s1 && *s2) { + int c1 = tolower((unsigned char)*s1); + int c2 = tolower((unsigned char)*s2); + if (c1 != c2) { + return c1 - c2; + } + s1++; + s2++; + } + return tolower((unsigned char)*s1) - tolower((unsigned char)*s2); +} + +// Split a string into words +int split_into_words(const char *string, char *words[MAX_WORDS]) { + char temp[MAX_STRING_LEN]; + strcpy(temp, string); + + int word_count = 0; + char *token = strtok(temp, " \t\n"); + + while (token != NULL && word_count < MAX_WORDS) { + words[word_count] = strdup(token); + word_count++; + token = strtok(NULL, " \t\n"); + } + + return word_count; +} + +// Free memory allocated for words +void free_words(char *words[], int word_count) { + for 
(int i = 0; i < word_count; i++) { + free(words[i]); + } +} + +// Calculate similarity between query and target string +float calculate_similarity(const char *query, const char *target, float cutoff) { + // Split strings into words + char *query_words[MAX_WORDS] = {0}; + char *target_words[MAX_WORDS] = {0}; + + int query_word_count = split_into_words(query, query_words); + int target_word_count = split_into_words(target, target_words); + + if (query_word_count == 0 || target_word_count == 0) { + free_words(query_words, query_word_count); + free_words(target_words, target_word_count); + return 0.0; + } + + // Count matches + int matches = 0; + for (int i = 0; i < query_word_count; i++) { + for (int j = 0; j < target_word_count; j++) { + if (str_case_cmp(query_words[i], target_words[j]) == 0) { + matches++; + break; + } + } + } + + // Calculate Jaccard similarity (intersection over union) + float similarity = (float)matches / (query_word_count + target_word_count - matches); + + free_words(query_words, query_word_count); + free_words(target_words, target_word_count); + + return similarity; +} + +// Compare function for qsort to sort results by similarity (descending) +int compare_results(const void *a, const void *b) { + const SearchResult *result_a = (const SearchResult *)a; + const SearchResult *result_b = (const SearchResult *)b; + + if (result_b->similarity > result_a->similarity) return 1; + if (result_b->similarity < result_a->similarity) return -1; + return 0; +} + +// Generate a random word +void generate_random_word(char *word, int max_len) { + int len = 3 + rand() % 8; // Random length between 3 and 10 + for (int i = 0; i < len; i++) { + word[i] = 'a' + (rand() % 26); + } + word[len] = '\0'; +} + +// Generate a random string consisting of multiple words +void generate_random_string(char *string, int max_len) { + int num_words = 2 + rand() % 5; // Random number of words between 2 and 6 + string[0] = '\0'; + + for (int i = 0; i < num_words; i++) { + char 
word[20]; + generate_random_word(word, 10); + + // Check if there's enough space to add this word + if (strlen(string) + strlen(word) + 1 < (size_t)max_len) { + if (i > 0) strcat(string, " "); + strcat(string, word); + } else { + break; + } + } +} + +// Create a new search index +SearchIndex* create_search_index(int capacity) { + SearchIndex* index = (SearchIndex*)malloc(sizeof(SearchIndex)); + if (!index) return NULL; + + index->strings = (char**)malloc(capacity * sizeof(char*)); + if (!index->strings) { + free(index); + return NULL; + } + + index->num_strings = 0; + return index; +} + +// Add a string to the index +int add_string_to_index(SearchIndex* index, const char* string) { + if (!index || !string) return -1; + + index->strings[index->num_strings] = strdup(string); + if (!index->strings[index->num_strings]) return -1; + + index->num_strings++; + return 0; +} + +// Free the search index and all associated memory +void free_search_index(SearchIndex* index) { + if (!index) return; + + for (int i = 0; i < index->num_strings; i++) { + free(index->strings[i]); + } + + free(index->strings); + free(index); +} + +// Search the index with the given query and similarity cutoff +SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results) { + if (!index || !query || !num_results) return NULL; + + // Allocate temporary array for results + SearchResult* temp_results = (SearchResult*)malloc(index->num_strings * sizeof(SearchResult)); + if (!temp_results) return NULL; + + *num_results = 0; + + // Search through all strings in the index + for (int i = 0; i < index->num_strings; i++) { + float similarity = calculate_similarity(query, index->strings[i], cutoff); + + if (similarity >= cutoff) { + temp_results[*num_results].string = index->strings[i]; + temp_results[*num_results].similarity = similarity; + (*num_results)++; + } + } + + // Sort results by similarity + qsort(temp_results, *num_results, sizeof(SearchResult), compare_results); + 
#ifndef SIMILARITY_SEARCH_H
#define SIMILARITY_SEARCH_H

#ifdef __cplusplus
extern "C" {
#endif

#define MAX_STRING_LEN 100
// Maximum number of words taken from a single string when it is split;
// words beyond this limit are ignored.
#define MAX_WORDS 20

// Public API

// Structure representing the search index.
// `strings` holds heap-allocated copies of the indexed strings (owned by the
// index); `num_strings` is the number of entries currently stored.
typedef struct {
    char **strings;
    int num_strings;
} SearchIndex;

// Structure to hold a search result.
// `string` points at the index's own copy of the matched string: it is NOT
// owned by the result and must not be freed or used after the index is freed.
// `similarity` is the score computed for that string against the query.
typedef struct {
    const char *string;
    float similarity;
} SearchResult;

// Create a new search index with room for `capacity` string pointers.
// Returns NULL if memory cannot be allocated.
SearchIndex* create_search_index(int capacity);

// Add a copy of `string` to the index.
// Returns 0 on success, -1 if an argument is NULL or an allocation fails.
int add_string_to_index(SearchIndex* index, const char* string);

// Free the search index and all associated memory (every stored string, the
// pointer array and the index itself). Passing NULL is a no-op.
void free_search_index(SearchIndex* index);

// Search the index with the given query and similarity cutoff.
// Returns a heap-allocated array of SearchResult structs (not pointers)
// holding every indexed string whose similarity is >= cutoff, sorted by
// similarity in descending order; the number of entries is stored in
// *num_results. Returns NULL if an argument is NULL or an allocation fails.
// The array must be released by the caller with free_search_results().
SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results);

// Free the array returned by search_index(). Only the array itself is
// released; the strings it refers to stay owned by the index.
// `num_results` is currently unused.
void free_search_results(SearchResult* results, int num_results);

#ifdef __cplusplus
}
#endif

#endif /* SIMILARITY_SEARCH_H */
+class SearchIndexWrapper : public Napi::ObjectWrap { +public: + static Napi::Object Init(Napi::Env env, Napi::Object exports); + SearchIndexWrapper(const Napi::CallbackInfo& info); + ~SearchIndexWrapper(); + +private: + static Napi::FunctionReference constructor; + + Napi::Value AddString(const Napi::CallbackInfo& info); + Napi::Value Search(const Napi::CallbackInfo& info); + Napi::Value GetSize(const Napi::CallbackInfo& info); + + SearchIndex* index_; +}; + +Napi::FunctionReference SearchIndexWrapper::constructor; + +Napi::Object SearchIndexWrapper::Init(Napi::Env env, Napi::Object exports) { + Napi::HandleScope scope(env); + + Napi::Function func = DefineClass(env, "SearchIndex", { + InstanceMethod("addString", &SearchIndexWrapper::AddString), + InstanceMethod("search", &SearchIndexWrapper::Search), + InstanceMethod("size", &SearchIndexWrapper::GetSize) + }); + + constructor = Napi::Persistent(func); + constructor.SuppressDestruct(); + + exports.Set("SearchIndex", func); + return exports; +} + +SearchIndexWrapper::SearchIndexWrapper(const Napi::CallbackInfo& info) + : Napi::ObjectWrap(info) { + Napi::Env env = info.Env(); + Napi::HandleScope scope(env); + + int capacity = 500; // Default capacity + if (info.Length() > 0 && info[0].IsNumber()) { + capacity = info[0].As().Int32Value(); + } + + this->index_ = create_search_index(capacity); + if (!this->index_) { + Napi::Error::New(env, "Failed to create search index").ThrowAsJavaScriptException(); + } +} + +SearchIndexWrapper::~SearchIndexWrapper() { + free_search_index(this->index_); +} + +Napi::Value SearchIndexWrapper::AddString(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + Napi::HandleScope scope(env); + + if (info.Length() < 1 || !info[0].IsString()) { + Napi::TypeError::New(env, "String expected").ThrowAsJavaScriptException(); + return env.Null(); + } + + std::string str = info[0].As().Utf8Value(); + int result = add_string_to_index(this->index_, str.c_str()); + + return 
// Searches the index for strings similar to a query.
//
// JS signature: search(query[, cutoff])
//   query  - string to look for (required).
//   cutoff - optional number, minimum similarity a result must reach
//            (default 0.2).
//
// Returns a JS array of { string, similarity } objects in the order produced
// by search_index(). Throws a TypeError if `query` is not a string and an
// Error if the native search fails.
//
// No local Napi::HandleScope is opened here: the array is handed back to the
// caller, and a handle created inside a scope that closes before the return
// would no longer be guaranteed valid. The scope provided for the native
// callback is sufficient.
Napi::Value SearchIndexWrapper::Search(const Napi::CallbackInfo& info) {
  Napi::Env env = info.Env();

  if (info.Length() < 1 || !info[0].IsString()) {
    Napi::TypeError::New(env, "Query string expected").ThrowAsJavaScriptException();
    return env.Null();
  }

  const std::string query = info[0].As<Napi::String>().Utf8Value();

  float cutoff = 0.2f;  // Default cutoff
  if (info.Length() > 1 && info[1].IsNumber()) {
    cutoff = info[1].As<Napi::Number>().FloatValue();
  }

  // An empty index cannot match anything. Answering here also avoids asking
  // search_index() for zero-sized allocations, whose NULL result would be
  // indistinguishable from a real failure.
  if (this->index_ != nullptr && this->index_->num_strings == 0) {
    return Napi::Array::New(env, 0);
  }

  int num_results = 0;
  SearchResult* results = search_index(this->index_, query.c_str(), cutoff, &num_results);
  if (results == nullptr) {
    Napi::Error::New(env, "Search failed").ThrowAsJavaScriptException();
    return env.Null();
  }

  Napi::Array result_array = Napi::Array::New(env, num_results);
  for (int i = 0; i < num_results; i++) {
    Napi::Object entry = Napi::Object::New(env);
    // The string is copied into a JS string, so the result array can be
    // released as soon as the loop is done.
    entry.Set("string", Napi::String::New(env, results[i].string));
    entry.Set("similarity", Napi::Number::New(env, results[i].similarity));
    result_array[i] = entry;
  }

  free_search_results(results, num_results);

  return result_array;
}
+ 'plant growth', + 'garden mix', + 'random query' +]; + +console.log('\nRunning benchmark...'); +const benchmarkResults = SimilaritySearch.benchmark(index, queries); + +// Display results +console.log(`\nSearch results with cutoff: 0.2\n`); +benchmarkResults.forEach(result => { + console.log(`Query: "${result.query}"`); + console.log(`Found ${result.matches} matches in ${result.timeMs.toFixed(2)} ms`); + + // Display top results + result.topResults.forEach(match => { + console.log(` ${match.similarity.toFixed(2)}: ${match.string}`); + }); + console.log(''); +}); + +// Demonstrate creating a custom index +console.log('Creating a custom index...'); +const customIndex = new SimilaritySearch(); +customIndex.addString('bio bizz'); +customIndex.addString('lightmix bizz btio substrate'); +customIndex.addString('bizz bio mix light'); + +// Add multiple strings at once +customIndex.addStrings([ + 'plant growth bio formula', + 'garden soil substrate' +]); + +console.log(`Custom index created with ${customIndex.size()} strings`); + +// Search with a higher similarity threshold +console.log('\nSearching with higher similarity threshold (0.3):'); +const results = customIndex.search('bio bizz', 0.3); +results.forEach(match => { + console.log(` ${match.similarity.toFixed(2)}: ${match.string}`); +}); \ No newline at end of file