genesis

2025-04-18 08:22:35 +02:00
commit 51a3cc6c2d
13 changed files with 794 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,25 @@
+# Node.js dependencies
+node_modules/
+npm-debug.log
+yarn-debug.log
+yarn-error.log
+
+# Build outputs
+build/
+*.node
+
+# Editor directories and files
+.vscode/*
+!.vscode/c_cpp_properties.json
+.idea/
+*.swp
+*.swo
+
+# OS generated files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db 
--- a/.vscode/c_cpp_properties.json
+++ b/.vscode/c_cpp_properties.json
@@ -0,0 +1,18 @@
+{
+    "configurations": [
+        {
+            "name": "Linux",
+            "includePath": [
+                "${workspaceFolder}/**",
+                "${workspaceFolder}/node_modules/node-addon-api",
+                "${workspaceFolder}/node_modules/nan"
+            ],
+            "defines": [],
+            "compilerPath": "/usr/bin/gcc",
+            "cStandard": "c11",
+            "cppStandard": "c++14",
+            "intelliSenseMode": "linux-gcc-x64"
+        }
+    ],
+    "version": 4
+} 
--- a/README.md
+++ b/README.md
@@ -0,0 +1,119 @@
+# Similarity Search
+
+A Node.js module that performs word order independent similarity search on strings.
+
+This module is built as a native addon that uses C code for fast similarity computations. It uses Jaccard similarity between word sets to find matches regardless of word order.
+
+## Installation
+
+```bash
+npm install
+```
+
+## Usage
+
+```javascript
+const SimilaritySearch = require('./index');
+
+// Create a new search index with default capacity (500)
+const index = new SimilaritySearch();
+
+// Add strings to the index
+index.addString('bio bizz');
+index.addString('lightmix bizz btio substrate');
+index.addString('bizz bio mix light');
+
+// Add multiple strings at once
+index.addStrings([
+  'plant growth bio formula',
+  'garden soil substrate'
+]);
+
+// Search the index with a query and similarity cutoff
+const results = index.search('bio bizz', 0.2);
+
+// Display results
+results.forEach(match => {
+  console.log(`${match.similarity.toFixed(2)}: ${match.string}`);
+});
+```
+
+## API
+
+### `new SimilaritySearch([capacity])`
+
+Creates a new search index.
+
+- `capacity` (optional): Initial capacity for the index. Default: 500.
+
+### `addString(str)`
+
+Adds a string to the index.
+
+- `str`: The string to add.
+- Returns: Boolean indicating success.
+
+### `addStrings(strings)`
+
+Adds multiple strings to the index.
+
+- `strings`: Array of strings to add.
+- Returns: Boolean indicating if all adds were successful.
+
+### `search(query, [cutoff])`
+
+Searches the index for strings similar to the query.
+
+- `query`: The search query.
+- `cutoff` (optional): Similarity threshold between 0.0 and 1.0. Default: 0.2.
+- Returns: Array of matching results, sorted by similarity (descending).
+
+### `size()`
+
+Gets the number of strings in the index.
+
+- Returns: Number of strings in the index.
+
+## Helper Functions
+
+### `SimilaritySearch.createTestIndex([size])`
+
+Creates a test index with random data.
+
+- `size` (optional): Total number of strings to put in the index, including the five built-in sample strings; the remaining strings are random. Default: 500.
+- Returns: A new SimilaritySearch instance with random data.
+
+### `SimilaritySearch.benchmark(index, queries, [cutoff])`
+
+Benchmarks the search performance.
+
+- `index`: The index to benchmark.
+- `queries`: Array of search queries.
+- `cutoff` (optional): Similarity threshold. Default: 0.2.
+- Returns: Benchmark results.
+
+## How It Works
+
+The similarity search uses Jaccard similarity between word sets:
+
+```
+similarity = (number of matching words) / (total unique words)
+```
+
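+Strings are split into words on whitespace, and words are compared case-insensitively. Because only the words themselves are compared, word order doesn't matter: "bio bizz" matches "bizz bio" with a similarity of 1.0 (100%).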
+
+## Building
+
+To rebuild the native addon:
+
+```bash
+npm install
+```
+
+## Testing
+
+Run the test script:
+
+```bash
+npm test
+``` 
--- a/a.out
+++ b/a.out
--- a/binding.gyp
+++ b/binding.gyp
@@ -0,0 +1,25 @@
+{
+  "targets": [
+    {
+      "target_name": "similarity_search_addon",
+      "sources": [ 
+        "similarity_search.c",
+        "similarity_search_addon.cc" 
+      ],
+      "include_dirs": [
+        "<!@(node -p \"require('node-addon-api').include\")",
+        "<!(node -p \"require('node-addon-api').include_dir\")",
+        "<!(node -e \"require('nan')\")"
+      ],
+      "dependencies": [
+        "<!(node -p \"require('node-addon-api').gyp\")"
+      ],
+      "cflags!": [ "-fno-exceptions" ],
+      "cflags_cc!": [ "-fno-exceptions" ],
+      "defines": [ "NAPI_DISABLE_CPP_EXCEPTIONS" ],
+      "xcode_settings": {
+        "GCC_ENABLE_CPP_EXCEPTIONS": "YES"
+      }
+    }
+  ]
+} 
--- a/index.js
+++ b/index.js
@@ -0,0 +1,137 @@
+const addon = require('./build/Release/similarity_search_addon');
+
+/**
+ * A string similarity search index
+ */
+class SimilaritySearch {
+  /**
+   * Create a new SimilaritySearch instance
+   * 
+   * @param {number} [capacity=500] - Initial capacity of the index
+   */
+  constructor(capacity = 500) {
+    this.index = new addon.SearchIndex(capacity);
+  }
+
+  /**
+   * Add a string to the search index
+   * 
+   * @param {string} str - The string to add
+   * @returns {boolean} - True if successful, false otherwise
+   */
+  addString(str) {
+    return this.index.addString(str) === 0;
+  }
+
+  /**
+   * Add multiple strings to the search index
+   * 
+   * @param {string[]} strings - Array of strings to add
+   * @returns {boolean} - True if all adds were successful, false otherwise
+   */
+  addStrings(strings) {
+    let success = true;
+    for (const str of strings) {
+      if (this.index.addString(str) !== 0) {
+        success = false;
+      }
+    }
+    return success;
+  }
+
+  /**
+   * Search the index for strings similar to the query
+   * 
+   * @param {string} query - The search query
+   * @param {number} [cutoff=0.2] - Similarity threshold (0.0 to 1.0)
+   * @returns {Array<{string: string, similarity: number}>} - Array of matching results
+   */
+  search(query, cutoff = 0.2) {
+    return this.index.search(query, cutoff);
+  }
+
+  /**
+   * Get the number of strings in the index
+   * 
+   * @returns {number} - Number of strings in the index
+   */
+  size() {
+    return this.index.size();
+  }
+}
+
+// Add some functions for convenience
+/**
+ * Generate an index with random test data
+ * 
+ * @param {number} [size=500] - Number of strings to generate
+ * @returns {SimilaritySearch} - A new SimilaritySearch instance with random data
+ */
+SimilaritySearch.createTestIndex = function(size = 500) {
+  const index = new SimilaritySearch(size);
+  
+  // Add some specific test strings
+  index.addString("bio bizz");
+  index.addString("lightmix bizz btio substrate");
+  index.addString("bizz bio mix light");
+  index.addString("plant growth bio formula");
+  index.addString("garden soil substrate");
+  
+  // Generate random strings
+  function randomWord(len) {
+    const chars = 'abcdefghijklmnopqrstuvwxyz';
+    let word = '';
+    for (let i = 0; i < len; i++) {
+      word += chars.charAt(Math.floor(Math.random() * chars.length));
+    }
+    return word;
+  }
+  
+  function randomString() {
+    const numWords = 2 + Math.floor(Math.random() * 5); // 2-6 words
+    let str = '';
+    for (let i = 0; i < numWords; i++) {
+      if (i > 0) str += ' ';
+      str += randomWord(3 + Math.floor(Math.random() * 8)); // 3-10 chars
+    }
+    return str;
+  }
+  
+  // Generate the rest of the strings
+  for (let i = 5; i < size; i++) {
+    index.addString(randomString());
+  }
+  
+  return index;
+};
+
+/**
+ * Benchmark the search performance
+ * 
+ * @param {SimilaritySearch} index - The index to benchmark
+ * @param {string[]} queries - Array of search queries
+ * @param {number} [cutoff=0.2] - Similarity threshold to use
+ * @returns {Object} - Benchmark results
+ */
+SimilaritySearch.benchmark = function(index, queries, cutoff = 0.2) {
+  const results = [];
+  
+  for (const query of queries) {
+    const start = process.hrtime.bigint();
+    const matches = index.search(query, cutoff);
+    const end = process.hrtime.bigint();
+    
+    const timeMs = Number(end - start) / 1000000;
+    
+    results.push({
+      query,
+      matches: matches.length,
+      timeMs,
+      topResults: matches.slice(0, 5)
+    });
+  }
+  
+  return results;
+};
+
+module.exports = SimilaritySearch; 
--- a/package-lock.json
+++ b/package-lock.json
@@ -0,0 +1,28 @@
+{
+  "name": "similarity-search",
+  "version": "1.0.0",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "name": "similarity-search",
+      "version": "1.0.0",
+      "hasInstallScript": true,
+      "license": "MIT",
+      "dependencies": {
+        "nan": "^2.22.2",
+        "node-addon-api": "^6.0.0"
+      }
+    },
+    "node_modules/nan": {
+      "version": "2.22.2",
+      "resolved": "https://registry.npmjs.org/nan/-/nan-2.22.2.tgz",
+      "integrity": "sha512-DANghxFkS1plDdRsX0X9pm0Z6SJNN6gBdtXfanwoZ8hooC5gosGFSBGRYHUVPz1asKA/kMRqDRdHrluZ61SpBQ=="
+    },
+    "node_modules/node-addon-api": {
+      "version": "6.1.0",
+      "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.1.0.tgz",
+      "integrity": "sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA=="
+    }
+  }
+}
--- a/package.json
+++ b/package.json
@@ -0,0 +1,23 @@
+{
+  "name": "similarity-search",
+  "version": "1.0.0",
+  "description": "A Node.js module for word order independent string similarity search",
+  "main": "index.js",
+  "scripts": {
+    "install": "node-gyp rebuild",
+    "test": "node test.js"
+  },
+  "keywords": [
+    "search",
+    "similarity",
+    "string",
+    "fuzzy"
+  ],
+  "author": "",
+  "license": "MIT",
+  "dependencies": {
+    "nan": "^2.22.2",
+    "node-addon-api": "^6.0.0"
+  },
+  "gypfile": true
+}
--- a/BIN
+++ b/BIN
--- a/similarity_search.c
+++ b/similarity_search.c
@@ -0,0 +1,198 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <ctype.h>
+#include "similarity_search.h"
+
+// Compare two NUL-terminated strings while ignoring letter case.
+// Returns 0 when equal, otherwise the difference between the first pair of
+// lowercased characters that differ (negative if s1 sorts first).
+int str_case_cmp(const char *s1, const char *s2) {
+    // Walk the strings as unsigned bytes so tolower() never sees a negative value
+    const unsigned char *left = (const unsigned char *)s1;
+    const unsigned char *right = (const unsigned char *)s2;
+
+    for (; *left != '\0' && *right != '\0'; left++, right++) {
+        int diff = tolower(*left) - tolower(*right);
+        if (diff != 0) {
+            return diff;
+        }
+    }
+
+    // At least one string ended; the terminator decides the ordering
+    return tolower(*left) - tolower(*right);
+}
+
+// Split a string into whitespace-separated words (space, tab, newline).
+//
+// Each word is copied into freshly allocated memory and stored in `words`;
+// the caller releases them with free_words(). At most MAX_WORDS words are
+// collected and any further words are ignored. Returns the number of words
+// stored, which is 0 for a NULL or blank input.
+//
+// The input is scanned in place, so strings of any length are safe (the
+// previous fixed-size stack copy overflowed on long inputs) and no hidden
+// strtok() state is touched.
+int split_into_words(const char *string, char *words[MAX_WORDS]) {
+    static const char delimiters[] = " \t\n";
+    int word_count = 0;
+
+    if (!string) return 0;
+
+    // Skip leading delimiters so the cursor always rests on a word start
+    const char *cursor = string + strspn(string, delimiters);
+
+    while (*cursor != '\0' && word_count < MAX_WORDS) {
+        size_t length = strcspn(cursor, delimiters);
+
+        char *word = (char *)malloc(length + 1);
+        if (!word) {
+            // Out of memory: hand back the words collected so far rather
+            // than storing a NULL entry that callers would dereference
+            break;
+        }
+        memcpy(word, cursor, length);
+        word[length] = '\0';
+        words[word_count++] = word;
+
+        cursor += length;
+        cursor += strspn(cursor, delimiters);
+    }
+
+    return word_count;
+}
+
+// Release the first word_count entries of a word array
+void free_words(char *words[], int word_count) {
+    int next = 0;
+    while (next < word_count) {
+        free(words[next]);
+        next++;
+    }
+}
+
+// Collapse case-insensitive duplicates in words[0..count).
+// Duplicate entries are freed and the survivors are compacted to the front;
+// returns the number of distinct words, which is the only part of the array
+// that remains valid afterwards.
+static int drop_duplicate_words(char *words[], int count) {
+    int unique = 0;
+    for (int i = 0; i < count; i++) {
+        int seen = 0;
+        for (int j = 0; j < unique; j++) {
+            if (str_case_cmp(words[i], words[j]) == 0) {
+                seen = 1;
+                break;
+            }
+        }
+        if (seen) {
+            free(words[i]);
+        } else {
+            words[unique++] = words[i];
+        }
+    }
+    return unique;
+}
+
+// Calculate the Jaccard similarity between the word sets of query and target.
+//
+// Both strings are split into words, compared case-insensitively and treated
+// as sets: |intersection| / |union|. The result is in [0.0, 1.0]; it is 0.0
+// when either string contains no words. `cutoff` is accepted for interface
+// compatibility but is not used by the computation.
+float calculate_similarity(const char *query, const char *target, float cutoff) {
+    (void)cutoff;
+
+    char *query_words[MAX_WORDS] = {0};
+    char *target_words[MAX_WORDS] = {0};
+
+    // Deduplicate first: counting a repeated word more than once would push
+    // the score above 1.0 (e.g. "a a" vs "a") or below its true value.
+    int query_count = drop_duplicate_words(query_words, split_into_words(query, query_words));
+    int target_count = drop_duplicate_words(target_words, split_into_words(target, target_words));
+
+    float similarity = 0.0f;
+
+    if (query_count > 0 && target_count > 0) {
+        // Size of the intersection of the two word sets
+        int shared = 0;
+        for (int i = 0; i < query_count; i++) {
+            for (int j = 0; j < target_count; j++) {
+                if (str_case_cmp(query_words[i], target_words[j]) == 0) {
+                    shared++;
+                    break;
+                }
+            }
+        }
+
+        // |A ∪ B| = |A| + |B| - |A ∩ B|, never zero because both sets are non-empty
+        similarity = (float)shared / (float)(query_count + target_count - shared);
+    }
+
+    free_words(query_words, query_count);
+    free_words(target_words, target_count);
+
+    return similarity;
+}
+
+// qsort comparator ordering SearchResult entries by similarity, highest first.
+// Returns 1 when b scores higher than a, -1 when lower, 0 otherwise.
+int compare_results(const void *a, const void *b) {
+    float first = ((const SearchResult *)a)->similarity;
+    float second = ((const SearchResult *)b)->similarity;
+
+    // Each comparison yields 0 or 1, so the difference is -1, 0 or 1
+    return (second > first) - (second < first);
+}
+
+// Generate a random lowercase word.
+//
+// The word has between 3 and 10 letters, but never more than max_len letters
+// (previously max_len was ignored). `word` must have room for max_len + 1
+// bytes, including the terminating NUL. A NULL buffer is left untouched and a
+// negative max_len is treated as 0, producing an empty string.
+void generate_random_word(char *word, int max_len) {
+    if (!word) return;
+    if (max_len < 0) max_len = 0;
+
+    int len = 3 + rand() % 8; // Random length between 3 and 10
+    if (len > max_len) {
+        len = max_len; // Respect the caller's limit
+    }
+
+    for (int i = 0; i < len; i++) {
+        word[i] = 'a' + (rand() % 26);
+    }
+    word[len] = '\0';
+}
+
+// Fill `string` with 2 to 6 random words separated by single spaces.
+// Words stop being appended as soon as the next one would not fit within
+// max_len bytes.
+void generate_random_string(char *string, int max_len) {
+    int num_words = 2 + rand() % 5; // Random number of words between 2 and 6
+    size_t used = 0;                // Characters written so far, excluding the NUL
+    string[0] = '\0';
+
+    for (int i = 0; i < num_words; i++) {
+        char word[20];
+        generate_random_word(word, 10);
+        size_t word_len = strlen(word);
+
+        // Not enough room left for this word: stop here
+        if (used + word_len + 1 >= (size_t)max_len) {
+            break;
+        }
+
+        if (i > 0) {
+            string[used++] = ' ';
+        }
+        // Copy the word together with its terminating NUL
+        memcpy(string + used, word, word_len + 1);
+        used += word_len;
+    }
+}
+
+// Create a new, empty search index.
+//
+// `capacity` is the number of string slots to allocate up front; values
+// below 1 are treated as 1, because a zero-sized allocation may return NULL
+// and a negative count would wrap to an enormous size.
+// Returns the new index, or NULL if memory could not be allocated. Release
+// it with free_search_index().
+SearchIndex* create_search_index(int capacity) {
+    if (capacity < 1) {
+        capacity = 1;
+    }
+
+    SearchIndex* index = (SearchIndex*)malloc(sizeof(SearchIndex));
+    if (!index) return NULL;
+
+    index->strings = (char**)malloc((size_t)capacity * sizeof(char*));
+    if (!index->strings) {
+        free(index);
+        return NULL;
+    }
+
+    index->num_strings = 0;
+    return index;
+}
+
+// Add a copy of `string` to the index.
+//
+// Returns 0 on success, or -1 if an argument is NULL or memory runs out (the
+// index is left unchanged in that case).
+//
+// The index can hold any number of strings. SearchIndex does not record how
+// many slots were allocated, so the array size is derived from the current
+// count alone: whenever the count is 0 or a power of two the array is
+// resized to twice the count (1 for an empty index). That keeps slot
+// `num_strings` valid at all times, with amortized constant-time growth,
+// whatever capacity the index was created with.
+int add_string_to_index(SearchIndex* index, const char* string) {
+    if (!index || !string) return -1;
+
+    char *copy = strdup(string);
+    if (!copy) return -1;
+
+    size_t count = (size_t)index->num_strings;
+
+    // (count & (count - 1)) == 0 holds exactly for 0 and powers of two
+    if ((count & (count - 1)) == 0) {
+        size_t new_capacity = count ? count * 2 : 1;
+        char **grown = (char**)realloc(index->strings, new_capacity * sizeof(char*));
+        if (!grown) {
+            // realloc failure leaves the old array intact; drop only the copy
+            free(copy);
+            return -1;
+        }
+        index->strings = grown;
+    }
+
+    index->strings[count] = copy;
+    index->num_strings++;
+    return 0;
+}
+
+// Destroy an index: every stored string, the string array and the index
+// structure itself. A NULL index is ignored.
+void free_search_index(SearchIndex* index) {
+    if (index == NULL) {
+        return;
+    }
+
+    // Release the stored strings, last to first
+    for (int remaining = index->num_strings; remaining > 0; remaining--) {
+        free(index->strings[remaining - 1]);
+    }
+
+    free(index->strings);
+    free(index);
+}
+
+// Search the index with the given query and similarity cutoff.
+//
+// Every indexed string is scored against `query`; those scoring at least
+// `cutoff` are returned, sorted by descending similarity. The number of
+// matches is written to *num_results (0 when nothing matches).
+//
+// Returns a heap-allocated array to be released with free_search_results().
+// The `string` field of each entry points at the string stored inside the
+// index, so it must not be freed and is only valid while the index lives.
+// Returns NULL only when an argument is NULL or memory cannot be allocated;
+// an empty index or a search without matches still yields a non-NULL array.
+SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results) {
+    if (!index || !query || !num_results) return NULL;
+
+    *num_results = 0;
+
+    // Always request at least one element: malloc(0) is allowed to return
+    // NULL, which callers would mistake for a failed search
+    size_t slots = index->num_strings > 0 ? (size_t)index->num_strings : 1;
+    SearchResult* results = (SearchResult*)malloc(slots * sizeof(SearchResult));
+    if (!results) return NULL;
+
+    // Score every string in the index and keep the ones reaching the cutoff
+    int count = 0;
+    for (int i = 0; i < index->num_strings; i++) {
+        float similarity = calculate_similarity(query, index->strings[i], cutoff);
+
+        if (similarity >= cutoff) {
+            results[count].string = index->strings[i];
+            results[count].similarity = similarity;
+            count++;
+        }
+    }
+
+    // Sort results by similarity
+    qsort(results, (size_t)count, sizeof(SearchResult), compare_results);
+
+    // Give back the unused tail. A failed shrink is harmless because the
+    // original, larger block is still valid.
+    if (count > 0) {
+        SearchResult* trimmed = (SearchResult*)realloc(results, (size_t)count * sizeof(SearchResult));
+        if (trimmed) {
+            results = trimmed;
+        }
+    }
+
+    *num_results = count;
+    return results;
+}
+
+// Release a result array obtained from search_index().
+// Only the array itself is freed; num_results is not needed for that.
+void free_search_results(SearchResult* results, int num_results) {
+    (void)num_results;
+    free(results);
+}
--- a/similarity_search.h
+++ b/similarity_search.h
@@ -0,0 +1,45 @@
+#ifndef SIMILARITY_SEARCH_H
+#define SIMILARITY_SEARCH_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_STRING_LEN 100
+#define MAX_WORDS 20
+
+// Public API
+
+// Structure representing the search index
+typedef struct {
+    // Heap-allocated copies of the indexed strings, owned by the index
+    char **strings;
+    // Number of entries of `strings` currently in use
+    int num_strings;
+} SearchIndex;
+
+// Structure to hold a search result
+typedef struct {
+    // The matching string; points at the copy stored inside the index
+    // (borrowed, not a separate allocation)
+    const char *string;
+    // Similarity score between the query and `string`
+    float similarity;
+} SearchResult;
+
+// Create a new, empty search index sized for `capacity` strings.
+// Returns NULL if memory cannot be allocated.
+SearchIndex* create_search_index(int capacity);
+
+// Add a string to the index. The index stores its own copy of the string.
+// Returns 0 on success, -1 if an argument is NULL or the copy cannot be
+// allocated.
+int add_string_to_index(SearchIndex* index, const char* string);
+
+// Free the search index and all associated memory (stored strings, the
+// string array and the index itself). Passing NULL is a no-op.
+void free_search_index(SearchIndex* index);
+
+// Search the index with the given query and similarity cutoff.
+// Returns a heap-allocated array of SearchResult structs, sorted by
+// descending similarity, containing every indexed string whose similarity is
+// at least `cutoff`; the number of entries is written to *num_results.
+// The array must be released by the caller with free_search_results(); the
+// strings it refers to belong to the index and must not be freed.
+// Returns NULL if an argument is NULL or an allocation fails.
+SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results);
+
+// Free the result array returned by search_index(). Only the array is
+// released; num_results is not used by the current implementation.
+void free_search_results(SearchResult* results, int num_results);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SIMILARITY_SEARCH_H */ 
--- a/similarity_search_addon.cc
+++ b/similarity_search_addon.cc
@@ -0,0 +1,123 @@
+#include <napi.h>
+#include <string>
+#include "similarity_search.h"
+
+// N-API object wrapper that exposes the C SearchIndex to JavaScript as the
+// `SearchIndex` class with the methods addString, search and size.
+class SearchIndexWrapper : public Napi::ObjectWrap<SearchIndexWrapper> {
+public:
+  // Defines the JavaScript class and attaches it to `exports`.
+  static Napi::Object Init(Napi::Env env, Napi::Object exports);
+  // Creates the native index; an optional numeric first argument is the capacity.
+  SearchIndexWrapper(const Napi::CallbackInfo& info);
+  // Releases the native index.
+  ~SearchIndexWrapper();
+
+private:
+  // Persistent reference to the JavaScript constructor function.
+  static Napi::FunctionReference constructor;
+  
+  // JS `addString(str)`: returns the numeric status of add_string_to_index().
+  Napi::Value AddString(const Napi::CallbackInfo& info);
+  // JS `search(query, [cutoff])`: returns an array of {string, similarity}.
+  Napi::Value Search(const Napi::CallbackInfo& info);
+  // JS `size()`: returns the number of strings in the index.
+  Napi::Value GetSize(const Napi::CallbackInfo& info);
+  
+  // The wrapped native index, created in the constructor and freed in the destructor.
+  SearchIndex* index_;
+};
+
+Napi::FunctionReference SearchIndexWrapper::constructor;
+
+// Register the JavaScript-visible `SearchIndex` class on the module exports.
+Napi::Object SearchIndexWrapper::Init(Napi::Env env, Napi::Object exports) {
+  Napi::HandleScope scope(env);
+
+  // JavaScript method names mapped onto the C++ member functions
+  Napi::Function search_index_class = DefineClass(env, "SearchIndex", {
+    InstanceMethod("addString", &SearchIndexWrapper::AddString),
+    InstanceMethod("search", &SearchIndexWrapper::Search),
+    InstanceMethod("size", &SearchIndexWrapper::GetSize)
+  });
+
+  exports.Set("SearchIndex", search_index_class);
+
+  // Keep a persistent reference to the constructor in the static member;
+  // SuppressDestruct() stops that static's destructor from releasing it.
+  constructor = Napi::Persistent(search_index_class);
+  constructor.SuppressDestruct();
+
+  return exports;
+}
+
+// Construct the wrapper and its native index. The capacity comes from the
+// first argument when it is a number and is 500 otherwise; a JavaScript
+// error is raised if the native index cannot be created.
+SearchIndexWrapper::SearchIndexWrapper(const Napi::CallbackInfo& info)
+  : Napi::ObjectWrap<SearchIndexWrapper>(info) {
+  Napi::Env env = info.Env();
+  Napi::HandleScope scope(env);
+
+  const bool has_capacity = info.Length() > 0 && info[0].IsNumber();
+  const int capacity = has_capacity
+      ? info[0].As<Napi::Number>().Int32Value()
+      : 500; // Default capacity
+
+  index_ = create_search_index(capacity);
+  if (index_ == nullptr) {
+    Napi::Error::New(env, "Failed to create search index").ThrowAsJavaScriptException();
+  }
+}
+
+// Release the native index owned by this wrapper.
+SearchIndexWrapper::~SearchIndexWrapper() {
+  free_search_index(index_);
+}
+
+// JS `addString(str)`: copy the string into the native index.
+// Throws a TypeError unless the first argument is a string; otherwise
+// returns the status code of add_string_to_index() as a number.
+Napi::Value SearchIndexWrapper::AddString(const Napi::CallbackInfo& info) {
+  Napi::Env env = info.Env();
+  Napi::HandleScope scope(env);
+
+  const bool has_string = info.Length() >= 1 && info[0].IsString();
+  if (!has_string) {
+    Napi::TypeError::New(env, "String expected").ThrowAsJavaScriptException();
+    return env.Null();
+  }
+
+  const std::string text = info[0].As<Napi::String>().Utf8Value();
+  const int status = add_string_to_index(index_, text.c_str());
+
+  return Napi::Number::New(env, status);
+}
+
+// JS `search(query, [cutoff])`: run a native search and convert the matches
+// into an array of `{ string, similarity }` objects.
+// Throws a TypeError unless the first argument is a string, and an Error if
+// the native search returns no result array. The cutoff is 0.2 unless the
+// second argument is a number.
+Napi::Value SearchIndexWrapper::Search(const Napi::CallbackInfo& info) {
+  Napi::Env env = info.Env();
+  Napi::HandleScope scope(env);
+
+  const bool has_query = info.Length() >= 1 && info[0].IsString();
+  if (!has_query) {
+    Napi::TypeError::New(env, "Query string expected").ThrowAsJavaScriptException();
+    return env.Null();
+  }
+
+  const std::string query = info[0].As<Napi::String>().Utf8Value();
+
+  const bool has_cutoff = info.Length() > 1 && info[1].IsNumber();
+  const float cutoff = has_cutoff
+      ? info[1].As<Napi::Number>().FloatValue()
+      : 0.2f; // Default cutoff
+
+  int match_count = 0;
+  SearchResult* matches = search_index(index_, query.c_str(), cutoff, &match_count);
+  if (matches == nullptr) {
+    Napi::Error::New(env, "Search failed").ThrowAsJavaScriptException();
+    return env.Null();
+  }
+
+  // Copy each native match into a JavaScript object before the native
+  // array is released below
+  Napi::Array output = Napi::Array::New(env, match_count);
+  for (int i = 0; i < match_count; i++) {
+    Napi::Object entry = Napi::Object::New(env);
+    entry.Set("string", Napi::String::New(env, matches[i].string));
+    entry.Set("similarity", Napi::Number::New(env, matches[i].similarity));
+    output[i] = entry;
+  }
+
+  free_search_results(matches, match_count);
+
+  return output;
+}
+
+// JS `size()`: report the number of strings stored in the native index.
+Napi::Value SearchIndexWrapper::GetSize(const Napi::CallbackInfo& info) {
+  Napi::Env env = info.Env();
+  Napi::HandleScope scope(env);
+
+  const int stored = index_->num_strings;
+  return Napi::Number::New(env, stored);
+}
+
+// Module entry point: delegates to SearchIndexWrapper::Init, which attaches
+// the SearchIndex class to `exports`, and returns the populated exports.
+Napi::Object Init(Napi::Env env, Napi::Object exports) {
+  return SearchIndexWrapper::Init(env, exports);
+}
+
+NODE_API_MODULE(similarity_search_addon, Init) 
--- a/test.js
+++ b/test.js
@@ -0,0 +1,53 @@
+const SimilaritySearch = require('./index');
+
+// Create a test index with 500 strings
+console.log('Creating test index with 500 strings...');
+const index = SimilaritySearch.createTestIndex(500);
+console.log(`Index created with ${index.size()} strings`);
+
+// Test queries to run
+const queries = [
+  'bio bizz',
+  'substrate light',
+  'plant growth',
+  'garden mix',
+  'random query'
+];
+
+console.log('\nRunning benchmark...');
+const benchmarkResults = SimilaritySearch.benchmark(index, queries);
+
+// Display results
+console.log(`\nSearch results with cutoff: 0.2\n`);
+benchmarkResults.forEach(result => {
+  console.log(`Query: "${result.query}"`);
+  console.log(`Found ${result.matches} matches in ${result.timeMs.toFixed(2)} ms`);
+  
+  // Display top results
+  result.topResults.forEach(match => {
+    console.log(`  ${match.similarity.toFixed(2)}: ${match.string}`);
+  });
+  console.log('');
+});
+
+// Demonstrate creating a custom index
+console.log('Creating a custom index...');
+const customIndex = new SimilaritySearch();
+customIndex.addString('bio bizz');
+customIndex.addString('lightmix bizz btio substrate');
+customIndex.addString('bizz bio mix light');
+
+// Add multiple strings at once
+customIndex.addStrings([
+  'plant growth bio formula',
+  'garden soil substrate'
+]);
+
+console.log(`Custom index created with ${customIndex.size()} strings`);
+
+// Search with a higher similarity threshold
+console.log('\nSearching with higher similarity threshold (0.3):');
+const results = customIndex.search('bio bizz', 0.3);
+results.forEach(match => {
+  console.log(`  ${match.similarity.toFixed(2)}: ${match.string}`);
+});