genesis

2025-04-18 08:22:35 +02:00
commit 51a3cc6c2d
13 changed files with 794 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,25 @@
 # Node.js dependencies
 node_modules/
 npm-debug.log
 yarn-debug.log
 yarn-error.log
 # Build outputs
 build/
 *.node
 # Editor directories and files
 .vscode/*
 !.vscode/c_cpp_properties.json
 .idea/
 *.swp
 *.swo
 # OS generated files
 .DS_Store
 .DS_Store?
 ._*
 .Spotlight-V100
 .Trashes
 ehthumbs.db
 Thumbs.db 
--- a/.vscode/c_cpp_properties.json
+++ b/.vscode/c_cpp_properties.json
@@ -0,0 +1,18 @@
 {
    "configurations": [
        {
            "name": "Linux",
            "includePath": [
                "${workspaceFolder}/**",
                "${workspaceFolder}/node_modules/node-addon-api",
                "${workspaceFolder}/node_modules/nan"
            ],
            "defines": [],
            "compilerPath": "/usr/bin/gcc",
            "cStandard": "c11",
            "cppStandard": "c++14",
            "intelliSenseMode": "linux-gcc-x64"
        }
    ],
    "version": 4
 } 
--- a/README.md
+++ b/README.md
@@ -0,0 +1,119 @@
 # Similarity Search
 A Node.js module that performs word-order-independent similarity search on strings.
 This module is built as a native addon that uses C code for fast similarity computations. It uses Jaccard similarity between word sets to find matches regardless of word order.
 ## Installation
 ```bash
 npm install
 ```
 ## Usage
 ```javascript
 const SimilaritySearch = require('./index');
 // Create a new search index with default capacity (500)
 const index = new SimilaritySearch();
 // Add strings to the index
 index.addString('bio bizz');
 index.addString('lightmix bizz btio substrate');
 index.addString('bizz bio mix light');
 // Add multiple strings at once
 index.addStrings([
  'plant growth bio formula',
  'garden soil substrate'
 ]);
 // Search the index with a query and similarity cutoff
 const results = index.search('bio bizz', 0.2);
 // Display results
 results.forEach(match => {
  console.log(`${match.similarity.toFixed(2)}: ${match.string}`);
 });
 ```
 ## API
 ### `new SimilaritySearch([capacity])`
 Creates a new search index.
 - `capacity` (optional): Initial capacity for the index. Default: 500.
 ### `addString(str)`
 Adds a string to the index.
 - `str`: The string to add.
 - Returns: Boolean indicating success.
 ### `addStrings(strings)`
 Adds multiple strings to the index.
 - `strings`: Array of strings to add.
 - Returns: Boolean indicating if all adds were successful.
 ### `search(query, [cutoff])`
 Searches the index for strings similar to the query.
 - `query`: The search query.
 - `cutoff` (optional): Similarity threshold between 0.0 and 1.0. Default: 0.2.
 - Returns: Array of `{ string, similarity }` objects, one per matching string, sorted by similarity (descending).
 ### `size()`
 Gets the number of strings in the index.
 - Returns: Number of strings in the index.
 ## Helper Functions
 ### `SimilaritySearch.createTestIndex([size])`
 Creates a test index with random data.
 - `size` (optional): Number of strings to generate. Default: 500.
 - Returns: A new SimilaritySearch instance with random data.
 ### `SimilaritySearch.benchmark(index, queries, [cutoff])`
 Benchmarks the search performance.
 - `index`: The index to benchmark.
 - `queries`: Array of search queries.
 - `cutoff` (optional): Similarity threshold. Default: 0.2.
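 - Returns: Array with one entry per query, each containing `query`, `matches` (number of matches), `timeMs` (search time in milliseconds), and `topResults` (up to five best matches).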
 ## How It Works
 The similarity search uses Jaccard similarity between word sets:
 ```
 similarity = (number of matching words) / (total unique words)
 ```
 This means word order doesn't matter - "bio bizz" will match with "bizz bio" with 100% similarity.
 ## Building
 To rebuild the native addon:
 ```bash
 npm install
 ```
 ## Testing
 Run the test script:
 ```bash
 npm test
 ``` 
--- a/a.out
+++ b/a.out
--- a/binding.gyp
+++ b/binding.gyp
@@ -0,0 +1,25 @@
 {
  "targets": [
    {
      "target_name": "similarity_search_addon",
      "sources": [ 
        "similarity_search.c",
        "similarity_search_addon.cc" 
      ],
      "include_dirs": [
        "<!@(node -p \"require('node-addon-api').include\")",
        "<!(node -p \"require('node-addon-api').include_dir\")",
        "<!(node -e \"require('nan')\")"
      ],
      "dependencies": [
        "<!(node -p \"require('node-addon-api').gyp\")"
      ],
      "cflags!": [ "-fno-exceptions" ],
      "cflags_cc!": [ "-fno-exceptions" ],
      "defines": [ "NAPI_DISABLE_CPP_EXCEPTIONS" ],
      "xcode_settings": {
        "GCC_ENABLE_CPP_EXCEPTIONS": "YES"
      }
    }
  ]
 } 
--- a/index.js
+++ b/index.js
@@ -0,0 +1,137 @@
 const addon = require('./build/Release/similarity_search_addon');
 /**
 * Word-order-independent string similarity index backed by the native addon.
 */
 class SimilaritySearch {
  /**
   * Build an empty index.
   *
   * @param {number} [capacity=500] - Capacity handed to the native index
   */
  constructor(capacity = 500) {
    // All work is delegated to the native SearchIndex object.
    this.index = new addon.SearchIndex(capacity);
  }
  /**
   * Store a single string in the index.
   *
   * @param {string} str - The string to store
   * @returns {boolean} - True when the native layer reports status 0
   */
  addString(str) {
    const status = this.index.addString(str);
    return status === 0;
  }
  /**
   * Store every string of an iterable in the index.
   * A failed add does not stop the remaining strings from being attempted.
   *
   * @param {string[]} strings - The strings to store
   * @returns {boolean} - True only when every add reported status 0
   */
  addStrings(strings) {
    let failures = 0;
    for (const entry of strings) {
      const status = this.index.addString(entry);
      if (status !== 0) {
        failures += 1;
      }
    }
    return failures === 0;
  }
  /**
   * Look up strings whose similarity to the query reaches the cutoff.
   *
   * @param {string} query - Text to compare against the indexed strings
   * @param {number} [cutoff=0.2] - Minimum similarity (0.0 to 1.0)
   * @returns {Array<{string: string, similarity: number}>} - Matches as returned by the native index
   */
  search(query, cutoff = 0.2) {
    const matches = this.index.search(query, cutoff);
    return matches;
  }
  /**
   * Report how many strings the index currently holds.
   *
   * @returns {number} - Count reported by the native index
   */
  size() {
    const count = this.index.size();
    return count;
  }
 }
 // Add some functions for convenience
 /**
 * Generate an index with random test data
 *
 * Five fixed, human-readable strings are always added first so that test
 * queries have known matches; the index is then padded with random strings
 * until it holds `size` entries. When `size` is smaller than five the index
 * still contains the five fixed strings.
 *
 * @param {number} [size=500] - Number of strings to generate
 * @returns {SimilaritySearch} - A new SimilaritySearch instance with random data
 */
 SimilaritySearch.createTestIndex = function(size = 500) {
  // Fixed strings with predictable overlaps for the sample queries
  const seedStrings = [
    "bio bizz",
    "lightmix bizz btio substrate",
    "bizz bio mix light",
    "plant growth bio formula",
    "garden soil substrate"
  ];
  // The seeds are added unconditionally, so the native index must have room
  // for all of them even when the requested size is smaller.
  const index = new SimilaritySearch(Math.max(size, seedStrings.length));
  for (const seed of seedStrings) {
    index.addString(seed);
  }
  // Generate random strings
  function randomWord(len) {
    const chars = 'abcdefghijklmnopqrstuvwxyz';
    let word = '';
    for (let i = 0; i < len; i++) {
      word += chars.charAt(Math.floor(Math.random() * chars.length));
    }
    return word;
  }
  function randomString() {
    const numWords = 2 + Math.floor(Math.random() * 5); // 2-6 words
    const words = [];
    for (let i = 0; i < numWords; i++) {
      words.push(randomWord(3 + Math.floor(Math.random() * 8))); // 3-10 chars
    }
    return words.join(' ');
  }
  // Generate the rest of the strings
  for (let i = seedStrings.length; i < size; i++) {
    index.addString(randomString());
  }
  return index;
 };
 /**
 * Time each query against an index and summarise the outcome
 *
 * @param {SimilaritySearch} index - Index whose `search` method is timed
 * @param {string[]} queries - Queries to run, one timing entry per query
 * @param {number} [cutoff=0.2] - Similarity threshold passed to every search
 * @returns {Object} - Array of `{ query, matches, timeMs, topResults }` entries
 */
 SimilaritySearch.benchmark = function(index, queries, cutoff = 0.2) {
  const NANOS_PER_MS = 1000000;
  const report = [];
  for (const query of queries) {
    // hrtime.bigint() yields nanoseconds; only the search call is timed
    const startedAt = process.hrtime.bigint();
    const found = index.search(query, cutoff);
    const finishedAt = process.hrtime.bigint();
    const entry = {
      query,
      matches: found.length,
      timeMs: Number(finishedAt - startedAt) / NANOS_PER_MS,
      topResults: found.slice(0, 5) // keep only the five best matches
    };
    report.push(entry);
  }
  return report;
 };
 module.exports = SimilaritySearch; 
--- a/package-lock.json
+++ b/package-lock.json
@@ -0,0 +1,28 @@
 {
  "name": "similarity-search",
  "version": "1.0.0",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "similarity-search",
      "version": "1.0.0",
      "hasInstallScript": true,
      "license": "MIT",
      "dependencies": {
        "nan": "^2.22.2",
        "node-addon-api": "^6.0.0"
      }
    },
    "node_modules/nan": {
      "version": "2.22.2",
      "resolved": "https://registry.npmjs.org/nan/-/nan-2.22.2.tgz",
      "integrity": "sha512-DANghxFkS1plDdRsX0X9pm0Z6SJNN6gBdtXfanwoZ8hooC5gosGFSBGRYHUVPz1asKA/kMRqDRdHrluZ61SpBQ=="
    },
    "node_modules/node-addon-api": {
      "version": "6.1.0",
      "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.1.0.tgz",
      "integrity": "sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA=="
    }
  }
 }
--- a/package.json
+++ b/package.json
@@ -0,0 +1,23 @@
 {
  "name": "similarity-search",
  "version": "1.0.0",
  "description": "A Node.js module for word order independent string similarity search",
  "main": "index.js",
  "scripts": {
    "install": "node-gyp rebuild",
    "test": "node test.js"
  },
  "keywords": [
    "search",
    "similarity",
    "string",
    "fuzzy"
  ],
  "author": "",
  "license": "MIT",
  "dependencies": {
    "nan": "^2.22.2",
    "node-addon-api": "^6.0.0"
  },
  "gypfile": true
 }
--- a/BIN
+++ b/BIN
--- a/similarity_search.c
+++ b/similarity_search.c
@@ -0,0 +1,198 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
 #include <ctype.h>
 #include "similarity_search.h"
 // Compare two NUL-terminated strings ignoring ASCII case.
 // Returns 0 when equal, otherwise the difference between the first pair of
 // lower-cased characters that differ (negative when s1 sorts first).
 int str_case_cmp(const char *s1, const char *s2) {
    for (;; s1++, s2++) {
        int lhs = tolower((unsigned char)*s1);
        int rhs = tolower((unsigned char)*s2);
        // Stop at the first mismatch, or when both strings end together
        if (lhs != rhs || lhs == 0) {
            return lhs - rhs;
        }
    }
 }
 // Split a string into whitespace-separated words.
 //
 // Each word is copied into a freshly allocated buffer stored in `words`;
 // the caller releases them with free_words(). At most MAX_WORDS words are
 // extracted, any further words are ignored. The input may be of any length
 // and is never modified. Returns the number of words stored; a NULL input
 // yields 0, and an allocation failure stops the split early so that every
 // counted slot holds a valid pointer.
 int split_into_words(const char *string, char *words[MAX_WORDS]) {
    static const char delimiters[] = " \t\n";
    int word_count = 0;
    if (string == NULL) {
        return 0;
    }
    const char *cursor = string;
    while (word_count < MAX_WORDS) {
        // Skip separators in front of the next word
        cursor += strspn(cursor, delimiters);
        if (*cursor == '\0') {
            break;
        }
        size_t length = strcspn(cursor, delimiters);
        char *word = (char *)malloc(length + 1);
        if (word == NULL) {
            break;
        }
        memcpy(word, cursor, length);
        word[length] = '\0';
        words[word_count++] = word;
        cursor += length;
    }
    return word_count;
 }
 // Release the first `word_count` heap-allocated entries of `words`.
 // The array itself is owned by the caller and is left untouched.
 void free_words(char *words[], int word_count) {
    int slot = 0;
    while (slot < word_count) {
        free(words[slot]);
        slot++;
    }
 }
 // Return 1 when words[position] has no case-insensitive duplicate earlier in the array
 static int is_first_occurrence(char *words[], int position) {
    for (int earlier = 0; earlier < position; earlier++) {
        if (str_case_cmp(words[earlier], words[position]) == 0) {
            return 0;
        }
    }
    return 1;
 }
 // Calculate similarity between query and target string.
 //
 // Both strings are split into words and compared case-insensitively as
 // sets: the result is the Jaccard index |A ∩ B| / |A ∪ B|, always within
 // [0, 1]. Repeated words inside one string count once, so "bio bio" and
 // "bio" are a perfect match. Returns 0 when either string has no words.
 // `cutoff` is accepted for interface compatibility but does not influence
 // the computed value.
 float calculate_similarity(const char *query, const char *target, float cutoff) {
    (void)cutoff;
    // Split strings into words
    char *query_words[MAX_WORDS] = {0};
    char *target_words[MAX_WORDS] = {0};
    int query_word_count = split_into_words(query, query_words);
    int target_word_count = split_into_words(target, target_words);
    float similarity = 0.0f;
    if (query_word_count > 0 && target_word_count > 0) {
        // Size of the target word set
        int unique_target = 0;
        for (int j = 0; j < target_word_count; j++) {
            if (is_first_occurrence(target_words, j)) {
                unique_target++;
            }
        }
        // Size of the query word set and of the intersection
        int unique_query = 0;
        int shared = 0;
        for (int i = 0; i < query_word_count; i++) {
            if (!is_first_occurrence(query_words, i)) {
                continue;
            }
            unique_query++;
            for (int j = 0; j < target_word_count; j++) {
                if (str_case_cmp(query_words[i], target_words[j]) == 0) {
                    shared++;
                    break;
                }
            }
        }
        // Jaccard similarity (intersection over union); the union is never
        // empty here because both sets contain at least one word
        similarity = (float)shared / (float)(unique_query + unique_target - shared);
    }
    free_words(query_words, query_word_count);
    free_words(target_words, target_word_count);
    return similarity;
 }
 // qsort comparator ordering SearchResult entries from highest to lowest
 // similarity; entries with equal similarity compare as equal.
 int compare_results(const void *a, const void *b) {
    float left = ((const SearchResult *)a)->similarity;
    float right = ((const SearchResult *)b)->similarity;
    // Positive when `b` scores higher, so better matches sort first
    return (right > left) - (right < left);
 }
 // Generate a random lowercase word.
 //
 // The word is 3 to 10 letters long, shortened to `max_len` letters when
 // `max_len` is smaller (a negative `max_len` is treated as 0). `word` must
 // have room for the letters plus the terminating NUL, i.e. at least
 // min(max_len, 10) + 1 bytes. A NULL `word` is ignored.
 void generate_random_word(char *word, int max_len) {
    if (word == NULL) {
        return;
    }
    int limit = max_len > 0 ? max_len : 0;
    int len = 3 + rand() % 8; // Random length between 3 and 10
    if (len > limit) {
        len = limit;
    }
    for (int i = 0; i < len; i++) {
        word[i] = 'a' + (rand() % 26);
    }
    word[len] = '\0';
 }
 // Fill `string` with 2 to 6 random words separated by single spaces.
 // Words stop being appended as soon as the next one would not fit within
 // `max_len` bytes.
 void generate_random_string(char *string, int max_len) {
    int word_total = 2 + rand() % 5; // Random number of words between 2 and 6
    size_t used = 0;                 // Characters written so far, excluding the NUL
    string[0] = '\0';
    for (int n = 0; n < word_total; n++) {
        char piece[20];
        generate_random_word(piece, 10);
        size_t piece_len = strlen(piece);
        // Stop once the next word no longer fits
        if (used + piece_len + 1 >= (size_t)max_len) {
            break;
        }
        if (n > 0) {
            string[used++] = ' ';
        }
        // Copy the word together with its terminator
        memcpy(string + used, piece, piece_len + 1);
        used += piece_len;
    }
 }
 // Create a new, empty search index.
 //
 // `capacity` is the number of string slots to allocate up front. A negative
 // capacity is rejected, and a capacity of 0 still allocates one slot so the
 // result never depends on the implementation-defined behaviour of
 // malloc(0). Returns NULL on invalid input or allocation failure; release
 // the index with free_search_index().
 SearchIndex* create_search_index(int capacity) {
    if (capacity < 0) return NULL;
    size_t slots = capacity > 0 ? (size_t)capacity : 1;
    SearchIndex* index = (SearchIndex*)malloc(sizeof(SearchIndex));
    if (!index) return NULL;
    index->strings = (char**)malloc(slots * sizeof(char*));
    if (!index->strings) {
        free(index);
        return NULL;
    }
    index->num_strings = 0;
    return index;
 }
 // Add a copy of `string` to the index.
 //
 // SearchIndex does not record how many slots were allocated, so this
 // function cannot trust the existing allocation to be large enough.
 // Whenever the current count is 0 or a power of two, the slot array is
 // resized to twice the count (one slot for an empty index). That guarantees
 // a free slot for every add, whatever capacity the index was created with,
 // at an amortised constant cost per add.
 // Returns 0 on success, -1 on invalid arguments or allocation failure; the
 // index is left unchanged on failure.
 int add_string_to_index(SearchIndex* index, const char* string) {
    if (!index || !string) return -1;
    if (index->num_strings < 0) return -1;
    size_t count = (size_t)index->num_strings;
    // (count & (count - 1)) == 0 holds exactly for 0 and powers of two
    if ((count & (count - 1)) == 0) {
        size_t new_slots = count ? count * 2 : 1;
        char **grown = (char**)realloc(index->strings, new_slots * sizeof(char*));
        if (!grown) return -1;
        index->strings = grown;
    }
    char *copy = strdup(string);
    if (!copy) return -1;
    index->strings[count] = copy;
    index->num_strings++;
    return 0;
 }
 // Release an index: every stored string, the slot array, then the index
 // structure itself. Passing NULL is a no-op.
 void free_search_index(SearchIndex* index) {
    if (index == NULL) {
        return;
    }
    int remaining = index->num_strings;
    while (remaining-- > 0) {
        free(index->strings[remaining]);
    }
    free(index->strings);
    free(index);
 }
 // Search the index with the given query and similarity cutoff.
 //
 // Every indexed string whose similarity to `query` is at least `cutoff` is
 // reported, ordered from most to least similar. `*num_results` receives the
 // number of matches. The returned array is owned by the caller and must be
 // released with free_search_results(); the `string` members point into the
 // index and must not be freed separately.
 //
 // Returns NULL only for invalid arguments or allocation failure (with
 // `*num_results` set to 0 when it is available). An empty index or a query
 // without matches returns a valid non-NULL array with `*num_results` == 0.
 SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results) {
    if (!index || !query || !num_results) return NULL;
    *num_results = 0;
    // Always request at least one element: malloc(0) may legally return NULL,
    // which would be indistinguishable from an allocation failure
    size_t slots = index->num_strings > 0 ? (size_t)index->num_strings : 1;
    SearchResult* results = (SearchResult*)malloc(slots * sizeof(SearchResult));
    if (!results) return NULL;
    int found = 0;
    // Search through all strings in the index
    for (int i = 0; i < index->num_strings; i++) {
        float similarity = calculate_similarity(query, index->strings[i], cutoff);
        if (similarity >= cutoff) {
            results[found].string = index->strings[i];
            results[found].similarity = similarity;
            found++;
        }
    }
    // Sort results by similarity
    qsort(results, (size_t)found, sizeof(SearchResult), compare_results);
    // Give back the unused tail; if shrinking fails the larger block stays valid
    if (found > 0 && found < index->num_strings) {
        SearchResult* trimmed = (SearchResult*)realloc(results, (size_t)found * sizeof(SearchResult));
        if (trimmed) {
            results = trimmed;
        }
    }
    *num_results = found;
    return results;
 }
 // Release an array returned by search_index(). The strings referenced by
 // the entries are not freed here. `num_results` is not needed for that and
 // is ignored.
 void free_search_results(SearchResult* results, int num_results) {
    (void)num_results;
    free(results);
 }
--- a/similarity_search.h
+++ b/similarity_search.h
@@ -0,0 +1,45 @@
 #ifndef SIMILARITY_SEARCH_H
 #define SIMILARITY_SEARCH_H
 // C linkage so the C implementation can be called from the C++ addon
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Implementation limits: byte size for fixed string buffers and the maximum
 // number of words considered per string
 #define MAX_STRING_LEN 100
 #define MAX_WORDS 20
 // Public API
 // Structure representing the search index
 typedef struct {
    char **strings;   // Heap-allocated array of heap-allocated string copies
    int num_strings;  // Number of entries of `strings` currently in use
 } SearchIndex;
 // Structure to hold a search result
 typedef struct {
    const char *string;  // Points at the matching string stored inside the index (not a copy)
    float similarity;    // Similarity score computed for this string
 } SearchResult;
 // Create a new search index with `capacity` string slots.
 // Returns NULL when allocation fails.
 SearchIndex* create_search_index(int capacity);
 // Add a copy of `string` to the index.
 // Returns 0 on success, -1 on failure.
 int add_string_to_index(SearchIndex* index, const char* string);
 // Free the search index and all associated memory (NULL is accepted)
 void free_search_index(SearchIndex* index);
 // Search the index with the given query and similarity cutoff.
 // Returns an array of SearchResult structs sorted by descending similarity
 // and stores its length in *num_results; returns NULL on failure. The array
 // must be released by the caller with free_search_results().
 SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results);
 // Free the array returned by search_index()
 void free_search_results(SearchResult* results, int num_results);
 #ifdef __cplusplus
 }
 #endif
 #endif /* SIMILARITY_SEARCH_H */ 
--- a/similarity_search_addon.cc
+++ b/similarity_search_addon.cc
@@ -0,0 +1,123 @@
 #include <napi.h>
 #include <string>
 #include "similarity_search.h"
 // Node-API object wrapper that exposes a C SearchIndex to JavaScript as the
 // "SearchIndex" class.
 class SearchIndexWrapper : public Napi::ObjectWrap<SearchIndexWrapper> {
 public:
  // Defines the JavaScript class and attaches it to `exports`
  static Napi::Object Init(Napi::Env env, Napi::Object exports);
  // JavaScript constructor; an optional numeric first argument is the capacity
  SearchIndexWrapper(const Napi::CallbackInfo& info);
  // Releases the wrapped C index
  ~SearchIndexWrapper();
 private:
  // Persistent reference to the JavaScript constructor function
  static Napi::FunctionReference constructor;
  // addString(str): returns the status code of add_string_to_index()
  Napi::Value AddString(const Napi::CallbackInfo& info);
  // search(query[, cutoff]): returns an array of { string, similarity } objects
  Napi::Value Search(const Napi::CallbackInfo& info);
  // size(): returns the number of strings held by the index
  Napi::Value GetSize(const Napi::CallbackInfo& info);
  // Underlying C index, created in the constructor and freed in the destructor
  SearchIndex* index_;
 };
 // Storage for the persistent reference to the JavaScript constructor
 Napi::FunctionReference SearchIndexWrapper::constructor;
 // Register the "SearchIndex" class with its three instance methods and
 // publish it on the module's exports object.
 Napi::Object SearchIndexWrapper::Init(Napi::Env env, Napi::Object exports) {
  Napi::HandleScope scope(env);
  Napi::Function js_class = DefineClass(
    env,
    "SearchIndex",
    {
      InstanceMethod("addString", &SearchIndexWrapper::AddString),
      InstanceMethod("search", &SearchIndexWrapper::Search),
      InstanceMethod("size", &SearchIndexWrapper::GetSize)
    });
  // Hold the constructor in a persistent reference whose destructor is
  // suppressed
  constructor = Napi::Persistent(js_class);
  constructor.SuppressDestruct();
  exports.Set("SearchIndex", js_class);
  return exports;
 }
 // Construct the wrapper and its underlying C index. A numeric first
 // argument selects the capacity; anything else falls back to 500. A failed
 // index creation is reported as a JavaScript error.
 SearchIndexWrapper::SearchIndexWrapper(const Napi::CallbackInfo& info)
  : Napi::ObjectWrap<SearchIndexWrapper>(info) {
  Napi::Env env = info.Env();
  Napi::HandleScope scope(env);
  const bool has_capacity = info.Length() > 0 && info[0].IsNumber();
  const int capacity = has_capacity
    ? info[0].As<Napi::Number>().Int32Value()
    : 500; // Default capacity
  index_ = create_search_index(capacity);
  if (index_ == nullptr) {
    Napi::Error::New(env, "Failed to create search index").ThrowAsJavaScriptException();
  }
 }
 // Release the C index owned by this wrapper (free_search_index accepts NULL)
 SearchIndexWrapper::~SearchIndexWrapper() {
  free_search_index(index_);
 }
 // addString(str): add one string to the index.
 // Throws a TypeError (and returns null) unless the first argument is a
 // string; otherwise returns the numeric status of add_string_to_index().
 Napi::Value SearchIndexWrapper::AddString(const Napi::CallbackInfo& info) {
  Napi::Env env = info.Env();
  Napi::HandleScope scope(env);
  const bool has_string = info.Length() >= 1 && info[0].IsString();
  if (!has_string) {
    Napi::TypeError::New(env, "String expected").ThrowAsJavaScriptException();
    return env.Null();
  }
  const std::string text = info[0].As<Napi::String>().Utf8Value();
  const int status = add_string_to_index(index_, text.c_str());
  return Napi::Number::New(env, status);
 }
 // search(query[, cutoff]): run a similarity search.
 // Throws a TypeError unless the first argument is a string. A numeric second
 // argument overrides the default cutoff of 0.2. Returns a JavaScript array
 // of { string, similarity } objects, or throws when the C search fails.
 Napi::Value SearchIndexWrapper::Search(const Napi::CallbackInfo& info) {
  Napi::Env env = info.Env();
  Napi::HandleScope scope(env);
  const bool has_query = info.Length() >= 1 && info[0].IsString();
  if (!has_query) {
    Napi::TypeError::New(env, "Query string expected").ThrowAsJavaScriptException();
    return env.Null();
  }
  const std::string query = info[0].As<Napi::String>().Utf8Value();
  const bool has_cutoff = info.Length() > 1 && info[1].IsNumber();
  const float cutoff = has_cutoff
    ? info[1].As<Napi::Number>().FloatValue()
    : 0.2f; // Default cutoff
  int match_count = 0;
  SearchResult* matches = search_index(index_, query.c_str(), cutoff, &match_count);
  if (matches == nullptr) {
    Napi::Error::New(env, "Search failed").ThrowAsJavaScriptException();
    return env.Null();
  }
  // Copy every match into a JavaScript object before the C array is freed
  Napi::Array output = Napi::Array::New(env, match_count);
  for (int i = 0; i < match_count; ++i) {
    const SearchResult& hit = matches[i];
    Napi::Object entry = Napi::Object::New(env);
    entry.Set("string", Napi::String::New(env, hit.string));
    entry.Set("similarity", Napi::Number::New(env, hit.similarity));
    output[i] = entry;
  }
  free_search_results(matches, match_count);
  return output;
 }
 // size(): number of strings currently stored in the index
 Napi::Value SearchIndexWrapper::GetSize(const Napi::CallbackInfo& info) {
  Napi::Env env = info.Env();
  Napi::HandleScope scope(env);
  const int stored = index_->num_strings;
  return Napi::Number::New(env, stored);
 }
 // Module entry point: registers the SearchIndex class on `exports`
 Napi::Object Init(Napi::Env env, Napi::Object exports) {
  Napi::Object populated = SearchIndexWrapper::Init(env, exports);
  return populated;
 }
 NODE_API_MODULE(similarity_search_addon, Init) 
--- a/test.js
+++ b/test.js
@@ -0,0 +1,53 @@
 const SimilaritySearch = require('./index');
 // Print one indented "score: text" line per match
 const printMatches = (matches) => {
  for (const match of matches) {
    console.log(`  ${match.similarity.toFixed(2)}: ${match.string}`);
  }
 };
 // Part 1: benchmark a generated index of 500 strings
 console.log('Creating test index with 500 strings...');
 const index = SimilaritySearch.createTestIndex(500);
 console.log(`Index created with ${index.size()} strings`);
 // Queries exercised by the benchmark
 const queries = [
  'bio bizz',
  'substrate light',
  'plant growth',
  'garden mix',
  'random query'
 ];
 console.log('\nRunning benchmark...');
 const benchmarkResults = SimilaritySearch.benchmark(index, queries);
 // Report the outcome of every query
 console.log(`\nSearch results with cutoff: 0.2\n`);
 for (const { query, matches, timeMs, topResults } of benchmarkResults) {
  console.log(`Query: "${query}"`);
  console.log(`Found ${matches} matches in ${timeMs.toFixed(2)} ms`);
  // Best matches kept by the benchmark
  printMatches(topResults);
  console.log('');
 }
 // Part 2: build a small index by hand
 console.log('Creating a custom index...');
 const customIndex = new SimilaritySearch();
 for (const text of ['bio bizz', 'lightmix bizz btio substrate', 'bizz bio mix light']) {
  customIndex.addString(text);
 }
 // Several strings in a single call
 customIndex.addStrings([
  'plant growth bio formula',
  'garden soil substrate'
 ]);
 console.log(`Custom index created with ${customIndex.size()} strings`);
 // A stricter threshold narrows the result list
 console.log('\nSearching with higher similarity threshold (0.3):');
 const results = customIndex.search('bio bizz', 0.3);
 printMatches(results);