genesis
This commit is contained in:
25
.gitignore
vendored
Normal file
25
.gitignore
vendored
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# Node.js dependencies
|
||||||
|
node_modules/
|
||||||
|
npm-debug.log
|
||||||
|
yarn-debug.log
|
||||||
|
yarn-error.log
|
||||||
|
|
||||||
|
# Build outputs
|
||||||
|
build/
|
||||||
|
*.node
|
||||||
|
|
||||||
|
# Editor directories and files
|
||||||
|
.vscode/*
|
||||||
|
!.vscode/c_cpp_properties.json
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
|
||||||
|
# OS generated files
|
||||||
|
.DS_Store
|
||||||
|
.DS_Store?
|
||||||
|
._*
|
||||||
|
.Spotlight-V100
|
||||||
|
.Trashes
|
||||||
|
ehthumbs.db
|
||||||
|
Thumbs.db
|
||||||
18
.vscode/c_cpp_properties.json
vendored
Normal file
18
.vscode/c_cpp_properties.json
vendored
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
{
|
||||||
|
"configurations": [
|
||||||
|
{
|
||||||
|
"name": "Linux",
|
||||||
|
"includePath": [
|
||||||
|
"${workspaceFolder}/**",
|
||||||
|
"${workspaceFolder}/node_modules/node-addon-api",
|
||||||
|
"${workspaceFolder}/node_modules/nan"
|
||||||
|
],
|
||||||
|
"defines": [],
|
||||||
|
"compilerPath": "/usr/bin/gcc",
|
||||||
|
"cStandard": "c11",
|
||||||
|
"cppStandard": "c++14",
|
||||||
|
"intelliSenseMode": "linux-gcc-x64"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"version": 4
|
||||||
|
}
|
||||||
119
README.md
Normal file
119
README.md
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
# Similarity Search
|
||||||
|
|
||||||
|
A Node.js module that performs word-order-independent similarity search on strings.
|
||||||
|
|
||||||
|
This module is built as a native addon that uses C code for fast similarity computations. It uses Jaccard similarity between word sets to find matches regardless of word order.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm install
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
const SimilaritySearch = require('./index');
|
||||||
|
|
||||||
|
// Create a new search index with default capacity (500)
|
||||||
|
const index = new SimilaritySearch();
|
||||||
|
|
||||||
|
// Add strings to the index
|
||||||
|
index.addString('bio bizz');
|
||||||
|
index.addString('lightmix bizz btio substrate');
|
||||||
|
index.addString('bizz bio mix light');
|
||||||
|
|
||||||
|
// Add multiple strings at once
|
||||||
|
index.addStrings([
|
||||||
|
'plant growth bio formula',
|
||||||
|
'garden soil substrate'
|
||||||
|
]);
|
||||||
|
|
||||||
|
// Search the index with a query and similarity cutoff
|
||||||
|
const results = index.search('bio bizz', 0.2);
|
||||||
|
|
||||||
|
// Display results
|
||||||
|
results.forEach(match => {
|
||||||
|
console.log(`${match.similarity.toFixed(2)}: ${match.string}`);
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
## API
|
||||||
|
|
||||||
|
### `new SimilaritySearch([capacity])`
|
||||||
|
|
||||||
|
Creates a new search index.
|
||||||
|
|
||||||
|
- `capacity` (optional): Initial capacity for the index. Default: 500.
|
||||||
|
|
||||||
|
### `addString(str)`
|
||||||
|
|
||||||
|
Adds a string to the index.
|
||||||
|
|
||||||
|
- `str`: The string to add.
|
||||||
|
- Returns: Boolean indicating success.
|
||||||
|
|
||||||
|
### `addStrings(strings)`
|
||||||
|
|
||||||
|
Adds multiple strings to the index.
|
||||||
|
|
||||||
|
- `strings`: Array of strings to add.
|
||||||
|
- Returns: Boolean indicating if all adds were successful.
|
||||||
|
|
||||||
|
### `search(query, [cutoff])`
|
||||||
|
|
||||||
|
Searches the index for strings similar to the query.
|
||||||
|
|
||||||
|
- `query`: The search query.
|
||||||
|
- `cutoff` (optional): Similarity threshold between 0.0 and 1.0. Default: 0.2.
|
||||||
|
- Returns: Array of matching results, sorted by similarity (descending).
|
||||||
|
|
||||||
|
### `size()`
|
||||||
|
|
||||||
|
Gets the number of strings in the index.
|
||||||
|
|
||||||
|
- Returns: Number of strings in the index.
|
||||||
|
|
||||||
|
## Helper Functions
|
||||||
|
|
||||||
|
### `SimilaritySearch.createTestIndex([size])`
|
||||||
|
|
||||||
|
Creates a test index with random data.
|
||||||
|
|
||||||
|
- `size` (optional): Number of strings to generate. Default: 500.
|
||||||
|
- Returns: A new SimilaritySearch instance with random data.
|
||||||
|
|
||||||
|
### `SimilaritySearch.benchmark(index, queries, [cutoff])`
|
||||||
|
|
||||||
|
Benchmarks the search performance.
|
||||||
|
|
||||||
|
- `index`: The index to benchmark.
|
||||||
|
- `queries`: Array of search queries.
|
||||||
|
- `cutoff` (optional): Similarity threshold. Default: 0.2.
|
||||||
|
- Returns: Benchmark results.
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
The similarity search uses Jaccard similarity between word sets:
|
||||||
|
|
||||||
|
```
|
||||||
|
similarity = (number of matching words) / (total unique words)
|
||||||
|
```
|
||||||
|
|
||||||
|
This means word order doesn't matter - "bio bizz" will match with "bizz bio" with 100% similarity.
|
||||||
|
|
||||||
|
## Building
|
||||||
|
|
||||||
|
To rebuild the native addon:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm install
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
Run the test script:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm test
|
||||||
|
```
|
||||||
25
binding.gyp
Normal file
25
binding.gyp
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
{
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"target_name": "similarity_search_addon",
|
||||||
|
"sources": [
|
||||||
|
"similarity_search.c",
|
||||||
|
"similarity_search_addon.cc"
|
||||||
|
],
|
||||||
|
"include_dirs": [
|
||||||
|
"<!@(node -p \"require('node-addon-api').include\")",
|
||||||
|
"<!(node -p \"require('node-addon-api').include_dir\")",
|
||||||
|
"<!(node -e \"require('nan')\")"
|
||||||
|
],
|
||||||
|
"dependencies": [
|
||||||
|
"<!(node -p \"require('node-addon-api').gyp\")"
|
||||||
|
],
|
||||||
|
"cflags!": [ "-fno-exceptions" ],
|
||||||
|
"cflags_cc!": [ "-fno-exceptions" ],
|
||||||
|
"defines": [ "NAPI_DISABLE_CPP_EXCEPTIONS" ],
|
||||||
|
"xcode_settings": {
|
||||||
|
"GCC_ENABLE_CPP_EXCEPTIONS": "YES"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
137
index.js
Normal file
137
index.js
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
const addon = require('./build/Release/similarity_search_addon');
|
||||||
|
|
||||||
|
/**
 * Thin JavaScript facade over the native `SearchIndex` addon class.
 *
 * The native object is kept on `this.index`; every method forwards to it and
 * converts the addon's numeric status codes (0 = success) into booleans.
 */
class SimilaritySearch {
  /**
   * Build an empty index.
   *
   * @param {number} [capacity=500] - Capacity passed to the native index
   */
  constructor(capacity = 500) {
    this.index = new addon.SearchIndex(capacity);
  }

  /**
   * Store one string in the index.
   *
   * @param {string} str - The string to add
   * @returns {boolean} - True when the addon reports status 0, false otherwise
   */
  addString(str) {
    const status = this.index.addString(str);
    return status === 0;
  }

  /**
   * Store several strings in the index. Every string is attempted even if an
   * earlier one failed.
   *
   * @param {string[]} strings - Strings to add, in order
   * @returns {boolean} - True only if every add reported status 0
   */
  addStrings(strings) {
    let failures = 0;
    for (const entry of strings) {
      const status = this.index.addString(entry);
      if (status !== 0) {
        failures += 1;
      }
    }
    return failures === 0;
  }

  /**
   * Look up strings similar to the query.
   *
   * @param {string} query - The search query
   * @param {number} [cutoff=0.2] - Similarity threshold (0.0 to 1.0)
   * @returns {Array<{string: string, similarity: number}>} - Matches as returned by the addon
   */
  search(query, cutoff = 0.2) {
    const matches = this.index.search(query, cutoff);
    return matches;
  }

  /**
   * Report how many strings the index holds.
   *
   * @returns {number} - Number of strings in the index
   */
  size() {
    const count = this.index.size();
    return count;
  }
}
|
||||||
|
|
||||||
|
// Add some functions for convenience
|
||||||
|
/**
 * Generate an index with test data: five fixed sample strings followed by
 * random lowercase strings until the index holds `size` entries.
 *
 * The five fixed strings are always added, so for `size < 5` the returned
 * index still contains five strings.
 *
 * @param {number} [size=500] - Number of strings to generate
 * @returns {SimilaritySearch} - A new SimilaritySearch instance with test data
 */
SimilaritySearch.createTestIndex = function(size = 500) {
  // Fixed strings that the sample queries are expected to hit.
  const seedStrings = [
    "bio bizz",
    "lightmix bizz btio substrate",
    "bizz bio mix light",
    "plant growth bio formula",
    "garden soil substrate"
  ];

  // The seed strings are added unconditionally, so never request less
  // capacity than they need (previously `size` was passed through as-is,
  // which under-allocated the index whenever size < 5).
  const capacity =
    typeof size === 'number' && size < seedStrings.length
      ? seedStrings.length
      : size;
  const index = new SimilaritySearch(capacity);

  for (const seed of seedStrings) {
    index.addString(seed);
  }

  const ALPHABET = 'abcdefghijklmnopqrstuvwxyz';

  // Random lowercase word of exactly `len` characters.
  function randomWord(len) {
    let word = '';
    for (let i = 0; i < len; i++) {
      word += ALPHABET.charAt(Math.floor(Math.random() * ALPHABET.length));
    }
    return word;
  }

  // 2-6 random words of 3-10 characters each, separated by single spaces.
  function randomString() {
    const numWords = 2 + Math.floor(Math.random() * 5);
    const words = [];
    for (let i = 0; i < numWords; i++) {
      words.push(randomWord(3 + Math.floor(Math.random() * 8)));
    }
    return words.join(' ');
  }

  // Fill the remaining slots with random strings.
  for (let i = seedStrings.length; i < size; i++) {
    index.addString(randomString());
  }

  return index;
};
|
||||||
|
|
||||||
|
/**
 * Time `index.search` for each query and collect a summary per query.
 *
 * @param {SimilaritySearch} index - The index to benchmark
 * @param {string[]} queries - Array of search queries
 * @param {number} [cutoff=0.2] - Similarity threshold to use
 * @returns {Object} - Array of `{query, matches, timeMs, topResults}` records,
 *   one per query, where `matches` is the match count and `topResults` holds
 *   at most the first five matches
 */
SimilaritySearch.benchmark = function(index, queries, cutoff = 0.2) {
  const NANOS_PER_MILLI = 1000000;

  // Run one query and measure it with the nanosecond-resolution clock.
  const timeQuery = (query) => {
    const startedAt = process.hrtime.bigint();
    const found = index.search(query, cutoff);
    const finishedAt = process.hrtime.bigint();

    return {
      query,
      matches: found.length,
      timeMs: Number(finishedAt - startedAt) / NANOS_PER_MILLI,
      topResults: found.slice(0, 5)
    };
  };

  const report = [];
  for (const query of queries) {
    report.push(timeQuery(query));
  }
  return report;
};
|
||||||
|
|
||||||
|
module.exports = SimilaritySearch;
|
||||||
28
package-lock.json
generated
Normal file
28
package-lock.json
generated
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
{
|
||||||
|
"name": "similarity-search",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"lockfileVersion": 3,
|
||||||
|
"requires": true,
|
||||||
|
"packages": {
|
||||||
|
"": {
|
||||||
|
"name": "similarity-search",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"hasInstallScript": true,
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"nan": "^2.22.2",
|
||||||
|
"node-addon-api": "^6.0.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/nan": {
|
||||||
|
"version": "2.22.2",
|
||||||
|
"resolved": "https://registry.npmjs.org/nan/-/nan-2.22.2.tgz",
|
||||||
|
"integrity": "sha512-DANghxFkS1plDdRsX0X9pm0Z6SJNN6gBdtXfanwoZ8hooC5gosGFSBGRYHUVPz1asKA/kMRqDRdHrluZ61SpBQ=="
|
||||||
|
},
|
||||||
|
"node_modules/node-addon-api": {
|
||||||
|
"version": "6.1.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.1.0.tgz",
|
||||||
|
"integrity": "sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA=="
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
23
package.json
Normal file
23
package.json
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
{
|
||||||
|
"name": "similarity-search",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"description": "A Node.js module for word order independent string similarity search",
|
||||||
|
"main": "index.js",
|
||||||
|
"scripts": {
|
||||||
|
"install": "node-gyp rebuild",
|
||||||
|
"test": "node test.js"
|
||||||
|
},
|
||||||
|
"keywords": [
|
||||||
|
"search",
|
||||||
|
"similarity",
|
||||||
|
"string",
|
||||||
|
"fuzzy"
|
||||||
|
],
|
||||||
|
"author": "",
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"nan": "^2.22.2",
|
||||||
|
"node-addon-api": "^6.0.0"
|
||||||
|
},
|
||||||
|
"gypfile": true
|
||||||
|
}
|
||||||
BIN
similarity_search
Executable file
BIN
similarity_search
Executable file
Binary file not shown.
198
similarity_search.c
Normal file
198
similarity_search.c
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <ctype.h>
|
||||||
|
#include "similarity_search.h"
|
||||||
|
|
||||||
|
// Compare two NUL-terminated strings after folding each byte with tolower().
// Returns 0 when equal, otherwise the difference of the first differing
// folded bytes (negative if s1 sorts first, positive if s2 does).
int str_case_cmp(const char *s1, const char *s2) {
    const unsigned char *left = (const unsigned char *)s1;
    const unsigned char *right = (const unsigned char *)s2;

    for (; *left != '\0' && *right != '\0'; left++, right++) {
        int diff = tolower(*left) - tolower(*right);
        if (diff != 0) {
            return diff;
        }
    }

    // At least one string ended; the terminator (0) decides the ordering.
    return tolower(*left) - tolower(*right);
}
|
||||||
|
|
||||||
|
// Split a string into words
|
||||||
|
int split_into_words(const char *string, char *words[MAX_WORDS]) {
|
||||||
|
char temp[MAX_STRING_LEN];
|
||||||
|
strcpy(temp, string);
|
||||||
|
|
||||||
|
int word_count = 0;
|
||||||
|
char *token = strtok(temp, " \t\n");
|
||||||
|
|
||||||
|
while (token != NULL && word_count < MAX_WORDS) {
|
||||||
|
words[word_count] = strdup(token);
|
||||||
|
word_count++;
|
||||||
|
token = strtok(NULL, " \t\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
return word_count;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Release the first word_count entries of words[] with free().
// The array itself is not freed and its slots are left unchanged.
void free_words(char *words[], int word_count) {
    int i = 0;
    while (i < word_count) {
        free(words[i]);
        i++;
    }
}
|
||||||
|
|
||||||
|
// Calculate similarity between query and target string
|
||||||
|
float calculate_similarity(const char *query, const char *target, float cutoff) {
|
||||||
|
// Split strings into words
|
||||||
|
char *query_words[MAX_WORDS] = {0};
|
||||||
|
char *target_words[MAX_WORDS] = {0};
|
||||||
|
|
||||||
|
int query_word_count = split_into_words(query, query_words);
|
||||||
|
int target_word_count = split_into_words(target, target_words);
|
||||||
|
|
||||||
|
if (query_word_count == 0 || target_word_count == 0) {
|
||||||
|
free_words(query_words, query_word_count);
|
||||||
|
free_words(target_words, target_word_count);
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Count matches
|
||||||
|
int matches = 0;
|
||||||
|
for (int i = 0; i < query_word_count; i++) {
|
||||||
|
for (int j = 0; j < target_word_count; j++) {
|
||||||
|
if (str_case_cmp(query_words[i], target_words[j]) == 0) {
|
||||||
|
matches++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate Jaccard similarity (intersection over union)
|
||||||
|
float similarity = (float)matches / (query_word_count + target_word_count - matches);
|
||||||
|
|
||||||
|
free_words(query_words, query_word_count);
|
||||||
|
free_words(target_words, target_word_count);
|
||||||
|
|
||||||
|
return similarity;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compare function for qsort to sort results by similarity (descending)
|
||||||
|
int compare_results(const void *a, const void *b) {
|
||||||
|
const SearchResult *result_a = (const SearchResult *)a;
|
||||||
|
const SearchResult *result_b = (const SearchResult *)b;
|
||||||
|
|
||||||
|
if (result_b->similarity > result_a->similarity) return 1;
|
||||||
|
if (result_b->similarity < result_a->similarity) return -1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generate a random lowercase word using rand().
//
// The word length is drawn from 3..10 and then capped at max_len, which was
// previously ignored. max_len is treated as the maximum number of characters
// excluding the terminating NUL, matching the in-file caller that passes 10
// with a 20-byte buffer, so `word` must have room for max_len + 1 bytes.
// A non-positive max_len yields an empty string.
void generate_random_word(char *word, int max_len) {
    int len = 3 + rand() % 8;  // Random length between 3 and 10
    if (max_len < 0) max_len = 0;
    if (len > max_len) len = max_len;

    for (int i = 0; i < len; i++) {
        word[i] = 'a' + (rand() % 26);
    }
    word[len] = '\0';
}
|
||||||
|
|
||||||
|
// Fill `string` with 2-6 random words separated by single spaces.
// Words come from generate_random_word(); generation stops early as soon as
// the next word (plus one byte of slack) would not fit within max_len.
void generate_random_string(char *string, int max_len) {
    const int num_words = 2 + rand() % 5;  // Random number of words between 2 and 6
    size_t used = 0;                       // Current length of `string`

    string[0] = '\0';

    for (int i = 0; i < num_words; i++) {
        char word[20];
        generate_random_word(word, 10);
        const size_t word_len = strlen(word);

        // Stop when there is not enough space left for this word.
        if (used + word_len + 1 >= (size_t)max_len) {
            break;
        }

        if (i > 0) {
            string[used++] = ' ';
        }
        memcpy(string + used, word, word_len + 1);  // includes the NUL
        used += word_len;
    }
}
|
||||||
|
|
||||||
|
// Create a new search index
|
||||||
|
SearchIndex* create_search_index(int capacity) {
|
||||||
|
SearchIndex* index = (SearchIndex*)malloc(sizeof(SearchIndex));
|
||||||
|
if (!index) return NULL;
|
||||||
|
|
||||||
|
index->strings = (char**)malloc(capacity * sizeof(char*));
|
||||||
|
if (!index->strings) {
|
||||||
|
free(index);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
index->num_strings = 0;
|
||||||
|
return index;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add a string to the index
|
||||||
|
int add_string_to_index(SearchIndex* index, const char* string) {
|
||||||
|
if (!index || !string) return -1;
|
||||||
|
|
||||||
|
index->strings[index->num_strings] = strdup(string);
|
||||||
|
if (!index->strings[index->num_strings]) return -1;
|
||||||
|
|
||||||
|
index->num_strings++;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Free the search index and all associated memory
|
||||||
|
void free_search_index(SearchIndex* index) {
|
||||||
|
if (!index) return;
|
||||||
|
|
||||||
|
for (int i = 0; i < index->num_strings; i++) {
|
||||||
|
free(index->strings[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
free(index->strings);
|
||||||
|
free(index);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Search the index with the given query and similarity cutoff
|
||||||
|
SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results) {
|
||||||
|
if (!index || !query || !num_results) return NULL;
|
||||||
|
|
||||||
|
// Allocate temporary array for results
|
||||||
|
SearchResult* temp_results = (SearchResult*)malloc(index->num_strings * sizeof(SearchResult));
|
||||||
|
if (!temp_results) return NULL;
|
||||||
|
|
||||||
|
*num_results = 0;
|
||||||
|
|
||||||
|
// Search through all strings in the index
|
||||||
|
for (int i = 0; i < index->num_strings; i++) {
|
||||||
|
float similarity = calculate_similarity(query, index->strings[i], cutoff);
|
||||||
|
|
||||||
|
if (similarity >= cutoff) {
|
||||||
|
temp_results[*num_results].string = index->strings[i];
|
||||||
|
temp_results[*num_results].similarity = similarity;
|
||||||
|
(*num_results)++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort results by similarity
|
||||||
|
qsort(temp_results, *num_results, sizeof(SearchResult), compare_results);
|
||||||
|
|
||||||
|
// Allocate final result array with exact size
|
||||||
|
SearchResult* results = (SearchResult*)malloc(*num_results * sizeof(SearchResult));
|
||||||
|
if (!results) {
|
||||||
|
free(temp_results);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copy results to final array
|
||||||
|
memcpy(results, temp_results, *num_results * sizeof(SearchResult));
|
||||||
|
free(temp_results);
|
||||||
|
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Free the search results
|
||||||
|
void free_search_results(SearchResult* results, int num_results) {
|
||||||
|
free(results);
|
||||||
|
}
|
||||||
45
similarity_search.h
Normal file
45
similarity_search.h
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
#ifndef SIMILARITY_SEARCH_H
|
||||||
|
#define SIMILARITY_SEARCH_H
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define MAX_STRING_LEN 100
|
||||||
|
#define MAX_WORDS 20
|
||||||
|
|
||||||
|
// Public API
|
||||||
|
|
||||||
|
// Structure representing the search index
|
||||||
|
typedef struct {
|
||||||
|
char **strings;
|
||||||
|
int num_strings;
|
||||||
|
} SearchIndex;
|
||||||
|
|
||||||
|
// Structure to hold a search result
|
||||||
|
typedef struct {
|
||||||
|
const char *string;
|
||||||
|
float similarity;
|
||||||
|
} SearchResult;
|
||||||
|
|
||||||
|
// Create a new search index
|
||||||
|
SearchIndex* create_search_index(int capacity);
|
||||||
|
|
||||||
|
// Add a string to the index
|
||||||
|
int add_string_to_index(SearchIndex* index, const char* string);
|
||||||
|
|
||||||
|
// Free the search index and all associated memory
|
||||||
|
void free_search_index(SearchIndex* index);
|
||||||
|
|
||||||
|
// Search the index with the given query and similarity cutoff
|
||||||
|
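// Returns a heap-allocated array of SearchResult structs (count written to
// *num_results) that the caller must release with free_search_results().
// Each result's `string` points into the index and is not a separate copy.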
|
||||||
|
SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results);
|
||||||
|
|
||||||
|
// Free the search results
|
||||||
|
void free_search_results(SearchResult* results, int num_results);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* SIMILARITY_SEARCH_H */
|
||||||
123
similarity_search_addon.cc
Normal file
123
similarity_search_addon.cc
Normal file
@@ -0,0 +1,123 @@
|
|||||||
|
#include <napi.h>
|
||||||
|
#include <string>
|
||||||
|
#include "similarity_search.h"
|
||||||
|
|
||||||
|
// JavaScript-visible wrapper around the C SearchIndex. Init() registers it on
// the module exports under the name "SearchIndex".
class SearchIndexWrapper : public Napi::ObjectWrap<SearchIndexWrapper> {
    // Reference to the JS constructor function, set in Init().
    static Napi::FunctionReference constructor;

    // Native index created in the constructor and freed in the destructor.
    SearchIndex* index_;

    // Instance methods exposed to JavaScript as addString / search / size.
    Napi::Value AddString(const Napi::CallbackInfo& info);
    Napi::Value Search(const Napi::CallbackInfo& info);
    Napi::Value GetSize(const Napi::CallbackInfo& info);

public:
    static Napi::Object Init(Napi::Env env, Napi::Object exports);
    SearchIndexWrapper(const Napi::CallbackInfo& info);
    ~SearchIndexWrapper();
};
|
||||||
|
|
||||||
|
Napi::FunctionReference SearchIndexWrapper::constructor;
|
||||||
|
|
||||||
|
// Define the JS class "SearchIndex" with its three instance methods, keep a
// persistent reference to its constructor, and attach it to `exports`.
Napi::Object SearchIndexWrapper::Init(Napi::Env env, Napi::Object exports) {
    Napi::HandleScope scope(env);

    Napi::Function js_class = DefineClass(
        env,
        "SearchIndex",
        {
            InstanceMethod("addString", &SearchIndexWrapper::AddString),
            InstanceMethod("search", &SearchIndexWrapper::Search),
            InstanceMethod("size", &SearchIndexWrapper::GetSize)
        });

    constructor = Napi::Persistent(js_class);
    constructor.SuppressDestruct();

    exports.Set("SearchIndex", js_class);
    return exports;
}
|
||||||
|
|
||||||
|
// JS constructor: `new SearchIndex([capacity])`.
// Uses the first argument as the capacity when it is a number, otherwise 500,
// and throws a JS Error if the native index cannot be created.
SearchIndexWrapper::SearchIndexWrapper(const Napi::CallbackInfo& info)
    : Napi::ObjectWrap<SearchIndexWrapper>(info) {
    Napi::Env env = info.Env();
    Napi::HandleScope scope(env);

    const bool has_capacity = info.Length() > 0 && info[0].IsNumber();
    const int capacity = has_capacity
        ? info[0].As<Napi::Number>().Int32Value()
        : 500;  // Default capacity

    index_ = create_search_index(capacity);
    if (index_ == nullptr) {
        Napi::Error::New(env, "Failed to create search index").ThrowAsJavaScriptException();
    }
}
|
||||||
|
|
||||||
|
// Release the native index owned by this wrapper.
SearchIndexWrapper::~SearchIndexWrapper() {
    free_search_index(index_);
}
|
||||||
|
|
||||||
|
// JS method `addString(str)`.
// Throws a TypeError (and returns null) unless the first argument is a
// string; otherwise returns the numeric status from add_string_to_index()
// (0 on success, -1 on failure).
Napi::Value SearchIndexWrapper::AddString(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();
    // No local HandleScope here: the returned value must stay valid after this
    // function returns, and a handle created inside a scope opened here would
    // be released when that scope is destroyed on return.

    if (info.Length() < 1 || !info[0].IsString()) {
        Napi::TypeError::New(env, "String expected").ThrowAsJavaScriptException();
        return env.Null();
    }

    const std::string str = info[0].As<Napi::String>().Utf8Value();
    const int result = add_string_to_index(this->index_, str.c_str());

    return Napi::Number::New(env, result);
}
|
||||||
|
|
||||||
|
// JS method `search(query, [cutoff])`.
// Throws a TypeError (and returns null) unless `query` is a string. `cutoff`
// defaults to 0.2 when absent or not a number. Returns an array of
// `{string, similarity}` objects in the order produced by search_index(),
// or throws an Error (and returns null) if the native search fails.
Napi::Value SearchIndexWrapper::Search(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();
    // No function-level HandleScope: the result array is returned to the
    // caller, so its handle must not belong to a scope that closes on return.

    if (info.Length() < 1 || !info[0].IsString()) {
        Napi::TypeError::New(env, "Query string expected").ThrowAsJavaScriptException();
        return env.Null();
    }

    const std::string query = info[0].As<Napi::String>().Utf8Value();

    float cutoff = 0.2f;  // Default cutoff
    if (info.Length() > 1 && info[1].IsNumber()) {
        cutoff = info[1].As<Napi::Number>().FloatValue();
    }

    int num_results = 0;
    SearchResult* results = search_index(this->index_, query.c_str(), cutoff, &num_results);

    if (!results) {
        Napi::Error::New(env, "Search failed").ThrowAsJavaScriptException();
        return env.Null();
    }

    Napi::Array result_array = Napi::Array::New(env, num_results);

    for (int i = 0; i < num_results; i++) {
        // Per-iteration scope so temporaries from each pass can be released;
        // the array keeps the stored object alive.
        Napi::HandleScope iteration_scope(env);

        Napi::Object obj = Napi::Object::New(env);
        obj.Set("string", Napi::String::New(env, results[i].string));
        obj.Set("similarity", Napi::Number::New(env, results[i].similarity));
        result_array[i] = obj;
    }

    // The JS strings above are copies, so the native array can go now.
    free_search_results(results, num_results);

    return result_array;
}
|
||||||
|
|
||||||
|
// JS method `size()`: returns the number of strings stored in the index.
Napi::Value SearchIndexWrapper::GetSize(const Napi::CallbackInfo& info) {
    Napi::Env env = info.Env();
    // No local HandleScope: the returned number must outlive this function.

    return Napi::Number::New(env, this->index_->num_strings);
}
|
||||||
|
|
||||||
|
// Module entry point: registers the SearchIndex class on the exports object.
Napi::Object Init(Napi::Env env, Napi::Object exports) {
    Napi::Object populated = SearchIndexWrapper::Init(env, exports);
    return populated;
}
|
||||||
|
|
||||||
|
NODE_API_MODULE(similarity_search_addon, Init)
|
||||||
53
test.js
Normal file
53
test.js
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
const SimilaritySearch = require('./index');
|
||||||
|
|
||||||
|
// Print a list of matches, one per line, as "  <similarity>: <string>".
const printMatches = (matches) => {
  for (const match of matches) {
    console.log(`  ${match.similarity.toFixed(2)}: ${match.string}`);
  }
};

// Build a 500-entry test index (fixed samples plus random filler).
console.log('Creating test index with 500 strings...');
const index = SimilaritySearch.createTestIndex(500);
console.log(`Index created with ${index.size()} strings`);

// Queries exercised by the benchmark.
const queries = [
  'bio bizz',
  'substrate light',
  'plant growth',
  'garden mix',
  'random query'
];

console.log('\nRunning benchmark...');
const benchmarkResults = SimilaritySearch.benchmark(index, queries);

// Report match count, timing and top matches for every query.
console.log(`\nSearch results with cutoff: 0.2\n`);
for (const result of benchmarkResults) {
  console.log(`Query: "${result.query}"`);
  console.log(`Found ${result.matches} matches in ${result.timeMs.toFixed(2)} ms`);

  printMatches(result.topResults);
  console.log('');
}

// Build a small index by hand, using both single and bulk inserts.
console.log('Creating a custom index...');
const customIndex = new SimilaritySearch();
for (const entry of [
  'bio bizz',
  'lightmix bizz btio substrate',
  'bizz bio mix light'
]) {
  customIndex.addString(entry);
}

customIndex.addStrings([
  'plant growth bio formula',
  'garden soil substrate'
]);

console.log(`Custom index created with ${customIndex.size()} strings`);

// Repeat a query against the small index with a stricter threshold.
console.log('\nSearching with higher similarity threshold (0.3):');
const results = customIndex.search('bio bizz', 0.3);
printMatches(results);
|
||||||
Reference in New Issue
Block a user