From 51a3cc6c2df250624a05051cae74eeefbf0e1d23 Mon Sep 17 00:00:00 2001 From: seb Date: Fri, 18 Apr 2025 08:22:35 +0200 Subject: [PATCH] genesis --- .gitignore | 25 +++++ .vscode/c_cpp_properties.json | 18 ++++ README.md | 119 ++++++++++++++++++++ a.out | Bin 0 -> 17000 bytes binding.gyp | 25 +++++ index.js | 137 +++++++++++++++++++++++ package-lock.json | 28 +++++ package.json | 23 ++++ similarity_search | Bin 0 -> 17000 bytes similarity_search.c | 198 ++++++++++++++++++++++++++++++++++ similarity_search.h | 45 ++++++++ similarity_search_addon.cc | 123 +++++++++++++++++++++ test.js | 53 +++++++++ 13 files changed, 794 insertions(+) create mode 100644 .gitignore create mode 100644 .vscode/c_cpp_properties.json create mode 100644 README.md create mode 100755 a.out create mode 100644 binding.gyp create mode 100644 index.js create mode 100644 package-lock.json create mode 100644 package.json create mode 100755 similarity_search create mode 100644 similarity_search.c create mode 100644 similarity_search.h create mode 100644 similarity_search_addon.cc create mode 100644 test.js diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..13bcd74 --- /dev/null +++ b/.gitignore @@ -0,0 +1,25 @@ +# Node.js dependencies +node_modules/ +npm-debug.log +yarn-debug.log +yarn-error.log + +# Build outputs +build/ +*.node + +# Editor directories and files +.vscode/* +!.vscode/c_cpp_properties.json +.idea/ +*.swp +*.swo + +# OS generated files +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db \ No newline at end of file diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..a55b628 --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,18 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**", + "${workspaceFolder}/node_modules/node-addon-api", + "${workspaceFolder}/node_modules/nan" + ], + "defines": [], + "compilerPath": "/usr/bin/gcc", + "cStandard": "c11", + "cppStandard": "c++14", + "intelliSenseMode": "linux-gcc-x64" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..10309d2 --- /dev/null +++ b/README.md @@ -0,0 +1,119 @@ +# Similarity Search + +A Node.js module that performs word order independent similarity search on strings. + +This module is built as a native addon that uses C code for fast similarity computations. It uses Jaccard similarity between word sets to find matches regardless of word order. + +## Installation + +```bash +npm install +``` + +## Usage + +```javascript +const SimilaritySearch = require('./index'); + +// Create a new search index with default capacity (500) +const index = new SimilaritySearch(); + +// Add strings to the index +index.addString('bio bizz'); +index.addString('lightmix bizz btio substrate'); +index.addString('bizz bio mix light'); + +// Add multiple strings at once +index.addStrings([ + 'plant growth bio formula', + 'garden soil substrate' +]); + +// Search the index with a query and similarity cutoff +const results = index.search('bio bizz', 0.2); + +// Display results +results.forEach(match => { + console.log(`${match.similarity.toFixed(2)}: ${match.string}`); +}); +``` + +## API + +### `new SimilaritySearch([capacity])` + +Creates a new search index. + +- `capacity` (optional): Initial capacity for the index. Default: 500. + +### `addString(str)` + +Adds a string to the index. + +- `str`: The string to add. +- Returns: Boolean indicating success. + +### `addStrings(strings)` + +Adds multiple strings to the index. + +- `strings`: Array of strings to add. +- Returns: Boolean indicating if all adds were successful. + +### `search(query, [cutoff])` + +Searches the index for strings similar to the query. + +- `query`: The search query. +- `cutoff` (optional): Similarity threshold between 0.0 and 1.0. Default: 0.2. +- Returns: Array of matching results, sorted by similarity (descending). + +### `size()` + +Gets the number of strings in the index. + +- Returns: Number of strings in the index. + +## Helper Functions + +### `SimilaritySearch.createTestIndex([size])` + +Creates a test index with random data. + +- `size` (optional): Number of strings to generate. Default: 500. +- Returns: A new SimilaritySearch instance with random data. + +### `SimilaritySearch.benchmark(index, queries, [cutoff])` + +Benchmarks the search performance. + +- `index`: The index to benchmark. +- `queries`: Array of search queries. +- `cutoff` (optional): Similarity threshold. Default: 0.2. +- Returns: Benchmark results. + +## How It Works + +The similarity search uses Jaccard similarity between word sets: + +``` +similarity = (number of matching words) / (total unique words) +``` + +This means word order doesn't matter - "bio bizz" will match with "bizz bio" with 100% similarity. + +## Building + +To rebuild the native addon: + +```bash +npm install +``` + +## Testing + +Run the test script: + +```bash +npm test +``` \ No newline at end of file diff --git a/a.out b/a.out new file mode 100755 index 0000000000000000000000000000000000000000..e4ebf0bf536e7616d3e1ad8a644893861f713964 GIT binary patch literal 17000 zcmeHOf0R_!eZR8{vLG_60YymVWdRG~4nIU9Qe;>dxTGK`VAWKI+4-?Mb#`W)c@ut! zxOA8tCgYmbsKK^LIjN`R#Gb}9Mo|jPVq{lpat2RKwr$$8$D?+(i;6&!)gaT)_x_l9 z^W5?DoVKU^WA8aL-~0V}zxR9Z{l0hKd-oB~`ZaY92h*jV-Np!WUnMDS#c01(RRG-V z7B&h02Ajby13pI3B;PFoxUw|XSHlKHCo<_htN3>KMJ1=9_K+ay)hdU^OOA%DQR1XG zmZ}Q3!_*`xr=jMT+cPAe+2E7Pe+{jA6dQN5)N?PdhJ`wb_baIN2qnEuN^g_W)9@@g zgrcF2C&h$*o0LCxxPX*UG_>n=Dm^=#CDoXQ)CLuT;x|*EY>W+dOES|?>urA53{c(f zpT@O+8frf`DZPRr4cnoq+S5?CcL($+p8wU-s08m+?bRB;yc#zR)v$<|7_dQeZSCz_ z8N~ABiwZBR>y)@vezM||{6iOfuRR)_^Y2ew{c+~9-+zD3HHTaNXnrKzx@7VENT4|q zj;3}r@3>`2^OD7`WX!dQ?6WEOkv-YZy{JhV!PCK?br7sot_4Fa^(v9sHRxMv(2sy_ z;Ab!7*GTriQG-s?ekA!7HRv=yMv_n0phs)ae+jyQpS@7p8_9mC27N;f`b{_!MrnhAe=C-ZufrqBO?*WdntH{=UPSSS$;5;KsBvv|t% zxBC*zABp)NAfg$2fSKVAArJ{hg%O|0I(!jGz!oN|pA05UZ-*}&Wo9fA+a64?WWpB> zfbZYo^M=AvUnIOM$l{4`)C{o)ld*)!NRwvt61H~zx|UVmMXtr=+D(<}BG=8#yKc({ zZy=ZmwuO^sFtKIBsz@vv+~RAE1kprWM=Yvb@Ji7U1sPUO9ag17K&p=swF+KW=25sD zkfmIv^TXtD_zKE^f2L?JJpMG6Up;$P;i+k1T#4$~4}eqA`CsQV;|r3`Resm>0X4V{ zZpqhljrY?+KtW^WoI?13g|2fS(Tf)PL?Wni)1W3!U1~rPD&!u^`@l3te6Eg-W-DK0zA+JYb<;ZlNEv&@rIp za>zngw+4duyoK&8OEY%ZLf3PI*m(>6N{f7-g|4Q8;1(=&x-N{C(Flx2U^D`w5g3iY z|3(DPJFokMr_VI-+}MAb&lu0-&AL*7r+?D$s%Wlsa}&s=*|YHPoZ)69-%gytw@al` zs9WH4LooPe1*aQXpQzw;V=(xq6`XDe1``#WZUhEf zD>&T%4BlD6>B2wQQo-qhKe(uZ(~ZzzQw66B|KL>>oG$o-brqa0^n>Rv>Hd~xA9FDl za?V)IL_A8fC)x1JZ1|<-YWd&T@DFVGdp7*vZ1^v1_|I(kNgIC5hQDaT|IUX0wGIES z4S%`{U-_ZSS!Mc_|0}AgN z`vifd!{!uBy}PK6sW-kf;GEG(a<3`~`LK|`xs|}&OFVm)_k6H|_YBu_$4P$rk~tX$ zW-9{?r2#oETA$`;=W=4R)V!@cy}XX9Jo~OWk>{3Q2BbI_16vGWv`>!x^hVs>-Ur=s z`=md(9Zafhh2CbYmN`88a6iwc3PUYC=jnI8$eb_cv%SS{!cx{#1g%h}JtSyna-QNc z&l$4;aGvMP&{ufc)9=7v!J#bd%bS-g#eMnIAw`vTzf3k>Ra{A$MWhqi{<#I7yKngQ zS1a!AJ##?x@Mt@{=Gh-_0|@ZknAvj#*;8DY=eef@v&+-Tv%lc*eCwyBQg6nDD1V%;eITj29-W(Nd@Hs*vNKft>tzb3?U8ES-geX)+`gbbNHAyn$qLfC-;e@2)) zd7M;=`zi)``Y73D=J@nc^56@5zN=cuZ7l})yvf3Ib8PYKMKn^?Ta1N)u_$I!@AP^u zlHsA5InPC_YZwj(VuX(L7B|8`+H=tXk!7BDcE8o#T?riDd!`$CjAxJTM`#hZPJr{y zD>J(Rt??U1{JNfc9@N9s;LCKE&a=Nf&z}Q%6Z{{6OwtIAv@8O?9+Yh6S)xlVu&G5A z?`FY1g#YwW5)#27l%>9$drGQh$f_K^GpOaBB0lfRkPnjl76>8dhxx$|;A3y*08nzd zH}e;&Hngobb5Pe;II{+L!J2I~zEW%3>1L$^bq72oSZA^QHv2cc$d4A=5^bYh`nf6og z3dj~Tk9hXuxJ0R(V~h@GQ!iLoI5K2$GP&5p{XIOKZwrM&yr*bnPn`ZF+Xtt~*`9$q zM`3PH_Ec9(UDopkT0acyy`DZA&!H4jXyF*$hfpYnDKrqE`t)H6)2qZMbRwaSevUBb zJTG9NmYztqq#y2Md!}oR!QYYwjoo4NFVxEnB``Z&PxgAgjdBs|J&8la2D zE%+CGz9%Q@);TOh=(=z`0FmM&La)A-oXFG1NRlFR7lM`+79D$!QX}WF3itdu@+7rO z8U5wfaLi~8@7l$)4JX|VOtaE?tWd^}M8a+DW=D7jKPIG%R+DF^@@%|;zl=I`lPP|r z6^}K2W)MyCD|q&9+>J>4>3pMVW7jUI$M_KgrVQ9bYY=Scbvt3+(&$tnACLH=rqPz* zM`GK}cA+Wd`O6A7_>oX7(UFSysL}DTb!DYdgNz(&^CbeosFCEyW8nzgY)$g)0-oK` zz>ih@jZeZwm5-rlW#SZ)uCc2mwnJhbiFHbBwZ!&IY^B7yfnmju{UK0FxEwf=@V}gQ z-uoR<=gvL-A?}Rkk6oavW7pW}La|q*#A5W2W8c9=4b@C-9%FLHeG zIcaU1kjw}$PuQkw&pGbAA%D)w32X<$VlS{>@ zVAJ?G(nkr5aX$7{tgGJ4Y9PYF8*`%m*;QM*2OL zw%eduNjgZk4V9fGwLV(@uOmS3JE$1rCor8(86_N%6)D{z$LX9#8I7sjgk;F*txl}N zcrX=A>{K1u9P}mp?cr#fF*{(u2x>`Vdl;E88SV&2e2K8R)9|OvSSWO>G268$G+_d} zgY<7Tnr0`PCa^WJR5U=!9X`B#3MP$kRLB_}$qCF5RVbp)*iYYBe;YO1xzDvyH1GbV z+cCX<8NKtm0>4ckmP&^Jx1BAO3V?CIBEgSvX99Q+rY*h97zUgL_#Vwxz=c@Y?SR+) zrc~MwxEJscAYB0a0B7NHTLhemdxVMVrH6i2@7Q%GbL?nzOrJQef%X$br*9bg(f0L# zZ1p(0K@#uN7Xu#rpj2vv9Gld*X42HJI4|Gc(8*RzTYAGyvzkbb^#2&Y{pgEOBq9pY zP5e4Rx4w`uKzkIwJJ83!1Z18`jbE=@b;V_#bba09a0(;?zvuCL12VKTD9doj{Hv59 zpMQ?u(}>LnSx_is|i@`q)zUY5h ze_UCA3-~>_R(z{U{~JQz0L=t{F~&wDGObVj{78r}lA%kI6XTY`1*q)9mUMmn-D9K> z$i$-9LX^=vcq&NrYGp|NHsE`vT{^X9^+xtklesel}b_ZY2PG@a+F?;Qkg1&_?A>v z1yE|ty+Tsec87H;ecFdmp;xq2u91MvQ4VT)yQ*lv^GZ&CyQSM#VJY|TcO}_BG$&N? z@Iv%=j{jrg zg3l|MSFoVqfP!ZgWa^``MgUHKWL-hr09Ab z0u^N){vB*OYjjIo&lB3^IWSxGx{_SRcB4Y=>UFK@`-S|~<#np*e}l?M{@3akeJAZ; z;Lg+*DsUt_uM7F9%s$WF6lzzof>F_@+PnBUkX7`0)!g5qVhfpIO+-LOq8l~n%Rx7^ zQ45@h#9&aia$AU~-c!R)qy~Kt=#8+W$4gtw)R6zSpgY-I3OI^*<^lLD(d!z^c~&s- z_kAUQK_whLQG06G|9K7ihc)PRh{IKwKR#vuTdLt}1YLVkfpbBp{@UZVMA_+6{Swb< zptcrtqoQ7}-39uUb<>z$$C}|MIwc9EM9(^?B!qogxBjPV(7#=S{(aCJvA(9dtyUPD zFXWx&b423V7es!b`#~rT`_$5?rRNuOj(8+!1_Q1IBTLYS`(dvyk?`&G2BRjP_=XaA*5wVPIy&ea zfd%tI(yS7#JpA=0#S>yU=k>0+bL9q)*K_-7FFw^@eb4PHH>_I)GM>SCJzR15>N}Zt z?fQ)^E7yBBu359$v&Fk*Wy^YxSDaLE>(c^sFhCtwsG@!5OhTE9Qw5)Qk^s-HaXz9f zP4>k}g(`|S81R`sc|M|w0d23}mkfIS9dRZi$J&C?AiZhy%BSWyqd*VG=?Dc*QdBA1 z&t#ODi`AKlGJ(9Z+ua^Z1muB>Dlv>YT$LvySTY{L0O9yW%uC`lC#ug)cmv6pw;hl7 z=}g7CjZh1Oqux|97!c<_sw}HR9#u4XBBY9hNkd0KT9ZlT1f4Wd&5Oe#a$=~sB-L?~ zDibs&KC_DC_2T@8o`RP{er_o1*nFN;Qk7uUHVi1cob^OhwEj^ZO!xytBms zRwgrJuH?=R)7J`!(=h_J>smBs23>8@lq;Ty#e)fRCv%lat*J1MbA&lBO}x%!RDdfn=eQ z+kXSlzKCx5ksd|S=LI!fpy(RteTK&IL1vYHC-Ny34QENT{I9Q*ny?ZT2Y$N!TIW@@ zJjq%M-G|~;MNI4K{hEfleZ7CP`@b3bv{%#mdOxROqhipuhuYTqdSCTb(5TB=U+)h! z6iVRhV)vh}Pqf!6>#O~h-e=*%Nl~=_T28|_xU_fExZb~M=$7i@T$9y*MP`qnLSv%! z^?p*ryb{#y+xwqY`YlRM?@u-CR2?RLy4A4j(|sI?sBTxT=jysn^V3@F^XD-zt@?)2 z(@@{PXd8-MzHNBMroTY#^EA}*we-Jd(|0R<4ejHv?P>U&U0>Y`y{7DU+(x1$YcJ&jL;GSPJVdcV-9^zHK0mbJ`K?JMyzOY2`> zrpSucw?aL4t)jOmgZowDqvs~HB(*;}|BnYv`)<|G-z>35Y)sv*mZfxI6A`QxOQpmd Jn}Q8x{|S59+QI+; literal 0 HcmV?d00001 diff --git a/binding.gyp b/binding.gyp new file mode 100644 index 0000000..8f7c30d --- /dev/null +++ b/binding.gyp @@ -0,0 +1,25 @@ +{ + "targets": [ + { + "target_name": "similarity_search_addon", + "sources": [ + "similarity_search.c", + "similarity_search_addon.cc" + ], + "include_dirs": [ + "} - Array of matching results + */ + search(query, cutoff = 0.2) { + return this.index.search(query, cutoff); + } + + /** + * Get the number of strings in the index + * + * @returns {number} - Number of strings in the index + */ + size() { + return this.index.size(); + } +} + +// Add some functions for convenience +/** + * Generate an index with random test data + * + * @param {number} [size=500] - Number of strings to generate + * @returns {SimilaritySearch} - A new SimilaritySearch instance with random data + */ +SimilaritySearch.createTestIndex = function(size = 500) { + const index = new SimilaritySearch(size); + + // Add some specific test strings + index.addString("bio bizz"); + index.addString("lightmix bizz btio substrate"); + index.addString("bizz bio mix light"); + index.addString("plant growth bio formula"); + index.addString("garden soil substrate"); + + // Generate random strings + function randomWord(len) { + const chars = 'abcdefghijklmnopqrstuvwxyz'; + let word = ''; + for (let i = 0; i < len; i++) { + word += chars.charAt(Math.floor(Math.random() * chars.length)); + } + return word; + } + + function randomString() { + const numWords = 2 + Math.floor(Math.random() * 5); // 2-6 words + let str = ''; + for (let i = 0; i < numWords; i++) { + if (i > 0) str += ' '; + str += randomWord(3 + Math.floor(Math.random() * 8)); // 3-10 chars + } + return str; + } + + // Generate the rest of the strings + for (let i = 5; i < size; i++) { + index.addString(randomString()); + } + + return index; +}; + +/** + * Benchmark the search performance + * + * @param {SimilaritySearch} index - The index to benchmark + * @param {string[]} queries - Array of search queries + * @param {number} [cutoff=0.2] - Similarity threshold to use + * @returns {Object} - Benchmark results + */ +SimilaritySearch.benchmark = function(index, queries, cutoff = 0.2) { + const results = []; + + for (const query of queries) { + const start = process.hrtime.bigint(); + const matches = index.search(query, cutoff); + const end = process.hrtime.bigint(); + + const timeMs = Number(end - start) / 1000000; + + results.push({ + query, + matches: matches.length, + timeMs, + topResults: matches.slice(0, 5) + }); + } + + return results; +}; + +module.exports = SimilaritySearch; \ No newline at end of file diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 0000000..4ffc12f --- /dev/null +++ b/package-lock.json @@ -0,0 +1,28 @@ +{ + "name": "similarity-search", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "similarity-search", + "version": "1.0.0", + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "nan": "^2.22.2", + "node-addon-api": "^6.0.0" + } + }, + "node_modules/nan": { + "version": "2.22.2", + "resolved": "https://registry.npmjs.org/nan/-/nan-2.22.2.tgz", + "integrity": "sha512-DANghxFkS1plDdRsX0X9pm0Z6SJNN6gBdtXfanwoZ8hooC5gosGFSBGRYHUVPz1asKA/kMRqDRdHrluZ61SpBQ==" + }, + "node_modules/node-addon-api": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.1.0.tgz", + "integrity": "sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA==" + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..cf194dd --- /dev/null +++ b/package.json @@ -0,0 +1,23 @@ +{ + "name": "similarity-search", + "version": "1.0.0", + "description": "A Node.js module for word order independent string similarity search", + "main": "index.js", + "scripts": { + "install": "node-gyp rebuild", + "test": "node test.js" + }, + "keywords": [ + "search", + "similarity", + "string", + "fuzzy" + ], + "author": "", + "license": "MIT", + "dependencies": { + "nan": "^2.22.2", + "node-addon-api": "^6.0.0" + }, + "gypfile": true +} diff --git a/similarity_search b/similarity_search new file mode 100755 index 0000000000000000000000000000000000000000..e4ebf0bf536e7616d3e1ad8a644893861f713964 GIT binary patch literal 17000 zcmeHOf0R_!eZR8{vLG_60YymVWdRG~4nIU9Qe;>dxTGK`VAWKI+4-?Mb#`W)c@ut! zxOA8tCgYmbsKK^LIjN`R#Gb}9Mo|jPVq{lpat2RKwr$$8$D?+(i;6&!)gaT)_x_l9 z^W5?DoVKU^WA8aL-~0V}zxR9Z{l0hKd-oB~`ZaY92h*jV-Np!WUnMDS#c01(RRG-V z7B&h02Ajby13pI3B;PFoxUw|XSHlKHCo<_htN3>KMJ1=9_K+ay)hdU^OOA%DQR1XG zmZ}Q3!_*`xr=jMT+cPAe+2E7Pe+{jA6dQN5)N?PdhJ`wb_baIN2qnEuN^g_W)9@@g zgrcF2C&h$*o0LCxxPX*UG_>n=Dm^=#CDoXQ)CLuT;x|*EY>W+dOES|?>urA53{c(f zpT@O+8frf`DZPRr4cnoq+S5?CcL($+p8wU-s08m+?bRB;yc#zR)v$<|7_dQeZSCz_ z8N~ABiwZBR>y)@vezM||{6iOfuRR)_^Y2ew{c+~9-+zD3HHTaNXnrKzx@7VENT4|q zj;3}r@3>`2^OD7`WX!dQ?6WEOkv-YZy{JhV!PCK?br7sot_4Fa^(v9sHRxMv(2sy_ z;Ab!7*GTriQG-s?ekA!7HRv=yMv_n0phs)ae+jyQpS@7p8_9mC27N;f`b{_!MrnhAe=C-ZufrqBO?*WdntH{=UPSSS$;5;KsBvv|t% zxBC*zABp)NAfg$2fSKVAArJ{hg%O|0I(!jGz!oN|pA05UZ-*}&Wo9fA+a64?WWpB> zfbZYo^M=AvUnIOM$l{4`)C{o)ld*)!NRwvt61H~zx|UVmMXtr=+D(<}BG=8#yKc({ zZy=ZmwuO^sFtKIBsz@vv+~RAE1kprWM=Yvb@Ji7U1sPUO9ag17K&p=swF+KW=25sD zkfmIv^TXtD_zKE^f2L?JJpMG6Up;$P;i+k1T#4$~4}eqA`CsQV;|r3`Resm>0X4V{ zZpqhljrY?+KtW^WoI?13g|2fS(Tf)PL?Wni)1W3!U1~rPD&!u^`@l3te6Eg-W-DK0zA+JYb<;ZlNEv&@rIp za>zngw+4duyoK&8OEY%ZLf3PI*m(>6N{f7-g|4Q8;1(=&x-N{C(Flx2U^D`w5g3iY z|3(DPJFokMr_VI-+}MAb&lu0-&AL*7r+?D$s%Wlsa}&s=*|YHPoZ)69-%gytw@al` zs9WH4LooPe1*aQXpQzw;V=(xq6`XDe1``#WZUhEf zD>&T%4BlD6>B2wQQo-qhKe(uZ(~ZzzQw66B|KL>>oG$o-brqa0^n>Rv>Hd~xA9FDl za?V)IL_A8fC)x1JZ1|<-YWd&T@DFVGdp7*vZ1^v1_|I(kNgIC5hQDaT|IUX0wGIES z4S%`{U-_ZSS!Mc_|0}AgN z`vifd!{!uBy}PK6sW-kf;GEG(a<3`~`LK|`xs|}&OFVm)_k6H|_YBu_$4P$rk~tX$ zW-9{?r2#oETA$`;=W=4R)V!@cy}XX9Jo~OWk>{3Q2BbI_16vGWv`>!x^hVs>-Ur=s z`=md(9Zafhh2CbYmN`88a6iwc3PUYC=jnI8$eb_cv%SS{!cx{#1g%h}JtSyna-QNc z&l$4;aGvMP&{ufc)9=7v!J#bd%bS-g#eMnIAw`vTzf3k>Ra{A$MWhqi{<#I7yKngQ zS1a!AJ##?x@Mt@{=Gh-_0|@ZknAvj#*;8DY=eef@v&+-Tv%lc*eCwyBQg6nDD1V%;eITj29-W(Nd@Hs*vNKft>tzb3?U8ES-geX)+`gbbNHAyn$qLfC-;e@2)) zd7M;=`zi)``Y73D=J@nc^56@5zN=cuZ7l})yvf3Ib8PYKMKn^?Ta1N)u_$I!@AP^u zlHsA5InPC_YZwj(VuX(L7B|8`+H=tXk!7BDcE8o#T?riDd!`$CjAxJTM`#hZPJr{y zD>J(Rt??U1{JNfc9@N9s;LCKE&a=Nf&z}Q%6Z{{6OwtIAv@8O?9+Yh6S)xlVu&G5A z?`FY1g#YwW5)#27l%>9$drGQh$f_K^GpOaBB0lfRkPnjl76>8dhxx$|;A3y*08nzd zH}e;&Hngobb5Pe;II{+L!J2I~zEW%3>1L$^bq72oSZA^QHv2cc$d4A=5^bYh`nf6og z3dj~Tk9hXuxJ0R(V~h@GQ!iLoI5K2$GP&5p{XIOKZwrM&yr*bnPn`ZF+Xtt~*`9$q zM`3PH_Ec9(UDopkT0acyy`DZA&!H4jXyF*$hfpYnDKrqE`t)H6)2qZMbRwaSevUBb zJTG9NmYztqq#y2Md!}oR!QYYwjoo4NFVxEnB``Z&PxgAgjdBs|J&8la2D zE%+CGz9%Q@);TOh=(=z`0FmM&La)A-oXFG1NRlFR7lM`+79D$!QX}WF3itdu@+7rO z8U5wfaLi~8@7l$)4JX|VOtaE?tWd^}M8a+DW=D7jKPIG%R+DF^@@%|;zl=I`lPP|r z6^}K2W)MyCD|q&9+>J>4>3pMVW7jUI$M_KgrVQ9bYY=Scbvt3+(&$tnACLH=rqPz* zM`GK}cA+Wd`O6A7_>oX7(UFSysL}DTb!DYdgNz(&^CbeosFCEyW8nzgY)$g)0-oK` zz>ih@jZeZwm5-rlW#SZ)uCc2mwnJhbiFHbBwZ!&IY^B7yfnmju{UK0FxEwf=@V}gQ z-uoR<=gvL-A?}Rkk6oavW7pW}La|q*#A5W2W8c9=4b@C-9%FLHeG zIcaU1kjw}$PuQkw&pGbAA%D)w32X<$VlS{>@ zVAJ?G(nkr5aX$7{tgGJ4Y9PYF8*`%m*;QM*2OL zw%eduNjgZk4V9fGwLV(@uOmS3JE$1rCor8(86_N%6)D{z$LX9#8I7sjgk;F*txl}N zcrX=A>{K1u9P}mp?cr#fF*{(u2x>`Vdl;E88SV&2e2K8R)9|OvSSWO>G268$G+_d} zgY<7Tnr0`PCa^WJR5U=!9X`B#3MP$kRLB_}$qCF5RVbp)*iYYBe;YO1xzDvyH1GbV z+cCX<8NKtm0>4ckmP&^Jx1BAO3V?CIBEgSvX99Q+rY*h97zUgL_#Vwxz=c@Y?SR+) zrc~MwxEJscAYB0a0B7NHTLhemdxVMVrH6i2@7Q%GbL?nzOrJQef%X$br*9bg(f0L# zZ1p(0K@#uN7Xu#rpj2vv9Gld*X42HJI4|Gc(8*RzTYAGyvzkbb^#2&Y{pgEOBq9pY zP5e4Rx4w`uKzkIwJJ83!1Z18`jbE=@b;V_#bba09a0(;?zvuCL12VKTD9doj{Hv59 zpMQ?u(}>LnSx_is|i@`q)zUY5h ze_UCA3-~>_R(z{U{~JQz0L=t{F~&wDGObVj{78r}lA%kI6XTY`1*q)9mUMmn-D9K> z$i$-9LX^=vcq&NrYGp|NHsE`vT{^X9^+xtklesel}b_ZY2PG@a+F?;Qkg1&_?A>v z1yE|ty+Tsec87H;ecFdmp;xq2u91MvQ4VT)yQ*lv^GZ&CyQSM#VJY|TcO}_BG$&N? z@Iv%=j{jrg zg3l|MSFoVqfP!ZgWa^``MgUHKWL-hr09Ab z0u^N){vB*OYjjIo&lB3^IWSxGx{_SRcB4Y=>UFK@`-S|~<#np*e}l?M{@3akeJAZ; z;Lg+*DsUt_uM7F9%s$WF6lzzof>F_@+PnBUkX7`0)!g5qVhfpIO+-LOq8l~n%Rx7^ zQ45@h#9&aia$AU~-c!R)qy~Kt=#8+W$4gtw)R6zSpgY-I3OI^*<^lLD(d!z^c~&s- z_kAUQK_whLQG06G|9K7ihc)PRh{IKwKR#vuTdLt}1YLVkfpbBp{@UZVMA_+6{Swb< zptcrtqoQ7}-39uUb<>z$$C}|MIwc9EM9(^?B!qogxBjPV(7#=S{(aCJvA(9dtyUPD zFXWx&b423V7es!b`#~rT`_$5?rRNuOj(8+!1_Q1IBTLYS`(dvyk?`&G2BRjP_=XaA*5wVPIy&ea zfd%tI(yS7#JpA=0#S>yU=k>0+bL9q)*K_-7FFw^@eb4PHH>_I)GM>SCJzR15>N}Zt z?fQ)^E7yBBu359$v&Fk*Wy^YxSDaLE>(c^sFhCtwsG@!5OhTE9Qw5)Qk^s-HaXz9f zP4>k}g(`|S81R`sc|M|w0d23}mkfIS9dRZi$J&C?AiZhy%BSWyqd*VG=?Dc*QdBA1 z&t#ODi`AKlGJ(9Z+ua^Z1muB>Dlv>YT$LvySTY{L0O9yW%uC`lC#ug)cmv6pw;hl7 z=}g7CjZh1Oqux|97!c<_sw}HR9#u4XBBY9hNkd0KT9ZlT1f4Wd&5Oe#a$=~sB-L?~ zDibs&KC_DC_2T@8o`RP{er_o1*nFN;Qk7uUHVi1cob^OhwEj^ZO!xytBms zRwgrJuH?=R)7J`!(=h_J>smBs23>8@lq;Ty#e)fRCv%lat*J1MbA&lBO}x%!RDdfn=eQ z+kXSlzKCx5ksd|S=LI!fpy(RteTK&IL1vYHC-Ny34QENT{I9Q*ny?ZT2Y$N!TIW@@ zJjq%M-G|~;MNI4K{hEfleZ7CP`@b3bv{%#mdOxROqhipuhuYTqdSCTb(5TB=U+)h! z6iVRhV)vh}Pqf!6>#O~h-e=*%Nl~=_T28|_xU_fExZb~M=$7i@T$9y*MP`qnLSv%! z^?p*ryb{#y+xwqY`YlRM?@u-CR2?RLy4A4j(|sI?sBTxT=jysn^V3@F^XD-zt@?)2 z(@@{PXd8-MzHNBMroTY#^EA}*we-Jd(|0R<4ejHv?P>U&U0>Y`y{7DU+(x1$YcJ&jL;GSPJVdcV-9^zHK0mbJ`K?JMyzOY2`> zrpSucw?aL4t)jOmgZowDqvs~HB(*;}|BnYv`)<|G-z>35Y)sv*mZfxI6A`QxOQpmd Jn}Q8x{|S59+QI+; literal 0 HcmV?d00001 diff --git a/similarity_search.c b/similarity_search.c new file mode 100644 index 0000000..04d035a --- /dev/null +++ b/similarity_search.c @@ -0,0 +1,198 @@ +#include +#include +#include +#include +#include +#include "similarity_search.h" + +// Case insensitive string comparison +int str_case_cmp(const char *s1, const char *s2) { + while (*s1 && *s2) { + int c1 = tolower((unsigned char)*s1); + int c2 = tolower((unsigned char)*s2); + if (c1 != c2) { + return c1 - c2; + } + s1++; + s2++; + } + return tolower((unsigned char)*s1) - tolower((unsigned char)*s2); +} + +// Split a string into words +int split_into_words(const char *string, char *words[MAX_WORDS]) { + char temp[MAX_STRING_LEN]; + strcpy(temp, string); + + int word_count = 0; + char *token = strtok(temp, " \t\n"); + + while (token != NULL && word_count < MAX_WORDS) { + words[word_count] = strdup(token); + word_count++; + token = strtok(NULL, " \t\n"); + } + + return word_count; +} + +// Free memory allocated for words +void free_words(char *words[], int word_count) { + for (int i = 0; i < word_count; i++) { + free(words[i]); + } +} + +// Calculate similarity between query and target string +float calculate_similarity(const char *query, const char *target, float cutoff) { + // Split strings into words + char *query_words[MAX_WORDS] = {0}; + char *target_words[MAX_WORDS] = {0}; + + int query_word_count = split_into_words(query, query_words); + int target_word_count = split_into_words(target, target_words); + + if (query_word_count == 0 || target_word_count == 0) { + free_words(query_words, query_word_count); + free_words(target_words, target_word_count); + return 0.0; + } + + // Count matches + int matches = 0; + for (int i = 0; i < query_word_count; i++) { + for (int j = 0; j < target_word_count; j++) { + if (str_case_cmp(query_words[i], target_words[j]) == 0) { + matches++; + break; + } + } + } + + // Calculate Jaccard similarity (intersection over union) + float similarity = (float)matches / (query_word_count + target_word_count - matches); + + free_words(query_words, query_word_count); + free_words(target_words, target_word_count); + + return similarity; +} + +// Compare function for qsort to sort results by similarity (descending) +int compare_results(const void *a, const void *b) { + const SearchResult *result_a = (const SearchResult *)a; + const SearchResult *result_b = (const SearchResult *)b; + + if (result_b->similarity > result_a->similarity) return 1; + if (result_b->similarity < result_a->similarity) return -1; + return 0; +} + +// Generate a random word +void generate_random_word(char *word, int max_len) { + int len = 3 + rand() % 8; // Random length between 3 and 10 + for (int i = 0; i < len; i++) { + word[i] = 'a' + (rand() % 26); + } + word[len] = '\0'; +} + +// Generate a random string consisting of multiple words +void generate_random_string(char *string, int max_len) { + int num_words = 2 + rand() % 5; // Random number of words between 2 and 6 + string[0] = '\0'; + + for (int i = 0; i < num_words; i++) { + char word[20]; + generate_random_word(word, 10); + + // Check if there's enough space to add this word + if (strlen(string) + strlen(word) + 1 < (size_t)max_len) { + if (i > 0) strcat(string, " "); + strcat(string, word); + } else { + break; + } + } +} + +// Create a new search index +SearchIndex* create_search_index(int capacity) { + SearchIndex* index = (SearchIndex*)malloc(sizeof(SearchIndex)); + if (!index) return NULL; + + index->strings = (char**)malloc(capacity * sizeof(char*)); + if (!index->strings) { + free(index); + return NULL; + } + + index->num_strings = 0; + return index; +} + +// Add a string to the index +int add_string_to_index(SearchIndex* index, const char* string) { + if (!index || !string) return -1; + + index->strings[index->num_strings] = strdup(string); + if (!index->strings[index->num_strings]) return -1; + + index->num_strings++; + return 0; +} + +// Free the search index and all associated memory +void free_search_index(SearchIndex* index) { + if (!index) return; + + for (int i = 0; i < index->num_strings; i++) { + free(index->strings[i]); + } + + free(index->strings); + free(index); +} + +// Search the index with the given query and similarity cutoff +SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results) { + if (!index || !query || !num_results) return NULL; + + // Allocate temporary array for results + SearchResult* temp_results = (SearchResult*)malloc(index->num_strings * sizeof(SearchResult)); + if (!temp_results) return NULL; + + *num_results = 0; + + // Search through all strings in the index + for (int i = 0; i < index->num_strings; i++) { + float similarity = calculate_similarity(query, index->strings[i], cutoff); + + if (similarity >= cutoff) { + temp_results[*num_results].string = index->strings[i]; + temp_results[*num_results].similarity = similarity; + (*num_results)++; + } + } + + // Sort results by similarity + qsort(temp_results, *num_results, sizeof(SearchResult), compare_results); + + // Allocate final result array with exact size + SearchResult* results = (SearchResult*)malloc(*num_results * sizeof(SearchResult)); + if (!results) { + free(temp_results); + return NULL; + } + + // Copy results to final array + memcpy(results, temp_results, *num_results * sizeof(SearchResult)); + free(temp_results); + + return results; +} + +// Free the search results +void free_search_results(SearchResult* results, int num_results) { + free(results); +} \ No newline at end of file diff --git a/similarity_search.h b/similarity_search.h new file mode 100644 index 0000000..9f12c33 --- /dev/null +++ b/similarity_search.h @@ -0,0 +1,45 @@ +#ifndef SIMILARITY_SEARCH_H +#define SIMILARITY_SEARCH_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_STRING_LEN 100 +#define MAX_WORDS 20 + +// Public API + +// Structure representing the search index +typedef struct { + char **strings; + int num_strings; +} SearchIndex; + +// Structure to hold a search result +typedef struct { + const char *string; + float similarity; +} SearchResult; + +// Create a new search index +SearchIndex* create_search_index(int capacity); + +// Add a string to the index +int add_string_to_index(SearchIndex* index, const char* string); + +// Free the search index and all associated memory +void free_search_index(SearchIndex* index); + +// Search the index with the given query and similarity cutoff +// Returns an array of SearchResult pointers that must be freed by the caller +SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results); + +// Free the search results +void free_search_results(SearchResult* results, int num_results); + +#ifdef __cplusplus +} +#endif + +#endif /* SIMILARITY_SEARCH_H */ \ No newline at end of file diff --git a/similarity_search_addon.cc b/similarity_search_addon.cc new file mode 100644 index 0000000..6d68880 --- /dev/null +++ b/similarity_search_addon.cc @@ -0,0 +1,123 @@ +#include +#include +#include "similarity_search.h" + +class SearchIndexWrapper : public Napi::ObjectWrap { +public: + static Napi::Object Init(Napi::Env env, Napi::Object exports); + SearchIndexWrapper(const Napi::CallbackInfo& info); + ~SearchIndexWrapper(); + +private: + static Napi::FunctionReference constructor; + + Napi::Value AddString(const Napi::CallbackInfo& info); + Napi::Value Search(const Napi::CallbackInfo& info); + Napi::Value GetSize(const Napi::CallbackInfo& info); + + SearchIndex* index_; +}; + +Napi::FunctionReference SearchIndexWrapper::constructor; + +Napi::Object SearchIndexWrapper::Init(Napi::Env env, Napi::Object exports) { + Napi::HandleScope scope(env); + + Napi::Function func = DefineClass(env, "SearchIndex", { + InstanceMethod("addString", &SearchIndexWrapper::AddString), + InstanceMethod("search", &SearchIndexWrapper::Search), + InstanceMethod("size", &SearchIndexWrapper::GetSize) + }); + + constructor = Napi::Persistent(func); + constructor.SuppressDestruct(); + + exports.Set("SearchIndex", func); + return exports; +} + +SearchIndexWrapper::SearchIndexWrapper(const Napi::CallbackInfo& info) + : Napi::ObjectWrap(info) { + Napi::Env env = info.Env(); + Napi::HandleScope scope(env); + + int capacity = 500; // Default capacity + if (info.Length() > 0 && info[0].IsNumber()) { + capacity = info[0].As().Int32Value(); + } + + this->index_ = create_search_index(capacity); + if (!this->index_) { + Napi::Error::New(env, "Failed to create search index").ThrowAsJavaScriptException(); + } +} + +SearchIndexWrapper::~SearchIndexWrapper() { + free_search_index(this->index_); +} + +Napi::Value SearchIndexWrapper::AddString(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + Napi::HandleScope scope(env); + + if (info.Length() < 1 || !info[0].IsString()) { + Napi::TypeError::New(env, "String expected").ThrowAsJavaScriptException(); + return env.Null(); + } + + std::string str = info[0].As().Utf8Value(); + int result = add_string_to_index(this->index_, str.c_str()); + + return Napi::Number::New(env, result); +} + +Napi::Value SearchIndexWrapper::Search(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + Napi::HandleScope scope(env); + + if (info.Length() < 1 || !info[0].IsString()) { + Napi::TypeError::New(env, "Query string expected").ThrowAsJavaScriptException(); + return env.Null(); + } + + std::string query = info[0].As().Utf8Value(); + float cutoff = 0.2f; // Default cutoff + + if (info.Length() > 1 && info[1].IsNumber()) { + cutoff = info[1].As().FloatValue(); + } + + int num_results = 0; + SearchResult* results = search_index(this->index_, query.c_str(), cutoff, &num_results); + + if (!results) { + Napi::Error::New(env, "Search failed").ThrowAsJavaScriptException(); + return env.Null(); + } + + Napi::Array result_array = Napi::Array::New(env, num_results); + + for (int i = 0; i < num_results; i++) { + Napi::Object obj = Napi::Object::New(env); + obj.Set("string", Napi::String::New(env, results[i].string)); + obj.Set("similarity", Napi::Number::New(env, results[i].similarity)); + result_array[i] = obj; + } + + free_search_results(results, num_results); + + return result_array; +} + +Napi::Value SearchIndexWrapper::GetSize(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + Napi::HandleScope scope(env); + + return Napi::Number::New(env, this->index_->num_strings); +} + +Napi::Object Init(Napi::Env env, Napi::Object exports) { + return SearchIndexWrapper::Init(env, exports); +} + +NODE_API_MODULE(similarity_search_addon, Init) \ No newline at end of file diff --git a/test.js b/test.js new file mode 100644 index 0000000..24f33fa --- /dev/null +++ b/test.js @@ -0,0 +1,53 @@ +const SimilaritySearch = require('./index'); + +// Create a test index with 500 strings +console.log('Creating test index with 500 strings...'); +const index = SimilaritySearch.createTestIndex(500); +console.log(`Index created with ${index.size()} strings`); + +// Test queries to run +const queries = [ + 'bio bizz', + 'substrate light', + 'plant growth', + 'garden mix', + 'random query' +]; + +console.log('\nRunning benchmark...'); +const benchmarkResults = SimilaritySearch.benchmark(index, queries); + +// Display results +console.log(`\nSearch results with cutoff: 0.2\n`); +benchmarkResults.forEach(result => { + console.log(`Query: "${result.query}"`); + console.log(`Found ${result.matches} matches in ${result.timeMs.toFixed(2)} ms`); + + // Display top results + result.topResults.forEach(match => { + console.log(` ${match.similarity.toFixed(2)}: ${match.string}`); + }); + console.log(''); +}); + +// Demonstrate creating a custom index +console.log('Creating a custom index...'); +const customIndex = new SimilaritySearch(); +customIndex.addString('bio bizz'); +customIndex.addString('lightmix bizz btio substrate'); +customIndex.addString('bizz bio mix light'); + +// Add multiple strings at once +customIndex.addStrings([ + 'plant growth bio formula', + 'garden soil substrate' +]); + +console.log(`Custom index created with ${customIndex.size()} strings`); + +// Search with a higher similarity threshold +console.log('\nSearching with higher similarity threshold (0.3):'); +const results = customIndex.search('bio bizz', 0.3); +results.forEach(match => { + console.log(` ${match.similarity.toFixed(2)}: ${match.string}`); +}); \ No newline at end of file