Files
node-similarity-search-native/similarity_search.c

363 lines
12 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <ctype.h>
#include <math.h>
#include <stdbool.h>
#include "similarity_search.h"
/*
 * Case-insensitive comparison of two NUL-terminated strings.
 *
 * Characters are folded with tolower() before being compared.
 *
 * Returns 0 when the strings are equal ignoring case; otherwise the
 * difference between the first pair of folded characters that differ
 * (negative when s1 sorts first, positive when s2 sorts first).
 */
int str_case_cmp(const char *s1, const char *s2)
{
    const unsigned char *lhs = (const unsigned char *)s1;
    const unsigned char *rhs = (const unsigned char *)s2;

    for (;; ++lhs, ++rhs) {
        const int folded_lhs = tolower(*lhs);
        const int folded_rhs = tolower(*rhs);

        /* Stop at the first mismatch or as soon as either string ends. */
        if (folded_lhs != folded_rhs || *lhs == '\0' || *rhs == '\0') {
            return folded_lhs - folded_rhs;
        }
    }
}
/*
 * Split `s` into words separated by spaces, tabs or newlines.
 *
 * The input is copied into a single heap buffer; the delimiters in that
 * copy are overwritten with NULs and `words[]` is filled with pointers
 * into it. At most MAX_WORDS words are stored; any further words are
 * ignored.
 *
 * Parameters:
 *   s       - string to split (not modified).
 *   words   - receives up to MAX_WORDS pointers into the copy.
 *   storage - out: receives the heap buffer backing `words[]`. Ownership
 *             passes to the caller, who releases it with free_words().
 *             It is always written: NULL whenever nothing was allocated,
 *             so the caller may free it unconditionally.
 *
 * Returns the number of words stored. Returns 0 when `s` is NULL, is
 * MAX_STRING_LEN characters or longer, contains no words, or when the
 * allocation fails.
 */
int split_into_words(const char *s,
                     char *words[MAX_WORDS],
                     char **storage)
{
    static const char delims[] = " \t\n";

    if (storage) {
        *storage = NULL; /* defined value on every early-return path */
    }
    if (!s || !words || !storage) {
        return 0;
    }

    const size_t len = strlen(s);
    if (len >= MAX_STRING_LEN) {
        return 0;
    }

    char *buf = malloc(len + 1);
    if (!buf) {
        return 0;
    }
    memcpy(buf, s, len + 1);
    *storage = buf; /* hand ownership to the caller */

    /*
     * Tokenize by hand with strspn/strcspn rather than strtok, which keeps
     * hidden static state and is therefore not reentrant.
     */
    int n = 0;
    char *cursor = buf;
    while (n < MAX_WORDS) {
        cursor += strspn(cursor, delims);  /* skip leading delimiters */
        if (*cursor == '\0') {
            break;
        }
        words[n++] = cursor;
        cursor += strcspn(cursor, delims); /* advance to the end of the word */
        if (*cursor == '\0') {
            break;
        }
        *cursor++ = '\0';                  /* terminate the word in place */
    }
    return n;
}
/*
 * Release the buffer that split_into_words() handed out through its
 * `storage` parameter. The word pointers all point into that one buffer,
 * so a single free() releases everything. Passing NULL is harmless.
 */
void free_words(char *storage)
{
    free(storage);
}
/*
 * Case-insensitive Levenshtein (edit) distance between two NUL-terminated
 * strings: the minimum number of single-character insertions, deletions
 * and substitutions needed to turn one into the other. Characters are
 * compared after tolower().
 *
 * Both arguments must be non-NULL.
 *
 * A single DP row is used. Short inputs use a fixed-size stack buffer;
 * longer ones use the heap, so stack usage no longer grows with the input
 * length. If the heap allocation fails, the length of the longer string
 * is returned, which is the largest distance possible for the pair.
 */
int levenshtein_distance(const char *a, const char *b)
{
    enum { STACK_CELLS = 128 };

    size_t m = strlen(a), n = strlen(b);

    /* Keep the shorter string on the column axis so the row stays small. */
    if (m < n) {
        const char *swap_str = a; a = b; b = swap_str;
        size_t swap_len = m; m = n; n = swap_len;
    }

    int stack_row[STACK_CELLS];
    int *heap_row = NULL;
    int *row = stack_row;
    if (n + 1 > (size_t)STACK_CELLS) {
        heap_row = malloc((n + 1) * sizeof *heap_row);
        if (!heap_row) {
            return (int)m;
        }
        row = heap_row;
    }

    /* Row 0: turning the empty prefix of `a` into b[0..j) costs j inserts. */
    for (size_t j = 0; j <= n; ++j) {
        row[j] = (int)j;
    }

    for (size_t i = 1; i <= m; ++i) {
        /* ctype functions require a value representable as unsigned char. */
        const int ca = tolower((unsigned char)a[i - 1]);
        int diag = row[0];          /* D[i-1][j-1] */
        row[0] = (int)i;            /* D[i][0]     */

        for (size_t j = 1; j <= n; ++j) {
            const int above = row[j];   /* D[i-1][j], about to be overwritten */
            const int cost = (ca == tolower((unsigned char)b[j - 1])) ? 0 : 1;

            int best = diag + cost;     /* substitution / match */
            if (above + 1 < best) {
                best = above + 1;       /* deletion */
            }
            if (row[j - 1] + 1 < best) {
                best = row[j - 1] + 1;  /* insertion */
            }

            row[j] = best;
            diag = above;
        }
    }

    const int distance = row[n];
    free(heap_row);
    return distance;
}
/*
 * Score how alike two single words are, ignoring case.
 *
 * Rules, applied in order:
 *   1. If either word has 3 characters or fewer, only an exact
 *      (case-insensitive) match counts: 1.0 on match, otherwise 0.0.
 *   2. If the shorter word is less than 70% of the longer word's length,
 *      the shorter must be a case-insensitive prefix of the longer:
 *      0.8 when it is, otherwise 0.0.
 *   3. Otherwise the score is (1 - distance / longer_length) cubed, where
 *      distance comes from levenshtein_distance(); it is then multiplied
 *      by shorter_length / longer_length when the lengths differ. Scores
 *      below 0.4 are reported as 0.0.
 */
float word_similarity(const char *word1, const char *word2)
{
    const int len1 = strlen(word1);
    const int len2 = strlen(word2);

    const bool first_is_longer = len1 > len2;
    const int long_len = first_is_longer ? len1 : len2;
    const int short_len = first_is_longer ? len2 : len1;

    /* Rule 1: very short words must match exactly. */
    if (short_len <= 3) {
        if (str_case_cmp(word1, word2) == 0) {
            return 1.0f;
        }
        return 0.0f;
    }

    /* Rule 2: large length gap, so only a prefix relationship counts. */
    if (short_len < long_len * 0.7) {
        const char *longer = first_is_longer ? word1 : word2;
        const char *shorter = first_is_longer ? word2 : word1;
        return strncasecmp(longer, shorter, short_len) == 0 ? 0.8f : 0.0f;
    }

    /* Rule 3: comparable lengths, so score by edit distance. */
    const int distance = levenshtein_distance(word1, word2);
    float score = 1.0f - (float)distance / long_len;

    /* Cubing penalizes differences more steeply than a linear ratio. */
    score = pow(score, 3.0f);

    if (len1 != len2) {
        const float length_ratio = (float)short_len / long_len;
        score *= length_ratio;
    }

    /* Weak matches are treated as no match at all. */
    return score < 0.4f ? 0.0f : score;
}
/*
 * Score how well `target` matches `query`, word by word and independent
 * of word order.
 *
 * Both strings are split with split_into_words(). For every query word
 * the best word_similarity() against any target word is taken. The final
 * score combines:
 *   - 60%: fraction of query words whose best score is at least 0.4
 *   - 40%: average of the best scores
 * When every query word was matched, the result is lifted to
 * 0.7 + 0.3 * score.
 *
 * Parameters:
 *   query  - search text.
 *   target - candidate text.
 *   cutoff - accepted for interface compatibility; it does not influence
 *            the returned score.
 *
 * Returns the score, or 0.0 when either string yields no words.
 */
float calculate_similarity(const char *query, const char *target, float cutoff)
{
    (void)cutoff;

    /*
     * Start as NULL so the unconditional free_words() calls below are safe
     * even if a split returns early without handing out a buffer.
     */
    char *query_buf = NULL;
    char *target_buf = NULL;
    char *query_words[MAX_WORDS];
    char *target_words[MAX_WORDS];
    float similarity = 0.0f;

    int query_word_count = split_into_words(query, query_words, &query_buf);
    int target_word_count = split_into_words(target, target_words, &target_buf);

    if (query_word_count > 0 && target_word_count > 0) {
        int query_words_found = 0;
        float similarity_sum = 0.0f;

        for (int i = 0; i < query_word_count; i++) {
            /* Loop-invariant for the inner loop, so measure it once. */
            const int query_len = (int)strlen(query_words[i]);
            float best_similarity = 0.0f;

            for (int j = 0; j < target_word_count; j++) {
                /* Cheap filter: skip pairs where one word is under half the other's length. */
                const int target_len = (int)strlen(target_words[j]);
                if (query_len < target_len * 0.5f || target_len < query_len * 0.5f) {
                    continue;
                }

                float candidate = word_similarity(query_words[i], target_words[j]);
                if (candidate > best_similarity) {
                    best_similarity = candidate;
                    if (best_similarity >= 0.90f) {
                        break; /* good enough; stop scanning target words */
                    }
                }
            }

            similarity_sum += best_similarity;
            if (best_similarity >= 0.4f) {
                query_words_found++;
            }
        }

        float word_match_score = (float)query_words_found / query_word_count;
        float avg_word_similarity = similarity_sum / query_word_count;

        /* 60% weight on words found, 40% on how closely they matched. */
        similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f);

        /* Boost when every query word was found. */
        if (query_words_found == query_word_count) {
            similarity = 0.7f + (similarity * 0.3f);
        }
    }

    free_words(query_buf);
    free_words(target_buf);
    return similarity;
}
/*
 * qsort() comparator ordering SearchResult entries by similarity,
 * highest first.
 *
 * Returns 1 when `b` has the higher similarity (so it sorts ahead of `a`),
 * -1 when `a` has the higher similarity, and 0 otherwise.
 */
int compare_results(const void *a, const void *b)
{
    const SearchResult *first = (const SearchResult *)a;
    const SearchResult *second = (const SearchResult *)b;

    return (second->similarity > first->similarity)
         - (second->similarity < first->similarity);
}
/*
 * Fill `word` with a random lowercase word drawn from rand().
 *
 * The word has between 3 and 10 letters, clamped to `max_len`, and is
 * always NUL-terminated.
 *
 * Parameters:
 *   word    - destination buffer; must hold at least max_len + 1 bytes.
 *   max_len - maximum number of letters to write, not counting the
 *             terminating NUL (the caller in this file passes
 *             sizeof(buffer) - 1).
 *
 * Nothing is written when `word` is NULL or `max_len` is negative.
 */
void generate_random_word(char *word, int max_len)
{
    if (!word || max_len < 0) {
        return;
    }

    int len = 3 + rand() % 8; /* 3..10 letters */
    if (len > max_len) {
        len = max_len;        /* never write past the caller's buffer */
    }

    for (int i = 0; i < len; i++) {
        word[i] = (char)('a' + (rand() % 26));
    }
    word[len] = '\0';
}
/*
 * Build a random phrase of space-separated words in `string`.
 *
 * Between 2 and 6 words are requested from generate_random_word(). Words
 * are appended, with a single space between them, for as long as the
 * next one still fits; generation stops at the first word that does not.
 * The result is always NUL-terminated.
 *
 * Parameters:
 *   string  - destination buffer of at least `max_len` bytes.
 *   max_len - size of `string` in bytes.
 *
 * Does nothing when `string` is NULL or `max_len` is not positive.
 */
void generate_random_string(char *string, int max_len)
{
    if (string == NULL || max_len <= 0) {
        return;
    }

    const int word_total = 2 + rand() % 5; /* 2..6 words */
    const size_t limit = (size_t)max_len - 1;
    size_t used = 0;

    string[0] = '\0';

    for (int w = 0; w < word_total; ++w) {
        char piece[20];
        generate_random_word(piece, sizeof(piece) - 1);

        const size_t piece_len = strlen(piece);
        const size_t sep_len = (w == 0) ? 0 : 1; /* separator before all but the first word */

        /* Stop as soon as the next word (plus its separator) will not fit. */
        if (used + (piece_len + sep_len) >= limit) {
            break;
        }

        if (sep_len != 0) {
            string[used++] = ' ';
        }
        /* Copy the word together with its terminating NUL. */
        memcpy(string + used, piece, piece_len + 1);
        used += piece_len;
    }
}
/*
 * Allocate an empty search index able to hold up to `capacity` strings.
 *
 * Returns the new index, or NULL when `capacity` is not positive or an
 * allocation fails. Release it with free_search_index().
 */
SearchIndex* create_search_index(int capacity) {
    if (capacity <= 0) {
        return NULL;
    }

    SearchIndex *index = malloc(sizeof *index);
    if (!index) {
        return NULL;
    }

    /*
     * calloc rejects a capacity * element-size product that would overflow
     * size_t, and leaves every slot as NULL until a string is added.
     */
    index->strings = calloc((size_t)capacity, sizeof *index->strings);
    if (!index->strings) {
        free(index);
        return NULL;
    }

    index->num_strings = 0;
    index->capacity = capacity;
    return index;
}
/*
 * Append a private copy of `string` to `index`.
 *
 * Returns 0 on success. Returns -1 when either argument is NULL, the
 * index is already at capacity, the string is MAX_STRING_LEN characters
 * or longer, or the copy cannot be allocated.
 */
int add_string_to_index(SearchIndex* index, const char* string) {
    if (index == NULL || string == NULL) {
        return -1;
    }

    /* Reject when there is no free slot or the string is too long. */
    if (index->num_strings >= index->capacity ||
        strlen(string) >= MAX_STRING_LEN) {
        return -1;
    }

    /* The index owns its own copy; the caller keeps ownership of `string`. */
    char **slot = &index->strings[index->num_strings];
    *slot = strdup(string);
    if (*slot == NULL) {
        return -1;
    }

    index->num_strings++;
    return 0;
}
/*
 * Destroy an index: frees every stored string, then the pointer array,
 * then the index structure itself. A NULL index is ignored.
 */
void free_search_index(SearchIndex* index) {
    if (index == NULL) {
        return;
    }

    int i = 0;
    while (i < index->num_strings) {
        free(index->strings[i]);
        ++i;
    }

    free(index->strings);
    free(index);
}
/*
 * Search `index` for strings similar to `query`.
 *
 * Every indexed string is scored with calculate_similarity(); those whose
 * score is at least `cutoff` are returned, sorted by descending
 * similarity. Each result carries its own copy of the matched string.
 *
 * Parameters:
 *   index       - index to search.
 *   query       - search text; must be shorter than MAX_STRING_LEN.
 *   cutoff      - minimum score to include, in [0.0, 1.0].
 *   num_results - out: number of entries in the returned array. It is set
 *                 to 0 on every failure path.
 *
 * Returns a heap array that the caller releases with
 * free_search_results(). Returns NULL when an argument is NULL or out of
 * range, or when an allocation fails. A search that simply finds no
 * matches returns a non-NULL array with *num_results == 0, so "no
 * matches" can be told apart from "error" on every platform.
 */
SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results) {
    if (!index || !query || !num_results) {
        return NULL;
    }

    /* From here on, every early return reports zero results. */
    *num_results = 0;

    if (strlen(query) >= MAX_STRING_LEN) {
        return NULL;
    }
    if (cutoff < 0.0f || cutoff > 1.0f) {
        return NULL;
    }

    /*
     * Worst case every indexed string matches. Always request at least one
     * element: malloc(0)/calloc(0) may legitimately return NULL, which
     * would be indistinguishable from an allocation failure.
     */
    size_t slots = index->num_strings > 0 ? (size_t)index->num_strings : 1;
    SearchResult *results = calloc(slots, sizeof *results);
    if (!results) {
        return NULL;
    }

    int count = 0;
    for (int i = 0; i < index->num_strings; i++) {
        float similarity = calculate_similarity(query, index->strings[i], cutoff);
        if (!(similarity >= cutoff)) {
            continue;
        }

        /* Results own their strings so they stay valid independently of the index. */
        char *copy = strdup(index->strings[i]);
        if (!copy) {
            for (int j = 0; j < count; j++) {
                free(results[j].string);
            }
            free(results);
            return NULL;
        }

        results[count].string = copy;
        results[count].similarity = similarity;
        count++;
    }

    qsort(results, (size_t)count, sizeof *results, compare_results);

    /*
     * Trim the array to what is actually used. A failed shrink leaves the
     * original block intact, so it is not an error; keep the larger block.
     */
    size_t keep = count > 0 ? (size_t)count : 1;
    if (keep < slots) {
        SearchResult *shrunk = realloc(results, keep * sizeof *results);
        if (shrunk) {
            results = shrunk;
        }
    }

    *num_results = count;
    return results;
}
/*
 * Release a result array produced by search_index(): the string held by
 * each of the first `num_results` entries is freed, then the array.
 * A NULL array is ignored.
 */
void free_search_results(SearchResult* results, int num_results) {
    if (results == NULL) {
        return;
    }

    int i = 0;
    while (i < num_results) {
        free(results[i].string);
        ++i;
    }

    free(results);
}