/*
 * node-similarity-search-native/similarity_search.c
 * (listed as: C source, 389 lines, 12 KiB)
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <ctype.h>
#include <math.h>
#include <stdbool.h>
#include "similarity_search.h"
// Compare two NUL-terminated strings while ignoring letter case.
//
// Returns 0 when both strings are equal after lower-casing. Otherwise returns
// the difference between the first pair of lower-cased characters that differ
// (negative when s1 orders first, positive when s2 does). When one string is
// a prefix of the other, the terminating NUL takes part in that difference.
int str_case_cmp(const char *s1, const char *s2) {
    // Walk both strings as unsigned bytes so tolower() never sees a
    // negative value.
    const unsigned char *left = (const unsigned char *)s1;
    const unsigned char *right = (const unsigned char *)s2;

    for (; *left != '\0' && *right != '\0'; left++, right++) {
        int delta = tolower(*left) - tolower(*right);
        if (delta != 0) {
            return delta;
        }
    }

    // At least one string has ended; compare whatever is at the cursor.
    return tolower(*left) - tolower(*right);
}
// Split a string into whitespace-separated words.
//
// Words are separated by runs of spaces, tabs or newlines. Each word is
// copied with strdup() into words[0..count-1]; the caller owns the copies and
// releases them with free_words(). At most MAX_WORDS words are stored; any
// further words are ignored.
//
// Returns the number of words stored. Returns 0 when string is NULL, when it
// is MAX_STRING_LEN characters or longer, when it contains no words, or when
// an allocation fails. On allocation failure, every word stored so far is
// freed and its slot is reset to NULL.
//
// The scan uses strspn()/strcspn() on a private copy instead of strtok(), so
// the function keeps no hidden static state and is safe to call from several
// threads or in a nested fashion.
int split_into_words(const char *string, char *words[MAX_WORDS]) {
    static const char delimiters[] = " \t\n";

    if (!string) {
        return 0;
    }
    size_t length = strlen(string);
    if (length >= MAX_STRING_LEN) {
        return 0;
    }

    // Work on a scratch copy: tokens are NUL-terminated in place below.
    char temp[MAX_STRING_LEN];
    memcpy(temp, string, length + 1);

    int word_count = 0;
    char *cursor = temp;
    while (word_count < MAX_WORDS) {
        // Skip the delimiter run that precedes the next word.
        cursor += strspn(cursor, delimiters);
        if (*cursor == '\0') {
            break;
        }

        char *token_end = cursor + strcspn(cursor, delimiters);
        char terminator = *token_end;
        *token_end = '\0';

        words[word_count] = strdup(cursor);
        if (!words[word_count]) {
            // Roll back: release earlier copies and leave no dangling
            // pointers behind in the caller's array.
            while (word_count > 0) {
                word_count--;
                free(words[word_count]);
                words[word_count] = NULL;
            }
            return 0;
        }
        word_count++;

        if (terminator == '\0') {
            break;  // that token ran to the end of the input
        }
        cursor = token_end + 1;
    }
    return word_count;
}
// Release the first word_count strings held in words.
//
// Only the strings are freed; the array itself is not touched and its
// entries are left as they were. Nothing happens when word_count is zero or
// negative.
void free_words(char *words[], int word_count) {
    char **slot = words;
    for (int remaining = word_count; remaining > 0; remaining--) {
        free(*slot);
        slot++;
    }
}
// Calculate the case-insensitive Levenshtein distance between two strings.
//
// The distance is the minimum number of single-character insertions,
// deletions and substitutions needed to turn one string into the other,
// after lower-casing both with tolower().
//
// Only one row of the dynamic-programming table is kept, and it lives on the
// heap. Memory use is proportional to the shorter string, so long inputs
// cannot overflow a fixed-size buffer or exhaust the stack.
//
// Returns the edit distance. If the scratch row cannot be allocated, returns
// the length of the longer string, which is always a valid upper bound on
// the distance (callers then treat the strings as entirely different).
int levenshtein_distance(const char *s1, const char *s2) {
    size_t len1 = strlen(s1);
    size_t len2 = strlen(s2);

    // The distance is symmetric: put the shorter string along the row so the
    // scratch buffer is as small as possible.
    if (len2 > len1) {
        const char *swap_str = s1;
        s1 = s2;
        s2 = swap_str;
        size_t swap_len = len1;
        len1 = len2;
        len2 = swap_len;
    }
    if (len2 == 0) {
        return (int)len1;  // only insertions remain
    }

    int *row = malloc((len2 + 1) * sizeof *row);
    if (!row) {
        return (int)len1;
    }

    // Row 0: turning an empty prefix into the first j characters of s2.
    for (size_t j = 0; j <= len2; j++) {
        row[j] = (int)j;
    }

    for (size_t i = 1; i <= len1; i++) {
        int c1 = tolower((unsigned char)s1[i - 1]);
        int diagonal = row[0];  // value of cell (i-1, j-1)
        row[0] = (int)i;

        for (size_t j = 1; j <= len2; j++) {
            int above = row[j];  // value of cell (i-1, j) before overwrite
            int cell;
            if (c1 == tolower((unsigned char)s2[j - 1])) {
                cell = diagonal;
            } else {
                int best = diagonal;                    // substitution
                if (above < best) best = above;         // deletion
                if (row[j - 1] < best) best = row[j - 1];  // insertion
                cell = best + 1;
            }
            diagonal = above;
            row[j] = cell;
        }
    }

    int distance = row[len2];
    free(row);
    return distance;
}
// Score how alike two words are, ignoring case.
//
// The result is 0.0 or a value in [0.4, 1.0]:
//   - If either word has 3 characters or fewer, only an exact
//     (case-insensitive) match counts: 1.0 on a match, 0.0 otherwise.
//   - If the shorter word is less than 70% of the longer one's length, it
//     must be a case-insensitive prefix of the longer word: 0.8 if it is,
//     0.0 if not.
//   - Otherwise the score is 1 - distance / longer_length, using
//     levenshtein_distance(), multiplied by shorter/longer when the lengths
//     differ. Scores below 0.4 become 0.0, and any pair with a non-zero edit
//     distance is capped at 0.9.
float word_similarity(const char *word1, const char *word2) {
    int len1 = strlen(word1);
    int len2 = strlen(word2);

    // Very short words: exact match or nothing.
    if (len1 <= 3 || len2 <= 3) {
        if (str_case_cmp(word1, word2) == 0) {
            return 1.0f;
        }
        return 0.0f;
    }

    int first_is_longer = len1 > len2;
    int longest = first_is_longer ? len1 : len2;
    int shortest = first_is_longer ? len2 : len1;

    // Very different lengths: accept only a prefix relationship.
    if (shortest < longest * 0.7) {
        const char *long_word = first_is_longer ? word1 : word2;
        const char *short_word = first_is_longer ? word2 : word1;
        if (strncasecmp(long_word, short_word, shortest) == 0) {
            return 0.8f;  // Good prefix match
        }
        return 0.0f;  // Not a prefix match
    }

    // Comparable lengths: score from the edit distance.
    int distance = levenshtein_distance(word1, word2);
    float score = 1.0f - (float)distance / longest;

    // Penalise a length mismatch by the shorter/longer ratio.
    if (len1 != len2) {
        float length_ratio = (float)shortest / longest;
        score *= length_ratio;
    }

    // Too weak to count as a match at all.
    if (score < 0.4f) {
        return 0.0f;
    }

    // Non-identical words never reach a perfect score.
    if (distance > 0 && score > 0.9f) {
        score = 0.9f;
    }
    return score;
}
// Score how well the words of query are covered by the words of target.
//
// Both strings are split with split_into_words(). Each query word is paired
// with its highest-scoring target word according to word_similarity(). The
// final score is
//     0.7 * (fraction of query words whose best score is >= 0.4)
//   + 0.3 * (mean best score over all query words),
// capped at 0.9 unless every query word has a best score of 1.0.
//
// Returns 0.0 when either string yields no words. The cutoff argument is
// accepted but not consulted by this function.
float calculate_similarity(const char *query, const char *target, float cutoff) {
    char *query_words[MAX_WORDS] = {0};
    char *target_words[MAX_WORDS] = {0};
    int query_word_count = split_into_words(query, query_words);
    int target_word_count = split_into_words(target, target_words);
    float result = 0.0f;

    if (query_word_count != 0 && target_word_count != 0) {
        int matched_words = 0;     // query words with a reasonable match
        float score_sum = 0.0f;    // running sum of per-word best scores
        bool every_word_exact = true;

        for (int q = 0; q < query_word_count; q++) {
            // Best score this query word achieves against any target word.
            float best = 0.0f;
            for (int t = 0; t < target_word_count; t++) {
                float candidate = word_similarity(query_words[q], target_words[t]);
                if (candidate > best) {
                    best = candidate;
                }
            }

            score_sum += best;
            if (best >= 0.4f) {
                matched_words++;
            }
            if (best < 1.0f) {
                every_word_exact = false;
            }
        }

        float word_match_score = (float)matched_words / query_word_count;
        float avg_word_similarity = score_sum / query_word_count;

        // 70% weight on word matches, 30% on character similarity.
        result = (word_match_score * 0.7f) + (avg_word_similarity * 0.3f);

        // A perfect score is reserved for all-exact matches.
        if (!every_word_exact && result > 0.9f) {
            result = 0.9f;
        }
    }

    free_words(query_words, query_word_count);
    free_words(target_words, target_word_count);
    return result;
}
// qsort() comparator that orders SearchResult entries by similarity, highest
// first.
//
// Returns 1 when b is more similar than a, -1 when it is less similar, and 0
// when neither comparison holds.
int compare_results(const void *a, const void *b) {
    const SearchResult *first = a;
    const SearchResult *second = b;

    // Each comparison yields 0 or 1, so the difference is -1, 0 or 1.
    return (second->similarity > first->similarity)
         - (second->similarity < first->similarity);
}
// Generate a random lowercase word using rand().
//
// word    - destination buffer; must have room for max_len + 1 bytes.
// max_len - maximum number of characters to write, not counting the
//           terminating NUL (the caller in this file passes
//           sizeof(buffer) - 1).
//
// The word length is drawn between 3 and 10 and then clamped to max_len, so
// the function never writes past word[max_len]. For max_len >= 10 the output
// and the sequence of rand() calls are the same as without the clamp.
// Nothing is written when word is NULL or max_len is negative.
void generate_random_word(char *word, int max_len) {
    if (!word || max_len < 0) {
        return;
    }

    int len = 3 + rand() % 8;  // Random length between 3 and 10
    if (len > max_len) {
        len = max_len;  // respect the caller's buffer
    }

    for (int i = 0; i < len; i++) {
        word[i] = (char)('a' + (rand() % 26));
    }
    word[len] = '\0';
}
// Fill string with 2 to 6 random words separated by single spaces.
//
// string  - destination buffer of max_len bytes.
// max_len - size of the destination buffer in bytes.
//
// Words come from generate_random_word(). A word (plus its leading space,
// for every word after the first) is appended only while the running length
// stays below max_len - 1; the first word that does not fit ends the string.
// Does nothing when string is NULL or max_len is not positive.
void generate_random_string(char *string, int max_len) {
    if (string == NULL || max_len <= 0) {
        return;
    }

    const int num_words = 2 + rand() % 5;  // Random number of words between 2 and 6
    const size_t limit = (size_t)max_len - 1;
    size_t used = 0;  // characters written so far, excluding the NUL
    string[0] = '\0';

    for (int w = 0; w < num_words; w++) {
        char word[20];
        generate_random_word(word, sizeof(word) - 1);

        const size_t word_len = strlen(word);
        const size_t separator = (w > 0) ? 1 : 0;  // space before every word but the first

        if (used + word_len + separator >= limit) {
            break;  // this word would not fit
        }

        if (separator) {
            string[used] = ' ';
            used++;
        }
        // Copy the word together with its terminating NUL.
        memcpy(string + used, word, word_len + 1);
        used += word_len;
    }
}
// Create a new, empty search index able to hold up to capacity strings.
//
// Returns a heap-allocated index with num_strings set to 0, or NULL when
// capacity is not positive or an allocation fails. Release it with
// free_search_index().
//
// The slot array is obtained with calloc(), which rejects an element
// count/size product that would overflow and leaves every slot zeroed.
SearchIndex* create_search_index(int capacity) {
    if (capacity <= 0) return NULL;

    SearchIndex *index = malloc(sizeof *index);
    if (!index) return NULL;

    index->strings = calloc((size_t)capacity, sizeof *index->strings);
    if (!index->strings) {
        free(index);
        return NULL;
    }

    index->num_strings = 0;
    index->capacity = capacity;
    return index;
}
// Append a copy of string to the index.
//
// Returns 0 on success. Returns -1 when index or string is NULL, when the
// index already holds capacity strings, when string is MAX_STRING_LEN
// characters or longer, or when the copy cannot be allocated.
int add_string_to_index(SearchIndex* index, const char* string) {
    if (index == NULL || string == NULL) {
        return -1;
    }

    // Reject when the index is full or the string is too long.
    if (index->num_strings >= index->capacity || strlen(string) >= MAX_STRING_LEN) {
        return -1;
    }

    // The index keeps its own copy, stored in the next free slot.
    char **slot = &index->strings[index->num_strings];
    *slot = strdup(string);
    if (*slot == NULL) {
        return -1;
    }

    index->num_strings++;
    return 0;
}
// Release a search index: every stored string, the slot array, and the index
// structure itself. Passing NULL is a no-op.
void free_search_index(SearchIndex* index) {
    if (index == NULL) {
        return;
    }

    char **strings = index->strings;
    const int count = index->num_strings;

    for (int i = 0; i < count; i++) {
        free(strings[i]);
    }
    free(strings);
    free(index);
}
// Search the index for strings whose similarity to query is at least cutoff.
//
// index       - index to search.
// query       - query string; must be shorter than MAX_STRING_LEN.
// cutoff      - minimum similarity, in [0.0, 1.0].
// num_results - receives the number of entries in the returned array.
//
// Returns a heap-allocated array of results sorted by descending similarity.
// Each entry owns a copy of the matching string; release the array with
// free_search_results(). When nothing matches, including an empty index, a
// valid non-NULL array is returned with *num_results == 0, so "no matches"
// can be told apart from failure.
//
// Returns NULL on invalid arguments (NULL pointers, over-long query, cutoff
// outside [0, 1] or NaN) or on allocation failure. Whenever num_results is
// non-NULL it is set to 0 on every NULL return, so callers never read a
// stale or uninitialised count.
SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results) {
    if (!index || !query || !num_results) return NULL;

    // Publish a defined count before any early return.
    *num_results = 0;

    // Validate input string length
    if (strlen(query) >= MAX_STRING_LEN) {
        return NULL;
    }
    // Validate cutoff; the negated form also rejects NaN.
    if (!(cutoff >= 0.0f && cutoff <= 1.0f)) {
        return NULL;
    }

    // Reserve one slot per indexed string, and at least one so that an
    // empty index does not depend on what a zero-byte allocation returns.
    size_t slots = index->num_strings > 0 ? (size_t)index->num_strings : 1;
    SearchResult *results = calloc(slots, sizeof *results);
    if (!results) return NULL;

    int count = 0;
    for (int i = 0; i < index->num_strings; i++) {
        float similarity = calculate_similarity(query, index->strings[i], cutoff);
        if (similarity >= cutoff) {
            // Store a copy of the string in the result
            char *copy = strdup(index->strings[i]);
            if (!copy) {
                // Undo the partial result set; *num_results is still 0.
                for (int j = 0; j < count; j++) {
                    free(results[j].string);
                }
                free(results);
                return NULL;
            }
            results[count].string = copy;
            results[count].similarity = similarity;
            count++;
        }
    }

    // Sort results by similarity
    qsort(results, (size_t)count, sizeof *results, compare_results);

    // Give back unused tail slots. If shrinking fails the original block is
    // still valid, so the larger buffer is returned instead.
    size_t keep = count > 0 ? (size_t)count : 1;
    if (keep < slots) {
        SearchResult *shrunk = realloc(results, keep * sizeof *results);
        if (shrunk) {
            results = shrunk;
        }
    }

    *num_results = count;
    return results;
}
// Release a result array returned by search_index(): the string owned by
// each of the first num_results entries, then the array itself. Passing a
// NULL array is a no-op.
void free_search_results(SearchResult* results, int num_results) {
    if (results != NULL) {
        int i = 0;
        while (i < num_results) {
            free(results[i].string);
            i++;
        }
        free(results);
    }
}