342 lines
10 KiB
C
342 lines
10 KiB
C
#include <stdio.h>
|
||
#include <stdlib.h>
|
||
#include <string.h>
|
||
#include <time.h>
|
||
#include <ctype.h>
|
||
#include <math.h>
|
||
#include <stdbool.h>
|
||
#ifdef _WIN32
|
||
#include <malloc.h> // For alloca on Windows
|
||
#else
|
||
#include <alloca.h> // For alloca on Unix-like systems
|
||
#endif
|
||
#include "similarity_search.h"
|
||
|
||
// Case insensitive string comparison
|
||
int str_case_cmp(const char *s1, const char *s2) {
|
||
while (*s1 && *s2) {
|
||
int c1 = tolower((unsigned char)*s1);
|
||
int c2 = tolower((unsigned char)*s2);
|
||
if (c1 != c2) {
|
||
return c1 - c2;
|
||
}
|
||
s1++;
|
||
s2++;
|
||
}
|
||
return tolower((unsigned char)*s1) - tolower((unsigned char)*s2);
|
||
}
|
||
|
||
// Split a string into words
|
||
int split_into_words(const char *s,
|
||
char *words[MAX_WORDS],
|
||
char **storage) /* NEW OUT PARAM */
|
||
{
|
||
if (!s || strlen(s) >= MAX_STRING_LEN) return 0;
|
||
|
||
char *buf = strdup(s); /* one single allocation */
|
||
if (!buf) return 0;
|
||
*storage = buf; /* hand ownership to caller */
|
||
|
||
int n = 0;
|
||
for (char *tok = strtok(buf, " \t\n"); tok && n < MAX_WORDS;
|
||
tok = strtok(NULL, " \t\n"))
|
||
{
|
||
words[n++] = tok; /* pointers into buf */
|
||
}
|
||
return n;
|
||
}
|
||
|
||
// Free memory allocated for words
|
||
void free_words(char *storage) { /* simplified */
|
||
if (storage) { /* check for NULL */
|
||
free(storage); /* single free, if any */
|
||
}
|
||
}
|
||
|
||
// Calculate Levenshtein distance between two strings
|
||
int levenshtein_distance(const char *a, const char *b)
|
||
{
|
||
size_t m = strlen(a), n = strlen(b);
|
||
if (m < n) { const char *t=a; a=b; b=t; size_t tmp=m; m=n; n=tmp; }
|
||
|
||
int *row0 = alloca((n + 1) * sizeof(int));
|
||
int *row1 = alloca((n + 1) * sizeof(int));
|
||
|
||
for (size_t j = 0; j <= n; ++j) row0[j] = j;
|
||
for (size_t i = 1; i <= m; ++i) {
|
||
row1[0] = i;
|
||
for (size_t j = 1; j <= n; ++j) {
|
||
int cost = (tolower((unsigned)a[i-1]) ==
|
||
tolower((unsigned)b[j-1])) ? 0 : 1;
|
||
int del = row0[j] + 1;
|
||
int ins = row1[j-1] + 1;
|
||
int sub = row0[j-1] + cost;
|
||
row1[j] = (del < ins ? (del < sub ? del : sub)
|
||
: (ins < sub ? ins : sub));
|
||
}
|
||
int *tmp = row0; row0 = row1; row1 = tmp;
|
||
}
|
||
return row0[n];
|
||
}
|
||
|
||
// Calculate similarity between two words based on Levenshtein distance
|
||
float word_similarity(const char *word1, const char *word2) {
|
||
int len1 = strlen(word1);
|
||
int len2 = strlen(word2);
|
||
|
||
// For very short words (2 chars or less), require exact match
|
||
if (len1 <= 2 || len2 <= 2) {
|
||
return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
|
||
}
|
||
|
||
// Calculate Levenshtein distance
|
||
int distance = levenshtein_distance(word1, word2);
|
||
int max_len = len1 > len2 ? len1 : len2;
|
||
|
||
// Simple linear scoring: 1.0 for exact match, 0.9 for one char difference, etc.
|
||
float similarity = 1.0f - (float)distance / max_len;
|
||
|
||
// Boost similarity for small differences
|
||
if (distance <= 1) {
|
||
similarity = 0.9f + (similarity * 0.1f);
|
||
}
|
||
|
||
return similarity;
|
||
}
|
||
|
||
// Calculate similarity between query and target string
|
||
float calculate_similarity(const char *query, const char *target, float cutoff) {
|
||
// Split strings into words
|
||
char *query_buf = NULL, *target_buf = NULL;
|
||
char *query_words[MAX_WORDS], *target_words[MAX_WORDS];
|
||
|
||
int query_word_count = split_into_words(query, query_words, &query_buf);
|
||
int target_word_count = split_into_words(target, target_words, &target_buf);
|
||
|
||
if (query_word_count == 0 || target_word_count == 0) {
|
||
free_words(query_buf);
|
||
free_words(target_buf);
|
||
return 0.0;
|
||
}
|
||
|
||
// Track best matches for each query word
|
||
float best_word_similarities[MAX_WORDS] = {0.0f};
|
||
int query_words_found = 0;
|
||
|
||
// For each query word, find its best match in target words
|
||
for (int i = 0; i < query_word_count; i++) {
|
||
float best_similarity = 0.0f;
|
||
|
||
for (int j = 0; j < target_word_count; j++) {
|
||
float similarity = word_similarity(query_words[i], target_words[j]);
|
||
if (similarity > best_similarity) {
|
||
best_similarity = similarity;
|
||
}
|
||
}
|
||
|
||
best_word_similarities[i] = best_similarity;
|
||
if (best_similarity >= 0.4f) {
|
||
query_words_found++;
|
||
}
|
||
}
|
||
|
||
// Calculate average word similarity
|
||
float avg_word_similarity = 0.0f;
|
||
for (int i = 0; i < query_word_count; i++) {
|
||
avg_word_similarity += best_word_similarities[i];
|
||
}
|
||
avg_word_similarity /= query_word_count;
|
||
|
||
// Calculate word match ratio
|
||
float word_match_ratio = (float)query_words_found / query_word_count;
|
||
|
||
// Final score is the average of word match ratio and average word similarity
|
||
float similarity = (word_match_ratio + avg_word_similarity) / 2.0f;
|
||
|
||
// Boost score if all words are found
|
||
if (query_words_found == query_word_count) {
|
||
similarity = 0.8f + (similarity * 0.2f);
|
||
}
|
||
|
||
free_words(query_buf);
|
||
free_words(target_buf);
|
||
|
||
return similarity;
|
||
}
|
||
|
||
// Compare function for qsort to sort results by similarity (descending)
|
||
int compare_results(const void *a, const void *b) {
|
||
const SearchResult *result_a = (const SearchResult *)a;
|
||
const SearchResult *result_b = (const SearchResult *)b;
|
||
|
||
if (result_b->similarity > result_a->similarity) return 1;
|
||
if (result_b->similarity < result_a->similarity) return -1;
|
||
return 0;
|
||
}
|
||
|
||
// Generate a random word
|
||
void generate_random_word(char *word, int max_len) {
|
||
int len = 3 + rand() % 8; // Random length between 3 and 10
|
||
for (int i = 0; i < len; i++) {
|
||
word[i] = 'a' + (rand() % 26);
|
||
}
|
||
word[len] = '\0';
|
||
}
|
||
|
||
// Generate a random string consisting of multiple words
|
||
void generate_random_string(char *string, int max_len) {
|
||
if (!string || max_len <= 0) {
|
||
return;
|
||
}
|
||
|
||
int num_words = 2 + rand() % 5; // Random number of words between 2 and 6
|
||
string[0] = '\0';
|
||
size_t current_len = 0;
|
||
|
||
for (int i = 0; i < num_words; i++) {
|
||
char word[20];
|
||
generate_random_word(word, sizeof(word) - 1);
|
||
|
||
size_t word_len = strlen(word);
|
||
size_t space_needed = word_len + (i > 0 ? 1 : 0); // +1 for space if not first word
|
||
|
||
if (current_len + space_needed < (size_t)max_len - 1) {
|
||
if (i > 0) {
|
||
strncat(string, " ", max_len - current_len - 1);
|
||
current_len++;
|
||
}
|
||
strncat(string, word, max_len - current_len - 1);
|
||
current_len += word_len;
|
||
} else {
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
// Create a new search index
|
||
SearchIndex* create_search_index(int capacity) {
|
||
if (capacity <= 0) return NULL;
|
||
|
||
SearchIndex* index = (SearchIndex*)malloc(sizeof(SearchIndex));
|
||
if (!index) return NULL;
|
||
|
||
index->strings = (char**)malloc(capacity * sizeof(char*));
|
||
if (!index->strings) {
|
||
free(index);
|
||
return NULL;
|
||
}
|
||
|
||
index->num_strings = 0;
|
||
index->capacity = capacity;
|
||
return index;
|
||
}
|
||
|
||
// Add a string to the index
|
||
int add_string_to_index(SearchIndex* index, const char* string) {
|
||
if (!index || !string) return -1;
|
||
|
||
// Check if we've reached capacity
|
||
if (index->num_strings >= index->capacity) {
|
||
return -1;
|
||
}
|
||
|
||
// Check if string is too long
|
||
if (strlen(string) >= MAX_STRING_LEN) {
|
||
return -1;
|
||
}
|
||
|
||
index->strings[index->num_strings] = strdup(string);
|
||
if (!index->strings[index->num_strings]) return -1;
|
||
|
||
index->num_strings++;
|
||
return 0;
|
||
}
|
||
|
||
// Free the search index and all associated memory
|
||
void free_search_index(SearchIndex* index) {
|
||
if (!index) return;
|
||
|
||
for (int i = 0; i < index->num_strings; i++) {
|
||
free(index->strings[i]);
|
||
}
|
||
|
||
free(index->strings);
|
||
free(index);
|
||
}
|
||
|
||
// Search the index with the given query and similarity cutoff
|
||
SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results) {
|
||
if (!index || !query || !num_results) return NULL;
|
||
|
||
// Validate input string length
|
||
if (strlen(query) >= MAX_STRING_LEN) {
|
||
*num_results = 0;
|
||
return NULL;
|
||
}
|
||
|
||
// Validate cutoff
|
||
if (cutoff < 0.0f || cutoff > 1.0f) {
|
||
*num_results = 0;
|
||
return NULL;
|
||
}
|
||
|
||
// Allocate temporary array for results
|
||
SearchResult* temp_results = (SearchResult*)malloc(index->num_strings * sizeof(SearchResult));
|
||
if (!temp_results) return NULL;
|
||
|
||
*num_results = 0;
|
||
|
||
// Search through all strings in the index
|
||
for (int i = 0; i < index->num_strings; i++) {
|
||
float similarity = calculate_similarity(query, index->strings[i], cutoff);
|
||
|
||
if (similarity >= cutoff) {
|
||
// Store a copy of the string in the result
|
||
temp_results[*num_results].string = strdup(index->strings[i]);
|
||
if (!temp_results[*num_results].string) {
|
||
// Free any already allocated strings on error
|
||
for (int j = 0; j < *num_results; j++) {
|
||
free(temp_results[j].string);
|
||
}
|
||
free(temp_results);
|
||
return NULL;
|
||
}
|
||
temp_results[*num_results].similarity = similarity;
|
||
(*num_results)++;
|
||
}
|
||
}
|
||
|
||
// If no results found, return NULL properly
|
||
if (*num_results == 0) {
|
||
free(temp_results);
|
||
return NULL;
|
||
}
|
||
|
||
// Sort results by similarity
|
||
qsort(temp_results, *num_results, sizeof(SearchResult), compare_results);
|
||
|
||
// Shrink temp_results to exact size and return it directly
|
||
SearchResult* results = (SearchResult*)realloc(
|
||
temp_results, *num_results * sizeof(SearchResult));
|
||
if (!results) {
|
||
// realloc failure – temp_results unchanged, clean up
|
||
for (int i = 0; i < *num_results; i++) {
|
||
free(temp_results[i].string);
|
||
}
|
||
free(temp_results);
|
||
return NULL;
|
||
}
|
||
return results;
|
||
}
|
||
|
||
// Free the search results
|
||
void free_search_results(SearchResult* results, int num_results) {
|
||
if (!results) return;
|
||
|
||
// Free all strings in the results
|
||
for (int i = 0; i < num_results; i++) {
|
||
free(results[i].string);
|
||
}
|
||
free(results);
|
||
}
|