@@ -5,6 +5,11 @@
# include <ctype.h>
# include <math.h>
# include <stdbool.h>
# ifdef _WIN32
# include <malloc.h> // For alloca on Windows
# else
# include <alloca.h> // For alloca on Unix-like systems
# endif
# include "similarity_search.h"
// Case insensitive string comparison
@@ -22,76 +27,56 @@ int str_case_cmp(const char *s1, const char *s2) {
}
/*
 * Split a string into whitespace-separated words.
 *
 * A single heap copy of `s` is made and cut in place; every entry written to
 * `words` points into that copy, so the words stay valid only until the copy
 * is freed.
 *
 * Parameters:
 *   s       - NUL-terminated input; rejected when NULL or when its length is
 *             MAX_STRING_LEN or more.
 *   words   - output array with room for MAX_WORDS pointers.
 *   storage - out parameter receiving the backing buffer. It is always
 *             written: NULL when 0 is returned, otherwise a buffer the caller
 *             must release with free() once it is done with `words`.
 *
 * Returns the number of words stored (at most MAX_WORDS), or 0 when the input
 * is rejected, contains no words, or memory could not be allocated.
 */
int split_into_words(const char *s, char *words[MAX_WORDS], char **storage)
{
    static const char delims[] = " \t\n";

    /* Publish a defined value first so the caller can free unconditionally. */
    if (storage) {
        *storage = NULL;
    }
    if (!s || !words || !storage) {
        return 0;
    }

    size_t len = strlen(s);
    if (len >= MAX_STRING_LEN) {
        return 0;
    }

    char *buf = malloc(len + 1);
    if (!buf) {
        return 0;
    }
    memcpy(buf, s, len + 1); /* includes the terminating NUL */

    /*
     * Tokenize with strspn/strcspn rather than strtok: strtok keeps hidden
     * static state, which makes it unsafe to interleave or call from several
     * threads.
     */
    int n = 0;
    char *p = buf;
    while (n < MAX_WORDS) {
        p += strspn(p, delims);      /* skip leading separators */
        if (*p == '\0') {
            break;
        }
        words[n++] = p;
        p += strcspn(p, delims);     /* advance to the end of the word */
        if (*p == '\0') {
            break;
        }
        *p++ = '\0';                 /* terminate the word in place */
    }

    if (n == 0) {
        /* Nothing points into buf, so do not burden the caller with it. */
        free(buf);
        return 0;
    }

    *storage = buf; /* ownership passes to the caller */
    return n;
}
/*
 * Release the backing buffer handed out by split_into_words().
 *
 * All word pointers share one allocation, so a single free() releases them
 * all. Passing NULL is allowed and does nothing, because free(NULL) is a
 * no-op.
 */
void free_words(char *storage)
{
    free(storage);
}
/*
 * Case-insensitive Levenshtein (edit) distance between two strings.
 *
 * Uses the single-row dynamic-programming formulation: only the previous row
 * of the distance table is kept, laid out along the shorter string.
 *
 * Parameters:
 *   a, b - NUL-terminated strings; a NULL pointer is treated as "".
 *
 * Returns the minimum number of single-character insertions, deletions and
 * substitutions turning one string into the other, comparing characters via
 * tolower(). If scratch memory cannot be allocated, the length of the longer
 * string is returned, which is the largest distance possible.
 */
int levenshtein_distance(const char *a, const char *b)
{
    enum { STACK_ROW_LEN = 64 }; /* rows up to this size avoid the heap */

    if (!a) {
        a = "";
    }
    if (!b) {
        b = "";
    }

    size_t m = strlen(a);
    size_t n = strlen(b);

    /* Keep the shorter string along the row to minimise scratch space. */
    if (m < n) {
        const char *swap_str = a;
        a = b;
        b = swap_str;
        size_t swap_len = m;
        m = n;
        n = swap_len;
    }

    if (n == 0) {
        return (int)m; /* distance to the empty string is the other length */
    }

    int stack_row[STACK_ROW_LEN];
    int *heap_row = NULL;
    int *row = stack_row;
    if (n + 1 > (size_t)STACK_ROW_LEN) {
        /* Bounded stack use: long inputs go to the heap, not alloca/VLA. */
        heap_row = malloc((n + 1) * sizeof *heap_row);
        if (!heap_row) {
            return (int)m;
        }
        row = heap_row;
    }

    for (size_t j = 0; j <= n; ++j) {
        row[j] = (int)j;
    }

    for (size_t i = 1; i <= m; ++i) {
        int diag = row[0]; /* D[i-1][j-1] for j == 1 */
        row[0] = (int)i;
        /* ctype functions need an unsigned char value to avoid UB. */
        int ca = tolower((unsigned char)a[i - 1]);

        for (size_t j = 1; j <= n; ++j) {
            int up = row[j]; /* D[i-1][j], about to be overwritten */
            int cost = (ca == tolower((unsigned char)b[j - 1])) ? 0 : 1;

            int best = diag + cost;          /* substitution / match */
            if (up + 1 < best) {
                best = up + 1;               /* deletion */
            }
            if (row[j - 1] + 1 < best) {
                best = row[j - 1] + 1;       /* insertion */
            }

            row[j] = best;
            diag = up;
        }
    }

    int distance = row[n];
    free(heap_row);
    return distance;
}
// Calculate similarity between two words based on Levenshtein distance
@@ -99,45 +84,21 @@ float word_similarity(const char *word1, const char *word2) {
int len1 = strlen ( word1 ) ;
int len2 = strlen ( word2 ) ;
// For very short words (3 chars or less), require exact match
if ( len1 < = 3 | | len2 < = 3 ) {
// For very short words (2 chars or less), require exact match
if ( len1 < = 2 | | len2 < = 2 ) {
return str_case_cmp ( word1 , word2 ) = = 0 ? 1.0f : 0.0f ;
}
// If one word is significantly shorter than the other, it must be a prefix
if ( len1 < len2 * 0.7 | | len2 < len1 * 0.7 ) {
// Check if the shorter word is a prefix of the longer word
const char * longer = len1 > len2 ? word1 : word2 ;
const char * shorter = len1 > len2 ? word2 : word1 ;
int shorter_len = len1 > len2 ? len2 : len1 ;
if ( strncasecmp ( longer , shorter , shorter_len ) = = 0 ) {
return 0.8f ; // Good prefix match
}
return 0.0f ; // Not a prefix match
}
// For words of similar length, calculate similarity
// Calculate Levenshtein distance
int distance = levenshtein_distance ( word1 , word2 ) ;
int max_len = len1 > len2 ? len1 : len2 ;
// Calculate similarity based on edit distance
// Simple linear scoring: 1.0 for exact match, 0.9 for one char difference, etc.
float similarity = 1.0f - ( float ) distance / max_len ;
// Adjust similarity based on word lengths
if ( len1 ! = len2 ) {
float length_ratio = ( float ) ( len1 < len2 ? len1 : len2 ) / ( len1 > len2 ? len1 : len2 ) ;
similarity * = length_ratio ;
}
// For words of similar length, require reasonable similarity
if ( similarity < 0.4f ) {
return 0.0f ;
}
// Never return perfect similarity for non-identical words
if ( distance > 0 ) {
similarity = fmin ( similarity , 0.9f ) ;
// Boost similarity for small differences
if ( distance < = 1 ) {
similarity = 0.9f + ( similarity * 0.1f ) ;
}
return similarity ;
@@ -146,15 +107,15 @@ float word_similarity(const char *word1, const char *word2) {
// Calculate similarity between query and target string
float calculate_similarity ( const char * query , const char * target , float cutoff ) {
// Split strings into words
char * query_words [ MAX_WORDS ] = { 0 } ;
char * target _words[ MAX_WORDS ] = { 0 } ;
char * query_buf = NULL , * target_buf = NULL ;
char * query _words[ MAX_WORDS ] , * target_words [ MAX_WORDS ] ;
int query_word_count = split_into_words ( query , query_words ) ;
int target_word_count = split_into_words ( target , target_words ) ;
int query_word_count = split_into_words ( query , query_words , & query_buf );
int target_word_count = split_into_words ( target , target_words , & target_buf );
if ( query_word_count = = 0 | | target_word_count = = 0 ) {
free_words ( query_words , query_word_count ) ;
free_words ( target_words , target_word_count ) ;
free_words ( query_buf ) ;
free_words ( target_buf ) ;
return 0.0 ;
}
@@ -174,39 +135,31 @@ float calculate_similarity(const char *query, const char *target, float cutoff)
}
best_word_similarities [ i ] = best_similarity ;
if ( best_similarity > = 0.4f ) { // Consider it a match if similarity is reasonable
if ( best_similarity > = 0.4f ) {
query_words_found + + ;
}
}
// Calculate overall similarity
float word_match_score = ( float ) query_words_found / query_word_count ;
// Calculate average of best word similarities
// Calculate average word similarity
float avg_word_similarity = 0.0f ;
for ( int i = 0 ; i < query_word_count ; i + + ) {
avg_word_similarity + = best_word_similarities [ i ] ;
}
avg_word_similarity / = query_word_count ;
// Combine scores: 70% weight on word matches, 30% on character similarity
float similarity = ( word_match_score * 0.7f ) + ( avg_word_similarity * 0.3f ) ;
// Calculate word match ratio
float word_match_ratio = ( float ) query_words_found / query_word_count ;
// Never return perfect similarity unless all words are exact matches
bool all_exact_matches = true ;
for ( int i = 0 ; i < query_word_count ; i + + ) {
if ( best_word_similarities [ i ] < 1.0f ) {
all_exact_matches = false ;
break ;
}
// Final score is the average of word match ratio and average word similarity
float similarity = ( word_match_ratio + avg_word_similarity ) / 2.0f ;
// Boost score if all words are found
if ( query_words_found = = query_word_count ) {
similarity = 0.8f + ( similarity * 0.2f ) ;
}
if ( ! all_exact_matches ) {
similarity = fmin ( similarity , 0.9 f ) ;
}
free_words ( query_words , query_word_count ) ;
free_words ( target_words , target_word_count ) ;
free_words ( query_buf ) ;
free_words ( target_bu f) ;
return similarity ;
}
@@ -353,27 +306,26 @@ SearchResult* search_index(SearchIndex* index, const char* query, float cutoff,
}
}
// If no results found, return NULL properly
if ( * num_results = = 0 ) {
free ( temp_results ) ;
return NULL ;
}
// Sort results by similarity
qsort ( temp_results , * num_results , sizeof ( SearchResult ) , compare_results ) ;
// Allocate final result array with exact size
SearchResult * results = ( SearchResult * ) m alloc( * num_results * sizeof ( SearchResult ) ) ;
// Shrink temp_results to exact size and return it directly
SearchResult * results = ( SearchResult * ) re alloc(
temp_results , * num_results * sizeof ( SearchResult ) ) ;
if ( ! results ) {
// Free all strings in temp_results
// realloc failure – temp_results unchanged, clean up
for ( int i = 0 ; i < * num_results ; i + + ) {
free ( temp_results [ i ] . string ) ;
}
free ( temp_results ) ;
return NULL ;
}
// Copy results to final array
for ( int i = 0 ; i < * num_results ; i + + ) {
results [ i ] . string = temp_results [ i ] . string ;
results [ i ] . similarity = temp_results [ i ] . similarity ;
}
free ( temp_results ) ;
return results ;
}
@@ -386,4 +338,4 @@ void free_search_results(SearchResult* results, int num_results) {
free ( results [ i ] . string ) ;
}
free ( results ) ;
}
}