node-similarity-search-native/similarity_search.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <ctype.h>
#include <math.h>
#include <stdbool.h>
#include "similarity_search.h"

// Case insensitive string comparison
int str_case_cmp(const char *s1, const char *s2) {
    while (*s1 && *s2) {
        int c1 = tolower((unsigned char)*s1);
        int c2 = tolower((unsigned char)*s2);
        if (c1 != c2) {
            return c1 - c2;
        }
        s1++;
        s2++;
    }
    return tolower((unsigned char)*s1) - tolower((unsigned char)*s2);
}

// Split a string into words
int split_into_words(const char *s,
                     char  *words[MAX_WORDS],
                     char **storage)          /* NEW OUT PARAM            */
{
    if (!s || strlen(s) >= MAX_STRING_LEN) return 0;

    char *buf = strdup(s);                    /* one single allocation    */
    if (!buf) return 0;
    *storage = buf;                           /* hand ownership to caller */

    int n = 0;
    for (char *tok = strtok(buf, " \t\n"); tok && n < MAX_WORDS;
         tok = strtok(NULL, " \t\n"))
    {
        words[n++] = tok;                     /* pointers into buf        */
    }
    return n;
}

// Free memory allocated for words
void free_words(char *storage) {              /* simplified               */
    free(storage);                            /* single free, if any      */
}

// Calculate Levenshtein distance between two strings
int levenshtein_distance(const char *a, const char *b)
{
    size_t m = strlen(a), n = strlen(b);
    if (m < n) { const char *t=a; a=b; b=t; size_t tmp=m; m=n; n=tmp; }

    int *row0 = alloca((n + 1) * sizeof(int));
    int *row1 = alloca((n + 1) * sizeof(int));

    for (size_t j = 0; j <= n; ++j) row0[j] = j;
    for (size_t i = 1; i <= m; ++i) {
        row1[0] = i;
        for (size_t j = 1; j <= n; ++j) {
            int cost = (tolower((unsigned)a[i-1]) ==
                        tolower((unsigned)b[j-1])) ? 0 : 1;
            int del  = row0[j]   + 1;
            int ins  = row1[j-1] + 1;
            int sub  = row0[j-1] + cost;
            row1[j] = (del < ins ? (del < sub ? del : sub)
                                 : (ins < sub ? ins : sub));
        }
        int *tmp = row0; row0 = row1; row1 = tmp;
    }
    return row0[n];
}

// Calculate similarity between two words based on Levenshtein distance
float word_similarity(const char *word1, const char *word2) {
    int len1 = strlen(word1);
    int len2 = strlen(word2);

    // For very short words (3 chars or less), require exact match
    if (len1 <= 3 || len2 <= 3) {
        return str_case_cmp(word1, word2) == 0 ? 1.0f : 0.0f;
    }

    // If one word is significantly shorter than the other, it must be a prefix
    if (len1 < len2 * 0.7 || len2 < len1 * 0.7) {
        // Check if the shorter word is a prefix of the longer word
        const char *longer = len1 > len2 ? word1 : word2;
        const char *shorter = len1 > len2 ? word2 : word1;
        int shorter_len = len1 > len2 ? len2 : len1;

        if (strncasecmp(longer, shorter, shorter_len) == 0) {
            return 0.8f; // Good prefix match
        }
        return 0.0f; // Not a prefix match
    }

    // For words of similar length, calculate similarity
    int distance = levenshtein_distance(word1, word2);
    int max_len = len1 > len2 ? len1 : len2;

    // Calculate similarity based on edit distance with exponential decay
    float similarity = 1.0f - (float)distance / max_len;

    // Apply exponential decay to make it more sensitive to differences
    similarity = pow(similarity, 3.0f);

    // Adjust similarity based on word lengths
    if (len1 != len2) {
        float length_ratio = (float)(len1 < len2 ? len1 : len2) / (len1 > len2 ? len1 : len2);
        similarity *= length_ratio;
    }

    // For words of similar length, require reasonable similarity
    if (similarity < 0.4f) {
        return 0.0f;
    }

    return similarity;
}

// Calculate similarity between query and target string
float calculate_similarity(const char *query, const char *target, float cutoff) {
    // Split strings into words
    char *query_buf, *target_buf;
    char *query_words[MAX_WORDS], *target_words[MAX_WORDS];

    int query_word_count = split_into_words(query,  query_words,  &query_buf);
    int target_word_count = split_into_words(target, target_words, &target_buf);

    if (query_word_count == 0 || target_word_count == 0) {
        free_words(query_buf);
        free_words(target_buf);
        return 0.0;
    }

    // Track best matches for each query word
    float best_word_similarities[MAX_WORDS] = {0.0f};
    int query_words_found = 0;

    // For each query word, find its best match in target words
    for (int i = 0; i < query_word_count; i++) {
        float best_similarity = 0.0f;

        for (int j = 0; j < target_word_count; j++) {
            /* quick length‑difference filter (early‑exit #4)            */
            int l1 = strlen(query_words[i]), l2 = strlen(target_words[j]);
            if (l1 < l2 * 0.5f || l2 < l1 * 0.5f) continue;

            float similarity = word_similarity(query_words[i], target_words[j]);
            if (similarity > best_similarity) {
                best_similarity = similarity;
                if (best_similarity >= 0.90f) break;   /* early exit #4 */
            }
        }

        best_word_similarities[i] = best_similarity;
        if (best_similarity >= 0.4f) { // Consider it a match if similarity is reasonable
            query_words_found++;
        }
    }

    // Calculate overall similarity
    float word_match_score = (float)query_words_found / query_word_count;

    // Calculate average of best word similarities
    float avg_word_similarity = 0.0f;
    for (int i = 0; i < query_word_count; i++) {
        avg_word_similarity += best_word_similarities[i];
    }
    avg_word_similarity /= query_word_count;

    // Combine scores: 60% weight on word matches, 40% on character similarity
    // This gives more weight to finding all words, regardless of order
    float similarity = (word_match_score * 0.6f) + (avg_word_similarity * 0.4f);

    // If all words are found, boost the score
    if (query_words_found == query_word_count) {
        similarity = 0.7f + (similarity * 0.3f);
    }

    free_words(query_buf);
    free_words(target_buf);

    return similarity;
}

// Compare function for qsort to sort results by similarity (descending)
int compare_results(const void *a, const void *b) {
    const SearchResult *result_a = (const SearchResult *)a;
    const SearchResult *result_b = (const SearchResult *)b;

    if (result_b->similarity > result_a->similarity) return 1;
    if (result_b->similarity < result_a->similarity) return -1;
    return 0;
}

// Generate a random word
void generate_random_word(char *word, int max_len) {
    int len = 3 + rand() % 8; // Random length between 3 and 10
    for (int i = 0; i < len; i++) {
        word[i] = 'a' + (rand() % 26);
    }
    word[len] = '\0';
}

// Generate a random string consisting of multiple words
void generate_random_string(char *string, int max_len) {
    if (!string || max_len <= 0) {
        return;
    }

    int num_words = 2 + rand() % 5; // Random number of words between 2 and 6
    string[0] = '\0';
    size_t current_len = 0;

    for (int i = 0; i < num_words; i++) {
        char word[20];
        generate_random_word(word, sizeof(word) - 1);

        size_t word_len = strlen(word);
        size_t space_needed = word_len + (i > 0 ? 1 : 0); // +1 for space if not first word

        if (current_len + space_needed < (size_t)max_len - 1) {
            if (i > 0) {
                strncat(string, " ", max_len - current_len - 1);
                current_len++;
            }
            strncat(string, word, max_len - current_len - 1);
            current_len += word_len;
        } else {
            break;
        }
    }
}

// Create a new search index
SearchIndex* create_search_index(int capacity) {
    if (capacity <= 0) return NULL;

    SearchIndex* index = (SearchIndex*)malloc(sizeof(SearchIndex));
    if (!index) return NULL;

    index->strings = (char**)malloc(capacity * sizeof(char*));
    if (!index->strings) {
        free(index);
        return NULL;
    }

    index->num_strings = 0;
    index->capacity = capacity;
    return index;
}

// Add a string to the index
int add_string_to_index(SearchIndex* index, const char* string) {
    if (!index || !string) return -1;

    // Check if we've reached capacity
    if (index->num_strings >= index->capacity) {
        return -1;
    }

    // Check if string is too long
    if (strlen(string) >= MAX_STRING_LEN) {
        return -1;
    }

    index->strings[index->num_strings] = strdup(string);
    if (!index->strings[index->num_strings]) return -1;

    index->num_strings++;
    return 0;
}

// Free the search index and all associated memory
void free_search_index(SearchIndex* index) {
    if (!index) return;

    for (int i = 0; i < index->num_strings; i++) {
        free(index->strings[i]);
    }

    free(index->strings);
    free(index);
}

// Search the index with the given query and similarity cutoff
SearchResult* search_index(SearchIndex* index, const char* query, float cutoff, int* num_results) {
    if (!index || !query || !num_results) return NULL;

    // Validate input string length
    if (strlen(query) >= MAX_STRING_LEN) {
        *num_results = 0;
        return NULL;
    }

    // Validate cutoff
    if (cutoff < 0.0f || cutoff > 1.0f) {
        *num_results = 0;
        return NULL;
    }

    // Allocate temporary array for results
    SearchResult* temp_results = (SearchResult*)malloc(index->num_strings * sizeof(SearchResult));
    if (!temp_results) return NULL;

    *num_results = 0;

    // Search through all strings in the index
    for (int i = 0; i < index->num_strings; i++) {
        float similarity = calculate_similarity(query, index->strings[i], cutoff);

        if (similarity >= cutoff) {
            // Store a copy of the string in the result
            temp_results[*num_results].string = strdup(index->strings[i]);
            if (!temp_results[*num_results].string) {
                // Free any already allocated strings on error
                for (int j = 0; j < *num_results; j++) {
                    free(temp_results[j].string);
                }
                free(temp_results);
                return NULL;
            }
            temp_results[*num_results].similarity = similarity;
            (*num_results)++;
        }
    }

    // Sort results by similarity
    qsort(temp_results, *num_results, sizeof(SearchResult), compare_results);

    // Allocate final result array with exact size
    SearchResult* results = (SearchResult*)malloc(*num_results * sizeof(SearchResult));
    if (!results) {
        // Free all strings in temp_results
        for (int i = 0; i < *num_results; i++) {
            free(temp_results[i].string);
        }
        free(temp_results);
        return NULL;
    }

    // Copy results to final array
    for (int i = 0; i < *num_results; i++) {
        results[i].string = temp_results[i].string;
        results[i].similarity = temp_results[i].similarity;
    }
    free(temp_results);

    return results;
}

// Free the search results
void free_search_results(SearchResult* results, int num_results) {
    if (!results) return;

    // Free all strings in the results
    for (int i = 0; i < num_results; i++) {
        free(results[i].string);
    }
    free(results);
}