Files
lib11sht/lib11sht.c

268 lines
7.8 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/* ************************************************************************ *
* lib11sht.c, v1.0 *
* Fuzzy comparison library implementation *
* *
* Copyright (C) 2025 by Ruben Carlo Benante <rcb@beco.cc> *
* GNU GPL version 2 or later. *
* ************************************************************************ */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include "lib11sht.h"
/* Internal helpers — kept private to this translation unit. */
static int uselesschar(int c);
static void trim(char *s);
static void asciify(const char *src, char *dest, size_t dest_size);
static int ulen(unsigned char c);
static float shit11(char *s1, char *s2);
static float fmin3(float a, float b, float c);
static float fmax2(float a, float b);
/* ---------------------------------------------------------------------- */
/*
 * fequal: three-way comparison of two floats with a tolerance.
 *
 *   a, b   values to compare
 *   delta  half-width of the band around b that counts as "equal"
 *
 * Returns -1 when a lies below b - delta, 1 when a lies above b + delta,
 * and 0 otherwise (a is inside the closed band [b - delta, b + delta]).
 */
int fequal(float a, float b, float delta)
{
    int verdict = 0;

    if(a > b + delta)
        verdict = 1;
    /* checked last so that "below" wins if both tests hold (negative delta) */
    if(a < b - delta)
        verdict = -1;

    return verdict;
}
/* ---------------------------------------------------------------------- */
/*
 * sequal: convenience front end to sequal_full() for callers that need
 * neither the similarity ratio nor the normalized strings.
 * Its (a, b, threshold) shape mirrors fequal(a, b, delta).
 *
 *   a, b   strings to compare
 *   shold  similarity threshold handed to sequal_full()
 *
 * Returns whatever sequal_full() returns. Scratch buffers of LEVN_SBUFF
 * bytes each live on this function's stack and are discarded on return.
 */
int sequal(char *a, char *b, float shold)
{
    char norm_a[LEVN_SBUFF];
    char norm_b[LEVN_SBUFF];
    float discarded_ratio;

    return sequal_full(a, b, shold, &discarded_ratio,
                       norm_a, sizeof norm_a,
                       norm_b, sizeof norm_b);
}
/* ---------------------------------------------------------------------- */
/*
 * sequal_full: fuzzy three-way comparison of two strings.
 *
 * Both inputs are normalized into caller-supplied buffers (accents removed
 * by asciify(), whitespace cleaned by trim(), then lowercased) and the
 * normalized forms are compared.
 *
 *   a, b      input strings
 *   shold     similarity threshold; a ratio strictly above it means "equal"
 *   ratio     out: 1.0 when the normalized strings are identical,
 *             otherwise the value computed by shit11()
 *   s1, s2    out: buffers receiving the normalized forms of a and b
 *   s1_size,
 *   s2_size   capacities of s1 and s2 in bytes
 *
 * Returns 0 when the strings are considered equal, otherwise -1 or 1
 * following the sign of strcmp() on the normalized forms.
 * On a NULL pointer or a zero buffer size, sets errno to EINVAL and
 * returns 0 without touching the output arguments.
 */
int sequal_full(char *a, char *b, float shold, float *ratio,
                char *s1, size_t s1_size,
                char *s2, size_t s2_size)
{
    size_t pos;
    int order;

    if(a == NULL || b == NULL || s1 == NULL || s2 == NULL || ratio == NULL
            || s1_size == 0 || s2_size == 0)
    {
        errno = EINVAL;
        return 0;
    }

    /* strip accents; asciify NUL-terminates inside the given capacity */
    asciify(a, s1, s1_size);
    asciify(b, s2, s2_size);

    /* drop leading/trailing blanks and squeeze inner runs */
    trim(s1);
    trim(s2);

    /* fold to lowercase, never walking beyond either buffer */
    pos = 0;
    while(pos < s1_size && s1[pos] != '\0')
    {
        s1[pos] = (char)tolower((unsigned char)s1[pos]);
        pos++;
    }
    pos = 0;
    while(pos < s2_size && s2[pos] != '\0')
    {
        s2[pos] = (char)tolower((unsigned char)s2[pos]);
        pos++;
    }

    order = strcmp(s1, s2);
    if(order == 0)
    {
        /* exact match after normalization: skip the distance computation */
        *ratio = 1.0f;
        return 0;
    }

    *ratio = shit11(s1, s2);
    if(*ratio > shold)
        return 0; /* close enough to count as equal */

    if(order < 0)
        return -1;
    return 1;
}
/* ---------------------------------------------------------------------- */
/*
 * shit11: Levenshtein similarity ratio between two strings, 0.0..1.0.
 *
 *   s1, s2  NUL-terminated strings (only the first LEVN_SBUFF bytes of each
 *           are examined)
 *
 * Returns 1.0 when both strings are empty, 0.0 when exactly one is empty or
 * when either is LEVN_SBUFF bytes or longer, and otherwise
 *     1 - edit_distance / max(len1, len2).
 *
 * Only two rows of the dynamic-programming table are kept, so stack use is
 * 2 * LEVN_SBUFF ints instead of a (len1+1) x (len2+1) variable-length
 * matrix; the computed distance is the same.
 */
static float shit11(char *s1, char *s2)
{
    int row_a[LEVN_SBUFF], row_b[LEVN_SBUFF];
    int *prev = row_a; /* distances for the previous prefix of s1 */
    int *curr = row_b; /* distances being filled for the current prefix */
    int *swap;
    int len1, len2;
    int i, j, cost;

    /* strnlen guards against a missing NUL (no read past LEVN_SBUFF) */
    len1 = (int)strnlen(s1, LEVN_SBUFF);
    len2 = (int)strnlen(s2, LEVN_SBUFF);

    if(len1 == 0 && len2 == 0)
        return 1.0f; /* both empty == identical */
    if(len1 == 0 || len2 == 0)
        return 0.0f; /* one empty == no similarity */

    /* Over-long input is treated as "completely dissimilar". This also
     * guarantees len2 + 1 <= LEVN_SBUFF, so both rows are large enough. */
    if(len1 >= LEVN_SBUFF || len2 >= LEVN_SBUFF)
        return 0.0f;

    /* row 0: turning the empty prefix of s1 into s2[0..j) costs j inserts */
    for(j = 0; j <= len2; j++)
        prev[j] = j;

    for(i = 1; i <= len1; i++)
    {
        curr[0] = i; /* turning s1[0..i) into the empty string costs i deletes */
        for(j = 1; j <= len2; j++)
        {
            cost = (s1[i - 1] == s2[j - 1]) ? 0 : 1;
            /* values stay below LEVN_SBUFF, so the float round trip is exact */
            curr[j] = (int)fmin3((float)(prev[j] + 1),         /* deletion */
                                 (float)(curr[j - 1] + 1),     /* insertion */
                                 (float)(prev[j - 1] + cost)); /* substitution */
        }
        swap = prev;
        prev = curr;
        curr = swap;
    }

    /* after the final swap, prev holds the row for the whole of s1 */
    return 1.0f - (float)prev[len2] / fmax2((float)len1, (float)len2);
}
/* ---------------------------------------------------------------------- */
/*
 * fmin3: smallest of three floats.
 * Starts from a and replaces it only when a later argument compares
 * strictly smaller.
 */
static float fmin3(float a, float b, float c)
{
    float lowest = (b < a) ? b : a;

    return (c < lowest) ? c : lowest;
}
/* ---------------------------------------------------------------------- */
/*
 * fmax2: larger of two floats.
 * Returns a only when it compares strictly greater than b; otherwise b.
 */
static float fmax2(float a, float b)
{
    if(a > b)
        return a;
    return b;
}
/* ---------------------------------------------------------------------- */
/*
 * uselesschar: classify a byte value as ignorable whitespace.
 *
 * Returns 1 for space, tab, newline, carriage return, and for the values
 * 0xC2 and 0xA0 (the two bytes of a UTF-8 no-break space); 0 otherwise.
 *
 * NOTE(review): 0xC2 and 0xA0 only match when c carries the byte as a
 * non-negative value. A plain char argument is sign-extended where char is
 * signed, so such bytes arrive negative and fall through to 0.
 */
static int uselesschar(int c)
{
    switch(c)
    {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        case 0xC2:
        case 0xA0:
            return 1;
        default:
            return 0;
    }
}
/* ---------------------------------------------------------------------- */
/*
 * trim: normalize whitespace in place.
 *
 * Removes leading and trailing whitespace and replaces every internal run
 * of whitespace with a single ' '. "Whitespace" is whatever uselesschar()
 * reports. A NULL argument is ignored.
 *
 * Done in one pass with a read cursor and a write cursor over the same
 * buffer; the write cursor never gets ahead of the read cursor.
 */
static void trim(char *s)
{
    char *rd, *wr;
    int gap = 0; /* set while a whitespace run is waiting to be emitted */

    if(s == NULL)
        return;

    wr = s;
    for(rd = s; *rd != '\0'; rd++)
    {
        if(uselesschar(*rd))
        {
            gap = 1;
            continue;
        }
        /* A pending run becomes one space, but only between kept characters:
         * nothing is emitted before the first one, and a run at the end of
         * the string is never flushed. */
        if(gap && wr != s)
            *wr++ = ' ';
        gap = 0;
        *wr++ = *rd;
    }
    *wr = '\0';
}
/* ---------------------------------------------------------------------- */
/*
 * ulen: length in bytes of the UTF-8 sequence announced by lead byte c.
 *
 * Returns 2, 3 or 4 for the lead-byte ranges 110xxxxx, 1110xxxx and
 * 11110xxx; returns 1 for everything else (ASCII, continuation bytes and
 * invalid lead bytes).
 */
static int ulen(unsigned char c)
{
    if(c >= 0xC0 && c <= 0xDF)
        return 2; /* 110xxxxx */
    if(c >= 0xE0 && c <= 0xEF)
        return 3; /* 1110xxxx */
    if(c >= 0xF0 && c <= 0xF7)
        return 4; /* 11110xxx */
    return 1;
}
/* ---------------------------------------------------------------------- */
/*
 * asciify: transliterate a UTF-8 string into plain ASCII.
 *
 *   src        NUL-terminated input; at most LEVN_SBUFF bytes are consumed
 *   dest       output buffer, always left NUL-terminated
 *   dest_size  capacity of dest in bytes, including the terminator
 *
 * ASCII bytes are copied unchanged. Multi-byte characters found in the
 * table below are replaced by their unaccented ASCII letter (the no-break
 * space becomes ' '); any other multi-byte character, and any malformed or
 * truncated sequence, becomes a single '?'. Stray continuation bytes are
 * dropped. Does nothing when src or dest is NULL or dest_size is 0.
 *
 * Table entries are spelled as explicit byte escapes so that matching does
 * not depend on how this source file is encoded or normalized.
 */
static void asciify(const char *src, char *dest, size_t dest_size)
{
    static const struct
    {
        const char *utf8; /* UTF-8 bytes of the source character */
        char ascii;       /* plain ASCII replacement */
    } translit[] =
    {
        /* acute, upper: Á É Í Ó Ú */
        {"\xC3\x81", 'A'}, {"\xC3\x89", 'E'}, {"\xC3\x8D", 'I'},
        {"\xC3\x93", 'O'}, {"\xC3\x9A", 'U'},
        /* grave, upper: À È Ì Ò Ù */
        {"\xC3\x80", 'A'}, {"\xC3\x88", 'E'}, {"\xC3\x8C", 'I'},
        {"\xC3\x92", 'O'}, {"\xC3\x99", 'U'},
        /* tilde, upper: Ã Ẽ Ĩ Õ Ũ */
        {"\xC3\x83", 'A'}, {"\xE1\xBA\xBC", 'E'}, {"\xC4\xA8", 'I'},
        {"\xC3\x95", 'O'}, {"\xC5\xA8", 'U'},
        /* circumflex, upper: Â Ê Î Ô Û */
        {"\xC3\x82", 'A'}, {"\xC3\x8A", 'E'}, {"\xC3\x8E", 'I'},
        {"\xC3\x94", 'O'}, {"\xC3\x9B", 'U'},
        /* diaeresis, upper: Ä Ë Ï Ö Ü */
        {"\xC3\x84", 'A'}, {"\xC3\x8B", 'E'}, {"\xC3\x8F", 'I'},
        {"\xC3\x96", 'O'}, {"\xC3\x9C", 'U'},
        /* acute, lower: á é í ó ú */
        {"\xC3\xA1", 'a'}, {"\xC3\xA9", 'e'}, {"\xC3\xAD", 'i'},
        {"\xC3\xB3", 'o'}, {"\xC3\xBA", 'u'},
        /* grave, lower: à è ì ò ù */
        {"\xC3\xA0", 'a'}, {"\xC3\xA8", 'e'}, {"\xC3\xAC", 'i'},
        {"\xC3\xB2", 'o'}, {"\xC3\xB9", 'u'},
        /* tilde, lower: ã ẽ ĩ õ ũ */
        {"\xC3\xA3", 'a'}, {"\xE1\xBA\xBD", 'e'}, {"\xC4\xA9", 'i'},
        {"\xC3\xB5", 'o'}, {"\xC5\xA9", 'u'},
        /* circumflex, lower: â ê î ô û */
        {"\xC3\xA2", 'a'}, {"\xC3\xAA", 'e'}, {"\xC3\xAE", 'i'},
        {"\xC3\xB4", 'o'}, {"\xC3\xBB", 'u'},
        /* diaeresis, lower: ä ë ï ö ü */
        {"\xC3\xA4", 'a'}, {"\xC3\xAB", 'e'}, {"\xC3\xAF", 'i'},
        {"\xC3\xB6", 'o'}, {"\xC3\xBC", 'u'},
        /* ordinals, cedilla, n-tilde: ª º Ç ç Ñ ñ */
        {"\xC2\xAA", 'a'}, {"\xC2\xBA", 'o'}, {"\xC3\x87", 'C'},
        {"\xC3\xA7", 'c'}, {"\xC3\x91", 'N'}, {"\xC3\xB1", 'n'},
        /* superscript digits: ¹ ² ³ */
        {"\xC2\xB9", '1'}, {"\xC2\xB2", '2'}, {"\xC2\xB3", '3'},
        /* no-break space */
        {"\xC2\xA0", ' '}
    };
    const size_t ntranslit = sizeof(translit) / sizeof(translit[0]);
    char ch[5]; /* one UTF-8 character plus terminator */
    size_t i = 0, j = 0, k;
    int len, n;
    char out;

    if(!src || !dest || dest_size == 0)
        return;

    /* leave room for the final NUL: writes capped at dest_size - 1 */
    while(src[i] != '\0' && i < (size_t)LEVN_SBUFF && j < dest_size - 1)
    {
        unsigned char lead = (unsigned char)src[i];

        if((lead & 0xC0) == 0x80) /* stray continuation byte: drop it */
        {
            i++;
            continue;
        }
        if(lead < 0x80) /* ASCII: copy through */
        {
            dest[j++] = src[i++];
            continue;
        }

        /* Gather the lead byte plus only the continuation bytes actually
         * present. The scan stops at the terminating NUL or at any byte
         * that is not 10xxxxxx, so a truncated sequence can neither read
         * past the end of src nor swallow the characters that follow it. */
        len = ulen(lead);
        ch[0] = src[i];
        for(n = 1; n < len && ((unsigned char)src[i + n] & 0xC0) == 0x80; n++)
            ch[n] = src[i + n];
        ch[n] = '\0';

        out = '?'; /* unknown character or malformed sequence */
        if(n == len)
        {
            for(k = 0; k < ntranslit; k++)
                if(strcmp(ch, translit[k].utf8) == 0)
                {
                    out = translit[k].ascii;
                    break;
                }
        }
        dest[j++] = out;
        i += (size_t)n;
    }
    dest[j] = '\0';
}
/* ---------------------------------------------------------------------- */
/* vi: set ai et ts=4 sw=4 tw=0 wm=0 fo=croql : C config for Vim modeline */