Files
lib11sht/lib11sht.c

247 lines
6.6 KiB
C

/* ************************************************************************ *
* lib11sht.c, v1.0 *
* Fuzzy comparison library implementation *
* *
* Copyright (C) 2025 by Ruben Carlo Benante <rcb@beco.cc> *
* GNU GPL version 2 or later. *
* ************************************************************************ */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include "lib11sht.h"
/* Internal helpers: all have internal linkage (static), so they are only
 * callable from this translation unit. */
/* Non-zero when c is one of the byte values this library treats as blank. */
static int uselesschar(int c);
/* In-place: strip leading/trailing blanks and collapse inner runs to one ' '. */
static void trim(char *s);
/* Copy src into dest, replacing known accented UTF-8 characters by ASCII. */
static void asciify(const char *src, char *dest);
/* Byte length (1..4) announced by a UTF-8 lead byte; 1 for anything else. */
static int ulen(unsigned char c);
/* Levenshtein-based similarity ratio of two strings. */
static float shit11(char *s1, char *s2);
/* Smallest of three floats. */
static float fmin3(float a, float b, float c);
/* Largest of two floats. */
static float fmax2(float a, float b);
/* ---------------------------------------------------------------------- */
/*
 * fequal: three-way comparison of two floats with a tolerance.
 *
 * Returns -1 when a lies below b - delta, 1 when a lies above b + delta,
 * and 0 otherwise (that is, b - delta <= a <= b + delta, bounds included).
 * The "below" test is evaluated first and wins if both tests are true.
 */
int fequal(float a, float b, float delta)
{
    return (a < b - delta)? -1
         : (a > b + delta)?  1
         :                   0;
}
/* ---------------------------------------------------------------------- */
/*
 * sequal: fuzzy three-way comparison of two strings.
 *
 * Both inputs are normalised into the caller-supplied work buffers
 * (a -> s1, b -> s2): accents are transliterated by asciify(), blanks are
 * cleaned up by trim(), and the result is lowercased. The normalised
 * strings are then compared.
 *
 *   a, b    strings to compare
 *   thr     similarity threshold; a ratio strictly above it counts as equal
 *   ratio   out: 1.0 on an exact match, otherwise the value from shit11()
 *   s1, s2  out: work buffers receiving the normalised copies of a and b
 *
 * Returns 0 when the normalised strings are identical or similar enough,
 * otherwise -1 or 1 following the strcmp() order of the normalised strings.
 * If any pointer argument is NULL, errno is set to EINVAL and 0 is returned
 * without touching ratio, s1 or s2.
 */
int sequal(char *a, char *b, float thr, float *ratio, char *s1, char *s2)
{
    char *norm[2];
    int which, pos, order;

    if(a == NULL || b == NULL || s1 == NULL || s2 == NULL || ratio == NULL)
    {
        errno = EINVAL;
        return 0;
    }

    /* accents first, then blanks */
    asciify(a, s1);
    asciify(b, s2);
    trim(s1);
    trim(s2);

    /* fold case in both work buffers, s1 first */
    norm[0] = s1;
    norm[1] = s2;
    for(which = 0; which < 2; which++)
        for(pos = 0; pos < LEVN_SBUFF && norm[which][pos]; pos++)
            norm[which][pos] = (char)tolower((unsigned char)norm[which][pos]);

    order = strcmp(s1, s2);
    if(order == 0)
    {
        *ratio = 1.0;
        return 0;
    }

    /* not identical: fall back to the similarity ratio */
    *ratio = shit11(s1, s2);
    if(*ratio > thr)
        return 0;
    if(order < 0)
        return -1;
    return 1;
}
/* ---------------------------------------------------------------------- */
/* fmin3: smallest of three values. */
static float fmin3(float a, float b, float c)
{
    float m=a;
    if(b<m)
        m=b;
    if(c<m)
        m=c;
    return m;
}
/* ---------------------------------------------------------------------- */
/* fmax2: largest of two values. */
static float fmax2(float a, float b)
{
    return (a>b)? a : b;
}
/* ---------------------------------------------------------------------- */
/*
 * shit11: Levenshtein similarity ratio in the range 0.0..1.0.
 *
 * The ratio is 1 - distance / max(len1, len2), where distance is the
 * byte-wise Levenshtein edit distance (insert, delete, substitute, each
 * costing 1) between s1 and s2.
 *
 * Returns 1.0 when both strings are empty, 0.0 when exactly one is empty.
 * If the work row cannot be allocated, errno is set to ENOMEM and 0.0
 * ("no similarity") is returned.
 *
 * Only one row of the distance matrix is kept, on the heap, so memory use
 * is O(len2) and long inputs cannot overflow the stack.
 */
static float shit11(char *s1, char *s2)
{
    size_t len1, len2, i, j;
    int *row;   /* row[j]: distance between the first i bytes of s1 and the first j of s2 */
    int diag;   /* value of the previous row at column j-1 */
    int above;  /* value of the previous row at column j   */
    int cost, dist;

    len1 = strlen(s1);
    len2 = strlen(s2);
    if(len1 == 0 && len2 == 0)
        return 1.0; /* both empty == identical */
    if(len1 == 0 || len2 == 0)
        return 0.0; /* one empty == no similarity */

    /* calloc checks the count * size multiplication for overflow */
    row = calloc(len2 + 1, sizeof *row);
    if(!row)
    {
        errno = ENOMEM;
        return 0.0;
    }

    /* row 0: turning "" into the first j bytes of s2 takes j insertions */
    for(j=0; j<=len2; j++)
        row[j] = (int)j;

    for(i=1; i<=len1; i++)
    {
        diag = row[0];
        row[0] = (int)i; /* column 0: i deletions */
        for(j=1; j<=len2; j++)
        {
            above = row[j];
            cost = (s1[i-1] == s2[j-1])? 0 : 1;
            /* the candidates are small integers, exactly representable as float */
            row[j] = (int)fmin3((float)(above + 1),
                                (float)(row[j-1] + 1),
                                (float)(diag + cost));
            diag = above;
        }
    }

    dist = row[len2];
    free(row);
    return 1.0 - dist / fmax2((float)len1, (float)len2);
}
/* ---------------------------------------------------------------------- */
/*
 * uselesschar: tell whether a byte counts as blank for trim().
 *
 * Returns 1 for space, tab, newline, carriage return and for the two bytes
 * of a UTF-8 no-break space (0xC2 and 0xA0); returns 0 for anything else.
 *
 * The argument is reduced to an unsigned byte first. Callers pass plain
 * char, which may be signed: without the conversion the bytes 0xC2 and 0xA0
 * would arrive as negative values and never match.
 *
 * NOTE: 0xC2 and 0xA0 are matched independently of each other, so this is
 * only reliable on text that holds no other multi-byte UTF-8 sequences.
 */
static int uselesschar(int c)
{
    switch((unsigned char)c)
    {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        case 0xC2:
        case 0xA0:
            return 1;
        default:
            return 0;
    }
}
/* ---------------------------------------------------------------------- */
/*
 * trim: clean up the blanks of a string in place.
 *
 * Bytes for which uselesschar() is true are dropped at both ends of s, and
 * every run of them inside s is replaced by one single space. A NULL s is
 * ignored.
 */
static void trim(char *s)
{
    int rd, wr, gap;

    if(!s)
        return;

    /* skip the leading blanks */
    rd = 0;
    while(uselesschar(s[rd]))
        rd++;

    /* Copy the rest down. A run of blanks is only remembered; its single
     * space is written when the next kept byte shows up, so a run at the
     * very end of the string is never emitted. */
    wr = 0;
    gap = 0;
    for(; s[rd] != '\0'; rd++)
    {
        if(uselesschar(s[rd]))
        {
            gap = 1;
            continue;
        }
        if(gap)
        {
            s[wr++] = ' ';
            gap = 0;
        }
        s[wr++] = s[rd];
    }
    s[wr] = '\0';
}
/* ---------------------------------------------------------------------- */
/*
 * ulen: length in bytes of the UTF-8 sequence announced by lead byte c.
 *
 * Returns 2, 3 or 4 for the lead-byte patterns 110xxxxx, 1110xxxx and
 * 11110xxx. Every other byte (ASCII, continuation byte, invalid lead)
 * yields 1.
 */
static int ulen(unsigned char c)
{
    int width = 1;

    if((c >> 5) == 0x06)        /* 110xxxxx */
        width = 2;
    else if((c >> 4) == 0x0E)   /* 1110xxxx */
        width = 3;
    else if((c >> 3) == 0x1E)   /* 11110xxx */
        width = 4;
    return width;
}
/* ---------------------------------------------------------------------- */
/*
 * asciify: copy src into dest, turning UTF-8 text into plain ASCII.
 *
 *   - ASCII bytes are copied unchanged.
 *   - Characters listed in translit[] are replaced by the ASCII character
 *     at the same index of transclear[] (accented vowels, ordinal marks,
 *     cedilla, n-tilde, superscript digits, no-break space).
 *   - Any other multi-byte character becomes a single '?'.
 *   - A continuation byte with no lead byte in front of it is dropped.
 *   - A sequence cut short (by the end of the string or by a byte that is
 *     not a continuation byte) becomes '?' and only the bytes that really
 *     belong to it are consumed, so reading never goes past the terminator.
 *
 * At most LEVN_SBUFF bytes of src are examined and at most LEVN_SBUFF - 1
 * characters plus the terminating NUL are written.
 * NOTE(review): this assumes dest holds at least LEVN_SBUFF bytes; confirm
 * against the contract documented in lib11sht.h.
 *
 * Nothing is done if src or dest is NULL. For an empty src, dest becomes
 * the empty string.
 */
static void asciify(const char *src, char *dest)
{
    /* The two tables are index-aligned: translit[k] maps to transclear[k]. */
    static const char transclear[] =
        "AEIOUAEIOUAEIOUAEIOUAEIOU"
        "aeiouaeiouaeiouaeiouaeiou"
        "aoCcNn123"
        " "; /* NBSP -> space */
    static const char *const translit[] = {
        "Á","É","Í","Ó","Ú", "À","È","Ì","Ò","Ù",
        "Ã","\xE1\xBA\xBC","Ĩ","Õ","Ũ", "Â","Ê","Î","Ô","Û", /* \xE1\xBA\xBC: E with tilde */
        "Ä","Ë","Ï","Ö","Ü", "á","é","í","ó","ú",
        "à","è","ì","ò","ù", "ã","\xE1\xBA\xBD","ĩ","õ","ũ", /* \xE1\xBA\xBD: e with tilde */
        "â","ê","î","ô","û", "ä","ë","ï","ö","ü",
        "ª","º","Ç","ç","Ñ", "ñ","¹","²","³",
        "\xC2\xA0"}; /* NBSP */
    const int ntrans = (int)(sizeof(translit)/sizeof(translit[0]));
    int len, n, i, j, k, found;
    unsigned char lead;
    char ch[5]; /* one UTF-8 multibyte char, NUL-terminated */

    if(!src || !dest)
        return;

    i = j = 0;
    while(src[i] != '\0' && i < LEVN_SBUFF && j < LEVN_SBUFF - 1)
    {
        lead = (unsigned char)src[i];

        if((lead & 0xC0) == 0x80) /* stray UTF-8 continuation byte */
        {
            i++;
            continue;
        }
        if(lead < 0x80) /* ASCII */
        {
            dest[j++] = src[i++];
            continue;
        }

        /* Gather the lead byte plus the continuation bytes actually present.
         * The terminating NUL is not a continuation byte, so the scan stops
         * there at the latest. */
        len = ulen(lead);
        n = 1;
        while(n < len && ((unsigned char)src[i + n] & 0xC0) == 0x80)
            n++;
        memcpy(ch, &src[i], (size_t)n);
        ch[n] = '\0';

        found = 0;
        for(k = 0; k < ntrans; k++)
            if(strcmp(ch, translit[k]) == 0)
            {
                dest[j++] = transclear[k];
                found = 1;
                break;
            }
        if(!found)
            dest[j++] = '?';
        i += n;
    }
    dest[j] = '\0';
}
/* ---------------------------------------------------------------------- */
/* vi: set ai et ts=4 sw=4 tw=0 wm=0 fo=croql : C config for Vim modeline */