2026-05-22 19:41:10 -03:00
|
|
|
/* ************************************************************************ *
|
|
|
|
|
* lib11sht.c, v1.0 *
|
|
|
|
|
* Fuzzy comparison library implementation *
|
|
|
|
|
* *
|
|
|
|
|
* Copyright (C) 2025 by Ruben Carlo Benante <rcb@beco.cc> *
|
|
|
|
|
* GNU GPL version 2 or later. *
|
|
|
|
|
* ************************************************************************ */
|
|
|
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
#include <string.h>
|
|
|
|
|
#include <ctype.h>
|
|
|
|
|
#include <errno.h>
|
|
|
|
|
|
|
|
|
|
#include "lib11sht.h"
|
|
|
|
|
|
|
|
|
|
/* File-local helpers: internal linkage, not exported through lib11sht.h. */

/* -- text normalization -- */
static void asciify(const char *src, char *dest); /* UTF-8 accents -> plain ASCII */
static void trim(char *s);                        /* strip ends, collapse inner blanks */
static int uselesschar(int c);                    /* blank-like byte predicate */
static int ulen(unsigned char c);                 /* UTF-8 sequence length from lead byte */

/* -- similarity computation -- */
static float shit11(char *s1, char *s2);          /* Levenshtein similarity ratio */
static float fmin3(float a, float b, float c);    /* smallest of three values */
static float fmax2(float a, float b);             /* larger of two values */
|
|
|
|
|
|
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
|
/*
 * Three-way comparison of two floats with a tolerance band.
 *
 *   a, b   values to compare
 *   delta  half-width of the band around b that counts as "equal"
 *
 * Returns -1 when a lies below b - delta, 1 when a lies above b + delta,
 * and 0 otherwise (b - delta <= a <= b + delta).
 * The "below" test is evaluated first.
 */
int fequal(float a, float b, float delta)
{
    return (a < b - delta) ? -1
         : (a > b + delta) ?  1
         :                    0;
}
|
|
|
|
|
|
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
|
int sequal(char *a, char *b, float thr, float *ratio, char *s1, char *s2)
|
|
|
|
|
{
|
|
|
|
|
int i;
|
|
|
|
|
|
|
|
|
|
if(!a || !b || !s1 || !s2 || !ratio)
|
|
|
|
|
{
|
|
|
|
|
errno = EINVAL;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* remove accents */
|
|
|
|
|
asciify(a, s1);
|
|
|
|
|
asciify(b, s2);
|
|
|
|
|
|
|
|
|
|
/* trim spaces (leading, trailing, and internal collapse) */
|
|
|
|
|
trim(s1);
|
|
|
|
|
trim(s2);
|
|
|
|
|
|
|
|
|
|
/* lowercase */
|
|
|
|
|
for(i=0; i<LEVN_SBUFF && s1[i]; i++)
|
|
|
|
|
s1[i] = (char)tolower((unsigned char)s1[i]);
|
|
|
|
|
for(i=0; i<LEVN_SBUFF && s2[i]; i++)
|
|
|
|
|
s2[i] = (char)tolower((unsigned char)s2[i]);
|
|
|
|
|
|
|
|
|
|
i=strcmp(s1, s2);
|
|
|
|
|
if(!i)
|
|
|
|
|
{
|
|
|
|
|
*ratio=1.0;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
*ratio = shit11(s1, s2);
|
|
|
|
|
if(*ratio > thr)
|
|
|
|
|
return 0;
|
|
|
|
|
return (i < 0)? -1 : 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
|
/*
 * Levenshtein similarity ratio between two strings, compared byte by byte.
 *
 * Returns 1.0 when both strings are empty, 0.0 when exactly one is empty,
 * and otherwise 1 - distance / max(len1, len2), where distance is the
 * edit distance (insert, delete, substitute, each costing 1).
 *
 * Only two rows of the dynamic-programming table are kept, sized by the
 * shorter string, so stack usage grows linearly instead of with the
 * product of both lengths.
 */
static float shit11(char *s1, char *s2)
{
    size_t len1 = strlen(s1);
    size_t len2 = strlen(s2);
    size_t i, j;

    if(len1 == 0 && len2 == 0)
        return 1.0f; /* both empty == identical */
    if(len1 == 0 || len2 == 0)
        return 0.0f; /* one empty == no similarity */

    /* the distance is symmetric: keep the shorter string along the rows */
    if(len2 > len1)
    {
        char *tmps = s1;
        size_t tmpl = len1;

        s1 = s2;
        s2 = tmps;
        len1 = len2;
        len2 = tmpl;
    }

    int rowa[len2 + 1];
    int rowb[len2 + 1];
    int *prev = rowa; /* distances for the prefix s1[0..i-2] */
    int *curr = rowb; /* distances for the prefix s1[0..i-1] */

    /* empty prefix of s1 versus each prefix of s2: j insertions */
    for(j = 0; j <= len2; j++)
        prev[j] = (int)j;

    for(i = 1; i <= len1; i++)
    {
        int *swap;

        curr[0] = (int)i; /* prefix of s1 versus empty s2: i deletions */
        for(j = 1; j <= len2; j++)
        {
            int cost = (s1[i - 1] == s2[j - 1]) ? 0 : 1;

            curr[j] = (int)fmin3((float)(prev[j] + 1),         /* deletion */
                                 (float)(curr[j - 1] + 1),     /* insertion */
                                 (float)(prev[j - 1] + cost)); /* substitution */
        }
        swap = prev;
        prev = curr;
        curr = swap;
    }

    /* after the final swap the last computed row is in prev */
    return 1.0f - (float)prev[len2] / fmax2((float)len1, (float)len2);
}
|
|
|
|
|
|
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
|
/* Smallest of three floats; on ties the earlier argument is kept. */
static float fmin3(float a, float b, float c)
{
    float lowest = (b < a) ? b : a;

    return (c < lowest) ? c : lowest;
}
|
|
|
|
|
|
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
|
/* Larger of two floats; b is returned unless a is strictly greater. */
static float fmax2(float a, float b)
{
    if(a > b)
        return a;
    return b;
}
|
|
|
|
|
|
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
|
/*
 * Tell whether a byte counts as "useless" blank material.
 *
 *   c  a byte value; may come from a plain char (possibly negative where
 *      char is signed) or from an unsigned char
 *
 * Returns 1 for space, tab, newline, carriage return and for the bytes
 * 0xC2 and 0xA0 (the two bytes of a UTF-8 no-break space), 0 otherwise.
 *
 * The argument is folded to the unsigned char range first: a plain char
 * holding 0xC2 or 0xA0 is promoted to a negative int on signed-char
 * platforms and would otherwise never compare equal to those constants.
 */
static int uselesschar(int c)
{
    switch((unsigned char)c)
    {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        case 0xC2:
        case 0xA0:
            return 1;
        default:
            return 0;
    }
}
|
|
|
|
|
|
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
|
/*
 * Normalize blanks in place: leading and trailing "useless" characters
 * (as decided by uselesschar) are removed and every inner run of them is
 * replaced by a single space. A NULL pointer is ignored.
 */
static void trim(char *s)
{
    size_t rd = 0;   /* next byte to read */
    size_t wr = 0;   /* next byte to write; never ahead of rd */
    int gap = 0;     /* a blank run is pending since the last kept byte */

    if(s == NULL)
        return;

    /* skip the leading blanks */
    while(uselesschar(s[rd]))
        rd++;

    for(; s[rd] != '\0'; rd++)
    {
        if(uselesschar(s[rd]))
        {
            gap = 1;
            continue;
        }
        /* a pending run is emitted only when another kept byte follows,
         * which is what drops the trailing blanks */
        if(gap)
        {
            s[wr++] = ' ';
            gap = 0;
        }
        s[wr++] = s[rd];
    }
    s[wr] = '\0';
}
|
|
|
|
|
|
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
|
/*
 * Length in bytes of the UTF-8 sequence announced by a lead byte.
 *
 * Returns 2, 3 or 4 for the lead-byte ranges 110xxxxx, 1110xxxx and
 * 11110xxx; returns 1 for everything else (ASCII, continuation bytes
 * and invalid lead bytes).
 */
static int ulen(unsigned char c)
{
    if(c >= 0xC0 && c <= 0xDF)
        return 2;
    if(c >= 0xE0 && c <= 0xEF)
        return 3;
    if(c >= 0xF0 && c <= 0xF7)
        return 4;
    return 1;
}
|
|
|
|
|
|
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
|
static void asciify(const char *src, char *dest)
|
|
|
|
|
{
|
|
|
|
|
int len, i, k, j, found;
|
|
|
|
|
char ch[5]; /* UTF8 multibyte char */
|
|
|
|
|
const char transclear[] =
|
|
|
|
|
"AEIOUAEIOUAEIOUAEIOUAEIOU"
|
|
|
|
|
"aeiouaeiouaeiouaeiouaeiou"
|
|
|
|
|
"aoCcNn123"
|
|
|
|
|
" "; /* NBSP → space */
|
|
|
|
|
const char *translit[] = {
|
|
|
|
|
"Á","É","Í","Ó","Ú", "À","È","Ì","Ò","Ù",
|
|
|
|
|
"Ã","Ẽ","Ĩ","Õ","Ũ", "Â","Ê","Î","Ô","Û",
|
|
|
|
|
"Ä","Ë","Ï","Ö","Ü", "á","é","í","ó","ú",
|
|
|
|
|
"à","è","ì","ò","ù", "ã","ẽ","ĩ","õ","ũ",
|
|
|
|
|
"â","ê","î","ô","û", "ä","ë","ï","ö","ü",
|
|
|
|
|
"ª","º","Ç","ç","Ñ", "ñ","¹","²","³",
|
|
|
|
|
"\xC2\xA0"}; /* NBSP */
|
|
|
|
|
|
|
|
|
|
if(!src || !dest)
|
|
|
|
|
return;
|
|
|
|
|
if(*src=='\0')
|
2026-05-22 19:48:12 -03:00
|
|
|
{
|
|
|
|
|
dest[0] = '\0'; /* maintain "dest is a valid C string" contract */
|
2026-05-22 19:41:10 -03:00
|
|
|
return;
|
2026-05-22 19:48:12 -03:00
|
|
|
}
|
2026-05-22 19:41:10 -03:00
|
|
|
|
|
|
|
|
j=i=0;
|
|
|
|
|
while(src[i]!='\0' && i < LEVN_SBUFF)
|
|
|
|
|
{
|
|
|
|
|
if(((unsigned char)src[i] & 0xc0) == 0x80) /* non ASCII, UTF continuation char */
|
|
|
|
|
{
|
|
|
|
|
i++;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
len = ulen((unsigned char)src[i]);
|
|
|
|
|
memcpy(ch, &src[i], len);
|
|
|
|
|
ch[len]='\0';
|
|
|
|
|
if(((unsigned char)src[i]) < 0x80) /* ASCII */
|
|
|
|
|
{
|
|
|
|
|
dest[j++] = ch[0];
|
|
|
|
|
i++;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
found = 0;
|
|
|
|
|
for(k=0; k<(int)(sizeof(translit)/sizeof(translit[0])); k++)
|
|
|
|
|
if(strcmp(ch, translit[k])==0)
|
|
|
|
|
{
|
|
|
|
|
dest[j++] = transclear[k];
|
|
|
|
|
found = 1;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
if(!found)
|
|
|
|
|
dest[j++] = '?';
|
|
|
|
|
i+=len;
|
|
|
|
|
}
|
|
|
|
|
dest[j]='\0';
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
|
/* vi: set ai et ts=4 sw=4 tw=0 wm=0 fo=croql : C config for Vim modeline */
|