Compare commits
10 Commits
f1d8ff7de1
...
d48c8c1c36
| Author | SHA1 | Date | |
|---|---|---|---|
| d48c8c1c36 | |||
| 9bed332fe9 | |||
| 2888850684 | |||
| d2e5dfce38 | |||
| aabab06352 | |||
| af355f2661 | |||
| 1016b15f34 | |||
| 18c34e3b41 | |||
| 6efce8c601 | |||
| 29e4be2bad |
306
cmp11sht.c
306
cmp11sht.c
@@ -1,68 +1,34 @@
|
|||||||
/* ************************************************************************ *
|
/* ************************************************************************ *
|
||||||
* cmp11sht.c, v20251221.085434 *
|
* cmp11sht.c, v20251221.085434 *
|
||||||
* A fuzzy comparisson between values (floats or strings) *
|
* Fuzzy comparison CLI — thin shell over lib11sht *
|
||||||
* *
|
* *
|
||||||
* Copyright (C) 2025 by Ruben Carlo Benante *
|
* Copyright (C) 2025 by Ruben Carlo Benante <rcb@beco.cc> *
|
||||||
|
* GNU GPL version 2 or later. *
|
||||||
* *
|
* *
|
||||||
* This program is free software; you can redistribute it and/or modify *
|
* Build: gcc cmp11sht.c lib11sht.c -lm -o cmp11sht *
|
||||||
* it under the terms of the GNU General Public License as published by *
|
* ************************************************************************ */
|
||||||
* the Free Software Foundation; either version 2 of the License, or *
|
|
||||||
* (at your option) any later version. *
|
|
||||||
* *
|
|
||||||
* This program is distributed in the hope that it will be useful, *
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
|
||||||
* GNU General Public License for more details. *
|
|
||||||
* *
|
|
||||||
* You should have received a copy of the GNU General Public License *
|
|
||||||
* along with this program; if not, write to the *
|
|
||||||
* Free Software Foundation, Inc., *
|
|
||||||
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
|
|
||||||
* *
|
|
||||||
* To contact the author, please write to: *
|
|
||||||
* Ruben Carlo Benante <rcb@beco.cc> *
|
|
||||||
* Webpage: http://www.beco.cc *
|
|
||||||
* Phone: +55 (81) 3184-7555 *
|
|
||||||
* ************************************************************************ *
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <ctype.h>
|
|
||||||
#include <float.h>
|
#include <float.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
|
|
||||||
#define SBUFF 256 /* Max string size */
|
#include "lib11sht.h"
|
||||||
|
|
||||||
/* fuzzy comparisson */
|
|
||||||
int fequal(float a, float b, float delta); /* compare equallity of two float numbers */
|
|
||||||
int sequal(char *a, char *b, float thr, float *ratio, char *s1, char *s2); /* compare equality of two strings */
|
|
||||||
|
|
||||||
/* auxiliary functions */
|
|
||||||
int uselesschar(int c); /* check if it is a useless char */
|
|
||||||
void trim(char *s); /* remove trailing spaces and tabs */
|
|
||||||
void asciify(const char *src, char *dest); /* remove accents */
|
|
||||||
int ulen(unsigned char c); /* lenght in chars, not bytes, of a multibyte UTF8 string */
|
|
||||||
float shit11(char *s1, char *s2); /* levenshtein similarity */
|
|
||||||
float fmin3(float a, float b, float c); /* return the minimum */
|
|
||||||
float fmax2(float a, float b); /* return the maximum */
|
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
/* get two objets a and b (float or string) and a delta (float)
|
/* get two objects a and b (float or string) and a delta (float)
|
||||||
* return -1 if a < b, within range delta
|
* Translates the library's {-1, 0, +1} return convention to CLI exit codes:
|
||||||
* return 0 if a ~ b, within range delta
|
* return 0 if a ~ b, within range delta
|
||||||
* return +1 if a > b, within range delta
|
* return +1 if a > b, within range delta
|
||||||
* return -2 if an error occurred
|
* return +2 if a < b, within range delta
|
||||||
|
* return +3 if an error occurred
|
||||||
*/
|
*/
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
int res; /* comparisson results */
|
int res; /* comparison result (CLI exit code) */
|
||||||
char *fenda, *fendb, *fendd;
|
char *fenda, *fendb, *fendd;
|
||||||
char s1[SBUFF], s2[SBUFF];
|
char s1[LEVN_SBUFF], s2[LEVN_SBUFF];
|
||||||
float a, b, delta, ratio;
|
float a, b, delta, ratio;
|
||||||
int opt=0; /* -no-option:0, -v:1, -o:2, -n:3 */
|
int opt=0; /* -no-option:0, -v:1, -o:2, -n:3 */
|
||||||
|
|
||||||
@@ -72,11 +38,11 @@ int main(int argc, char **argv)
|
|||||||
"cmp11sht - Fuzzy compare strings or floats within range\n\n\
|
"cmp11sht - Fuzzy compare strings or floats within range\n\n\
|
||||||
Usage:\n\t$ cmp11sht -h\n\
|
Usage:\n\t$ cmp11sht -h\n\
|
||||||
\t$ cmp11sht o1 o2 delta [-v|-o|-n]\n\n\
|
\t$ cmp11sht o1 o2 delta [-v|-o|-n]\n\n\
|
||||||
After getting two objets o1 and o2 (float or string)\n\
|
After getting two objects o1 and o2 (float or string)\n\
|
||||||
and a FLT_MIN < delta < 1.0 (float), where:\n\
|
and a FLT_MIN < delta < 1.0 (float), where:\n\
|
||||||
- For strings, delta is the Levenshtein ratio\n\
|
- For strings, delta is the Levenshtein ratio\n\
|
||||||
- For floats, delta is the precision\n\
|
- For floats, delta is the precision\n\
|
||||||
the comparisson will:\n\n\
|
the comparison will:\n\n\
|
||||||
* return 0 if a ~ b, within range delta\n\
|
* return 0 if a ~ b, within range delta\n\
|
||||||
* return +1 if a > b, within range delta\n\
|
* return +1 if a > b, within range delta\n\
|
||||||
* return +2 if a < b, within range delta\n\
|
* return +2 if a < b, within range delta\n\
|
||||||
@@ -84,7 +50,7 @@ the comparisson will:\n\n\
|
|||||||
Options:\n\t-h Print this help\n\
|
Options:\n\t-h Print this help\n\
|
||||||
\t-v Print result to stdout (default is system err)\n\
|
\t-v Print result to stdout (default is system err)\n\
|
||||||
\t-o Print also the Levenshtein ratio or float difference\n\
|
\t-o Print also the Levenshtein ratio or float difference\n\
|
||||||
\t-n Print also the normalized strings or floats used for comparisson\n\n\
|
\t-n Print also the normalized strings or floats used for comparison\n\n\
|
||||||
cmp11sht v20251221.0718 (C) 2025 by Ruben C. Benante (MIT Lic)\n\n"
|
cmp11sht v20251221.0718 (C) 2025 by Ruben C. Benante (MIT Lic)\n\n"
|
||||||
);
|
);
|
||||||
return 3;
|
return 3;
|
||||||
@@ -103,27 +69,28 @@ cmp11sht v20251221.0718 (C) 2025 by Ruben C. Benante (MIT Lic)\n\n"
|
|||||||
default: return 3;
|
default: return 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(fendd == argv[3]) // error, need a threshold
|
if(fendd == argv[3]) /* error, need a threshold */
|
||||||
{
|
{
|
||||||
if(opt) printf("%d\n", 3);
|
if(opt) printf("%d\n", 3);
|
||||||
return 3;
|
return 3;
|
||||||
}
|
}
|
||||||
if(delta <= FLT_MIN) // near zero delta not accepted
|
if(delta <= FLT_MIN) /* near zero delta not accepted */
|
||||||
{
|
{
|
||||||
if(opt) printf("%d\n", 3);
|
if(opt) printf("%d\n", 3);
|
||||||
return 3;
|
return 3;
|
||||||
}
|
}
|
||||||
if(delta >= 1.0) // maximum precision 1.0
|
if(delta >= 1.0) /* maximum precision 1.0 */
|
||||||
{
|
{
|
||||||
if(opt) printf("%d\n", 3);
|
if(opt) printf("%d\n", 3);
|
||||||
return 3;
|
return 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(fenda == argv[1] || fendb == argv[2]) // string
|
if(fenda == argv[1] || fendb == argv[2]) /* string */
|
||||||
{
|
{
|
||||||
if(opt==3) printf("cmp11sht: string\n");
|
if(opt==3) printf("cmp11sht: string\n");
|
||||||
errno = 0;
|
errno = 0;
|
||||||
res = sequal(argv[1], argv[2], delta, &ratio, s1, s2);
|
res = sequal_full(argv[1], argv[2], delta, &ratio,
|
||||||
|
s1, LEVN_SBUFF, s2, LEVN_SBUFF);
|
||||||
if(errno == EINVAL)
|
if(errno == EINVAL)
|
||||||
res = 3; /* error → CLI exit 3 */
|
res = 3; /* error → CLI exit 3 */
|
||||||
else if(res > 0)
|
else if(res > 0)
|
||||||
@@ -137,7 +104,7 @@ cmp11sht v20251221.0718 (C) 2025 by Ruben C. Benante (MIT Lic)\n\n"
|
|||||||
if(opt>=2) printf("%f\n", ratio);
|
if(opt>=2) printf("%f\n", ratio);
|
||||||
if(opt==3) printf("s1: %s\ns2: %s\n", s1, s2);
|
if(opt==3) printf("s1: %s\ns2: %s\n", s1, s2);
|
||||||
}
|
}
|
||||||
else // float
|
else /* float */
|
||||||
{
|
{
|
||||||
if(opt==3) printf("cmp11sht: float\n");
|
if(opt==3) printf("cmp11sht: float\n");
|
||||||
res = fequal(a, b, delta);
|
res = fequal(a, b, delta);
|
||||||
@@ -156,234 +123,5 @@ cmp11sht v20251221.0718 (C) 2025 by Ruben C. Benante (MIT Lic)\n\n"
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
|
||||||
/* compare equallity of two float numbers within an error margin delta.
|
|
||||||
* Return 0 if equal within the error margin,
|
|
||||||
* -1 if a < b-delta and
|
|
||||||
* +1 if a > b+delta
|
|
||||||
*/
|
|
||||||
int fequal(float a, float b, float delta)
|
|
||||||
{
|
|
||||||
if(a < b - delta)
|
|
||||||
return -1;
|
|
||||||
if(a > b + delta)
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
/* b-delta <= a <= b+delta */
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
|
||||||
/* compare similarity between two strings.
|
|
||||||
* Return:
|
|
||||||
* 0 if equal or similar above given threshold
|
|
||||||
* -1 if a < b alphabetically (after normalization)
|
|
||||||
* +1 if a > b alphabetically (after normalization)
|
|
||||||
* On error: sets errno = EINVAL and returns 0; result is undefined.
|
|
||||||
* Caller must reset errno = 0 before the call to detect errors.
|
|
||||||
*/
|
|
||||||
int sequal(char *a, char *b, float thr, float *ratio, char *s1, char *s2)
|
|
||||||
{
|
|
||||||
int i;
|
|
||||||
|
|
||||||
if(!a || !b || !s1 || !s2 || !ratio)
|
|
||||||
{
|
|
||||||
errno = EINVAL;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// remove accents
|
|
||||||
asciify(a, s1);
|
|
||||||
asciify(b, s2);
|
|
||||||
|
|
||||||
// trim spaces
|
|
||||||
trim(s1);
|
|
||||||
trim(s2);
|
|
||||||
|
|
||||||
// lowercase
|
|
||||||
for(i=0; i<SBUFF && s1[i]; i++)
|
|
||||||
s1[i] = (char)tolower((unsigned char)s1[i]);
|
|
||||||
for(i=0; i<SBUFF && s2[i]; i++)
|
|
||||||
s2[i] = (char)tolower((unsigned char)s2[i]);
|
|
||||||
|
|
||||||
i=strcmp(s1, s2);
|
|
||||||
if(!i)
|
|
||||||
{
|
|
||||||
*ratio=1.0;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
*ratio = shit11(s1, s2);
|
|
||||||
if(*ratio > thr)
|
|
||||||
return 0;
|
|
||||||
return (i < 0)? -1 : 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
|
||||||
float shit11(char *s1, char *s2)
|
|
||||||
{
|
|
||||||
int len1, len2;
|
|
||||||
int i, j, cost;
|
|
||||||
|
|
||||||
len1 = strlen(s1);
|
|
||||||
len2 = strlen(s2);
|
|
||||||
|
|
||||||
if(len1 == 0 || len2 == 0)
|
|
||||||
return 0.0;
|
|
||||||
|
|
||||||
int d[len1+1][len2+1];
|
|
||||||
|
|
||||||
for(i=0; i<=len1; i++)
|
|
||||||
d[i][0] = i;
|
|
||||||
for(j=0; j<=len2; j++)
|
|
||||||
d[0][j] = j;
|
|
||||||
|
|
||||||
for(i=1; i <= len1; i++)
|
|
||||||
for(j=1; j <= len2; j++)
|
|
||||||
{
|
|
||||||
cost = (s1[i-1] == s2[j-1])? 0 : 1;
|
|
||||||
d[i][j] = fmin3(d[i-1][j] + 1, d[i][j-1] + 1, d[i-1][j-1] + cost);
|
|
||||||
}
|
|
||||||
return 1.0 - d[len1][len2] / fmax2(len1, len2);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
|
||||||
/* return the minimum */
|
|
||||||
float fmin3(float a, float b, float c)
|
|
||||||
{
|
|
||||||
float m=a;
|
|
||||||
if(b<m)
|
|
||||||
m=b;
|
|
||||||
if(c<m)
|
|
||||||
m=c;
|
|
||||||
return m;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
|
||||||
/* return the maximum */
|
|
||||||
float fmax2(float a, float b)
|
|
||||||
{
|
|
||||||
return (a>b)? a : b;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
|
||||||
/* check if it is a useless char */
|
|
||||||
int uselesschar(int c)
|
|
||||||
{
|
|
||||||
if(c == ' ') return 1;
|
|
||||||
if(c == '\t') return 1;
|
|
||||||
if(c == '\n') return 1;
|
|
||||||
if(c == '\r') return 1;
|
|
||||||
if(c == 0xC2) return 1;
|
|
||||||
if(c == 0xA0) return 1;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
|
||||||
/* remove leading/trailing spaces and tabs */
|
|
||||||
void trim(char *s)
|
|
||||||
{
|
|
||||||
int len, i, j, fin;
|
|
||||||
|
|
||||||
if(!s)
|
|
||||||
return;
|
|
||||||
/* remove leading spaces */
|
|
||||||
i = 0;
|
|
||||||
while(uselesschar(s[i]))
|
|
||||||
i++;
|
|
||||||
if(i > 0)
|
|
||||||
memmove(s, s + i, strlen(s + i) + 1);
|
|
||||||
/* remove trailing spaces */
|
|
||||||
len = strlen(s);
|
|
||||||
while(len > 0 && uselesschar(s[len - 1]))
|
|
||||||
s[--len] = '\0';
|
|
||||||
/* remove double spaces in between */
|
|
||||||
i=j=0;
|
|
||||||
while(s[i] != '\0')
|
|
||||||
{
|
|
||||||
if(uselesschar(s[i]))
|
|
||||||
{
|
|
||||||
if(!fin)
|
|
||||||
{
|
|
||||||
s[j++] = ' ';
|
|
||||||
fin = 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
s[j++] = s[i];
|
|
||||||
fin = 0;
|
|
||||||
}
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
s[j] = '\0';
|
|
||||||
}
|
|
||||||
|
|
||||||
int ulen(unsigned char c)
|
|
||||||
{
|
|
||||||
if((c & 0xE0) == 0xC0) return 2; /* UTF8 lead 2 bytes 110xxxxx */
|
|
||||||
if((c & 0xF0) == 0xE0) return 3; /* UTF8 lead 3 bytes 1110xxxx */
|
|
||||||
if((c & 0xF8) == 0xF0) return 4; /* UTF8 lead 4 bytes 11110xxx */
|
|
||||||
return 1; /* ASCII or invalid or UTF continution byte */
|
|
||||||
}
|
|
||||||
|
|
||||||
void asciify(const char *src, char *dest)
|
|
||||||
{
|
|
||||||
int len, i, k, j, found;
|
|
||||||
char ch[5]; // UTF8 multibyte char
|
|
||||||
const char transclear[] =
|
|
||||||
"AEIOUAEIOUAEIOUAEIOUAEIOU"
|
|
||||||
"aeiouaeiouaeiouaeiouaeiou"
|
|
||||||
"aoCcNn123"
|
|
||||||
" "; /* NBSP → space */
|
|
||||||
const char *translit[] = {
|
|
||||||
"Á","É","Í","Ó","Ú", "À","È","Ì","Ò","Ù",
|
|
||||||
"Ã","Ẽ","Ĩ","Õ","Ũ", "Â","Ê","Î","Ô","Û",
|
|
||||||
"Ä","Ë","Ï","Ö","Ü", "á","é","í","ó","ú",
|
|
||||||
"à","è","ì","ò","ù", "ã","ẽ","ĩ","õ","ũ",
|
|
||||||
"â","ê","î","ô","û", "ä","ë","ï","ö","ü",
|
|
||||||
"ª","º","Ç","ç","Ñ", "ñ","¹","²","³",
|
|
||||||
"\xC2\xA0"}; /* NBSP */
|
|
||||||
|
|
||||||
if(!src || !dest)
|
|
||||||
return;
|
|
||||||
if(*src=='\0')
|
|
||||||
return;
|
|
||||||
|
|
||||||
j=i=0;
|
|
||||||
while(src[i]!='\0' && i < SBUFF)
|
|
||||||
{
|
|
||||||
if(((unsigned char)src[i] & 0xc0) == 0x80) // non ASCII, UTF continuation char
|
|
||||||
{
|
|
||||||
i++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
len = ulen((unsigned char)src[i]);
|
|
||||||
memcpy(ch, &src[i], len);
|
|
||||||
ch[len]='\0';
|
|
||||||
if(((unsigned char)src[i]) < 0x80) /* ASCII */
|
|
||||||
{
|
|
||||||
dest[j++] = ch[0];
|
|
||||||
i++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
found = 0;
|
|
||||||
for(k=0; k<(int)(sizeof(translit)/sizeof(translit[0])); k++)
|
|
||||||
if(strcmp(ch, translit[k])==0)
|
|
||||||
{
|
|
||||||
dest[j++] = transclear[k];
|
|
||||||
found = 1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if(!found)
|
|
||||||
dest[j++] = '?';
|
|
||||||
i+=len;
|
|
||||||
}
|
|
||||||
dest[j]='\0';
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
/* vi: set ai et ts=4 sw=4 tw=0 wm=0 fo=croql : C config for Vim modeline */
|
/* vi: set ai et ts=4 sw=4 tw=0 wm=0 fo=croql : C config for Vim modeline */
|
||||||
/* Template by Dr. Beco <rcb at beco dot cc> Version 20160612.142044 */
|
|
||||||
|
|
||||||
|
|||||||
275
lib11sht.c
Normal file
275
lib11sht.c
Normal file
@@ -0,0 +1,275 @@
|
|||||||
|
/* ************************************************************************ *
|
||||||
|
* lib11sht.c, v1.0 *
|
||||||
|
* Fuzzy comparison library implementation *
|
||||||
|
* *
|
||||||
|
* Copyright (C) 2025 by Ruben Carlo Benante <rcb@beco.cc> *
|
||||||
|
* GNU GPL version 2 or later. *
|
||||||
|
* ************************************************************************ */
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <ctype.h>
|
||||||
|
#include <errno.h>
|
||||||
|
|
||||||
|
#include "lib11sht.h"
|
||||||
|
|
||||||
|
/* Internal helpers — kept private to this translation unit. */
|
||||||
|
static int uselesschar(int c);
|
||||||
|
/* trim is public (declared in lib11sht.h) */
|
||||||
|
static void asciify(const char *src, char *dest, size_t dest_size);
|
||||||
|
static int ulen(unsigned char c);
|
||||||
|
static float shit11(char *s1, char *s2);
|
||||||
|
static float fmin3(float a, float b, float c);
|
||||||
|
static float fmax2(float a, float b);
|
||||||
|
|
||||||
|
/* ---------------------------------------------------------------------- */
|
||||||
|
/* Three-way comparison of two floats with a tolerance.
 * Returns -1 when a lies below b - delta, +1 when a lies above b + delta,
 * and 0 otherwise. A NaN operand makes both tests false, so it also yields 0. */
int fequal(float a, float b, float delta)
|
||||||
|
{
|
||||||
|
if(a < b - delta)
|
||||||
|
return -1;
|
||||||
|
if(a > b + delta)
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
/* b-delta <= a <= b+delta */
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------------------------------------------------------------------- */
|
||||||
|
/* Simple wrapper: most callers don't need the ratio or normalized buffers.
|
||||||
|
* Symmetric with fequal(a, b, delta). */
|
||||||
|
/* Forwards to sequal_full() with local scratch storage: the similarity ratio
 * and both normalized strings (LEVN_SBUFF bytes each) are discarded, and
 * sequal_full()'s return value is passed through unchanged. */
int sequal(char *a, char *b, float shold)
|
||||||
|
{
|
||||||
|
float ratio;
|
||||||
|
char s1[LEVN_SBUFF], s2[LEVN_SBUFF];
|
||||||
|
return sequal_full(a, b, shold, &ratio, s1, LEVN_SBUFF, s2, LEVN_SBUFF);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------------------------------------------------------------------- */
|
||||||
|
/* Fuzzy string comparison that also reports its diagnostics.
 *
 * Both inputs are normalized (asciify, then trim, then lowercase) into the
 * caller's buffers and compared:
 *   - identical after normalization: *ratio = 1.0, returns 0;
 *   - shold >= 1.0 (strict mode):    *ratio = 0.0, returns the strcmp sign;
 *   - otherwise *ratio = shit11(s1, s2); returns 0 when *ratio > shold,
 *     else -1 / +1 according to strcmp(s1, s2).
 *
 * Parameters:
 *   a, b      NUL-terminated input strings
 *   shold     similarity threshold; ratios strictly above it count as equal
 *   ratio     out: similarity ratio as described above
 *   s1, s2    out: normalized copies of a and b
 *   s1_size   capacity of s1 in bytes
 *   s2_size   capacity of s2 in bytes
 *
 * On a NULL pointer or a zero buffer size: sets errno = EINVAL and returns 0.
 * When ratio itself is valid it is set to 0.0 so the caller never reads an
 * indeterminate value. The caller must clear errno before the call to detect
 * the error.
 */
int sequal_full(char *a, char *b, float shold, float *ratio,
                char *s1, size_t s1_size,
                char *s2, size_t s2_size)
{
    size_t i;
    int cmp;

    if(!a || !b || !s1 || !s2 || !ratio || s1_size == 0 || s2_size == 0)
    {
        if(ratio)
            *ratio = 0.0f; /* do not hand back an uninitialized ratio */
        errno = EINVAL;
        return 0;
    }

    /* remove accents; asciify NUL-terminates within the given size */
    asciify(a, s1, s1_size);
    asciify(b, s2, s2_size);

    /* strip leading/trailing whitespace and collapse internal runs */
    trim(s1);
    trim(s2);

    /* lowercase, bounded by each buffer's actual size */
    for(i = 0; i < s1_size && s1[i]; i++)
        s1[i] = (char)tolower((unsigned char)s1[i]);
    for(i = 0; i < s2_size && s2[i]; i++)
        s2[i] = (char)tolower((unsigned char)s2[i]);

    cmp = strcmp(s1, s2);
    if(cmp == 0)
    {
        *ratio = 1.0f;
        return 0;
    }

    if(shold >= 1.0f)
    {
        /* strict mode: the ratio never exceeds 1.0, so "ratio > shold" can
         * not hold for shold >= 1.0. Skip the Levenshtein computation. */
        *ratio = 0.0f;
        return (cmp < 0) ? -1 : 1;
    }

    *ratio = shit11(s1, s2);
    if(*ratio > shold)
        return 0;
    return (cmp < 0) ? -1 : 1;
}
|
||||||
|
|
||||||
|
/* ---------------------------------------------------------------------- */
|
||||||
|
/* Levenshtein similarity ratio 0.0..1.0 */
|
||||||
|
static float shit11(char *s1, char *s2)
|
||||||
|
{
|
||||||
|
int len1, len2;
|
||||||
|
int i, j, cost;
|
||||||
|
|
||||||
|
/* strnlen guards against missing NUL (no read past LEVN_SBUFF) */
|
||||||
|
len1 = (int)strnlen(s1, LEVN_SBUFF);
|
||||||
|
len2 = (int)strnlen(s2, LEVN_SBUFF);
|
||||||
|
|
||||||
|
if(len1 == 0 && len2 == 0)
|
||||||
|
return 1.0; /* both empty == identical */
|
||||||
|
if(len1 == 0 || len2 == 0)
|
||||||
|
return 0.0; /* one empty == no similarity */
|
||||||
|
|
||||||
|
/* Defensive cap: refuse to allocate too-large VLA. Bounds the stack
|
||||||
|
* matrix at LEVN_SBUFF × LEVN_SBUFF × sizeof(int) = ~256 KB worst case. */
|
||||||
|
if(len1 >= LEVN_SBUFF || len2 >= LEVN_SBUFF)
|
||||||
|
return 0.0; /* treat as "completely dissimilar" */
|
||||||
|
|
||||||
|
int d[len1+1][len2+1];
|
||||||
|
|
||||||
|
for(i=0; i<=len1; i++)
|
||||||
|
d[i][0] = i;
|
||||||
|
for(j=0; j<=len2; j++)
|
||||||
|
d[0][j] = j;
|
||||||
|
|
||||||
|
for(i=1; i <= len1; i++)
|
||||||
|
for(j=1; j <= len2; j++)
|
||||||
|
{
|
||||||
|
cost = (s1[i-1] == s2[j-1])? 0 : 1;
|
||||||
|
d[i][j] = fmin3(d[i-1][j] + 1, d[i][j-1] + 1, d[i-1][j-1] + cost);
|
||||||
|
}
|
||||||
|
return 1.0 - d[len1][len2] / fmax2(len1, len2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------------------------------------------------------------------- */
|
||||||
|
/* Return the smallest of the three values a, b and c. */
static float fmin3(float a, float b, float c)
|
||||||
|
{
|
||||||
|
float m=a; /* running minimum, starts at a */
|
||||||
|
if(b<m)
|
||||||
|
m=b;
|
||||||
|
if(c<m)
|
||||||
|
m=c;
|
||||||
|
return m;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------------------------------------------------------------------- */
|
||||||
|
/* Return the larger of a and b (b when they compare equal). */
static float fmax2(float a, float b)
|
||||||
|
{
|
||||||
|
return (a>b)? a : b;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------------------------------------------------------------------- */
|
||||||
|
/* Tell whether c is a byte that trimming should discard.
 *
 * Returns 1 for space, tab, newline, carriage return and for the two bytes
 * of a UTF-8 no-break space (0xC2, 0xA0); returns 0 for anything else.
 *
 * Callers pass plain char values. Where char is signed, the bytes 0xC2 and
 * 0xA0 arrive as negative ints, so values in the signed-char range are first
 * mapped back to 0..255 before comparing.
 */
static int uselesschar(int c)
{
    if(c < 0 && c >= -128)
        c += 256; /* undo sign extension of a byte >= 0x80 */

    switch(c)
    {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        case 0xC2:
        case 0xA0:
            return 1;
        default:
            return 0;
    }
}
|
||||||
|
|
||||||
|
/* ---------------------------------------------------------------------- */
|
||||||
|
/* remove leading/trailing whitespace + collapse internal multiple whitespace */
|
||||||
|
/* Normalize whitespace in s, in place.
 *
 * Leading and trailing bytes for which uselesschar() is true are removed,
 * and every internal run of such bytes is replaced by one ASCII space.
 * s must be NUL-terminated; a NULL pointer is ignored.
 *
 * Done in a single pass with size_t indices: the write index never passes
 * the read index, so no memmove or strlen is needed and no length is
 * narrowed to int.
 */
void trim(char *s)
{
    size_t rd = 0; /* next byte to read */
    size_t wr = 0; /* next byte to write */
    int gap = 0;   /* a whitespace run was seen since the last kept byte */

    if(!s)
        return;

    /* skip leading whitespace (uselesschar('\0') is 0, so this stops at NUL) */
    while(uselesschar(s[rd]))
        rd++;

    for(; s[rd] != '\0'; rd++)
    {
        if(uselesschar(s[rd]))
        {
            gap = 1;
            continue;
        }
        /* A space is emitted only in front of a kept byte, so a trailing
         * whitespace run never reaches the output. */
        if(gap)
        {
            s[wr++] = ' ';
            gap = 0;
        }
        s[wr++] = s[rd];
    }
    s[wr] = '\0';
}
|
||||||
|
|
||||||
|
/* ---------------------------------------------------------------------- */
|
||||||
|
/* Length in bytes of the UTF-8 sequence announced by lead byte c.
 * Returns 2, 3 or 4 for the corresponding lead-byte bit patterns and 1 for
 * every other value (ASCII, continuation bytes, invalid lead bytes). */
static int ulen(unsigned char c)
|
||||||
|
{
|
||||||
|
if((c & 0xE0) == 0xC0) return 2; /* UTF8 lead 2 bytes 110xxxxx */
|
||||||
|
if((c & 0xF0) == 0xE0) return 3; /* UTF8 lead 3 bytes 1110xxxx */
|
||||||
|
if((c & 0xF8) == 0xF0) return 4; /* UTF8 lead 4 bytes 11110xxx */
|
||||||
|
return 1; /* ASCII or invalid or UTF continuation byte */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------------------------------------------------------------------- */
|
||||||
|
static void asciify(const char *src, char *dest, size_t dest_size)
|
||||||
|
{
|
||||||
|
int len, i, k, j, found;
|
||||||
|
char ch[5]; /* UTF8 multibyte char */
|
||||||
|
const char transclear[] =
|
||||||
|
"AEIOUAEIOUAEIOUAEIOUAEIOU"
|
||||||
|
"aeiouaeiouaeiouaeiouaeiou"
|
||||||
|
"aoCcNn123"
|
||||||
|
" "; /* NBSP → space */
|
||||||
|
const char *translit[] = {
|
||||||
|
"Á","É","Í","Ó","Ú", "À","È","Ì","Ò","Ù",
|
||||||
|
"Ã","Ẽ","Ĩ","Õ","Ũ", "Â","Ê","Î","Ô","Û",
|
||||||
|
"Ä","Ë","Ï","Ö","Ü", "á","é","í","ó","ú",
|
||||||
|
"à","è","ì","ò","ù", "ã","ẽ","ĩ","õ","ũ",
|
||||||
|
"â","ê","î","ô","û", "ä","ë","ï","ö","ü",
|
||||||
|
"ª","º","Ç","ç","Ñ", "ñ","¹","²","³",
|
||||||
|
"\xC2\xA0"}; /* NBSP */
|
||||||
|
|
||||||
|
if(!src || !dest || dest_size == 0)
|
||||||
|
return;
|
||||||
|
if(*src=='\0')
|
||||||
|
{
|
||||||
|
dest[0] = '\0'; /* maintain "dest is a valid C string" contract */
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
j=i=0;
|
||||||
|
/* leave room for the final NUL: writes capped at dest_size-1 */
|
||||||
|
while(src[i]!='\0' && i < LEVN_SBUFF && j < (int)dest_size - 1)
|
||||||
|
{
|
||||||
|
if(((unsigned char)src[i] & 0xc0) == 0x80) /* non ASCII, UTF continuation char */
|
||||||
|
{
|
||||||
|
i++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
len = ulen((unsigned char)src[i]);
|
||||||
|
memcpy(ch, &src[i], len);
|
||||||
|
ch[len]='\0';
|
||||||
|
if(((unsigned char)src[i]) < 0x80) /* ASCII */
|
||||||
|
{
|
||||||
|
dest[j++] = ch[0];
|
||||||
|
i++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
found = 0;
|
||||||
|
for(k=0; k<(int)(sizeof(translit)/sizeof(translit[0])); k++)
|
||||||
|
if(strcmp(ch, translit[k])==0)
|
||||||
|
{
|
||||||
|
dest[j++] = transclear[k];
|
||||||
|
found = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if(!found)
|
||||||
|
dest[j++] = '?';
|
||||||
|
i+=len;
|
||||||
|
}
|
||||||
|
dest[j]='\0';
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------------------------------------------------------------------- */
|
||||||
|
/* vi: set ai et ts=4 sw=4 tw=0 wm=0 fo=croql : C config for Vim modeline */
|
||||||
34
lib11sht.h
34
lib11sht.h
@@ -9,11 +9,14 @@
|
|||||||
#ifndef LIB11SHT_H
|
#ifndef LIB11SHT_H
|
||||||
#define LIB11SHT_H
|
#define LIB11SHT_H
|
||||||
|
|
||||||
#define LEVN_SBUFF 256 /* min size for s1/s2 buffers and bounded input */
|
#include <stddef.h> /* size_t */
|
||||||
|
|
||||||
|
#define LEVN_SBUFF 256 /* recommended size for s1/s2 buffers */
|
||||||
|
|
||||||
/* Compare similarity between two strings (after asciify + trim + lowercase).
|
/* Compare similarity between two strings (after asciify + trim + lowercase).
|
||||||
|
* Symmetric in shape with fequal(a, b, delta).
|
||||||
* Returns:
|
* Returns:
|
||||||
* 0 if equal or similar above the given threshold ratio
|
* 0 if equal or similar above the Levenshtein-ratio threshold (shold)
|
||||||
* -1 if a < b alphabetically (after normalization)
|
* -1 if a < b alphabetically (after normalization)
|
||||||
* +1 if a > b alphabetically (after normalization)
|
* +1 if a > b alphabetically (after normalization)
|
||||||
* On error: sets errno = EINVAL and returns 0; comparison result is undefined.
|
* On error: sets errno = EINVAL and returns 0; comparison result is undefined.
|
||||||
@@ -21,12 +24,23 @@
|
|||||||
*
|
*
|
||||||
* Parameters:
|
* Parameters:
|
||||||
* a, b input strings (NUL-terminated, may contain UTF-8 accented Latin chars)
|
* a, b input strings (NUL-terminated, may contain UTF-8 accented Latin chars)
|
||||||
* thr Levenshtein similarity threshold 0.0..1.0; matches above this count as equal
|
* shold Levenshtein similarity threshold 0.0..1.0; matches above this count as equal
|
||||||
* ratio out: Levenshtein similarity 0.0..1.0 (1.0 on exact-after-normalize)
|
|
||||||
* s1, s2 out: caller-provided buffers (>= LEVN_SBUFF) filled with the
|
|
||||||
* normalized inputs (useful for diagnostics; can be ignored)
|
|
||||||
*/
|
*/
|
||||||
int sequal(char *a, char *b, float thr, float *ratio, char *s1, char *s2);
|
int sequal(char *a, char *b, float shold);
|
||||||
|
|
||||||
|
/* Full variant of sequal: same comparison but also returns diagnostics.
|
||||||
|
* Used by callers that need the computed ratio or the normalized strings
|
||||||
|
* (e.g. cmp11sht CLI's -o / -n flags).
|
||||||
|
*
|
||||||
|
* Extra parameters:
|
||||||
|
* ratio out: Levenshtein similarity 0.0..1.0 (1.0 on exact-after-normalize)
|
||||||
|
* s1, s2 out: caller-provided buffers filled with the normalized inputs
|
||||||
|
* s1_size size of s1 in bytes (writes capped at s1_size-1 + final NUL)
|
||||||
|
* s2_size size of s2 in bytes (writes capped at s2_size-1 + final NUL)
|
||||||
|
*/
|
||||||
|
int sequal_full(char *a, char *b, float shold, float *ratio,
|
||||||
|
char *s1, size_t s1_size,
|
||||||
|
char *s2, size_t s2_size);
|
||||||
|
|
||||||
/* Compare two floats within ±delta.
|
/* Compare two floats within ±delta.
|
||||||
* Returns:
|
* Returns:
|
||||||
@@ -36,4 +50,10 @@ int sequal(char *a, char *b, float thr, float *ratio, char *s1, char *s2);
|
|||||||
*/
|
*/
|
||||||
int fequal(float a, float b, float delta);
|
int fequal(float a, float b, float delta);
|
||||||
|
|
||||||
|
/* String trim: removes leading + trailing whitespace (including UTF-8
|
||||||
|
* NBSP bytes 0xC2 / 0xA0) AND collapses internal runs of whitespace
|
||||||
|
* to a single space. Modifies s in place. Caller's buffer must already
|
||||||
|
* be NUL-terminated. */
|
||||||
|
void trim(char *s);
|
||||||
|
|
||||||
#endif /* LIB11SHT_H */
|
#endif /* LIB11SHT_H */
|
||||||
|
|||||||
Reference in New Issue
Block a user