From 29e4be2bad58a59f7c81974ced1cd0fedea041ef Mon Sep 17 00:00:00 2001 From: Ruben Carlo Benante Date: Fri, 22 May 2026 19:41:10 -0300 Subject: [PATCH] IMPROVES-i12 lib11sht.c added --- cmp11sht.c | 303 ++++------------------------------------------------- lib11sht.c | 241 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 261 insertions(+), 283 deletions(-) create mode 100644 lib11sht.c diff --git a/cmp11sht.c b/cmp11sht.c index 4b2fd64..7d9018d 100644 --- a/cmp11sht.c +++ b/cmp11sht.c @@ -1,68 +1,34 @@ /* ************************************************************************ * * cmp11sht.c, v20251221.085434 * - * A fuzzy comparisson between values (floats or strings) * + * Fuzzy comparison CLI — thin shell over lib11sht * * * - * Copyright (C) 2025 by Ruben Carlo Benante * + * Copyright (C) 2025 by Ruben Carlo Benante * + * GNU GPL version 2 or later. * * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU General Public License as published by * - * the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License * - * along with this program; if not, write to the * - * Free Software Foundation, Inc., * - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
* - * * - * To contact the author, please write to: * - * Ruben Carlo Benante * - * Webpage: http://www.beco.cc * - * Phone: +55 (81) 3184-7555 * - * ************************************************************************ * - * - */ + * Build: gcc cmp11sht.c lib11sht.c -lm -o cmp11sht * + * ************************************************************************ */ #include #include -#include -#include #include -#include #include #include -#define SBUFF 256 /* Max string size */ - -/* fuzzy comparisson */ -int fequal(float a, float b, float delta); /* compare equallity of two float numbers */ -int sequal(char *a, char *b, float thr, float *ratio, char *s1, char *s2); /* compare equality of two strings */ - -/* auxiliary functions */ -int uselesschar(int c); /* check if it is a useless char */ -void trim(char *s); /* remove trailing spaces and tabs */ -void asciify(const char *src, char *dest); /* remove accents */ -int ulen(unsigned char c); /* lenght in chars, not bytes, of a multibyte UTF8 string */ -float shit11(char *s1, char *s2); /* levenshtein similarity */ -float fmin3(float a, float b, float c); /* return the minimum */ -float fmax2(float a, float b); /* return the maximum */ +#include "lib11sht.h" /* ---------------------------------------------------------------------- */ -/* get two objets a and b (float or string) and a delta (float) - * return -1 if a < b, within range delta +/* get two objects a and b (float or string) and a delta (float) + * Translates the library's {-1, 0, +1} return convention to CLI exit codes: * return 0 if a ~ b, within range delta * return +1 if a > b, within range delta - * return -2 if an error occurred + * return +2 if a < b, within range delta + * return +3 if an error occurred */ int main(int argc, char **argv) { - int res; /* comparisson results */ + int res; /* comparison result (CLI exit code) */ char *fenda, *fendb, *fendd; - char s1[SBUFF], s2[SBUFF]; + char s1[LEVN_SBUFF], s2[LEVN_SBUFF]; float a, b, delta, ratio; 
int opt=0; /* -no-option:0, -v:1, -o:2, -n:3 */ @@ -72,11 +38,11 @@ int main(int argc, char **argv) "cmp11sht - Fuzzy compare strings or floats within range\n\n\ Usage:\n\t$ cmp11sht -h\n\ \t$ cmp11sht o1 o2 delta [-v|-o|-n]\n\n\ -After getting two objets o1 and o2 (float or string)\n\ +After getting two objects o1 and o2 (float or string)\n\ and a FLT_MIN < delta < 1.0 (float), where:\n\ - For strings, delta is the Levenshtein ratio\n\ - For floats, delta is the precision\n\ -the comparisson will:\n\n\ +the comparison will:\n\n\ * return 0 if a ~ b, within range delta\n\ * return +1 if a > b, within range delta\n\ * return +2 if a < b, within range delta\n\ @@ -84,7 +50,7 @@ the comparisson will:\n\n\ Options:\n\t-h Print this help\n\ \t-v Print result to stdout (default is system err)\n\ \t-o Print also the Levenshtein ratio or float difference\n\ -\t-n Print also the normalized strings or floats used for comparisson\n\n\ +\t-n Print also the normalized strings or floats used for comparison\n\n\ cmp11sht v20251221.0718 (C) 2025 by Ruben C. Benante (MIT Lic)\n\n" ); return 3; @@ -103,23 +69,23 @@ cmp11sht v20251221.0718 (C) 2025 by Ruben C. Benante (MIT Lic)\n\n" default: return 3; } - if(fendd == argv[3]) // error, need a threshold + if(fendd == argv[3]) /* error, need a threshold */ { if(opt) printf("%d\n", 3); return 3; } - if(delta <= FLT_MIN) // near zero delta not accepted + if(delta <= FLT_MIN) /* near zero delta not accepted */ { if(opt) printf("%d\n", 3); return 3; } - if(delta >= 1.0) // maximum precision 1.0 + if(delta >= 1.0) /* maximum precision 1.0 */ { if(opt) printf("%d\n", 3); return 3; } - if(fenda == argv[1] || fendb == argv[2]) // string + if(fenda == argv[1] || fendb == argv[2]) /* string */ { if(opt==3) printf("cmp11sht: string\n"); errno = 0; @@ -137,7 +103,7 @@ cmp11sht v20251221.0718 (C) 2025 by Ruben C. 
Benante (MIT Lic)\n\n" if(opt>=2) printf("%f\n", ratio); if(opt==3) printf("s1: %s\ns2: %s\n", s1, s2); } - else // float + else /* float */ { if(opt==3) printf("cmp11sht: float\n"); res = fequal(a, b, delta); @@ -156,234 +122,5 @@ cmp11sht v20251221.0718 (C) 2025 by Ruben C. Benante (MIT Lic)\n\n" return res; } -/* ---------------------------------------------------------------------- */ -/* compare equallity of two float numbers within an error margin delta. - * Return 0 if equal within the error margin, - * -1 if a < b-delta and - * +1 if a > b+delta - */ -int fequal(float a, float b, float delta) -{ - if(a < b - delta) - return -1; - if(a > b + delta) - return 1; - - /* b-delta <= a <= b+delta */ - return 0; -} - -/* ---------------------------------------------------------------------- */ -/* compare similarity between two strings. - * Return: - * 0 if equal or similar above given threshold - * -1 if a < b alphabetically (after normalization) - * +1 if a > b alphabetically (after normalization) - * On error: sets errno = EINVAL and returns 0; result is undefined. - * Caller must reset errno = 0 before the call to detect errors. - */ -int sequal(char *a, char *b, float thr, float *ratio, char *s1, char *s2) -{ - int i; - - if(!a || !b || !s1 || !s2 || !ratio) - { - errno = EINVAL; - return 0; - } - - // remove accents - asciify(a, s1); - asciify(b, s2); - - // trim spaces - trim(s1); - trim(s2); - - // lowercase - for(i=0; i thr) - return 0; - return (i < 0)? -1 : 1; -} - -/* ---------------------------------------------------------------------- */ -float shit11(char *s1, char *s2) -{ - int len1, len2; - int i, j, cost; - - len1 = strlen(s1); - len2 = strlen(s2); - - if(len1 == 0 || len2 == 0) - return 0.0; - - int d[len1+1][len2+1]; - - for(i=0; i<=len1; i++) - d[i][0] = i; - for(j=0; j<=len2; j++) - d[0][j] = j; - - for(i=1; i <= len1; i++) - for(j=1; j <= len2; j++) - { - cost = (s1[i-1] == s2[j-1])? 
0 : 1; - d[i][j] = fmin3(d[i-1][j] + 1, d[i][j-1] + 1, d[i-1][j-1] + cost); - } - return 1.0 - d[len1][len2] / fmax2(len1, len2); -} - -/* ---------------------------------------------------------------------- */ -/* return the minimum */ -float fmin3(float a, float b, float c) -{ - float m=a; - if(bb)? a : b; -} - -/* ---------------------------------------------------------------------- */ -/* check if it is a useless char */ -int uselesschar(int c) -{ - if(c == ' ') return 1; - if(c == '\t') return 1; - if(c == '\n') return 1; - if(c == '\r') return 1; - if(c == 0xC2) return 1; - if(c == 0xA0) return 1; - return 0; -} - -/* ---------------------------------------------------------------------- */ -/* remove leading/trailing spaces and tabs */ -void trim(char *s) -{ - int len, i, j, fin; - - if(!s) - return; - /* remove leading spaces */ - i = 0; - while(uselesschar(s[i])) - i++; - if(i > 0) - memmove(s, s + i, strlen(s + i) + 1); - /* remove trailing spaces */ - len = strlen(s); - while(len > 0 && uselesschar(s[len - 1])) - s[--len] = '\0'; - /* remove double spaces in between */ - i=j=0; - while(s[i] != '\0') - { - if(uselesschar(s[i])) - { - if(!fin) - { - s[j++] = ' '; - fin = 1; - } - } - else - { - s[j++] = s[i]; - fin = 0; - } - i++; - } - s[j] = '\0'; -} - -int ulen(unsigned char c) -{ - if((c & 0xE0) == 0xC0) return 2; /* UTF8 lead 2 bytes 110xxxxx */ - if((c & 0xF0) == 0xE0) return 3; /* UTF8 lead 3 bytes 1110xxxx */ - if((c & 0xF8) == 0xF0) return 4; /* UTF8 lead 4 bytes 11110xxx */ - return 1; /* ASCII or invalid or UTF continution byte */ -} - -void asciify(const char *src, char *dest) -{ - int len, i, k, j, found; - char ch[5]; // UTF8 multibyte char - const char transclear[] = - "AEIOUAEIOUAEIOUAEIOUAEIOU" - "aeiouaeiouaeiouaeiouaeiou" - "aoCcNn123" - " "; /* NBSP → space */ - const char *translit[] = { - "Á","É","Í","Ó","Ú", "À","È","Ì","Ò","Ù", - "Ã","Ẽ","Ĩ","Õ","Ũ", "Â","Ê","Î","Ô","Û", - "Ä","Ë","Ï","Ö","Ü", "á","é","í","ó","ú", - 
"à","è","ì","ò","ù", "ã","ẽ","ĩ","õ","ũ", - "â","ê","î","ô","û", "ä","ë","ï","ö","ü", - "ª","º","Ç","ç","Ñ", "ñ","¹","²","³", - "\xC2\xA0"}; /* NBSP */ - - if(!src || !dest) - return; - if(*src=='\0') - return; - - j=i=0; - while(src[i]!='\0' && i < SBUFF) - { - if(((unsigned char)src[i] & 0xc0) == 0x80) // non ASCII, UTF continuation char - { - i++; - continue; - } - - len = ulen((unsigned char)src[i]); - memcpy(ch, &src[i], len); - ch[len]='\0'; - if(((unsigned char)src[i]) < 0x80) /* ASCII */ - { - dest[j++] = ch[0]; - i++; - continue; - } - - found = 0; - for(k=0; k<(int)(sizeof(translit)/sizeof(translit[0])); k++) - if(strcmp(ch, translit[k])==0) - { - dest[j++] = transclear[k]; - found = 1; - break; - } - if(!found) - dest[j++] = '?'; - i+=len; - } - dest[j]='\0'; -} - /* ---------------------------------------------------------------------- */ /* vi: set ai et ts=4 sw=4 tw=0 wm=0 fo=croql : C config for Vim modeline */ -/* Template by Dr. Beco Version 20160612.142044 */ - diff --git a/lib11sht.c b/lib11sht.c new file mode 100644 index 0000000..1e26ee5 --- /dev/null +++ b/lib11sht.c @@ -0,0 +1,241 @@ +/* ************************************************************************ * + * lib11sht.c, v1.0 * + * Fuzzy comparison library implementation * + * * + * Copyright (C) 2025 by Ruben Carlo Benante * + * GNU GPL version 2 or later. * + * ************************************************************************ */ + +#include +#include +#include +#include +#include + +#include "lib11sht.h" + +/* Internal helpers — kept private to this translation unit. 
/* Helper prototypes follow; every helper has internal linkage (static). */

#ifndef LEVN_SBUFF
/* Fallback for builds where lib11sht.h did not define the buffer size.
 * NOTE(review): 256 mirrors the SBUFF value cmp11sht.c used before the
 * split -- confirm it matches lib11sht.h. */
#define LEVN_SBUFF 256
#endif

static int uselesschar(int c);
static void trim(char *s);
static void asciify(const char *src, char *dest);
static int ulen(unsigned char c);
static float shit11(const char *s1, const char *s2);
static float fmin3(float a, float b, float c);
static float fmax2(float a, float b);

/* ---------------------------------------------------------------------- */
/* Compare two floats within an error margin delta.
 * Return:
 *    0 if b-delta <= a <= b+delta
 *   -1 if a < b-delta
 *   +1 if a > b+delta
 */
int fequal(float a, float b, float delta)
{
    if(a < b - delta)
        return -1;
    if(a > b + delta)
        return 1;

    /* b-delta <= a <= b+delta */
    return 0;
}

/* ---------------------------------------------------------------------- */
/* Compare similarity between two strings.
 *
 * a, b   : input strings (not modified)
 * thr    : similarity threshold; a ratio strictly above it counts as equal
 * ratio  : out, similarity ratio computed on the normalized strings
 * s1, s2 : out, normalized copies of a and b; each buffer must hold
 *          LEVN_SBUFF bytes
 *
 * Normalization: accents removed, whitespace trimmed/collapsed, ASCII
 * letters lowercased.
 *
 * Return:
 *    0 if the normalized strings are identical or similar above thr
 *   -1 if a < b in strcmp() order (after normalization)
 *   +1 if a > b in strcmp() order (after normalization)
 * On error (any NULL pointer): sets errno = EINVAL and returns 0.
 * Caller must reset errno = 0 before the call to detect errors.
 */
int sequal(char *a, char *b, float thr, float *ratio, char *s1, char *s2)
{
    int i, order;

    if(!a || !b || !s1 || !s2 || !ratio)
    {
        errno = EINVAL;
        return 0;
    }

    /* remove accents */
    asciify(a, s1);
    asciify(b, s2);

    /* trim spaces (leading, trailing, and internal collapse) */
    trim(s1);
    trim(s2);

    /* lowercase: the buffers are plain ASCII after asciify(), so an
     * explicit A-Z mapping is enough and does not depend on the locale */
    for(i=0; s1[i] != '\0'; i++)
        if(s1[i] >= 'A' && s1[i] <= 'Z')
            s1[i] = (char)(s1[i] - 'A' + 'a');
    for(i=0; s2[i] != '\0'; i++)
        if(s2[i] >= 'A' && s2[i] <= 'Z')
            s2[i] = (char)(s2[i] - 'A' + 'a');

    order = strcmp(s1, s2);
    if(order == 0)
    {
        /* identical after normalization (this includes two empty strings) */
        *ratio = 1.0f;
        return 0;
    }

    *ratio = shit11(s1, s2);
    if(*ratio > thr)
        return 0;
    return (order < 0)? -1 : 1;
}

/* ---------------------------------------------------------------------- */
/* Levenshtein similarity ratio 0.0..1.0, computed byte by byte:
 *     1 - distance / max(len1, len2)
 * Two empty strings are identical (1.0); exactly one empty string shares
 * nothing with the other (0.0).
 */
static float shit11(const char *s1, const char *s2)
{
    int len1, len2;
    int i, j, cost;

    len1 = (int)strlen(s1);
    len2 = (int)strlen(s2);

    if(len1 == 0 && len2 == 0)
        return 1.0f;
    if(len1 == 0 || len2 == 0)
        return 0.0f;

    /* only the previous and the current row of the distance matrix are
     * needed, so keep two rows instead of a (len1+1) x (len2+1) matrix */
    int prev[len2+1], cur[len2+1];

    for(j=0; j<=len2; j++)
        prev[j] = j;

    for(i=1; i <= len1; i++)
    {
        cur[0] = i;
        for(j=1; j <= len2; j++)
        {
            cost = (s1[i-1] == s2[j-1])? 0 : 1;
            cur[j] = (int)fmin3(prev[j] + 1, cur[j-1] + 1, prev[j-1] + cost);
        }
        memcpy(prev, cur, (size_t)(len2 + 1) * sizeof prev[0]);
    }
    return 1.0f - prev[len2] / fmax2(len1, len2);
}

/* ---------------------------------------------------------------------- */
/* return the minimum of three values */
static float fmin3(float a, float b, float c)
{
    float m=a;

    if(b < m)
        m = b;
    if(c < m)
        m = c;
    return m;
}

/* ---------------------------------------------------------------------- */
/* return the maximum of two values */
static float fmax2(float a, float b)
{
    return (a > b)? a : b;
}

/* ---------------------------------------------------------------------- */
/* check if it is a useless char: space, tab, newline, carriage return or
 * one of the two bytes of a UTF-8 NBSP (0xC2 0xA0).
 * Callers pass the byte as unsigned char so that values >= 0x80 arrive
 * positive and can match the 0xC2 / 0xA0 tests. */
static int uselesschar(int c)
{
    if(c == ' ') return 1;
    if(c == '\t') return 1;
    if(c == '\n') return 1;
    if(c == '\r') return 1;
    if(c == 0xC2) return 1;
    if(c == 0xA0) return 1;
    return 0;
}

/* ---------------------------------------------------------------------- */
/* remove leading/trailing whitespace + collapse internal multiple whitespace
 * into a single space; works in place, NULL is ignored */
static void trim(char *s)
{
    int len, i, j;
    int fin = 0; /* 1 while inside a run of whitespace already emitted */

    if(!s)
        return;
    /* remove leading spaces */
    i = 0;
    while(uselesschar((unsigned char)s[i]))
        i++;
    if(i > 0)
        memmove(s, s + i, strlen(s + i) + 1);
    /* remove trailing spaces */
    len = (int)strlen(s);
    while(len > 0 && uselesschar((unsigned char)s[len - 1]))
        s[--len] = '\0';
    /* remove double spaces in between */
    i=j=0;
    while(s[i] != '\0')
    {
        if(uselesschar((unsigned char)s[i]))
        {
            if(!fin)
            {
                s[j++] = ' ';
                fin = 1;
            }
        }
        else
        {
            s[j++] = s[i];
            fin = 0;
        }
        i++;
    }
    s[j] = '\0';
}

/* ---------------------------------------------------------------------- */
/* length in bytes of the UTF-8 sequence announced by lead byte c */
static int ulen(unsigned char c)
{
    if((c & 0xE0) == 0xC0) return 2; /* UTF8 lead 2 bytes 110xxxxx */
    if((c & 0xF0) == 0xE0) return 3; /* UTF8 lead 3 bytes 1110xxxx */
    if((c & 0xF8) == 0xF0) return 4; /* UTF8 lead 4 bytes 11110xxx */
    return 1; /* ASCII or invalid or UTF continuation byte */
}

/* ---------------------------------------------------------------------- */
/* Copy src to dest replacing accented letters by their ASCII base letter.
 *  - ASCII bytes are copied unchanged
 *  - multibyte characters listed in translit[] become transclear[k]
 *  - any other multibyte character becomes '?'
 *  - stray continuation bytes are dropped
 * dest must hold LEVN_SBUFF bytes; output is truncated to LEVN_SBUFF-1
 * characters and is always NUL-terminated (also for an empty or NULL src).
 */
static void asciify(const char *src, char *dest)
{
    int len, got, i, k, j, found;
    unsigned char lead;
    char ch[5]; /* UTF8 multibyte char */
    static const char transclear[] =
        "AEIOUAEIOUAEIOUAEIOUAEIOU"
        "aeiouaeiouaeiouaeiouaeiou"
        "aoCcNn123"
        " "; /* NBSP -> space */
    static const char *const translit[] = {
        "Á","É","Í","Ó","Ú", "À","È","Ì","Ò","Ù",
        "Ã","Ẽ","Ĩ","Õ","Ũ", "Â","Ê","Î","Ô","Û",
        "Ä","Ë","Ï","Ö","Ü", "á","é","í","ó","ú",
        "à","è","ì","ò","ù", "ã","ẽ","ĩ","õ","ũ",
        "â","ê","î","ô","û", "ä","ë","ï","ö","ü",
        "ª","º","Ç","ç","Ñ", "ñ","¹","²","³",
        "\xC2\xA0"}; /* NBSP */

    if(!dest)
        return;
    dest[0] = '\0'; /* callers read dest even when there is nothing to copy */
    if(!src)
        return;

    j=i=0;
    /* bound the OUTPUT index: one byte is reserved for the terminator */
    while(src[i]!='\0' && j < LEVN_SBUFF - 1)
    {
        lead = (unsigned char)src[i];

        if((lead & 0xc0) == 0x80) /* non ASCII, UTF continuation char */
        {
            i++;
            continue;
        }

        if(lead < 0x80) /* ASCII */
        {
            dest[j++] = src[i++];
            continue;
        }

        /* gather the lead byte plus the continuation bytes actually
         * present; NUL is not a continuation byte, so a truncated sequence
         * never reads past the end of src */
        len = ulen(lead);
        ch[0] = src[i];
        got = 1;
        while(got < len && ((unsigned char)src[i + got] & 0xc0) == 0x80)
        {
            ch[got] = src[i + got];
            got++;
        }
        ch[got]='\0';

        found = 0;
        for(k=0; k<(int)(sizeof(translit)/sizeof(translit[0])); k++)
            if(strcmp(ch, translit[k])==0)
            {
                dest[j++] = transclear[k];
                found = 1;
                break;
            }
        if(!found)
            dest[j++] = '?';
        i+=got;
    }
    dest[j]='\0';
}

/* ---------------------------------------------------------------------- */
/* vi: set ai et ts=4 sw=4 tw=0 wm=0 fo=croql : C config for Vim modeline */