Compare commits

...

17 Commits

Author SHA1 Message Date
902e29dc93 public asciify 2026-05-27 09:58:19 -03:00
74a5ef63b0 test clean comment 2026-05-23 07:31:46 -03:00
8c14340adc cleaning comments 2026-05-23 07:24:05 -03:00
f615ee0242 .gitignore binaries 2026-05-23 07:17:13 -03:00
b046872fd8 test case 2026-05-23 07:17:04 -03:00
97076a2d48 makefile added 2026-05-23 07:16:28 -03:00
e9d00dc619 better comments on lib11sht API 2026-05-23 07:14:51 -03:00
d48c8c1c36 allow trim() public function 2026-05-22 22:08:21 -03:00
9bed332fe9 fast path if threshold>=1.0 2026-05-22 21:30:10 -03:00
2888850684 IMPROVES-i4 strnlen() caps shit11() 2026-05-22 20:36:53 -03:00
d2e5dfce38 IMPROVES-i5 full fix on sequal_full() 2026-05-22 20:28:13 -03:00
aabab06352 IMPROVES-i5 partial fix on asciify() buffer size dst_size 2026-05-22 20:20:34 -03:00
af355f2661 IMPROVES-i6 wrapper sequal(a,b,shold) : sequal_full(...) 2026-05-22 20:00:47 -03:00
1016b15f34 IMPROVES-i3 asciify copy empty to empty 2026-05-22 19:48:12 -03:00
18c34e3b41 IMPROVES-i3 both empties are equal (ratio 1.0) 2026-05-22 19:45:50 -03:00
6efce8c601 IMPROVES-i1 trim() fin initialized (done also i7 and i8) 2026-05-22 19:45:05 -03:00
29e4be2bad IMPROVES-i12 lib11sht.c added 2026-05-22 19:41:10 -03:00
6 changed files with 598 additions and 300 deletions

2
.gitignore vendored
View File

@@ -1,2 +1,4 @@
untracked/
cmp11sht
new file: test_lib11sht.c
test_lib11sht

View File

@@ -1,68 +1,34 @@
/* ************************************************************************ *
* cmp11sht.c, v20251221.085434 *
* A fuzzy comparisson between values (floats or strings) *
* Fuzzy comparison CLI -- thin shell over lib11sht *
* *
* Copyright (C) 2025 by Ruben Carlo Benante *
* Copyright (C) 2025 by Ruben Carlo Benante <rcb@beco.cc> *
* GNU GPL version 2 or later. *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
* *
* To contact the author, please write to: *
* Ruben Carlo Benante <rcb@beco.cc> *
* Webpage: http://www.beco.cc *
* Phone: +55 (81) 3184-7555 *
* ************************************************************************ *
*
*/
* Build: gcc cmp11sht.c lib11sht.c -lm -o cmp11sht *
* ************************************************************************ */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include <ctype.h>
#include <float.h>
#include <errno.h>
#define SBUFF 256 /* Max string size */
/* fuzzy comparisson */
int fequal(float a, float b, float delta); /* compare equallity of two float numbers */
int sequal(char *a, char *b, float thr, float *ratio, char *s1, char *s2); /* compare equality of two strings */
/* auxiliary functions */
int uselesschar(int c); /* check if it is a useless char */
void trim(char *s); /* remove trailing spaces and tabs */
void asciify(const char *src, char *dest); /* remove accents */
int ulen(unsigned char c); /* lenght in chars, not bytes, of a multibyte UTF8 string */
float shit11(char *s1, char *s2); /* levenshtein similarity */
float fmin3(float a, float b, float c); /* return the minimum */
float fmax2(float a, float b); /* return the maximum */
#include "lib11sht.h"
/* ---------------------------------------------------------------------- */
/* get two objets a and b (float or string) and a delta (float)
* return -1 if a < b, within range delta
/* get two objects a and b (float or string) and a delta (float)
* Translates the library's {-1, 0, +1} return convention to CLI exit codes:
* return 0 if a ~ b, within range delta
* return +1 if a > b, within range delta
* return -2 if an error occurred
* return +2 if a < b, within range delta
* return +3 if an error occurred
*/
int main(int argc, char **argv)
{
int res; /* comparisson results */
int res; /* comparison result (CLI exit code) */
char *fenda, *fendb, *fendd;
char s1[SBUFF], s2[SBUFF];
char s1[LEVN_SBUFF], s2[LEVN_SBUFF];
float a, b, delta, ratio;
int opt=0; /* -no-option:0, -v:1, -o:2, -n:3 */
@@ -72,11 +38,11 @@ int main(int argc, char **argv)
"cmp11sht - Fuzzy compare strings or floats within range\n\n\
Usage:\n\t$ cmp11sht -h\n\
\t$ cmp11sht o1 o2 delta [-v|-o|-n]\n\n\
After getting two objets o1 and o2 (float or string)\n\
After getting two objects o1 and o2 (float or string)\n\
and a FLT_MIN < delta < 1.0 (float), where:\n\
- For strings, delta is the Levenshtein ratio\n\
- For floats, delta is the precision\n\
the comparisson will:\n\n\
the comparison will:\n\n\
* return 0 if a ~ b, within range delta\n\
* return +1 if a > b, within range delta\n\
* return +2 if a < b, within range delta\n\
@@ -84,7 +50,7 @@ the comparisson will:\n\n\
Options:\n\t-h Print this help\n\
\t-v Print result to stdout (default is system err)\n\
\t-o Print also the Levenshtein ratio or float difference\n\
\t-n Print also the normalized strings or floats used for comparisson\n\n\
\t-n Print also the normalized strings or floats used for comparison\n\n\
cmp11sht v20251221.0718 (C) 2025 by Ruben C. Benante (MIT Lic)\n\n"
);
return 3;
@@ -103,33 +69,34 @@ cmp11sht v20251221.0718 (C) 2025 by Ruben C. Benante (MIT Lic)\n\n"
default: return 3;
}
if(fendd == argv[3]) // error, need a threshold
if(fendd == argv[3]) /* error, need a threshold */
{
if(opt) printf("%d\n", 3);
return 3;
}
if(delta <= FLT_MIN) // near zero delta not accepted
if(delta <= FLT_MIN) /* near zero delta not accepted */
{
if(opt) printf("%d\n", 3);
return 3;
}
if(delta >= 1.0) // maximum precision 1.0
if(delta >= 1.0) /* maximum precision 1.0 */
{
if(opt) printf("%d\n", 3);
return 3;
}
if(fenda == argv[1] || fendb == argv[2]) // string
if(fenda == argv[1] || fendb == argv[2]) /* string */
{
if(opt==3) printf("cmp11sht: string\n");
errno = 0;
res = sequal(argv[1], argv[2], delta, &ratio, s1, s2);
res = sequal_full(argv[1], argv[2], delta, &ratio,
s1, LEVN_SBUFF, s2, LEVN_SBUFF);
if(errno == EINVAL)
res = 3; /* error CLI exit 3 */
res = 3; /* error -> CLI exit 3 */
else if(res > 0)
res = 1; /* a > b CLI exit 1 */
res = 1; /* a > b -> CLI exit 1 */
else if(res < 0)
res = 2; /* a < b CLI exit 2 */
res = 2; /* a < b -> CLI exit 2 */
/* res == 0 stays 0 (equal) */
if(opt==3) printf("result: ");
if(opt) printf("%d\n", res);
@@ -137,14 +104,14 @@ cmp11sht v20251221.0718 (C) 2025 by Ruben C. Benante (MIT Lic)\n\n"
if(opt>=2) printf("%f\n", ratio);
if(opt==3) printf("s1: %s\ns2: %s\n", s1, s2);
}
else // float
else /* float */
{
if(opt==3) printf("cmp11sht: float\n");
res = fequal(a, b, delta);
if(res > 0)
res = 1; /* a > b CLI exit 1 */
res = 1; /* a > b -> CLI exit 1 */
else if(res < 0)
res = 2; /* a < b CLI exit 2 */
res = 2; /* a < b -> CLI exit 2 */
/* res == 0 stays 0 (equal) */
if(opt==3) printf("result: ");
if(opt) printf("%d\n", res);
@@ -156,234 +123,5 @@ cmp11sht v20251221.0718 (C) 2025 by Ruben C. Benante (MIT Lic)\n\n"
return res;
}
/* ---------------------------------------------------------------------- */
/* compare equallity of two float numbers within an error margin delta.
* Return 0 if equal within the error margin,
* -1 if a < b-delta and
* +1 if a > b+delta
*/
int fequal(float a, float b, float delta)
{
if(a < b - delta)
return -1;
if(a > b + delta)
return 1;
/* b-delta <= a <= b+delta */
return 0;
}
/* ---------------------------------------------------------------------- */
/* compare similarity between two strings.
* Return:
* 0 if equal or similar above given threshold
* -1 if a < b alphabetically (after normalization)
* +1 if a > b alphabetically (after normalization)
* On error: sets errno = EINVAL and returns 0; result is undefined.
* Caller must reset errno = 0 before the call to detect errors.
*/
int sequal(char *a, char *b, float thr, float *ratio, char *s1, char *s2)
{
int i;
if(!a || !b || !s1 || !s2 || !ratio)
{
errno = EINVAL;
return 0;
}
// remove accents
asciify(a, s1);
asciify(b, s2);
// trim spaces
trim(s1);
trim(s2);
// lowercase
for(i=0; i<SBUFF && s1[i]; i++)
s1[i] = (char)tolower((unsigned char)s1[i]);
for(i=0; i<SBUFF && s2[i]; i++)
s2[i] = (char)tolower((unsigned char)s2[i]);
i=strcmp(s1, s2);
if(!i)
{
*ratio=1.0;
return 0;
}
*ratio = shit11(s1, s2);
if(*ratio > thr)
return 0;
return (i < 0)? -1 : 1;
}
/* ---------------------------------------------------------------------- */
float shit11(char *s1, char *s2)
{
int len1, len2;
int i, j, cost;
len1 = strlen(s1);
len2 = strlen(s2);
if(len1 == 0 || len2 == 0)
return 0.0;
int d[len1+1][len2+1];
for(i=0; i<=len1; i++)
d[i][0] = i;
for(j=0; j<=len2; j++)
d[0][j] = j;
for(i=1; i <= len1; i++)
for(j=1; j <= len2; j++)
{
cost = (s1[i-1] == s2[j-1])? 0 : 1;
d[i][j] = fmin3(d[i-1][j] + 1, d[i][j-1] + 1, d[i-1][j-1] + cost);
}
return 1.0 - d[len1][len2] / fmax2(len1, len2);
}
/* ---------------------------------------------------------------------- */
/* return the minimum */
float fmin3(float a, float b, float c)
{
float m=a;
if(b<m)
m=b;
if(c<m)
m=c;
return m;
}
/* ---------------------------------------------------------------------- */
/* return the maximum */
float fmax2(float a, float b)
{
return (a>b)? a : b;
}
/* ---------------------------------------------------------------------- */
/* check if it is a useless char */
int uselesschar(int c)
{
if(c == ' ') return 1;
if(c == '\t') return 1;
if(c == '\n') return 1;
if(c == '\r') return 1;
if(c == 0xC2) return 1;
if(c == 0xA0) return 1;
return 0;
}
/* ---------------------------------------------------------------------- */
/* remove leading/trailing spaces and tabs */
void trim(char *s)
{
int len, i, j, fin;
if(!s)
return;
/* remove leading spaces */
i = 0;
while(uselesschar(s[i]))
i++;
if(i > 0)
memmove(s, s + i, strlen(s + i) + 1);
/* remove trailing spaces */
len = strlen(s);
while(len > 0 && uselesschar(s[len - 1]))
s[--len] = '\0';
/* remove double spaces in between */
i=j=0;
while(s[i] != '\0')
{
if(uselesschar(s[i]))
{
if(!fin)
{
s[j++] = ' ';
fin = 1;
}
}
else
{
s[j++] = s[i];
fin = 0;
}
i++;
}
s[j] = '\0';
}
int ulen(unsigned char c)
{
if((c & 0xE0) == 0xC0) return 2; /* UTF8 lead 2 bytes 110xxxxx */
if((c & 0xF0) == 0xE0) return 3; /* UTF8 lead 3 bytes 1110xxxx */
if((c & 0xF8) == 0xF0) return 4; /* UTF8 lead 4 bytes 11110xxx */
return 1; /* ASCII or invalid or UTF continution byte */
}
void asciify(const char *src, char *dest)
{
int len, i, k, j, found;
char ch[5]; // UTF8 multibyte char
const char transclear[] =
"AEIOUAEIOUAEIOUAEIOUAEIOU"
"aeiouaeiouaeiouaeiouaeiou"
"aoCcNn123"
" "; /* NBSP → space */
const char *translit[] = {
"Á","É","Í","Ó","Ú", "À","È","Ì","Ò","Ù",
"Ã","","Ĩ","Õ","Ũ", "Â","Ê","Î","Ô","Û",
"Ä","Ë","Ï","Ö","Ü", "á","é","í","ó","ú",
"à","è","ì","ò","ù", "ã","","ĩ","õ","ũ",
"â","ê","î","ô","û", "ä","ë","ï","ö","ü",
"ª","º","Ç","ç","Ñ", "ñ","¹","²","³",
"\xC2\xA0"}; /* NBSP */
if(!src || !dest)
return;
if(*src=='\0')
return;
j=i=0;
while(src[i]!='\0' && i < SBUFF)
{
if(((unsigned char)src[i] & 0xc0) == 0x80) // non ASCII, UTF continuation char
{
i++;
continue;
}
len = ulen((unsigned char)src[i]);
memcpy(ch, &src[i], len);
ch[len]='\0';
if(((unsigned char)src[i]) < 0x80) /* ASCII */
{
dest[j++] = ch[0];
i++;
continue;
}
found = 0;
for(k=0; k<(int)(sizeof(translit)/sizeof(translit[0])); k++)
if(strcmp(ch, translit[k])==0)
{
dest[j++] = transclear[k];
found = 1;
break;
}
if(!found)
dest[j++] = '?';
i+=len;
}
dest[j]='\0';
}
/* ---------------------------------------------------------------------- */
/* vi: set ai et ts=4 sw=4 tw=0 wm=0 fo=croql : C config for Vim modeline */
/* Template by Dr. Beco <rcb at beco dot cc> Version 20160612.142044 */

274
lib11sht.c Normal file
View File

@@ -0,0 +1,274 @@
/* ************************************************************************ *
* lib11sht.c, v1.0 *
* Fuzzy comparison library implementation *
* *
* Copyright (C) 2025 by Ruben Carlo Benante <rcb@beco.cc> *
* GNU GPL version 2 or later. *
* ************************************************************************ */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include "lib11sht.h"
/* Internal helpers -- kept private to this translation unit. */
static int uselesschar(int c);
/* trim and asciify are public (declared in lib11sht.h) */
static int ulen(unsigned char c);
static float shit11(char *s1, char *s2);
static float fmin3(float a, float b, float c);
static float fmax2(float a, float b);
/* ---------------------------------------------------------------------- */
/* Three-way comparison of two floats with a tolerance band around b.
 *
 * Returns:
 *   -1  when a is below b - delta
 *   +1  when a is above b + delta
 *    0  in every other case (a lies inside [b - delta, b + delta])
 *
 * The "below" test is evaluated first, so it wins if both could hold. */
int fequal(float a, float b, float delta)
{
    return (a < b - delta) ? -1
         : (a > b + delta) ?  1
         :                    0;
}
/* ---------------------------------------------------------------------- */
/* Convenience front end to sequal_full() for callers that only want the
 * -1 / 0 / +1 verdict. The similarity ratio and both normalized strings
 * are written to function-local scratch storage and discarded.
 * Same call shape as fequal(a, b, delta). */
int sequal(char *a, char *b, float shold)
{
    char norm_a[LEVN_SBUFF];
    char norm_b[LEVN_SBUFF];
    float unused_ratio;

    return sequal_full(a, b, shold, &unused_ratio,
                       norm_a, sizeof norm_a,
                       norm_b, sizeof norm_b);
}
/* ---------------------------------------------------------------------- */
/* Normalize a and b into s1 / s2 (asciify, then trim, then lowercase) and
 * compare the normalized strings.
 *
 * Returns 0 when the normalized strings are identical or their similarity
 * ratio is strictly greater than shold; otherwise -1 / +1 according to the
 * strcmp() order of the normalized strings.
 *
 * *ratio receives 1.0 on an exact normalized match, 0.0 when shold >= 1.0
 * and the strings differ (the ratio is not computed in that case), and the
 * shit11() ratio otherwise.
 *
 * Error: any NULL pointer or a zero buffer size sets errno = EINVAL and
 * returns 0 before anything is written to ratio, s1 or s2. */
int sequal_full(char *a, char *b, float shold, float *ratio,
                char *s1, size_t s1_size,
                char *s2, size_t s2_size)
{
    size_t pos;
    int order;
    int verdict;

    if(a == NULL || b == NULL || s1 == NULL || s2 == NULL || ratio == NULL
       || s1_size == 0 || s2_size == 0)
    {
        errno = EINVAL;
        return 0;
    }

    /* accents -> plain ASCII; asciify NUL-terminates inside each buffer */
    asciify(a, s1, s1_size);
    asciify(b, s2, s2_size);

    /* strip outer whitespace and collapse inner runs */
    trim(s1);
    trim(s2);

    /* fold case, never stepping outside the caller's buffers */
    pos = 0;
    while(pos < s1_size && s1[pos] != '\0')
    {
        s1[pos] = (char)tolower((unsigned char)s1[pos]);
        pos++;
    }
    pos = 0;
    while(pos < s2_size && s2[pos] != '\0')
    {
        s2[pos] = (char)tolower((unsigned char)s2[pos]);
        pos++;
    }

    order = strcmp(s1, s2);
    if(order == 0)
    {
        *ratio = 1.0;
        return 0;
    }
    verdict = (order < 0) ? -1 : 1;

    /* Strict mode: the ratio never exceeds 1.0, so with shold >= 1.0 a fuzzy
     * match is impossible and the Levenshtein pass is skipped. */
    if(shold >= 1.0f)
    {
        *ratio = 0.0f;
        return verdict;
    }

    *ratio = shit11(s1, s2);
    return (*ratio > shold) ? 0 : verdict;
}
/* ---------------------------------------------------------------------- */
/* Levenshtein similarity ratio in 0.0..1.0, computed as
 *     1.0 - edit_distance(s1, s2) / max(len1, len2)
 *
 * Special cases:
 *   both strings empty                      -> 1.0 (identical)
 *   exactly one string empty                -> 0.0
 *   either string has >= LEVN_SBUFF chars   -> 0.0 (refused, treated as
 *                                              completely dissimilar)
 *
 * Only two rows of the distance table are kept (the previous and the
 * current one), each a fixed array of LEVN_SBUFF ints. Every cell depends
 * only on its left, upper and upper-left neighbours, so the result is the
 * same as with the full (len1+1) x (len2+1) table, without a large
 * variable-length array in automatic storage. */
static float shit11(char *s1, char *s2)
{
    int prev[LEVN_SBUFF];   /* row i-1 of the distance table */
    int curr[LEVN_SBUFF];   /* row i, being filled */
    int len1, len2;
    int i, j, cost;

    /* strnlen stops at LEVN_SBUFF even if a NUL terminator is missing */
    len1 = (int)strnlen(s1, LEVN_SBUFF);
    len2 = (int)strnlen(s2, LEVN_SBUFF);

    if(len1 == 0 && len2 == 0)
        return 1.0; /* both empty == identical */
    if(len1 == 0 || len2 == 0)
        return 0.0; /* one empty == no similarity */

    /* Row indices run 0..len2, so len2 must stay below LEVN_SBUFF; the same
     * cap is kept on len1 so the refusal is symmetric in both arguments. */
    if(len1 >= LEVN_SBUFF || len2 >= LEVN_SBUFF)
        return 0.0;

    /* row 0: turning an empty prefix of s1 into j chars of s2 costs j */
    for(j = 0; j <= len2; j++)
        prev[j] = j;

    for(i = 1; i <= len1; i++)
    {
        curr[0] = i; /* deleting i chars of s1 to reach an empty prefix */
        for(j = 1; j <= len2; j++)
        {
            cost = (s1[i-1] == s2[j-1]) ? 0 : 1;
            curr[j] = (int)fmin3(prev[j] + 1,         /* deletion */
                                 curr[j-1] + 1,       /* insertion */
                                 prev[j-1] + cost);   /* substitution */
        }
        memcpy(prev, curr, (size_t)(len2 + 1) * sizeof prev[0]);
    }

    /* prev now holds the last row; its last cell is the edit distance */
    return 1.0 - prev[len2] / fmax2(len1, len2);
}
/* ---------------------------------------------------------------------- */
/* Smallest of three floats. On ties the earlier argument is kept. */
static float fmin3(float a, float b, float c)
{
    float lowest = (b < a) ? b : a;

    return (c < lowest) ? c : lowest;
}
/* ---------------------------------------------------------------------- */
/* Larger of two floats; b is returned when a is not strictly greater. */
static float fmax2(float a, float b)
{
    if(a > b)
        return a;
    return b;
}
/* ---------------------------------------------------------------------- */
/* Returns 1 when c is one of the single-byte ASCII whitespace characters
 * that trim() strips or collapses (space, tab, LF, CR), 0 otherwise.
 *
 * The UTF-8 no-break space is deliberately NOT handled here: its bytes
 * 0xC2 and 0xA0 also occur inside other characters (e.g. 0xC3 0xA0 or
 * 0xC2 0xA9), so it can only be recognized as a pair -- see
 * trim_ws_span(). */
static int uselesschar(int c)
{
    return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}
/* ---------------------------------------------------------------------- */
/* Length in bytes of the whitespace unit starting at p:
 *   2  for the UTF-8 no-break space (0xC2 0xA0)
 *   1  for an ASCII whitespace byte accepted by uselesschar()
 *   0  when p does not start with whitespace (including at the NUL)
 * Bytes are compared as unsigned char so the result does not depend on
 * whether plain char is signed. p[1] is read only when p[0] is 0xC2, i.e.
 * not the terminator, so the read stays inside the string. */
static size_t trim_ws_span(const char *p)
{
    if((unsigned char)p[0] == 0xC2 && (unsigned char)p[1] == 0xA0)
        return 2;
    return uselesschar((unsigned char)p[0]) ? 1 : 0;
}
/* ---------------------------------------------------------------------- */
/* Remove leading and trailing whitespace and collapse every internal run
 * of whitespace to a single space, in place.
 *
 * Whitespace means space, tab, LF, CR and the two-byte UTF-8 no-break
 * space. s must be NUL-terminated; NULL is accepted and ignored. The
 * result is never longer than the input. */
void trim(char *s)
{
    size_t rd = 0;  /* next byte to examine */
    size_t wr = 0;  /* next byte to write; always <= rd */
    int gap = 0;    /* whitespace seen since the last kept byte */

    if(!s)
        return;

    while(s[rd] != '\0')
    {
        size_t span = trim_ws_span(s + rd);

        if(span > 0)
        {
            /* Remember the gap only once something has been kept, which
             * drops leading whitespace. */
            gap = (wr > 0);
            rd += span;
            continue;
        }
        if(gap)
        {
            /* The separator is emitted lazily, right before the next kept
             * byte, so trailing whitespace is never written. */
            s[wr++] = ' ';
            gap = 0;
        }
        s[wr++] = s[rd++];
    }
    s[wr] = '\0';
}
/* ---------------------------------------------------------------------- */
/* Byte length of the UTF-8 sequence announced by lead byte c, decided from
 * its high bits:
 *   110xxxxx -> 2,  1110xxxx -> 3,  11110xxx -> 4
 * Every other value (ASCII, a continuation byte, or a byte that is not a
 * valid lead) yields 1. */
static int ulen(unsigned char c)
{
    switch(c >> 4)
    {
        case 0xC:
        case 0xD:
            return 2;
        case 0xE:
            return 3;
        case 0xF:
            /* only 11110xxx is a 4-byte lead; 11111xxx is not */
            return (c & 0x08) ? 1 : 4;
        default:
            return 1;
    }
}
/* ---------------------------------------------------------------------- */
/* Transliterate a UTF-8 string to plain ASCII.
 *
 * Parameters:
 *   src        NUL-terminated input; scanning stops at the NUL or once the
 *              read position reaches LEVN_SBUFF bytes
 *   dest       output buffer
 *   dest_size  size of dest in bytes; at most dest_size-1 characters are
 *              written, followed by a NUL
 *
 * Per input unit:
 *   ASCII byte                          copied unchanged
 *   multi-byte char in the table        replaced by its ASCII counterpart
 *   other / truncated multi-byte char   replaced by '?'
 *   stray continuation byte             dropped
 *
 * A lead byte is only treated as a full sequence after all of its
 * continuation bytes have been seen. The NUL is not a continuation byte,
 * so a sequence cut short by the end of the string is never read past the
 * terminator; it yields a single '?' and scanning resumes at the next byte.
 *
 * NULL src / dest or dest_size == 0: nothing is done. */
void asciify(const char *src, char *dest, size_t dest_size)
{
    /* transclear[k] is the ASCII replacement for translit[k] */
    static const char transclear[] =
        "AEIOUAEIOUAEIOUAEIOUAEIOU"
        "aeiouaeiouaeiouaeiouaeiou"
        "aoCcNn123"
        " "; /* NBSP -> space */
    /* Slots 11 and 36 (tilde rows, 'E' / 'e' in transclear) were empty
     * strings, which can never equal a non-empty sequence; they are filled
     * with E-tilde (U+1EBC / U+1EBD) to line up with transclear. */
    static const char *const translit[] = {
        "Á","É","Í","Ó","Ú", "À","È","Ì","Ò","Ù",
        "Ã","\xE1\xBA\xBC","Ĩ","Õ","Ũ", "Â","Ê","Î","Ô","Û",
        "Ä","Ë","Ï","Ö","Ü", "á","é","í","ó","ú",
        "à","è","ì","ò","ù", "ã","\xE1\xBA\xBD","ĩ","õ","ũ",
        "â","ê","î","ô","û", "ä","ë","ï","ö","ü",
        "ª","º","Ç","ç","Ñ", "ñ","¹","²","³",
        "\xC2\xA0"}; /* NBSP */
    const size_t ntranslit = sizeof(translit) / sizeof(translit[0]);
    char ch[5]; /* one UTF-8 sequence (up to 4 bytes) + NUL */
    size_t i, j, k;
    int len, seen;

    if(!src || !dest || dest_size == 0)
        return;

    i = j = 0;
    /* dest_size >= 1 here, so dest_size - 1 cannot wrap; the last byte is
     * reserved for the terminator */
    while(src[i] != '\0' && i < (size_t)LEVN_SBUFF && j < dest_size - 1)
    {
        unsigned char lead = (unsigned char)src[i];

        if(lead < 0x80) /* ASCII */
        {
            dest[j++] = (char)lead;
            i++;
            continue;
        }
        if((lead & 0xC0) == 0x80) /* continuation byte without a lead */
        {
            i++;
            continue;
        }

        /* src[i] is not NUL, so src[i+1] is inside the string; each further
         * byte is inspected only after the previous one proved non-NUL. */
        len = ulen(lead);
        for(seen = 1; seen < len; seen++)
            if(((unsigned char)src[i + seen] & 0xC0) != 0x80)
                break;
        if(seen < len) /* truncated or malformed sequence */
        {
            dest[j++] = '?';
            i++;
            continue;
        }

        memcpy(ch, &src[i], (size_t)len);
        ch[len] = '\0';

        for(k = 0; k < ntranslit; k++)
            if(strcmp(ch, translit[k]) == 0)
                break;
        dest[j++] = (k < ntranslit) ? transclear[k] : '?';
        i += (size_t)len;
    }
    dest[j] = '\0';
}
/* ---------------------------------------------------------------------- */
/* vi: set ai et ts=4 sw=4 tw=0 wm=0 fo=croql : C config for Vim modeline */

View File

@@ -9,26 +9,76 @@
#ifndef LIB11SHT_H
#define LIB11SHT_H
#define LEVN_SBUFF 256 /* min size for s1/s2 buffers and bounded input */
#include <stddef.h> /* size_t */
#define LEVN_SBUFF 256 /* recommended size for s1/s2 buffers */
/* ************************************************************************ *
* ERROR CONVENTION (read before using sequal / sequal_full) *
* ************************************************************************ *
*
* sequal() and sequal_full() return -1 / 0 / +1 for the three comparison
* outcomes (a<b / equal-or-similar / a>b). On error (NULL inputs, zero
* buffer size), they set errno = EINVAL and return 0.
*
* The return value 0 is shared by "match" and "error". To distinguish,
* callers MUST reset errno = 0 before the call and check it after:
*
* errno = 0;
* int r = sequal(a, b, 0.85f);
* if(errno == EINVAL) { ... error handling ... }
* else if(r == 0) { ... match ... }
* else { ... a<b or a>b ... }
*
* Forgetting the errno=0 reset means errno from a previous failed call
* (anywhere in the program) could be mistaken for a sequal error. This
* follows the same pattern as strtol(), but be deliberate about it.
*
* ************************************************************************ */
/* Compare similarity between two strings (after asciify + trim + lowercase).
* Symmetric in shape with fequal(a, b, delta).
* Returns:
* 0 if equal or similar above the given threshold ratio
* 0 if equal or similar above the shold threshold
* -1 if a < b alphabetically (after normalization)
* +1 if a > b alphabetically (after normalization)
* On error: sets errno = EINVAL and returns 0; comparison result is undefined.
* Caller must reset errno = 0 before the call to detect errors.
* On error: sets errno = EINVAL and returns 0 -- see ERROR CONVENTION above.
*
* Parameters:
* a, b input strings (NUL-terminated, may contain UTF-8 accented Latin chars)
* thr Levenshtein similarity threshold 0.0..1.0; matches above this count as equal
* ratio out: Levenshtein similarity 0.0..1.0 (1.0 on exact-after-normalize)
* s1, s2 out: caller-provided buffers (>= LEVN_SBUFF) filled with the
* normalized inputs (useful for diagnostics; can be ignored)
* a, b input strings (NUL-terminated, may contain UTF-8 accented Latin chars).
* Strings longer than LEVN_SBUFF-1 chars are silently truncated to
* their first LEVN_SBUFF-1 chars for comparison.
* shold similarity threshold 0.0..1.0; matches above this count as equal.
* Use 1.0 for strict mode (no fuzzy fallback, only exact-after-normalize).
*/
int sequal(char *a, char *b, float thr, float *ratio, char *s1, char *s2);
int sequal(char *a, char *b, float shold);
/* Compare two floats within ±delta.
/* Full variant of sequal: same comparison but also returns diagnostics.
* Used by callers that need the computed ratio or the normalized strings
* (e.g. cmp11sht CLI's -o / -n flags).
*
* Extra parameters:
* ratio out: Levenshtein similarity 0.0..1.0 (1.0 on exact-after-normalize)
* s1, s2 out: caller-provided buffers filled with the normalized inputs
* s1_size size of s1 in bytes (writes capped at s1_size-1 + final NUL)
* s2_size size of s2 in bytes (writes capped at s2_size-1 + final NUL)
*
* TRUNCATION SEMANTICS: when an input is longer than its buffer, only the
* leading (s_size-1) bytes of its normalized form participate in the
* comparison. The Levenshtein ratio in *ratio is computed on the
* normalized contents of s1 / s2 (i.e. on the possibly-truncated buffer
* data), NOT on the original a / b strings. LEVN_SBUFF (256) is the
* recommended buffer size. Larger buffers do not extend the comparison:
* asciify() stops scanning an input once it is LEVN_SBUFF bytes in,
* whatever the buffer size.
*
* On error: same convention as sequal -- sets errno = EINVAL and returns 0;
* *ratio, s1, s2 are not modified in that case. See ERROR CONVENTION above.
*/
int sequal_full(char *a, char *b, float shold, float *ratio,
char *s1, size_t s1_size,
char *s2, size_t s2_size);
/* Compare two floats within +/-delta.
* Returns:
* 0 if |a - b| <= delta
* -1 if a < b - delta
@@ -36,4 +86,24 @@ int sequal(char *a, char *b, float thr, float *ratio, char *s1, char *s2);
*/
int fequal(float a, float b, float delta);
/* String trim: removes leading + trailing whitespace (including UTF-8
* NBSP bytes 0xC2 / 0xA0) AND collapses internal runs of whitespace
* to a single space. Modifies s in place. Caller's buffer must already
* be NUL-terminated. */
void trim(char *s);
/* Transliterate accented Latin chars to plain ASCII. Walks src as UTF-8,
* mapping known accented chars (a-acute, c-cedilla, n-tilde, NBSP, ...)
* to their ASCII equivalents and copying ASCII bytes verbatim. Output is
* always pure ASCII and NUL-terminated. Multi-byte characters that are not
* in the transliteration table are replaced by '?'; stray UTF-8
* continuation bytes are skipped.
*
* Parameters:
* src input UTF-8 string (NUL-terminated)
* dest output buffer (filled with NUL-terminated ASCII)
* dest_size size of dest in bytes (writes capped at dest_size-1 + final NUL)
*
* Safe for NULL / zero-size args (no-op). */
void asciify(const char *src, char *dest, size_t dest_size);
#endif /* LIB11SHT_H */

37
makefile Normal file
View File

@@ -0,0 +1,37 @@
# Copyright (C) 2026 by Prof. Dr. Ruben Carlo Benante
# levenshtein library -- makefile
#
# Usage:
# make build both: cmp11sht (CLI) and test_lib11sht
# make cmp11sht build only the CLI
# make test build + run the stress test suite
# make clean remove built binaries
#
# Uses clang with strict warnings to catch bugs early.
# Toolchain settings: compiler, compile flags (gnu17 dialect, -O2, debug
# info, extended warning set, forced colored diagnostics) and the libraries
# passed at link time.
CC = clang
CFLAGS = -std=gnu17 -O2 -g \
	-Wall -Wextra -Wpedantic \
	-Wshadow -Wpointer-arith -Wcast-qual -Wundef \
	-Wstrict-prototypes -Wmissing-prototypes \
	-fdiagnostics-color=always
LDLIBS = -lm
# Library source and header; both executables list them as prerequisites,
# so a change to either one rebuilds both.
LIB_SRC = lib11sht.c
LIB_HDR = lib11sht.h
# These targets are commands, not files to be produced.
.PHONY: all test clean
# Default target: build the CLI and the test program.
all: cmp11sht test_lib11sht
# CLI: compiled together with the library source in a single command.
cmp11sht: cmp11sht.c $(LIB_SRC) $(LIB_HDR)
	$(CC) $(CFLAGS) cmp11sht.c $(LIB_SRC) $(LDLIBS) -o $@
# Test program: same recipe shape as the CLI.
test_lib11sht: test_lib11sht.c $(LIB_SRC) $(LIB_HDR)
	$(CC) $(CFLAGS) test_lib11sht.c $(LIB_SRC) $(LDLIBS) -o $@
# Build the test program if needed, then run it.
test: test_lib11sht
	./test_lib11sht
# Remove the two built executables.
clean:
	rm -f cmp11sht test_lib11sht

177
test_lib11sht.c Normal file
View File

@@ -0,0 +1,177 @@
/* Stress tests for lib11sht -- aim to break upper / lower bounds.
* Compile: gcc -Wall -Wextra -O2 test_lib11sht.c lib11sht.c -lm -o test_lib11sht
* Run: ./test_lib11sht
* Exit 0 if all tests pass, 1 if any check failed (all tests still run).
*/
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include "lib11sht.h"
/* Number of CHECK() conditions that evaluated false so far. */
static int fails = 0;

/* Print the label first, then evaluate cond exactly once and print PASS or
 * FAIL; a false condition also increments `fails`. Execution continues
 * after a failure. */
#define CHECK(cond, msg) do { \
        printf(" %s ... ", msg); \
        if(!(cond)) { \
            printf("FAIL\n"); \
            fails++; \
        } else { \
            printf("PASS\n"); \
        } \
    } while(0)
/* Write `count` copies of ch into buf and NUL-terminate at buf[count].
 * buf must have room for count + 1 bytes. */
static void fill_run(char *buf, size_t count, char ch)
{
    memset(buf, ch, count);
    buf[count] = '\0';
}

/* Runs every test group in order, printing one PASS/FAIL line per check.
 * Returns 0 when no check failed, 1 otherwise. */
int main(void)
{
    int r;
    float ratio;
    char s1[LEVN_SBUFF], s2[LEVN_SBUFF];

    /* ----- UPPER bound tests ----- */

    /* Test 1: two identical 399-char ASCII strings, longer than LEVN_SBUFF.
     * Must not crash and must compare equal. */
    {
        char left[400], right[400];

        fill_run(left, 399, 'x');
        fill_run(right, 399, 'x');
        printf("Test 1: 400-char identical ASCII strings (>LEVN_SBUFF)\n");
        errno = 0;
        r = sequal(left, right, 0.85);
        CHECK(errno == 0 && r == 0, "no errno, returns 0 (equal-after-truncate)");
    }

    /* Test 2: over-length strings with no character in common. */
    {
        char xs[400], ys[400];

        fill_run(xs, 399, 'x');
        fill_run(ys, 399, 'y');
        printf("Test 2: 400 'x' vs 400 'y' (all different, over-length)\n");
        errno = 0;
        r = sequal(xs, ys, 0.85);
        CHECK(errno == 0 && r != 0, "no errno, returns non-zero (different)");
    }

    /* Test 3: LEVN_SBUFF-1 chars plus NUL fills a LEVN_SBUFF buffer exactly. */
    {
        char left[LEVN_SBUFF], right[LEVN_SBUFF];

        fill_run(left, LEVN_SBUFF - 1, 'a');
        fill_run(right, LEVN_SBUFF - 1, 'a');
        printf("Test 3: exactly LEVN_SBUFF-1 chars + NUL\n");
        errno = 0;
        r = sequal(left, right, 0.85);
        CHECK(errno == 0 && r == 0, "no errno, returns 0 (equal at boundary)");
    }

    /* Test 4: 240 ASCII chars followed by six A-tilde characters, so the
     * multi-byte part sits close to the LEVN_SBUFF boundary. */
    {
        char left[400] = "", right[400] = "";

        fill_run(left, 240, 'a');
        fill_run(right, 240, 'a');
        strcat(left, "ÃÃÃÃÃÃ");
        strcat(right, "ÃÃÃÃÃÃ");
        printf("Test 4: 240 'a' + 6x A-tilde (multi-byte near boundary)\n");
        errno = 0;
        r = sequal(left, right, 0.85);
        CHECK(errno == 0 && r == 0, "no errno, returns 0 (equal)");
    }

    /* Test 5: sequal_full with caller buffers much smaller than the input;
     * the normalized copies must stay inside the 64-byte buffers. */
    {
        char small1[64], small2[64];
        char left[300], right[300];

        fill_run(left, 299, 'a');
        fill_run(right, 299, 'a');
        printf("Test 5: sequal_full with 64-byte buffers + 300-char input\n");
        errno = 0;
        r = sequal_full(left, right, 0.85, &ratio,
                        small1, sizeof(small1), small2, sizeof(small2));
        CHECK(errno == 0 && r == 0, "no errno, returns 0 (equal after truncation to 63 chars)");
        CHECK(strlen(small1) <= sizeof(small1) - 1, "small1 NUL-terminated within buffer");
        CHECK(strlen(small2) <= sizeof(small2) - 1, "small2 NUL-terminated within buffer");
    }

    /* ----- LOWER bound tests ----- */

    /* Test 6: each pointer argument set to NULL in turn; every call must
     * report errno = EINVAL and return 0. */
    {
        printf("Test 6: NULL inputs via sequal_full\n");

        errno = 0;
        r = sequal_full(NULL, "x", 0.85, &ratio, s1, LEVN_SBUFF, s2, LEVN_SBUFF);
        CHECK(errno == EINVAL && r == 0, "NULL a: errno=EINVAL, returns 0");

        errno = 0;
        r = sequal_full("x", NULL, 0.85, &ratio, s1, LEVN_SBUFF, s2, LEVN_SBUFF);
        CHECK(errno == EINVAL && r == 0, "NULL b: errno=EINVAL, returns 0");

        errno = 0;
        r = sequal_full("x", "y", 0.85, NULL, s1, LEVN_SBUFF, s2, LEVN_SBUFF);
        CHECK(errno == EINVAL && r == 0, "NULL ratio: errno=EINVAL, returns 0");

        errno = 0;
        r = sequal_full("x", "y", 0.85, &ratio, NULL, LEVN_SBUFF, s2, LEVN_SBUFF);
        CHECK(errno == EINVAL && r == 0, "NULL s1: errno=EINVAL, returns 0");

        errno = 0;
        r = sequal_full("x", "y", 0.85, &ratio, s1, LEVN_SBUFF, NULL, LEVN_SBUFF);
        CHECK(errno == EINVAL && r == 0, "NULL s2: errno=EINVAL, returns 0");
    }

    /* Test 7: a zero buffer size is rejected the same way as a NULL. */
    {
        printf("Test 7: zero-size buffers\n");

        errno = 0;
        r = sequal_full("x", "y", 0.85, &ratio, s1, 0, s2, LEVN_SBUFF);
        CHECK(errno == EINVAL && r == 0, "s1_size=0: errno=EINVAL, returns 0");

        errno = 0;
        r = sequal_full("x", "y", 0.85, &ratio, s1, LEVN_SBUFF, s2, 0);
        CHECK(errno == EINVAL && r == 0, "s2_size=0: errno=EINVAL, returns 0");
    }

    /* Test 8: shortest possible strings and the two threshold extremes. */
    {
        printf("Test 8: minimal strings + threshold edge cases\n");

        /* one character, equal */
        errno = 0;
        r = sequal("a", "a", 0.85);
        CHECK(r == 0 && errno == 0, "'a' vs 'a' -> 0 (equal)");

        /* one character, different */
        errno = 0;
        r = sequal("a", "b", 0.85);
        CHECK(r != 0 && errno == 0, "'a' vs 'b' -> non-zero (different)");

        /* threshold 0.0: any ratio above zero counts as a match */
        r = sequal("hello", "world", 0.0);
        CHECK(r == 0, "shold=0.0 makes any non-empty pair 'match'");

        /* threshold 1.0: only an exact match after normalization counts */
        r = sequal("hello", "Hello", 1.0);
        CHECK(r == 0, "shold=1.0 + case-only diff still matches (normalized exact)");
        r = sequal("hello", "world", 1.0);
        CHECK(r != 0, "shold=1.0 + completely different -> non-zero");

        /* strings made only of whitespace */
        r = sequal(" ", " ", 0.85);
        CHECK(r == 0, "whitespace-only strings normalize equal");
    }

    /* Summary */
    printf("\n");
    if(fails != 0)
    {
        printf("%d TEST(S) FAILED\n", fails);
        return 1;
    }
    printf("ALL TESTS PASSED\n");
    return 0;
}