Compare commits
7 Commits
d48c8c1c36
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| 902e29dc93 | |||
| 74a5ef63b0 | |||
| 8c14340adc | |||
| f615ee0242 | |||
| b046872fd8 | |||
| 97076a2d48 | |||
| e9d00dc619 |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,2 +1,4 @@
|
||||
untracked/
|
||||
cmp11sht
|
||||
new file: test_lib11sht.c
|
||||
test_lib11sht
|
||||
|
||||
12
cmp11sht.c
12
cmp11sht.c
@@ -1,6 +1,6 @@
|
||||
/* ************************************************************************ *
|
||||
* cmp11sht.c, v20251221.085434 *
|
||||
* Fuzzy comparison CLI — thin shell over lib11sht *
|
||||
* Fuzzy comparison CLI -- thin shell over lib11sht *
|
||||
* *
|
||||
* Copyright (C) 2025 by Ruben Carlo Benante <rcb@beco.cc> *
|
||||
* GNU GPL version 2 or later. *
|
||||
@@ -92,11 +92,11 @@ cmp11sht v20251221.0718 (C) 2025 by Ruben C. Benante (MIT Lic)\n\n"
|
||||
res = sequal_full(argv[1], argv[2], delta, &ratio,
|
||||
s1, LEVN_SBUFF, s2, LEVN_SBUFF);
|
||||
if(errno == EINVAL)
|
||||
res = 3; /* error → CLI exit 3 */
|
||||
res = 3; /* error -> CLI exit 3 */
|
||||
else if(res > 0)
|
||||
res = 1; /* a > b → CLI exit 1 */
|
||||
res = 1; /* a > b -> CLI exit 1 */
|
||||
else if(res < 0)
|
||||
res = 2; /* a < b → CLI exit 2 */
|
||||
res = 2; /* a < b -> CLI exit 2 */
|
||||
/* res == 0 stays 0 (equal) */
|
||||
if(opt==3) printf("result: ");
|
||||
if(opt) printf("%d\n", res);
|
||||
@@ -109,9 +109,9 @@ cmp11sht v20251221.0718 (C) 2025 by Ruben C. Benante (MIT Lic)\n\n"
|
||||
if(opt==3) printf("cmp11sht: float\n");
|
||||
res = fequal(a, b, delta);
|
||||
if(res > 0)
|
||||
res = 1; /* a > b → CLI exit 1 */
|
||||
res = 1; /* a > b -> CLI exit 1 */
|
||||
else if(res < 0)
|
||||
res = 2; /* a < b → CLI exit 2 */
|
||||
res = 2; /* a < b -> CLI exit 2 */
|
||||
/* res == 0 stays 0 (equal) */
|
||||
if(opt==3) printf("result: ");
|
||||
if(opt) printf("%d\n", res);
|
||||
|
||||
11
lib11sht.c
11
lib11sht.c
@@ -14,10 +14,9 @@
|
||||
|
||||
#include "lib11sht.h"
|
||||
|
||||
/* Internal helpers — kept private to this translation unit. */
|
||||
/* Internal helpers -- kept private to this translation unit. */
|
||||
static int uselesschar(int c);
|
||||
/* trim is public (declared in lib11sht.h) */
|
||||
static void asciify(const char *src, char *dest, size_t dest_size);
|
||||
/* trim and asciify are public (declared in lib11sht.h) */
|
||||
static int ulen(unsigned char c);
|
||||
static float shit11(char *s1, char *s2);
|
||||
static float fmin3(float a, float b, float c);
|
||||
@@ -111,7 +110,7 @@ static float shit11(char *s1, char *s2)
|
||||
return 0.0; /* one empty == no similarity */
|
||||
|
||||
/* Defensive cap: refuse to allocate too-large VLA. Bounds the stack
|
||||
* matrix at LEVN_SBUFF × LEVN_SBUFF × sizeof(int) = ~256 KB worst case. */
|
||||
* matrix at LEVN_SBUFF * LEVN_SBUFF * sizeof(int) = ~256 KB worst case. */
|
||||
if(len1 >= LEVN_SBUFF || len2 >= LEVN_SBUFF)
|
||||
return 0.0; /* treat as "completely dissimilar" */
|
||||
|
||||
@@ -210,7 +209,7 @@ static int ulen(unsigned char c)
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
static void asciify(const char *src, char *dest, size_t dest_size)
|
||||
void asciify(const char *src, char *dest, size_t dest_size)
|
||||
{
|
||||
int len, i, k, j, found;
|
||||
char ch[5]; /* UTF8 multibyte char */
|
||||
@@ -218,7 +217,7 @@ static void asciify(const char *src, char *dest, size_t dest_size)
|
||||
"AEIOUAEIOUAEIOUAEIOUAEIOU"
|
||||
"aeiouaeiouaeiouaeiouaeiou"
|
||||
"aoCcNn123"
|
||||
" "; /* NBSP → space */
|
||||
" "; /* NBSP -> space */
|
||||
const char *translit[] = {
|
||||
"Á","É","Í","Ó","Ú", "À","È","Ì","Ò","Ù",
|
||||
"Ã","Ẽ","Ĩ","Õ","Ũ", "Â","Ê","Î","Ô","Û",
|
||||
|
||||
62
lib11sht.h
62
lib11sht.h
@@ -13,18 +13,43 @@
|
||||
|
||||
#define LEVN_SBUFF 256 /* recommended size for s1/s2 buffers */
|
||||
|
||||
/* ************************************************************************ *
|
||||
* ERROR CONVENTION (read before using sequal / sequal_full) *
|
||||
* ************************************************************************ *
|
||||
*
|
||||
* sequal() and sequal_full() return -1 / 0 / +1 for the three comparison
|
||||
* outcomes (a<b / equal-or-similar / a>b). On error (NULL inputs, zero
|
||||
* buffer size), they set errno = EINVAL and return 0.
|
||||
*
|
||||
* The return value 0 is shared by "match" and "error". To distinguish,
|
||||
* callers MUST reset errno = 0 before the call and check it after:
|
||||
*
|
||||
* errno = 0;
|
||||
* int r = sequal(a, b, 0.85f);
|
||||
* if(errno == EINVAL) { ... error handling ... }
|
||||
* else if(r == 0) { ... match ... }
|
||||
* else { ... a<b or a>b ... }
|
||||
*
|
||||
* Forgetting the errno=0 reset means errno from a previous failed call
|
||||
* (anywhere in the program) could be mistaken for a sequal error. This
|
||||
* follows the same pattern as strtol(), but be deliberate about it.
|
||||
*
|
||||
* ************************************************************************ */
|
||||
|
||||
/* Compare similarity between two strings (after asciify + trim + lowercase).
|
||||
* Symmetric in shape with fequal(a, b, delta).
|
||||
* Returns:
|
||||
* 0 if equal or similar above the lratio threshold
|
||||
* 0 if equal or similar above the shold threshold
|
||||
* -1 if a < b alphabetically (after normalization)
|
||||
* +1 if a > b alphabetically (after normalization)
|
||||
* On error: sets errno = EINVAL and returns 0; comparison result is undefined.
|
||||
* Caller must reset errno = 0 before the call to detect errors.
|
||||
* On error: sets errno = EINVAL and returns 0 -- see ERROR CONVENTION above.
|
||||
*
|
||||
* Parameters:
|
||||
* a, b input strings (NUL-terminated, may contain UTF-8 accented Latin chars)
|
||||
* shold Levenshtein similarity threshold 0.0..1.0; matches above this count as equal
|
||||
* a, b input strings (NUL-terminated, may contain UTF-8 accented Latin chars).
|
||||
* Strings longer than LEVN_SBUFF-1 chars are silently truncated to
|
||||
* their first LEVN_SBUFF-1 chars for comparison.
|
||||
* shold similarity threshold 0.0..1.0; matches above this count as equal.
|
||||
* Use 1.0 for strict mode (no fuzzy fallback, only exact-after-normalize).
|
||||
*/
|
||||
int sequal(char *a, char *b, float shold);
|
||||
|
||||
@@ -37,12 +62,23 @@ int sequal(char *a, char *b, float shold);
|
||||
* s1, s2 out: caller-provided buffers filled with the normalized inputs
|
||||
* s1_size size of s1 in bytes (writes capped at s1_size-1 + final NUL)
|
||||
* s2_size size of s2 in bytes (writes capped at s2_size-1 + final NUL)
|
||||
*
|
||||
* TRUNCATION SEMANTICS: when an input is longer than its buffer, only the
|
||||
* first (s_size-1) bytes of its normalized form participate in the
|
||||
* comparison. The Levenshtein ratio in *ratio is computed on the
|
||||
* normalized contents of s1 / s2 (i.e. on the possibly-truncated buffer
|
||||
* data), NOT on the original a / b strings. To compare without truncation,
|
||||
* pass buffers at least as large as the longest input -- LEVN_SBUFF (256)
|
||||
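* is the recommended floor. NOTE(review): shit11() in lib11sht.c returns 0.0 when either normalized string is >= LEVN_SBUFF chars, so larger buffers presumably do not extend fuzzy matching -- confirm.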
|
||||
*
|
||||
* On error: same convention as sequal -- sets errno = EINVAL and returns 0;
|
||||
* *ratio, s1, s2 are not modified in that case. See ERROR CONVENTION above.
|
||||
*/
|
||||
int sequal_full(char *a, char *b, float shold, float *ratio,
|
||||
char *s1, size_t s1_size,
|
||||
char *s2, size_t s2_size);
|
||||
|
||||
/* Compare two floats within ±delta.
|
||||
/* Compare two floats within +/-delta.
|
||||
* Returns:
|
||||
* 0 if |a - b| <= delta
|
||||
* -1 if a < b - delta
|
||||
@@ -56,4 +92,18 @@ int fequal(float a, float b, float delta);
|
||||
* be NUL-terminated. */
|
||||
void trim(char *s);
|
||||
|
||||
/* Transliterate accented Latin chars to plain ASCII. Walks src as UTF-8,
|
||||
* mapping known accented chars (a-acute, c-cedilla, n-tilde, NBSP, ...)
|
||||
* to their ASCII equivalents and copying ASCII bytes verbatim. Output is
|
||||
* always pure ASCII and NUL-terminated. Non-ASCII bytes that don't match the
|
||||
* transliteration table are skipped.
|
||||
*
|
||||
* Parameters:
|
||||
* src input UTF-8 string (NUL-terminated)
|
||||
* dest output buffer (filled with NUL-terminated ASCII)
|
||||
* dest_size size of dest in bytes (writes capped at dest_size-1 + final NUL)
|
||||
*
|
||||
* Safe for NULL / zero-size args (no-op). */
|
||||
void asciify(const char *src, char *dest, size_t dest_size);
|
||||
|
||||
#endif /* LIB11SHT_H */
|
||||
|
||||
37
makefile
Normal file
37
makefile
Normal file
@@ -0,0 +1,37 @@
|
||||
# Copyright (C) 2026 by Prof. Dr. Ruben Carlo Benante
|
||||
# levenshtein library -- makefile
|
||||
#
|
||||
# Usage:
|
||||
# make build both: cmp11sht (CLI) and test_lib11sht
|
||||
# make cmp11sht build only the CLI
|
||||
# make test build + run the stress test suite
|
||||
# make clean remove built binaries
|
||||
#
|
||||
# Uses clang with strict warnings to catch bugs early.
|
||||
|
||||
CC = clang
|
||||
CFLAGS = -std=gnu17 -O2 -g \
|
||||
-Wall -Wextra -Wpedantic \
|
||||
-Wshadow -Wpointer-arith -Wcast-qual -Wundef \
|
||||
-Wstrict-prototypes -Wmissing-prototypes \
|
||||
-fdiagnostics-color=always
|
||||
LDLIBS = -lm
|
||||
|
||||
LIB_SRC = lib11sht.c
|
||||
LIB_HDR = lib11sht.h
|
||||
|
||||
.PHONY: all test clean
|
||||
|
||||
all: cmp11sht test_lib11sht
|
||||
|
||||
cmp11sht: cmp11sht.c $(LIB_SRC) $(LIB_HDR)
|
||||
$(CC) $(CFLAGS) cmp11sht.c $(LIB_SRC) $(LDLIBS) -o $@
|
||||
|
||||
test_lib11sht: test_lib11sht.c $(LIB_SRC) $(LIB_HDR)
|
||||
$(CC) $(CFLAGS) test_lib11sht.c $(LIB_SRC) $(LDLIBS) -o $@
|
||||
|
||||
test: test_lib11sht
|
||||
./test_lib11sht
|
||||
|
||||
clean:
|
||||
rm -f cmp11sht test_lib11sht
|
||||
177
test_lib11sht.c
Normal file
177
test_lib11sht.c
Normal file
@@ -0,0 +1,177 @@
|
||||
/* Stress tests for lib11sht -- aim to break upper / lower bounds.
|
||||
* Compile: gcc -Wall -Wextra -O2 test_lib11sht.c lib11sht.c -lm -o test_lib11sht
|
||||
* Run: ./test_lib11sht
|
||||
* Exit 0 if all tests pass, 1 if any test failed (every test always runs).
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include "lib11sht.h"
|
||||
|
||||
static int fails = 0;
|
||||
|
||||
#define CHECK(cond, msg) do { \
|
||||
printf(" %s ... ", msg); \
|
||||
if(cond) { printf("PASS\n"); } else { printf("FAIL\n"); fails++; } \
|
||||
} while(0)
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int r;
|
||||
float ratio;
|
||||
char s1[LEVN_SBUFF], s2[LEVN_SBUFF];
|
||||
|
||||
/* ----- UPPER bound tests ----- */
|
||||
|
||||
/* Test 1: pure ASCII input longer than LEVN_SBUFF. Should NOT crash,
|
||||
* should truncate cleanly. Build a 399-char string (400 bytes with NUL). */
|
||||
{
|
||||
char long_a[400], long_b[400];
|
||||
int i;
|
||||
for(i = 0; i < 399; i++) { long_a[i] = 'x'; long_b[i] = 'x'; }
|
||||
long_a[399] = long_b[399] = '\0';
|
||||
|
||||
printf("Test 1: 400-char identical ASCII strings (>LEVN_SBUFF)\n");
|
||||
errno = 0;
|
||||
r = sequal(long_a, long_b, 0.85);
|
||||
CHECK(errno == 0 && r == 0, "no errno, returns 0 (equal-after-truncate)");
|
||||
}
|
||||
|
||||
/* Test 2: 399 'x' vs 399 'y' (400-byte buffers). Truncated to LEVN_SBUFF-1 (255) chars
|
||||
* each. Should differ. */
|
||||
{
|
||||
char xs[400], ys[400];
|
||||
int i;
|
||||
for(i = 0; i < 399; i++) { xs[i] = 'x'; ys[i] = 'y'; }
|
||||
xs[399] = ys[399] = '\0';
|
||||
|
||||
printf("Test 2: 400 'x' vs 400 'y' (all different, over-length)\n");
|
||||
errno = 0;
|
||||
r = sequal(xs, ys, 0.85);
|
||||
CHECK(errno == 0 && r != 0, "no errno, returns non-zero (different)");
|
||||
}
|
||||
|
||||
/* Test 3: exact-boundary length: 255 chars + NUL = 256 = LEVN_SBUFF */
|
||||
{
|
||||
char a[LEVN_SBUFF], b[LEVN_SBUFF];
|
||||
int i;
|
||||
for(i = 0; i < LEVN_SBUFF - 1; i++) { a[i] = 'a'; b[i] = 'a'; }
|
||||
a[LEVN_SBUFF - 1] = b[LEVN_SBUFF - 1] = '\0';
|
||||
|
||||
printf("Test 3: exactly LEVN_SBUFF-1 chars + NUL\n");
|
||||
errno = 0;
|
||||
r = sequal(a, b, 0.85);
|
||||
CHECK(errno == 0 && r == 0, "no errno, returns 0 (equal at boundary)");
|
||||
}
|
||||
|
||||
/* Test 4: long input WITH accents -- UTF-8 multi-byte starting at offset 240.
|
||||
* Should asciify each accent to ASCII, total visible chars < 256. */
|
||||
{
|
||||
char a[400] = "", b[400] = "";
|
||||
int i;
|
||||
/* Fill with 240 'a' then put 6 A-tilde chars (UTF-8: 0xC3 0x83 each, 12 bytes) */
|
||||
for(i = 0; i < 240; i++) { a[i] = 'a'; b[i] = 'a'; }
|
||||
a[240] = b[240] = '\0';
|
||||
strcat(a, "ÃÃÃÃÃÃ"); /* +12 bytes UTF-8, 6 chars visible -> asciify to 'AAAAAA' */
|
||||
strcat(b, "ÃÃÃÃÃÃ");
|
||||
printf("Test 4: 240 'a' + 6x A-tilde (multi-byte near boundary)\n");
|
||||
errno = 0;
|
||||
r = sequal(a, b, 0.85);
|
||||
CHECK(errno == 0 && r == 0, "no errno, returns 0 (equal)");
|
||||
}
|
||||
|
||||
/* Test 5: full-detail call with undersized buffers -- this is the actual
|
||||
* realistic break path. Pass 64-byte buffers to sequal_full. */
|
||||
{
|
||||
char small1[64], small2[64];
|
||||
char a[300], b[300];
|
||||
int i;
|
||||
for(i = 0; i < 299; i++) { a[i] = 'a'; b[i] = 'a'; }
|
||||
a[299] = b[299] = '\0';
|
||||
|
||||
printf("Test 5: sequal_full with 64-byte buffers + 300-char input\n");
|
||||
errno = 0;
|
||||
r = sequal_full(a, b, 0.85, &ratio, small1, sizeof(small1), small2, sizeof(small2));
|
||||
CHECK(errno == 0 && r == 0, "no errno, returns 0 (equal after truncation to 63 chars)");
|
||||
CHECK(strlen(small1) <= sizeof(small1) - 1, "small1 NUL-terminated within buffer");
|
||||
CHECK(strlen(small2) <= sizeof(small2) - 1, "small2 NUL-terminated within buffer");
|
||||
}
|
||||
|
||||
/* ----- LOWER bound tests ----- */
|
||||
|
||||
/* Test 6: NULL inputs -- should set errno=EINVAL and return 0 */
|
||||
{
|
||||
printf("Test 6: NULL inputs via sequal_full\n");
|
||||
errno = 0;
|
||||
r = sequal_full(NULL, "x", 0.85, &ratio, s1, LEVN_SBUFF, s2, LEVN_SBUFF);
|
||||
CHECK(errno == EINVAL && r == 0, "NULL a: errno=EINVAL, returns 0");
|
||||
|
||||
errno = 0;
|
||||
r = sequal_full("x", NULL, 0.85, &ratio, s1, LEVN_SBUFF, s2, LEVN_SBUFF);
|
||||
CHECK(errno == EINVAL && r == 0, "NULL b: errno=EINVAL, returns 0");
|
||||
|
||||
errno = 0;
|
||||
r = sequal_full("x", "y", 0.85, NULL, s1, LEVN_SBUFF, s2, LEVN_SBUFF);
|
||||
CHECK(errno == EINVAL && r == 0, "NULL ratio: errno=EINVAL, returns 0");
|
||||
|
||||
errno = 0;
|
||||
r = sequal_full("x", "y", 0.85, &ratio, NULL, LEVN_SBUFF, s2, LEVN_SBUFF);
|
||||
CHECK(errno == EINVAL && r == 0, "NULL s1: errno=EINVAL, returns 0");
|
||||
|
||||
errno = 0;
|
||||
r = sequal_full("x", "y", 0.85, &ratio, s1, LEVN_SBUFF, NULL, LEVN_SBUFF);
|
||||
CHECK(errno == EINVAL && r == 0, "NULL s2: errno=EINVAL, returns 0");
|
||||
}
|
||||
|
||||
/* Test 7: zero-size buffers -- should set errno=EINVAL */
|
||||
{
|
||||
printf("Test 7: zero-size buffers\n");
|
||||
errno = 0;
|
||||
r = sequal_full("x", "y", 0.85, &ratio, s1, 0, s2, LEVN_SBUFF);
|
||||
CHECK(errno == EINVAL && r == 0, "s1_size=0: errno=EINVAL, returns 0");
|
||||
|
||||
errno = 0;
|
||||
r = sequal_full("x", "y", 0.85, &ratio, s1, LEVN_SBUFF, s2, 0);
|
||||
CHECK(errno == EINVAL && r == 0, "s2_size=0: errno=EINVAL, returns 0");
|
||||
}
|
||||
|
||||
/* Test 8: minimum non-empty strings + threshold extremes */
|
||||
{
|
||||
printf("Test 8: minimal strings + threshold edge cases\n");
|
||||
|
||||
/* single char equal */
|
||||
errno = 0;
|
||||
r = sequal("a", "a", 0.85);
|
||||
CHECK(r == 0 && errno == 0, "'a' vs 'a' -> 0 (equal)");
|
||||
|
||||
/* single char different */
|
||||
errno = 0;
|
||||
r = sequal("a", "b", 0.85);
|
||||
CHECK(r != 0 && errno == 0, "'a' vs 'b' -> non-zero (different)");
|
||||
|
||||
/* threshold = 0.0: any non-zero similarity matches -> 0 even when different */
|
||||
r = sequal("hello", "world", 0.0);
|
||||
CHECK(r == 0, "shold=0.0 makes any non-empty pair 'match'");
|
||||
|
||||
/* threshold = 1.0: only exact-after-normalize -> strict strcmp */
|
||||
r = sequal("hello", "Hello", 1.0);
|
||||
CHECK(r == 0, "shold=1.0 + case-only diff still matches (normalized exact)");
|
||||
|
||||
r = sequal("hello", "world", 1.0);
|
||||
CHECK(r != 0, "shold=1.0 + completely different -> non-zero");
|
||||
|
||||
/* whitespace-only strings */
|
||||
r = sequal(" ", " ", 0.85);
|
||||
CHECK(r == 0, "whitespace-only strings normalize equal");
|
||||
}
|
||||
|
||||
/* Summary */
|
||||
printf("\n");
|
||||
if(fails == 0) {
|
||||
printf("ALL TESTS PASSED\n");
|
||||
return 0;
|
||||
} else {
|
||||
printf("%d TEST(S) FAILED\n", fails);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user