improvements

This commit is contained in:
Fritz Grimpen 2025-02-06 22:00:22 +01:00
parent 44f7981013
commit 27bfbe6ef4

95
regex.c
View file

@ -3,8 +3,6 @@
#include <stdint.h> #include <stdint.h>
#include <stdio.h> #include <stdio.h>
// TODO POSIX ERE use "[]]" for plain ']'
// Checks if the first char in needle matches on c // Checks if the first char in needle matches on c
// needle may begin with a plain character or a character class // needle may begin with a plain character or a character class
bool char_match(char c, char *needle, char **nneedl) bool char_match(char c, char *needle, char **nneedl)
@ -16,27 +14,26 @@ bool char_match(char c, char *needle, char **nneedl)
m = false; m = false;
if (*needle == '[') if (*needle == '[')
{ {
// match against character class
needle++; needle++;
neg = *needle == '^'; neg = *needle == '^';
if (neg) if (neg)
needle++; needle++;
for (db = 1; *needle && (*needle != ']' || db); needle++) for (db = 1; *needle && (*needle != ']' || db); needle++)
{ {
if (db > 1 && ((*needle == c) != neg)) if ((*needle == c) != neg)
m = true; m = true;
else if (db > 1) if (db)
db--; db--;
else if (*needle == '\\')
db++;
else if (*needle == ']')
db--;
else if ((*needle == c) != neg)
m = true;
} }
} }
else if (*needle == '$') else if (*needle == '$')
{
m = (c == 0); m = (c == 0);
else if (*needle == '.') while (*needle) needle++;
if (nneedl) *nneedl = needle;
}
else if (*needle == '.' || *needle == 0)
m = true; m = true;
else else
m = (*needle == c); m = (*needle == c);
@ -50,15 +47,14 @@ char *next_var(char *needle)
{ {
for (int dp = 0, db = 0; for (int dp = 0, db = 0;
*needle && (*needle != '|' || dp > 0 || db > 0); *needle && (*needle != '|' || dp > 0 || db > 0);
needle++) { needle++)
if (db && *needle == ']') {
if (db > 1)
db--; db--;
else if (db == 1 && *needle == '\\') else if (db && *needle == ']')
db++;
else if (db == 2)
db--; db--;
else if (!db && *needle == '[') else if (!db && *needle == '[')
db++; db = 2;
else if (!db && *needle == '(') else if (!db && *needle == '(')
dp++; dp++;
else if (!db && *needle == ')') else if (!db && *needle == ')')
@ -77,18 +73,12 @@ char *next_char(char *needle)
int db; int db;
if (*needle && *needle != '[') if (*needle && *needle != '[')
return needle + 1; return needle + 1;
for (db = (*needle == '[' ? 1 : -1); for (db = (*needle == '[' ? 2 : 0);
*needle && db; *needle && (*needle != ']' || db);
needle++) needle++)
{ {
if (*needle == ']') if (db > 1)
db--; db--;
else if (db == 1 && *needle == '\\')
db++;
else if (db > 1)
db--;
else if (db < 0)
db++;
} }
return needle; return needle;
} }
@ -116,25 +106,17 @@ char *start_of_group(char *needle0, char *needle)
else if (!db && *needle == '(') else if (!db && *needle == '(')
dp--; dp--;
else if (!db && *needle == ']') else if (!db && *needle == ']')
db++; db = 1;
else if (db && *needle == '[') else if (db && *needle == '[')
db--; db = 0;
else if (db && *needle == '\\')
db--;
} }
return needle; return needle;
} }
char quant(char c) char quant(char c)
{ {
switch (c) char *r = strchr("*+?", c);
{ return r ? *r : 0;
case '*':
case '+':
case '?':
return c;
}
return 0;
} }
// Tries to prefix-match the regex needle on haystack // Tries to prefix-match the regex needle on haystack
@ -152,24 +134,21 @@ bool regex_match(char *haystack, char *needle)
states = strlen(needle) + 1; states = strlen(needle) + 1;
// active states // active states
bool bv[states] = { }; bool bv[states] = { };
// next iteration active states
bool bvtmp[states] = { };
// acceptance state // acceptance state
accept = states - 1; accept = states - 1;
na = false; na = false;
// first, deactivate all states // first, deactivate all states
for (size_t st = 0; st < states; st++) for (st = 0; st < states; st++)
bv[st] = false; bv[st] = false;
// then, find all top-level variants, and activate them // then, find all top-level variants, and activate them
for (char *var = needle; *var; var = next_var(var)) { for (var = needle; *var; var = next_var(var))
bv[var - needle] = true; bv[var - needle] = true;
}
// iterate over the haystack until accepting // iterate over the haystack until accepting
do { do {
// handle groups, variants, and quantifier // handle groups, variants, and quantifier
for (size_t st = 0; st < states - 1; st++) for (st = 0; st < states - 1; st++)
{ {
if (!bv[st]) if (!bv[st])
continue; continue;
@ -193,19 +172,20 @@ bool regex_match(char *haystack, char *needle)
// then, handle quantifiers // then, handle quantifiers
fprintf(stderr, "quant %c\n", q); fprintf(stderr, "quant %c\n", q);
fprintf(stderr, "suffix \"%s\"\n", needle + (q ? st + 2 : st + 1)); fprintf(stderr, "suffix \"%s\"\n", needle + (q ? st + 2 : st + 1));
if (q == '+' || q == '*') nneedl = start_of_group(needle, needle + st);
if ((q == '+' || q == '*') && !bv[nneedl - needle])
{ {
// spicy: if the quantifier allows multiple occurrences, we have to activate a previous state // spicy: if the quantifier allows multiple occurrences, we have to activate a previous state
nneedl = start_of_group(needle, needle + st);
fprintf(stderr, "start of group: \"%s\"\n", nneedl); fprintf(stderr, "start of group: \"%s\"\n", nneedl);
bv[nneedl - needle] = true; bv[nneedl - needle] = true;
st = nneedl - needle - 1; st = nneedl - needle - 1;
} }
else
bv[nneedl - needle] = false;
} }
else if (needle[st] == '(') else if (needle[st] == '(')
{ {
// at start of group, look at variants and quantifier // at start of group, look at variants and quantifier
bv[st] = false;
for (var = needle + st + 1; *var && *var != ')'; var = next_var(var)) for (var = needle + st + 1; *var && *var != ')'; var = next_var(var))
bv[var - needle] = true; bv[var - needle] = true;
if (*var == ')') if (*var == ')')
@ -223,31 +203,30 @@ bool regex_match(char *haystack, char *needle)
bv[nneedl + 1 - needle] = true; bv[nneedl + 1 - needle] = true;
} }
} }
memcpy(bvtmp, bv, sizeof(bv));
// actual match, backwards // actual match, backwards
na = true; na = true;
for (st = states; st > 0; st--) for (st = accept; st > 0; st--)
{ {
if (!bv[st-1]) // skip beginnings of groups b/c they have been handled before
if (!bv[st-1] || needle[st-1] == '(')
continue; continue;
na = false;
// perform match for active state // perform match for active state
fprintf(stderr, "state %lu: \"%s\" ~ \"%s\"\n", st-1, haystack, needle + st - 1); bv[st-1] = false;
bvtmp[st-1] = false;
m = char_match(*haystack, needle + st-1, &nneedl); m = char_match(*haystack, needle + st-1, &nneedl);
q = quant(*nneedl); q = quant(*nneedl);
if (m || q == '*' || q == '?') fprintf(stderr, "state %lu: \"%s\" ~ \"%s\" => %b\n", st-1, haystack, needle + st - 1, m);
if (m)
{ {
na = false;
if (q) if (q)
nneedl++; nneedl++;
if (q == '*' || q == '+') if (q == '*' || q == '+')
bvtmp[st-1] = true; bv[st-1] = true;
bvtmp[nneedl - needle] = true; bv[nneedl - needle] = true;
} }
} }
memcpy(bv, bvtmp, sizeof(bv));
putc('\n', stderr); putc('\n', stderr);
// check if we have reached the end of haystack, and advance // check if we have reached the end of haystack, and advance if not
if (!*(haystack++)) if (!*(haystack++))
break; break;
} while (!bv[accept] && !na); } while (!bv[accept] && !na);