optimizations and fixes for Friday

This commit is contained in:
Fritz Grimpen 2025-02-06 18:54:31 +01:00
parent 29a99ffa9d
commit 44f7981013

70
regex.c
View file

@ -7,11 +7,13 @@
// Checks if the first char in needle matches on c // Checks if the first char in needle matches on c
// needle may begin with a plain character or a character class // needle may begin with a plain character or a character class
bool char_match(char c, char *needle) bool char_match(char c, char *needle, char **nneedl)
{ {
// depth of brackets // depth of brackets
int db; int db;
bool neg; bool neg;
bool m;
m = false;
if (*needle == '[') if (*needle == '[')
{ {
needle++; needle++;
@ -21,7 +23,7 @@ bool char_match(char c, char *needle)
for (db = 1; *needle && (*needle != ']' || db); needle++) for (db = 1; *needle && (*needle != ']' || db); needle++)
{ {
if (db > 1 && ((*needle == c) != neg)) if (db > 1 && ((*needle == c) != neg))
return true; m = true;
else if (db > 1) else if (db > 1)
db--; db--;
else if (*needle == '\\') else if (*needle == '\\')
@ -29,15 +31,18 @@ bool char_match(char c, char *needle)
else if (*needle == ']') else if (*needle == ']')
db--; db--;
else if ((*needle == c) != neg) else if ((*needle == c) != neg)
return true; m = true;
} }
return false;
} }
if (*needle == '$') else if (*needle == '$')
return c == 0; m = (c == 0);
else if (*needle == '.') else if (*needle == '.')
return true; m = true;
return *needle == c; else
m = (*needle == c);
if (*needle && nneedl)
*nneedl = needle + 1;
return m;
} }
// Find next variant, or closing parenthesis of open group, or end of string // Find next variant, or closing parenthesis of open group, or end of string
@ -100,6 +105,8 @@ char *end_of_group(char *needle)
// Returns opening parenthesis, or needle0 // Returns opening parenthesis, or needle0
char *start_of_group(char *needle0, char *needle) char *start_of_group(char *needle0, char *needle)
{ {
if (*needle == ')' && needle0 < needle)
needle--;
for (int dp = 0, db = 0; for (int dp = 0, db = 0;
needle > needle0 && (*needle != '(' || dp > 0 || db > 0 || *needle == ')'); needle > needle0 && (*needle != '(' || dp > 0 || db > 0 || *needle == ')');
needle--) needle--)
@ -118,20 +125,14 @@ char *start_of_group(char *needle0, char *needle)
return needle; return needle;
} }
// Determine quantifier for group, character class, or character char quant(char c)
// Returns quantifier character, or 0 if absent.
char quant(char *needle)
{ {
if (*needle == '(') switch (c)
needle = end_of_group(needle + 1);
else
needle = next_char(needle);
switch (*needle)
{ {
case '*': case '*':
case '+': case '+':
case '?': case '?':
return *needle; return c;
} }
return 0; return 0;
} }
@ -166,7 +167,7 @@ bool regex_match(char *haystack, char *needle)
} }
// iterate over the haystack until accepting // iterate over the haystack until accepting
for (; !bv[accept] && !na; haystack++) { do {
// handle groups, variants, and quantifier // handle groups, variants, and quantifier
for (size_t st = 0; st < states - 1; st++) for (size_t st = 0; st < states - 1; st++)
{ {
@ -186,7 +187,7 @@ bool regex_match(char *haystack, char *needle)
{ {
// at end of group, look for quantifier // at end of group, look for quantifier
bv[st] = false; bv[st] = false;
q = quant(needle + st); q = quant(needle[st+1]);
// first, activate subsequent state after group // first, activate subsequent state after group
bv[q ? st + 2 : st + 1] = true; bv[q ? st + 2 : st + 1] = true;
// then, handle quantifiers // then, handle quantifiers
@ -205,23 +206,23 @@ bool regex_match(char *haystack, char *needle)
{ {
// at start of group, look at variants and quantifier // at start of group, look at variants and quantifier
bv[st] = false; bv[st] = false;
for (var = needle + 1; *var && *var != ')'; var = next_var(var)) for (var = needle + st + 1; *var && *var != ')'; var = next_var(var))
bv[var - needle] = true; bv[var - needle] = true;
q = quant(var);
if (*var == ')') if (*var == ')')
var++; var++;
q = quant(*var);
if (q == '*' || q == '?') if (q == '*' || q == '?')
bv[var + 1 - needle] = true; bv[var + 1 - needle] = true;
} }
else if ((q = quant(needle + st)) && (q == '*' || q == '?')) else
{ {
// current needle has quantifier // check if current needle is optional
bv[next_char(needle + st) + 1 - needle] = true; nneedl = next_char(needle + st);
q = *nneedl;
if (q == '*' || q == '?')
bv[nneedl + 1 - needle] = true;
} }
} }
// check if we are only tidying up
if (!*haystack)
break;
memcpy(bvtmp, bv, sizeof(bv)); memcpy(bvtmp, bv, sizeof(bv));
// actual match, backwards // actual match, backwards
na = true; na = true;
@ -233,22 +234,23 @@ bool regex_match(char *haystack, char *needle)
// perform match for active state // perform match for active state
fprintf(stderr, "state %lu: \"%s\" ~ \"%s\"\n", st-1, haystack, needle + st - 1); fprintf(stderr, "state %lu: \"%s\" ~ \"%s\"\n", st-1, haystack, needle + st - 1);
bvtmp[st-1] = false; bvtmp[st-1] = false;
char q = quant(needle + st-1); m = char_match(*haystack, needle + st-1, &nneedl);
q = quant(needle + st-1); q = quant(*nneedl);
m = char_match(*haystack, needle + st-1);
if (m || q == '*' || q == '?') if (m || q == '*' || q == '?')
{ {
if (q == '*' || q == '+')
bvtmp[st-1] = true;
nneedl = next_char(needle + st-1);
if (q) if (q)
nneedl++; nneedl++;
if (q == '*' || q == '+')
bvtmp[st-1] = true;
bvtmp[nneedl - needle] = true; bvtmp[nneedl - needle] = true;
} }
} }
memcpy(bv, bvtmp, sizeof(bv)); memcpy(bv, bvtmp, sizeof(bv));
putc('\n', stderr); putc('\n', stderr);
} // check if we have reached the end of haystack, and advance
if (!*(haystack++))
break;
} while (!bv[accept] && !na);
return bv[accept]; return bv[accept];
} }