improvements
This commit is contained in:
parent
44f7981013
commit
27bfbe6ef4
1 changed files with 38 additions and 59 deletions
95
regex.c
95
regex.c
|
@ -3,8 +3,6 @@
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
// TODO POSIX ERE use "[]]" for plain ']'
|
|
||||||
|
|
||||||
// Checks if the first char in needle matches on c
|
// Checks if the first char in needle matches on c
|
||||||
// needle may begin with a plain character or a character class
|
// needle may begin with a plain character or a character class
|
||||||
bool char_match(char c, char *needle, char **nneedl)
|
bool char_match(char c, char *needle, char **nneedl)
|
||||||
|
@ -16,27 +14,26 @@ bool char_match(char c, char *needle, char **nneedl)
|
||||||
m = false;
|
m = false;
|
||||||
if (*needle == '[')
|
if (*needle == '[')
|
||||||
{
|
{
|
||||||
|
// match against character class
|
||||||
needle++;
|
needle++;
|
||||||
neg = *needle == '^';
|
neg = *needle == '^';
|
||||||
if (neg)
|
if (neg)
|
||||||
needle++;
|
needle++;
|
||||||
for (db = 1; *needle && (*needle != ']' || db); needle++)
|
for (db = 1; *needle && (*needle != ']' || db); needle++)
|
||||||
{
|
{
|
||||||
if (db > 1 && ((*needle == c) != neg))
|
if ((*needle == c) != neg)
|
||||||
m = true;
|
m = true;
|
||||||
else if (db > 1)
|
if (db)
|
||||||
db--;
|
db--;
|
||||||
else if (*needle == '\\')
|
|
||||||
db++;
|
|
||||||
else if (*needle == ']')
|
|
||||||
db--;
|
|
||||||
else if ((*needle == c) != neg)
|
|
||||||
m = true;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (*needle == '$')
|
else if (*needle == '$')
|
||||||
|
{
|
||||||
m = (c == 0);
|
m = (c == 0);
|
||||||
else if (*needle == '.')
|
while (*needle) needle++;
|
||||||
|
if (nneedl) *nneedl = needle;
|
||||||
|
}
|
||||||
|
else if (*needle == '.' || *needle == 0)
|
||||||
m = true;
|
m = true;
|
||||||
else
|
else
|
||||||
m = (*needle == c);
|
m = (*needle == c);
|
||||||
|
@ -50,15 +47,14 @@ char *next_var(char *needle)
|
||||||
{
|
{
|
||||||
for (int dp = 0, db = 0;
|
for (int dp = 0, db = 0;
|
||||||
*needle && (*needle != '|' || dp > 0 || db > 0);
|
*needle && (*needle != '|' || dp > 0 || db > 0);
|
||||||
needle++) {
|
needle++)
|
||||||
if (db && *needle == ']')
|
{
|
||||||
|
if (db > 1)
|
||||||
db--;
|
db--;
|
||||||
else if (db == 1 && *needle == '\\')
|
else if (db && *needle == ']')
|
||||||
db++;
|
|
||||||
else if (db == 2)
|
|
||||||
db--;
|
db--;
|
||||||
else if (!db && *needle == '[')
|
else if (!db && *needle == '[')
|
||||||
db++;
|
db = 2;
|
||||||
else if (!db && *needle == '(')
|
else if (!db && *needle == '(')
|
||||||
dp++;
|
dp++;
|
||||||
else if (!db && *needle == ')')
|
else if (!db && *needle == ')')
|
||||||
|
@ -77,18 +73,12 @@ char *next_char(char *needle)
|
||||||
int db;
|
int db;
|
||||||
if (*needle && *needle != '[')
|
if (*needle && *needle != '[')
|
||||||
return needle + 1;
|
return needle + 1;
|
||||||
for (db = (*needle == '[' ? 1 : -1);
|
for (db = (*needle == '[' ? 2 : 0);
|
||||||
*needle && db;
|
*needle && (*needle != ']' || db);
|
||||||
needle++)
|
needle++)
|
||||||
{
|
{
|
||||||
if (*needle == ']')
|
if (db > 1)
|
||||||
db--;
|
db--;
|
||||||
else if (db == 1 && *needle == '\\')
|
|
||||||
db++;
|
|
||||||
else if (db > 1)
|
|
||||||
db--;
|
|
||||||
else if (db < 0)
|
|
||||||
db++;
|
|
||||||
}
|
}
|
||||||
return needle;
|
return needle;
|
||||||
}
|
}
|
||||||
|
@ -116,25 +106,17 @@ char *start_of_group(char *needle0, char *needle)
|
||||||
else if (!db && *needle == '(')
|
else if (!db && *needle == '(')
|
||||||
dp--;
|
dp--;
|
||||||
else if (!db && *needle == ']')
|
else if (!db && *needle == ']')
|
||||||
db++;
|
db = 1;
|
||||||
else if (db && *needle == '[')
|
else if (db && *needle == '[')
|
||||||
db--;
|
db = 0;
|
||||||
else if (db && *needle == '\\')
|
|
||||||
db--;
|
|
||||||
}
|
}
|
||||||
return needle;
|
return needle;
|
||||||
}
|
}
|
||||||
|
|
||||||
char quant(char c)
|
char quant(char c)
|
||||||
{
|
{
|
||||||
switch (c)
|
char *r = strchr("*+?", c);
|
||||||
{
|
return r ? *r : 0;
|
||||||
case '*':
|
|
||||||
case '+':
|
|
||||||
case '?':
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tries to prefix-match the regex needle on haystack
|
// Tries to prefix-match the regex needle on haystack
|
||||||
|
@ -152,24 +134,21 @@ bool regex_match(char *haystack, char *needle)
|
||||||
states = strlen(needle) + 1;
|
states = strlen(needle) + 1;
|
||||||
// active states
|
// active states
|
||||||
bool bv[states] = { };
|
bool bv[states] = { };
|
||||||
// next iteration active states
|
|
||||||
bool bvtmp[states] = { };
|
|
||||||
// acceptance state
|
// acceptance state
|
||||||
accept = states - 1;
|
accept = states - 1;
|
||||||
na = false;
|
na = false;
|
||||||
|
|
||||||
// first, deactivate all states
|
// first, deactivate all states
|
||||||
for (size_t st = 0; st < states; st++)
|
for (st = 0; st < states; st++)
|
||||||
bv[st] = false;
|
bv[st] = false;
|
||||||
// then, find all top-level variants, and activate them
|
// then, find all top-level variants, and activate them
|
||||||
for (char *var = needle; *var; var = next_var(var)) {
|
for (var = needle; *var; var = next_var(var))
|
||||||
bv[var - needle] = true;
|
bv[var - needle] = true;
|
||||||
}
|
|
||||||
|
|
||||||
// iterate over the haystack until accepting
|
// iterate over the haystack until accepting
|
||||||
do {
|
do {
|
||||||
// handle groups, variants, and quantifier
|
// handle groups, variants, and quantifier
|
||||||
for (size_t st = 0; st < states - 1; st++)
|
for (st = 0; st < states - 1; st++)
|
||||||
{
|
{
|
||||||
if (!bv[st])
|
if (!bv[st])
|
||||||
continue;
|
continue;
|
||||||
|
@ -193,19 +172,20 @@ bool regex_match(char *haystack, char *needle)
|
||||||
// then, handle quantifiers
|
// then, handle quantifiers
|
||||||
fprintf(stderr, "quant %c\n", q);
|
fprintf(stderr, "quant %c\n", q);
|
||||||
fprintf(stderr, "suffix \"%s\"\n", needle + (q ? st + 2 : st + 1));
|
fprintf(stderr, "suffix \"%s\"\n", needle + (q ? st + 2 : st + 1));
|
||||||
if (q == '+' || q == '*')
|
nneedl = start_of_group(needle, needle + st);
|
||||||
|
if ((q == '+' || q == '*') && !bv[nneedl - needle])
|
||||||
{
|
{
|
||||||
// spicy: if the quantifier allows multiple occurrences, we have to activate a previous state
|
// spicy: if the quantifier allows multiple occurrences, we have to activate a previous state
|
||||||
nneedl = start_of_group(needle, needle + st);
|
|
||||||
fprintf(stderr, "start of group: \"%s\"\n", nneedl);
|
fprintf(stderr, "start of group: \"%s\"\n", nneedl);
|
||||||
bv[nneedl - needle] = true;
|
bv[nneedl - needle] = true;
|
||||||
st = nneedl - needle - 1;
|
st = nneedl - needle - 1;
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
bv[nneedl - needle] = false;
|
||||||
}
|
}
|
||||||
else if (needle[st] == '(')
|
else if (needle[st] == '(')
|
||||||
{
|
{
|
||||||
// at start of group, look at variants and quantifier
|
// at start of group, look at variants and quantifier
|
||||||
bv[st] = false;
|
|
||||||
for (var = needle + st + 1; *var && *var != ')'; var = next_var(var))
|
for (var = needle + st + 1; *var && *var != ')'; var = next_var(var))
|
||||||
bv[var - needle] = true;
|
bv[var - needle] = true;
|
||||||
if (*var == ')')
|
if (*var == ')')
|
||||||
|
@ -223,31 +203,30 @@ bool regex_match(char *haystack, char *needle)
|
||||||
bv[nneedl + 1 - needle] = true;
|
bv[nneedl + 1 - needle] = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
memcpy(bvtmp, bv, sizeof(bv));
|
|
||||||
// actual match, backwards
|
// actual match, backwards
|
||||||
na = true;
|
na = true;
|
||||||
for (st = states; st > 0; st--)
|
for (st = accept; st > 0; st--)
|
||||||
{
|
{
|
||||||
if (!bv[st-1])
|
// skip beginnings of groups b/c they have been handled before
|
||||||
|
if (!bv[st-1] || needle[st-1] == '(')
|
||||||
continue;
|
continue;
|
||||||
na = false;
|
|
||||||
// perform match for active state
|
// perform match for active state
|
||||||
fprintf(stderr, "state %lu: \"%s\" ~ \"%s\"\n", st-1, haystack, needle + st - 1);
|
bv[st-1] = false;
|
||||||
bvtmp[st-1] = false;
|
|
||||||
m = char_match(*haystack, needle + st-1, &nneedl);
|
m = char_match(*haystack, needle + st-1, &nneedl);
|
||||||
q = quant(*nneedl);
|
q = quant(*nneedl);
|
||||||
if (m || q == '*' || q == '?')
|
fprintf(stderr, "state %lu: \"%s\" ~ \"%s\" => %b\n", st-1, haystack, needle + st - 1, m);
|
||||||
|
if (m)
|
||||||
{
|
{
|
||||||
|
na = false;
|
||||||
if (q)
|
if (q)
|
||||||
nneedl++;
|
nneedl++;
|
||||||
if (q == '*' || q == '+')
|
if (q == '*' || q == '+')
|
||||||
bvtmp[st-1] = true;
|
bv[st-1] = true;
|
||||||
bvtmp[nneedl - needle] = true;
|
bv[nneedl - needle] = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
memcpy(bv, bvtmp, sizeof(bv));
|
|
||||||
putc('\n', stderr);
|
putc('\n', stderr);
|
||||||
// check if we have reached the end of haystack, and advance
|
// check if we have reached the end of haystack, and advance if not
|
||||||
if (!*(haystack++))
|
if (!*(haystack++))
|
||||||
break;
|
break;
|
||||||
} while (!bv[accept] && !na);
|
} while (!bv[accept] && !na);
|
||||||
|
|
Loading…
Reference in a new issue