initial commit for friday

This commit is contained in:
Fritz Grimpen 2025-02-05 22:26:51 +01:00
commit 29a99ffa9d
2 changed files with 271 additions and 0 deletions

2
Makefile Normal file
View file

@ -0,0 +1,2 @@
# Build the regex matcher from its single source file.
# The recipe line must start with a tab character, otherwise make reports
# "missing separator".
regex: regex.c
	$(CC) $(CFLAGS) -Wall -Wextra -pedantic -std=c23 -o $@ $<

269
regex.c Normal file
View file

@ -0,0 +1,269 @@
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
// Checks whether the first element of needle matches the character c.
//
// needle may begin with a plain character, '.', '$', or a bracketed
// character class such as "[abc]" or "[^abc]".  Inside a class a backslash
// makes the following character a literal member, so "[\]]" matches ']'.
// TODO: POSIX ERE spells a plain ']' as "[]]"; that form is not supported.
//
// Returns true on a match.  '$' only matches the terminating NUL (c == 0),
// '.' matches any c.
bool char_match(char c, char *needle)
{
    if (*needle == '[') {
        bool found = false;
        bool neg;

        needle++;
        neg = (*needle == '^');
        if (neg)
            needle++;
        // Look at every member up to the closing bracket.  The verdict is
        // taken only after the whole class was seen: a negated class matches
        // when NO member equals c, which cannot be decided member by member.
        for (; *needle && *needle != ']'; needle++) {
            if (*needle == '\\') {
                // escaped member: compare the character behind the backslash
                needle++;
                if (!*needle)
                    break;
            }
            if (*needle == c)
                found = true;
        }
        return found != neg;
    }
    if (*needle == '$')
        return c == 0;
    if (*needle == '.')
        return true;
    return *needle == c;
}
// Locates the next alternative of the current nesting level.
//
// Walks forward from needle and returns one of:
//   - the first character behind the next '|' that is neither inside a
//     nested group nor inside a character class,
//   - the ')' that closes the group needle itself lives in,
//   - the terminating NUL when neither is found.
char *next_var(char *needle)
{
    int open_groups = 0;
    // 0: outside a class, 1: inside a class, 2: inside a class, after '\'
    int class_state = 0;
    char *p = needle;

    while (*p != '\0') {
        char ch = *p;

        if (ch == '|' && open_groups <= 0 && class_state <= 0)
            break;

        if (class_state != 0) {
            if (ch == ']')
                class_state--;
            else if (class_state == 1 && ch == '\\')
                class_state++;
            else if (class_state == 2)
                class_state--;
        } else {
            switch (ch) {
            case '[':
                class_state++;
                break;
            case '(':
                open_groups++;
                break;
            case ')':
                open_groups--;
                break;
            default:
                break;
            }
        }

        // more groups closed than opened: this ')' ends the enclosing group
        if (open_groups < 0)
            return p;
        p++;
    }

    // step over the separator so the caller lands on the variant itself
    return (*p == '|') ? p + 1 : p;
}
// Steps over one pattern element: either a single character or a complete
// character class "[...]".  A quantifier following the element is NOT
// skipped.  At the end of the pattern the pointer is returned unchanged; an
// unterminated class yields the terminating NUL.
char *next_char(char *needle)
{
    bool escaped = false;
    char *p;

    if (*needle == '\0')
        return needle;
    if (*needle != '[')
        return needle + 1;

    // inside a class: search the first ']' that is not preceded by a backslash
    for (p = needle + 1; *p != '\0'; p++) {
        if (escaped) {
            escaped = false;
            continue;
        }
        if (*p == '\\') {
            escaped = true;
            continue;
        }
        if (*p == ']')
            return p + 1;
    }
    return p;
}
// Returns the first character behind the ')' that closes the group needle
// lives in, or the terminating NUL if the pattern ends before such a ')'.
char *end_of_group(char *needle)
{
    char *p = needle;

    for (;;) {
        if (*p == '\0')
            return p;
        if (*p == ')')
            return p + 1;
        // hop from variant to variant until the closing parenthesis shows up
        p = next_var(p);
    }
}
// Finds the start of a group, walking backwards from needle, which is
// expected to point at the group's closing parenthesis.
//
// needle0 is the start of the pattern and the lower bound of the search.
// Returns a pointer to the matching '(' or needle0 if the walk reaches the
// start of the pattern first.  Parentheses inside a character class are
// ignored.
char *start_of_group(char *needle0, char *needle)
{
    // parentheses closed but not yet re-opened on the way back
    int dp = 0;
    // non-zero while walking through a character class
    int db = 0;

    for (; needle > needle0; needle--) {
        if (!db && *needle == ')') {
            dp++;
        } else if (!db && *needle == '(') {
            dp--;
            // every ')' seen so far has found its partner: this is the
            // parenthesis that opens the group
            if (dp <= 0)
                return needle;
        } else if (!db && *needle == ']') {
            db++;
        } else if (db && *needle == '[') {
            db--;
        } else if (db && *needle == '\\') {
            // NOTE(review): presumably undoes the ']' counted just before
            // when that bracket turns out to be escaped - confirm
            db--;
        }
    }
    return needle;
}
// Looks up the quantifier that belongs to the element at needle.
// The element is a whole group when needle points at '(', otherwise a single
// character or character class.
// Returns '*', '+' or '?', or 0 when the element carries no quantifier.
char quant(char *needle)
{
    char *behind = (*needle == '(') ? end_of_group(needle + 1)
                                    : next_char(needle);
    char c = *behind;

    if (c == '*' || c == '+' || c == '?')
        return c;
    return 0;
}
// Tries to prefix-match the regex needle on haystack.
//
// The pattern is simulated with one state per byte of needle plus a trailing
// accept state; bv[i] is set while the element at needle[i] may match next.
// For every haystack position two phases run:
//   1. expansion: states sitting on '(', ')', '|' or on an optional element
//      are resolved into the states that can actually consume a character;
//   2. consumption: every active state is matched against the current
//      haystack character and replaced by its successor(s).
//
// Returns true as soon as the accept state becomes active, i.e. when some
// prefix of haystack matches the pattern.  Trace output is written to stderr.
bool regex_match(char *haystack, char *needle)
{
    // one state per needle position; the last one is the accept state
    size_t states = strlen(needle) + 1;
    size_t accept = states - 1;
    // active states, and the active states of the next iteration
    bool bv[states];
    bool bvtmp[states];
    // closing parentheses that already re-entered their group in this step
    bool looped[states];
    // no active state left
    bool na = false;
    // quantifier of the element under inspection
    char q;
    // scratch pointers into needle
    char *nneedl, *var;

    // first, deactivate all states
    memset(bv, 0, sizeof(bv));
    memset(bvtmp, 0, sizeof(bvtmp));

    // then, find all top-level variants, and activate them
    for (var = needle; *var; var = next_var(var))
        bv[var - needle] = true;

    // iterate over the haystack until accepting
    for (; !bv[accept] && !na; haystack++) {
        memset(looped, 0, sizeof(looped));

        // handle groups, variants, and quantifiers
        for (size_t st = 0; st < states - 1; st++) {
            if (!bv[st])
                continue;
            fprintf(stderr, "prepr %zu: \"%s\"\n", st, needle + st);
            if (needle[st] == '|') {
                // at end of variant, continue at the end of the group
                bv[st] = false;
                nneedl = end_of_group(needle + st);
                if (nneedl[-1] == ')')
                    nneedl--;
                bv[nneedl - needle] = true;
            } else if (needle[st] == ')') {
                // at end of group, look for quantifier
                size_t after;

                bv[st] = false;
                q = quant(needle + st);
                // first, activate subsequent state after group
                after = q ? st + 2 : st + 1;
                bv[after] = true;
                // then, handle quantifiers
                fprintf(stderr, "quant %c\n", q);
                fprintf(stderr, "suffix \"%s\"\n", needle + after);
                // A repeatable group is re-entered at its start.  Each ')'
                // does so only once per step: a second visit cannot activate
                // anything new, and a body that matches the empty string
                // would otherwise rewind forever.
                if ((q == '+' || q == '*') && !looped[st]) {
                    looped[st] = true;
                    nneedl = start_of_group(needle, needle + st);
                    fprintf(stderr, "start of group: \"%s\"\n", nneedl);
                    bv[nneedl - needle] = true;
                    // rewind; the loop increment lands on the group start
                    // (unsigned wrap-around for index 0 is intended)
                    st = (size_t)(nneedl - needle) - 1;
                }
            } else if (needle[st] == '(') {
                // at start of group, activate its variants; the scan has to
                // start right behind THIS parenthesis, not behind needle[0]
                bv[st] = false;
                for (var = needle + st + 1; *var && *var != ')'; var = next_var(var))
                    bv[var - needle] = true;
                q = quant(var);
                if (*var == ')')
                    var++;
                // an optional group may be skipped as a whole
                if (q == '*' || q == '?')
                    bv[var + 1 - needle] = true;
            } else if (needle[st] == '$' && !*haystack) {
                // end-of-input anchor: consumes nothing, so it is resolved
                // here because the consumption phase never runs at the end
                bv[st] = false;
                bv[st + 1] = true;
            } else {
                // an optional element may be skipped
                q = quant(needle + st);
                if (q == '*' || q == '?')
                    bv[next_char(needle + st) + 1 - needle] = true;
            }
        }

        // Stop when a prefix is already accepted (the consumption phase
        // would discard the accept state) or when we are only tidying up.
        if (bv[accept] || !*haystack)
            break;

        memcpy(bvtmp, bv, sizeof(bv));
        // Actual match, backwards: a state only activates later states, so
        // clearing it cannot wipe out what a lower state is about to set.
        na = true;
        for (size_t st = states; st > 0; st--) {
            size_t cur = st - 1;

            if (!bv[cur])
                continue;
            na = false;
            // perform match for active state
            fprintf(stderr, "state %zu: \"%s\" ~ \"%s\"\n", cur, haystack, needle + cur);
            bvtmp[cur] = false;
            q = quant(needle + cur);
            // Only a real match consumes the character.  Skipping an
            // optional element was already handled during expansion.
            if (char_match(*haystack, needle + cur)) {
                if (q == '*' || q == '+')
                    bvtmp[cur] = true;
                nneedl = next_char(needle + cur);
                if (q)
                    nneedl++;
                bvtmp[nneedl - needle] = true;
            }
        }
        memcpy(bv, bvtmp, sizeof(bv));
        putc('\n', stderr);
    }
    return bv[accept];
}
// Entry point: argv[1] is the pattern, argv[2] the text to match against.
// Prints "Match!" and exits with 0 when the pattern prefix-matches the text;
// exits with 1 on a mismatch or when fewer than two arguments are given.
int main(int argc, char *argv[])
{
    if (argc >= 3 && regex_match(argv[2], argv[1])) {
        puts("Match!");
        return 0;
    }
    return 1;
}