diff options
author | midipix <writeonce@midipix.org> | 2024-05-15 20:40:33 +0000 |
---|---|---|
committer | midipix <writeonce@midipix.org> | 2024-05-15 20:40:33 +0000 |
commit | c257188cd912503371db1c2b7b2c59b4fd53df1c (patch) | |
tree | 8077d9f09fa85521f53a94f3fa33d17a7975a11c /src/regex | |
parent | ae7810f56e1daa1d2e35c06969c26835c1ed7800 (diff) | |
download | treebnf-c257188cd912503371db1c2b7b2c59b4fd53df1c.tar.bz2 treebnf-c257188cd912503371db1c2b7b2c59b4fd53df1c.tar.xz |
regex module: implemented token scanners, added definitions and scan table.
Diffstat (limited to 'src/regex')
-rw-r--r-- | src/regex/tbnf_regex.c | 185 | ||||
-rw-r--r-- | src/regex/tbnf_regex_defs.h | 64 | ||||
-rw-r--r-- | src/regex/tbnf_regex_scanfns.h | 287 |
3 files changed, 536 insertions, 0 deletions
diff --git a/src/regex/tbnf_regex.c b/src/regex/tbnf_regex.c
new file mode 100644
index 0000000..2cdcb48
--- /dev/null
+++ b/src/regex/tbnf_regex.c
@@ -0,0 +1,185 @@
+/**************************************************************/
+/* treebnf: a tree oriented bnf library */
+/* Copyright (C) 2024 SysDeer Technologies, LLC */
+/* Released under GPLv2 and GPLv3; see COPYING.TREEBNF. */
+/**************************************************************/
+
+#include <treebnf/treebnf.h>
+
+#include "treebnf_regex_impl.h"
+#include "treebnf_visibility_impl.h"
+
+#include "tbnf_regex_defs.h"
+#include "tbnf_regex_scanfns.h"
+
+/* NOTE(review): not referenced anywhere else in this file yet */
+#define TBNF_STATE_STACK_SIZE (512)
+
+/*
+ * scan table entry constructors.
+ *
+ * every entry names its scanner function and its state operation;
+ * only entries using TBNF_STATE_PUSH also name the state to enter,
+ * all other entries leave tok_state_next zero-initialized.
+ *
+ * NOTE(review): TBNF_STATE_PUSH/POP/KEEP are defined outside of this
+ * file; presumably they enter a nested scanner state, return to the
+ * previous one, or stay in the current one -- confirm.
+ */
+#define TBNF_REGEX_ENT_KEEP(scanfn) \
+	{ .tok_scan_fn = scanfn, .tok_state_op = TBNF_STATE_KEEP }
+
+#define TBNF_REGEX_ENT_POP(scanfn) \
+	{ .tok_scan_fn = scanfn, .tok_state_op = TBNF_STATE_POP }
+
+#define TBNF_REGEX_ENT_PUSH(scanfn,nstate) \
+	{ .tok_scan_fn = scanfn, .tok_state_op = TBNF_STATE_PUSH, .tok_state_next = nstate }
+
+
+/*
+ * scan table of the init state, indexed by token type: tokens that
+ * open a brace or bracket construct push the respective state, all
+ * remaining expression tokens keep the current state.
+ */
+static struct tbnf_scan_tbl tbnf_regex_scan_tbl__init[TBNF_REGEX_TOK_CAP] = {
+	/* init --> brace */
+	[TBNF_REGEX_TOK_LBRACE]
+		= TBNF_REGEX_ENT_PUSH(tbnf_regex_scan_lbrace,TBNF_REGEX_STATE_BRACE),
+
+	/* init --> bracket */
+	[TBNF_REGEX_TOK_LBRACKET_CIRCUMFLEX_RBRACKET]
+		= TBNF_REGEX_ENT_PUSH(tbnf_regex_scan_lbracket_circumflex_rbracket,TBNF_REGEX_STATE_BRACKET),
+
+	[TBNF_REGEX_TOK_LBRACKET_CIRCUMFLEX]
+		= TBNF_REGEX_ENT_PUSH(tbnf_regex_scan_lbracket_circumflex,TBNF_REGEX_STATE_BRACKET),
+
+	[TBNF_REGEX_TOK_LBRACKET_RBRACKET]
+		= TBNF_REGEX_ENT_PUSH(tbnf_regex_scan_lbracket_rbracket,TBNF_REGEX_STATE_BRACKET),
+
+	[TBNF_REGEX_TOK_LBRACKET]
+		= TBNF_REGEX_ENT_PUSH(tbnf_regex_scan_lbracket,TBNF_REGEX_STATE_BRACKET),
+
+	/* expression tokens (state unchanged) */
+	[TBNF_REGEX_TOK_ESCAPED_CHAR]        = TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_escaped_char),
+	[TBNF_REGEX_TOK_CIRCUMFLEX_ASTERISK] = TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_circumflex_asterisk),
+	[TBNF_REGEX_TOK_CIRCUMFLEX]          = TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_circumflex),
+	[TBNF_REGEX_TOK_LPAREN]              = TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_lparen),
+	[TBNF_REGEX_TOK_RPAREN]              = TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_rparen),
+	[TBNF_REGEX_TOK_ASTERISK]            = TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_asterisk),
+	[TBNF_REGEX_TOK_PERIOD]              = TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_period),
+	[TBNF_REGEX_TOK_DOLLAR]              = TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_dollar),
+	[TBNF_REGEX_TOK_VLINE]               = TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_vline),
+	[TBNF_REGEX_TOK_QMARK]               = TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_qmark),
+	[TBNF_REGEX_TOK_PLUS]                = TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_plus),
+	[TBNF_REGEX_TOK_CHAR]                = TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_char),
+};
+
+
+/*
+ * scan table of the brace state, indexed by token type: the closing
+ * brace pops the state, digits and commas keep it.
+ */
+static struct tbnf_scan_tbl tbnf_regex_scan_tbl__brace[TBNF_REGEX_TOK_CAP] = {
+	[TBNF_REGEX_TOK_BRACE_RBRACE] = TBNF_REGEX_ENT_POP(tbnf_regex_scan_brace_rbrace),
+	[TBNF_REGEX_TOK_BRACE_DIGIT]  = TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_brace_digit),
+	[TBNF_REGEX_TOK_BRACE_COMMA]  = TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_brace_comma),
+};
+
+
+/*
+ * scan table of the bracket state, indexed by token type: the closing
+ * bracket pops the state, everything else keeps it. an ordinary
+ * in-bracket character is scanned by the generic character scanner.
+ */
+static struct tbnf_scan_tbl tbnf_regex_scan_tbl__bracket[TBNF_REGEX_TOK_CAP] = {
+	[TBNF_REGEX_TOK_BRACKET_RBRACKET]
+		= TBNF_REGEX_ENT_POP(tbnf_regex_scan_bracket_rbracket),
+
+	[TBNF_REGEX_TOK_BRACKET_ESCAPED_CHAR]
+		= TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_bracket_escaped_char),
+
+	[TBNF_REGEX_TOK_BRACKET_CHARACTER_CLASS]
+		= TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_bracket_character_class),
+
+	[TBNF_REGEX_TOK_BRACKET_COLLATION_SYMBOL]
+		= TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_bracket_collation_symbol),
+
+	[TBNF_REGEX_TOK_BRACKET_EQUIVALENCE_CLASS]
+		= TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_bracket_equivalence_class),
+
+	[TBNF_REGEX_TOK_BRACKET_HYPHEN]
+		= TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_bracket_hyphen),
+
+	[TBNF_REGEX_TOK_BRACKET_ERROR]
+		= TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_bracket_error),
+
+	[TBNF_REGEX_TOK_BRACKET_CHAR]
+		= TBNF_REGEX_ENT_KEEP(tbnf_regex_scan_char),
+};
+
+
+/*
+ * per-state scan tables of the extended regex scanner, indexed by
+ * scanner state; the TBNF_REGEX_STATE_CAP slot holds a null pointer.
+ */
+tbnf_hidden struct tbnf_scan_tbl * tbnf_regex_scan_tbl[] = {
+	[TBNF_REGEX_STATE_INIT]    = tbnf_regex_scan_tbl__init,
+	[TBNF_REGEX_STATE_BRACE]   = tbnf_regex_scan_tbl__brace,
+	[TBNF_REGEX_STATE_BRACKET] = tbnf_regex_scan_tbl__bracket,
+	[TBNF_REGEX_STATE_CAP]     = 0,
+};
diff --git a/src/regex/tbnf_regex_defs.h b/src/regex/tbnf_regex_defs.h
new file mode 100644
index 0000000..9276ae0
--- /dev/null
+++ b/src/regex/tbnf_regex_defs.h
@@ -0,0 +1,64 @@
+/**************************************************************/
+/* treebnf: a tree oriented bnf library */
+/* Copyright (C) 2024 SysDeer Technologies, LLC */
+/* Released under GPLv2 and GPLv3; see COPYING.TREEBNF.
*/
+/**************************************************************/
+
+#ifndef TBNF_REGEX_DEFS_H
+#define TBNF_REGEX_DEFS_H
+/* scanner states; tbnf_regex_scan_tbl[] in tbnf_regex.c holds one scan table per state */
+enum tbnf_regex_scan_state {
+	TBNF_REGEX_STATE_INIT,
+	TBNF_REGEX_STATE_BRACE,
+	TBNF_REGEX_STATE_BRACKET,
+	TBNF_REGEX_STATE_CAP, /* number of states; its slot in tbnf_regex_scan_tbl[] is set to 0 */
+};
+/* token types; the per-state scan tables are sized by TBNF_REGEX_TOK_CAP and indexed by these values */
+enum tbnf_regex_token_type {
+	TBNF_REGEX_TOK_NONE, /* value 0; no scan table installs a scanner for it */
+	/* NOTE(review): multi-character tokens precede their own prefixes below; presumably the order doubles as scan priority -- confirm */
+	/* in-brace tokens (scanned by the brace state table) */
+	TBNF_REGEX_TOK_BRACE_RBRACE,
+	TBNF_REGEX_TOK_BRACE_DIGIT,
+	TBNF_REGEX_TOK_BRACE_COMMA,
+
+	/* in-bracket tokens (scanned by the bracket state table) */
+	TBNF_REGEX_TOK_BRACKET_ESCAPED_CHAR,
+	TBNF_REGEX_TOK_BRACKET_CHARACTER_CLASS,
+	TBNF_REGEX_TOK_BRACKET_COLLATION_SYMBOL,
+	TBNF_REGEX_TOK_BRACKET_EQUIVALENCE_CLASS,
+	TBNF_REGEX_TOK_BRACKET_RBRACKET,
+	TBNF_REGEX_TOK_BRACKET_HYPHEN,
+	TBNF_REGEX_TOK_BRACKET_ERROR,
+	TBNF_REGEX_TOK_BRACKET_CHAR,
+
+	/* brace state initializer token (init state table entry that pushes the brace state) */
+	TBNF_REGEX_TOK_LBRACE,
+
+	/* bracket state initializer tokens (init state table entries that push the bracket state) */
+	TBNF_REGEX_TOK_LBRACKET_CIRCUMFLEX_RBRACKET,
+	TBNF_REGEX_TOK_LBRACKET_CIRCUMFLEX,
+	TBNF_REGEX_TOK_LBRACKET_RBRACKET,
+	TBNF_REGEX_TOK_LBRACKET,
+
+	/* init state tokens (init state table entries that keep the current state) */
+	TBNF_REGEX_TOK_ESCAPED_CHAR,
+
+	TBNF_REGEX_TOK_CIRCUMFLEX_ASTERISK,
+	TBNF_REGEX_TOK_CIRCUMFLEX,
+
+	TBNF_REGEX_TOK_LPAREN,
+	TBNF_REGEX_TOK_RPAREN,
+
+	TBNF_REGEX_TOK_ASTERISK,
+	TBNF_REGEX_TOK_PERIOD,
+	TBNF_REGEX_TOK_DOLLAR,
+	TBNF_REGEX_TOK_VLINE,
+	TBNF_REGEX_TOK_QMARK,
+	TBNF_REGEX_TOK_PLUS,
+	TBNF_REGEX_TOK_CHAR,
+
+	TBNF_REGEX_TOK_CAP, /* number of token types; dimension of each per-state scan table */
+};
+
+#endif
diff --git a/src/regex/tbnf_regex_scanfns.h b/src/regex/tbnf_regex_scanfns.h
new file mode 100644
index 0000000..c57a606
--- /dev/null
+++ b/src/regex/tbnf_regex_scanfns.h
@@ -0,0 +1,287 @@
+/**************************************************************/
+/* treebnf: a tree oriented bnf library */
+/* Copyright (C) 2024 SysDeer Technologies, LLC */
+/* Released under GPLv2 and GPLv3; see COPYING.TREEBNF.
*/
+/**************************************************************/
+
+#ifndef TBNF_REGEX_SCANFNS_H
+#define TBNF_REGEX_SCANFNS_H
+
+#include <treebnf/treebnf.h>
+
+/*
+ * token scanners for the regex scan tables.
+ *
+ * every scanner inspects the input at sctx->tok_scan_mark and returns
+ * the length of the token that it recognized there, or -1 when the
+ * input does not match. sctx->tok_scan_cap is treated as the exclusive
+ * upper bound of the input: any character past the first one is only
+ * read after verifying that it lies below tok_scan_cap.
+ *
+ * the bound checks compare the remaining length (cap - mark) rather
+ * than form &tok_scan_mark[N] up front, since computing a pointer
+ * more than one element past the end of the input is undefined
+ * (C11 6.5.6p8) even when that pointer is never dereferenced.
+ *
+ * NOTE(review): tok_scan_mark[0] is read unconditionally by all
+ * scanners; presumably the caller only invokes them while
+ * tok_scan_mark < tok_scan_cap -- confirm against the scan loop.
+ */
+
+/* in-brace scanners */
+
+/* closing brace: 1 on '}', -1 otherwise */
+static inline int tbnf_regex_scan_brace_rbrace(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '}')
+		return 1;
+
+	return -1;
+}
+
+/* single decimal digit: 1 on '0'..'9', -1 otherwise */
+static inline int tbnf_regex_scan_brace_digit(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] >= '0')
+		if (sctx->tok_scan_mark[0] <= '9')
+			return 1;
+
+	return -1;
+}
+
+/* comma: 1 on ',', -1 otherwise */
+static inline int tbnf_regex_scan_brace_comma(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == ',')
+		return 1;
+
+	return -1;
+}
+
+
+/* in-bracket scanners */
+
+/* backslash followed by a non-null character that lies within bounds: 2, otherwise -1 */
+static inline int tbnf_regex_scan_bracket_escaped_char(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '\\')
+		if ((sctx->tok_scan_cap - sctx->tok_scan_mark) > 1)
+			if (sctx->tok_scan_mark[1])
+				return 2;
+
+	return -1;
+}
+
+/*
+ * character class, "[:" <lowercase letters> ":]".
+ *
+ * returns the length of the entire construct, or -1 when fewer than
+ * five characters remain or the input does not have the above form.
+ *
+ * NOTE(review): the run of letters may be empty, so "[::]" (when it
+ * is followed by at least one more character) scans as a class token
+ * of length 4; presumably the name is validated by a later stage --
+ * confirm.
+ */
+static inline int tbnf_regex_scan_bracket_character_class(const struct tbnf_scan_ctx * sctx)
+{
+	const char * ch;
+
+	/* tok_scan_mark[0..4] must all lie below tok_scan_cap */
+	if ((sctx->tok_scan_cap - sctx->tok_scan_mark) <= 4)
+		return -1;
+
+	if (sctx->tok_scan_mark[0] != '[')
+		return -1;
+
+	if (sctx->tok_scan_mark[1] != ':')
+		return -1;
+
+	/* skip the class name while keeping ch[0] and ch[1] within bounds */
+	for (ch = &sctx->tok_scan_mark[2]; (*ch >= 'a') && (*ch <= 'z') && ((sctx->tok_scan_cap - ch) > 2); )
+		ch++;
+
+	if ((ch[0] == ':') && (ch[1] == ']'))
+		return (int)(&ch[2] - sctx->tok_scan_mark);
+
+	return -1;
+}
+
+/*
+ * collation symbol consisting of one non-null character, "[.c.]":
+ * 5 on match, -1 when fewer than five characters remain or the input
+ * does not have the above form.
+ */
+static inline int tbnf_regex_scan_bracket_collation_symbol(const struct tbnf_scan_ctx * sctx)
+{
+	/* tok_scan_mark[0..4] must all lie below tok_scan_cap */
+	if ((sctx->tok_scan_cap - sctx->tok_scan_mark) <= 4)
+		return -1;
+
+	if (sctx->tok_scan_mark[0] == '[')
+		if (sctx->tok_scan_mark[1] == '.')
+			if (sctx->tok_scan_mark[3] == '.')
+				if (sctx->tok_scan_mark[4] == ']')
+					if (sctx->tok_scan_mark[2])
+						return 5;
+
+	return -1;
+}
+
+/*
+ * equivalence class, "[=" <lowercase letters> "=]".
+ *
+ * returns the length of the entire construct, or -1 when fewer than
+ * five characters remain or the input does not have the above form.
+ *
+ * NOTE(review): as with character classes, the run of letters may be
+ * empty ("[==]"); presumably validated by a later stage -- confirm.
+ */
+static inline int tbnf_regex_scan_bracket_equivalence_class(const struct tbnf_scan_ctx * sctx)
+{
+	const char * ch;
+
+	/* tok_scan_mark[0..4] must all lie below tok_scan_cap */
+	if ((sctx->tok_scan_cap - sctx->tok_scan_mark) <= 4)
+		return -1;
+
+	if (sctx->tok_scan_mark[0] != '[')
+		return -1;
+
+	if (sctx->tok_scan_mark[1] != '=')
+		return -1;
+
+	/* skip the class name while keeping ch[0] and ch[1] within bounds */
+	for (ch = &sctx->tok_scan_mark[2]; (*ch >= 'a') && (*ch <= 'z') && ((sctx->tok_scan_cap - ch) > 2); )
+		ch++;
+
+	if ((ch[0] == '=') && (ch[1] == ']'))
+		return (int)(&ch[2] - sctx->tok_scan_mark);
+
+	return -1;
+}
+
+/* closing bracket: 1 on ']', -1 otherwise */
+static inline int tbnf_regex_scan_bracket_rbracket(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == ']')
+		return 1;
+
+	return -1;
+}
+
+/* hyphen: 1 on '-', -1 otherwise */
+static inline int tbnf_regex_scan_bracket_hyphen(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '-')
+		return 1;
+
+	return -1;
+}
+
+/* scanner of the in-bracket error token: 1 on any '[', -1 otherwise */
+static inline int tbnf_regex_scan_bracket_error(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '[')
+		return 1;
+
+	return -1;
+}
+
+
+
+/* bracket state entry scanners */
+
+/* "[^]": 3 on match, -1 otherwise */
+static inline int tbnf_regex_scan_lbracket_circumflex_rbracket(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '[')
+		if ((sctx->tok_scan_cap - sctx->tok_scan_mark) > 2)
+			if (sctx->tok_scan_mark[1] == '^')
+				if (sctx->tok_scan_mark[2] == ']')
+					return 3;
+
+	return -1;
+}
+
+/* "[^": 2 on match, -1 otherwise */
+static inline int tbnf_regex_scan_lbracket_circumflex(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '[')
+		if ((sctx->tok_scan_cap - sctx->tok_scan_mark) > 1)
+			if (sctx->tok_scan_mark[1] == '^')
+				return 2;
+
+	return -1;
+}
+
+/* "[]": 2 on match, -1 otherwise */
+static inline int tbnf_regex_scan_lbracket_rbracket(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '[')
+		if ((sctx->tok_scan_cap - sctx->tok_scan_mark) > 1)
+			if (sctx->tok_scan_mark[1] == ']')
+				return 2;
+
+	return -1;
+}
+
+/* opening bracket: 1 on '[', -1 otherwise */
+static inline int tbnf_regex_scan_lbracket(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '[')
+		return 1;
+
+	return -1;
+}
+
+
+/* common scanners */
+
+/* backslash followed by a non-null character that lies within bounds: 2, otherwise -1 */
+static inline int tbnf_regex_scan_escaped_char(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '\\')
+		if ((sctx->tok_scan_cap - sctx->tok_scan_mark) > 1)
+			if (sctx->tok_scan_mark[1])
+				return 2;
+
+	return -1;
+}
+
+/* "^*": 2 on match, -1 otherwise */
+static inline int tbnf_regex_scan_circumflex_asterisk(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '^')
+		if ((sctx->tok_scan_cap - sctx->tok_scan_mark) > 1)
+			if (sctx->tok_scan_mark[1] == '*')
+				return 2;
+
+	return -1;
+}
+
+/* circumflex: 1 on '^', -1 otherwise */
+static inline int tbnf_regex_scan_circumflex(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '^')
+		return 1;
+
+	return -1;
+}
+
+/* opening brace: 1 on '{', -1 otherwise */
+static inline int tbnf_regex_scan_lbrace(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '{')
+		return 1;
+
+	return -1;
+}
+
+/* opening parenthesis: 1 on '(', -1 otherwise */
+static inline int tbnf_regex_scan_lparen(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '(')
+		return 1;
+
+	return -1;
+}
+
+/* closing parenthesis: 1 on ')', -1 otherwise */
+static inline int tbnf_regex_scan_rparen(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == ')')
+		return 1;
+
+	return -1;
+}
+
+/* asterisk: 1 on '*', -1 otherwise */
+static inline int tbnf_regex_scan_asterisk(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '*')
+		return 1;
+
+	return -1;
+}
+
+/* period: 1 on '.', -1 otherwise */
+static inline int tbnf_regex_scan_period(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '.')
+		return 1;
+
+	return -1;
+}
+
+/* dollar sign: 1 on '$', -1 otherwise */
+static inline int tbnf_regex_scan_dollar(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '$')
+		return 1;
+
+	return -1;
+}
+
+/* vertical line: 1 on '|', -1 otherwise */
+static inline int tbnf_regex_scan_vline(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '|')
+		return 1;
+
+	return -1;
+}
+
+/* question mark: 1 on '?', -1 otherwise */
+static inline int tbnf_regex_scan_qmark(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '?')
+		return 1;
+
+	return -1;
+}
+
+/* plus sign: 1 on '+', -1 otherwise */
+static inline int tbnf_regex_scan_plus(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0] == '+')
+		return 1;
+
+	return -1;
+}
+
+/* any non-null character: 1, and -1 on a null character */
+static inline int tbnf_regex_scan_char(const struct tbnf_scan_ctx * sctx)
+{
+	if (sctx->tok_scan_mark[0])
+		return 1;
+
+	return -1;
+}
+
+#endif |