From c257188cd912503371db1c2b7b2c59b4fd53df1c Mon Sep 17 00:00:00 2001 From: midipix Date: Wed, 15 May 2024 20:40:33 +0000 Subject: regex module: implemented token scanners, added definitions and scan table. --- project/common.mk | 1 + project/headers.mk | 7 +- project/tree.mk | 1 + src/internal/treebnf_regex_impl.h | 6 + src/regex/tbnf_regex.c | 185 ++++++++++++++++++++++++ src/regex/tbnf_regex_defs.h | 64 +++++++++ src/regex/tbnf_regex_scanfns.h | 287 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 550 insertions(+), 1 deletion(-) create mode 100644 src/internal/treebnf_regex_impl.h create mode 100644 src/regex/tbnf_regex.c create mode 100644 src/regex/tbnf_regex_defs.h create mode 100644 src/regex/tbnf_regex_scanfns.h diff --git a/project/common.mk b/project/common.mk index c9523bd..c26a018 100644 --- a/project/common.mk +++ b/project/common.mk @@ -4,6 +4,7 @@ API_SRCS = \ src/driver/tbnf_map_input.c \ src/driver/tbnf_unit_ctx.c \ src/output/tbnf_output_error.c \ + src/regex/tbnf_regex.c \ src/skin/tbnf_skin_default.c \ src/tokscan/tbnf_scan_token.c \ diff --git a/project/headers.mk b/project/headers.mk index c5ae8b6..9c2fcaa 100644 --- a/project/headers.mk +++ b/project/headers.mk @@ -7,7 +7,12 @@ INTERNAL_HEADERS = \ $(SOURCE_DIR)/src/internal/$(PACKAGE)_dprintf_impl.h \ $(SOURCE_DIR)/src/internal/$(PACKAGE)_driver_impl.h \ $(SOURCE_DIR)/src/internal/$(PACKAGE)_errinfo_impl.h \ + $(SOURCE_DIR)/src/internal/$(PACKAGE)_regex_impl.h \ $(SOURCE_DIR)/src/internal/$(PACKAGE)_tmpfile_impl.h \ $(SOURCE_DIR)/src/internal/$(PACKAGE)_visibility_impl.h \ -ALL_HEADERS = $(API_HEADERS) $(INTERNAL_HEADERS) +INTERNAL_UNIT_HEADERS = \ + $(SOURCE_DIR)/src/regex/tbnf_regex_defs.h \ + $(SOURCE_DIR)/src/regex/tbnf_regex_scanfns.h \ + +ALL_HEADERS = $(API_HEADERS) $(INTERNAL_HEADERS) $(INTERNAL_UNIT_HEADERS) diff --git a/project/tree.mk b/project/tree.mk index 0c51986..d01bd24 100644 --- a/project/tree.mk +++ b/project/tree.mk @@ -1,6 +1,7 @@ TREE_DIRS = src \ src/driver \ 
src/output \ + src/regex \ src/skin \ src/tokscan \ src/internal \ diff --git a/src/internal/treebnf_regex_impl.h b/src/internal/treebnf_regex_impl.h new file mode 100644 index 0000000..c5612ae --- /dev/null +++ b/src/internal/treebnf_regex_impl.h @@ -0,0 +1,6 @@ +#ifndef TREEBNF_REGEX_IMPL_H +#define TREEBNF_REGEX_IMPL_H + +extern struct tbnf_scan_tbl * tbnf_regex_scan_tbl[]; /* one scan table per scanner state, defined in src/regex/tbnf_regex.c */ + +#endif diff --git a/src/regex/tbnf_regex.c b/src/regex/tbnf_regex.c new file mode 100644 index 0000000..2cdcb48 --- /dev/null +++ b/src/regex/tbnf_regex.c @@ -0,0 +1,185 @@ +/**************************************************************/ +/* treebnf: a tree oriented bnf library */ +/* Copyright (C) 2024 SysDeer Technologies, LLC */ +/* Released under GPLv2 and GPLv3; see COPYING.TREEBNF. */ +/**************************************************************/ + +#include <treebnf/treebnf.h> + +#include "treebnf_regex_impl.h" +#include "treebnf_visibility_impl.h" + +#include "tbnf_regex_defs.h" +#include "tbnf_regex_scanfns.h" + +#define TBNF_STATE_STACK_SIZE (512) /* NOTE(review): not referenced elsewhere in this file; presumably the depth of the scanner state stack, confirm */ + +/* init state scan table: the '{' and '[' family of tokens push the brace or bracket state, all other tokens keep the current state */ +static struct tbnf_scan_tbl tbnf_regex_scan_tbl__init[TBNF_REGEX_TOK_CAP] = { + /* --> brace */ + [TBNF_REGEX_TOK_LBRACE] = { + .tok_scan_fn = tbnf_regex_scan_lbrace, + .tok_state_op = TBNF_STATE_PUSH, + .tok_state_next = TBNF_REGEX_STATE_BRACE, + }, + + + /* --> bracket */ + [TBNF_REGEX_TOK_LBRACKET_CIRCUMFLEX_RBRACKET] = { + .tok_scan_fn = tbnf_regex_scan_lbracket_circumflex_rbracket, + .tok_state_op = TBNF_STATE_PUSH, + .tok_state_next = TBNF_REGEX_STATE_BRACKET, + }, + + [TBNF_REGEX_TOK_LBRACKET_CIRCUMFLEX] = { + .tok_scan_fn = tbnf_regex_scan_lbracket_circumflex, + .tok_state_op = TBNF_STATE_PUSH, + .tok_state_next = TBNF_REGEX_STATE_BRACKET, + }, + + [TBNF_REGEX_TOK_LBRACKET_RBRACKET] = { + .tok_scan_fn = tbnf_regex_scan_lbracket_rbracket, + .tok_state_op = TBNF_STATE_PUSH, + .tok_state_next = TBNF_REGEX_STATE_BRACKET, + }, + + [TBNF_REGEX_TOK_LBRACKET] = { + .tok_scan_fn = tbnf_regex_scan_lbracket, + .tok_state_op
= TBNF_STATE_PUSH, + .tok_state_next = TBNF_REGEX_STATE_BRACKET, + }, + + + /* (expression) */ + [TBNF_REGEX_TOK_ESCAPED_CHAR] = { + .tok_scan_fn = tbnf_regex_scan_escaped_char, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_CIRCUMFLEX_ASTERISK] = { + .tok_scan_fn = tbnf_regex_scan_circumflex_asterisk, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_CIRCUMFLEX] = { + .tok_scan_fn = tbnf_regex_scan_circumflex, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_LPAREN] = { + .tok_scan_fn = tbnf_regex_scan_lparen, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_RPAREN] = { + .tok_scan_fn = tbnf_regex_scan_rparen, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_ASTERISK] = { + .tok_scan_fn = tbnf_regex_scan_asterisk, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_PERIOD] = { + .tok_scan_fn = tbnf_regex_scan_period, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_DOLLAR] = { + .tok_scan_fn = tbnf_regex_scan_dollar, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_VLINE] = { + .tok_scan_fn = tbnf_regex_scan_vline, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_QMARK] = { + .tok_scan_fn = tbnf_regex_scan_qmark, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_PLUS] = { + .tok_scan_fn = tbnf_regex_scan_plus, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_CHAR] = { + .tok_scan_fn = tbnf_regex_scan_char, + .tok_state_op = TBNF_STATE_KEEP, + }, +}; + + +/* brace state scan table: '}' pops the state, digits and ',' keep it */ +static struct tbnf_scan_tbl tbnf_regex_scan_tbl__brace[TBNF_REGEX_TOK_CAP] = { + [TBNF_REGEX_TOK_BRACE_RBRACE] = { + .tok_scan_fn = tbnf_regex_scan_brace_rbrace, + .tok_state_op = TBNF_STATE_POP, + }, + + [TBNF_REGEX_TOK_BRACE_DIGIT] = { + .tok_scan_fn = tbnf_regex_scan_brace_digit, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_BRACE_COMMA] = { + .tok_scan_fn = tbnf_regex_scan_brace_comma, + .tok_state_op = TBNF_STATE_KEEP, + }, +}; + + +/* bracket state
scan table: ']' pops the state, every other token keeps it */ +static struct tbnf_scan_tbl tbnf_regex_scan_tbl__bracket[TBNF_REGEX_TOK_CAP] = { + [TBNF_REGEX_TOK_BRACKET_RBRACKET] = { + .tok_scan_fn = tbnf_regex_scan_bracket_rbracket, + .tok_state_op = TBNF_STATE_POP, + }, + + [TBNF_REGEX_TOK_BRACKET_ESCAPED_CHAR] = { + .tok_scan_fn = tbnf_regex_scan_bracket_escaped_char, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_BRACKET_CHARACTER_CLASS] = { + .tok_scan_fn = tbnf_regex_scan_bracket_character_class, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_BRACKET_COLLATION_SYMBOL] = { + .tok_scan_fn = tbnf_regex_scan_bracket_collation_symbol, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_BRACKET_EQUIVALENCE_CLASS] = { + .tok_scan_fn = tbnf_regex_scan_bracket_equivalence_class, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_BRACKET_HYPHEN] = { + .tok_scan_fn = tbnf_regex_scan_bracket_hyphen, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_BRACKET_ERROR] = { + .tok_scan_fn = tbnf_regex_scan_bracket_error, + .tok_state_op = TBNF_STATE_KEEP, + }, + + [TBNF_REGEX_TOK_BRACKET_CHAR] = { + .tok_scan_fn = tbnf_regex_scan_char, /* same single byte scanner as TBNF_REGEX_TOK_CHAR in the init state table */ + .tok_state_op = TBNF_STATE_KEEP, + }, +}; + + +/* extended regex expression token scan table: indexed by enum tbnf_regex_scan_state, with a null entry stored at TBNF_REGEX_STATE_CAP */ +tbnf_hidden struct tbnf_scan_tbl * tbnf_regex_scan_tbl[] = { + [TBNF_REGEX_STATE_INIT] = tbnf_regex_scan_tbl__init, + [TBNF_REGEX_STATE_BRACE] = tbnf_regex_scan_tbl__brace, + [TBNF_REGEX_STATE_BRACKET] = tbnf_regex_scan_tbl__bracket, + [TBNF_REGEX_STATE_CAP] = 0, +}; diff --git a/src/regex/tbnf_regex_defs.h b/src/regex/tbnf_regex_defs.h new file mode 100644 index 0000000..9276ae0 --- /dev/null +++ b/src/regex/tbnf_regex_defs.h @@ -0,0 +1,64 @@ +/**************************************************************/ +/* treebnf: a tree oriented bnf library */ +/* Copyright (C) 2024 SysDeer Technologies, LLC */ +/* Released under GPLv2 and GPLv3; see COPYING.TREEBNF.
*/ +/**************************************************************/ + +#ifndef TBNF_REGEX_DEFS_H +#define TBNF_REGEX_DEFS_H + +enum tbnf_regex_scan_state { /* scanner states; each value selects one per-state table in tbnf_regex_scan_tbl[] */ + TBNF_REGEX_STATE_INIT, + TBNF_REGEX_STATE_BRACE, + TBNF_REGEX_STATE_BRACKET, + TBNF_REGEX_STATE_CAP, /* number of states; tbnf_regex_scan_tbl[] stores a null entry at this index */ +}; + +enum tbnf_regex_token_type { /* token types; each value is an index into every per-state scan table */ + TBNF_REGEX_TOK_NONE, /* value 0; no scan table in tbnf_regex.c assigns a scanner to it */ + + /* in-brace tokens */ + TBNF_REGEX_TOK_BRACE_RBRACE, + TBNF_REGEX_TOK_BRACE_DIGIT, + TBNF_REGEX_TOK_BRACE_COMMA, + + /* in-bracket tokens; NOTE(review): the multi byte forms are numbered before BRACKET_ERROR and BRACKET_CHAR, presumably because the scan loop tries token types in ascending order, confirm against the tokscan driver */ + TBNF_REGEX_TOK_BRACKET_ESCAPED_CHAR, + TBNF_REGEX_TOK_BRACKET_CHARACTER_CLASS, + TBNF_REGEX_TOK_BRACKET_COLLATION_SYMBOL, + TBNF_REGEX_TOK_BRACKET_EQUIVALENCE_CLASS, + TBNF_REGEX_TOK_BRACKET_RBRACKET, + TBNF_REGEX_TOK_BRACKET_HYPHEN, + TBNF_REGEX_TOK_BRACKET_ERROR, /* its scanner matches any '[' found in the bracket state */ + TBNF_REGEX_TOK_BRACKET_CHAR, + + /* brace state initializer token */ + TBNF_REGEX_TOK_LBRACE, + + /* bracket state initializer tokens; the longer spellings are numbered before the bare '[' */ + TBNF_REGEX_TOK_LBRACKET_CIRCUMFLEX_RBRACKET, + TBNF_REGEX_TOK_LBRACKET_CIRCUMFLEX, + TBNF_REGEX_TOK_LBRACKET_RBRACKET, + TBNF_REGEX_TOK_LBRACKET, + + /* init state tokens */ + TBNF_REGEX_TOK_ESCAPED_CHAR, + + TBNF_REGEX_TOK_CIRCUMFLEX_ASTERISK, + TBNF_REGEX_TOK_CIRCUMFLEX, + + TBNF_REGEX_TOK_LPAREN, + TBNF_REGEX_TOK_RPAREN, + + TBNF_REGEX_TOK_ASTERISK, + TBNF_REGEX_TOK_PERIOD, + TBNF_REGEX_TOK_DOLLAR, + TBNF_REGEX_TOK_VLINE, + TBNF_REGEX_TOK_QMARK, + TBNF_REGEX_TOK_PLUS, + TBNF_REGEX_TOK_CHAR, /* its scanner matches any non-null byte, so it is numbered last among the init state tokens */ + + TBNF_REGEX_TOK_CAP, /* number of token types; array dimension of each per-state scan table */ +}; + +#endif diff --git a/src/regex/tbnf_regex_scanfns.h b/src/regex/tbnf_regex_scanfns.h new file mode 100644 index 0000000..c57a606 --- /dev/null +++ b/src/regex/tbnf_regex_scanfns.h @@ -0,0 +1,287 @@ +/**************************************************************/ +/* treebnf: a tree oriented bnf library */ +/* Copyright (C) 2024 SysDeer Technologies, LLC */ +/* Released under GPLv2 and GPLv3; see COPYING.TREEBNF.
*/ +/**************************************************************/ + +#ifndef TBNF_REGEX_SCANFNS_H +#define TBNF_REGEX_SCANFNS_H + +#include <treebnf/treebnf.h> + +/* in-brace scanners; like every scanner in this header, each one inspects the bytes at sctx->tok_scan_mark and returns the number of bytes matched, or -1 when there is no match */ +static inline int tbnf_regex_scan_brace_rbrace(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '}') + return 1; + + return -1; +} + +static inline int tbnf_regex_scan_brace_digit(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] >= '0') + if (sctx->tok_scan_mark[0] <= '9') + return 1; /* exactly one decimal digit per token */ + + return -1; +} + +static inline int tbnf_regex_scan_brace_comma(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == ',') + return 1; + + return -1; +} + + +/* in-bracket scanners */ +static inline int tbnf_regex_scan_bracket_escaped_char(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '\\') + if (&sctx->tok_scan_mark[1] < sctx->tok_scan_cap) /* the escaped byte must lie below tok_scan_cap */ + if (sctx->tok_scan_mark[1]) /* a backslash followed by a null byte is not a match */ + return 2; + + return -1; +} + +static inline int tbnf_regex_scan_bracket_character_class(const struct tbnf_scan_ctx * sctx) +{ + const char * ch = 0; + + if (&sctx->tok_scan_mark[4] >= sctx->tok_scan_cap) /* bytes 0 through 4 must all lie below tok_scan_cap */ + return -1; + + if (sctx->tok_scan_mark[0] == '[') + if (sctx->tok_scan_mark[1] == ':') + ch = &sctx->tok_scan_mark[2]; + + if (!ch) + return -1; + + for (; (*ch >= 'a') && (*ch <= 'z') && (&ch[2] < sctx->tok_scan_cap); ) /* skip lower case letters while two more bytes remain below tok_scan_cap for the closing pair */ + ch++; + + if ((*ch++ == ':') && (*ch++ == ']')) /* NOTE(review): an empty name between the colons is accepted as well */ + return (ch - sctx->tok_scan_mark); /* ch has been advanced past the closing bracket */ + + return -1; +} + +static inline int tbnf_regex_scan_bracket_collation_symbol(const struct tbnf_scan_ctx * sctx) +{ + if (&sctx->tok_scan_mark[4] >= sctx->tok_scan_cap) /* bytes 0 through 4 must all lie below tok_scan_cap */ + return -1; + + if (sctx->tok_scan_mark[0] == '[') + if (sctx->tok_scan_mark[1] == '.') + if (sctx->tok_scan_mark[3] == '.') + if (sctx->tok_scan_mark[4] == ']') + if (sctx->tok_scan_mark[2]) /* only the five byte form with a single non-null byte between the periods is matched */ + return 5; + + return -1; +} + +static inline int tbnf_regex_scan_bracket_equivalence_class(const struct tbnf_scan_ctx * sctx) +{ + const char * ch = 0; + + if (&sctx->tok_scan_mark[4] >=
sctx->tok_scan_cap) /* bytes 0 through 4 must all lie below tok_scan_cap */ + return -1; + + if (sctx->tok_scan_mark[0] == '[') + if (sctx->tok_scan_mark[1] == '=') + ch = &sctx->tok_scan_mark[2]; + + if (!ch) + return -1; + + for (; (*ch >= 'a') && (*ch <= 'z') && (&ch[2] < sctx->tok_scan_cap); ) /* skip lower case letters while two more bytes remain below tok_scan_cap for the closing pair */ + ch++; + + if ((*ch++ == '=') && (*ch++ == ']')) /* NOTE(review): an empty name between the equal signs is accepted as well */ + return (ch - sctx->tok_scan_mark); /* ch has been advanced past the closing bracket */ + + return -1; +} + +static inline int tbnf_regex_scan_bracket_rbracket(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == ']') + return 1; + + return -1; +} + +static inline int tbnf_regex_scan_bracket_hyphen(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '-') + return 1; + + return -1; +} + +static inline int tbnf_regex_scan_bracket_error(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '[') /* NOTE(review): presumably reached only after the class, collation symbol and equivalence class scanners declined this '['; depends on scan order, confirm */ + return 1; + + return -1; +} + + + +/* bracket state entry scanners: matched in the init state; the longer forms also consume the '^' and or ']' that directly follows the opening bracket */ +static inline int tbnf_regex_scan_lbracket_circumflex_rbracket(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '[') + if (&sctx->tok_scan_mark[2] < sctx->tok_scan_cap) + if (sctx->tok_scan_mark[1] == '^') + if (sctx->tok_scan_mark[2] == ']') + return 3; + + return -1; +} + +static inline int tbnf_regex_scan_lbracket_circumflex(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '[') + if (&sctx->tok_scan_mark[1] < sctx->tok_scan_cap) + if (sctx->tok_scan_mark[1] == '^') + return 2; + + return -1; +} + +static inline int tbnf_regex_scan_lbracket_rbracket(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '[') + if (&sctx->tok_scan_mark[1] < sctx->tok_scan_cap) + if (sctx->tok_scan_mark[1] == ']') + return 2; + + return -1; +} + +static inline int tbnf_regex_scan_lbracket(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '[') + return 1; + + return -1; +} + + +/* common scanners: used by the init state table; tbnf_regex_scan_char is also used by the bracket state table */ +static inline int tbnf_regex_scan_escaped_char(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '\\') + if (&sctx->tok_scan_mark[1] <
sctx->tok_scan_cap) /* the escaped byte must lie below tok_scan_cap */ + if (sctx->tok_scan_mark[1]) /* a backslash followed by a null byte is not a match */ + return 2; + + return -1; +} + +static inline int tbnf_regex_scan_circumflex_asterisk(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '^') + if (&sctx->tok_scan_mark[1] < sctx->tok_scan_cap) + if (sctx->tok_scan_mark[1] == '*') + return 2; + + return -1; +} + +static inline int tbnf_regex_scan_circumflex(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '^') + return 1; + + return -1; +} + +static inline int tbnf_regex_scan_lbrace(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '{') + return 1; + + return -1; +} + +static inline int tbnf_regex_scan_lparen(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '(') + return 1; + + return -1; +} + +static inline int tbnf_regex_scan_rparen(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == ')') + return 1; + + return -1; +} + +static inline int tbnf_regex_scan_asterisk(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '*') + return 1; + + return -1; +} + +static inline int tbnf_regex_scan_period(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '.') + return 1; + + return -1; +} + +static inline int tbnf_regex_scan_dollar(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '$') + return 1; + + return -1; +} + +static inline int tbnf_regex_scan_vline(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '|') + return 1; + + return -1; +} + +static inline int tbnf_regex_scan_qmark(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '?') + return 1; + + return -1; +} + +static inline int tbnf_regex_scan_plus(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0] == '+') + return 1; + + return -1; +} + +static inline int tbnf_regex_scan_char(const struct tbnf_scan_ctx * sctx) +{ + if (sctx->tok_scan_mark[0]) /* any non-null byte is a one byte token */ + return 1; + + return -1; +} + +#endif -- cgit v1.2.3