1341 lines
41 KiB
C
1341 lines
41 KiB
C
/*
|
|
This is a Unix port of the Plan 9 regular expression library, by Rob Pike.
|
|
Please send comments about the packaging to Russ Cox <rsc@swtch.com>.
|
|
|
|
Copyright © 2021 Plan 9 Foundation
|
|
Copyright © 2023 Tyge Løvset, for additions.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
#ifndef STC_CREGEX_PRV_C_INCLUDED
|
|
#define STC_CREGEX_PRV_C_INCLUDED
|
|
|
|
#include <setjmp.h>
|
|
#include "utf8_prv.h"
|
|
#include "cstr_prv.h"
|
|
#include "ucd_prv.c"
|
|
|
|
typedef uint32_t _Rune; /* Utf8 code point */
|
|
typedef int32_t _Token;
|
|
/* max character classes per program */
|
|
#define _NCLASS CREG_MAX_CLASSES
|
|
/* max subexpressions */
|
|
#define _NSUBEXP CREG_MAX_CAPTURES
|
|
/* max rune ranges per character class */
|
|
#define _NCCRUNE (_NSUBEXP * 2)
|
|
|
|
/*
|
|
* character class, each pair of rune's defines a range
|
|
*/
|
|
typedef struct
|
|
{
|
|
_Rune *end;
|
|
_Rune spans[_NCCRUNE];
|
|
} _Reclass;
|
|
|
|
/*
|
|
* Machine instructions
|
|
*/
|
|
typedef struct _Reinst
|
|
{
|
|
_Token type;
|
|
union {
|
|
_Reclass *classp; /* class pointer */
|
|
_Rune rune; /* character */
|
|
int subid; /* sub-expression id for TOK_RBRA and TOK_LBRA */
|
|
struct _Reinst *right; /* right child of TOK_OR */
|
|
} r;
|
|
union { /* regexp relies on these two being in the same union */
|
|
struct _Reinst *left; /* left child of TOK_OR */
|
|
struct _Reinst *next; /* next instruction for TOK_CAT & TOK_LBRA */
|
|
} l;
|
|
} _Reinst;
|
|
|
|
/* Compile-time options recorded with a program. */
typedef struct {
    bool icase;     /* set from CREG_ICASE: literals compile to TOK_IRUNE */
    bool dotall;    /* set from CREG_DOTALL: '.' compiles to TOK_ANYNL */
} _Reflags;
|
|
|
|
/*
|
|
* Reprogram definition
|
|
*/
|
|
typedef struct _Reprog
|
|
{
|
|
_Reinst *startinst; /* start pc */
|
|
_Reflags flags;
|
|
int nsubids;
|
|
isize allocsize;
|
|
_Reclass cclass[_NCLASS]; /* .data */
|
|
_Reinst firstinst[]; /* .text : originally 5 elements? */
|
|
} _Reprog;
|
|
|
|
/*
|
|
* Sub expression matches
|
|
*/
|
|
typedef csview _Resub;
|
|
|
|
/*
|
|
* substitution list
|
|
*/
|
|
typedef struct _Resublist
|
|
{
|
|
_Resub m[_NSUBEXP];
|
|
} _Resublist;
|
|
|
|
/*
 * Actions and Tokens (_Reinst types)
 *
 *  0x8000000-0x80FFFFF: operators, value => precedence
 *  0x8100000-0x81FFFFF: TOK_RUNE and char classes.
 *  0x8200000-0x82FFFFF: tokens, i.e. operands for operators
 */
|
|
enum {
|
|
TOK_MASK = 0xFF00000,
|
|
TOK_OPERATOR = 0x8000000, /* Bitmask of all operators */
|
|
TOK_START = 0x8000001, /* Start, used for marker on stack */
|
|
TOK_RBRA , /* Right bracket, ) */
|
|
TOK_LBRA , /* Left bracket, ( */
|
|
TOK_OR , /* Alternation, | */
|
|
TOK_CAT , /* Concatentation, implicit operator */
|
|
TOK_STAR , /* Closure, * */
|
|
TOK_PLUS , /* a+ == aa* */
|
|
TOK_QUEST , /* a? == a|nothing, i.e. 0 or 1 a's */
|
|
TOK_RUNE = 0x8100000,
|
|
TOK_IRUNE ,
|
|
ASC_an , ASC_AN, /* alphanum */
|
|
ASC_al , ASC_AL, /* alpha */
|
|
ASC_as , ASC_AS, /* ascii */
|
|
ASC_bl , ASC_BL, /* blank */
|
|
ASC_ct , ASC_CT, /* ctrl */
|
|
ASC_d , ASC_D, /* digit */
|
|
ASC_s , ASC_S, /* space */
|
|
ASC_w , ASC_W, /* word */
|
|
ASC_gr , ASC_GR, /* graphic */
|
|
ASC_pr , ASC_PR, /* print */
|
|
ASC_pu , ASC_PU, /* punct */
|
|
ASC_lo , ASC_LO, /* lower */
|
|
ASC_up , ASC_UP, /* upper */
|
|
ASC_xd , ASC_XD, /* hex */
|
|
UTF_al , UTF_AL, /* utf8 alpha */
|
|
UTF_an , UTF_AN, /* utf8 alphanumeric */
|
|
UTF_bl , UTF_BL, /* utf8 blank */
|
|
UTF_lc , UTF_LC, /* utf8 letter cased */
|
|
UTF_ll , UTF_LL, /* utf8 letter lowercase */
|
|
UTF_lu , UTF_LU, /* utf8 letter uppercase */
|
|
UTF_sp , UTF_SP, /* utf8 space */
|
|
UTF_wr , UTF_WR, /* utf8 word */
|
|
UTF_GRP = 0x8150000,
|
|
UTF_cc = UTF_GRP+2*U8G_Cc, UTF_CC, /* utf8 control char */
|
|
UTF_lt = UTF_GRP+2*U8G_Lt, UTF_LT, /* utf8 letter titlecase */
|
|
UTF_nd = UTF_GRP+2*U8G_Nd, UTF_ND, /* utf8 number decimal */
|
|
UTF_nl = UTF_GRP+2*U8G_Nl, UTF_NL, /* utf8 number letter */
|
|
UTF_pc = UTF_GRP+2*U8G_Pc, UTF_PC, /* utf8 punct connector */
|
|
UTF_pd = UTF_GRP+2*U8G_Pd, UTF_PD, /* utf8 punct dash */
|
|
UTF_pf = UTF_GRP+2*U8G_Pf, UTF_PF, /* utf8 punct final */
|
|
UTF_pi = UTF_GRP+2*U8G_Pi, UTF_PI, /* utf8 punct initial */
|
|
UTF_sc = UTF_GRP+2*U8G_Sc, UTF_SC, /* utf8 symbol currency */
|
|
UTF_zl = UTF_GRP+2*U8G_Zl, UTF_ZL, /* utf8 separator line */
|
|
UTF_zp = UTF_GRP+2*U8G_Zp, UTF_ZP, /* utf8 separator paragraph */
|
|
UTF_zs = UTF_GRP+2*U8G_Zs, UTF_ZS, /* utf8 separator space */
|
|
UTF_arabic = UTF_GRP+2*U8G_Arabic, UTF_ARABIC,
|
|
UTF_bengali = UTF_GRP+2*U8G_Bengali, UTF_BENGALI,
|
|
UTF_cyrillic = UTF_GRP+2*U8G_Cyrillic, UTF_CYRILLIC,
|
|
UTF_devanagari = UTF_GRP+2*U8G_Devanagari, UTF_DEVANAGARI,
|
|
UTF_georgian = UTF_GRP+2*U8G_Georgian, UTF_GEORGIAN,
|
|
UTF_greek = UTF_GRP+2*U8G_Greek, UTF_GREEK,
|
|
UTF_han = UTF_GRP+2*U8G_Han, UTF_HAN,
|
|
UTF_hiragana = UTF_GRP+2*U8G_Hiragana, UTF_HIRAGANA,
|
|
UTF_katakana = UTF_GRP+2*U8G_Katakana, UTF_KATAKANA,
|
|
UTF_latin = UTF_GRP+2*U8G_Latin, UTF_LATIN,
|
|
UTF_thai = UTF_GRP+2*U8G_Thai, UTF_THAI,
|
|
TOK_ANY = 0x8200000, /* Any character except newline, . */
|
|
TOK_ANYNL , /* Any character including newline, . */
|
|
TOK_NOP , /* No operation, internal use only */
|
|
TOK_BOL , TOK_BOS, /* Beginning of line / string, ^ */
|
|
TOK_EOL , TOK_EOS, /* End of line / string, $ */
|
|
TOK_EOZ , /* End of line with optional NL */
|
|
TOK_CCLASS , /* Character class, [] */
|
|
TOK_NCCLASS , /* Negated character class, [] */
|
|
TOK_WBOUND , /* Non-word boundary, not consuming meta char */
|
|
TOK_NWBOUND , /* Word boundary, not consuming meta char */
|
|
TOK_CASED , /* (?-i) */
|
|
TOK_ICASE , /* (?i) */
|
|
TOK_END = 0x82FFFFF, /* Terminate: match found */
|
|
};
|
|
|
|
/*
|
|
* _regexec execution lists
|
|
*/
|
|
#define _LISTSIZE 10
|
|
#define _BIGLISTSIZE (10*_LISTSIZE)
|
|
|
|
typedef struct _Relist
|
|
{
|
|
_Reinst* inst; /* Reinstruction of the thread */
|
|
_Resublist se; /* matched subexpressions in this thread */
|
|
} _Relist;
|
|
|
|
typedef struct _Reljunk
|
|
{
|
|
_Relist* relist[2];
|
|
_Relist* reliste[2];
|
|
int starttype;
|
|
_Rune startchar;
|
|
const char* starts;
|
|
const char* eol;
|
|
} _Reljunk;
|
|
|
|
/*
|
|
* utf8 and _Rune code
|
|
*/
|
|
|
|
static inline int
|
|
chartorune(_Rune *rune, const char *s)
|
|
{
|
|
utf8_decode_t d = {.state=0};
|
|
int n = utf8_decode_codepoint(&d, s, NULL);
|
|
*rune = d.codep;
|
|
return n;
|
|
}
|
|
|
|
static const char*
|
|
utfrune(const char *s, _Rune c) // search
|
|
{
|
|
if (c < 0x80) /* ascii */
|
|
return strchr((char *)s, (int)c);
|
|
|
|
utf8_decode_t d = {.state=0};
|
|
while (*s != 0) {
|
|
int n = utf8_decode_codepoint(&d, s, NULL);
|
|
if (d.codep == c) return s;
|
|
s += n;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static const char*
|
|
utfruneicase(const char *s, _Rune c) {
|
|
if (c < 0x80) {
|
|
for (int low = tolower((int)c); *s != 0; ++s)
|
|
if (tolower(*s) == low)
|
|
return s;
|
|
} else {
|
|
utf8_decode_t d = {.state=0};
|
|
c = utf8_casefold(c);
|
|
while (*s != 0) {
|
|
int n = utf8_decode_codepoint(&d, s, NULL);
|
|
if (utf8_casefold(d.codep) == c)
|
|
return s;
|
|
s += n;
|
|
}
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
/************
|
|
* regaux.c *
|
|
************/
|
|
|
|
/*
|
|
* save a new match in mp
|
|
*/
|
|
static void
|
|
_renewmatch(_Resub *mp, int ms, _Resublist *sp, int nsubids)
|
|
{
|
|
if (mp==NULL || ms==0)
|
|
return;
|
|
if (mp[0].buf == NULL || sp->m[0].buf < mp[0].buf ||
|
|
(sp->m[0].buf == mp[0].buf && sp->m[0].size > mp[0].size)) {
|
|
for (int i=0; i<ms && i<=nsubids; i++)
|
|
mp[i] = sp->m[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Note optimization in _renewthread:
|
|
* *lp must be pending when _renewthread called; if *l has been looked
|
|
* at already, the optimization is a bug.
|
|
*/
|
|
static _Relist*
|
|
_renewthread(_Relist *lp, /* _relist to add to */
|
|
_Reinst *ip, /* instruction to add */
|
|
int ms,
|
|
_Resublist *sep) /* pointers to subexpressions */
|
|
{
|
|
_Relist *p;
|
|
|
|
for (p=lp; p->inst; p++) {
|
|
if (p->inst == ip) {
|
|
if (sep->m[0].buf < p->se.m[0].buf) {
|
|
if (ms > 1)
|
|
p->se = *sep;
|
|
else
|
|
p->se.m[0] = sep->m[0];
|
|
}
|
|
return 0;
|
|
}
|
|
}
|
|
p->inst = ip;
|
|
if (ms > 1)
|
|
p->se = *sep;
|
|
else
|
|
p->se.m[0] = sep->m[0];
|
|
(++p)->inst = NULL;
|
|
return p;
|
|
}
|
|
|
|
/*
|
|
* same as renewthread, but called with
|
|
* initial empty start pointer.
|
|
*/
|
|
static _Relist*
|
|
_renewemptythread(_Relist *lp, /* _relist to add to */
|
|
_Reinst *ip, /* instruction to add */
|
|
int ms,
|
|
const char *sp) /* pointers to subexpressions */
|
|
{
|
|
_Relist *p;
|
|
|
|
for (p=lp; p->inst; p++) {
|
|
if (p->inst == ip) {
|
|
if (sp < p->se.m[0].buf) {
|
|
if (ms > 1)
|
|
memset(&p->se, 0, sizeof(p->se));
|
|
p->se.m[0].buf = sp;
|
|
}
|
|
return 0;
|
|
}
|
|
}
|
|
p->inst = ip;
|
|
if (ms > 1)
|
|
memset(&p->se, 0, sizeof(p->se));
|
|
p->se.m[0].buf = sp;
|
|
(++p)->inst = NULL;
|
|
return p;
|
|
}
|
|
|
|
/*
|
|
* _Parser Information
|
|
*/
|
|
typedef struct _Node
|
|
{
|
|
_Reinst* first;
|
|
_Reinst* last;
|
|
} _Node;
|
|
|
|
#define _NSTACK 20
|
|
typedef struct _Parser
|
|
{
|
|
const char* exprp; /* pointer to next character in source expression */
|
|
_Node andstack[_NSTACK];
|
|
_Node* andp;
|
|
_Token atorstack[_NSTACK];
|
|
_Token* atorp;
|
|
short subidstack[_NSTACK]; /* parallel to atorstack */
|
|
short* subidp;
|
|
short cursubid; /* id of current subexpression */
|
|
int error;
|
|
_Reflags flags;
|
|
int dot_type;
|
|
int rune_type;
|
|
bool litmode;
|
|
bool lastwasand; /* Last token was _operand */
|
|
short nbra;
|
|
short nclass;
|
|
isize instcap;
|
|
_Rune yyrune; /* last lex'd rune */
|
|
_Reclass *yyclassp; /* last lex'd class */
|
|
_Reclass* classp;
|
|
_Reinst* freep;
|
|
jmp_buf regkaboom;
|
|
} _Parser;
|
|
|
|
/* predeclared crap */
|
|
static void _operator(_Parser *par, _Token type);
|
|
static void _pushand(_Parser *par, _Reinst *first, _Reinst *last);
|
|
static void _pushator(_Parser *par, _Token type);
|
|
static void _evaluntil(_Parser *par, _Token type);
|
|
static int _bldcclass(_Parser *par);
|
|
|
|
static void
|
|
_rcerror(_Parser *par, cregex_result err)
|
|
{
|
|
par->error = err;
|
|
longjmp(par->regkaboom, 1);
|
|
}
|
|
|
|
static _Reinst*
|
|
_newinst(_Parser *par, _Token t)
|
|
{
|
|
par->freep->type = t;
|
|
par->freep->l.left = 0;
|
|
par->freep->r.right = 0;
|
|
return par->freep++;
|
|
}
|
|
|
|
static void
|
|
_operand(_Parser *par, _Token t)
|
|
{
|
|
_Reinst *i;
|
|
|
|
if (par->lastwasand)
|
|
_operator(par, TOK_CAT); /* catenate is implicit */
|
|
i = _newinst(par, t);
|
|
switch (t) {
|
|
case TOK_CCLASS: case TOK_NCCLASS:
|
|
i->r.classp = par->yyclassp; break;
|
|
case TOK_RUNE:
|
|
i->r.rune = par->yyrune; break;
|
|
case TOK_IRUNE:
|
|
i->r.rune = utf8_casefold(par->yyrune);
|
|
}
|
|
_pushand(par, i, i);
|
|
par->lastwasand = true;
|
|
}
|
|
|
|
static void
|
|
_operator(_Parser *par, _Token t)
|
|
{
|
|
if (t==TOK_RBRA && --par->nbra<0)
|
|
_rcerror(par, CREG_UNMATCHEDRIGHTPARENTHESIS);
|
|
if (t==TOK_LBRA) {
|
|
if (++par->cursubid >= _NSUBEXP)
|
|
_rcerror(par, CREG_TOOMANYSUBEXPRESSIONS);
|
|
par->nbra++;
|
|
if (par->lastwasand)
|
|
_operator(par, TOK_CAT);
|
|
} else
|
|
_evaluntil(par, t);
|
|
if (t != TOK_RBRA)
|
|
_pushator(par, t);
|
|
par->lastwasand = 0;
|
|
if (t==TOK_STAR || t==TOK_QUEST || t==TOK_PLUS || t==TOK_RBRA)
|
|
par->lastwasand = true; /* these look like operands */
|
|
}
|
|
|
|
static void
|
|
_pushand(_Parser *par, _Reinst *f, _Reinst *l)
|
|
{
|
|
if (par->andp >= &par->andstack[_NSTACK])
|
|
_rcerror(par, CREG_OPERANDSTACKOVERFLOW);
|
|
par->andp->first = f;
|
|
par->andp->last = l;
|
|
par->andp++;
|
|
}
|
|
|
|
static void
|
|
_pushator(_Parser *par, _Token t)
|
|
{
|
|
if (par->atorp >= &par->atorstack[_NSTACK])
|
|
_rcerror(par, CREG_OPERATORSTACKOVERFLOW);
|
|
*par->atorp++ = t;
|
|
*par->subidp++ = par->cursubid;
|
|
}
|
|
|
|
static _Node*
|
|
_popand(_Parser *par, _Token op)
|
|
{
|
|
(void)op;
|
|
_Reinst *inst;
|
|
|
|
if (par->andp <= &par->andstack[0]) {
|
|
_rcerror(par, CREG_MISSINGOPERAND);
|
|
inst = _newinst(par, TOK_NOP);
|
|
_pushand(par, inst, inst);
|
|
}
|
|
return --par->andp;
|
|
}
|
|
|
|
static _Token
|
|
_popator(_Parser *par)
|
|
{
|
|
if (par->atorp <= &par->atorstack[0])
|
|
_rcerror(par, CREG_OPERATORSTACKUNDERFLOW);
|
|
--par->subidp;
|
|
return *--par->atorp;
|
|
}
|
|
|
|
|
|
static void
|
|
_evaluntil(_Parser *par, _Token pri)
|
|
{
|
|
_Node *op1, *op2;
|
|
_Reinst *inst1, *inst2;
|
|
|
|
while (pri==TOK_RBRA || par->atorp[-1]>=pri) {
|
|
switch (_popator(par)) {
|
|
default:
|
|
_rcerror(par, CREG_UNKNOWNOPERATOR);
|
|
break;
|
|
case TOK_LBRA: /* must have been TOK_RBRA */
|
|
op1 = _popand(par, '(');
|
|
inst2 = _newinst(par, TOK_RBRA);
|
|
inst2->r.subid = *par->subidp;
|
|
op1->last->l.next = inst2;
|
|
inst1 = _newinst(par, TOK_LBRA);
|
|
inst1->r.subid = *par->subidp;
|
|
inst1->l.next = op1->first;
|
|
_pushand(par, inst1, inst2);
|
|
return;
|
|
case TOK_OR:
|
|
op2 = _popand(par, '|');
|
|
op1 = _popand(par, '|');
|
|
inst2 = _newinst(par, TOK_NOP);
|
|
op2->last->l.next = inst2;
|
|
op1->last->l.next = inst2;
|
|
inst1 = _newinst(par, TOK_OR);
|
|
inst1->r.right = op1->first;
|
|
inst1->l.left = op2->first;
|
|
_pushand(par, inst1, inst2);
|
|
break;
|
|
case TOK_CAT:
|
|
op2 = _popand(par, 0);
|
|
op1 = _popand(par, 0);
|
|
op1->last->l.next = op2->first;
|
|
_pushand(par, op1->first, op2->last);
|
|
break;
|
|
case TOK_STAR:
|
|
op2 = _popand(par, '*');
|
|
inst1 = _newinst(par, TOK_OR);
|
|
op2->last->l.next = inst1;
|
|
inst1->r.right = op2->first;
|
|
_pushand(par, inst1, inst1);
|
|
break;
|
|
case TOK_PLUS:
|
|
op2 = _popand(par, '+');
|
|
inst1 = _newinst(par, TOK_OR);
|
|
op2->last->l.next = inst1;
|
|
inst1->r.right = op2->first;
|
|
_pushand(par, op2->first, inst1);
|
|
break;
|
|
case TOK_QUEST:
|
|
op2 = _popand(par, '?');
|
|
inst1 = _newinst(par, TOK_OR);
|
|
inst2 = _newinst(par, TOK_NOP);
|
|
inst1->l.left = inst2;
|
|
inst1->r.right = op2->first;
|
|
op2->last->l.next = inst2;
|
|
_pushand(par, inst1, inst2);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
static _Reprog*
|
|
_optimize(_Parser *par, _Reprog *pp)
|
|
{
|
|
_Reinst *inst, *target;
|
|
_Reclass *cl;
|
|
|
|
/*
|
|
* get rid of NOOP chains
|
|
*/
|
|
for (inst = pp->firstinst; inst->type != TOK_END; inst++) {
|
|
target = inst->l.next;
|
|
while (target->type == TOK_NOP)
|
|
target = target->l.next;
|
|
inst->l.next = target;
|
|
}
|
|
|
|
/*
|
|
* The original allocation is for an area larger than
|
|
* necessary. Reallocate to the actual space used
|
|
* and then relocate the code.
|
|
*/
|
|
if ((par->freep - pp->firstinst)*2 > par->instcap)
|
|
return pp;
|
|
|
|
intptr_t ipp = (intptr_t)pp; // convert pointer to integer!
|
|
isize new_allocsize = c_sizeof(_Reprog) + (par->freep - pp->firstinst)*c_sizeof(_Reinst);
|
|
_Reprog *npp = (_Reprog *)c_realloc(pp, pp->allocsize, new_allocsize);
|
|
isize diff = (intptr_t)npp - ipp;
|
|
|
|
if ((npp == NULL) | (diff == 0))
|
|
return (_Reprog *)ipp;
|
|
npp->allocsize = new_allocsize;
|
|
par->freep = (_Reinst *)((char *)par->freep + diff);
|
|
|
|
for (inst = npp->firstinst; inst < par->freep; inst++) {
|
|
switch (inst->type) {
|
|
case TOK_OR:
|
|
case TOK_STAR:
|
|
case TOK_PLUS:
|
|
case TOK_QUEST:
|
|
inst->r.right = (_Reinst *)((char*)inst->r.right + diff);
|
|
break;
|
|
case TOK_CCLASS:
|
|
case TOK_NCCLASS:
|
|
inst->r.right = (_Reinst *)((char*)inst->r.right + diff);
|
|
cl = inst->r.classp;
|
|
cl->end = (_Rune *)((char*)cl->end + diff);
|
|
break;
|
|
}
|
|
if (inst->l.left)
|
|
inst->l.left = (_Reinst *)((char*)inst->l.left + diff);
|
|
}
|
|
npp->startinst = (_Reinst *)((char*)npp->startinst + diff);
|
|
return npp;
|
|
}
|
|
|
|
|
|
static _Reclass*
|
|
_newclass(_Parser *par)
|
|
{
|
|
if (par->nclass >= _NCLASS)
|
|
_rcerror(par, CREG_TOOMANYCHARACTERCLASSES);
|
|
return &(par->classp[par->nclass++]);
|
|
}
|
|
|
|
|
|
static int /* quoted */
|
|
_nextc(_Parser *par, _Rune *rp)
|
|
{
|
|
int ret;
|
|
for (;;) {
|
|
ret = par->litmode;
|
|
par->exprp += chartorune(rp, par->exprp);
|
|
|
|
if (*rp == '\\') {
|
|
if (par->litmode) {
|
|
if (*par->exprp != 'E')
|
|
break;
|
|
par->exprp += 1;
|
|
par->litmode = false;
|
|
continue;
|
|
}
|
|
par->exprp += chartorune(rp, par->exprp);
|
|
if (*rp == 'Q') {
|
|
par->litmode = true;
|
|
continue;
|
|
}
|
|
if (*rp == 'x' && *par->exprp == '{') {
|
|
*rp = (_Rune)strtol(par->exprp + 1, (char **)&par->exprp, 16);
|
|
if (*par->exprp != '}')
|
|
_rcerror(par, CREG_UNMATCHEDRIGHTPARENTHESIS);
|
|
par->exprp++;
|
|
}
|
|
ret = 1;
|
|
}
|
|
break;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
|
|
static void
|
|
_lexasciiclass(_Parser *par, _Rune *rp) /* assume *rp == '[' and *par->exprp == ':' */
|
|
{
|
|
static struct { const char* c; int n, r; } cls[] = {
|
|
{"alnum:]", 7, ASC_an}, {"alpha:]", 7, ASC_al}, {"ascii:]", 7, ASC_as},
|
|
{"blank:]", 7, ASC_bl}, {"cntrl:]", 7, ASC_ct}, {"digit:]", 7, ASC_d},
|
|
{"graph:]", 7, ASC_gr}, {"lower:]", 7, ASC_lo}, {"print:]", 7, ASC_pr},
|
|
{"punct:]", 7, ASC_pu}, {"space:]", 7, ASC_s}, {"upper:]", 7, ASC_up},
|
|
{"xdigit:]", 8, ASC_xd}, {"word:]", 6, ASC_w},
|
|
};
|
|
int inv = par->exprp[1] == '^', off = 1 + inv;
|
|
for (unsigned i = 0; i < (sizeof cls/sizeof *cls); ++i)
|
|
if (strncmp(par->exprp + off, cls[i].c, (size_t)cls[i].n) == 0) {
|
|
*rp = (_Rune)cls[i].r;
|
|
par->exprp += off + cls[i].n;
|
|
break;
|
|
}
|
|
if (par->rune_type == TOK_IRUNE && (*rp == ASC_lo || *rp == ASC_up))
|
|
*rp = (_Rune)ASC_al;
|
|
if (inv && *rp != '[')
|
|
*rp += 1;
|
|
}
|
|
|
|
|
|
static void
|
|
_lexutfclass(_Parser *par, _Rune *rp)
|
|
{
|
|
static struct { const char* c; uint32_t n, r; } cls[] = {
|
|
{"{Alpha}", 7, UTF_al}, {"{L&}", 4, UTF_lc},
|
|
{"{Digit}", 7, UTF_nd}, {"{Nd}", 4, UTF_nd},
|
|
{"{Lower}", 7, UTF_ll}, {"{Ll}", 4, UTF_ll},
|
|
{"{Upper}", 7, UTF_lu}, {"{Lu}", 4, UTF_lu},
|
|
{"{Cntrl}", 7, UTF_cc}, {"{Cc}", 4, UTF_cc},
|
|
{"{Alnum}", 7, UTF_an}, {"{Blank}", 7, UTF_bl},
|
|
{"{Space}", 7, UTF_sp}, {"{Word}", 6, UTF_wr},
|
|
{"{XDigit}", 8, ASC_xd},
|
|
{"{Lt}", 4, UTF_lt}, {"{Nl}", 4, UTF_nl},
|
|
{"{Pc}", 4, UTF_pc}, {"{Pd}", 4, UTF_pd},
|
|
{"{Pf}", 4, UTF_pf}, {"{Pi}", 4, UTF_pi},
|
|
{"{Zl}", 4, UTF_zl}, {"{Zp}", 4, UTF_zp},
|
|
{"{Zs}", 4, UTF_zs}, {"{Sc}", 4, UTF_sc},
|
|
{"{Arabic}", 8, UTF_arabic},
|
|
{"{Bengali}", 9, UTF_bengali},
|
|
{"{Cyrillic}", 10, UTF_cyrillic},
|
|
{"{Devanagari}", 12, UTF_devanagari},
|
|
{"{Georgian}", 10, UTF_georgian},
|
|
{"{Greek}", 7, UTF_greek},
|
|
{"{Han}", 5, UTF_han},
|
|
{"{Hiragana}", 10, UTF_hiragana},
|
|
{"{Katakana}", 10, UTF_katakana},
|
|
{"{Latin}", 7, UTF_latin},
|
|
{"{Thai}", 6, UTF_thai},
|
|
};
|
|
unsigned inv = (*rp == 'P');
|
|
for (unsigned i = 0; i < (sizeof cls/sizeof *cls); ++i) {
|
|
if (strncmp(par->exprp, cls[i].c, (size_t)cls[i].n) == 0) {
|
|
if (par->rune_type == TOK_IRUNE && (cls[i].r == UTF_ll || cls[i].r == UTF_lu))
|
|
*rp = (_Rune)(UTF_lc + inv);
|
|
else
|
|
*rp = (_Rune)(cls[i].r + inv);
|
|
par->exprp += cls[i].n;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
 * Switch cases shared by _lex() and _bldcclass(): translate the character
 * following a backslash into a control character or a class code.
 * The last case has no trailing ';' - the user of the macro supplies it.
 */
#define CASE_RUNE_MAPPINGS(rune) \
    case 't': rune = '\t';   break; \
    case 'n': rune = '\n';   break; \
    case 'r': rune = '\r';   break; \
    case 'v': rune = '\v';   break; \
    case 'f': rune = '\f';   break; \
    case 'a': rune = '\a';   break; \
    case 'd': rune = UTF_nd; break; \
    case 'D': rune = UTF_ND; break; \
    case 's': rune = UTF_sp; break; \
    case 'S': rune = UTF_SP; break; \
    case 'w': rune = UTF_wr; break; \
    case 'W': rune = UTF_WR; break
|
|
|
|
|
|
static _Token
|
|
_lex(_Parser *par)
|
|
{
|
|
bool quoted = _nextc(par, &par->yyrune);
|
|
|
|
if (quoted) {
|
|
if (par->litmode)
|
|
return par->rune_type;
|
|
|
|
switch (par->yyrune) {
|
|
CASE_RUNE_MAPPINGS(par->yyrune);
|
|
case 'b': return TOK_WBOUND;
|
|
case 'B': return TOK_NWBOUND;
|
|
case 'A': return TOK_BOS;
|
|
case 'z': return TOK_EOS;
|
|
case 'Z': return TOK_EOZ;
|
|
case 'p': case 'P':
|
|
_lexutfclass(par, &par->yyrune);
|
|
break;
|
|
}
|
|
return par->rune_type;
|
|
}
|
|
|
|
switch (par->yyrune) {
|
|
case 0 : return TOK_END;
|
|
case '*': return TOK_STAR;
|
|
case '?': return TOK_QUEST;
|
|
case '+': return TOK_PLUS;
|
|
case '|': return TOK_OR;
|
|
case '^': return TOK_BOL;
|
|
case '$': return TOK_EOL;
|
|
case '.': return par->dot_type;
|
|
case '[': return _bldcclass(par);
|
|
case '(':
|
|
if (par->exprp[0] == '?') { /* override global flags */
|
|
for (int k = 1, enable = 1; ; ++k) switch (par->exprp[k]) {
|
|
case 0 : par->exprp += k; return TOK_END;
|
|
case ')': par->exprp += k + 1;
|
|
return TOK_CASED + (par->rune_type == TOK_IRUNE);
|
|
case '-': enable = 0; break;
|
|
case 's': par->dot_type = TOK_ANY + enable; break;
|
|
case 'i': par->rune_type = TOK_RUNE + enable; break;
|
|
default: _rcerror(par, CREG_UNKNOWNOPERATOR); return 0;
|
|
}
|
|
}
|
|
return TOK_LBRA;
|
|
case ')': return TOK_RBRA;
|
|
}
|
|
return par->rune_type;
|
|
}
|
|
|
|
|
|
static _Token
|
|
_bldcclass(_Parser *par)
|
|
{
|
|
_Token type;
|
|
_Rune r[_NCCRUNE];
|
|
_Rune *p, *ep, *np;
|
|
_Rune rune;
|
|
int quoted;
|
|
|
|
/* we have already seen the '[' */
|
|
type = TOK_CCLASS;
|
|
par->yyclassp = _newclass(par);
|
|
|
|
/* look ahead for negation */
|
|
/* SPECIAL CASE!!! negated classes don't match \n */
|
|
ep = r;
|
|
quoted = _nextc(par, &rune);
|
|
if (!quoted && rune == '^') {
|
|
type = TOK_NCCLASS;
|
|
quoted = _nextc(par, &rune);
|
|
ep[0] = ep[1] = '\n';
|
|
ep += 2;
|
|
}
|
|
|
|
/* parse class into a set of spans */
|
|
for (; ep < &r[_NCCRUNE]; quoted = _nextc(par, &rune)) {
|
|
if (rune == 0) {
|
|
_rcerror(par, CREG_MALFORMEDCHARACTERCLASS);
|
|
return 0;
|
|
}
|
|
if (!quoted) {
|
|
if (rune == ']')
|
|
break;
|
|
if (rune == '-') {
|
|
if (ep != r && *par->exprp != ']') {
|
|
quoted = _nextc(par, &rune);
|
|
if (rune == 0) {
|
|
_rcerror(par, CREG_MALFORMEDCHARACTERCLASS);
|
|
return 0;
|
|
}
|
|
ep[-1] = par->rune_type == TOK_IRUNE ? utf8_casefold(rune) : rune;
|
|
continue;
|
|
}
|
|
}
|
|
if (rune == '[' && *par->exprp == ':')
|
|
_lexasciiclass(par, &rune);
|
|
} else switch (rune) {
|
|
CASE_RUNE_MAPPINGS(rune);
|
|
case 'p': case 'P':
|
|
_lexutfclass(par, &rune);
|
|
break;
|
|
}
|
|
ep[0] = ep[1] = par->rune_type == TOK_IRUNE ? utf8_casefold(rune) : rune;
|
|
ep += 2;
|
|
}
|
|
|
|
/* sort on span start */
|
|
for (p = r; p < ep; p += 2)
|
|
for (np = p; np < ep; np += 2)
|
|
if (*np < *p) {
|
|
rune = np[0]; np[0] = p[0]; p[0] = rune;
|
|
rune = np[1]; np[1] = p[1]; p[1] = rune;
|
|
}
|
|
|
|
/* merge spans */
|
|
np = par->yyclassp->spans;
|
|
p = r;
|
|
if (r == ep)
|
|
par->yyclassp->end = np;
|
|
else {
|
|
np[0] = *p++;
|
|
np[1] = *p++;
|
|
for (; p < ep; p += 2)
|
|
if (p[0] <= np[1]) {
|
|
if (p[1] > np[1])
|
|
np[1] = p[1];
|
|
} else {
|
|
np += 2;
|
|
np[0] = p[0];
|
|
np[1] = p[1];
|
|
}
|
|
par->yyclassp->end = np+2;
|
|
}
|
|
|
|
return type;
|
|
}
|
|
|
|
|
|
static _Reprog*
|
|
_regcomp1(_Reprog *pp, _Parser *par, const char *s, int cflags)
|
|
{
|
|
_Token token;
|
|
|
|
/* get memory for the program. estimated max usage */
|
|
isize instcap = 5 + 6*c_strlen(s);
|
|
isize new_allocsize = c_sizeof(_Reprog) + instcap*c_sizeof(_Reinst);
|
|
pp = (_Reprog *)c_realloc(pp, pp ? pp->allocsize : 0, new_allocsize);
|
|
if (pp == NULL) {
|
|
par->error = CREG_OUTOFMEMORY;
|
|
return NULL;
|
|
}
|
|
pp->allocsize = new_allocsize;
|
|
pp->flags.icase = (cflags & CREG_ICASE) != 0;
|
|
pp->flags.dotall = (cflags & CREG_DOTALL) != 0;
|
|
par->instcap = instcap;
|
|
par->freep = pp->firstinst;
|
|
par->classp = pp->cclass;
|
|
par->error = 0;
|
|
|
|
if (setjmp(par->regkaboom))
|
|
goto out;
|
|
|
|
/* go compile the sucker */
|
|
par->flags = pp->flags;
|
|
par->rune_type = pp->flags.icase ? TOK_IRUNE : TOK_RUNE;
|
|
par->dot_type = pp->flags.dotall ? TOK_ANYNL : TOK_ANY;
|
|
par->litmode = false;
|
|
par->exprp = s;
|
|
par->nclass = 0;
|
|
par->nbra = 0;
|
|
par->atorp = par->atorstack;
|
|
par->andp = par->andstack;
|
|
par->subidp = par->subidstack;
|
|
par->lastwasand = false;
|
|
par->cursubid = 0;
|
|
|
|
/* Start with a low priority operator to prime parser */
|
|
_pushator(par, TOK_START-1);
|
|
while ((token = _lex(par)) != TOK_END) {
|
|
if ((token & TOK_MASK) == TOK_OPERATOR)
|
|
_operator(par, token);
|
|
else
|
|
_operand(par, token);
|
|
}
|
|
|
|
/* Close with a low priority operator */
|
|
_evaluntil(par, TOK_START);
|
|
|
|
/* Force TOK_END */
|
|
_operand(par, TOK_END);
|
|
_evaluntil(par, TOK_START);
|
|
|
|
if (par->nbra)
|
|
_rcerror(par, CREG_UNMATCHEDLEFTPARENTHESIS);
|
|
--par->andp; /* points to first and only _operand */
|
|
pp->startinst = par->andp->first;
|
|
|
|
pp = _optimize(par, pp);
|
|
pp->nsubids = par->cursubid;
|
|
out:
|
|
if (par->error) {
|
|
c_free(pp, pp->allocsize);
|
|
pp = NULL;
|
|
}
|
|
return pp;
|
|
}
|
|
|
|
#if defined __clang__
|
|
#pragma clang diagnostic ignored "-Wimplicit-fallthrough"
|
|
#elif defined __GNUC__
|
|
#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
|
|
#endif
|
|
|
|
static int
|
|
_runematch(_Rune s, _Rune r)
|
|
{
|
|
int inv = 0, n;
|
|
switch (s) {
|
|
case ASC_D: inv = 1; case ASC_d: return inv ^ (isdigit((int)r) != 0);
|
|
case ASC_S: inv = 1; case ASC_s: return inv ^ (isspace((int)r) != 0);
|
|
case ASC_W: inv = 1; case ASC_w: return inv ^ ((isalnum((int)r) != 0) | (r == '_'));
|
|
case ASC_AL: inv = 1; case ASC_al: return inv ^ (isalpha((int)r) != 0);
|
|
case ASC_AN: inv = 1; case ASC_an: return inv ^ (isalnum((int)r) != 0);
|
|
case ASC_AS: return (r >= 128); case ASC_as: return (r < 128);
|
|
case ASC_BL: inv = 1; case ASC_bl: return inv ^ ((r == ' ') | (r == '\t'));
|
|
case ASC_CT: inv = 1; case ASC_ct: return inv ^ (iscntrl((int)r) != 0);
|
|
case ASC_GR: inv = 1; case ASC_gr: return inv ^ (isgraph((int)r) != 0);
|
|
case ASC_PR: inv = 1; case ASC_pr: return inv ^ (isprint((int)r) != 0);
|
|
case ASC_PU: inv = 1; case ASC_pu: return inv ^ (ispunct((int)r) != 0);
|
|
case ASC_LO: inv = 1; case ASC_lo: return inv ^ (islower((int)r) != 0);
|
|
case ASC_UP: inv = 1; case ASC_up: return inv ^ (isupper((int)r) != 0);
|
|
case ASC_XD: inv = 1; case ASC_xd: return inv ^ (isxdigit((int)r) != 0);
|
|
case UTF_AN: inv = 1; case UTF_an: return inv ^ (int)utf8_isalnum(r);
|
|
case UTF_BL: inv = 1; case UTF_bl: return inv ^ (int)utf8_isblank(r);
|
|
case UTF_SP: inv = 1; case UTF_sp: return inv ^ (int)utf8_isspace(r);
|
|
case UTF_LL: inv = 1; case UTF_ll: return inv ^ (int)utf8_islower(r);
|
|
case UTF_LU: inv = 1; case UTF_lu: return inv ^ (int)utf8_isupper(r);
|
|
case UTF_LC: inv = 1; case UTF_lc: return inv ^ (int)utf8_iscased(r);
|
|
case UTF_AL: inv = 1; case UTF_al: return inv ^ (int)utf8_isalpha(r);
|
|
case UTF_WR: inv = 1; case UTF_wr: return inv ^ (int)utf8_isword(r);
|
|
case UTF_cc: case UTF_CC:
|
|
case UTF_lt: case UTF_LT:
|
|
case UTF_nd: case UTF_ND:
|
|
case UTF_nl: case UTF_NL:
|
|
case UTF_pc: case UTF_PC:
|
|
case UTF_pd: case UTF_PD:
|
|
case UTF_pf: case UTF_PF:
|
|
case UTF_pi: case UTF_PI:
|
|
case UTF_sc: case UTF_SC:
|
|
case UTF_zl: case UTF_ZL:
|
|
case UTF_zp: case UTF_ZP:
|
|
case UTF_zs: case UTF_ZS:
|
|
case UTF_arabic: case UTF_ARABIC:
|
|
case UTF_bengali: case UTF_BENGALI:
|
|
case UTF_cyrillic: case UTF_CYRILLIC:
|
|
case UTF_devanagari: case UTF_DEVANAGARI:
|
|
case UTF_georgian: case UTF_GEORGIAN:
|
|
case UTF_greek: case UTF_GREEK:
|
|
case UTF_han: case UTF_HAN:
|
|
case UTF_hiragana: case UTF_HIRAGANA:
|
|
case UTF_katakana: case UTF_KATAKANA:
|
|
case UTF_latin: case UTF_LATIN:
|
|
case UTF_thai: case UTF_THAI:
|
|
n = (int)s - UTF_GRP;
|
|
inv = n & 1;
|
|
return inv ^ (int)utf8_isgroup(n / 2, r);
|
|
}
|
|
return s == r;
|
|
}
|
|
|
|
/*
|
|
* return 0 if no match
|
|
* >0 if a match
|
|
* <0 if we ran out of _relist space
|
|
*/
|
|
static int
|
|
_regexec1(const _Reprog *progp, /* program to run */
|
|
const char *bol, /* string to run machine on */
|
|
_Resub *mp, /* subexpression elements */
|
|
int ms, /* number of elements at mp */
|
|
_Reljunk *j,
|
|
int mflags
|
|
)
|
|
{
|
|
int flag=0;
|
|
_Reinst *inst;
|
|
_Relist *tlp;
|
|
_Relist *tl, *nl; /* This list, next list */
|
|
_Relist *tle, *nle; /* Ends of this and next list */
|
|
const char *s, *p;
|
|
_Rune r, *rp, *ep;
|
|
int n, checkstart, match = 0;
|
|
int i;
|
|
|
|
bool icase = progp->flags.icase;
|
|
checkstart = j->starttype;
|
|
if (mp)
|
|
for (i=0; i<ms; i++) {
|
|
mp[i].buf = NULL;
|
|
mp[i].size = 0;
|
|
}
|
|
j->relist[0][0].inst = NULL;
|
|
j->relist[1][0].inst = NULL;
|
|
|
|
/* Execute machine once for each character, including terminal NUL */
|
|
s = j->starts;
|
|
do {
|
|
/* fast check for first char */
|
|
if (checkstart) {
|
|
switch (j->starttype) {
|
|
case TOK_IRUNE:
|
|
p = utfruneicase(s, j->startchar);
|
|
goto next1;
|
|
case TOK_RUNE:
|
|
p = utfrune(s, j->startchar);
|
|
next1:
|
|
if (p == NULL || s == j->eol)
|
|
return match;
|
|
s = p;
|
|
break;
|
|
case TOK_BOL:
|
|
if (s == bol)
|
|
break;
|
|
p = utfrune(s, '\n');
|
|
if (p == NULL || s == j->eol)
|
|
return match;
|
|
s = p+1;
|
|
break;
|
|
}
|
|
}
|
|
r = *(uint8_t*)s;
|
|
n = r < 0x80 ? 1 : chartorune(&r, s);
|
|
|
|
/* switch run lists */
|
|
tl = j->relist[flag];
|
|
tle = j->reliste[flag];
|
|
nl = j->relist[flag^=1];
|
|
nle = j->reliste[flag];
|
|
nl->inst = NULL;
|
|
|
|
/* Add first instruction to current list */
|
|
if (match == 0)
|
|
_renewemptythread(tl, progp->startinst, ms, s);
|
|
|
|
/* Execute machine until current list is empty */
|
|
for (tlp=tl; tlp->inst; tlp++) { /* assignment = */
|
|
for (inst = tlp->inst; ; inst = inst->l.next) {
|
|
int ok = false;
|
|
|
|
switch (inst->type) {
|
|
case TOK_IRUNE:
|
|
r = utf8_casefold(r); /* FALLTHRU */
|
|
case TOK_RUNE:
|
|
ok = _runematch(inst->r.rune, r);
|
|
break;
|
|
case TOK_CASED: case TOK_ICASE:
|
|
icase = inst->type == TOK_ICASE;
|
|
continue;
|
|
case TOK_LBRA:
|
|
tlp->se.m[inst->r.subid].buf = s;
|
|
continue;
|
|
case TOK_RBRA:
|
|
tlp->se.m[inst->r.subid].size = (s - tlp->se.m[inst->r.subid].buf);
|
|
continue;
|
|
case TOK_ANY:
|
|
ok = (r != '\n');
|
|
break;
|
|
case TOK_ANYNL:
|
|
ok = true;
|
|
break;
|
|
case TOK_BOL:
|
|
if (s == bol || s[-1] == '\n') continue;
|
|
break;
|
|
case TOK_BOS:
|
|
if (s == bol) continue;
|
|
break;
|
|
case TOK_EOL:
|
|
if (r == '\n') continue; /* FALLTHRU */
|
|
case TOK_EOS:
|
|
if (s == j->eol || r == 0) continue;
|
|
break;
|
|
case TOK_EOZ:
|
|
if (s == j->eol || r == 0 || (r == '\n' && s[1] == 0)) continue;
|
|
break;
|
|
case TOK_NWBOUND:
|
|
ok = true; /* FALLTHRU */
|
|
case TOK_WBOUND:
|
|
if (ok ^ (r == 0 || s == bol || s == j->eol ||
|
|
(utf8_isword(utf8_peek_at(s, -1)) ^
|
|
utf8_isword(utf8_peek(s)))))
|
|
continue;
|
|
break;
|
|
case TOK_NCCLASS:
|
|
ok = true; /* FALLTHRU */
|
|
case TOK_CCLASS:
|
|
ep = inst->r.classp->end;
|
|
if (icase) r = utf8_casefold(r);
|
|
for (rp = inst->r.classp->spans; rp < ep; rp += 2) {
|
|
if ((r >= rp[0] && r <= rp[1]) || (rp[0] == rp[1] && _runematch(rp[0], r)))
|
|
break;
|
|
}
|
|
ok ^= (rp < ep);
|
|
break;
|
|
case TOK_OR:
|
|
/* evaluate right choice later */
|
|
if (_renewthread(tlp, inst->r.right, ms, &tlp->se) == tle)
|
|
return -1;
|
|
/* efficiency: advance and re-evaluate */
|
|
continue;
|
|
case TOK_END: /* Match! */
|
|
match = !(mflags & CREG_FULLMATCH) ||
|
|
((s == j->eol || r == 0 || r == '\n') &&
|
|
(tlp->se.m[0].buf == bol || tlp->se.m[0].buf[-1] == '\n'));
|
|
tlp->se.m[0].size = (s - tlp->se.m[0].buf);
|
|
if (mp != NULL)
|
|
_renewmatch(mp, ms, &tlp->se, progp->nsubids);
|
|
break;
|
|
}
|
|
|
|
if (ok && _renewthread(nl, inst->l.next, ms, &tlp->se) == nle)
|
|
return -1;
|
|
break;
|
|
}
|
|
}
|
|
if (s == j->eol)
|
|
break;
|
|
checkstart = j->starttype && nl->inst==NULL;
|
|
s += n;
|
|
} while (r);
|
|
return match;
|
|
}
|
|
|
|
|
|
static int
|
|
_regexec2(const _Reprog *progp, /* program to run */
|
|
const char *bol, /* string to run machine on */
|
|
_Resub *mp, /* subexpression elements */
|
|
int ms, /* number of elements at mp */
|
|
_Reljunk *j,
|
|
int mflags
|
|
)
|
|
{
|
|
int rv;
|
|
_Relist *relists;
|
|
|
|
/* mark space */
|
|
isize sz = 2 * _BIGLISTSIZE*c_sizeof(_Relist);
|
|
relists = (_Relist *)c_malloc(sz);
|
|
if (relists == NULL)
|
|
return -1;
|
|
|
|
j->relist[0] = relists;
|
|
j->relist[1] = relists + _BIGLISTSIZE;
|
|
j->reliste[0] = relists + _BIGLISTSIZE - 2;
|
|
j->reliste[1] = relists + 2*_BIGLISTSIZE - 2;
|
|
|
|
rv = _regexec1(progp, bol, mp, ms, j, mflags);
|
|
c_free(relists, sz);
|
|
return rv;
|
|
}
|
|
|
|
static int
|
|
_regexec(const _Reprog *progp, /* program to run */
|
|
const char *bol, /* string to run machine on */
|
|
const char *bol_end,/* end of string (or NULL for null-termination) */
|
|
int ms, /* number of elements at mp */
|
|
_Resub mp[], /* subexpression elements */
|
|
int mflags)
|
|
{
|
|
_Reljunk j;
|
|
_Relist relist0[_LISTSIZE], relist1[_LISTSIZE];
|
|
int rv;
|
|
|
|
/*
|
|
* use user-specified starting/ending location if specified
|
|
*/
|
|
j.starts = bol;
|
|
j.eol = bol_end;
|
|
|
|
if ((mflags & CREG_NEXT) && mp[0].buf)
|
|
j.starts = mp[0].buf + mp[0].size;
|
|
if (j.eol && j.starts > j.eol)
|
|
return 0; // no match
|
|
|
|
j.starttype = 0;
|
|
j.startchar = 0;
|
|
int rune_type = progp->flags.icase ? TOK_IRUNE : TOK_RUNE;
|
|
if (progp->startinst->type == rune_type && progp->startinst->r.rune < 128) {
|
|
j.starttype = rune_type;
|
|
j.startchar = progp->startinst->r.rune;
|
|
}
|
|
if (progp->startinst->type == TOK_BOL)
|
|
j.starttype = TOK_BOL;
|
|
|
|
/* mark space */
|
|
j.relist[0] = relist0;
|
|
j.relist[1] = relist1;
|
|
j.reliste[0] = relist0 + _LISTSIZE - 2;
|
|
j.reliste[1] = relist1 + _LISTSIZE - 2;
|
|
|
|
rv = _regexec1(progp, bol, mp, ms, &j, mflags);
|
|
if (rv >= 0)
|
|
return rv;
|
|
rv = _regexec2(progp, bol, mp, ms, &j, mflags);
|
|
return rv;
|
|
}
|
|
|
|
|
|
static void
|
|
_build_substitution(const char* replace, int nmatch, const csview match[],
|
|
bool(*transform)(int, csview, cstr*), cstr* subst) {
|
|
cstr_buf mbuf = cstr_getbuf(subst);
|
|
isize len = 0, cap = mbuf.cap;
|
|
char* dst = mbuf.data;
|
|
cstr tr_str = {0};
|
|
|
|
while (*replace != '\0') {
|
|
if (*replace == '$') {
|
|
int arg = replace[1];
|
|
if (arg >= '0' && arg <= '9') {
|
|
arg -= '0';
|
|
if (replace[2] >= '0' && replace[2] <= '9' && replace[3] == ';')
|
|
{ arg = arg*10 + (replace[2] - '0'); replace += 2; }
|
|
replace += 2;
|
|
if (arg < nmatch) {
|
|
csview tr_sv = transform && transform(arg, match[arg], &tr_str)
|
|
? cstr_sv(&tr_str) : match[arg];
|
|
if (len + tr_sv.size > cap)
|
|
dst = cstr_reserve(subst, cap += cap/2 + tr_sv.size);
|
|
for (int i = 0; i < tr_sv.size; ++i)
|
|
dst[len++] = tr_sv.buf[i];
|
|
}
|
|
continue;
|
|
}
|
|
if (arg == '$') // allow e.g. "$$3" => "$3"
|
|
++replace;
|
|
}
|
|
if (len == cap)
|
|
dst = cstr_reserve(subst, cap += cap/2 + 4);
|
|
dst[len++] = *replace++;
|
|
}
|
|
cstr_drop(&tr_str);
|
|
_cstr_set_size(subst, len);
|
|
}
|
|
|
|
|
|
/* ---------------------------------------------------------------
 * API functions
 */
|
|
|
|
/*
 * Compile `pattern` with `cflags`. The current self->prog is handed to
 * _regcomp1 and replaced by whatever it returns; the parser's error code is
 * stored in self->error and returned.
 */
int cregex_compile_pro(cregex *self, const char* pattern, int cflags) {
    _Parser parser;

    self->prog = _regcomp1(self->prog, &parser, pattern, cflags);
    self->error = parser.error;
    return self->error;
}
|
|
|
|
int cregex_captures(const cregex* self) {
|
|
return self->prog ? self->prog->nsubids : 0;
|
|
}
|
|
|
|
/*
 * Release the compiled program owned by `self`.
 *
 * Safe to call on a regex that holds no program (self->prog == NULL, a state
 * cregex_captures also allows for): the old code read self->prog->allocsize
 * unconditionally. The pointer is cleared afterwards so that a repeated drop,
 * or a later recompile into the same object, never touches freed memory.
 */
void cregex_drop(cregex* self) {
    if (self->prog == NULL)
        return;
    c_free(self->prog, self->prog->allocsize);
    self->prog = NULL;
}
|
|
|
|
int cregex_match_opt(const cregex* re, const char* input, const char* input_end, struct cregex_match_opt opt) {
|
|
int res = _regexec(re->prog, input, input_end, cregex_captures(re) + 1, opt.match, opt.flags);
|
|
switch (res) {
|
|
case 1: return CREG_OK;
|
|
case 0: return CREG_NOMATCH;
|
|
default: return CREG_MATCHERROR;
|
|
}
|
|
}
|
|
|
|
int cregex_match_aio_opt(const char* pattern, const char* input, const char* input_end, struct cregex_match_opt opt) {
|
|
cregex re = cregex_make(pattern, opt.flags);
|
|
if (re.error != CREG_OK) return re.error;
|
|
int res = cregex_match_opt(&re, input, input_end, opt);
|
|
cregex_drop(&re);
|
|
return res;
|
|
}
|
|
|
|
cstr cregex_replace_opt(const cregex* re, const char* input, const char* input_end, const char* replace, struct cregex_replace_opt opt) {
|
|
cstr out = {0};
|
|
cstr subst = {0};
|
|
csview match[CREG_MAX_CAPTURES];
|
|
int nmatch = cregex_captures(re) + 1;
|
|
bool copy = !(opt.flags & CREG_STRIP);
|
|
struct cregex_match_opt mopt = {match};
|
|
opt.count += (opt.count != 0);
|
|
|
|
while (--opt.count && cregex_match_opt(re, input, input_end, mopt) == CREG_OK) {
|
|
_build_substitution(replace, nmatch, match, opt.xform, &subst);
|
|
const isize mpos = (match[0].buf - input);
|
|
if (copy & (mpos > 0))
|
|
cstr_append_n(&out, input, mpos);
|
|
cstr_append_s(&out, subst);
|
|
input = match[0].buf + match[0].size;
|
|
}
|
|
if (copy) {
|
|
isize len = input_end ? input_end - input : c_strlen(input);
|
|
cstr_append_sv(&out, c_sv(input, len));
|
|
}
|
|
cstr_drop(&subst);
|
|
return out;
|
|
}
|
|
|
|
cstr cregex_replace_aio_opt(const char* pattern, const char* input, const char* input_end, const char* replace, struct cregex_replace_opt opt) {
|
|
cregex re = {0};
|
|
if (cregex_compile_pro(&re, pattern, opt.flags) != CREG_OK)
|
|
assert(0);
|
|
cstr out = cregex_replace_opt(&re, input, input_end, replace, opt);
|
|
cregex_drop(&re);
|
|
return out;
|
|
}
|
|
|
|
#endif // STC_CREGEX_PRV_C_INCLUDED
|