On 1 March 2016 at 12:43, Greg Reagle <[email protected]> wrote:
> That's interesting. It's funny that sam does a better job since acme is a
> successor to sam. I wonder if/how they share code.
acme/regx.c certainly was derived from sam/regexp.c. See attached diff.
cls
--- sam/regexp.c 2016-03-01 16:52:35.085509675 +0000
+++ acme/regx.c 2016-03-01 16:52:34.861508409 +0000
@@ -1,47 +1,52 @@
-#include "sam.h"
+#include <u.h>
+#include <libc.h>
+#include <draw.h>
+#include <thread.h>
+#include <cursor.h>
+#include <mouse.h>
+#include <keyboard.h>
+#include <frame.h>
+#include <fcall.h>
+#include <plumb.h>
+#include "dat.h"
+#include "fns.h"
Rangeset sel;
-String lastregexp;
+Rune *lastregexp;
+
/*
* Machine Information
*/
typedef struct Inst Inst;
-
struct Inst
{
- long type; /* < 0x10000 ==> literal, otherwise action */
+ uint type; /* < 0x10000 ==> literal, otherwise action */
union {
- int rsid;
- int rsubid;
+ int sid;
+ int subid;
int class;
- struct Inst *rother;
- struct Inst *rright;
- } r;
+ Inst *other;
+ Inst *right;
+ };
union{
- struct Inst *lleft;
- struct Inst *lnext;
- } l;
+ Inst *left;
+ Inst *next;
+ };
};
-#define sid r.rsid
-#define subid r.rsubid
-#define rclass r.class
-#define other r.rother
-#define right r.rright
-#define left l.lleft
-#define next l.lnext
#define NPROG 1024
Inst program[NPROG];
Inst *progp;
Inst *startinst; /* First inst. of program; might not be program[0] */
Inst *bstartinst; /* same for backwards machine */
+Channel *rechan; /* chan(Inst*) */
typedef struct Ilist Ilist;
struct Ilist
{
Inst *inst; /* Instruction of the thread */
Rangeset se;
- Posn startp; /* first char of match */
+ uint startp; /* first char of match */
};
#define NLIST 127
@@ -120,35 +125,40 @@
void bldcclass(void);
void
-regerror(Err e)
+rxinit(void)
{
- Strzero(&lastregexp);
- error(e);
+ rechan = chancreate(sizeof(Inst*), 0);
+ lastregexp = runemalloc(1);
}
void
-regerror_c(Err e, int c)
+regerror(char *e)
{
- Strzero(&lastregexp);
- error_c(e, c);
+ lastregexp[0] = 0;
+ warning(nil, "regexp: %s\n", e);
+ sendp(rechan, nil);
+ threadexits(nil);
}
Inst *
newinst(int t)
{
if(progp >= &program[NPROG])
- regerror(Etoolong);
+ regerror("expression too long");
progp->type = t;
- progp->left = 0;
- progp->right = 0;
+ progp->left = nil;
+ progp->right = nil;
return progp++;
}
-Inst *
-realcompile(Rune *s)
+void
+realcompile(void *arg)
{
int token;
+ Rune *s;
+ threadsetname("regcomp");
+ s = arg;
startlex(s);
atorp = atorstack;
andp = andstack;
@@ -169,31 +179,44 @@
operand(END);
evaluntil(START);
if(nbra)
- regerror(Eleftpar);
+ regerror("unmatched `('");
--andp; /* points to first and only operand */
- return andp->first;
+ sendp(rechan, andp->first);
+ threadexits(nil);
}
-void
-compile(String *s)
+/* r is null terminated */
+int
+rxcompile(Rune *r)
{
- int i;
+ int i, nr;
Inst *oprogp;
- if(Strcmp(s, &lastregexp)==0)
- return;
+ nr = runestrlen(r)+1;
+ if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE)
+ return TRUE;
+ lastregexp[0] = 0;
for(i=0; i<nclass; i++)
free(class[i]);
nclass = 0;
progp = program;
backwards = FALSE;
- startinst = realcompile(s->s);
+ bstartinst = nil;
+ threadcreate(realcompile, r, STACK);
+ startinst = recvp(rechan);
+ if(startinst == nil)
+ return FALSE;
optimize(program);
oprogp = progp;
backwards = TRUE;
- bstartinst = realcompile(s->s);
+ threadcreate(realcompile, r, STACK);
+ bstartinst = recvp(rechan);
+ if(bstartinst == nil)
+ return FALSE;
optimize(oprogp);
- Strduplstr(&lastregexp, s);
+ lastregexp = runerealloc(lastregexp, nr);
+ runemove(lastregexp, r, nr);
+ return TRUE;
}
void
@@ -206,7 +229,7 @@
if(t == CCLASS){
if(negateclass)
i->type = NCCLASS; /* UGH */
- i->rclass = nclass-1; /* UGH */
+ i->class = nclass-1; /* UGH */
}
pushand(i, i);
lastwasand = TRUE;
@@ -216,12 +239,8 @@
operator(int t)
{
if(t==RBRA && --nbra<0)
- regerror(Erightpar);
+ regerror("unmatched `)'");
if(t==LBRA){
-/*
- * if(++cursubid >= NSUBEXP)
- * regerror(Esubexp);
- */
cursubid++; /* silently ignored */
nbra++;
if(lastwasand)
@@ -236,19 +255,10 @@
}
void
-cant(char *s)
-{
- char buf[100];
-
- sprint(buf, "regexp: can't happen: %s", s);
- panic(buf);
-}
-
-void
pushand(Inst *f, Inst *l)
{
if(andp >= &andstack[NSTACK])
- cant("operand stack overflow");
+ error("operand stack overflow");
andp->first = f;
andp->last = l;
andp++;
@@ -258,9 +268,9 @@
pushator(int t)
{
if(atorp >= &atorstack[NSTACK])
- cant("operator stack overflow");
+ error("operator stack overflow");
*atorp++=t;
- if(cursubid >= NSUBEXP)
+ if(cursubid >= NRange)
*subidp++= -1;
else
*subidp++=cursubid;
@@ -269,19 +279,22 @@
Node *
popand(int op)
{
+ char buf[64];
+
if(andp <= &andstack[0])
- if(op)
- regerror_c(Emissop, op);
- else
- regerror(Ebadregexp);
+ if(op){
+ sprint(buf, "missing operand for %c", op);
+ regerror(buf);
+ }else
+ regerror("malformed regexp");
return --andp;
}
int
-popator(void)
+popator()
{
if(atorp <= &atorstack[0])
- cant("operator stack underflow");
+ error("operator stack underflow");
--subidp;
return *--atorp;
}
@@ -305,7 +318,7 @@
pushand(inst1, inst2);
return; /* must have been RBRA */
default:
- panic("unknown regexp operator");
+ error("unknown regexp operator");
break;
case OR:
op2 = popand('|');
@@ -321,8 +334,11 @@
case CAT:
op2 = popand(0);
op1 = popand(0);
- if(backwards && op2->first->type!=END)
- t = op1, op1 = op2, op2 = t;
+ if(backwards && op2->first->type!=END){
+ t = op1;
+ op1 = op2;
+ op2 = t;
+ }
op1->last->next = op2->first;
pushand(op1->first, op2->last);
break;
@@ -367,31 +383,6 @@
}
}
-#ifdef DEBUG
-void
-dumpstack(void){
- Node *stk;
- int *ip;
-
- dprint("operators\n");
- for(ip = atorstack; ip<atorp; ip++)
- dprint("0%o\n", *ip);
- dprint("operands\n");
- for(stk = andstack; stk<andp; stk++)
- dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
-}
-void
-dump(void){
- Inst *l;
-
- l = program;
- do{
- dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
- l->left-program, l->right-program);
- }while(l++->type);
-}
-#endif
-
void
startlex(Rune *s)
{
@@ -402,8 +393,9 @@
int
lex(void){
- int c= *exprp++;
+ int c;
+ c = *exprp++;
switch(c){
case '\\':
if(*exprp)
@@ -449,10 +441,11 @@
return c;
}
-long
-nextrec(void){
+int
+nextrec(void)
+{
if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
- regerror(Ebadclass);
+ regerror("malformed `[]'");
if(exprp[0] == '\\'){
exprp++;
if(*exprp=='n'){
@@ -467,10 +460,10 @@
void
bldcclass(void)
{
- long c1, c2, n, na;
+ int c1, c2, n, na;
Rune *classp;
- classp = emalloc(DCLASS*RUNESIZE);
+ classp = runemalloc(DCLASS);
n = 0;
na = DCLASS;
/* we have already seen the '[' */
@@ -484,11 +477,11 @@
if(c1 == '-'){
Error:
free(classp);
- regerror(Ebadclass);
+ regerror("malformed `[]'");
}
if(n+4 >= na){ /* 3 runes plus NUL */
na += DCLASS;
- classp = erealloc(classp, na*RUNESIZE);
+ classp = runerealloc(classp, na);
}
if(*exprp == '-'){
exprp++; /* eat '-' */
@@ -504,7 +497,7 @@
classp[n] = 0;
if(nclass == Nclass){
Nclass += DCLASS;
- class = erealloc(class, Nclass*sizeof(Rune*));
+ class = realloc(class, Nclass*sizeof(Rune*));
}
class[nclass++] = classp;
}
@@ -538,66 +531,93 @@
for(p = l; p->inst; p++){
if(p->inst==inst){
- if((sep)->p[0].p1 < p->se.p[0].p1)
+ if((sep)->r[0].q0 < p->se.r[0].q0)
p->se= *sep; /* this would be bug */
return 0; /* It's already there */
}
}
p->inst = inst;
p->se= *sep;
- (p+1)->inst = 0;
+ (p+1)->inst = nil;
return 1;
}
int
-execute(File *f, Posn startp, Posn eof)
+rxnull(void)
+{
+ return startinst==nil || bstartinst==nil;
+}
+
+/* either t!=nil or r!=nil, and we match the string in the appropriate place */
+int
+rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp)
{
- int flag = 0;
+ int flag;
Inst *inst;
Ilist *tlp;
- Posn p = startp;
- int nnl = 0, ntl;
- int c;
- int wrapped = 0;
- int startchar = startinst->type<OPERATOR? startinst->type : 0;
-
- list[0][0].inst = list[1][0].inst = 0;
- sel.p[0].p1 = -1;
+ uint p;
+ int nnl, ntl;
+ int nc, c;
+ int wrapped;
+ int startchar;
+
+ flag = 0;
+ p = startp;
+ startchar = 0;
+ wrapped = 0;
+ nnl = 0;
+ if(startinst->type<OPERATOR)
+ startchar = startinst->type;
+ list[0][0].inst = list[1][0].inst = nil;
+ sel.r[0].q0 = -1;
+ if(t != nil)
+ nc = t->file->nc;
+ else
+ nc = runestrlen(r);
/* Execute machine once for each character */
for(;;p++){
doloop:
- c = filereadc(f, p);
- if(p>=eof || c<0){
+ if(p>=eof || p>=nc){
switch(wrapped++){
case 0: /* let loop run one more click */
case 2:
break;
case 1: /* expired; wrap to beginning */
- if(sel.p[0].p1>=0 || eof!=INFINITY)
+ if(sel.r[0].q0>=0 || eof!=Infinity)
goto Return;
- list[0][0].inst = list[1][0].inst = 0;
+ list[0][0].inst = list[1][0].inst = nil;
p = 0;
goto doloop;
default:
goto Return;
}
- }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
- break;
+ c = 0;
+ }else{
+ if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0)
+ break;
+ if(t != nil)
+ c = textreadc(t, p);
+ else
+ c = r[p];
+ }
/* fast check for first char */
if(startchar && nnl==0 && c!=startchar)
continue;
tl = list[flag];
nl = list[flag^=1];
- nl->inst = 0;
+ nl->inst = nil;
ntl = nnl;
nnl = 0;
- if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
+ if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){
/* Add first instruction to this list */
- sempty.p[0].p1 = p;
+ sempty.r[0].q0 = p;
if(addinst(tl, startinst, &sempty))
- if(++ntl >= NLIST)
+ if(++ntl >= NLIST){
Overflow:
- error(Eoverflow);
+ warning(nil, "regexp list overflow\n");
+ sel.r[0].q0 = -1;
+ goto Return;
+ }
}
/* Execute machine until this list is empty */
for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
@@ -613,12 +633,12 @@
break;
case LBRA:
if(inst->subid>=0)
- tlp->se.p[inst->subid].p1 = p;
+ tlp->se.r[inst->subid].q0 = p;
inst = inst->next;
goto Switchstmt;
case RBRA:
if(inst->subid>=0)
- tlp->se.p[inst->subid].p2 = p;
+ tlp->se.r[inst->subid].q1 = p;
inst = inst->next;
goto Switchstmt;
case ANY:
@@ -626,7 +646,7 @@
goto Addinst;
break;
case BOL:
- if(p==0 || filereadc(f, p - 1)=='\n'){
+ if(p==0 || (t!=nil && textreadc(t, p-1)=='\n')
|| (r!=nil && r[p-1]=='\n')){
Step:
inst = inst->next;
goto Switchstmt;
@@ -637,11 +657,11 @@
goto Step;
break;
case CCLASS:
- if(c>=0 && classmatch(inst->rclass, c, 0))
+ if(c>=0 && classmatch(inst->class, c, 0))
goto Addinst;
break;
case NCCLASS:
- if(c>=0 && classmatch(inst->rclass, c, 1))
+ if(c>=0 && classmatch(inst->class, c, 1))
goto Addinst;
break;
case OR:
@@ -653,77 +673,89 @@
inst = inst->left;
goto Switchstmt;
case END: /* Match! */
- tlp->se.p[0].p2 = p;
+ tlp->se.r[0].q1 = p;
newmatch(&tlp->se);
break;
}
}
}
Return:
- return sel.p[0].p1>=0;
+ *rp = sel;
+ return sel.r[0].q0 >= 0;
}
void
newmatch(Rangeset *sp)
{
- int i;
-
- if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
- (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
- for(i = 0; i<NSUBEXP; i++)
- sel.p[i] = sp->p[i];
+ if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 ||
+ (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1))
+ sel = *sp;
}
int
-bexecute(File *f, Posn startp)
+rxbexecute(Text *t, uint startp, Rangeset *rp)
{
- int flag = 0;
+ int flag;
Inst *inst;
Ilist *tlp;
- Posn p = startp;
- int nnl = 0, ntl;
+ int p;
+ int nnl, ntl;
int c;
- int wrapped = 0;
- int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
+ int wrapped;
+ int startchar;
- list[0][0].inst = list[1][0].inst = 0;
- sel.p[0].p1= -1;
+ flag = 0;
+ nnl = 0;
+ wrapped = 0;
+ p = startp;
+ startchar = 0;
+ if(bstartinst->type<OPERATOR)
+ startchar = bstartinst->type;
+ list[0][0].inst = list[1][0].inst = nil;
+ sel.r[0].q0= -1;
/* Execute machine once for each character, including terminal NUL */
for(;;--p){
doloop:
- if((c = filereadc(f, p - 1))==-1){
+ if(p <= 0){
switch(wrapped++){
case 0: /* let loop run one more click */
case 2:
break;
case 1: /* expired; wrap to end */
- if(sel.p[0].p1>=0)
- case 3:
+ if(sel.r[0].q0>=0)
goto Return;
- list[0][0].inst = list[1][0].inst = 0;
- p = f->nc;
+ list[0][0].inst = list[1][0].inst = nil;
+ p = t->file->nc;
goto doloop;
+ case 3:
default:
goto Return;
}
- }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
- break;
+ c = 0;
+ }else{
+ if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0)
+ break;
+ c = textreadc(t, p-1);
+ }
/* fast check for first char */
if(startchar && nnl==0 && c!=startchar)
continue;
tl = list[flag];
nl = list[flag^=1];
- nl->inst = 0;
+ nl->inst = nil;
ntl = nnl;
nnl = 0;
- if(sel.p[0].p1<0 && (!wrapped || p>startp)){
+ if(sel.r[0].q0<0 && (!wrapped || p>startp)){
/* Add first instruction to this list */
/* the minus is so the optimizations in addinst work */
- sempty.p[0].p1 = -p;
+ sempty.r[0].q0 = -p;
if(addinst(tl, bstartinst, &sempty))
- if(++ntl >= NLIST)
+ if(++ntl >= NLIST){
Overflow:
- error(Eoverflow);
+ warning(nil, "regexp list overflow\n");
+ sel.r[0].q0 = -1;
+ goto Return;
+ }
}
/* Execute machine until this list is empty */
for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
@@ -739,12 +771,12 @@
break;
case LBRA:
if(inst->subid>=0)
- tlp->se.p[inst->subid].p1 = p;
+ tlp->se.r[inst->subid].q0 = p;
inst = inst->next;
goto Switchstmt;
case RBRA:
if(inst->subid >= 0)
- tlp->se.p[inst->subid].p2 = p;
+ tlp->se.r[inst->subid].q1 = p;
inst = inst->next;
goto Switchstmt;
case ANY:
@@ -759,15 +791,15 @@
}
break;
case EOL:
- if(p==f->nc || filereadc(f, p)=='\n')
+ if(p<t->file->nc && textreadc(t, p)=='\n')
goto Step;
break;
case CCLASS:
- if(c>=0 && classmatch(inst->rclass, c, 0))
+ if(c>0 && classmatch(inst->class, c, 0))
goto Addinst;
break;
case NCCLASS:
- if(c>=0 && classmatch(inst->rclass, c, 1))
+ if(c>0 && classmatch(inst->class, c, 1))
goto Addinst;
break;
case OR:
@@ -779,24 +811,26 @@
inst = inst->left;
goto Switchstmt;
case END: /* Match! */
- tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus
sign */
- tlp->se.p[0].p2 = p;
+ tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus
sign */
+ tlp->se.r[0].q1 = p;
bnewmatch(&tlp->se);
break;
}
}
}
Return:
- return sel.p[0].p1>=0;
+ *rp = sel;
+ return sel.r[0].q0 >= 0;
}
void
bnewmatch(Rangeset *sp)
{
int i;
- if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 ||
(sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
- for(i = 0; i<NSUBEXP; i++){ /* note the reversal; p1<=p2
*/
- sel.p[i].p1 = sp->p[i].p2;
- sel.p[i].p2 = sp->p[i].p1;
+
+ if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 ||
(sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0))
+ for(i = 0; i<NRange; i++){ /* note the reversal; q0<=q1
*/
+ sel.r[i].q0 = sp->r[i].q1;
+ sel.r[i].q1 = sp->r[i].q0;
}
}