> Dan: > Looks like your mailer wordwrapped the program pretty badly. Could you try > again as either a context or unified diff (-c or -u) and attached to mail? > I'm curious to look at it, as I've only partially considered how we'll do > regexes to date. >
Oh, sorry. I should stop sending mails from webmin when I am out of job. It is going to kill al my personal relations. :( I attach it again, hope it works right now. The patch is not very documented, so i better explain the opcodes here: * initbr ic: saves a -1 into the integer stack, and DEST to the destination stack. ($1 will be the ultimate failing destinaion) * clearbr i: deletes all items from the integer and destination stack untils is finds a -1 in the integer stack. Checks first if $1 is already -1. * savebr i, ic: saves a pending match. Saves de string position (in the integer satck) + label of the next regex node (in the destination stack). * backtrack i. Pops the integer stack into $1, and then returns to last destination in the destination stack. The other functionalities are implemented in terms of the previous opcodes plus usual ones (branch,..) The part i like less of the thing is the messing with integer and destination stacks. I would rather use separate stacks, but prefered to make the minimal changes possible. If you have a first design of how the whole thing should work, I would modifiy the compiler to follow your design. I attach also an example of the output of the compiler (babyre.pasm), for those who do not want to install the module with its dependences, but want to see how the assembler output looks like. Angel
package BabyRegex; use YAPE::Regex 'BabyRegex'; use strict; use vars '$VERSION'; $VERSION = '0.01'; my %modes = ( on => '', off => '' ); sub buildtree { my $self = shift; my $cnt = 0; my ($groupscnt, @groups); my @tree; while (my $node = $self->next) { $node->id($cnt++); $tree[-1]->next($node) if @tree; if ($node->type =~ /capture|group/) { push @groups, $node; $node->{ALTS} = []; $node->{COUNT} = $groupscnt++; } if ($node->type eq "alt") { push (@{$groups[-1]->{ALTS}}, $node); my $groupnode = $groups[-1]; $node->{GROUP} = $groupnode; push @{$groupnode->{ALTS}}, $node, } if ($node->type eq "close"){ my $groupnode = pop @groups; $groupnode->{CLOSED} = $node; $node->{GROUP} = $groupnode; for my $alt (@{$groupnode->{ALTS}}) { #Alt nodes get its ID replaced by the Closing node ID, so #that the when its antecessors calls ->next->id it gets the good one. #This is probably on of the worse to do that. $alt->{ID} = $node->{ID}; } } push (@tree, $node); } return @tree; } sub cry { if (@_[1]) { my $label = shift; my $opcode = shift; my $spc = " " x (4 - length($label) ) ; print $label. ":" . $spc . $opcode . "\n"; } else { my $opcode = shift; print " $opcode\n"; } } sub pasm { my ($self, $string) = @_; my @tree = $self->buildtree; cry "INIT", "initbrstack FAIL"; cry "set I1, 0"; cry "set S1, \"$string\""; for my $node (@tree) { $node->pasm($self); #print $node->type; } cry "OK", "print \"match\""; cry "clearbrstack I1"; cry "end"; print "\n"; cry "FAIL", "print \"fail\""; cry "clearbrstack I1"; cry "end"; print "\n"; cry "BT", "backtrack I1"; print "\n"; } ## ## shared methods ## sub BabyRegex::Element::id { my $self = shift; my $id = shift; if ($self->{ID}) { return $self->{ID} } else { $self->{ID} = "L" . $id; } } sub BabyRegex::Element::next { my $self = shift; my $next = shift; if ($next) { $self->{NEXT} = $next; return $next; } else { return $self->{NEXT} } } sub BabyRegex::Element::cry_atomic { my $self = shift; my $opcode = shift; my $id = $self->id; if ($self->quant eq "*") { my $nextid = $self->next()->id(); cry $id, "savebr I1, $nextid"; cry $opcode; cry "branc $id"; } elsif ($self->quant eq "+" ) { my $nextid = $self->next()->id(); cry $id, $opcode; cry "savebr I1, $nextid"; cry "branch $id"; } elsif ($self->quant eq "?" ) { my $nextid = $self->next()->id(); cry $id, "savebr I1, $nextid"; cry $opcode; } else { cry $id, $opcode; } } ## ## each element pasm ## sub BabyRegex::anchor::pasm { my $self = shift; my $type = $self->{TEXT}; print $type; } sub BabyRegex::macro::pasm { die "unimplemented\n"; } sub BabyRegex::oct::explanation { die "unimplemented - too lazy\n"; } sub BabyRegex::hex::explanation { die "unimplemented - too lazy\n"; } sub BabyRegex::utf8hex::explanation { die "unimplemented - too lazy\n"; } sub BabyRegex::ctrl::explanation { die "unimplemented - too lazy\n"; } sub BabyRegex::named::explanation { die "unimplemented - too lazy\n"; } sub BabyRegex::Cchar::explanation { die "unimplemented - too lazy\n"; } sub BabyRegex::any::pasm { my $self = shift; my $l; my $id = $self->id; if ($modes{on} =~ /s/) { $self->cry_atomic ("matchanychar S1, I1, BT"); } else { #$self->cry_atomic ("matchanycharbutnl S1, I1, BT"); #we don't have the opcode anyway $self->cry_atomic ("matchanychar S1, I1, BT"); } } sub BabyRegex::text::pasm { my $self = shift; my $text = $self->text; $text =~ s/\n/\\n/g; $text =~ s/\r/\\r/g; $text =~ s/\t/\\t/g; $text =~ s/\f/\\f/g; $text =~ s/'/\\'/g; my $id = $self->id(); $self->cry_atomic ("matchexactly \"$text\", S1, I1, BT"); } sub BabyRegex::alt::pasm { my $self = shift; my $id = $self->id(); my $endofgroup_id = $self->{GROUP}->{CLOSED}->id; cry("branch $endofgroup_id"); } sub BabyRegex::slash::pasm { die "unimplemented\n"; } sub BabyRegex::class::pasm { die "unimplemented\n"; } sub BabyRegex::group::pasm{ my $self = shift; my $id = $self->id; my $cnt = $self->{COUNT}; my $fs = substr($self->fullstring,1,30); print "\n"; cry $id, "#start of n.c. group $cnt ($fs...)"; if ($self->quant eq "*" or $self->quant eq "?") { cry "savebr I1, ". $self->{CLOSED}->next->id(); } foreach my $alt (@{$self->{ALTS}}) { cry "savebr I1, " . $alt->next->id(); } } sub BabyRegex::capture::pasm { # We are not capturing anything yet! my $self = shift; my $id = $self->id; my $cnt = $self->{COUNT}; my $fs = substr($self->fullstring,1,30); print "\n"; if ($self->quant eq "*" or $self->quant eq "?") { cry "savebr I1, ". $self->{CLOSED}->next->id(); } cry $id, "#start of group $cnt ($fs...)"; foreach my $alt (@{$self->{ALTS}}) { cry "savebr I1, ". $alt->next->id(); } } sub BabyRegex::close::pasm { my $self = shift; my $id = $self->id; my $cnt = $self->{GROUP}->{COUNT}; cry $id, "#end of group $cnt"; if ($self->{GROUP}->quant eq "*" or $self->{GROUP}->quant eq "+") { cry "savebr I1, " . $self->next->id(); cry "branch " . $self->{GROUP}->id; } print "\n"; } sub BabyRegex::comment::pasm { } sub BabyRegex::whitespace::pasm{ } sub BabyRegex::lookahead::explanation { die "unimplemented\n"; } sub BabyRegex::lookbehind::explanation { die "unimplemented\n"; } sub BabyRegex::code::pasm { die "unimplemented\n"; } sub BabyRegex::later::pasm { die "unimplemented\n"; } sub BabyRegex::conditional::pasm { die "unimplemented\n"; } sub BabyRegex::cut::pasm { die "unimplemented\n"; } sub BabyRegex::flags::pasm{ die "unimplemented\n"; } sub BabyRegex::backref::pasm { die "unimplemented \n"; } 1; __END__ =head1 NAME BabyRegex - compiles a regular expression down to Parrot bytecode =head1 SYNOPSIS use BabyRegex; BabyRegex->new($REx)->pasm; =head1 SEE ALSO The C<YAPE::Regex> documentation. =head1 AUTHOR Angel Faus [EMAIL PROTECTED] Based in YAPE::Regex::Explain by Jeff Pinyan ([EMAIL PROTECTED]) =cut
use BabyRegex; unless (@ARGV[0] & @ARGV[1]) { print 'usage: perl babyre.pl "pattern" "string"' . "\n"; print 'ex: perl babyre.pl "reg(exp?|ular +expression)?" "regex" > regex.pasm' . "\n"; exit; } $pattern = @ARGV[0]; $string = @ARGV[1]; $c = BabyRegex->new($pattern); $c->pasm($string);
INIT:initbrstack FAIL set I1, 0 set S1, "regex" L0: #start of n.c. group 0 (?-imsx:reg(exp?|ular +expressi...) L1: matchexactly "reg", S1, I1, BT savebr I1, L10 L2: #start of group 1 (exp?|ular +expression)?...) savebr I1, L6 savebr I1, L6 L3: matchexactly "ex", S1, I1, BT L4: savebr I1, L9 matchexactly "p", S1, I1, BT branch L9 L6: matchexactly "ular", S1, I1, BT L7: matchexactly " ", S1, I1, BT savebr I1, L8 branch L7 L8: matchexactly "expression", S1, I1, BT L9: #end of group 1 L10: #end of group 0 OK: print "match" clearbrstack I1 end FAIL:print "fail" clearbrstack I1 end BT: backtrack I1
1814a1815,1882 > ######################################## > > AUTO_OP matchexactly(sc, s, i, ic){ > > STRING* temp; > > > if (string_length($2) <= $3) { > RETREL($4); > } > > temp = string_substr(interpreter, $2, $3 , string_length($1), NULL); > > if (string_compare(interpreter, $1, temp) != 0 ) { > RETREL($4); > } > else { > $3 = $3 + string_length($1); > } > } > > AUTO_OP matchanychar(s, i, ic) { > if (string_length($1) > $2){ > $2++; > } > else { > RETREL($3); > } > } > > MANUAL_OP backtrack(i){ > opcode_t *dest; > > pop_generic_entry(interpreter, &interpreter->user_stack_top, &($1), >STACK_ENTRY_INT); > pop_generic_entry(interpreter, &interpreter->control_stack_top, &dest, >STACK_ENTRY_DESTINATION); > > RETABS(dest); > } > > > AUTO_OP savebr(i, ic){ > > push_generic_entry(interpreter, &interpreter->control_stack_top, cur_opcode + >cur_opcode[2], STACK_ENTRY_DESTINATION, NULL); > > push_generic_entry(interpreter, &interpreter->user_stack_top, &($1), >STACK_ENTRY_INT, NULL); > > } > > AUTO_OP initbrstack(ic) { > INTVAL i; > i = -1; > > push_generic_entry(interpreter, &interpreter->control_stack_top, cur_opcode + >cur_opcode[1], STACK_ENTRY_DESTINATION, NULL); > push_generic_entry(interpreter, &interpreter->user_stack_top, &i, STACK_ENTRY_INT, >NULL); > > } > > AUTO_OP clearbrstack(i){ > opcode_t *dest; > > while ($1 && $1 >= 0) { > pop_generic_entry(interpreter, &interpreter->control_stack_top, &dest, >STACK_ENTRY_DESTINATION); > pop_generic_entry(interpreter, &interpreter->user_stack_top, &($1), >STACK_ENTRY_INT); > } > > } > > 1826a1895 >