Re: Experimental regex support

Angel Faus Fri, 02 Nov 2001 01:52:06 -0800


> Dan:
> Looks like your mailer wordwrapped the program pretty badly. Could you try
> again as either a context or unified diff (-c or -u) and attached to mail?
> I'm curious to look at it, as I've only partially considered how we'll do
> regexes to date.
>


Oh, sorry. I should stop sending mails from webmin when I am out of job. It 
is going to kill al my personal relations. :(

I attach it again, hope it works right now.

The patch is not very documented, so i better explain the opcodes here:

* initbr ic: saves a -1 into the integer stack, and DEST to the destination 
stack.  ($1 will be the ultimate failing destinaion)

* clearbr i: deletes all items from the integer and destination stack untils 
is finds a -1 in the integer stack. Checks first if $1 is already -1.

* savebr i, ic: saves a pending match. Saves de string position (in the 
integer satck) + label of the next regex node (in the destination stack).

* backtrack i. Pops the integer stack into $1, and then returns to last 
destination in the destination stack.

The other functionalities are implemented in terms of the previous opcodes 
plus usual ones (branch,..)

The part i like less of the thing is the messing with integer and destination 
stacks. I would rather use separate stacks, but prefered to make the minimal 
changes possible.

If you have a first design of how the whole thing should work, I would 
modifiy the compiler to follow your design.

I attach also an example of the output of the compiler (babyre.pasm), for 
those who do not want to install the module with its dependences, but want to 
see how the assembler output looks like.

        Angel



package BabyRegex;

use YAPE::Regex 'BabyRegex';
use strict;
use vars '$VERSION';

$VERSION = '0.01';

my %modes = ( on => '', off => '' );

sub buildtree {
  my $self = shift;
  
  my $cnt = 0;
  my ($groupscnt, @groups);
  my @tree;
  
  while (my $node = $self->next) {            
    
    $node->id($cnt++);
    $tree[-1]->next($node) if @tree;  
    
    if ($node->type =~ /capture|group/) {
        push @groups, $node;
        $node->{ALTS} = [];
        $node->{COUNT} = $groupscnt++;
        }       
        
    if ($node->type eq "alt")    {
        push (@{$groups[-1]->{ALTS}}, $node);
        my $groupnode = $groups[-1];
        $node->{GROUP} = $groupnode;
        
        push @{$groupnode->{ALTS}}, $node,  
        }
    
    if ($node->type eq "close"){
        my $groupnode = pop @groups;
        $groupnode->{CLOSED} = $node;
        $node->{GROUP} = $groupnode;
        for my $alt (@{$groupnode->{ALTS}}) {
            #Alt nodes get its ID replaced by the Closing node ID, so 
            #that the when its antecessors calls ->next->id it gets the good one.
            #This is probably on of the worse to do that.
            $alt->{ID} = $node->{ID};
            }
        }
    push (@tree, $node);      
    }
    
  return @tree;  
   
}

sub cry {
  if (@_[1]) {
        my $label = shift;
        my $opcode = shift;
  
        my $spc = " " x (4 - length($label) ) ;
        print $label. ":" . $spc . $opcode . "\n";
  }
  else {
        my $opcode = shift;
        print "     $opcode\n";
  }

}


sub pasm {
  my ($self, $string) = @_;  
  my @tree = $self->buildtree;   
  
  cry "INIT", "initbrstack FAIL";
  cry "set I1, 0";
  cry "set S1, \"$string\"";

  for my $node (@tree) {

    $node->pasm($self);                 

    #print $node->type; 
  }

  
  cry "OK", "print \"match\"";
  cry "clearbrstack I1";
  cry "end";  
  print "\n";
  
  cry "FAIL", "print \"fail\"";
  cry "clearbrstack I1";
  cry "end";
  print "\n";
  
  cry "BT", "backtrack I1";
  print "\n";
}



##
## shared methods
##

sub BabyRegex::Element::id {
   my $self = shift;
   my $id = shift;
   
   if ($self->{ID}) { return $self->{ID} }
   else {
       $self->{ID} = "L" . $id;
   }

}


sub BabyRegex::Element::next {
   my $self = shift;
   my $next = shift;
   
   if ($next) {
        $self->{NEXT} = $next;
        return $next;
   }
   else {
        return $self->{NEXT}
   }
   
}



sub BabyRegex::Element::cry_atomic {
  my $self = shift;
  my $opcode = shift;
  my $id = $self->id;
  
  if ($self->quant eq "*") {            
        my $nextid = $self->next()->id();
        cry $id, "savebr I1, $nextid";
        cry $opcode;
        cry "branc $id";

  } elsif ($self->quant eq "+" ) {
        my $nextid = $self->next()->id();
        cry $id, $opcode;
        cry "savebr I1, $nextid";
        cry "branch $id";       

  } elsif ($self->quant eq "?" ) {
        my $nextid = $self->next()->id();
        cry $id, "savebr I1, $nextid";
        cry $opcode;  
  }
  else {
        cry $id, $opcode;
  }     
}

##
## each element pasm 
##



sub BabyRegex::anchor::pasm {
  my $self = shift;
  my $type = $self->{TEXT};
  print $type;
}


sub BabyRegex::macro::pasm { die "unimplemented\n"; }


sub BabyRegex::oct::explanation {  
  die "unimplemented - too lazy\n";
}

sub BabyRegex::hex::explanation {
  die "unimplemented - too lazy\n";
}

sub BabyRegex::utf8hex::explanation {
  die "unimplemented - too lazy\n";
}

sub BabyRegex::ctrl::explanation {
  die "unimplemented - too lazy\n";
}

sub BabyRegex::named::explanation {
  die "unimplemented - too lazy\n";
}

sub BabyRegex::Cchar::explanation {
  die "unimplemented - too lazy\n";
}


sub BabyRegex::any::pasm {
  my $self = shift;  
  my $l;
  my $id = $self->id;
    
  if ($modes{on} =~ /s/) {
        $self->cry_atomic ("matchanychar S1, I1, BT");
  } else {
        #$self->cry_atomic ("matchanycharbutnl S1, I1, BT");
        #we don't have the opcode anyway
        $self->cry_atomic ("matchanychar S1, I1, BT");
  }

}


sub BabyRegex::text::pasm {
  my $self = shift;
  my $text = $self->text;
  
  $text =~ s/\n/\\n/g;
  $text =~ s/\r/\\r/g;
  $text =~ s/\t/\\t/g;
  $text =~ s/\f/\\f/g;
  $text =~ s/'/\\'/g;
  
  my $id = $self->id();

  $self->cry_atomic ("matchexactly \"$text\", S1, I1, BT");
  
}


sub BabyRegex::alt::pasm {
  my $self = shift;
  my $id = $self->id();
  my $endofgroup_id = $self->{GROUP}->{CLOSED}->id;
    
  cry("branch $endofgroup_id");

}


sub BabyRegex::slash::pasm {  die "unimplemented\n"; }

sub BabyRegex::class::pasm {  die "unimplemented\n"; }


sub BabyRegex::group::pasm{
  my $self = shift;
  
  my $id = $self->id;
  my $cnt = $self->{COUNT};  
  my $fs = substr($self->fullstring,1,30);
  
  print "\n";
  cry $id, "#start of n.c. group $cnt        ($fs...)";

  if ($self->quant eq "*" or $self->quant eq "?") {
        cry "savebr I1, ". $self->{CLOSED}->next->id();         
  }
 
  foreach my $alt (@{$self->{ALTS}}) {
        cry "savebr I1, " . $alt->next->id();
  }
  
}


sub BabyRegex::capture::pasm {
    
  # We are not capturing anything yet! 
  
  my $self = shift;
  my $id = $self->id;
  my $cnt = $self->{COUNT};
  my $fs = substr($self->fullstring,1,30);
  
  print "\n";
  
  if ($self->quant eq "*" or $self->quant eq "?") {
        cry "savebr I1, ". $self->{CLOSED}->next->id();         
  }
        
  cry $id, "#start of group $cnt             ($fs...)";
      
  foreach my $alt (@{$self->{ALTS}}) {
        cry "savebr I1, ". $alt->next->id();
  }
}


sub BabyRegex::close::pasm {
  my $self = shift;  
  my $id = $self->id;
  my $cnt = $self->{GROUP}->{COUNT};

  cry $id, "#end of group $cnt";
  
  if ($self->{GROUP}->quant eq "*" or $self->{GROUP}->quant eq "+") {
        cry "savebr I1, " .  $self->next->id();
        cry "branch " . $self->{GROUP}->id;
        }

  print "\n";
  
}


  
sub BabyRegex::comment::pasm { }

sub BabyRegex::whitespace::pasm{ }


sub BabyRegex::lookahead::explanation { die "unimplemented\n"; }

sub BabyRegex::lookbehind::explanation { die "unimplemented\n"; }

sub BabyRegex::code::pasm {  die "unimplemented\n"; }

sub BabyRegex::later::pasm { die "unimplemented\n"; }

sub BabyRegex::conditional::pasm { die "unimplemented\n"; }

sub BabyRegex::cut::pasm { die "unimplemented\n"; }

sub BabyRegex::flags::pasm{ die "unimplemented\n"; }

sub BabyRegex::backref::pasm { die "unimplemented \n"; }





1;

__END__

=head1 NAME

BabyRegex - compiles a regular expression down to Parrot bytecode

=head1 SYNOPSIS

  use BabyRegex;
  BabyRegex->new($REx)->pasm;

=head1 SEE ALSO

The C<YAPE::Regex> documentation.

=head1 AUTHOR

  Angel Faus
  [EMAIL PROTECTED]
  
  Based in YAPE::Regex::Explain by Jeff Pinyan ([EMAIL PROTECTED])

=cut


use BabyRegex;

unless (@ARGV[0] & @ARGV[1]) {
        print 'usage: perl babyre.pl "pattern" "string"' . "\n";
        print 'ex:    perl babyre.pl "reg(exp?|ular +expression)?" "regex" > 
regex.pasm' . "\n";  
        exit;
}       
        
$pattern = @ARGV[0];
$string = @ARGV[1];

$c = BabyRegex->new($pattern);
$c->pasm($string);

INIT:initbrstack FAIL
     set I1, 0
     set S1, "regex"

L0:  #start of n.c. group 0        (?-imsx:reg(exp?|ular +expressi...)
L1:  matchexactly "reg", S1, I1, BT

     savebr I1, L10
L2:  #start of group 1             (exp?|ular +expression)?...)
     savebr I1, L6
     savebr I1, L6
L3:  matchexactly "ex", S1, I1, BT
L4:  savebr I1, L9
     matchexactly "p", S1, I1, BT
     branch L9
L6:  matchexactly "ular", S1, I1, BT
L7:  matchexactly " ", S1, I1, BT
     savebr I1, L8
     branch L7
L8:  matchexactly "expression", S1, I1, BT
L9:  #end of group 1

L10: #end of group 0

OK:  print "match"
     clearbrstack I1
     end

FAIL:print "fail"
     clearbrstack I1
     end

BT:  backtrack I1

1814a1815,1882
> ########################################
> 
> AUTO_OP matchexactly(sc, s, i, ic){
>   
>   STRING* temp;
>  
>   
>   if (string_length($2) <= $3) {
>     RETREL($4);
>     }   
> 
>   temp = string_substr(interpreter, $2, $3 , string_length($1), NULL);
> 
>   if (string_compare(interpreter, $1, temp) != 0 ) {
>     RETREL($4);
>   }
>   else {
>     $3 = $3 + string_length($1);
>   }
> }  
> 
> AUTO_OP matchanychar(s, i, ic) {
>    if (string_length($1) > $2){   
>       $2++;
>       }
>    else {
>       RETREL($3);
>    }
> }
>        
> MANUAL_OP backtrack(i){
>   opcode_t *dest;
> 
>   pop_generic_entry(interpreter, &interpreter->user_stack_top, &($1), 
>STACK_ENTRY_INT);
>   pop_generic_entry(interpreter, &interpreter->control_stack_top, &dest, 
>STACK_ENTRY_DESTINATION);
> 
>   RETABS(dest);
> }
> 
> 
> AUTO_OP savebr(i, ic){
>  
>   push_generic_entry(interpreter, &interpreter->control_stack_top, cur_opcode + 
>cur_opcode[2],  STACK_ENTRY_DESTINATION, NULL);
> 
>   push_generic_entry(interpreter, &interpreter->user_stack_top, &($1),  
>STACK_ENTRY_INT, NULL);
> 
> }
> 
> AUTO_OP initbrstack(ic) {
>   INTVAL i;
>   i = -1;
>   
>   push_generic_entry(interpreter, &interpreter->control_stack_top, cur_opcode + 
>cur_opcode[1], STACK_ENTRY_DESTINATION, NULL);
>   push_generic_entry(interpreter, &interpreter->user_stack_top, &i, STACK_ENTRY_INT, 
>NULL); 
> 
> }
> 
> AUTO_OP clearbrstack(i){
>   opcode_t *dest;
>   
>   while ($1 && $1 >= 0) {
>       pop_generic_entry(interpreter, &interpreter->control_stack_top, &dest, 
>STACK_ENTRY_DESTINATION);
>       pop_generic_entry(interpreter, &interpreter->user_stack_top, &($1), 
>STACK_ENTRY_INT); 
>       }
>       
> }
> 
> 
1826a1895
>

Re: Experimental regex support

Reply via email to