I've written the code below to parse a number of text page files generated by
Tesseract OCR software to look for a guess the most likely values for VIN,
Reg number and stock number for a vehicle.
On my development site it works fine and gives the values it should.
However, on my live server it only returns '1' as the value.
The only thing I can think of is that my live server is much older than my
development server. My perl versions are:
Live v5.10.0
Devel v5.18.4
Am I right, and what can I do to get the code to work on my live server (other
than upgrage perl)
#!/usr/bin/perl -w
# searches a series of OCR generated text files - one per page
# looks for sets of regex's for field contents and stores in arrays
use warnings;
use strict;
use Data::Dumper;
my %searches=('stock'=>[qr/\b([NU][LD] *\d{5})\b/],
'regno'=>[qr/\b([A-Za-z]{2}\d{2}[A-Za-z]{3})\b/],
'vin'=>[qr/\b(WF[0O]XX[A-Z]{6}\d{5}\b)/i,qr/\b([A-Z]
{6}\d{5}\b)/i]);
my %found;
my %values;
foreach my $file (<*.txt>) {
print "file.....$file\n";
if (!open FH,$file ) {
print "file open failed: $!\n";
next;
}
my $content = do { local $/; <FH> };
close(FH);
# print "*****$content*********\n";
foreach my $field (keys %searches) { # foreach field
#####THe following line is the one with the problem
if (my @matches = $content =~ @{$searches{$field}}) {
foreach (@matches) {
$_=~s/ //g;
print STDERR "match found - '$field': '$_'\n";
if ($found{$field}{$_}) {
$found{$field}{$_}++;
} else {
$found{$field}{$_}=1;
}
}
}
}
} # foreach page
foreach my $field (keys %found) { # foreach field
my $value='';
my $count=0;
foreach my $key (keys %{$found{$field}}) { # foreach field -> value
$value=$key if ($found{$field}{$key} > $count);
}
print STDERR "field='$field' value='$value'\n";
$values{$field}=$value;
}
print STDERR Dumper(%found);
print STDERR Dumper(%values);
Development server output
[gary@gary tmp]$ parse_deal_pack
file.....DOC160715-16072015164033.pdf-01.txt
match found - 'stock': 'NL31047'
file.....DOC160715-16072015164033.pdf-02.txt
file.....DOC160715-16072015164033.pdf-03.txt
match found - 'regno': 'yy15yyy'
file.....DOC160715-16072015164033.pdf-04.txt
file.....DOC160715-16072015164033.pdf-05.txt
file.....DOC160715-16072015164033.pdf-06.txt
match found - 'stock': 'NL31047'
file.....DOC160715-16072015164033.pdf-07.txt
file.....DOC160715-16072015164033.pdf-08.txt
file.....DOC160715-16072015164033.pdf-09.txt
file.....DOC160715-16072015164033.pdf-10.txt
match found - 'stock': 'NL31047'
file.....DOC160715-16072015164033.pdf-11.txt
match found - 'stock': 'NL01047'
file.....DOC160715-16072015164033.pdf-12.txt
match found - 'stock': 'NL31047'
file.....DOC160715-16072015164033.pdf-13.txt
match found - 'regno': 'yy15yyy'
file.....DOC160715-16072015164033.pdf-14.txt
file.....DOC160715-16072015164033.pdf-15.txt
match found - 'regno': 'yy15yyy'
file.....DOC160715-16072015164033.pdf-16.txt
file.....DOC160715-16072015164033.pdf-17.txt
file.....DOC160715-16072015164033.pdf-18.txt
file.....DOC160715-16072015164033.pdf-19.txt
match found - 'regno': 'yy15yyy'
field='stock' value='NL31047'
field='regno' value='yy15yyy'
$VAR1 = 'stock';
$VAR2 = {
'NL01047' => 1,
'NL31047' => 4
};
$VAR3 = 'regno';
$VAR4 = {
'yy15yyy' => 4
};
$VAR1 = 'regno';
$VAR2 = 'yy15yyy';
$VAR3 = 'stock';
$VAR4 = 'NL31047';
[gary@gary tmp]$
Live server
[root@ollie faxgateway_10734]# parse_deal_pack
file.....DOC160715-16072015164033.pdf-01.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-02.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-03.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-04.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-05.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-06.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-07.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-08.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-09.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-10.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-11.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-12.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-13.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-14.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-15.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-16.txt
match found - 'vin': '1'
file.....DOC160715-16072015164033.pdf-17.txt
match found - 'vin': '1'
file.....DOC160715-16072015164033.pdf-18.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
file.....DOC160715-16072015164033.pdf-19.txt
match found - 'regno': '1'
match found - 'vin': '1'
match found - 'stock': '1'
field='regno' value='1'
field='stock' value='1'
field='vin' value='1'
$VAR1 = 'regno';
$VAR2 = {
'1' => 17
};
$VAR3 = 'stock';
$VAR4 = {
'1' => 17
};
$VAR5 = 'vin';
$VAR6 = {
'1' => 19
};
$VAR1 = 'regno';
$VAR2 = '1';
$VAR3 = 'vin';
$VAR4 = '1';
$VAR5 = 'stock';
$VAR6 = '1';
--
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
http://learn.perl.org/