Mitchell A. Petersen wrote:
> 
> P.S.
>       I couldn't figure out the commands
>               my @vals =  map {s/[,$ =]//g} @col_asset[0,-1];

Take the first and last elements of the array, strip every ',', '$', ' '
and '=' character from each of them (element by element), and store the
cleaned values in a new array.

>               print join(",", @vals), "\n";
>       If you could direct me to a manual, that would be fine as well.

Almost everything you need is in the perlfunc man page or in one of the
several RE (regular expression) man pages (perlre, perlretut, perlrequick).

> Program ----------
> 
> my $total = 0;
> while ($total <= 3000) {

You need to remove the above loop - it's causing you to possibly
re-run the entire process multiple times.  If you want to limit
total to 3000, do it inside your other loop (eg:
last if $total > 3000).

use strict;
use warnings;
use HTML::TreeBuilder;

# Extract the "total assets" figures from every HTML table in an EDGAR 10-K
# filing and write them to a CSV file, one line per matching table.  Each
# line holds the first and last numeric cells of the table's "Total asset..."
# row, with ',', '$', ' ' and '=' characters removed.

my $txtfile = 'D:/res/edgar/10k/2178_0000002178-06-000013.txt';
my $csvfile = 'D:/res/edgar/match/gcu_unchecked3_junk.csv';

# open the CSV file for writing (3-arg open, lexical handle)

open my $out, '>', $csvfile or die "create csv: $!($^E)";
select ((select ($out), $| = 1)[0]);    # unbuffer CSV write; restores STDOUT
                                        # as the default handle afterwards

# slurp the whole text file

open my $in, '<', $txtfile or die "open $txtfile: $!($^E)";
my $doc = do { local $/; <$in> };       # undef $/ => read everything at once
close $in;
$doc = '' unless defined $doc;          # empty input file

my $root = HTML::TreeBuilder->new;
$root->parse($doc);
$root->eof();

# get tables in HTML

my @tables = $root->find_by_tag_name('TABLE');

# foreach table in input file

my $total = 0;                          # number of CSV lines written so far
foreach my $table (@tables) {

        my $txt = $table->as_text_trimmed;

        # skip items not of interest: the table must mention "total asset"
        # and contain something that looks like a 4-12 character figure

        next if ($txt !~ /total asset/is || $txt !~ /(\d|,){4,12}/is);

        my @col_asset = ();
        foreach my $row ($table->find_by_tag_name('tr')) {

                # skip rows not of interest

                next if $row->as_text_trimmed !~ /^total asset/i;

                # foreach column in row

                foreach my $column ($row->find_by_tag_name('td')) {

                        my $col_text = $column->as_text_trimmed;

                        # if asset figure, save it

                        if ($col_text =~ /[\d,\.]{4,12}/) {
                                push @col_asset, $col_text;
                        }
                }
                last;           # skip rows after total asset if any
        }

        # no usable figures in this table: slicing an empty array with
        # [0,-1] would only produce undef warnings and a bogus line

        next unless @col_asset;
        $total++;

        # print the totals to the CSV file.  s/// returns the number of
        # substitutions, not the new string, so edit a copy of each value
        # and return that copy from the map block.

        my @vals = map { (my $val = $_) =~ s/[,\$ =]//g; $val }
            @col_asset[0,-1];
        print {$out} join (',', @vals), "\n";

# uncomment one of these as appropriate:
#       last;                           # only do 1st table ??????
#       last if $total > 3000;          # or 3000 lines ??????
}

# buffered write errors (disk full etc.) only surface at close

close $out or die "close $csvfile: $!($^E)";

$root->delete;                          # release the parse tree

__END__


_______________________________________________
ActivePerl mailing list
[email protected]
To unsubscribe: http://listserv.ActiveState.com/mailman/mysubs

Reply via email to