# a15_2.pl                                                          12aug07waw

# From: sushil.negi@siemens.com
# RE: Splitting String in array
# Date Rec'd: 8/11/2007 2:56:44 P.M. EDT
#

# Thanks Ee and Bill,
#
# Both your solution work perfectly for the data set that I provided.
# However I think I missed in explaining an important point.
#
# The sample was only one file out of thousands of file that need
# to be parsed. See attached as another example which shows what
# I mean to say. This is where I am getting stuck as the
# "field width is not constant". (It changes in different files
# based on the max length of the data in a particular field).
#
# Any thoughts on making a generic parser script to create an array
# for this?
#
# Thanks Sushil

# Sample input files, one here-doc per example file.
#
# NOTE: the here-docs are single-quoted (<<'FILE') on purpose.  A bare or
# double-quoted here-doc interpolates its body, which would collapse each
# `\\' in the sample records to a single `\' (and would try to interpolate
# any `$' or `@' that ever appeared in the data).  Single-quoting keeps
# every record byte-for-byte as written.
#
# Every record starts with two tab characters; the columns after that are
# padded with spaces.

my $File1 = <<'FILE';
		ABC_011 0311           0311      Pat Asc      FirstName LastName MiddleName ABCAHS:  no City present, City Required...
		ABC_010 V9D2 0310      0311      Requires     FirstName MiddleName          SANAH 3.0x- 55+ User Fields UI- : no feature.
		ABC_000 0903 9.0a1     0903      Pat Asc      FirstName LastName            blah blah blah Series Timeframe max
		ABC_579 0903 0303_AB99 0311      Cat Dog                LastName MiddleName Serivce Summarisation not option sorting by Date.
		ABC_707 0903           0311                                                 SRFDSA0142\\///SS:NullPointerException:mark incomplete
		ABC_707 0903                                                                SRFDSA0142\\///SS:NullPointerException:mark incomplete
FILE

my $File2 = <<'FILE';  # (See here field length is changed based on the reduced size of the data that is going in the field)
		ABC_011 0311       0311 Pat A FirstName LastName   ABCAHS:  no City present, City Required...
		ABC_010 V9D2 0310  0311 Req   FirstName MiddleName SANAH 3.0x- 55+ User Fields UI- : no feature.
		ABC_000 0903 9.0a1 0903 Pat   FirstName LastName   blah blah blah Series Timeframe max
		ABC_579 0903       0311 Cat   LastName MiddleName  Serivce Summarisation not option sorting by Date.
		ABC_707 0903       0311                            SRFDSA0142\\///SS:NullPointerException:mark incomplete
		ABC_707 0903                                       SRFDSA0142\\///SS:NullPointerException:mark incomplete
FILE

my $File3 = <<'FILE';  # (See here field length is changed based on the reduced size of the data that is going in the field)
		ABC_000 0903 9.0a1     0903 Pat     FirstName LastName            blah blah blah Series Timeframe max
		ABC_579 0903 0303_AB99 0311 Cat Dog FirstName LastName MiddleName Serivce Summarisation not option sorting by Date.
		ABC_707 0903           0311                                       SRFDSA0142\\///SS:NullPointerException:mark incomplete
FILE


=comment

hi sushil --

ok, yes, you have a problem, and a tricky one!

based on what you have said in your posts and on what i can glean,
guess, infer and imagine from the examples given, i have come up with
a set of assumptions that form the basis for the approach i will
suggest.

ASSUMPTIONS:
    - characters in the files are single-byte: ascii or latin-1.
    - there is no explicit information in the data contained in a
      file that gives the position or width of any data field in
      the file; e.g., the data in field n do NOT give the starting
      column number, etc., of field m.
    - every line begins with 2 tab characters.
    - the data of every field is in the form of non-whitespace strings,
      possibly with leading, embedded, or trailing whitespace.
    - the beginning of every field is preceded by a space (0x20) character.
    - the presence of a space character is NOT sufficient to infer
      the beginning of a field: it may be a part of leading, embedded
      or trailing whitespace in the text of the field.
    - a space or spaces will be the only field separator characters
      after the pair of tabs at the start of each line/record.
    - there are no defaulted fields in any record.
      i.e., any empty field or fields at the end of a record (line)
      will be filled with spaces (or something) to full field width,
      not truncated, and field delimiters will be preserved.

given these assumptions, the only approach i can see is to do some
kind of analysis of a large number of the lines of a file (maybe
all of them) and try to infer the field boundaries of that file.

if you look at all the lines of a file and find that certain columns
never contain anything other than a tab, and other columns never
contain anything other than a space, then you can GUESS that a data
field begins in the first column after a string of one or more of
these constant-position whitespace characters.  after you guess the
start of each field, it's easy to calculate the widths of the fields.

this is obviously an imperfect heuristic.

for example, in the case of file 1, if you look at all the lines of
the file and find that the character in column 1 (with columns
starting at 0) is always a tab, and that the characters in columns 9,
14, 24 and 34 are always spaces, then one can GUESS that fields
begin in columns 2, 10, 15, 25 and 35, with corresponding widths.
however, continuing this process with file 1 leads us to see fields
beginning at columns 48, 58 and 78, and if i am GUESSING correctly,
there is NO valid field beginning at column 58 in file 1; this is
actually in the middle of the `name' field.  the other files also
show examples of spurious fields.

the trick, then, is to figure out some way of analyzing these files
to find this info in a reasonable time.

here is my approach:
    - bitwise-and together all the lines of a file and, separately,
      bitwise-and together the bitwise complements of all the lines;
    - combine the two results into a single line and analyze it to find
      column positions that only ever held a space (two tabs are
      assumed at the start of every line);
    - assume that the first column after any such group of one
      or more constant space columns is the start of a field;
    - cross your fingers and hope you're right.

here's some code...

=cut

use warnings;
use strict;


# Main script: infer the field layout of one sample file and print an
# unpack() template for it.
#
# Outline:
#   1. read the file line by line, and-ing the lines together (and, in
#      parallel, their bitwise complements) to find bits that never change;
#   2. turn those bit columns into a string holding each character that
#      was identical in every line, with "\0" everywhere else;
#   3. treat every run of constant spaces as a field separator and record
#      the offset/width of the text between separators;
#   4. emit "@offset Awidth" items, with the last width replaced by '*'.

# accumulators for constant bit columns (byte strings):
#   $all_1s - bits that were 1 at that position in every line read so far
#   $all_0s - bits that were 0 at that position in every line read so far
my ($all_0s, $all_1s);

# accumulating bits beyond the length of the shortest line of file is
# not needed because an unchanging character cannot exist if any line
# of the file does not have that character position.
#
# CAUTION: this assumes no truncated lines when trailing fields are empty.
# e.g., if line 6 in example file `File1' were `ABC_707 0903', the
# approach given here will fail, although perhaps detectably.

# note: file close/error checking stuff included just for completeness.
# the "file" is the in-memory string $File1, opened through a scalar ref.
open my $data, '<', \$File1 or die "opening: $!";

# first line of file initializes bit column accumulators.
$all_1s = <$data>;

# an empty file has no first line; without this check the undef value
# would flow into the bitwise operators below and produce only warnings
# and a meaningless result.
defined $all_1s or die "no lines in input: cannot infer field boundaries";

$all_0s = ~ $all_1s;

# accumulate unchanging 0s and 1s in bit columns in remaining lines...
while (defined (my $line = <$data>)) {
    $all_1s &=   $line; # accumulate columns of bits always 1
    $all_0s &= ~ $line; # accumulate columns of bits always 0
    }
# $! and die "reading: $!";  # invalid if reading `in memory' file

close $data or die "closing: $!";

# NOTE: the length of each accumulator string will be the length of
# the shortest line in the file.  this is due to the way the
# bitwise-and operator works on strings: the result is only as long as
# the shorter operand.

# du("\$all_0s", $all_0s);  # FOR DEBUG
# du("\$all_1s", $all_1s);  # FOR DEBUG

# this should never happen
(length($all_0s) == length($all_1s)) or die "strings not same length";

# at this point, any bit in $all_0s that is 1 was 0 in that bit column
# position in every line of the file.
# likewise, any bit in $all_1s that is 1 was 1 in that bit column
# position in every line of the file.

# form mask for bits that are the same at a given bit column position
# for all lines in file: 1 for such bits, 0 for others.
# (the two accumulators can never both have a 1 in the same bit, so the
# xor simply merges them.)
my $all_same = $all_0s ^ $all_1s;  # bit == 1 if all the same in every line
# du("\$all_same after xor", $all_same);  # FOR DEBUG

# given the bit-mask, generate an unchanging-characters mask: a byte is
# kept (as \xff) only if all eight of its bits were constant.
# CAUTION: this FAILS for a '\0' character in the file.
$all_same =~ tr/\xff/\0/c;  # anything other than ff translated to 0
# du("\$all_same after tr", $all_same);  # FOR DEBUG

$all_same &= $all_1s;  # mask out all characters not always the same
# du("\$all_same after mask", $all_same);  # FOR DEBUG

# at this point, any non-null character in the string was the same at
# its position for all lines in the file (except, as noted, '\0').

# infer field starting columns from supposed field separator groups.
# inference process begins just after the initial pair of tab characters
# (i.e., at column/string offset 2) because the tabs are assumed to be
# always present and thus irrelevant

my $separator  = qr{ [ ] }xms;  # a single space is minimal separator

my $field_start = qr{ (?! $separator) . }xms;  # anything not a separator
my $field_body  = qr{                 . }xms;  # anything at all
my $next_or_end = qr{ $separator $field_start | \z }xms;  # next field or eos

my @offsets_widths;  # will be list of anon. hash refs.: offset/width pairs

pos $all_same = 2;  # skip tabs, start matching at column/string offset 2

# each match captures one field: it starts at a non-separator and grows
# (non-greedily) until the next thing is `separator + field start' or the
# end of the string.  so a field's width includes any constant spaces
# that trail it, except the single space just before the next field.
while ($all_same =~ m{ ($field_start $field_body*?) (?= $next_or_end) }gxms) {
    # printf "pos %ld, \$1 %ld wide  ", pos $all_same, length $1;  # FOR DEBUG
    # printf "\@ %ld, %ld wide  ", $-[1], $+[1] - $-[1];           # FOR DEBUG
    my $offset = $-[1];          # position of start of $1
    my $width  = $+[1] - $-[1];  # width of $1
    push @offsets_widths, { offset => $offset,  width => $width };
    # alternatively, just...
    # push @offsets_widths, { offset => $-[1],  width => $+[1] - $-[1] };
    }

# nothing matched (e.g., the shortest line holds nothing beyond the two
# leading tabs).  stop here with a clear message: assigning through
# $offsets_widths[-1] on an empty array is a fatal error with a far less
# helpful message.
@offsets_widths or die "no fields found: cannot build unpack() template";

# fix up the final offset/width pair extracted.  this will be the final
# `everything to end of line/record' unpack specifier in the template.
$offsets_widths[-1]->{width} = '*';  # catchall specifier

# print "\n";  # FOR DEBUG
# printf "offset %2s, field width %2s \n", $_->{offset}, $_->{width}
#     for @offsets_widths;  # FOR DEBUG

# use the extracted offset/width info to make an unpack() template:
# one "@<offset> A<width>" item per field, items separated by two spaces.
my $template =
    join '  ',
    map  { "\@$_->{offset} A$_->{width}" }
    @offsets_widths
    ;

# terminate the output line so it does not run into whatever is printed
# next (e.g., the shell prompt).
print "unpack() template: {$template}\n";


# subroutines ##############################################################

# du($msg, $string): debugging aid that hex-dumps $string with print/printf
# (no explicit filehandle is given).
#
#   $msg    - label printed in the " dump ... " heading line
#   $string - string to dump; it is read character by character
#
# Each character's ordinal is printed as at least two hex digits and is
# followed by a separator chosen from its zero-based index:
#   newline     after every 32nd value,
#   two spaces  after every 16th value,
#   '.'         after every 4th value,
#   one space   otherwise.
# A final newline closes the dump.
sub du {

    my ($msg, $string) = @_;

    print "\n dump $msg \n";

    my $final_index = length($string) - 1;

    for my $index (0 .. $final_index) {

        # pick the separator that follows this value; widest grouping wins
        my $gap;
        if    ($index % 32 == 31) { $gap = "\n" }
        elsif ($index % 16 == 15) { $gap = "  " }
        elsif ($index %  4 ==  3) { $gap = "."  }
        else                      { $gap = " "  }

        printf "%02x", ord(substr($string, $index, 1));
        print $gap;
        }

    print "\n";

    }


__DATA__
		ABC_011 0311           0311      Pat Asc      FirstName LastName MiddleName ABCAHS:  no City present, City Required...
		ABC_010 V9D2 0310      0311      Requires     FirstName MiddleName          SANAH 3.0x- 55+ User Fields UI- : no feature.
		ABC_000 0903 9.0a1     0903      Pat Asc      FirstName LastName            blah blah blah Series Timeframe max
		ABC_579 0903 0303_AB99 0311      Cat Dog                LastName MiddleName Serivce Summarisation not option sorting by Date.
		ABC_707 0903           0311                                                 SRFDSA0142\\///SS:NullPointerException:mark incomplete
		ABC_707 0903                                                                SRFDSA0142\\///SS:NullPointerException:mark incomplete
