> From: Nolte, Jennifer [mailto:jennifer.no...@yale.edu]
> Sent: Monday, January 25, 2010 09:48 AM
> To: perl4lib@perl.org
> Subject: Splitting a large file of MARC records into smaller files
>
> Hello-
>
> I am working with files of MARC records that are over a million records
> each. I'd like to split them down into smaller chunks, preferably using
> a command line. MARCedit works, but is slow and made for the desktop.
> I've looked around and haven't found anything truly useful- Endeavor's
> MARCsplit comes close but doesn't separate files into even numbers,
> only by matching criteria, so there could be lots of record duplication
> between files.
>
> Any idea where to begin? I am a (super) novice Perl person.
# I use the following handy script I created many-many years ago. Consider it
# to be in the public domain.

#!perl
#
# Usage:
#   perl MARC21-split.pl [-d#] [-n#] [-pPrefix] [-sSuffix] *.marc
#
#   perl MARC21-split.pl -d3 -n10000 -pbib -s.marc *.marc
#
#   Creates files with three digits sequence number that have 10,000
#   records per file: bib001.marc, bib002.marc, etc.
#
# Options:
#   -d number of digits for sequence number   (default 2)
#   -n number of records per file             (default 1)
#   -p prefix text before sequence number     (default empty)
#   -s suffix text after sequence number      (default '.mrc')
#
# Notes:
#   * Arguments are handled strictly left to right, so an option only
#     affects the input files named after it.
#   * Records are counted across ALL input files; an output file may
#     therefore hold records coming from more than one input file.
#   * A -d or -n value that is not a positive whole number falls back to 1
#     (a warning is printed).
#   * Progress and totals are written to STDERR.
#
package main;       # The current package name
require 5.006;      # Three-argument open() and lexical file handles.

use strict;
use warnings;
use Carp;           # croak()/carp() for error reporting

###################### CONSTANTS #####################

use constant RECORD_TERMINATOR => "\x1D";   # MARC21 record terminator.

###################### SUBROUTINES #####################

# Validate the numeric value given to -d or -n.
#   $flag  option letter, used only in the warning text
#   $text  the text that followed the option letter
# Returns the value as a number when it is a positive whole number,
# otherwise warns and returns the fallback value 1.
sub _count_option {
    my ($flag, $text) = @_;

    # The pattern is anchored on purpose: values such as '0.5' or '-5' used
    # to slip through and later broke the modulus / sequence arithmetic.
    return $text + 0 if $text =~ /\A[0-9]+\z/ && $text > 0;

    warn __PACKAGE__ . ":: Invalid value '$text' for -$flag, using 1\n";
    return 1;
}

# Open the next output file in the sequence and record it in %$out.
#   $out  hash ref holding the output state (fh, name, records, files)
#   $opt  hash ref holding the current option values
# Croaks when the file cannot be created.
sub _open_output {
    my ($out, $opt) = @_;

    my $name = sprintf('%s%0*u%s',
        $opt->{prefix}, $opt->{digits}, $out->{files} + 1, $opt->{suffix});

    # NB: a literal "$PACKAGE::" inside double quotes would interpolate the
    # (empty) package variable $PACKAGE:: -- hence the concatenation.
    open(my $fh, '>', $name)
        or croak(__PACKAGE__ . ":: Cannot open output file '$name': $!");
    binmode($fh);   # MARC is binary data; never translate line endings.

    @{$out}{qw(fh name records)} = ($fh, $name, 0);
    ++$out->{files};
    return;
}

# Close the current output file, if there is one.
# Croaks when close() fails, because buffered write errors surface here.
sub _close_output {
    my ($out) = @_;

    return unless $out->{fh};
    close($out->{fh})
        or croak(__PACKAGE__ . ":: Cannot close output file '$out->{name}': $!");
    $out->{fh} = undef;
    return;
}

# Process a command line: apply options and split every named input file.
#   @args  the command line arguments (options and file names, in order)
# Returns the list (total records written, output files created).
# Croaks when a file cannot be opened, written or closed.
sub split_marc_files {
    my @args = @_;

    # Defaults for command line options.
    my %opt = (recs => 1, digits => 2, prefix => '', suffix => '.mrc');

    # Output state: current handle/name, records in it, files created so far.
    my %out = (fh => undef, name => undef, records => 0, files => 0);

    # Total record count over all input files.
    my $total = 0;

    # Read one MARC record per readline(); restored when this sub returns.
    local $/ = RECORD_TERMINATOR;

    print STDERR join("\r\nARG=", '', @args), "\r\n";

    ARG:
    foreach my $arg (@args) {

        # Command line options: a dash, one letter, then the value.
        if ($arg =~ /\A-([DdNnPpSs])(.*)\z/s) {
            my ($flag, $value) = (lc $1, $2);
            if    ($flag eq 'd') { $opt{digits} = _count_option('d', $value) }
            elsif ($flag eq 'n') { $opt{recs}   = _count_option('n', $value) }
            elsif ($flag eq 'p') { $opt{prefix} = $value }
            else                 { $opt{suffix} = $value }
            next ARG;
        }

        # Anything else is an input file.
        open(my $in, '<', $arg)
            or croak(__PACKAGE__ . ":: Cannot open input file '$arg': $!");
        binmode($in);

        # Count each record in the file.
        my $count = 0;

        RECORD:
        while (defined(my $record = <$in>)) {

            # Blank residue after the last terminator (typically a trailing
            # newline) is not a record; do not count or copy it.
            next RECORD
                if $record =~ /\A[\x20\t\r\n]*\z/
                && substr($record, -1) ne RECORD_TERMINATOR;

            # Open new output file when necessary.
            _open_output(\%out, \%opt) unless $out{fh};

            print { $out{fh} } $record
                or croak(__PACKAGE__
                    . ":: Cannot write to output file '$out{name}': $!");
            ++$out{records};
            ++$total;
            ++$count;

            # Close output file when full.  '>=' keeps this correct even if
            # -n was lowered between two input files.
            _close_output(\%out) if $out{records} >= $opt{recs};
        }

        close($in)
            or carp(__PACKAGE__ . ":: Cannot close input file '$arg': $!");

        # Output total records in file and file name.
        print STDERR join("\t", $count, $arg), "\n";
    }

    # The last output file is usually only partly filled; close it as well.
    _close_output(\%out);

    # Output total record count and file count.
    print STDERR join("\t", $total,      "Total Records"), "\n";
    print STDERR join("\t", $out{files}, "Total Files"),   "\n";

    return ($total, $out{files});
}

###################### INLINE CODE #####################

# Run only when executed as a script, not when loaded by another program.
split_marc_files(@ARGV) unless caller;

1;