Adding some new options to bulkmarcimport : -k idtagsubfield in order to store the id of the file record into another field -match tagsubfield,index -a to import authorities -l logfilename to store logs
Bug Fixing : C4/Charset.pm Charset was incorrect for UNIMARC Authorities --- C4/AuthoritiesMarc.pm | 104 ++++++++++++ C4/Charset.pm | 34 +++-- misc/migration_tools/bulkmarcimport.pl | 288 +++++++++++++++++++++++++++----- 3 files changed, 375 insertions(+), 51 deletions(-) diff --git a/C4/AuthoritiesMarc.pm b/C4/AuthoritiesMarc.pm index e039b99..bc32a6c 100644 --- a/C4/AuthoritiesMarc.pm +++ b/C4/AuthoritiesMarc.pm @@ -57,6 +57,9 @@ BEGIN { &merge &FindDuplicateAuthority + + &GuessAuthTypeCode + &GuessAuthId ); } @@ -409,6 +412,107 @@ sub GetAuthTypeCode { return $authtypecode; } +=head2 GuessAuthTypeCode + +=over 4 + +my $authtypecode = GuessAuthTypeCode($record); + +=back + +Get the record and tries to guess the adequate authtypecode from its content. + +=cut + +sub GuessAuthTypeCode { + my ($record) = @_; + return unless defined $record; +my $heading_fields = { + "MARC21"=>{ + '100'=>{authtypecode=>'PERSO_NAME'}, + '110'=>{authtypecode=>'CORPO_NAME'}, + '111'=>{authtypecode=>'MEETI_NAME'}, + '130'=>{authtypecode=>'UNIF_TITLE'}, + '148'=>{authtypecode=>'CHRON_TERM'}, + '150'=>{authtypecode=>'TOPIC_TERM'}, + '151'=>{authtypecode=>'GEOGR_NAME'}, + '155'=>{authtypecode=>'GENRE/FORM'}, + '180'=>{authtypecode=>'GEN_SUBDIV'}, + '181'=>{authtypecode=>'GEO_SUBDIV'}, + '182'=>{authtypecode=>'CHRON_SUBD'}, + '185'=>{authtypecode=>'FORM_SUBD'}, + }, +#200 Personal name 700, 701, 702 4-- with embedded 700, 701, 702 600 +# 604 with embedded 700, 701, 702 +#210 Corporate or meeting name 710, 711, 712 4-- with embedded 710, 711, 712 601 604 with embedded 710, 711, 712 +#215 Territorial or geographic name 710, 711, 712 4-- with embedded 710, 711, 712 601, 607 604 with embedded 710, 711, 712 +#216 Trademark 716 [Reserved for future use] +#220 Family name 720, 721, 722 4-- with embedded 720, 721, 722 602 604 with embedded 720, 721, 722 +#230 Title 500 4-- with embedded 500 605 +#240 Name and title (embedded 200, 210, 215, or 220 and 230) 4-- with embedded 7-- and 500 7-- 604 with embedded 7-- and 500 500 +#245 Name and collective title (embedded 200, 210, 215, or 220 and 235) 4-- with embedded 7-- and 501 604 with embedded 7-- and 501 7-- 501 +#250 Topical subject 606 +#260 Place access 620 +#280 Form, genre or physical characteristics 608 +# +# +# Could also be represented with : +#leader position 9 +#a = personal name entry +#b = corporate name entry +#c = territorial or geographical name +#d = trademark +#e = family name +#f = uniform title +#g = collective uniform title +#h = name/title +#i = name/collective uniform title +#j = topical subject +#k = place access +#l = form, genre or physical characteristics + "UNIMARC"=>{ + '200'=>{authtypecode=>'NP'}, + '210'=>{authtypecode=>'CO'}, + '215'=>{authtypecode=>'SNG'}, + '216'=>{authtypecode=>'TM'}, + '220'=>{authtypecode=>'FAM'}, + '230'=>{authtypecode=>'TU'}, + '235'=>{authtypecode=>'CO_UNI_TI'}, + '240'=>{authtypecode=>'SAUTTIT'}, + '245'=>{authtypecode=>'NAME_COL'}, + '250'=>{authtypecode=>'SNC'}, + '260'=>{authtypecode=>'PA'}, + '280'=>{authtypecode=>'GENRE/FORM'}, + } +}; + foreach my $field (keys %{$heading_fields->{uc(C4::Context->preference('marcflavour'))} }) { + return $heading_fields->{uc(C4::Context->preference('marcflavour'))}->{$field}->{'authtypecode'} if (defined $record->field($field)); + } + return; +} + +=head2 GuessAuthId + +=over 4 + +my $authtid = GuessAuthId($record); + +=back + +Get the record and tries to guess the adequate authtypecode from its content. + +=cut + +sub GuessAuthId { + my ($record) = @_; + return unless ($record && $record->field('001')); +# my $authtypecode=GuessAuthTypeCode($record); +# my ($tag,$subfield)=GetAuthMARCFromKohaField("auth_header.authid",$authtypecode); +# if ($tag > 010) {return $record->subfield($tag,$subfield)} +# else {return $record->field($tag)->data} + return $record->field('001')->data; +} + =head2 GetTagsLabels =over 4 diff --git a/C4/Charset.pm b/C4/Charset.pm index bbeef24..0265991 100644 --- a/C4/Charset.pm +++ b/C4/Charset.pm @@ -22,6 +22,7 @@ use warnings; use MARC::Charset qw/marc8_to_utf8/; use Text::Iconv; +use C4::Debug; use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); @@ -192,7 +193,7 @@ sub MarcToUTF8Record { } else { if ($marc_flavour eq 'MARC21') { return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour); - } elsif ($marc_flavour eq 'UNIMARC') { + } elsif ($marc_flavour =~/UNIMARC/) { return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour); } else { return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour); @@ -253,18 +254,27 @@ sub SetMarcUnicodeFlag { my $leader = $marc_record->leader(); substr($leader, 9, 1) = 'a'; $marc_record->leader($leader); - } elsif ($marc_flavour eq "UNIMARC") { - if (my $field = $marc_record->field('100')) { - my $sfa = $field->subfield('a'); - - my $subflength = 36; - # fix the length of the field - $sfa = substr $sfa, 0, $subflength if (length($sfa) > $subflength); - $sfa = sprintf( "%-*s", 35, $sfa ) if (length($sfa) < $subflength); - - substr($sfa, 26, 4) = '50 '; - $field->update('a' => $sfa); + } elsif ($marc_flavour =~/UNIMARC/) { + my $string; + my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,8):(36,22)); + $string=$marc_record->subfield( 100, "a" ); + if (length($string)==$subflength) { + $string = substr $string, 0,$subflength if (length($string)>$subflength); + } + else { + $string = POSIX::strftime( "%Y%m%d", localtime ); + $string =~ s/\-//g; + $string = sprintf( "%-*s", $subflength, $string ); + } + substr( $string, $encodingposition, 8, "frey50 " ); + if ( $marc_record->subfield( 100, "a" ) ) { + $marc_record->field('100')->update(a=>$string); + } + else { + $marc_record->insert_grouped_field( + MARC::Field->new( 100, '', '', "a" => $string ) ); } + $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 ); } else { warn "Unrecognized marcflavour: $marc_flavour"; } diff --git a/misc/migration_tools/bulkmarcimport.pl b/misc/migration_tools/bulkmarcimport.pl index b8f5500..835b4dd 100755 --- a/misc/migration_tools/bulkmarcimport.pl +++ b/misc/migration_tools/bulkmarcimport.pl @@ -2,7 +2,7 @@ # Import an iso2709 file into Koha 3 use strict; -#use warnings; +use warnings; #use diagnostics; BEGIN { # find Koha's Perl modules @@ -20,6 +20,7 @@ use MARC::Charset; use C4::Context; use C4::Biblio; +use C4::Koha; use C4::Charset; use C4::Items; use Unicode::Normalize; @@ -28,9 +29,8 @@ use Getopt::Long; use IO::File; binmode(STDOUT, ":utf8"); - my ( $input_marc_file, $number, $offset) = ('',0,0); -my ($version, $delete, $test_parameter, $skip_marc8_conversion, $char_encoding, $verbose, $commit, $fk_off,$format); +my ($version, $delete, $test_parameter, $skip_marc8_conversion, $char_encoding, $verbose, $commit, $fk_off,$format,$biblios,$authorities,$keepids,$match, $isbn_check, $logfile); my ($sourcetag,$sourcesubfield,$idmapfl); $|=1; @@ -48,10 +48,17 @@ GetOptions( 'v:s' => \$verbose, 'fk' => \$fk_off, 'm:s' => \$format, + 'l:s' => \$logfile, + 'k|keepids:s' => \$keepids, + 'b|biblios' => \$biblios, + 'a|authorities' => \$authorities, + 'match=s@' => \$match, + 'i|isbn' => \$isbn_check, 'x:s' => \$sourcetag, 'y:s' => \$sourcesubfield, 'idmap:s' => \$idmapfl, ); +$biblios=!$authorities||$biblios; if ($version || ($input_marc_file eq '')) { print <<EOF @@ -65,6 +72,7 @@ Parameters: n the number of records to import. If missing, all the file is imported o file offset before importing, ie number of records to skip. commit the number of records to wait before performing a 'commit' operation + l file logs actions done for each record and their status into file t test mode: parses the file, saying what he would do, but doing nothing. s skip automatic conversion of MARC-8 to UTF-8. This option is provided for debugging. @@ -73,10 +81,27 @@ Parameters: d delete EVERYTHING related to biblio in koha-DB before import. Tables: biblio, biblioitems, titems m format, MARCXML or ISO2709 (defaults to ISO2709) + keepids field store ids in field (usefull for authorities, where 001 contains the authid for Koha, that can contain a very valuable info for authorities coming from LOC or BNF. useless for biblios probably) + b|biblios type of import : bibliographic records + a|authorities type of import : authority records + match matchindex,fieldtomatch matchpoint to use to deduplicate + fieldtomatch can be either 001 to 999 + or field and list of subfields as such 100abcde + i|isbn if set, a search will be done on isbn, and, if the same isbn is found, the biblio is not added. It's another + method to deduplicate. + match & i can be both set. x source bib tag for reporting the source bib number y source subfield for reporting the source bib number idmap file for the koha bib and source id - + keepids store ids in 009 (usefull for authorities, where 001 contains the authid for Koha, that can contain a very valuable info for authorities coming from LOC or BNF. useless for biblios probably) + b|biblios type of import : bibliographic records + a|authorities type of import : authority records + match matchindex,fieldtomatch matchpoint to use to deduplicate + fieldtomatch can be either 001 to 999 + or field and list of subfields as such 100abcde + i|isbn if set, a search will be done on isbn, and, if the same isbn is found, the biblio is not added. It's another + method to deduplicate. + match & i can be both set. IMPORTANT: don't use this script before you've entered and checked your MARC parameters tables twice (or more!). Otherwise, the import won't work correctly and you will get invalid data. @@ -112,10 +137,16 @@ if ($fk_off) { if ($delete) { - print "deleting biblios\n"; - $dbh->do("truncate biblio"); - $dbh->do("truncate biblioitems"); - $dbh->do("truncate items"); + if ($biblios){ + print "deleting biblios\n"; + $dbh->do("truncate biblio"); + $dbh->do("truncate biblioitems"); + $dbh->do("truncate items"); + } + else { + print "deleting authorities\n"; + $dbh->do("truncate auth_header"); + } $dbh->do("truncate zebraqueue"); } @@ -142,6 +173,10 @@ if ($format =~ /XML/i) { # extract the records, not using regexes to look # for <record>.*</record>. $MARC::File::XML::_load_args{BinaryEncoding} = 'utf-8'; + my $recordformat= ($marcFlavour eq "MARC21"?"USMARC":uc($marcFlavour)); +#UNIMARC Authorities have a different way to manage encoding than UNIMARC biblios. + $recordformat=$recordformat."AUTH" if ($authorities and $marcFlavour ne "MARC21"); + $MARC::File::XML::_load_args{RecordFormat} = $recordformat; $batch = MARC::Batch->new( 'XML', $fh ); } else { $batch = MARC::Batch->new( 'USMARC', $fh ); @@ -158,9 +193,28 @@ if ( $offset ) { $batch->next() while ($offset--); } +my ($tagid,$subfieldid); +if ($authorities){ + $tagid='001'; +} +else { + ( $tagid, $subfieldid ) = + GetMarcFromKohaField( "biblio.biblionumber", '' ); + $tagid||="001"; +} + +# the SQL query to search on isbn +my $sth_isbn = $dbh->prepare("SELECT biblionumber,biblioitemnumber FROM biblioitems WHERE isbn=?"); + $dbh->{AutoCommit} = 0; +my $loghandle; +if ($logfile){ + $loghandle= IO::File->new($logfile,"w") ; + print $loghandle "id;operation;status\n"; +} RECORD: while ( ) { my $record; + # get records eval { $record = $batch->next() }; if ( $@ ) { print "Bad MARC record: skipped\n"; @@ -172,52 +226,161 @@ RECORD: while ( ) { # C4::Charset::MarcToUTF8Record) because it doesn't use MARC::Batch. next; } + # skip if we get an empty record (that is MARC valid, but will result in AddBiblio failure last unless ( $record ); $i++; print "."; print "\r$i" unless $i % 100; + # transcode the record to UTF8 if needed & applicable. if ($record->encoding() eq 'MARC-8' and not $skip_marc8_conversion) { # FIXME update condition my ($guessed_charset, $charset_errors); - ($record, $guessed_charset, $charset_errors) = MarcToUTF8Record($record, $marcFlavour); + ($record, $guessed_charset, $charset_errors) = MarcToUTF8Record($record, $marcFlavour.(($authorities and $marcFlavour ne "MARC21")?'AUTH':'')); if ($guessed_charset eq 'failed') { warn "ERROR: failed to perform character conversion for record $i\n"; next RECORD; } } - - unless ($test_parameter) { - my ( $biblionumber, $biblioitemnumber, $itemnumbers_ref, $errors_ref ); - eval { ( $biblionumber, $biblioitemnumber ) = AddBiblio($record, '', { defer_marc_save => 1 }) }; - if ( $@ ) { - warn "ERROR: Adding biblio $biblionumber failed: $...@\n"; - next RECORD; - } - if (defined $idmapfl) { - if ($sourcetag lt '010'){ - if ($record->field($sourcetag)){ - my $source = $record->field($sourcetag)->data(); - printf(IDMAP "%s|%s\n",$source,$biblionumber); + my $isbn; + # remove trailing - in isbn (only for biblios, of course) + if ($biblios) { + if ($marcFlavour eq 'UNIMARC') { + if (my $f010 = $record->field('010')) { + $isbn = $f010->subfield('a'); + $isbn =~ s/-//g; + $f010->update('a' => $isbn); } - } else { - my $source=$record->subfield($sourcetag,$sourcesubfield); - printf(IDMAP "%s|%s\n",$source,$biblionumber); - } + } else { + if (my $f020 = $record->field('020')) { + $isbn = $f020->subfield('a'); + $isbn =~ s/-//g; + $f020->update('a' => $isbn); + } + } + } + my $id; + # search for duplicates (based on Local-number) + if ($match){ + require C4::Search; + my $query=build_query($match,$record); + my $server=($authorities?'authorityserver':'biblioserver'); + my ($error, $results,$totalhits)=C4::Search::SimpleSearch( $query, 0, 3, [$server] ); + die "unable to search the database for duplicates : $error" if (defined $error); + warn "$query $server : $totalhits"; + if ($results && scalar(@$results)==1){ + my $marcrecord = MARC::File::USMARC::decode($results->[0]); + $id=GetRecordId($marcrecord,$tagid,$subfieldid); + } + elsif ($results && scalar(@$results)>1){ + warn "more than one match for $query"; + } + else { + warn "nomatch for $query"; } - - eval { ( $itemnumbers_ref, $errors_ref ) = AddItemBatchFromMarc( $record, $biblionumber, $biblioitemnumber, '' ); }; - if ( $@ ) { - warn "ERROR: Adding items to bib $biblionumber failed: $...@\n"; - # if we failed because of an exception, assume that - # the MARC columns in biblioitems were not set. - ModBiblioMarc( $record, $biblionumber, '' ); - next RECORD; - } - if ($#{ $errors_ref } > -1) { - report_item_errors($biblionumber, $errors_ref); + } + my $originalid; + if ($keepids){ + $originalid=GetRecordId($record,$tagid,$subfieldid); + if ($originalid){ + my $storeidfield; + if (length($keepids)==3){ + $storeidfield=MARC::Field->new($keepids,$originalid); + } + else { + $storeidfield=MARC::Field->new(substr($keepids,0,3),"","",substr($keepids,3,1),$originalid); + } + $record->insert_fields_ordered($storeidfield); + $record->delete_field($record->field($tagid)); + } + } + unless ($test_parameter) { + if ($authorities){ + use C4::AuthoritiesMarc; + my $authtypecode=GuessAuthTypeCode($record); + my $authid= ($id?$id:GuessAuthId($record)); + if ($authid && GetAuthority($authid)){ + ## Authority has an id and is in database : Replace + eval { ( $authid ) = ModAuthority($authid,$record, $authtypecode) }; + if ($@){ + warn "Problem with authority $authid Cannot Modify"; + printlog({id=>$originalid||$id||$authid, op=>"edit",status=>"ERROR"}) if ($logfile); + } + else{ + printlog({id=>$originalid||$id||$authid, op=>"edit",status=>"ok"}) if ($logfile); + } + } + elsif (defined $authid) { + ## An authid is defined but no authority in database : add + eval { ( $authid ) = AddAuthority($record,$authid, $authtypecode) }; + if ($@){ + warn "Problem with authority $authid Cannot Add"; + printlog({id=>$originalid||$id||$authid, op=>"insert",status=>"ERROR"}) if ($logfile); + } + else{ + printlog({id=>$originalid||$id||$authid, op=>"insert",status=>"ok"}) if ($logfile); + } + } + else { + ## True insert in database + eval { ( $authid ) = AddAuthority($record,"", $authtypecode) }; + if ($@){ + warn "Problem with authority $authid Cannot Add"; + printlog({id=>$originalid||$id||$authid, op=>"insert",status=>"ERROR"}) if ($logfile); + } + else{ + printlog({id=>$originalid||$id||$authid, op=>"insert",status=>"ok"}) if ($logfile); + } + } + } + else { + my ( $biblionumber, $biblioitemnumber, $itemnumbers_ref, $errors_ref ); + $biblionumber = $id; + # check for duplicate, based on ISBN (skip it if we already have found a duplicate with match parameter + if (!$biblionumber && $isbn_check && $isbn) { + # warn "search ISBN : $isbn"; + $sth_isbn->execute($isbn); + ($biblionumber,$biblioitemnumber) = $sth_isbn->fetchrow; + } + if (defined $idmapfl) { + if ($sourcetag < "010"){ + if ($record->field($sourcetag)){ + my $source = $record->field($sourcetag)->data(); + printf(IDMAP "%s|%s\n",$source,$biblionumber); + } + } else { + my $source=$record->subfield($sourcetag,$sourcesubfield); + printf(IDMAP "%s|%s\n",$source,$biblionumber); + } + } + # create biblio, unless we already have it ( either match or isbn ) + unless ($biblionumber) { + eval { ( $biblionumber, $biblioitemnumber ) = AddBiblio($record, '', { defer_marc_save => 1 }) }; + } + if ( $@ ) { + warn "ERROR: Adding biblio $biblionumber failed: $...@\n"; + printlog({id=>$id||$originalid||$biblionumber, op=>"insert",status=>"ERROR"}) if ($logfile); + next RECORD; + } + else{ + printlog({id=>$id||$originalid||$biblionumber, op=>"insert",status=>"ok"}) if ($logfile); + } + eval { ( $itemnumbers_ref, $errors_ref ) = AddItemBatchFromMarc( $record, $biblionumber, $biblioitemnumber, '' ); }; + if ( $@ ) { + warn "ERROR: Adding items to bib $biblionumber failed: $...@\n"; + printlog({id=>$id||$originalid||$biblionumber, op=>"insertitem",status=>"ERROR"}) if ($logfile); + # if we failed because of an exception, assume that + # the MARC columns in biblioitems were not set. + ModBiblioMarc( $record, $biblionumber, '' ); + next RECORD; + } + else{ + printlog({id=>$id||$originalid||$biblionumber, op=>"insert",status=>"ok"}) if ($logfile); + } + if ($#{ $errors_ref } > -1) { + report_item_errors($biblionumber, $errors_ref); + } } - $dbh->commit() if (0 == $i % $commitnum); } last if $i == $number; @@ -225,6 +388,7 @@ RECORD: while ( ) { $dbh->commit(); + if ($fk_off) { $dbh->do("SET FOREIGN_KEY_CHECKS = 1"); } @@ -234,9 +398,51 @@ $dbh->do("UPDATE systempreferences SET value=$CataloguingLog WHERE variable='Cat my $timeneeded = gettimeofday - $starttime; print "\n$i MARC records done in $timeneeded seconds\n"; - +if ($logfile){ + print $loghandle "file : $input_marc_file\n"; + print $loghandle "$i MARC records done in $timeneeded seconds\n"; + $loghandle->close; +} exit 0; +sub GetRecordId{ + my $marcrecord=shift; + my $tag=shift; + my $subfield=shift; + my $id; + if ($tag lt "010"){ + return $marcrecord->field($tag)->data() if $marcrecord->field($tag); + } + elsif ($subfield){ + if ($marcrecord->field($tag)){ + return $marcrecord->subfield($tag,$subfield); + } + } + return $id; +} +sub build_query { + my $match = shift; + my $record=shift; + my @searchstrings; + foreach my $matchingpoint (@$match){ + my $string = build_simplequery($matchingpoint,$record); + push @searchstrings,$string if (length($string)>0); + } + return join(" and ",@searchstrings); +} +sub build_simplequery { + my $element=shift; + my $record=shift; + my ($index,$recorddata)=split /,/,$element; + my ($tag,$subfields) =($1,$2) if ($recorddata=~/(\d{3})(.*)/); + my @searchstrings; + foreach my $field ($record->field($tag)){ + if (length($field->as_string("$subfields"))>0){ + push @searchstrings,"$index,wrdl=\"".$field->as_string("$subfields")."\""; + } + } + return join(" and ",@searchstrings); +} sub report_item_errors { my $biblionumber = shift; my $errors_ref = shift; @@ -249,3 +455,7 @@ sub report_item_errors { print $msg, "\n"; } } +sub printlog{ + my $logelements=shift; + print $loghandle join (";",@$logelements{qw<id op status>}),"\n"; +} -- 1.6.0.4 _______________________________________________ Koha-patches mailing list Koha-patches@lists.koha.org http://lists.koha.org/mailman/listinfo/koha-patches