> Here is an updated version of my `lsr-snippets.pl` script, [...] And another update handling `%LSR` comments that must be filtered out. Completely undocumented, of course...
Werner
# lsr-snippets.pl # # Written 2019-2020 by Werner Lemberg <w...@gnu.org> ############################################################################# ## ## ## To allow easy diff comparison with the files from the existing LSR ## ## snippets tarball, this script currently refrains from some formatting ## ## improvements. However, it already contains lines marked with XXX that ## ## should be deleted or added (as described) after the diffing process has ## ## been performed. ## ## ## ############################################################################# # Usage: # # perl lsr-snippets.pl < lsr.mysqldump > lsr.txt # This script does two things. # # * Extract all approved documentation snippets from the daily SQL database # dump of the LilyPond Snippet Repository (LSR) available at # # http://lsr.di.unimi.it/download/ # # The files are created in the current directory; its file names are # derived from the snippet titles. Additionally, various files named # `winds.snippet-list` or `connecting-notes.snippet-list` are created that # list the snippets grouped by tags assigned in the database. # # Note that 'approved documentation snippets' is a subset of the available # snippets in the LSR. # # * Write a text dump of all snippets (omitting binary fields) in the SQL # database dump to stdout. use 5.14.0; # We use `s///r`. use strict; use warnings; # Access mysqldump files without the need of mysql tools. use MySQL::Dump::Parser::XS 0.04; # Use `pandoc` for converting HTML documentation to texinfo format. use Pandoc; # Convert potential Latin1 to UTF-8 encoding as a safety measure against # invalid user input. use Encoding::FixLatin qw(fix_latin); pandoc or die "'pandoc' executable not found"; # We open the LSR database dump in binary mode since it contains PNG images. binmode(STDIN); my $parser = MySQL::Dump::Parser::XS->new; # Parse input and store all table entries in the `%tables` hash as arrays. my %tables; while (my $line = <STDIN>) { my @entries = $parser->parse($line); my $table_name = $parser->current_target_table(); push @{$tables{$table_name}} => @entries if $table_name; } # Function: Convert tag and file names, similar to the code in the original # Java implementation used to extract LSR snippets. We # additionally remove (simple) HTML tags. # # Arguments: $s String to be manipulated. # sub convert_name { my ($s) = @_; # Remove HTML start and end tags. $s =~ s| < /? [a-zA-Z0-9]+ .*? > ||gx; # Translate some characters not generally allowed in file names. $s =~ tr[* /:<>?|_;\\] [+\-\-\-\-\-\-\-\-\-\-]; # Remove some problematic characters entirely. $s =~ s/[()"']+//g; # Convert to lowercase. $s = lc($s); } # Access entries of `tag` table and build a hash `%tags` to map tag names # onto the corresponding ID numbers. my $tag_table = $tables{"tag"}; my %tags; for my $entry (@{$tag_table}) { $tags{$entry->{"id"}} = convert_name($entry->{"name"}); } # Access entries of `snippet` table. my $snippet_table = $tables{"snippet"}; my @column_names = $parser->columns("snippet"); my %snippet_lists; # Function: Replace numeric tags with its names. Tag fields in the snippet # table are called `id_tag0_tag`, `id_tag1_tag`, etc. # # Arguments: $idx Index. # $entry Reference to snippet table hash. # sub use_tag_name { my ($idx, $entry) = @_; if (defined($entry->{"id_tag${idx}_tag"})) { my $tag = $entry->{"id_tag${idx}_tag"}; $entry->{"id_tag${idx}_tag"} = $tags{$tag}; } } # Function: Store snippet file name in the `%snippet_lists` hash (as a # sub-hash entry so that we can easily check later on whether it # is approved). # # Arguments: $idx Index. # $filename File name. # $entry Reference to snippet table hash. # use constant APPROVED => 1; sub add_to_snippet_list { my ($idx, $filename, $entry) = @_; if (defined($entry->{"id_tag${idx}_tag"})) { $snippet_lists{$entry->{"id_tag${idx}_tag"}}->{$filename} = $entry->{"approved"} == APPROVED; } } # The next loop over all snippet entries does the following actions. # # * Add a new field `filename` to the snippet table, containing file names # derived from snippet titles. # * Replace tag IDs with tag names. # * Fill the `%snippet_lists` hash. for my $entry (@{$snippet_table}) { if (!defined($entry->{"title"})) { my $id = $entry->{"id"}; my $filename = "snippet-$id.ly"; warn "snippet $id has no title; using '$filename' as file name\n"; $entry->{"filename"} = $filename; } else { $entry->{"filename"} = convert_name($entry->{"title"}) . ".ly"; } # There are seven tag fields per entry in the snippet table. for my $idx (0 .. 6) { use_tag_name($idx, $entry); add_to_snippet_list($idx, $entry->{"filename"}, $entry); } } # Write snippet lists. for my $list (keys %snippet_lists) { # Skip unassigned tags (i.e., lists without entries). next if !keys %{$snippet_lists{$list}}; # Don't create snippet list for the 'docs' tag, since it would contain all # snippets. next if $list eq "docs"; my @snippet_list_contents; # Ignore file extension while sorting file names. for my $snippet (sort { my ($aa, $bb) = map { s/\.ly$//r } ($a, $b); $aa cmp $bb; } keys %{$snippet_lists{$list}}) { # Only take approved snippets from the 'docs' category. if (defined($snippet_lists{"docs"}->{$snippet}) && $snippet_lists{"docs"}->{$snippet} == APPROVED) { push @snippet_list_contents => "$snippet\n"; } } # Don't write empty snippet lists. next if not @snippet_list_contents; my $filename = "$list.snippet-list"; open(my $fh, ">", $filename) || die "Can't open $filename: $!"; print $fh @snippet_list_contents; close($fh) || warn "Can't close $filename: $!"; } # Function: Clean up data entries. # # Arguments: $data Data to be manipulated. # sub normalize_text { my ($data) = @_; # Fix encoding; we always need UTF-8. $data = fix_latin($data, bytes_only => 1); # Make line endings uniform. $data =~ s/(\015\012?|\012)/\n/g; # Remove trailing (horizontal) whitespace from every line. $data =~ s/\h+$//gm; # Remove leading and trailing empty lines. $data =~ s/^\n+//; $data =~ s/\n+$//; return $data; } # Function: Convert HTML data into the texinfo format. # # We pre- and post-process the HTML data sent to and received # from the `pandoc` converter, respectively. # # Pre: # * Convert `<samp>...</samp>` to `#samp#...#/samp#` since # `pandoc` would swallow these tags unprocessed otherwise # (tested with version 2.6; note that `pandoc`'s `raw_html` # extension has no effect since the texinfo writer ignores # it). # * Ditto for `<var>...</var>`. # * Escape backslashes and double quotation marks with a # backslash since everything has to be emitted as LilyPond # strings (we do this before calling pandoc to avoid too long # texinfo source code lines). # * Convert a full stop followed by two spaces to `#:#` since # `pandoc` would swallow the second space otherwise. [This # wouldn't be a problem in the final PDF or HTML output, # however, it improves the snippet source code: Editors like # Emacs use a double space after a full stop to indicate the # end of a sentence in contrast to a single space after an # abbreviation full stop.] # # Post: # * Remove unusable texinfo node references to 'Top'. # * Convert `#samp#...#/samp#` to `@samp{...}`. # * Convert `#var#...#/var#` to `@var{...}`. # * Convert `#:#` back to a full stop followed by two spaces. # * Replace ``` ``...'' ``` with `@qq{...}`. # # Note that we don't check whether there is a paragraph boundary # between opening and closing HTML tag (ditto for # ``` ``...'' ```). # # Arguments: $s String to be manipulated. # sub html_to_texinfo { my ($s) = @_; $s =~ s|<samp\h*>(.*?)</samp\h*>|#samp#$1#/samp#|sg; $s =~ s|<var\h*>(.*?)</var\h*>|#var#$1#/var#|sg; $s =~ s/\\/\\\\/g; $s =~ s/"/\\"/g; $s =~ s/\. /#:#/g; $s = pandoc->convert("html" => "texinfo", $s, "--columns=71"); $s =~ s/\@node Top\n\@top Top\n\n//; $s =~ s|#samp#(.*?)#/samp#|\@samp{$1}|sg; $s =~ s|#var#(.*?)#/var#|\@var{$1}|sg; $s =~ s/#:#/. /g; $s =~ s/``(.*?)''/\@qq{$1}/sg; return $s; } # Start and end of the documentation section of a snippet. use constant DOC_PREAMBLE => <<~'EOT'; %% DO NOT EDIT this file manually; it is automatically %% generated from LSR http://lsr.di.unimi.it %% Make any changes in LSR itself, or in Documentation/snippets/new/ , %% and then run scripts/auxiliar/makelsr.py %% %% This file is in the public domain. \version "2.18.0" \header { EOT # XXX add empty line after brace use constant DOC_POSTAMBLE => <<~'EOT'; } % begin verbatim EOT # Emit all approved snippets from the 'docs' category as files. for my $entry (@{$snippet_table}) { my @lsrtags = (); my $is_docs = 0; # Collect tags in array `@lsrtags` (except tag 'docs', which all of our # snippets have set). for my $idx (0 .. 6) { if (defined($entry->{"id_tag${idx}_tag"})) { if ($entry->{"id_tag${idx}_tag"} eq "docs") { $is_docs = 1; } else { push @lsrtags => $entry->{"id_tag${idx}_tag"}; } } } # Skip non-documentation snippets. next unless $is_docs; # Skip unapproved snippets. next unless $entry->{"approved"} == APPROVED; my $filename = $entry->{"filename"}; open(my $fh, ">", $filename) || die "Can't open $filename: $!"; print $fh DOC_PREAMBLE; print $fh ' lsrtags = "' . join(", ", sort @lsrtags) . '"' . "\n\n"; my $texidoc = html_to_texinfo(normalize_text($entry->{"text"})); print $fh ' texidoc = "' . "\n" . $texidoc . "\n" . "\n" # XXX remove . '"' . "\n"; # XXX add . "\n"; # The title gets stored as a LilyPond string, too. my $doctitle = html_to_texinfo(normalize_text($entry->{"title"})); # No newlines in title. $doctitle =~ s/\n+$//; $doctitle =~ s/\n/ /g; print $fh ' doctitle = "' . $doctitle . '"' . "\n"; print $fh DOC_POSTAMBLE; # Finally, emit the LilyPond snippet code itself. my $snippet = normalize_text($entry->{"snippet"}); # We have to remove '%LSR' comments, which are not intended to be # exported from the LSR. $snippet .= "\n"; $snippet =~ s/^ \h* %+ \h* LSR .* \n//gmx; # '%LSR' on a line of its own. $snippet =~ s/\h* %+ \h* LSR .* $//gmx; # Trailing '%LSR'. print $fh $snippet; } # We want to see the constructed file name in the LSR text dump also, so add # it to the array of column names. push @column_names => "filename"; # Emit a text dump of all snippets (sorted by snippet ID) to stdout. for my $entry (sort { $a->{"id"} <=> $b->{"id"} } @{$snippet_table}) { for my $name (@column_names) { # Ignore binary data. next if $name eq "image"; next if $name eq "largeimage"; # Ignore unset fields. next if !defined($entry->{$name}); my $tag = "$name: "; print $tag; my $data = normalize_text($entry->{$name}); # Insert a prefix to indicate continuation lines for nicer reading. my $prefix = " " x (length($tag) - 2) . "| "; my $n = 0; $data =~ s/^/$n++ ? "$prefix" : $&/gme; # Skip the first match. print "$data\n"; } print "\n"; } print "END OF DUMP\n" # eof