Re: lsr-snippets.pl

Werner LEMBERG Wed, 09 Sep 2020 10:36:24 -0700

> Here is an updated version of my `lsr-snippets.pl` script, [...]

And another update handling `%LSR` comments that must be filtered out.
Completely undocumented, of course...



   Werner

# lsr-snippets.pl
#
# Written 2019-2020 by Werner Lemberg <w...@gnu.org>


#############################################################################
##                                                                         ##
## To allow easy diff comparison with the files from the existing LSR      ##
## snippets tarball, this script currently refrains from some formatting   ##
## improvements.  However, it already contains lines marked with XXX that  ##
## should be deleted or added (as described) after the diffing process has ##
## been performed.                                                         ##
##                                                                         ##
#############################################################################


# Usage:
#
#   perl lsr-snippets.pl < lsr.mysqldump > lsr.txt


# This script does two things.
#
# * Extract all approved documentation snippets from the daily SQL database
#   dump of the LilyPond Snippet Repository (LSR) available at
#
#     http://lsr.di.unimi.it/download/
#
#   The files are created in the current directory; its file names are
#   derived from the snippet titles.  Additionally, various files named
#   `winds.snippet-list` or `connecting-notes.snippet-list` are created that
#   list the snippets grouped by tags assigned in the database.
#
#   Note that 'approved documentation snippets' is a subset of the available
#   snippets in the LSR.
#
# * Write a text dump of all snippets (omitting binary fields) in the SQL
#   database dump to stdout.


use 5.14.0; # We use `s///r`.
use strict;
use warnings;

# Access mysqldump files without the need of mysql tools.
use MySQL::Dump::Parser::XS 0.04;
# Use `pandoc` for converting HTML documentation to texinfo format.
use Pandoc;
# Convert potential Latin1 to UTF-8 encoding as a safety measure against
# invalid user input.
use Encoding::FixLatin qw(fix_latin);


pandoc or die "'pandoc' executable not found";


# We open the LSR database dump in binary mode since it contains PNG images.
binmode(STDIN);


my $parser = MySQL::Dump::Parser::XS->new;


# Parse input and store all table entries in the `%tables` hash as arrays.
my %tables;

while (my $line = <STDIN>) {
  my @entries = $parser->parse($line);
  my $table_name = $parser->current_target_table();

  push @{$tables{$table_name}} => @entries if $table_name;
}


# Function:  Convert tag and file names, similar to the code in the original
#            Java implementation used to extract LSR snippets.  We
#            additionally remove (simple) HTML tags.
#
# Arguments: $s    String to be manipulated.
#
sub convert_name {
  my ($s) = @_;

  # Remove HTML start and end tags.
  $s =~ s| < /? [a-zA-Z0-9]+ .*? > ||gx;

  # Translate some characters not generally allowed in file names.
  $s =~ tr[* /:<>?|_;\\]
          [+\-\-\-\-\-\-\-\-\-\-];

  # Remove some problematic characters entirely.
  $s =~ s/[()"']+//g;

  # Convert to lowercase.
  $s = lc($s);
}


# Access entries of `tag` table and build a hash `%tags` to map tag names
# onto the corresponding ID numbers.
my $tag_table = $tables{"tag"};
my %tags;

for my $entry (@{$tag_table}) {
  $tags{$entry->{"id"}} = convert_name($entry->{"name"});
}


# Access entries of `snippet` table.
my $snippet_table = $tables{"snippet"};
my @column_names = $parser->columns("snippet");

my %snippet_lists;


# Function:  Replace numeric tags with its names.  Tag fields in the snippet
#            table are called `id_tag0_tag`, `id_tag1_tag`, etc.
#
# Arguments: $idx      Index.
#            $entry    Reference to snippet table hash.
#
sub use_tag_name {
  my ($idx, $entry) = @_;

  if (defined($entry->{"id_tag${idx}_tag"})) {
    my $tag = $entry->{"id_tag${idx}_tag"};
    $entry->{"id_tag${idx}_tag"} = $tags{$tag};
  }
}


# Function:  Store snippet file name in the `%snippet_lists` hash (as a
#            sub-hash entry so that we can easily check later on whether it
#            is approved).
#
# Arguments: $idx         Index.
#            $filename    File name.
#            $entry       Reference to snippet table hash.
#
use constant APPROVED => 1;

sub add_to_snippet_list {
  my ($idx, $filename, $entry) = @_;

  if (defined($entry->{"id_tag${idx}_tag"})) {
    $snippet_lists{$entry->{"id_tag${idx}_tag"}}->{$filename}
      = $entry->{"approved"} == APPROVED;
  }
}


# The next loop over all snippet entries does the following actions.
#
# * Add a new field `filename` to the snippet table, containing file names
#   derived from snippet titles.
# * Replace tag IDs with tag names.
# * Fill the `%snippet_lists` hash.
for my $entry (@{$snippet_table}) {
  if (!defined($entry->{"title"})) {
    my $id = $entry->{"id"};
    my $filename = "snippet-$id.ly";

    warn "snippet $id has no title; using '$filename' as file name\n";

    $entry->{"filename"} = $filename;
  }
  else {
    $entry->{"filename"} = convert_name($entry->{"title"}) . ".ly";
  }

  # There are seven tag fields per entry in the snippet table.
  for my $idx (0 .. 6) {
    use_tag_name($idx, $entry);
    add_to_snippet_list($idx, $entry->{"filename"}, $entry);
  }
}


# Write snippet lists.
for my $list (keys %snippet_lists) {
  # Skip unassigned tags (i.e., lists without entries).
  next if !keys %{$snippet_lists{$list}};

  # Don't create snippet list for the 'docs' tag, since it would contain all
  # snippets.
  next if $list eq "docs";

  my @snippet_list_contents;

  # Ignore file extension while sorting file names.
  for my $snippet (sort { my ($aa, $bb) = map { s/\.ly$//r } ($a, $b);
                          $aa cmp $bb;
                        } keys %{$snippet_lists{$list}}) {
    # Only take approved snippets from the 'docs' category.
    if (defined($snippet_lists{"docs"}->{$snippet})
        && $snippet_lists{"docs"}->{$snippet} == APPROVED) {
      push @snippet_list_contents => "$snippet\n";
    }
  }

  # Don't write empty snippet lists.
  next if not @snippet_list_contents;

  my $filename = "$list.snippet-list";
  open(my $fh, ">", $filename) || die "Can't open $filename: $!";
  print $fh @snippet_list_contents;
  close($fh) || warn "Can't close $filename: $!";
}


# Function:  Clean up data entries.
#
# Arguments: $data    Data to be manipulated.
#
sub normalize_text {
  my ($data) = @_;

  # Fix encoding; we always need UTF-8.
  $data = fix_latin($data, bytes_only => 1);

  # Make line endings uniform.
  $data =~ s/(\015\012?|\012)/\n/g;

  # Remove trailing (horizontal) whitespace from every line.
  $data =~ s/\h+$//gm;

  # Remove leading and trailing empty lines.
  $data =~ s/^\n+//;
  $data =~ s/\n+$//;

  return $data;
}


# Function:  Convert HTML data into the texinfo format.
#
#            We pre- and post-process the HTML data sent to and received
#            from the `pandoc` converter, respectively.
#
#            Pre:
#              * Convert `<samp>...</samp>` to `#samp#...#/samp#` since
#                `pandoc` would swallow these tags unprocessed otherwise
#                (tested with version 2.6; note that `pandoc`'s `raw_html`
#                extension has no effect since the texinfo writer ignores
#                it).
#              * Ditto for `<var>...</var>`.
#              * Escape backslashes and double quotation marks with a
#                backslash since everything has to be emitted as LilyPond
#                strings (we do this before calling pandoc to avoid too long
#                texinfo source code lines).
#              * Convert a full stop followed by two spaces to `#:#` since
#                `pandoc` would swallow the second space otherwise.  [This
#                wouldn't be a problem in the final PDF or HTML output,
#                however, it improves the snippet source code: Editors like
#                Emacs use a double space after a full stop to indicate the
#                end of a sentence in contrast to a single space after an
#                abbreviation full stop.]
#
#            Post:
#              * Remove unusable texinfo node references to 'Top'.
#              * Convert `#samp#...#/samp#` to `@samp{...}`.
#              * Convert `#var#...#/var#` to `@var{...}`.
#              * Convert `#:#` back to a full stop followed by two spaces.
#              * Replace ``` ``...'' ``` with `@qq{...}`.
#
#            Note that we don't check whether there is a paragraph boundary
#            between opening and closing HTML tag (ditto for
#            ``` ``...'' ```).
#
# Arguments: $s    String to be manipulated.
#
sub html_to_texinfo {
  my ($s) = @_;

  $s =~ s|<samp\h*>(.*?)</samp\h*>|#samp#$1#/samp#|sg;
  $s =~ s|<var\h*>(.*?)</var\h*>|#var#$1#/var#|sg;
  $s =~ s/\\/\\\\/g;
  $s =~ s/"/\\"/g;
  $s =~ s/\.  /#:#/g;

  $s = pandoc->convert("html" => "texinfo", $s, "--columns=71");

  $s =~ s/\@node Top\n\@top Top\n\n//;
  $s =~ s|#samp#(.*?)#/samp#|\@samp{$1}|sg;
  $s =~ s|#var#(.*?)#/var#|\@var{$1}|sg;
  $s =~ s/#:#/.  /g;
  $s =~ s/``(.*?)''/\@qq{$1}/sg;

  return $s;
}


# Start and end of the documentation section of a snippet.
use constant DOC_PREAMBLE => <<~'EOT';
  %% DO NOT EDIT this file manually; it is automatically
  %% generated from LSR http://lsr.di.unimi.it
  %% Make any changes in LSR itself, or in Documentation/snippets/new/ ,
  %% and then run scripts/auxiliar/makelsr.py
  %%
  %% This file is in the public domain.
  \version "2.18.0"

  \header {
  EOT

# XXX add empty line after brace
use constant DOC_POSTAMBLE => <<~'EOT';
  } % begin verbatim

  EOT


# Emit all approved snippets from the 'docs' category as files.
for my $entry (@{$snippet_table}) {
  my @lsrtags = ();
  my $is_docs = 0;

  # Collect tags in array `@lsrtags` (except tag 'docs', which all of our
  # snippets have set).
  for my $idx (0 .. 6) {
    if (defined($entry->{"id_tag${idx}_tag"})) {
      if ($entry->{"id_tag${idx}_tag"} eq "docs") {
        $is_docs = 1;
      }
      else {
        push @lsrtags => $entry->{"id_tag${idx}_tag"};
      }
    }
  }

  # Skip non-documentation snippets.
  next unless $is_docs;
  # Skip unapproved snippets.
  next unless $entry->{"approved"} == APPROVED;

  my $filename = $entry->{"filename"};
  open(my $fh, ">", $filename) || die "Can't open $filename: $!";

  print $fh DOC_PREAMBLE;

  print $fh '  lsrtags = "' . join(", ", sort @lsrtags) . '"' . "\n\n";

  my $texidoc = html_to_texinfo(normalize_text($entry->{"text"}));
  print $fh '  texidoc = "' . "\n"
            . $texidoc . "\n"
            . "\n" # XXX remove
            . '"' . "\n";
  # XXX add . "\n";

  # The title gets stored as a LilyPond string, too.
  my $doctitle = html_to_texinfo(normalize_text($entry->{"title"}));

  # No newlines in title.
  $doctitle =~ s/\n+$//;
  $doctitle =~ s/\n/ /g;

  print $fh '  doctitle = "' . $doctitle . '"' . "\n";

  print $fh DOC_POSTAMBLE;

  # Finally, emit the LilyPond snippet code itself.
  my $snippet = normalize_text($entry->{"snippet"});

  # We have to remove '%LSR' comments, which are not intended to be
  # exported from the LSR.
  $snippet .= "\n";
  $snippet =~ s/^ \h* %+ \h* LSR .* \n//gmx; # '%LSR' on a line of its own.
  $snippet =~ s/\h* %+ \h* LSR .* $//gmx; # Trailing '%LSR'.

  print $fh $snippet;
}


# We want to see the constructed file name in the LSR text dump also, so add
# it to the array of column names.
push @column_names => "filename";


# Emit a text dump of all snippets (sorted by snippet ID) to stdout.
for my $entry (sort { $a->{"id"} <=> $b->{"id"} } @{$snippet_table}) {
  for my $name (@column_names) {
    # Ignore binary data.
    next if $name eq "image";
    next if $name eq "largeimage";

    # Ignore unset fields.
    next if !defined($entry->{$name});

    my $tag = "$name: ";
    print $tag;

    my $data = normalize_text($entry->{$name});

    # Insert a prefix to indicate continuation lines for nicer reading.
    my $prefix = " " x (length($tag) - 2) . "| ";
    my $n = 0;
    $data =~ s/^/$n++ ? "$prefix" : $&/gme; # Skip the first match.

    print "$data\n";
  }

  print "\n";
}

print "END OF DUMP\n"

# eof

Re: lsr-snippets.pl

Reply via email to