Bug#425537: linuxdoc-tools: Improving TOC creation in fmt_txt.pl

Agustin Martin Tue, 22 May 2007 04:43:29 -0700

Package: linuxdoc-tools
Version: 0.9.21-0.8
Severity: wishlist

Hi,


I have noticed that TOC creation in fmt_txt.pl has some
minor annoyances:

* Does not work well with chapters (creates numbering after sections)
* Long lines are not wrapped
* Tabs and extra whitespace are not stripped, making the result visually
  strange.

I have been playing with the txt preASP code related to TOC creation and
ended up mostly rewriting it to my taste, following similar ideas but with
a number of improvements. In my preliminary tests, things work with chapters,
long lines are wrapped, and a lot of noise (stray tabs, ...) is removed,
giving a much better look.

This is still very experimental, but I am attaching a patch showing the
current changes. I will test them extensively in the meantime.

Cheers,

-- 
Agustin

Index: lib/dist/fmt_txt.pl
===================================================================
RCS file: /home/agmartin/CVSROOT/debian/linuxdoc-tools/lib/dist/fmt_txt.pl,v
retrieving revision 1.3
retrieving revision 1.1.1.1.2.18
diff -u -r1.3 -r1.1.1.1.2.18
--- lib/dist/fmt_txt.pl	7 May 2007 10:17:00 -0000	1.3
+++ lib/dist/fmt_txt.pl	14 May 2007 11:55:38 -0000	1.1.1.1.2.18
@@ -14,6 +14,7 @@
 
 use File::Copy;
 use Text::EntityMap;
+use Text::Wrap;
 use LinuxDocTools::CharEnts;
 use LinuxDocTools::Lang;
 use LinuxDocTools::Vars;
@@ -88,12 +89,11 @@
 # ---------------------------------------------------------------
 $txt->{preASP} = sub
 # ---------------------------------------------------------------
-#  Run the file through the genertoc utility before sgmlsasp. Not needed
-#  when producing a manpage. A lot of code from FJM, untested by me.
+# Pre-process file before sgmlsasp and create a TOC unless producing
+# a manpage. Code based in the genertoc utility and in code from FJM.
 # ---------------------------------------------------------------
 {
-  my ($infile, $outfile) = @_;
-  my (@toc, @lines);
+  my ($INFILE, $OUTFILE) = @_;
   my $char_maps = load_char_maps ('.2tr', [ Text::EntityMap::sdata_dirs() ]);
   
   if ( $global->{charset} eq "latin1" ){
@@ -101,10 +101,10 @@
   }
   
   if ($txt->{manpage}){    
-    while (<$infile>){
+    while (<$INFILE>){
       if ( s/^-// ){
 	chomp;  
-	print $outfile "-" . &parse_data ($_, $char_maps, $txt_escape) . "\n";
+	print $OUTFILE "-" . &parse_data ($_, $char_maps, $txt_escape) . "\n";
       } elsif (/^A/) {
 	/^A(\S+) (IMPLIED|CDATA|NOTATION|ENTITY|TOKEN)( (.*))?$/
 	    || die "bad attribute data: $_\n";
@@ -113,9 +113,9 @@
 	  # CDATA attributes get translated also
 	  $value = &parse_data ($value, $char_maps, $txt_escape);
 	}
-	print $outfile "A$name $type $value\n";
+	print $OUTFILE "A$name $type $value\n";
       } else {  
-	print $outfile $_;
+	print $OUTFILE $_;
       }
     }
     return;
@@ -124,190 +124,162 @@
   # note the conversion of `sdata_dirs' list to an anonymous array to
   # make a single argument
   
-  #
-  #  Build TOC. The file is read into @lines in the meantime, we need to
-  #  traverse it twice.
-  #
-  push (@toc, "(HLINE\n");
-  push (@toc, ")HLINE\n");
-  push (@toc, "(P\n");
-  push (@toc, "-" . Xlat ("Table of Contents") . "\n");
-  push (@toc, ")P\n");
-  push (@toc, "(VERB\n");
-  my (@prevheader, @header);
-  my $appendix = 0;
-  my $nonprint = 0;
-  while (<$infile>)
-    {
-      push (@lines, $_);
-
-      if (/^\(SECT(.*)/) 
-        {
-	  @prevheader = @header;
-	  @header = @header[0..$1];
-	  if ($appendix == 1) 
-            {
-	      $header[$1] = "A";
-	      $appendix = 0;
-            } else 
-            {
-	      $header[$1]++;
-	    }
-        }
-      if (/^\(APPEND(.*)/) 
-        {
-	  $appendix = 1;
-        }
-      if (/^\(HEADING/) 
-        {
-	  $_ = <$infile>;
-	  s/\\n/ /g;
-	  push (@lines, $_);
-	  chop;
-	  s/^-//;
-	  $_ = join(".",@header) . " " . $_;
-	  s/\(\\[0-9][0-9][0-9]\)/\\\1/g;
-
-	  if (!$#header) 
-	    {
-	      # put a newline before top-level sections unless previous was also
-	      # a top level section
-	      $_ = "\\n" . $_ unless (!$#prevheader);
-	      # put a . and a space after top level sections
-	      s/ /. /;
-#####	      $_ = "-" . $_ . "\\n";
-	      $_ = "-" . $_;
-	    } 
-	  else 
-	    {
-	      # subsections get indentation matching hierarchy
-	      $_ = "-" . "   " x $#header . $_;
-	    }
-
-#	remove tags from a toc
-	  s/\)TT//g;
-	  s/\(TT//g;
-	  s/\)IT//g;
-	  s/\(IT//g;
-	  s/\)EM//g;
-	  s/\(EM//g;
-	  s/\)BF//g;
-	  s/\(BF//g;
-	  s/AID * CDATA.*$//g;
-	  s/\)LABEL//g;
-	  s/\(LABEL//g;
-
-	  push(@toc, parse_data ($_, $char_maps, $txt_escape));
-
-	  $_ = <$infile>;
-	  while (!/^\)HEADING/) {
-	    s/\\n/ /g; ####
-	    push(@lines, $_);
-	    chop;
-	    s/^-//;
-
-#	remove tags from a toc
-	    s/\)TT//g;
-	    s/\(TT//g;
-	    s/\)IT//g;
-	    s/\(IT//g;
-	    s/\)EM//g;
-	    s/\(EM//g;
-	    s/\)BF//g;
-	    s/\(BF//g;
-	    s/AID * CDATA.*$//g;
-	    s/\)LABEL//g;
-	    s/\(LABEL//g;
-
-#	remove NIDX, NCDX from a toc entry
-	    if (/^\(NIDX$/ || /^\(NCDX$/) { $nonprint = 1; }
-	    if (/^\)NIDX$/ || /^\)NCDX$/) { $nonprint = 1; }
-
-#	  $_ = "-" . $_ . "\\n";
-	    push(@toc, parse_data ($_, $char_maps, $txt_escape))
-	      if (! $nonprint);
-	    $_ = <$infile>;
-	  }
-	  s/\\n/ /g; ###
-	  push(@lines, $_);
-	  push(@toc, "\\n\n");
+  # ---------------------------------
+  # Pre-process file and extract TOC info
+  # ---------------------------------
+  
+  my $inheading    = 0;
+  my $headertext   = '';
+  my $sectionlevel = '';
+  my $appendix     = 0;
+  my $txtout       = "";
+  my @tocarray     = ();
+  my $thetoc       = '';
+  my @header       = ();
+  my @prevheader   = ();
+  my $chapterskip  = 0;
+  
+  while (<$INFILE>) {
+    if ($inheading){
+      next if ( /^(\(|\))(BF|EM|IT|LABEL|TT)/ );
+      next if ( /^\)TOC/ );
+      
+      if ( s/^-// ) {              # Header text
+	chomp;
+	s/([^\\])\\n/$1 /g;        # No unescaped \n in text
+	s/^\\n/ /g;                # No newlines in text BOL
+	s/([^\\])\\011/$1 /g;      # No tabulars in text
+	s/^[\s\n\t]*//;            # 
+	s/[\s\n\t]*$//;            #
+	$headertext .= $_;
+	$headertext .= " ";
+      } elsif (/^\)HEADING/){      # End of header: Write full header text
+	$headertext =~ s/[ \n]*$//;
+	if ( $headertext ) {
+	  $headertext = &parse_data ($headertext, $char_maps, $txt_escape);
+	  $txtout .= "-" . $headertext . "\n";
+	  push @tocarray, [$sectionlevel, $headertext];
+	}
+	$inheading    = 0;
+	$sectionlevel = '';
+	$txtout .= $_;
+      } else {                     # labels and friends: copy to output
+	$txtout .= $_; 
       }
-    }
-  push (@toc, ")VERB\n");
-  push (@toc, "(HLINE\n");
-  push (@toc, ")HLINE\n");
-
-  my $inheading = 0;
-  my $tipo = '';
-  for (@lines)
-    {
-      if ($inheading)
-        {
-	  next if (/^\)TT/ || /^\(TT/ || /^\)IT/ || /^\(IT/ ||
-                   /^\)EM/ || /^\(EM/ || /^\)BF/ || /^\(BF/);
-	  if (/^-/) 
-            {
-	      $tipo .=  $' ;
-	      chop ($tipo);
-	      $tipo .= " " unless $tipo =~ / $/;
-	    }
-	  else 
-	    {
-	      $tipo =~ s/ $//;
-	      if ($tipo)
-		{
-		  print $outfile "-"
-		      . parse_data ($tipo, $char_maps, $txt_escape)
-		      . "\n";
-		}
-	      print $outfile $_;
-	      $tipo = '';
-	    }
-	  if (/^\)HEADING/)
-	    {
-	      $inheading = 0;
-            }
-	  next;
+      
+    } else { # --- Not in heading 
+      
+      if (/^\(APPEND(.*)/) {       # appendix mode
+	$appendix = 1;
+	$txtout .= $_;
+      } elsif (/^\(HEADING/) {     #  Go into heading processing mode.
+	$headertext   = '';
+	$inheading    = 1;
+	$txtout .= $_;
+      } elsif (/^\(CHAPT/) {
+	$sectionlevel = 0;
+	$chapterskip  = 1;         # Start sectioning with chapter
+	if ( $appendix ) {
+	  $sectionlevel = "A$sectionlevel";
+	  $appendix     = 0;
 	}
-      if (/^\(HEADING/) 
-        {
-	  #
-	  #  Go into heading processing mode.
-	  #
-	  $tipo = '';
-	  $inheading = 1;
+	$txtout .= $_;
+      } elsif (/^\(SECT(.*)/) {
+	$sectionlevel = $1 ? $1 : 0;
+	$sectionlevel += $chapterskip;
+	if ( $appendix ) {
+	  $sectionlevel = "A$sectionlevel";
+	  $appendix     = 0;
 	}
-      if (/^\(TOC/)
-        {
-	  print $outfile @toc;
-	  next;
+	$txtout .= $_;
+      } elsif (/^\(TOC/) {         # Placeholder for TOC
+	$txtout .= "##TOC##";
+      } elsif ( s/^-// ) {
+	chomp;
+	$txtout .=  "-" . &parse_data ($_, $char_maps, $txt_escape) . "\n";
+      } elsif (/^A/) {
+	/^A(\S+) (IMPLIED|CDATA|NOTATION|ENTITY|TOKEN)( (.*))?$/
+	    || die "bad attribute data: $_\n";
+	my ($name,$type,$value) = ($1,$2,$4);
+	if ($type eq "CDATA") {
+	  # CDATA attributes get translated also
+	  $value = &parse_data ($value, $char_maps, $txt_escape);
 	}
-      if (/^-/)
-        {
-	  my ($str) = $';
-	  chop ($str);
-	  print $outfile "-" . parse_data ($str, $char_maps, $txt_escape) . "\n";
-	  next;
-        }
-      elsif (/^A/)
-        {
-	  /^A(\S+) (IMPLIED|CDATA|NOTATION|ENTITY|TOKEN)( (.*))?$/
-	      || die "bad attribute data: $_\n";
-	  my ($name,$type,$value) = ($1,$2,$4);
-	  if ($type eq "CDATA")
-	    {
-	      # CDATA attributes get translated also
-	      $value = parse_data ($value, $char_maps, $txt_escape);
-	    }
-	  print $outfile "A$name $type $value\n";
-	  next;
-        }
-
-      #
-      #  Default action if not skipped over with next: copy in to out.
-      #
-      print $outfile $_;
+	$txtout .= "A$name $type $value\n";
+      } else {
+	$txtout .= $_;
+      }
+    } 
+  } # end of  while (<$INFILE>) loop
+  
+  # ----------------------------
+  # Post-process the TOC, if any
+  # ----------------------------
+  
+  if ( @tocarray ) {
+    my $toclinelength = 72;          # Length of a normal line
+    @header = @prevheader = ();
+    $thetoc = join ("\n",("(HLINE",
+			  ")HLINE",
+			  "(P",
+			  "-" . Xlat ("Table of Contents"),
+			  ")P",
+			  "(VERB\n"));
+    
+    foreach my $entry ( @tocarray ) {
+      my $level  = $$entry[0];       # Section level
+      my $text   = $$entry[1];       # section entry
+      my $number = '';               # Numbering of the item
+      my $nwhite = '';               # Will be length($number) times " "
+      
+      $text =~ s/(\(|\))(BF|EM|IT|LABEL|TT)//g;
+      $text =~ s/AID * CDATA.*$//g;
+      $text =~ s/\s+/ /g;
+      
+      @prevheader = @header;
+      @header     = @header[0..$level];
+      
+      if ( $level =~ s/^A// ){
+	$header[$level] = "A";
+      } else {
+	$header[$level]++;
+      }
+      
+      my $number = join ('.',@header);
+      
+      if ( ! $#header ) {
+	# put a . after top level sections
+	$number .= '.';
+	# put a newline before top-level sections unless previous is one
+	$number = "\\n" . $number unless (!$#prevheader);
+	$number = "-" . $number;
+      } else {
+	# subsections get indentation matching hierarchy
+	$number = "-" . "   " x $#header . $number;
+      }
+      unless ( $text =~ /^(\(|\))(NCDX|NIDX)$/ ){
+	$nwhite = $number;
+	$nwhite =~ s/^[-\\n]*//;
+	$nwhite = "-" . " " x length($nwhite);
+	$Text::Wrap::columns = $toclinelength - length($nwhite);
+	foreach ( split("\n",wrap('','',$text)) ){
+	  $thetoc .= "$number $_\\n\n";
+	  $number = $nwhite;     # Whitespaces if number is already printed
+	}
+      }
     }
+    $thetoc .= join ("\n",(")VERB",
+			   "(HLINE",
+			   ")HLINE\n"));
+  } # Parsed @tocarray
+  
+  if ( $thetoc ){
+    $txtout =~ s/^\#\#TOC\#\#/$thetoc/m;
+  } else {
+    $txtout =~ s/^\#\#TOC\#\#//m;
+  }
+  print $OUTFILE $txtout;
+  return 0;
 };
 
 # ---------------------------------------------------------------
@@ -400,5 +372,6 @@
 __END__
 
 #Local Variables:
-#perl-indent-level: 2
+# mode: perl
+# perl-indent-level: 2
 #End:

Bug#425537: linuxdoc-tools: Improving TOC creation in fmt_txt.pl

Reply via email to