On second thought, here's a simpler fix that may help with parsing some others as well.
From c758e4d02993deb2205da744cdf0e9f4e6ad4a46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Skytt=C3=A4?= <ville.sky...@iki.fi> Date: Fri, 13 Oct 2017 22:08:02 +0300 Subject: [PATCH] private/refresh-manual-refs: Update debconf reference mapping, strip whitespace with $index_re
The usual $index_re now applies, tweaked to strip surrounding whitespace. --- private/refresh-manual-refs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/private/refresh-manual-refs b/private/refresh-manual-refs index 906d31084..3960e3e66 100755 --- a/private/refresh-manual-refs +++ b/private/refresh-manual-refs @@ -62,7 +62,7 @@ my $title_re = qr/<title\s?>(.+?)<\/title\s?>/i; my $link_re = qr/<link href="(.+?)" rel="[\w]+" title="([A-Z]|[A-Z]?[\d\.]+?)\.?\s+([\w\s[:punct:]]+?)">/; my $index_re - = qr/<a href="(.+?)">([A-Z]|[A-Z]?[\d\.]+?)\.?\s+([\w\s[:punct:]]+?)<\/a>/; + = qr/<a href="(.+?)">\s*([A-Z]|[A-Z]?[\d\.]+?)\.?\s+([\w\s[:punct:]]+?)\s*<\/a>/; my $fields = [['url'], ['section'], ['title']]; my $dbk_index_re = qr/([\d.]+?)\.\s+<a\s*href="(.+?)"\s*>([\w\s[:punct:]]+?)<\/a\s*>/i; @@ -132,8 +132,7 @@ my %manuals = ( join(q{/}, 'https://www.debian.org', 'doc/packaging-manuals/debconf_specification.html'), - qr/<a href="(#.+?)">([\w\s[:punct:]]+?)<\/a>/, - [['section', 'url'], ['title']] + $index_re, $fields ], 'fhs' => [ '/usr/share/doc/debian-policy/fhs/fhs-2.3.html', -- 2.14.1