Hi,

Thank you for looking into this!

On Wed, 15 Oct 2025 at 21:32, Peter Eisentraut <[email protected]> wrote:
>
> On 07.10.25 15:12, Nazir Bilal Yavuz wrote:
> > 1 - A new sgml_syntax_check.pl script was added to handle tab, nbsp,
> > and xmllint validation checks.
> > 1.1 - It is registered as the sgml_syntax_check test in the Meson build.
> > 1.2 - These checks are run when executing 'make check' or 'meson test
> > sgml_syntax_check' commands.
> > 1.3 - During the creation of postgres-full.xml, the script performs
> > tab and nbsp checks. The xmllint check is skipped there, since
> > validation is already handled by the --valid option. So, we do not run
> > the same check twice.
>
> I think including the xmllint support in the new sgml_syntax_check is
> overkill, since the normal build already runs xmllint, or you could
> alternatively just write it into the build description file (makefile or
> meson.build).  The build commands should be visible in the build
> description file, not layered into some other script.
>
> I suggest the following approach:
>
> - Change sgml_syntax_check.pl into a smaller script that just checks for
> tabs and nbsp.  (Maybe a different name then.)
>
> - Add a call of that script to the build of postgres-full.xml.
>
> - Change the "check" target to just depend on postgres-full.xml, without
> its own commands.
>
> And then replicate that logic in meson.

All of these are addressed in v7. I think sgml_syntax_check is still a
suitable name but I am open to suggestions.

-- 
Regards,
Nazir Bilal Yavuz
Microsoft
From bab59770eee6ee348d20c8cb7eb1b7af5366f5df Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <[email protected]>
Date: Tue, 21 Oct 2025 13:49:16 +0300
Subject: [PATCH v7] Improve docs syntax checking

Move the checks out of the Makefile into a perl script that can be
called from both the Makefile and meson.build. The set of files checked
is simplified, so it is just all the sgml and xsl files found in
docs/src/sgml directory tree.

Along the way make some adjustments to .cirrus.tasks.yml to support this
better in CI.

Also ensure that the checks are run while generating the
postgres-full.xml.

Author: Nazir Bilal Yavuz <[email protected]>
Co-Author: Andrew Dunstan <[email protected]>
Discussion: https://postgr.es/m/CAN55FZ1qzoDcaKqsR3DwE%3DX6FL%2Bwpm%2B%3DKLvH6ahrRXNhjU53DQ%40mail.gmail.com
---
 doc/src/sgml/Makefile             |  16 +----
 doc/src/sgml/meson.build          |  34 ++++++++-
 doc/src/sgml/sgml_syntax_check.pl | 115 ++++++++++++++++++++++++++++++
 .cirrus.tasks.yml                 |   3 +
 4 files changed, 153 insertions(+), 15 deletions(-)
 create mode 100755 doc/src/sgml/sgml_syntax_check.pl

diff --git a/doc/src/sgml/Makefile b/doc/src/sgml/Makefile
index b53b2694a6b..50de8fab931 100644
--- a/doc/src/sgml/Makefile
+++ b/doc/src/sgml/Makefile
@@ -69,6 +69,7 @@ ALL_IMAGES := $(wildcard $(srcdir)/images/*.svg)
 # files into one big file).  This helps tools that don't understand
 # vpath builds (such as dbtoepub).
 postgres-full.xml: postgres.sgml $(ALL_SGML)
+	$(PERL) $(srcdir)/sgml_syntax_check.pl --srcdir $(srcdir)
 	$(XMLLINT) $(XMLINCLUDE) --output $@ --noent --valid $<
 
 
@@ -200,8 +201,7 @@ MAKEINFO = makeinfo
 ##
 
 # Quick syntax check without style processing
-check: postgres.sgml $(ALL_SGML) check-tabs check-nbsp
-	$(XMLLINT) $(XMLINCLUDE) --noout --valid $<
+check: postgres-full.xml
 
 
 ##
@@ -261,18 +261,6 @@ clean-man:
 
 endif # sqlmansectnum != 7
 
-# tabs are harmless, but it is best to avoid them in SGML files
-check-tabs:
-	@( ! grep '	' $(wildcard $(srcdir)/*.sgml $(srcdir)/func/*.sgml $(srcdir)/ref/*.sgml $(srcdir)/*.xsl) ) || \
-	(echo "Tabs appear in SGML/XML files" 1>&2;  exit 1)
-
-# Non-breaking spaces are harmless, but it is best to avoid them in SGML files.
-# Use perl command because non-GNU grep or sed could not have hex escape sequence.
-check-nbsp:
-	@ ( $(PERL) -ne '/\xC2\xA0/ and print("$$ARGV:$$_"),$$n++; END {exit($$n>0)}' \
-	  $(wildcard $(srcdir)/*.sgml $(srcdir)/func/*.sgml $(srcdir)/ref/*.sgml $(srcdir)/*.xsl $(srcdir)/images/*.xsl) ) || \
-	(echo "Non-breaking spaces appear in SGML/XML files" 1>&2;  exit 1)
-
 ##
 ## Clean
 ##
diff --git a/doc/src/sgml/meson.build b/doc/src/sgml/meson.build
index 6ae192eac68..7895ae5e776 100644
--- a/doc/src/sgml/meson.build
+++ b/doc/src/sgml/meson.build
@@ -96,6 +96,26 @@ xmllint = xmltools_wrapper + [
   '--tool', xmllint_bin, '--',
 ]
 
+sgml_syntax_check = files(
+  'sgml_syntax_check.pl'
+)
+
+# Create a stamp file for sgml syntax check and make postgres_full_xml
+# depend on it.  So, this check will run before the generation of
+# postgres_full_xml.
+sgml_check_stamp = custom_target('sgml-check-stamp',
+  output: 'sgml-check-stamp',
+  depfile: 'sgml-check-stamp.d',
+  command: [
+    perl, sgml_syntax_check,
+    '--srcdir', meson.current_source_dir(),
+    '--builddir', meson.current_build_dir(),
+    '--dep_file', '@DEPFILE@',
+    '--stamp_file', '@OUTPUT@',
+  ],
+  build_by_default: false,
+)
+
 # Run validation only once, common to all subsequent targets.  While
 # we're at it, also resolve all entities (that is, copy all included
 # files into one big file).  This helps tools that don't understand
@@ -106,7 +126,7 @@ postgres_full_xml = custom_target('postgres-full.xml',
   depfile: 'postgres-full.xml.d',
   command: [xmllint, '--nonet', '--noent', '--valid',
             '--path', '@OUTDIR@', '-o', '@OUTPUT@', '@INPUT@'],
-  depends: doc_generated,
+  depends: [doc_generated, sgml_check_stamp],
   build_by_default: false,
 )
 docs += postgres_full_xml
@@ -306,3 +326,15 @@ endif
 if alldocs.length() != 0
   alias_target('alldocs', alldocs)
 endif
+
+# Test creation of 'postgres_full_xml' by depending on it.
+test(
+  'doc' / 'sgml_syntax_check',
+  # We actually don't need to set 'executable', as we run the test by depending
+  # on 'postgres_full_xml', but meson requires setting an executable.
+  perl,
+  suite: 'doc',
+  depends: postgres_full_xml
+)
+
+testprep_targets += doc_generated
diff --git a/doc/src/sgml/sgml_syntax_check.pl b/doc/src/sgml/sgml_syntax_check.pl
new file mode 100755
index 00000000000..55c212d9e5d
--- /dev/null
+++ b/doc/src/sgml/sgml_syntax_check.pl
@@ -0,0 +1,115 @@
+#!/usr/bin/perl
+#
+# Perl script that checks SGML/XSL syntax and formatting issues.
+#
+# doc/src/sgml/sgml_syntax_check.pl
+#
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+use strict;
+use warnings FATAL => 'all';
+use Getopt::Long;
+
+use File::Find;
+
+my $srcdir;
+my $builddir;
+my $dep_file;
+my $stamp_file;
+
+GetOptions(
+	'srcdir:s' => \$srcdir,
+	'builddir:s' => \$builddir,
+	'dep_file:s' => \$dep_file,
+	'stamp_file:s' => \$stamp_file) or die "$0: wrong arguments";
+
+die "$0: --srcdir must be specified\n" unless defined $srcdir;
+
+# Find files to process - all the sgml and xsl files (including in subdirectories)
+my @files_to_process;
+my @dirs_to_search = ($srcdir);
+push @dirs_to_search, $builddir if defined $builddir;
+find(
+	sub {
+		return unless -f $_;
+		return if $_ !~ /\.(sgml|xsl)$/;
+		push @files_to_process, $File::Find::name;
+	},
+	@dirs_to_search,);
+
+# tabs and non-breaking spaces are harmless, but it is best to avoid them in SGML files
+sub check_tabs_and_nbsp
+{
+	my $errors = 0;
+	for my $f (@files_to_process)
+	{
+		open my $fh, "<:encoding(UTF-8)", $f or die "Can't open $f: $!";
+		my $line_no = 0;
+		while (<$fh>)
+		{
+			$line_no++;
+			if (/\t/)
+			{
+				print STDERR "Tab found in $f:$line_no\n";
+				$errors++;
+			}
+			if (/\xC2\xA0/)
+			{
+				print STDERR "$f:$line_no: contains non-breaking space\n";
+				$errors++;
+			}
+		}
+		close($fh);
+	}
+
+	if ($errors)
+	{
+		remove_stamp_file();
+		die "Tabs and/or non-breaking spaces appear in SGML/XML files\n";
+	}
+}
+
+sub write_dep_file()
+{
+	open my $fh, '>', $dep_file or die "can't open $dep_file: $!";
+
+	foreach my $file (@files_to_process)
+	{
+		print $fh "$dep_file : $file\n";
+	}
+
+	close $fh;
+}
+
+sub create_stamp_file
+{
+	# Avoid touching existing stamp file to prevent unnecessary rebuilds
+	if (!(-f $stamp_file))
+	{
+		open my $fh, '>', $stamp_file
+		  or die "can't open $stamp_file: $!";
+		close $fh;
+	}
+}
+
+sub remove_stamp_file
+{
+	if ($stamp_file)
+	{
+		unlink $stamp_file;
+	}
+}
+
+# Create a dependency file for meson build
+if ($dep_file)
+{
+	write_dep_file();
+}
+
+check_tabs_and_nbsp();
+
+# All checks are passed, we can create a stamp file meson build now
+if ($stamp_file)
+{
+	create_stamp_file();
+}
diff --git a/.cirrus.tasks.yml b/.cirrus.tasks.yml
index eca9d62fc22..1c937247a9a 100644
--- a/.cirrus.tasks.yml
+++ b/.cirrus.tasks.yml
@@ -627,6 +627,8 @@ task:
     TEST_JOBS: 8
     IMAGE: ghcr.io/cirruslabs/macos-runner:sonoma
 
+    XML_CATALOG_FILES: /opt/local/share/xml/docbook/4.5/catalog.xml
+
     CIRRUS_WORKING_DIR: ${HOME}/pgsql/
     CCACHE_DIR: ${HOME}/ccache
     MACPORTS_CACHE: ${HOME}/macports-cache
@@ -641,6 +643,7 @@ task:
 
     MACOS_PACKAGE_LIST: >-
       ccache
+      docbook-xml-4.5
       icu
       kerberos5
       lz4
-- 
2.51.0

Reply via email to