ArielGlenn has uploaded a new change for review. https://gerrit.wikimedia.org/r/72005
Change subject: mwbzutils: clean up makefile and source in prep for debian packaging ...................................................................... mwbzutils: clean up makefile and source in prep for debian packaging * cleanup install, add deinstall targets * make distclean actually do that * generate man pages with help2man * add or redo all usage messages to conform with help2man * add version and copyright info to all programs Change-Id: Id7ddd9edb5b2e22f896166a23cf49d28a010007b --- M xmldumps-backup/mwbzutils/Makefile M xmldumps-backup/mwbzutils/checkforbz2footer.c M xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c M xmldumps-backup/mwbzutils/dumplastbz2block.c M xmldumps-backup/mwbzutils/findpageidinbz2xml.c M xmldumps-backup/mwbzutils/recompressxml.c M xmldumps-backup/mwbzutils/writeuptopageid.c 7 files changed, 411 insertions(+), 155 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/dumps refs/changes/05/72005/1 diff --git a/xmldumps-backup/mwbzutils/Makefile b/xmldumps-backup/mwbzutils/Makefile index 5fcd560..f3a3c44 100644 --- a/xmldumps-backup/mwbzutils/Makefile +++ b/xmldumps-backup/mwbzutils/Makefile @@ -16,22 +16,38 @@ # 2010-2010: see the file COPYING for details. 
# ------------------------------------------------------------------ -CC=gcc -LDFLAGS= -BIGFILES=-D_FILE_OFFSET_BITS=64 -CFLAGS=-Wall -Winline -O2 -g $(BIGFILES) -PREFIX=/usr/local +VERSION = "0.0.3" +CC = gcc +LDFLAGS = +BIGFILES = -D_FILE_OFFSET_BITS=64 +CFLAGS = -Wall -Winline -O2 -g $(BIGFILES) -DVERSION=\"$(VERSION)\" -SHELL=/bin/sh +build: checkforbz2footer dumpbz2filefromoffset \ + dumplastbz2block findpageidinbz2xml \ + recompressxml writeuptopageid \ -OBJSBZ= bzlibfuncs.o +NAME_CHECKFORBZ2FOOTER = "Check if bzip2 file ends with bz2 magic footer" +NAME_DUMPBZ2FILEFROMOFFSET = "Write MediaWiki XML pages from bzip2 file starting from offset" +NAME_DUMPLASTBZ2BLOCK = "Find last bz2 block in bzip2 file and dump contents" +NAME_FINDPAGEIDINBZ2XML = "Display offset of bz2 block for given page id in bzip2 MediaWiki XML file" +NAME_RECOMPRESSXML = "Bz2 compress MediaWiki XML input in batches of pages" +NAME_WRITEUPTOPAGEID = "Write range of page content from MediaWiki XML input" -all: checkforbz2footer \ - dumpbz2filefromoffset \ - dumplastbz2block \ - findpageidinbz2xml \ - recompressxml \ - writeuptopageid +BINDIR = $(DESTDIR)$(PREFIX)/usr/local/bin/ +MANDIR = $(DESTDIR)$(PREFIX)/usr/local/share/man/man1/ + +GZIP = /bin/gzip +HELP2MAN = /usr/bin/help2man +SHELL = /bin/sh + +DISTNAME = mwbzutils-$(VERSION) + +OBJSBZ = bzlibfuncs.o + +build: checkforbz2footer dumpbz2filefromoffset \ + dumplastbz2block findpageidinbz2xml \ + recompressxml writeuptopageid \ + manpages dumplastbz2block: $(OBJSBZ) mwbzlib.o dumplastbz2block.o $(CC) $(CFLAGS) $(LDFLAGS) -o dumplastbz2block dumplastbz2block.o mwbzlib.o $(OBJSBZ) -lbz2 @@ -51,25 +67,61 @@ writeuptopage: $(OBJSBZ) writeuptopageid.o $(CC) $(CFLAGS) $(LDFLAGS) -o writeuptopageid writeuptopageid.o -lbz2 +manpages: dumplastbz2block.1.gz findpageidinbz2xml.1.gz \ + checkforbz2footer.1.gz dumpbz2filefromoffset.1.gz \ + recompressxml.1.gz writeuptopageid.1.gz + +dumplastbz2block.1.gz : dumplastbz2block + $(HELP2MAN) --section 1 
--no-info --name $(NAME_DUMPLASTBZ2BLOCK) \ + --no-discard-stderr ./dumplastbz2block | $(GZIP) > dumplastbz2block.1.gz +findpageidinbz2xml.1.gz : findpageidinbz2xml + $(HELP2MAN) --section 1 --no-info --name $(NAME_FINDPAGEIDINBZ2XML) \ + --no-discard-stderr ./findpageidinbz2xml | $(GZIP) > findpageidinbz2xml.1.gz +checkforbz2footer.1.gz : checkforbz2footer + $(HELP2MAN) --section 1 --no-info --name $(NAME_CHECKFORBZ2FOOTER) \ + --no-discard-stderr ./checkforbz2footer | $(GZIP) > checkforbz2footer.1.gz +dumpbz2filefromoffset.1.gz : dumpbz2filefromoffset + $(HELP2MAN) --section 1 --no-info --name $(NAME_DUMPBZ2FILEFROMOFFSET) \ + --no-discard-stderr ./dumpbz2filefromoffset | $(GZIP) > dumpbz2filefromoffset.1.gz +recompressxml.1.gz : recompressxml + $(HELP2MAN) --section 1 --no-info --name $(NAME_RECOMPRESSXML) \ + --no-discard-stderr ./recompressxml | $(GZIP) > recompressxml.1.gz +writeuptopageid.1.gz : writeuptopageid + $(HELP2MAN) --section 1 --no-info --name $(NAME_WRITEUPTOPAGEID) \ + --no-discard-stderr ./writeuptopageid | $(GZIP) > writeuptopageid.1.gz + + install: dumplastbz2block findpageidinbz2xml checkforbz2footer dumpbz2filefromoffset recompressxml writeuptopageid - if ( test ! 
-d $(PREFIX)/bin ) ; then mkdir -p $(PREFIX)/bin ; fi - cp -f dumplastbz2block $(PREFIX)/bin/dumplastbz2block - cp -f findpageidinbz2xml $(PREFIX)/bin/findpageidinbz2xml - cp -f checkforbz2footer $(PREFIX)/bin/checkforbz2footer - cp -f dumpbz2filefromoffset $(PREFIX)/bin/dumpbz2filefromoffset - cp -f recompressxml $(PREFIX)/bin/recompressxml - cp -f writeuptopageid $(PREFIX)/bin/writeuptopageid - chmod a+x $(PREFIX)/bin/dumplastbz2block - chmod a+x $(PREFIX)/bin/findpageidinbz2xml - chmod a+x $(PREFIX)/bin/checkforbz2footer - chmod a+x $(PREFIX)/bin/dumpbz2filefromoffset - chmod a+x $(PREFIX)/bin/recompressxml - chmod a+x $(PREFIX)/bin/writeuptopageid + install --directory $(BINDIR) + install --mode=755 dumplastbz2block $(BINDIR) + install --mode=755 findpageidinbz2xml $(BINDIR) + install --mode=755 checkforbz2footer $(BINDIR) + install --mode=755 dumpbz2filefromoffset $(BINDIR) + install --mode=755 recompressxml $(BINDIR) + install --mode=755 writeuptopageid $(BINDIR) + install --directory $(MANDIR) + install --mode=755 dumplastbz2block.1.gz $(BINDIR) + install --mode=755 findpageidinbz2xml.1.gz $(BINDIR) + install --mode=755 checkforbz2footer.1.gz $(BINDIR) + install --mode=755 dumpbz2filefromoffset.1.gz $(BINDIR) + install --mode=755 recompressxml.1.gz $(BINDIR) + install --mode=755 writeuptopageid.1.gz $(BINDIR) + +deinstall: + rm -f $(BINDIR)dumplastbz2block + rm -f $(BINDIR)findpageidinbz2xml + rm -f $(BINDIR)checkforbz2footer + rm -f $(BINDIR)dumpbz2filefromoffset + rm -f $(BINDIR)recompressxml + rm -f $(BINDIR)writeuptopageid clean: rm -f *.o *.a dumplastbz2block findpageidinbz2xml \ checkforbz2footer dumpbz2filefromoffset \ recompressxml writeuptopageid + rm -f dumplastbz2block.1.gz findpageidinbz2xml.1.gz \ + checkforbz2footer.1.gz dumpbz2filefromoffset.1.gz \ + recompressxml.1.gz writeuptopageid.1.gz bzlibfuncs.o: bzlibfuncs.c bzlib.h bzlib_private.h $(CC) $(CFLAGS) -c bzlibfuncs.c @@ -90,9 +142,10 @@ writeuptopageid.o: writeuptopageid.c $(CC) $(CFLAGS) 
-c writeuptopageid.c -distclean: clean +distclean: + rm -f $(DISTNAME) + rm -f *.tar.gz -DISTNAME=mwbzutils-0.0.3 dist: rm -f $(DISTNAME) ln -s -f . $(DISTNAME) diff --git a/xmldumps-backup/mwbzutils/checkforbz2footer.c b/xmldumps-backup/mwbzutils/checkforbz2footer.c index 7ff9f7e..b6ad199 100644 --- a/xmldumps-backup/mwbzutils/checkforbz2footer.c +++ b/xmldumps-backup/mwbzutils/checkforbz2footer.c @@ -8,20 +8,52 @@ #include <errno.h> #include "mwbzutils.h" -/* - Check to see whether a file ends with a bz2 footer or not - (i.e. if it is truncated or corrupted). - This is a crude but fast test for integrity; we don't - check the CRC at the end of fthe stream, nor do we check the - bit padding in the last byte of the file. +void usage(char *message) { + char * help = +"Usage: checkforbz2footer [--version|--help]\n" +" or: checkforbz2footer <infile>\n\n" +"Check whether the specified bzip2 compressed file ends with a bz2 footer\n" +"or not ((i.e. if it is truncated or corrupted).\n" +"This is a crude but fast test for integrity; we don't check the CRC at\n" +"the end of the stream, nor do we check the bit padding in the last byte\n" +"of the file.\n\n" +"Exits with 0 if the file has the bz2 footer, 1 if the file does not have\n" +"the footer and -1 on error.\n\n" +"Options:\n\n" +"Flags:\n\n" +" -h, --help Show this help message\n" +" -v, --version Display the version of this program and exit\n\n" +"Arguments:\n\n" +" <infile> Name of the file to check\n\n" +"Report bugs in checkforbz2footer to <https://bugzilla.wikimedia.org/>.\n\n" +"See also:\n\n" +" dumpbz2filefromoffset(1), dumplastbz2block(1), findpageidinbz2xml(1)\n" +" recompressxml(1), writeuptopageid(1)\n\n"; + if (message) { + fprintf(stderr,"%s\n\n",message); + } + fprintf(stderr,"%s",help); + exit(-1); +} - Arguments: the name of the file to check, presumably - a bzipped file. - Outputs: none. 
- Exits with 0 if the file contains the footer at the end, - -1 if the file does not contain the footer or there is an error. -*/ - +void show_version(char *version_string) { + char * copyright = +"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn. All rights reserved.\n\n" +"This program is free software: you can redistribute it and/or modify it\n" +"under the terms of the GNU General Public License as published by the\n" +"Free Software Foundation, either version 2 of the License, or (at your\n" +"option) any later version.\n\n" +"This program is distributed in the hope that it will be useful, but\n" +"WITHOUT ANY WARRANTY; without even the implied warranty of \n" +"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n" +"Public License for more details.\n\n" +"You should have received a copy of the GNU General Public License along\n" +"with this program. If not, see <http://www.gnu.org/licenses/>\n\n" + "Written by Ariel T. Glenn.\n"; + fprintf(stderr,"checkforbz2footer %s\n", version_string); + fprintf(stderr,"%s",copyright); + exit(-1); +} int main(int argc, char **argv) { @@ -30,9 +62,13 @@ bz_info_t bfile; if (argc != 2) { - fprintf(stderr,"usage: %s infile\n", argv[0]); + usage("Missing option or argument."); exit(-1); } + + if (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) usage(NULL); + if (!strcmp(argv[1], "--version") || !strcmp(argv[1], "-v")) show_version(VERSION); + fin = open (argv[1], O_RDONLY); if (fin < 0) { fprintf(stderr,"failed to open file %s for read\n", argv[1]); diff --git a/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c b/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c index 5066bb9..03b2b9b 100644 --- a/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c +++ b/xmldumps-backup/mwbzutils/dumpbz2filefromoffset.c @@ -10,8 +10,56 @@ #include <regex.h> #include "mwbzutils.h" +void usage(char *message) { + char * help = +"Usage: dumpbz2filefromoffset [--version|--help]\n" +" or: dumpbz2filefromoffset <infile> 
<offset>\n\n" +"Find the first bz2 block in a file after the specified offset, uncompress\n" +"and write contents from that point on to stdout, starting with the first\n" +"<page> tag encountered.\n\n" +"The starting <mediawiki> tag and the <siteinfo> header from the file will\n" +"be written out first.\n\n" +"Note that some bytes from the very last block may be lost if the blocks are\n" +"not byte-aligned. This is due to the bzip2 crc at the eof being wrong.\n\n" +"Exits with BZ_OK on success, various BZ_ errors otherwise.\n\n" +"Options:\n\n" +"Flags:\n\n" +" -h, --help Show this help message\n" +" -v, --version Display the version of this program and exit\n\n" +"Arguments:\n\n" +" <infile> Name of the file to check\n" +" <offset> byte in the file from which to start processing\n\n" +"Report bugs in dumpbz2filefromoffset to <https://bugzilla.wikimedia.org/>.\n\n" +"See also checkforbz2footer(1), dumplastbz2block(1), findpageidinbz2xml(1),\n" + "recompressxml(1), writeuptopageid(1)\n\n"; + if (message) { + fprintf(stderr,"%s\n\n",message); + } + fprintf(stderr,"%s",help); + exit(-1); +} + +void show_version(char *version_string) { + char * copyright = +"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn. All rights reserved.\n\n" +"This program is free software: you can redistribute it and/or modify it\n" +"under the terms of the GNU General Public License as published by the\n" +"Free Software Foundation, either version 2 of the License, or (at your\n" +"option) any later version.\n\n" +"This program is distributed in the hope that it will be useful, but\n" +"WITHOUT ANY WARRANTY; without even the implied warranty of \n" +"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n" +"Public License for more details.\n\n" +"You should have received a copy of the GNU General Public License along\n" +"with this program. If not, see <http://www.gnu.org/licenses/>\n\n" + "Written by Ariel T. 
Glenn.\n"; + fprintf(stderr,"dumpbz2filefromoffset %s\n", version_string); + fprintf(stderr,"%s",copyright); + exit(-1); +} + /* - dump the <meadiawiki> header (up through + dump the <mediawiki> header (up through </siteinfo> close tag) found at the beginning of xml dump files. returns: @@ -206,37 +254,18 @@ return(0); } -/* - find the first bz2 block after the specified offset, - uncompress from that point on, write out the - contents starting with the first <page> tag, - prefacing first with the <mediawiki> header from - the beginning of the file, up through </siteinfo>. - - note that we may lose some bytes from the very last - block if the blocks are bit shifted, because the - bzip crc at end of file will be wrong. (needs testing to - find a workaround, simply not feeding in the crc doesn't - suffice) - - for purposes of the XML dumps this is fine, since we use - this tool to generate prefetch data starting from - a given pageid, rather than needing to uncompress - gigabytes of data to get to the point in the file - we want. - - returns: - BZ_OK on success, various BZ_ errors otherwise. 
-*/ int main(int argc, char **argv) { int fin, res; off_t position; - if (argc != 3) { - fprintf(stderr,"usage: %s infile position\n", argv[0]); + if (argc < 2 || argc > 3) { + usage("Missing or bad options/arguments"); exit(-1); } + if (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) usage(NULL); + if (!strcmp(argv[1], "--version") || !strcmp(argv[1], "-v")) show_version(VERSION); + fin = open (argv[1], O_RDONLY); if (fin < 0) { fprintf(stderr,"failed to open file %s for read\n", argv[1]); diff --git a/xmldumps-backup/mwbzutils/dumplastbz2block.c b/xmldumps-backup/mwbzutils/dumplastbz2block.c index 34d5601..ab441ad 100644 --- a/xmldumps-backup/mwbzutils/dumplastbz2block.c +++ b/xmldumps-backup/mwbzutils/dumplastbz2block.c @@ -9,22 +9,52 @@ #include <inttypes.h> #include "mwbzutils.h" +void usage(char *message) { + char * help = +"Usage: dumplastbz2block [--version|--help]\n" +" or: dumplastbz2block <infile>\n\n" +"Find the last bz2 block marker in a file and dump whatever can be\n" +"decompressed after that point. The header of the file must be intact\n" +"in order for any output to be produced.\n" +"This will produce output for truncated files as well, as long as there\n" +"is 'enough' data after the block marker.\n" +"Exits with 0 if some decompressed data was written, 1 if no data could\n" +"be uncompressed and -1 on error.\n\n" +"Options:\n\n" +"Flags:\n\n" +" -h, --help Show this help message\n" +" -v, --version Display the version of this program and exit\n\n" +"Arguments:\n\n" +" <infile> Name of the file to process\n\n" +"Report bugs in dumplastbz2block to <https://bugzilla.wikimedia.org/>.\n\n" +"See also checkforbz2footer(1), dumpbz2filefromoffset(1), findpageidinbz2xml(1),\n" +"recompressxml(1), writeuptopageid(1)\n\n"; + if (message) { + fprintf(stderr,"%s\n\n",message); + } + fprintf(stderr,"%s",help); + exit(-1); +} -/* - Find the last bz2 block marker in a file - and dump whatever can be decompressed after - that point. 
The header of the file must - be intact in order for any output to be produced. - This will produce output for truncated files as well, - as long as there is "enough" data after the block - marker. +void show_version(char *version_string) { + char * copyright = +"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn. All rights reserved.\n\n" +"This program is free software: you can redistribute it and/or modify it\n" +"under the terms of the GNU General Public License as published by the\n" +"Free Software Foundation, either version 2 of the License, or (at your\n" +"option) any later version.\n\n" +"This program is distributed in the hope that it will be useful, but\n" +"WITHOUT ANY WARRANTY; without even the implied warranty of \n" +"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n" +"Public License for more details.\n\n" +"You should have received a copy of the GNU General Public License along\n" +"with this program. If not, see <http://www.gnu.org/licenses/>\n\n" + "Written by Ariel T. Glenn.\n"; + fprintf(stderr,"dumplastbz2block %s\n", version_string); + fprintf(stderr,"%s",copyright); + exit(-1); +} - Arguments: the name of the file to check, presumably - a bzipped file. - Outputs: the decompressed data at the end of the file. - Exits with 0 if decompression of some data can be done, - 1 if decompression fails, and -1 on error. 
-*/ int main(int argc, char **argv) { @@ -38,9 +68,12 @@ int length = 5000; /* output buffer size */ if (argc != 2) { - fprintf(stderr,"usage: %s infile\n", argv[0]); + usage("Missing option or argument."); exit(-1); } + + if (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) usage(NULL); + if (!strcmp(argv[1], "--version") || !strcmp(argv[1], "-v")) show_version(VERSION); fin = open (argv[1], O_RDONLY); if (fin < 0) { @@ -96,4 +129,3 @@ close(fin); exit(0); } - diff --git a/xmldumps-backup/mwbzutils/findpageidinbz2xml.c b/xmldumps-backup/mwbzutils/findpageidinbz2xml.c index f00da48..f403a8b 100644 --- a/xmldumps-backup/mwbzutils/findpageidinbz2xml.c +++ b/xmldumps-backup/mwbzutils/findpageidinbz2xml.c @@ -13,6 +13,63 @@ #include <zlib.h> #include "mwbzutils.h" +void usage(char *message) { + char * help = +"Usage: findpageidinbz2xml --filename file --pageid id [--stubfile] [--useapi] [--verbose]\n" +" [--help] [--version]\n\n" +"Show the offset of the bz2 block in the specified MediaWiki XML dump file\n" +"containing the given page id. 
This assumes that the bz2 header of the file\n" +"is intact and that page ids are steadily increasing throughout the file.\n\n" +"If the page id is found, a line in the following format will be written to stdout:\n" +" position:xxxxx pageid:nnn\n\n" +"where 'xxxxx' is the offset of the block from the beginning of the file, and\n" +"'nnn' is the id of the first page encountered in that block.\n\n" +"Note:\n" +"This program may use the MediaWiki api to find page ids from revision ids\n" +"if 'useapi' is specified.\n" +"It may use a stub file to find page ids from rev ids if 'stubfile' is specified.\n" +"It will only do one of the above if it has been reading from the file for some\n" +"large number of iterations without finding a page tag (some pages have > 500K\n" +"revisions and a heck of a lot of text).\n" +"If both 'useapi' and 'stubfile' are specified, the api will be used as it is faster.\n\n" +"Exits with 0 on success, -1 on error.\n\n" +"Options:\n\n" +" -f, --filename name of file to search\n" +" -p, --pageid page_id of page for which to search\n" +" -s, --stubfile name of MediaWiki XML stub file to fall back on (see 'Note' above)\n" +" -a, --useapi fall back to the api if stuck (see 'Note' above)\n" +" -v, --verbose show search process; specify multiple times for more output\n" +" -h, --help Show this help message\n" +" -V, --version Display the version of this program and exit\n\n" +"Report bugs in findpageidinbz2xml to <https://bugzilla.wikimedia.org/>.\n\n" +"See also checkforbz2footer(1), dumpbz2filefromoffset(1), dumplastbz2block(1),\n" + "recompressxml(1), writeuptopageid(1)\n\n"; + if (message) { + fprintf(stderr,"%s\n\n",message); + } + fprintf(stderr,"%s",help); + exit(-1); +} + +void show_version(char *version_string) { + char * copyright = +"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn.
All rights reserved.\n\n" +"This program is free software: you can redistribute it and/or modify it\n" +"under the terms of the GNU General Public License as published by the\n" +"Free Software Foundation, either version 2 of the License, or (at your\n" +"option) any later version.\n\n" +"This program is distributed in the hope that it will be useful, but\n" +"WITHOUT ANY WARRANTY; without even the implied warranty of \n" +"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n" +"Public License for more details.\n\n" +"You should have received a copy of the GNU General Public License along\n" +"with this program. If not, see <http://www.gnu.org/licenses/>\n\n" + "Written by Ariel T. Glenn.\n"; + fprintf(stderr,"findpageidinbz2xml %s\n", version_string); + fprintf(stderr,"%s",copyright); + exit(-1); +} + /* find the first bz2 block marker in the file, from its current position, @@ -484,36 +541,6 @@ } } - -void usage(char *whoami, char *message) { - if (message) { - fprintf(stderr,message); - } - fprintf(stderr,"usage: %s --filename file --pageid id [--stubfile] [--useapi] [--verbose]\n", whoami); - exit(1); -} - -/* - given a bzipped and possibly truncated file, and a page id, - hunt for the page id in the file; this assume that the - bz2 header is intact and that page ids are steadily increasing - throughout the file. 
- - writes the offset of the relevant block (from beginning of file) - and the first pageid found in that block, to stdout - - it may use the api to find page ids from rev ids if use_api is specified - it may use a stub file to find page ids from rev ids if stubfile is specified - it will only do these if it has been reading from awhile without - findind a page tag (some pages have > 500K revisions and a heck of - a lot of text) - if both use_api and stubfile are specified, we will use_api, it's faster - - format of output: - position:xxxxx pageid:nnn - - returns: 0 on success, -1 on error -*/ int main(int argc, char **argv) { int fin, res, page_id=0; off_t position, interval, file_size; @@ -529,20 +556,22 @@ struct option optvalues[] = { {"filename", 1, 0, 'f'}, + {"help", 0, 0, 'h'}, {"pageid", 1, 0, 'p'}, {"useapi", 0, 0, 'a'}, {"verbose", 0, 0, 'v'}, + {"version", 0, 0, 'V'}, {"stubfile", 1, 0, 's'}, {NULL, 0, NULL, 0} }; while (1) { - optc=getopt_long_only(argc,argv,"filename:pageid:useapi:stubfile:verbose", optvalues, &optindex); + optc=getopt_long_only(argc,argv,"filename:help:pageid:useapi:stubfile:verbose:version", optvalues, &optindex); if (optc=='f') { filename=optarg; } else if (optc=='p') { - if (!(isdigit(optarg[0]))) usage(argv[0],NULL); + if (!(isdigit(optarg[0]))) usage(NULL); page_id=atoi(optarg); } else if (optc=='a') @@ -551,18 +580,22 @@ use_stub=1; stubfile = optarg; } + else if (optc=='h') + usage(NULL); else if (optc=='v') verbose++; + else if (optc=='V') + show_version(VERSION); else if (optc==-1) break; - else usage(argv[0],"Unknown option or other error\n"); + else usage("Unknown option or other error\n"); } if (! filename || ! 
page_id) { - usage(argv[0],NULL); + usage(NULL); } if (page_id <1) { - usage(argv[0], "Please specify a page_id >= 1.\n"); + usage("Please specify a page_id >= 1.\n"); } fin = open (filename, O_RDONLY); diff --git a/xmldumps-backup/mwbzutils/recompressxml.c b/xmldumps-backup/mwbzutils/recompressxml.c index be6cc92..417cdb6 100644 --- a/xmldumps-backup/mwbzutils/recompressxml.c +++ b/xmldumps-backup/mwbzutils/recompressxml.c @@ -31,6 +31,54 @@ bz_stream strm_indx; +void usage(char *message) { + char * help = +"Usage: recompressxml --pagesperstream n [--buildindex filename] [--verbose]\n" +" or: recompressxml [--version|--help]\n\n" +"Reads a stream of XML pages from stdin and writes to stdout the bz2 compressed\n" +"data, one bz2 stream (header, blocks, footer) per specified number of pages.\n\n" +"Options:\n\n" +" -p, --pagesperstream: Compress this number of pages in each complete\n" +" bz2stream before opening a new stream. The siteinfo\n" +" header is written to a separate stream at the beginning\n" +" of all output, and the closing mediawiki tag is written\n" +" into a separate stream at the end.\n" +" -b, --buildindex: Generate a file containing an index of pages ids and titles\n" +" per stream. Each line contains: offset-to-stream:pageid:pagetitle\n" +" If filename ends in '.bz2' the file will be written in bz2 format.\n" +" -v, --verbose: Write lots of debugging output to stderr. 
This option can be used\n" +" multiple times to increase verbosity.\n" +" -h, --help Show this help message\n" +" -V, --version Display the version of this program and exit\n\n" +"Report bugs in recompressxml to <https://bugzilla.wikimedia.org/>.\n\n" +"See also checkforbz2footer(1), dumpbz2filefromoffset(1), dumplastbz2block(1),\n" +"findpageidinbz2xml(1), writeuptopageid(1)\n\n"; + if (message) { + fprintf(stderr,"%s\n\n",message); + } + fprintf(stderr,"%s",help); + exit(-1); +} + +void show_version(char *version_string) { + char * copyright = +"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn. All rights reserved.\n\n" +"This program is free software: you can redistribute it and/or modify it\n" +"under the terms of the GNU General Public License as published by the\n" +"Free Software Foundation, either version 2 of the License, or (at your\n" +"option) any later version.\n\n" +"This program is distributed in the hope that it will be useful, but\n" +"WITHOUT ANY WARRANTY; without even the implied warranty of \n" +"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n" +"Public License for more details.\n\n" +"You should have received a copy of the GNU General Public License along\n" +"with this program. If not, see <http://www.gnu.org/licenses/>\n\n" + "Written by Ariel T.
Glenn.\n"; + fprintf(stderr,"recompressxml %s\n", version_string); + fprintf(stderr,"%s",copyright); + exit(-1); +} + void setupIndexBz2Stream() { int bz_verbosity = 0; int bz_workFactor = 0; @@ -257,27 +305,6 @@ return; } -void usage(char *whoami, char *message) { - if (message) { - fprintf(stderr,"%s",message); - } - fprintf(stderr,"Usage: %s --pagesperstream n [--buildindex indexfilename] [--verbose]\n\n", whoami); - fprintf(stderr,"Reads a stream of XML pages from stdin,\n"); - fprintf(stderr,"and writes to stdout the bz2 compressed\n"); - fprintf(stderr,"data, one bz2 stream per count pages.\n\n"); - fprintf(stderr,"Options:\n"); - fprintf(stderr,"pagesperstream: compress this many pages in each complete bz2stream before\n"); - fprintf(stderr," opening a new stream. The siteinfo header is written to a\n"); - fprintf(stderr," separate stream at the beginning of all output, and the closing\n"); - fprintf(stderr," mediawiki tag is written into a separate stream at the end.\n"); - fprintf(stderr,"buildindex: generate a file containing an index of pages ids and titles\n"); - fprintf(stderr," per stream. Each line contains: offset-to-stream:pageid:pagetitle\n"); - fprintf(stderr," If filename ends in '.bz2' the file will be written in bz2 format.\n"); - fprintf(stderr,"verbose: produce lots of debugging output to stderr. 
This option can be used\n"); - fprintf(stderr," multiple times to increase verbosity.\n"); - exit(-1); -} - int main(int argc, char **argv) { int optindex=0; int optc; @@ -285,8 +312,10 @@ struct option optvalues[] = { {"buildindex", 1, 0, 'b'}, + {"help", 0, 0, 'h'}, {"pagesperstream", 1, 0, 'p'}, {"verbose", 0, 0, 'v'}, + {"version", 0, 0, 'V'}, {NULL, 0, NULL, 0} }; @@ -301,18 +330,22 @@ if (optc=='b') { indexFilename = optarg; } + else if (optc=='h') + usage(NULL); else if (optc=='p') { - if (!(isdigit(optarg[0]))) usage(argv[0],NULL); + if (!(isdigit(optarg[0]))) usage(NULL); count=atoi(optarg); } else if (optc=='v') verbose++; + else if (optc=='V') + show_version(VERSION); else if (optc==-1) break; - else usage(argv[0],"unknown option or other error\n"); + else usage("unknown option or other error\n"); } if (count <= 0) { - usage(argv[0],"bad or no argument given for count.\n"); + usage("bad or no argument given for count.\n"); } if (indexFilename) { @@ -321,7 +354,7 @@ } indexfd = fopen(indexFilename, "w"); if (! indexfd) { - usage(argv[0],"failed to open index file for write.\n"); + usage("failed to open index file for write.\n"); } if (!strcmp(indexFilename+(strlen(indexFilename)-4),".bz2")) { if (verbose) { diff --git a/xmldumps-backup/mwbzutils/writeuptopageid.c b/xmldumps-backup/mwbzutils/writeuptopageid.c index ea608df..4df5c99 100644 --- a/xmldumps-backup/mwbzutils/writeuptopageid.c +++ b/xmldumps-backup/mwbzutils/writeuptopageid.c @@ -10,13 +10,52 @@ namespaces will one project want? */ #define MAXHEADERLEN 524289 -void usage(char *me) { - fprintf(stderr,"Usage: %s startPageID [endPageID]\n",me); - fprintf(stderr,"Copies the contents of an XML file starting with and including startPageID\n"); - fprintf(stderr,"and up to but not including endPageID. 
This program is used in processing XML\n"); - fprintf(stderr,"dump files that were only partially written, as well as in writing partial\n"); - fprintf(stderr,"stub files for reruns of those dump files.\n"); - fprintf(stderr,"If endPageID is ommitted, all pages starting from startPageID will be copied.\n"); +void usage(char *message) { + char * help = +"Usage: writeuptopageid [--version|--help]\n" +" or: writeuptopageid <startpageid> <endpageid>\n\n" +"Reads a MediaWiki XML file from stdin and writes a range of pages from the file\n" +"to stdout, starting with and including the startpageid, up to but not including\n" +"the endpageid.\n" +"This program can be used in processing XML dump files that were only partially\n" +"written, as well as in writing partial stub files for reruns of those dump files.\n" +"If endPageID is omitted, all pages starting from startPageID will be copied.\n\n" +"Options:\n\n" +"Flags:\n\n" +" -h, --help Show this help message\n" +" -v, --version Display the version of this program and exit\n\n" +"Arguments:\n\n" +" <startpageid> id of the first page to write\n" +" <endpageid> id of the page at which to stop writing; if omitted, all pages through eof\n" +" will be written\n\n" +"Report bugs in writeuptopageid to <https://bugzilla.wikimedia.org/>.\n\n" +"See also checkforbz2footer(1), dumpbz2filefromoffset(1), dumplastbz2block(1),\n" + "findpageidinbz2xml(1), recompressxml(1)\n\n"; + if (message) { + fprintf(stderr,"%s\n\n",message); + } + fprintf(stderr,"%s",help); + exit(-1); +} + + +void show_version(char *version_string) { + char * copyright = +"Copyright (C) 2011, 2012, 2013 Ariel T. Glenn.
All rights reserved.\n\n" +"This program is free software: you can redistribute it and/or modify it\n" +"under the terms of the GNU General Public License as published by the\n" +"Free Software Foundation, either version 2 of the License, or (at your\n" +"option) any later version.\n\n" +"This program is distributed in the hope that it will be useful, but\n" +"WITHOUT ANY WARRANTY; without even the implied warranty of \n" +"MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General\n" +"Public License for more details.\n\n" +"You should have received a copy of the GNU General Public License along\n" +"with this program. If not, see <http://www.gnu.org/licenses/>\n\n" + "Written by Ariel T. Glenn.\n"; + fprintf(stderr,"writeuptopageid %s\n", version_string); + fprintf(stderr,"%s",copyright); + exit(-1); } /* note that even if we have only read a partial line @@ -131,9 +170,12 @@ char mem[MAXHEADERLEN]; if (argc < 2 || argc > 3) { - usage(argv[0]); + usage(NULL); exit(-1); } + + if (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-h")) usage(NULL); + if (!strcmp(argv[1], "--version") || !strcmp(argv[1], "-v")) show_version(VERSION); errno = 0; startPageID = strtol(argv[1], &nonNumeric, 10); @@ -141,8 +183,7 @@ *nonNumeric != 0 || nonNumeric == (char *) &startPageID || errno != 0) { - fprintf (stderr,"The value you entered for startPageID must be a positive integer.\n"); - usage(argv[0]); + usage("The value you entered for startPageID must be a positive integer."); exit(-1); } if (argc == 3) { @@ -151,8 +192,7 @@ *nonNumeric != 0 || nonNumeric == (char *) &endPageID || errno != 0) { - fprintf (stderr,"The value you entered for endPageID must be a positive integer.\n"); - usage(argv[0]); + usage("The value you entered for endPageID must be a positive integer.\n"); exit(-1); } } -- To view, visit https://gerrit.wikimedia.org/r/72005 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: 
Id7ddd9edb5b2e22f896166a23cf49d28a010007b Gerrit-PatchSet: 1 Gerrit-Project: operations/dumps Gerrit-Branch: ariel Gerrit-Owner: ArielGlenn <ar...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits