ArielGlenn has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/350255 )

Change subject: Add option to skip specified namespaces
......................................................................

Add option to skip specified namespaces

[WIP] Untested

This is useful e.g. for the LiquidThreads namespaces; it can be
used for other namespaces as well.
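
Example invocation (taken from the help text added below; namespaces
90-92 are used by LiquidThreads on some wikis):

    mwxml2sql -s imports/enwikinews-20170401-stub-meta-current.xml.gz \
              -t imports/enwikinews-20170401-pages-meta-current.xml.bz2 \
              -f outfiles/enwikinews-20170401-current.sql.gz -m 1.29 -S 90,91,92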

Bug: T68661
Change-Id: I2331ab9c243f0be31dc443557683320f3ddfb329
---
M xmlfileutils/mwxml2sql.c
M xmlfileutils/mwxml2sql.h
M xmlfileutils/mwxmlelts.c
3 files changed, 66 insertions(+), 4 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/dumps/import-tools refs/changes/55/350255/1

diff --git a/xmlfileutils/mwxml2sql.c b/xmlfileutils/mwxml2sql.c
index 3c162ea..ffdaa1b 100644
--- a/xmlfileutils/mwxml2sql.c
+++ b/xmlfileutils/mwxml2sql.c
@@ -203,6 +203,19 @@
 "        multiple times to increase verbosity.\n"
 "  -V, --version\n"
 "        Write version information to stderr; and exit.\n\n"
+"\n"
+"Examples:\n\n"
+"    mwxml2sql -s imports/elwiki-20170401-stub-meta-history.xml.gz\n"
+"            -t imports/elwiki-20170401-pages-meta-history.xml.bz2\n"
+"            -f outfiles/elwiki-20170401-history.sql.gz -m 1.29\n"
+"This will generate pages, revs, text table sql files for import, using the\n"
+"full history files you have presumably already downloaded into the 
'imports'\n"
+"directory, leaving the output files in the 'outfiles' directory.\n"
+"    mwxml2sql -s imports/enwikinews-20170401-stub-meta-current.xml.gz\n"
+"              -t imports/enwikinews-20170401-pages-meta-current.xml.bz2\n"
+"              -f outfiles/enwikinews-20170401-current.sql.gz -m 1.29 -S 
90,91,92\n"
+"This command does the same but it skips items in namespaces 90, 91 and 92,\n"
+"which are used by LiquidThreads on some wikis.\n"
 "Report bugs in mwxml2sql to <https://phabricator.wikimedia.org/>.\n\n"
 "See also sql2txt(1), sqlfilter(1).\n\n";
   if (message) {
@@ -246,6 +259,13 @@
   char *mw_version = NULL;
   mw_version_t *mwv = NULL;
 
+  char *nstoskip = NULL;
+  int *ns_list = NULL;
+  int empty_list[1];
+  char *ns_no_commas = NULL;
+  int howmany = 0;
+  int i = 0;
+
   int pages_done = 0;
   int eof = 0;
   
@@ -263,6 +283,7 @@
     {"mysqlfile", required_argument, NULL, 'f'},
     {"mediawiki", required_argument, NULL, 'm'},
     {"nodrop", no_argument, NULL, 'n'},
+    {"nstoskip", required_argument, NULL, 'S'},
     {"pageid", required_argument, NULL, 'i'},
     {"stubs", required_argument, NULL, 's'},
     {"tableprefix", required_argument, NULL, 'p'},
@@ -273,7 +294,7 @@
   };
 
   while (1) {
-    optc=getopt_long(argc,argv,"cf:hi:m:np:s:t:vV", optvalues, &optindex);
+    optc=getopt_long(argc,argv,"cf:hi:m:np:s:S:t:vV", optvalues, &optindex);
     if (optc==-1) break;
 
     switch(optc) {
@@ -300,6 +321,9 @@
       break;
     case 's':
       stubs_file = optarg;
+      break;
+    case 'S':
+      nstoskip = optarg;
       break;
     case 't':
       text_file = optarg;
@@ -330,6 +354,34 @@
 
   if (!mw_version) {
     usage(argv[0], "missing required 'mediawiki' option");
+  }
+
+  if (nstoskip) {
+    /* expect a comma-separated string; split it and turn the pieces
+       into ints */
+    /* strtok destroys its input, so we can't tokenize it twice; count
+       commas instead to size the array: tokens = commas + 1, plus one
+       slot for the -1 end-of-list marker */
+    howmany = 2;
+    for (i = 0; nstoskip[i]; i++)
+      if (nstoskip[i] == ',') howmany++;
+    ns_list = (int *) malloc(sizeof(int) * howmany);
+    if (!ns_list) {
+      fprintf(stderr,"Failed to get memory for list of namespaces to skip\n");
+      exit(1);
+    }
+    i = 0;
+    ns_no_commas = strtok(nstoskip, ",");
+    while (ns_no_commas != NULL) {
+      ns_list[i++] = atoi(ns_no_commas);
+      ns_no_commas = strtok(NULL, ",");
+    }
+    ns_list[i] = -1; /* mark the end of the list */
+  }
+  else {
+    /* no namespaces to skip, we do them all */
+    empty_list[0] = -1;
+    ns_list = empty_list;
   }
   mwv = check_mw_version(mw_version);
   if (!mwv) {
@@ -419,7 +471,7 @@
   }
 
   while (! eof) {
-    result = do_page(stubs, text, text_compress, mysql_page, mysql_revs, mysql_text, s_info, verbose, tables, nodrop, start_page_id);
+    result = do_page(stubs, text, text_compress, mysql_page, mysql_revs, mysql_text, s_info, verbose, tables, nodrop, start_page_id, ns_list);
     if (!result) break;
     pages_done++;
     if (verbose && !(pages_done%1000)) fprintf(stderr,"%d pages processed\n", pages_done);
diff --git a/xmlfileutils/mwxml2sql.h b/xmlfileutils/mwxml2sql.h
index 189d506..dd4086e 100644
--- a/xmlfileutils/mwxml2sql.h
+++ b/xmlfileutils/mwxml2sql.h
@@ -244,7 +244,7 @@
 int do_contributor(input_file_t *f, contributor_t *c, int verbose);
 int do_text(input_file_t *f,  output_file_t *sqlt, revision_t *r, int verbose, tablenames_t *t, int insrt_ignore, int get_sha1, int get_text_len, int text_commpress);
 int do_revision(input_file_t *stubs, input_file_t *text, int text_compress, output_file_t *sqlp, output_file_t *sqlr, output_file_t *sqlt, page_t *p, int verbose, tablenames_t *t, int insert_ignore);
-int do_page(input_file_t *stubs, input_file_t *text, int text_compress, output_file_t *sqlp, output_file_t *sqlr, output_file_t *sqlt, siteinfo_t *s_info, int verbose, tablenames_t *t, int insert_ignore, char *start_page_id);
+int do_page(input_file_t *stubs, input_file_t *text, int text_compress, output_file_t *sqlp, output_file_t *sqlr, output_file_t *sqlt, siteinfo_t *s_info, int verbose, tablenames_t *t, int insert_ignore, char *start_page_id, int *ns_list);
 int do_namespace(input_file_t *f, namespace_t *n, int verbose);
 int do_namespaces(input_file_t *f, siteinfo_t *s, int verbose);
 int do_siteinfo(input_file_t *f, siteinfo_t **s, int verbose);
diff --git a/xmlfileutils/mwxmlelts.c b/xmlfileutils/mwxmlelts.c
index 083bba6..aee48fa 100644
--- a/xmlfileutils/mwxmlelts.c
+++ b/xmlfileutils/mwxmlelts.c
@@ -994,12 +994,13 @@
        is successfully read
 */
 
-int do_page(input_file_t *stubs, input_file_t *text, int text_compress, output_file_t *sqlp, output_file_t *sqlr, output_file_t *sqlt, siteinfo_t *s, int verbose, tablenames_t *t, int insert_ignore, char*start_page_id) {
+int do_page(input_file_t *stubs, input_file_t *text, int text_compress, output_file_t *sqlp, output_file_t *sqlr, output_file_t *sqlt, siteinfo_t *s, int verbose, tablenames_t *t, int insert_ignore, char *start_page_id, int *ns_to_skip) {
   page_t p;
   char out_buf[1024]; /* seriously how long can username plus title plus the rest of the cruft be? */
   int want_text = 0;
   char escaped_title[FIELD_LEN*2];
   int skip = 0;
+  int i = 0;
 
   p.title[0] = '\0';
   p.ns[0] = '\0';
@@ -1051,6 +1052,15 @@
       else if (strlen(start_page_id) < strlen(p.id)) skip=0;
       else if (strcmp(start_page_id, p.id) > 0) skip = 1;
     }
+    /* skip namespaces specified by the user; the list ends with -1 */
+    i = 0;
+    while (ns_to_skip[i] >= 0) {
+      if (atoi(p.ns) == ns_to_skip[i]) {
+        skip = 1;
+        break;
+      }
+      i++;
+    }
     if (skip) {
       if (verbose > 1) fprintf(stderr,"skipping page %s by user request\n", p.id);
       while (1) {

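As a quick illustration for reviewers, here is a minimal standalone
sketch of the parsing and lookup scheme the patch uses: a comma-separated
option string is turned into an int array terminated by -1, which pages
are then checked against. The helper names parse_ns_list and
ns_is_skipped are illustrative only and do not exist in mwxml2sql.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* split a comma-separated namespace spec into a -1-terminated int array;
   returns NULL on allocation failure; modifies its argument via strtok */
static int *parse_ns_list(char *spec) {
  int howmany = 2; /* at least one token, plus the -1 end marker */
  int i;
  int *list;
  char *tok;

  for (i = 0; spec[i]; i++)
    if (spec[i] == ',') howmany++;
  list = (int *) malloc(sizeof(int) * howmany);
  if (!list) return NULL;
  i = 0;
  for (tok = strtok(spec, ","); tok != NULL; tok = strtok(NULL, ","))
    list[i++] = atoi(tok);
  list[i] = -1; /* end-of-list marker, as in the patch */
  return list;
}

/* return 1 if ns appears in the -1-terminated list, else 0 */
static int ns_is_skipped(int ns, int *ns_list) {
  int i;
  for (i = 0; ns_list[i] >= 0; i++)
    if (ns_list[i] == ns) return 1;
  return 0;
}

int main(void) {
  char spec[] = "90,91,92"; /* LiquidThreads namespaces on some wikis */
  int *ns_list = parse_ns_list(spec);

  if (!ns_list) return 1;
  printf("ns 91 skipped: %d\n", ns_is_skipped(91, ns_list)); /* prints 1 */
  printf("ns 0 skipped: %d\n", ns_is_skipped(0, ns_list));   /* prints 0 */
  free(ns_list);
  return 0;
}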
-- 
To view, visit https://gerrit.wikimedia.org/r/350255
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I2331ab9c243f0be31dc443557683320f3ddfb329
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps/import-tools
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <ar...@wikimedia.org>
Gerrit-Reviewer: Wpmirrordev <wpmirror...@gmail.com>
