ArielGlenn has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/347625 )

Change subject: updated for support up through MW 1.29
......................................................................


updated for support up through MW 1.29

thanks to patches from Wpmirrordev for part of this update

add dbname, page_links_updated, page_lang fields; account
for different field orders in xml; do initialization of
revision fields at beginning of retrieval

Change-Id: I90d9e5f113c61940cade3847e856541a26791c2b
---
M xmlfileutils/CHANGELOG
M xmlfileutils/Makefile
M xmlfileutils/mwxml2sql.c
M xmlfileutils/mwxml2sql.h
M xmlfileutils/mwxmlelts.c
M xmlfileutils/sqlutils.c
6 files changed, 133 insertions(+), 35 deletions(-)

Approvals:
  ArielGlenn: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/xmlfileutils/CHANGELOG b/xmlfileutils/CHANGELOG
index 1ac5c8a..23bb437 100644
--- a/xmlfileutils/CHANGELOG
+++ b/xmlfileutils/CHANGELOG
@@ -1,3 +1,11 @@
+10 April 2017 Ariel T. Glenn <[email protected]>
+
+   Release 0.0.3
+
+    Updated for compatibility with MediaWiki up through
+    version 1.29; bump version. Thanks to Kent L. Miller
+    for earlier patches that updated through MW 1.24.
+
 29 May 2013 Kent L. Miller <[email protected]>
 
    Release 0.0.2-1
diff --git a/xmlfileutils/Makefile b/xmlfileutils/Makefile
index 78dc755..7f0f770 100644
--- a/xmlfileutils/Makefile
+++ b/xmlfileutils/Makefile
@@ -1,9 +1,9 @@
 # ------------------------------------------------------------------
 # This project is released under the GPL, copyright (C) Ariel T. Glenn
-# 2013: see the file COPYING for details.
+# 2013 - 2017: see the file COPYING for details.
 # ------------------------------------------------------------------
 
-MWXML2SQLVERS  = "0.0.2"
+MWXML2SQLVERS  = "0.0.3"
 PROGRAM        = mediawiki-mwxml2sql
 DISTNAME       = $(PROGRAM)-$(MWXML2SQLVERS)
 NAME_MWXML2SQL = "convert MediaWiki XML dump file to MySQL INSERT commands"
diff --git a/xmlfileutils/mwxml2sql.c b/xmlfileutils/mwxml2sql.c
index bb5f608..3c162ea 100644
--- a/xmlfileutils/mwxml2sql.c
+++ b/xmlfileutils/mwxml2sql.c
@@ -120,9 +120,9 @@
     comma = strchr(start, ',');
     if (comma) *comma = '\0';
     else last++;
-    /* we know MW 1.5 through MW 1.21 even though there is no MW 1.21 yet */
+    /* we know MW 1.5 through MW 1.29 even though there is no MW 1.29 yet */
     sscanf(start, "%u.%u%20s", &mwv->major, &mwv->minor, mwv->qualifier);
-    if (mwv->major != 1 || mwv->minor < 5 || mwv->minor > 21) {
+    if (mwv->major != 1 || mwv->minor < 5 || mwv->minor > 29) {
       free_mw_version(mwv);
       return(NULL);
     }
diff --git a/xmlfileutils/mwxml2sql.h b/xmlfileutils/mwxml2sql.h
index 1223d6b..189d506 100644
--- a/xmlfileutils/mwxml2sql.h
+++ b/xmlfileutils/mwxml2sql.h
@@ -59,6 +59,7 @@
 
 typedef struct {
   char sitename[FIELD_LEN];
+  char dbname[FIELD_LEN];
   char base[FIELD_LEN];
   char generator[FIELD_LEN];
   char s_case[FIELD_LEN];
@@ -96,9 +97,11 @@
   char redirect[2];
   char restrictions[FIELD_LEN];
   char touched[FIELD_LEN]; /* from rev_timestamp */
+  char links_updated[FIELD_LEN];   /* if not present, set to NULL */
   char latest[MAX_ID_LEN];  /* from rev_id */
   char len[FIELD_LEN];     /* from text_len */
   char model[FIELD_LEN];   /* if not present, set to NULL */
+  char lang[FIELD_LEN];   /* if not present, set to NULL */
   revision_t ** revs;
 } page_t;
 
@@ -149,6 +152,7 @@
 /* tags we recognize */
 #define BASE "base"
 #define CASE "case"
+#define DBNAME "dbname"
 #define COMMENT "comment"
 #define CONTRIBUTOR "contributor"
 #define FORMAT "format"
diff --git a/xmlfileutils/mwxmlelts.c b/xmlfileutils/mwxmlelts.c
index df8ac36..de0c475 100644
--- a/xmlfileutils/mwxmlelts.c
+++ b/xmlfileutils/mwxmlelts.c
@@ -580,6 +580,12 @@
   c.id[0] = '\0';
 
   r.text = NULL;
+  r.comment[0] = '\0';
+  r.sha1[0] = '\0';
+  r.model[0] = '\0';
+  r.format[0] = '\0';
+  r.text_id[0] = '\0';
+  r.text_len[0] = '\0';
 
   get_elt_with_attrs(stubs, ID, r.id, sizeof(r.id), NULL, 0);
   if (get_line(stubs) == NULL) {
@@ -624,7 +630,6 @@
     r.minor[1]='\0';
   }
 
-  r.comment[0] = '\0';
   if (get_elt_with_attrs(stubs, COMMENT, r.comment, sizeof(r.comment), NULL, 
0) != -1) {
     if (get_line(stubs) == NULL) {
       whine("abrupt end of revision data in rev id %s", r.id);
@@ -632,8 +637,6 @@
     }
   }
   un_xml_escape(r.comment, NULL, 1);
-  r.text_id[0] = '\0';
-  r.text_len[0] = '\0';
 
   /* schema 0.7 has sha1 then text, earlier schema don't have it at all so 
look for it here optionally */
   if (get_elt_with_attrs(stubs, SHA1, r.sha1, sizeof(r.sha1), NULL, 0) != -1) {
@@ -643,9 +646,22 @@
     }
   }
 
+  /* schema 0.10 has model, format before text, sha1 */
+  if (get_elt_with_attrs(stubs, MODEL, r.model, sizeof(r.model), NULL, 0) != 
-1) {
+    if (get_line(stubs) == NULL) {
+      whine("abrupt end of revision data in rev id %s", r.id);
+      return(0);
+    }
+  }
+  if (get_elt_with_attrs(stubs, FORMAT, r.format, sizeof(r.format), NULL, 0) 
!= -1) {
+    if (get_line(stubs) == NULL) {
+      whine("abrupt end of revision data in rev id %s", r.id);
+      return(0);
+    }
+  }
+
   /*       <text id="382338088" bytes="57" />  */
   get_elt_with_attrs(stubs, TEXT, NULL, 0, attrs, MAX_ATTRS_STR_LEN);
-
   if (verbose > 1) fprintf(stderr,"text tag found, %s\n", attrs);
   attrs_ptr = attrs;
   while (1) {
@@ -680,14 +696,11 @@
     if (!todo) break;
     else attrs_ptr = todo;
   }
-
+  /* prep buffer after text end */
   if (get_line(stubs) == NULL) {
     whine("abrupt end of revision data in rev id %s", r.id);
     return(0);
   }
-
-  r.model[0] = '\0';
-  r.format[0] = '\0';
 
   /* schema 0.8 and later have sha1 here after text */
   if (! r.sha1[0]) {
@@ -698,17 +711,22 @@
       }
     }
   }
-  /* schema 0.8 and later have model and format */
-  if (get_elt_with_attrs(stubs, MODEL, r.model, sizeof(r.model), NULL, 0) != 
-1) {
-    if (get_line(stubs) == NULL) {
-      whine("abrupt end of revision data in rev id %s", r.id);
-      return(0);
+
+  /* schema 0.8 and 0.9 have model and format after text/sha1 */
+  if (! r.model[0]) {
+    if (get_elt_with_attrs(stubs, MODEL, r.model, sizeof(r.model), NULL, 0) != 
-1) {
+      if (get_line(stubs) == NULL) {
+       whine("abrupt end of revision data in rev id %s", r.id);
+       return(0);
+      }
     }
   }
-  if (get_elt_with_attrs(stubs, FORMAT, r.format, sizeof(r.format), NULL, 0) 
!= -1) {
-    if (get_line(stubs) == NULL) {
-      whine("abrupt end of revision data in rev id %s", r.id);
-      return(0);
+  if (! r.format[0]) {
+    if (get_elt_with_attrs(stubs, FORMAT, r.format, sizeof(r.format), NULL, 0) 
!= -1) {
+      if (get_line(stubs) == NULL) {
+       whine("abrupt end of revision data in rev id %s", r.id);
+       return(0);
+      }
     }
   }
 
@@ -716,7 +734,6 @@
     whine("no rev end tag for rev id %s", r.id);
     return(0);
   }
-
   /* 
      If schema is earlier than 0.5 or for some other reason we don't
      hve the bytes attr in the text tag, AND we aren't reading the
@@ -941,8 +958,10 @@
   p.redirect[1] = '\0';
   p.restrictions[0] = '\0';
   p.touched[0] = '\0';
+  p.links_updated[0] = '\0';
   p.latest[0] = '\0';
   p.model[0] = '\0';
+  p.lang[0] = '\0';
 
   if (get_start_tag(stubs, PAGE) == -1) return(0);
 
@@ -1058,17 +1077,33 @@
     if (verbose > 2) fprintf(stderr,"(%s) %s",t->page, out_buf);
 
     snprintf(out_buf, sizeof(out_buf), "INSERT %s INTO %s \
-(page_id, page_namespace, page_title, page_restrictions, \
-page_counter, page_is_redirect, page_is_new, \
-page_random, page_touched, page_latest, page_len", insert_ignore?"IGNORE":"", 
t->page);
+(page_id, page_namespace, page_title, page_restrictions", 
insert_ignore?"IGNORE":"", t->page);
+    put_line_all(sqlp, out_buf);
+    if (verbose > 2) fprintf(stderr,"(%s) %s",t->page, out_buf);
+
+    snprintf(out_buf, sizeof(out_buf), ", page_counter");
+    write_if_mwv(sqlp, 0,0,1,25,out_buf, verbose);
+
+    snprintf(out_buf, sizeof(out_buf), ", page_is_redirect, page_is_new, 
page_random, page_touched");
+    put_line_all(sqlp, out_buf);
+    if (verbose > 2) fprintf(stderr,"(%s) %s",t->page, out_buf);
+
+    snprintf(out_buf, sizeof(out_buf), ", page_links_updated");
+    write_if_mwv(sqlp, 1,23,0,0,out_buf, verbose);
+
+    snprintf(out_buf, sizeof(out_buf), ", page_latest, page_len");
     put_line_all(sqlp, out_buf);
     if (verbose > 2) fprintf(stderr,"(%s) %s",t->page, out_buf);
 
     snprintf(out_buf, sizeof(out_buf), ", page_content_model");
     write_if_mwv(sqlp, 1,20,0,0,out_buf, verbose);
 
+    snprintf(out_buf, sizeof(out_buf), ", page_lang");
+    write_if_mwv(sqlp, 1,23,0,0,out_buf, verbose);
+
     strcpy(out_buf, ") VALUES\n");
     put_line_all(sqlp, out_buf);
+    if (verbose > 2) fprintf(stderr,"(%s) %s",t->page, out_buf);
 
   }
   else {
@@ -1079,17 +1114,38 @@
   /* fixme having a fixed size buffer kinda sucks here */
   /* text: page_title page_restrictions page_touched */
   snprintf(out_buf, sizeof(out_buf),                           \
-       "(%s, %s, '%s', '%s', %s, %s, %s, %.14f, '%s', %s, %s", \
-          p.id, p.ns, escaped_title, p.restrictions,           \
-          "0", p.redirect, "0", drand48(), p.touched, p.latest, p.len );
+          "(%s, %s, '%s', '%s'",
+          p.id, p.ns, escaped_title, p.restrictions);
   put_line_all(sqlp, out_buf);
   if (verbose > 2) fprintf(stderr,"(%s) %s",t->page, out_buf);
 
+  /* p->counter */
+  snprintf(out_buf, sizeof(out_buf), ", 0");
+  write_if_mwv(sqlp, 0, 0, 1, 25, out_buf, verbose);
+
+  snprintf(out_buf, sizeof(out_buf), ", %s, %s, %.14f, '%s'",  \
+          p.redirect, "0", drand48(), p.touched);
+  put_line_all(sqlp, out_buf);
+  if (verbose > 2) fprintf(stderr,"%s", out_buf);
+
+  strcpy(out_buf, ", ");
+  write_if_mwv(sqlp, 1, 24, 0, 0, out_buf, verbose);
+  copy_sql_field(out_buf, p.links_updated[0]?p.links_updated:NULL, 1, 1);
+  write_if_mwv(sqlp, 1, 24, 0, 0, out_buf, verbose);
+
+  snprintf(out_buf, sizeof(out_buf), ", %s, %s", p.latest, p.len);
+  put_line_all(sqlp, out_buf);
+  if (verbose > 2) fprintf(stderr,"%s", out_buf);
+
   strcpy(out_buf, ", ");
   write_if_mwv(sqlp, 1, 20, 0, 0, out_buf, verbose);
-
   copy_sql_field(out_buf, p.model[0]?p.model:NULL, 1, 1);
   write_if_mwv(sqlp, 1, 20, 0, 0, out_buf, verbose);
+
+  strcpy(out_buf, ", ");
+  write_if_mwv(sqlp, 1, 23, 0, 0, out_buf, verbose);
+  copy_sql_field(out_buf, p.lang[0]?p.lang:NULL, 1, 1);
+  write_if_mwv(sqlp, 1, 23, 0, 0, out_buf, verbose);
 
   if (page_rows_written == MAX_PAGE_BATCH) {
     strcpy(out_buf,");\nCOMMIT;\n");
@@ -1334,6 +1390,10 @@
     exit(1);
   }
   s->sitename[0] = s->base[0] = s->generator[0] = s->s_case[0] = '\0';
+  memset(s->sitename, 0, sizeof(s->sitename));
+  memset(s->dbname, 0, sizeof(s->dbname));
+  memset(s->base, 0, sizeof(s->base));
+  memset(s->generator, 0, sizeof(s->generator));
   s->namespaces = NULL;
 
   if (s_info) *s_info = s;
@@ -1352,8 +1412,19 @@
     whine("abrupt end to siteinfo");
     return(0);
   }
-  result = get_elt_with_attrs(f, BASE, s->base, sizeof(s->base), NULL, 0);
 
+  result = get_elt_with_attrs(f, DBNAME, s->dbname, sizeof(s->dbname), NULL, 
0);
+
+  if (result != -1) {
+    /* this xml file has the dbname tag in it (not all do);
+       get the next line. */
+    if (get_line(f) == NULL) {
+      whine("abrupt end to siteinfo");
+      return(0);
+    }
+  }
+
+  result = get_elt_with_attrs(f, BASE, s->base, sizeof(s->base), NULL, 0);
   if (get_line(f) == NULL) {
     whine("abrupt end to siteinfo");
     return(0);
diff --git a/xmlfileutils/sqlutils.c b/xmlfileutils/sqlutils.c
index 284501d..6245094 100644
--- a/xmlfileutils/sqlutils.c
+++ b/xmlfileutils/sqlutils.c
@@ -511,13 +511,18 @@
     else
       snprintf(out_buf, sizeof(out_buf), "`page_restrictions` tinyblob NOT 
NULL,\n");
     put_line(f, out_buf);
-    if (MWV_LESS(mwv,1,10))
+    if (MWV_LESS(mwv,1,10)) {
       snprintf(out_buf, sizeof(out_buf), "`page_counter` bigint(20) unsigned 
NOT NULL DEFAULT '0',\n");
-    else if (MWV_LESS(mwv,1,15))
+      put_line(f, out_buf);
+    }
+    else if (MWV_LESS(mwv,1,15)) {
       snprintf(out_buf, sizeof(out_buf), "`page_counter` bigint unsigned NOT 
NULL DEFAULT '0',\n");
-    else
+      put_line(f, out_buf);
+    }
+    else if (MWV_LESS(mwv,1,25)) {
       snprintf(out_buf, sizeof(out_buf), "`page_counter` bigint unsigned NOT 
NULL DEFAULT 0,\n");
-    put_line(f, out_buf);
+      put_line(f, out_buf);
+    }
     if (MWV_LESS(mwv,1,10))
       snprintf(out_buf, sizeof(out_buf), "`page_is_redirect` tinyint(1) 
unsigned NOT NULL DEFAULT '0',\n");
     else if (MWV_LESS(mwv,1,15))
@@ -541,6 +546,10 @@
     else
       snprintf(out_buf, sizeof(out_buf), "`page_touched` binary(14) NOT NULL 
DEFAULT '',\n");
     put_line(f, out_buf);
+    if (MWV_GREATER(mwv,1,22)) {
+      snprintf(out_buf, sizeof(out_buf), "`page_links_updated` varbinary(14) 
NULL DEFAULT NULL,\n");
+      put_line(f, out_buf);
+    }
     if (MWV_LESS(mwv,1,10))
       snprintf(out_buf, sizeof(out_buf), "`page_latest` int(8) unsigned NOT 
NULL,\n");
     else
@@ -553,6 +562,10 @@
     put_line(f, out_buf);
     if (MWV_GREATER(mwv, 1, 20)) {
       snprintf(out_buf, sizeof(out_buf), "`page_content_model` varbinary(32) 
DEFAULT NULL,\n");
+      put_line(f, out_buf);
+    }
+    if (MWV_GREATER(mwv, 1, 23)) {
+      snprintf(out_buf, sizeof(out_buf), "`page_lang` varbinary(35) DEFAULT 
NULL,\n");
       put_line(f, out_buf);
     }
     snprintf(out_buf, sizeof(out_buf), "PRIMARY KEY (`page_id`),\n");
@@ -598,8 +611,10 @@
     }
     if (MWV_LESS(mwv, 1, 9))
       snprintf(out_buf, sizeof(out_buf), "`rev_comment` tinyblob NOT NULL 
default '',\n");
-    else
+    else if (MWV_LESS(mwv, 1, 25))
       snprintf(out_buf, sizeof(out_buf), "`rev_comment` tinyblob NOT NULL,\n");
+    else
+      snprintf(out_buf, sizeof(out_buf), "`rev_comment` varbinary(767) NOT 
NULL,\n");
     put_line(f, out_buf);
     if (MWV_LESS(mwv, 1, 10))
       snprintf(out_buf, sizeof(out_buf), "`rev_user` int(5) unsigned NOT NULL 
DEFAULT '0',\n");

-- 
To view, visit https://gerrit.wikimedia.org/r/347625
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I90d9e5f113c61940cade3847e856541a26791c2b
Gerrit-PatchSet: 3
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to