ArielGlenn has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/347625 )
Change subject: updated for support up through MW 1.29 ...................................................................... updated for support up through MW 1.29 thanks to patches from Wpmirrordev for part of this update add dbname, page_links_updated, page_lang fields, account for different field orders in xml, do initialization of revision fields at beginning of retrieval Change-Id: I90d9e5f113c61940cade3847e856541a26791c2b --- M xmlfileutils/CHANGELOG M xmlfileutils/Makefile M xmlfileutils/mwxml2sql.c M xmlfileutils/mwxml2sql.h M xmlfileutils/mwxmlelts.c M xmlfileutils/sqlutils.c 6 files changed, 133 insertions(+), 35 deletions(-) Approvals: ArielGlenn: Looks good to me, approved jenkins-bot: Verified diff --git a/xmlfileutils/CHANGELOG b/xmlfileutils/CHANGELOG index 1ac5c8a..23bb437 100644 --- a/xmlfileutils/CHANGELOG +++ b/xmlfileutils/CHANGELOG @@ -1,3 +1,11 @@ +10 April 2017 Ariel T. Glenn <[email protected]> + + Release 0.0.3 + + Updated for compatibility with MediaWiki up through + version 1.29; bump version. Thanks to Kent L. Miller + for earlier patches that updated through MW 1.24. + 29 May 2013 Kent L. Miller <[email protected]> Release 0.0.2-1 diff --git a/xmlfileutils/Makefile b/xmlfileutils/Makefile index 78dc755..7f0f770 100644 --- a/xmlfileutils/Makefile +++ b/xmlfileutils/Makefile @@ -1,9 +1,9 @@ # ------------------------------------------------------------------ # This project is released under the GPL, copyright (C) Ariel T. Glenn -# 2013: see the file COPYING for details. +# 2013 - 2017: see the file COPYING for details. 
# ------------------------------------------------------------------ -MWXML2SQLVERS = "0.0.2" +MWXML2SQLVERS = "0.0.3" PROGRAM = mediawiki-mwxml2sql DISTNAME = $(PROGRAM)-$(MWXML2SQLVERS) NAME_MWXML2SQL = "convert MediaWiki XML dump file to MySQL INSERT commands" diff --git a/xmlfileutils/mwxml2sql.c b/xmlfileutils/mwxml2sql.c index bb5f608..3c162ea 100644 --- a/xmlfileutils/mwxml2sql.c +++ b/xmlfileutils/mwxml2sql.c @@ -120,9 +120,9 @@ comma = strchr(start, ','); if (comma) *comma = '\0'; else last++; - /* we know MW 1.5 through MW 1.21 even though there is no MW 1.21 yet */ + /* we know MW 1.5 through MW 1.29 even though there is no MW 1.29 yet */ sscanf(start, "%u.%u%20s", &mwv->major, &mwv->minor, mwv->qualifier); - if (mwv->major != 1 || mwv->minor < 5 || mwv->minor > 21) { + if (mwv->major != 1 || mwv->minor < 5 || mwv->minor > 29) { free_mw_version(mwv); return(NULL); } diff --git a/xmlfileutils/mwxml2sql.h b/xmlfileutils/mwxml2sql.h index 1223d6b..189d506 100644 --- a/xmlfileutils/mwxml2sql.h +++ b/xmlfileutils/mwxml2sql.h @@ -59,6 +59,7 @@ typedef struct { char sitename[FIELD_LEN]; + char dbname[FIELD_LEN]; char base[FIELD_LEN]; char generator[FIELD_LEN]; char s_case[FIELD_LEN]; @@ -96,9 +97,11 @@ char redirect[2]; char restrictions[FIELD_LEN]; char touched[FIELD_LEN]; /* from rev_timestamp */ + char links_updated[FIELD_LEN]; /* if not present, set to NULL */ char latest[MAX_ID_LEN]; /* from rev_id */ char len[FIELD_LEN]; /* from text_len */ char model[FIELD_LEN]; /* if not present, set to NULL */ + char lang[FIELD_LEN]; /* if not present, set to NULL */ revision_t ** revs; } page_t; @@ -149,6 +152,7 @@ /* tags we recognize */ #define BASE "base" #define CASE "case" +#define DBNAME "dbname" #define COMMENT "comment" #define CONTRIBUTOR "contributor" #define FORMAT "format" diff --git a/xmlfileutils/mwxmlelts.c b/xmlfileutils/mwxmlelts.c index df8ac36..de0c475 100644 --- a/xmlfileutils/mwxmlelts.c +++ b/xmlfileutils/mwxmlelts.c @@ -580,6 +580,12 @@ c.id[0] 
= '\0'; r.text = NULL; + r.comment[0] = '\0'; + r.sha1[0] = '\0'; + r.model[0] = '\0'; + r.format[0] = '\0'; + r.text_id[0] = '\0'; + r.text_len[0] = '\0'; get_elt_with_attrs(stubs, ID, r.id, sizeof(r.id), NULL, 0); if (get_line(stubs) == NULL) { @@ -624,7 +630,6 @@ r.minor[1]='\0'; } - r.comment[0] = '\0'; if (get_elt_with_attrs(stubs, COMMENT, r.comment, sizeof(r.comment), NULL, 0) != -1) { if (get_line(stubs) == NULL) { whine("abrupt end of revision data in rev id %s", r.id); @@ -632,8 +637,6 @@ } } un_xml_escape(r.comment, NULL, 1); - r.text_id[0] = '\0'; - r.text_len[0] = '\0'; /* schema 0.7 has sha1 then text, earlier schema don't have it at all so look for it here optionally */ if (get_elt_with_attrs(stubs, SHA1, r.sha1, sizeof(r.sha1), NULL, 0) != -1) { @@ -643,9 +646,22 @@ } } + /* schema 0.10 has model, format before text, sha1 */ + if (get_elt_with_attrs(stubs, MODEL, r.model, sizeof(r.model), NULL, 0) != -1) { + if (get_line(stubs) == NULL) { + whine("abrupt end of revision data in rev id %s", r.id); + return(0); + } + } + if (get_elt_with_attrs(stubs, FORMAT, r.format, sizeof(r.format), NULL, 0) != -1) { + if (get_line(stubs) == NULL) { + whine("abrupt end of revision data in rev id %s", r.id); + return(0); + } + } + /* <text id="382338088" bytes="57" /> */ get_elt_with_attrs(stubs, TEXT, NULL, 0, attrs, MAX_ATTRS_STR_LEN); - if (verbose > 1) fprintf(stderr,"text tag found, %s\n", attrs); attrs_ptr = attrs; while (1) { @@ -680,14 +696,11 @@ if (!todo) break; else attrs_ptr = todo; } - + /* prep buffer after text end */ if (get_line(stubs) == NULL) { whine("abrupt end of revision data in rev id %s", r.id); return(0); } - - r.model[0] = '\0'; - r.format[0] = '\0'; /* schema 0.8 and later have sha1 here after text */ if (! 
r.sha1[0]) { @@ -698,17 +711,22 @@ } } } - /* schema 0.8 and later have model and format */ - if (get_elt_with_attrs(stubs, MODEL, r.model, sizeof(r.model), NULL, 0) != -1) { - if (get_line(stubs) == NULL) { - whine("abrupt end of revision data in rev id %s", r.id); - return(0); + + /* schema 0.8 and 0.9 have model and format after text/sha1 */ + if (! r.model[0]) { + if (get_elt_with_attrs(stubs, MODEL, r.model, sizeof(r.model), NULL, 0) != -1) { + if (get_line(stubs) == NULL) { + whine("abrupt end of revision data in rev id %s", r.id); + return(0); + } } } - if (get_elt_with_attrs(stubs, FORMAT, r.format, sizeof(r.format), NULL, 0) != -1) { - if (get_line(stubs) == NULL) { - whine("abrupt end of revision data in rev id %s", r.id); - return(0); + if (! r.format[0]) { + if (get_elt_with_attrs(stubs, FORMAT, r.format, sizeof(r.format), NULL, 0) != -1) { + if (get_line(stubs) == NULL) { + whine("abrupt end of revision data in rev id %s", r.id); + return(0); + } } } @@ -716,7 +734,6 @@ whine("no rev end tag for rev id %s", r.id); return(0); } - /* If schema is earlier than 0.5 or for some other reason we don't hve the bytes attr in the text tag, AND we aren't reading the @@ -941,8 +958,10 @@ p.redirect[1] = '\0'; p.restrictions[0] = '\0'; p.touched[0] = '\0'; + p.links_updated[0] = '\0'; p.latest[0] = '\0'; p.model[0] = '\0'; + p.lang[0] = '\0'; if (get_start_tag(stubs, PAGE) == -1) return(0); @@ -1058,17 +1077,33 @@ if (verbose > 2) fprintf(stderr,"(%s) %s",t->page, out_buf); snprintf(out_buf, sizeof(out_buf), "INSERT %s INTO %s \ -(page_id, page_namespace, page_title, page_restrictions, \ -page_counter, page_is_redirect, page_is_new, \ -page_random, page_touched, page_latest, page_len", insert_ignore?"IGNORE":"", t->page); +(page_id, page_namespace, page_title, page_restrictions", insert_ignore?"IGNORE":"", t->page); + put_line_all(sqlp, out_buf); + if (verbose > 2) fprintf(stderr,"(%s) %s",t->page, out_buf); + + snprintf(out_buf, sizeof(out_buf), ", page_counter"); 
+ write_if_mwv(sqlp, 0,0,1,25,out_buf, verbose); + + snprintf(out_buf, sizeof(out_buf), ", page_is_redirect, page_is_new, page_random, page_touched"); + put_line_all(sqlp, out_buf); + if (verbose > 2) fprintf(stderr,"(%s) %s",t->page, out_buf); + + snprintf(out_buf, sizeof(out_buf), ", page_links_updated"); + write_if_mwv(sqlp, 1,23,0,0,out_buf, verbose); + + snprintf(out_buf, sizeof(out_buf), ", page_latest, page_len"); put_line_all(sqlp, out_buf); if (verbose > 2) fprintf(stderr,"(%s) %s",t->page, out_buf); snprintf(out_buf, sizeof(out_buf), ", page_content_model"); write_if_mwv(sqlp, 1,20,0,0,out_buf, verbose); + snprintf(out_buf, sizeof(out_buf), ", page_lang"); + write_if_mwv(sqlp, 1,23,0,0,out_buf, verbose); + strcpy(out_buf, ") VALUES\n"); put_line_all(sqlp, out_buf); + if (verbose > 2) fprintf(stderr,"(%s) %s",t->page, out_buf); } else { @@ -1079,17 +1114,38 @@ /* fixme having a fixed size buffer kinda sucks here */ /* text: page_title page_restrictions page_touched */ snprintf(out_buf, sizeof(out_buf), \ - "(%s, %s, '%s', '%s', %s, %s, %s, %.14f, '%s', %s, %s", \ - p.id, p.ns, escaped_title, p.restrictions, \ - "0", p.redirect, "0", drand48(), p.touched, p.latest, p.len ); + "(%s, %s, '%s', '%s'", + p.id, p.ns, escaped_title, p.restrictions); put_line_all(sqlp, out_buf); if (verbose > 2) fprintf(stderr,"(%s) %s",t->page, out_buf); + /* p->counter */ + snprintf(out_buf, sizeof(out_buf), ", 0"); + write_if_mwv(sqlp, 0, 0, 1, 25, out_buf, verbose); + + snprintf(out_buf, sizeof(out_buf), ", %s, %s, %.14f, '%s'", \ + p.redirect, "0", drand48(), p.touched); + put_line_all(sqlp, out_buf); + if (verbose > 2) fprintf(stderr,"%s", out_buf); + + strcpy(out_buf, ", "); + write_if_mwv(sqlp, 1, 24, 0, 0, out_buf, verbose); + copy_sql_field(out_buf, p.links_updated[0]?p.links_updated:NULL, 1, 1); + write_if_mwv(sqlp, 1, 24, 0, 0, out_buf, verbose); + + snprintf(out_buf, sizeof(out_buf), ", %s, %s", p.latest, p.len); + put_line_all(sqlp, out_buf); + if (verbose > 2) 
fprintf(stderr,"%s", out_buf); + strcpy(out_buf, ", "); write_if_mwv(sqlp, 1, 20, 0, 0, out_buf, verbose); - copy_sql_field(out_buf, p.model[0]?p.model:NULL, 1, 1); write_if_mwv(sqlp, 1, 20, 0, 0, out_buf, verbose); + + strcpy(out_buf, ", "); + write_if_mwv(sqlp, 1, 23, 0, 0, out_buf, verbose); + copy_sql_field(out_buf, p.lang[0]?p.lang:NULL, 1, 1); + write_if_mwv(sqlp, 1, 23, 0, 0, out_buf, verbose); if (page_rows_written == MAX_PAGE_BATCH) { strcpy(out_buf,");\nCOMMIT;\n"); @@ -1334,6 +1390,10 @@ exit(1); } s->sitename[0] = s->base[0] = s->generator[0] = s->s_case[0] = '\0'; + memset(s->sitename, 0, sizeof(s->sitename)); + memset(s->dbname, 0, sizeof(s->dbname)); + memset(s->base, 0, sizeof(s->base)); + memset(s->generator, 0, sizeof(s->generator)); s->namespaces = NULL; if (s_info) *s_info = s; @@ -1352,8 +1412,19 @@ whine("abrupt end to siteinfo"); return(0); } - result = get_elt_with_attrs(f, BASE, s->base, sizeof(s->base), NULL, 0); + result = get_elt_with_attrs(f, DBNAME, s->dbname, sizeof(s->dbname), NULL, 0); + + if (result != -1) { + /* this xml file has the dbname tag in it (not all do); + get the next line. 
*/ + if (get_line(f) == NULL) { + whine("abrupt end to siteinfo"); + return(0); + } + } + + result = get_elt_with_attrs(f, BASE, s->base, sizeof(s->base), NULL, 0); if (get_line(f) == NULL) { whine("abrupt end to siteinfo"); return(0); diff --git a/xmlfileutils/sqlutils.c b/xmlfileutils/sqlutils.c index 284501d..6245094 100644 --- a/xmlfileutils/sqlutils.c +++ b/xmlfileutils/sqlutils.c @@ -511,13 +511,18 @@ else snprintf(out_buf, sizeof(out_buf), "`page_restrictions` tinyblob NOT NULL,\n"); put_line(f, out_buf); - if (MWV_LESS(mwv,1,10)) + if (MWV_LESS(mwv,1,10)) { snprintf(out_buf, sizeof(out_buf), "`page_counter` bigint(20) unsigned NOT NULL DEFAULT '0',\n"); - else if (MWV_LESS(mwv,1,15)) + put_line(f, out_buf); + } + else if (MWV_LESS(mwv,1,15)) { snprintf(out_buf, sizeof(out_buf), "`page_counter` bigint unsigned NOT NULL DEFAULT '0',\n"); - else + put_line(f, out_buf); + } + else if (MWV_LESS(mwv,1,25)) { snprintf(out_buf, sizeof(out_buf), "`page_counter` bigint unsigned NOT NULL DEFAULT 0,\n"); - put_line(f, out_buf); + put_line(f, out_buf); + } if (MWV_LESS(mwv,1,10)) snprintf(out_buf, sizeof(out_buf), "`page_is_redirect` tinyint(1) unsigned NOT NULL DEFAULT '0',\n"); else if (MWV_LESS(mwv,1,15)) @@ -541,6 +546,10 @@ else snprintf(out_buf, sizeof(out_buf), "`page_touched` binary(14) NOT NULL DEFAULT '',\n"); put_line(f, out_buf); + if (MWV_GREATER(mwv,1,22)) { + snprintf(out_buf, sizeof(out_buf), "`page_links_updated` varbinary(14) NULL DEFAULT NULL,\n"); + put_line(f, out_buf); + } if (MWV_LESS(mwv,1,10)) snprintf(out_buf, sizeof(out_buf), "`page_latest` int(8) unsigned NOT NULL,\n"); else @@ -553,6 +562,10 @@ put_line(f, out_buf); if (MWV_GREATER(mwv, 1, 20)) { snprintf(out_buf, sizeof(out_buf), "`page_content_model` varbinary(32) DEFAULT NULL,\n"); + put_line(f, out_buf); + } + if (MWV_GREATER(mwv, 1, 23)) { + snprintf(out_buf, sizeof(out_buf), "`page_lang` varbinary(35) DEFAULT NULL,\n"); put_line(f, out_buf); } snprintf(out_buf, sizeof(out_buf), 
"PRIMARY KEY (`page_id`),\n"); @@ -598,8 +611,10 @@ } if (MWV_LESS(mwv, 1, 9)) snprintf(out_buf, sizeof(out_buf), "`rev_comment` tinyblob NOT NULL default '',\n"); - else + else if (MWV_LESS(mwv, 1, 25)) snprintf(out_buf, sizeof(out_buf), "`rev_comment` tinyblob NOT NULL,\n"); + else + snprintf(out_buf, sizeof(out_buf), "`rev_comment` varbinary(767) NOT NULL,\n"); put_line(f, out_buf); if (MWV_LESS(mwv, 1, 10)) snprintf(out_buf, sizeof(out_buf), "`rev_user` int(5) unsigned NOT NULL DEFAULT '0',\n"); -- To view, visit https://gerrit.wikimedia.org/r/347625 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I90d9e5f113c61940cade3847e856541a26791c2b Gerrit-PatchSet: 3 Gerrit-Project: operations/dumps Gerrit-Branch: ariel Gerrit-Owner: ArielGlenn <[email protected]> Gerrit-Reviewer: ArielGlenn <[email protected]> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
