ArielGlenn has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/347626 )
Change subject: add a cheap sample script for importing to a local instance
......................................................................
add a cheap sample script for importing to a local instance
Change-Id: Icd070d76dbdb8585f94523ee2c2c2b38d15f1a40
---
A xmlfileutils/scripts/extract_tablecreate.py
A xmlfileutils/scripts/import_tables.sh
2 files changed, 235 insertions(+), 0 deletions(-)
Approvals:
ArielGlenn: Looks good to me, approved
jenkins-bot: Verified
diff --git a/xmlfileutils/scripts/extract_tablecreate.py b/xmlfileutils/scripts/extract_tablecreate.py
new file mode 100644
index 0000000..f8fa4ab
--- /dev/null
+++ b/xmlfileutils/scripts/extract_tablecreate.py
@@ -0,0 +1,114 @@
+"""
+grab CREATE TABLE statement from e.g. a mysql dump
+and write it to a separate file
+"""
+import getopt
+import sys
+import gzip
+
+
+def usage(message=None):
+ """
+ show usage information for this script with an optional
+ message preceding it
+ """
+ if message is not None:
+ sys.stderr.write(message + "\n")
+ usage_message = """extract_tablecreate.py --sqlfile path
+ [--help]
+
+Tis script will read the sql contained in the specified sql file until
+it finds a CREATE TABLE statement. It will write that statement to
+an output file of a similar name but with 'create' tacked on at
+the end.
+
+Gzipped files will be zcatted silently as input;
+the output file will be uncompressed regardless.
+
+Options:
+
+--sqlfile (-s): path to possibly gzipped sql file with the
+ CREATE TABLE statement and perhaps a bunch of
+ INSERTS and such afterwards
+--help (-h): show this help message
+"""
+ sys.stderr.write(usage_message)
+ sys.exit(1)
+
+
+def get_output_file(sqlfile):
+ """
+ generate suitable output filename
+ """
+ newfile = sqlfile
+ if newfile.endswith(".gz"):
+ newfile = newfile[:-3]
+ return newfile + ".create"
+
+
+def get_fhandle(path, mode="r"):
+ """
+ get an appropriate filehandle for
+ plaintext or gzipped file
+ """
+ if path.endswith(".gz"):
+ return gzip.open(path, mode)
+ else:
+ return open(path, mode)
+
+
+def write_create_table(sqlfile):
+ """
+ read the first part of the sql file,
+ fine the create table statement,
+ write it out to a file of a similar name but with
+ no compression file extension (as the file will
+ be written out uncompressed), and the string
+ 'create' tacked on at the end.
+ """
+ out_fhandle = get_fhandle(get_output_file(sqlfile), "w+")
+ in_fhandle = get_fhandle(sqlfile, "r")
+ writing = False
+ for line in in_fhandle:
+ if line.startswith("CREATE"):
+ writing = True
+ out_fhandle.write(line)
+ elif line.startswith(")") and writing:
+ writing = False
+ out_fhandle.write(line)
+ out_fhandle.close()
+ return
+ elif writing:
+ out_fhandle.write(line)
+
+
+def do_main():
+ 'main entry point, does all the work'
+ sqlfile = None
+
+ try:
+ (options, remainder) = getopt.gnu_getopt(
+ sys.argv[1:], "s:h", ["sqlfile=", "help"])
+ except getopt.GetoptError as err:
+ usage("Unknown option specified: " + str(err))
+
+ for (opt, val) in options:
+ if opt in ["-s", "--sqlfile"]:
+ sqlfile = val
+ elif opt in ["-h", "--help"]:
+ usage("Help for this script")
+ else:
+ usage("Unknown option specified: <%s>" % opt)
+
+ if len(remainder) > 0:
+ usage("Unknown option(s) specified: <%s>" % remainder[0])
+ if sqlfile is None:
+ print "Mandatory 'sqlfile' argument not specified"
+ sys.exit(1)
+
+ write_create_table(sqlfile)
+
+
+if __name__ == '__main__':
+ do_main()
+
diff --git a/xmlfileutils/scripts/import_tables.sh b/xmlfileutils/scripts/import_tables.sh
new file mode 100644
index 0000000..2ab78a7
--- /dev/null
+++ b/xmlfileutils/scripts/import_tables.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+
+# change these according to your wiki and export date and location of the sql files
+WIKI="elwikivoyage" # name of the wiki as it appears in downloaded files
+DBNAME="elwikivoyage" # name of the wiki's db in your local mysql database
+DATE="20170401" # date as it appears in downloaded files
+CMDDIR="." # where the sql2txt and mwxml2sql files live
+IMPORTDIR="imported" # directory relative to cwd, where downloaded files are located
+OUTDIR="outputs" # directory relative to cwd, where output files will be generated
+VERSION="1.29" # version of the generator in the stubs, page content files downloaded
+BASEDOWNLOADURL="https://dumps.wikimedia.org" # url to base of dumps tree for downloading
+
+MOSTTABLES="categorylinks category change_tag externallinks geo_tags imagelinks iwlinks \
+ langlinks pagelinks page_props page_restrictions protected_titles \
+ redirect templatelinks"
+SPECIALTABLES="page revision text"
+TABLES="${MOSTTABLES} ${SPECIALTABLES}"
+
+echo "checking if downloads are needed"
+downloadsneeded=0
+for table in $MOSTTABLES; do
+ filename="${WIKI}-${DATE}-${table}.sql.gz"
+ if [ ! -e ${IMPORTDIR}/${filename} ]; then
+ downloadsneeded=1
+ break
+ fi
+done
+
+if [ $downloadsneeded -eq 0 ]; then
+ echo "downloads not needed"
+else
+ echo "downloads proceeding"
+ for table in $MOSTTABLES; do
+ filename="${WIKI}-${DATE}-${table}.sql.gz"
+ if [ ! -e ${IMPORTDIR}/${filename} ]; then
+ wget -O ${IMPORTDIR}/${filename} ${BASEDOWNLOADURL}/${WIKI}/${DATE}/$filename
+ fi
+ done
+ echo "downloads complete"
+fi
+
+echo "checking if page, revision, text file generation needed"
+generateneeded=0
+for table in ${SPECIALTABLES}; do
+ if [ ! -e "${OUTDIR}/${WIKI}-${DATE}-${table}.sql.gz" ]; then
+ generateneeded=1
+ fi
+done
+if [ $generateneeded -eq 0 ]; then
+ echo "generation not needed"
+else
+ echo "generating sql files for page, revision, text"
+ ${CMDDIR}/mwxml2sql -s ${IMPORTDIR}/${WIKI}-${DATE}-stub-meta-history.xml.gz -t ${IMPORTDIR}/${WIKI}-${DATE}-pages-meta-history.xml.bz2 -f ${OUTDIR}/${WIKI}-${DATE}-history.sql.gz -m "$VERSION"
+ echo "sql file generation done"
+ echo "converting sql files to tab-delimited for import"
+ for table in ${SPECIALTABLES}; do
+ mv ${OUTDIR}/${WIKI}-${DATE}-history.sql-${table}.sql-${VERSION}.gz ${OUTDIR}/${WIKI}-${DATE}-${table}.sql.gz
+ done
+fi
+
+for table in $TABLES; do
+ file="${WIKI}-${DATE}-${table}.sql.gz"
+ newfile=`echo $file | sed -e 's/sql.gz/tabs.gz/'`
+ if [ -e "${OUTDIR}/$file" ]; then
+ # if it was a converted file, use that
+ infile="${OUTDIR}/$file"
+ else
+ # otherwise use the file we downloaded, ready for import
+ infile="${IMPORTDIR}/$file"
+ fi
+ # convert to tab separated
+ zcat $infile | ${CMDDIR}/sql2txt | gzip > ${OUTDIR}/$newfile
+done
+echo "tab conversion done"
+
+echo "extracting table create statements"
+for table in $MOSTTABLES; do
+ python ${CMDDIR}/extract_tablecreate.py -s "${IMPORTDIR}/${WIKI}-${DATE}-${table}.sql.gz"
+done
+echo "table create statement extraction done"
+
+echo "Dropping tables"
+for table in $MOSTTABLES; do
+ file="${WIKI}-${DATE}-${table}.sql.create"
+ if [ -e ${IMPORTDIR}/${file} ]; then
+ echo "DROP TABLE IF EXISTS $table ; " | mysql -u root -pnotverysecure $DBNAME
+ fi
+done
+echo "Dropping tables done"
+
+echo "Truncating tables"
+for table in $SPECIALTABLES; do
+ echo "TRUNCATE TABLE $table ; " | mysql -u root -pnotverysecure $DBNAME
+done
+echo "Truncating tables done"
+
+echo "Creating tables"
+for table in $MOSTTABLES; do
+ file="${WIKI}-${DATE}-${table}.sql.create"
+ if [ -e ${IMPORTDIR}/${file} ]; then
+ cat ${IMPORTDIR}/${file} | mysql -u root -pnotverysecure $DBNAME
+ fi
+done
+echo "Table creation done"
+
+echo "beginning sql import"
+date > import-timing.txt
+CWD=`pwd`
+for table in $TABLES; do
+ echo "TABLE: $table"
+ zcat "${CWD}/${OUTDIR}/${WIKI}-${DATE}-${table}.tabs.gz" > "${CWD}/${OUTDIR}/${WIKI}-${DATE}-${table}.tabs"
+ ( \
+ echo "SET autocommit=0; SET unique_checks=0; SET foreign_key_checks=0;" ;
+ echo "LOAD DATA INFILE \"${CWD}/${OUTDIR}/${WIKI}-${DATE}-${table}.tabs\" INTO TABLE ${table} FIELDS OPTIONALLY ENCLOSED BY \"'\";" ;
+ echo "SET autocommit=1; SET unique_checks=1; SET foreign_key_checks=1;" ;
+ ) | mysql -u root -pnotverysecure $DBNAME
+ rm "${CWD}/${OUTDIR}/${WIKI}-${DATE}-${table}.tabs"
+done
+date >> import-timing.txt
+echo "import done"
+echo "ALL STEPS COMPLETE"
--
To view, visit https://gerrit.wikimedia.org/r/347626
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Icd070d76dbdb8585f94523ee2c2c2b38d15f1a40
Gerrit-PatchSet: 3
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits