ArielGlenn has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/347626 )

Change subject: add a sample script for importing to a local instance
......................................................................

add a sample script for importing to a local instance

[WIP] can't be sure it works completely until I have a working
dump from it; this testing is in progress

Change-Id: Icd070d76dbdb8585f94523ee2c2c2b38d15f1a40
---
A xmlfileutils/scripts/extract_tablecreate.py
A xmlfileutils/scripts/import_tables.sh
2 files changed, 235 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/dumps 
refs/changes/26/347626/1

diff --git a/xmlfileutils/scripts/extract_tablecreate.py 
b/xmlfileutils/scripts/extract_tablecreate.py
new file mode 100644
index 0000000..f8fa4ab
--- /dev/null
+++ b/xmlfileutils/scripts/extract_tablecreate.py
@@ -0,0 +1,114 @@
+"""
+grab CREATE TABLE statement from e.g. a mysql dump
+and write it to a separate file
+"""
+import getopt
+import sys
+import gzip
+
+
+def usage(message=None):
+    """
+    show usage information for this script with an optional
+    message preceding it
+    """
+    if message is not None:
+        sys.stderr.write(message + "\n")
+    usage_message = """extract_tablecreate.py --sqlfile path
+               [--help]
+
+Tis script will read the sql contained in the specified sql file until
+it finds a CREATE TABLE statement.  It will write that statement to
+an output file of a similar name but with 'create' tacked on at
+the end.
+
+Gzipped files will be zcatted silently as input;
+the output file will be uncompressed regardless.
+
+Options:
+
+--sqlfile (-s):  path to possibly gzipped sql file with the
+                 CREATE TABLE statement and perhaps a bunch of
+                 INSERTS and such afterwards
+--help    (-h):  show this help message
+"""
+    sys.stderr.write(usage_message)
+    sys.exit(1)
+
+
+def get_output_file(sqlfile):
+    """
+    return sqlfile minus any trailing '.gz', with '.create' appended
+    """
+    newfile = sqlfile
+    if newfile.endswith(".gz"):
+        newfile = newfile[:-3]
+    return newfile + ".create"
+
+
+def get_fhandle(path, mode="r"):
+    """
+    open path in the given mode: via gzip.open if the name
+    ends in '.gz', via the builtin open otherwise
+    """
+    if path.endswith(".gz"):
+        return gzip.open(path, mode)
+    else:
+        return open(path, mode)
+
+
+def write_create_table(sqlfile):
+    """
+    copy the lines from the first one starting with "CREATE"
+    through the next one starting with ")" from the sql file
+    into the (uncompressed) file named by get_output_file().
+    the output is closed only when that closing line is seen;
+    if it never is, the function just runs off the end of the
+    input. the input handle is never explicitly closed.
+    """
+    out_fhandle = get_fhandle(get_output_file(sqlfile), "w+")
+    in_fhandle = get_fhandle(sqlfile, "r")
+    writing = False
+    for line in in_fhandle:
+        if line.startswith("CREATE"):
+            writing = True
+            out_fhandle.write(line)
+        elif line.startswith(")") and writing:
+            writing = False
+            out_fhandle.write(line)
+            out_fhandle.close()
+            return
+        elif writing:
+            out_fhandle.write(line)
+
+
+def do_main():
+    'parse options, require --sqlfile, then extract its CREATE TABLE'
+    sqlfile = None
+
+    try:
+        (options, remainder) = getopt.gnu_getopt(
+            sys.argv[1:], "s:h", ["sqlfile=", "help"])
+    except getopt.GetoptError as err:
+        usage("Unknown option specified: " + str(err))
+
+    for (opt, val) in options:
+        if opt in ["-s", "--sqlfile"]:
+            sqlfile = val
+        elif opt in ["-h", "--help"]:
+            usage("Help for this script")
+        else:
+            usage("Unknown option specified: <%s>" % opt)
+
+    if len(remainder) > 0:
+        usage("Unknown option(s) specified: <%s>" % remainder[0])
+    if sqlfile is None:
+        print "Mandatory 'sqlfile' argument not specified"
+        sys.exit(1)
+
+    write_create_table(sqlfile)
+
+
+if __name__ == '__main__':
+    do_main()
+
diff --git a/xmlfileutils/scripts/import_tables.sh 
b/xmlfileutils/scripts/import_tables.sh
new file mode 100644
index 0000000..c0066d0
--- /dev/null
+++ b/xmlfileutils/scripts/import_tables.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+
+# change these according to your wiki and export date and location of the sql 
files
+WIKI="elwikivoyage"                              # name of the wiki as it 
appears in downloaded files
+DBNAME="elwikivoyage"                            # name of the wiki's db in 
your local mysql database
+DATE="20170401"                                  # date as it appears in 
downloaded files
+CMDDIR="."                                       # where the sql2txt and 
mwxml2sql files live
+IMPORTDIR="imported"                             # directory relative to cwd, 
where downloaded files are located
+OUTDIR="outputs"                                 # directory relative to cwd, 
where output files will be generated
+VERSION="1.29"                                   # version of the generator in 
the stubs, page content files downloaded
+BASEDOWNLOADURL="https://dumps.wikimedia.org";    # url to base of dumps tree 
for downloading
+
+MOSTTABLES="categorylinks category change_tag externallinks geo_tags 
imagelinks iwlinks \
+      langlinks pagelinks page_props page_restrictions protected_titles \
+      redirect templatelinks"
+SPECIALTABLES="page revision text"
+TABLES="${MOSTTABLES} ${SPECIALTABLES}"
+
+echo "checking if downloads are needed"
+downloadsneeded=0
+for table in $MOSTTABLES; do
+    filename="${WIKI}-${DATE}-${table}.sql.gz"
+    if [ ! -e ${IMPORTDIR}/${filename} ]; then
+       downloadsneeded=1
+       break
+    fi
+done
+
+if [ $downloadsneeded -eq 0 ]; then
+    echo "downloads not needed"
+else
+    echo "downloads proceeding"
+    for table in $MOSTTABLES; do
+        filename="${WIKI}-${DATE}-${table}.sql.gz"
+        if [ ! -e ${IMPORTDIR}/${filename} ]; then
+           wget -O ${IMPORTDIR}/${filename} 
${BASEDOWNLOADURL}/${WIKI}/${DATE}/$filename
+        fi
+    done
+    echo "downloads complete"
+fi
+
+echo "checking if page, revision, text file generation needed"
+generateneeded=0
+for table in ${SPECIALTABLES}; do
+    if [ ! -e "${OUTDIR}/${WIKI}-${DATE}-${table}.sql.gz" ]; then
+       generateneeded=1
+    fi
+done
+if [ $generateneeded -eq 0 ]; then
+    echo "generation not needed"    
+else
+    echo "generating sql files for page, revision, text"
+    ${CMDDIR}/mwxml2sql -s 
${IMPORTDIR}/${WIKI}-${DATE}-stub-meta-history.xml.gz -t 
${IMPORTDIR}/${WIKI}-${DATE}-pages-meta-history.xml.bz2 -f 
${OUTDIR}/${WIKI}-${DATE}-history.sql.gz -m "$VERSION"
+    echo "sql file generation done"
+    echo "converting sql files to tab-delimited for import"
+    for table in ${SPECIALTABLES}; do
+        mv ${OUTDIR}/${WIKI}-${DATE}-history.sql-${table}.sql-${VERSION}.gz 
${OUTDIR}/${WIKI}-${DATE}-${table}.sql.gz
+    done
+fi
+
+for table in $TABLES; do
+    file="${WIKI}-${DATE}-${table}.sql.gz"
+    newfile=`echo $file | sed -e 's/sql.gz/tabs.gz/'`
+    if [ -e "${OUTDIR}/$file" ]; then
+        # a sql file generated into OUTDIR (page, revision, text) wins
+       infile="${OUTDIR}/$file"
+    else
+       # otherwise use the sql file we downloaded into IMPORTDIR
+       infile="${IMPORTDIR}/$file"
+    fi
+    # convert to tab separated via sql2txt, gzipping the result into OUTDIR
+    zcat $infile | ${CMDDIR}/sql2txt | gzip > ${OUTDIR}/$newfile
+done
+echo "tab conversion done"
+
+echo "extracting table create statements"
+for table in $MOSTTABLES; do
+    python ${CMDDIR}/extract_tablecreate.py -s 
"${IMPORTDIR}/${WIKI}-${DATE}-${table}.sql.gz"
+done
+echo "table create statement extraction done"
+
+echo "Dropping tables"
+for table in $MOSTTABLES; do
+    file="${WIKI}-${DATE}-${table}.sql.create"
+    if [ -e ${IMPORTDIR}/${file} ]; then
+        echo "DROP TABLE IF EXISTS $table ; " | mysql -u root -pnotverysecure 
$DBNAME
+    fi
+done
+echo "Dropping tables done"
+
+echo "Truncating tables"
+for table in $SPECIALTABLES; do
+    echo "TRUNCATE TABLE $table ; " | mysql -u root -pnotverysecure $DBNAME
+done
+echo "Truncating tables done"
+
+echo "Creating tables"
+for table in $MOSTTABLES; do
+    file="${WIKI}-${DATE}-${table}.sql.create"
+    if [ -e ${IMPORTDIR}/${file} ]; then
+        cat ${IMPORTDIR}/${file} | mysql -u root -pnotverysecure $DBNAME
+    fi
+done
+echo "Table creation done"
+
+echo "beginning sql import"
+date > import-timing.txt
+CWD=`pwd`
+for table in $TABLES; do
+    echo "TABLE: $table"
+    zcat "${CWD}/${OUTDIR}/${WIKI}-${DATE}-${table}.tabs.gz" > 
"${CWD}/${OUTDIR}/${WIKI}-${DATE}-${table}.tabs"
+    ( \
+      echo "SET autocommit=0; SET unique_checks=0; SET foreign_key_checks=0;" ;
+      echo "LOAD DATA INFILE 
\"${CWD}/${OUTDIR}/${WIKI}-${DATE}-${table}.tabs\" INTO TABLE ${table} FIELDS 
OPTIONALLY ENCLOSED BY '';" ;
+      echo "SET autocommit=1; SET unique_checks=1; SET foreign_key_checks=1;" 
; 
+    ) | mysql -u root -pnotverysecure $DBNAME
+    rm "${CWD}/${OUTDIR}/${WIKI}-${DATE}-${table}.tabs"
+done
+date >> import-timing.txt
+echo "import done"
+echo "ALL STEPS COMPLETE"

-- 
To view, visit https://gerrit.wikimedia.org/r/347626
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Icd070d76dbdb8585f94523ee2c2c2b38d15f1a40
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to