https://www.mediawiki.org/wiki/Special:Code/MediaWiki/101971
Revision: 101971 Author: ariel Date: 2011-11-04 09:46:30 +0000 (Fri, 04 Nov 2011) Log Message: ----------- possibily useful, likely buggy script for grabbing specific columns from desired rows of a mysql table dump Added Paths: ----------- branches/ariel/xmldumps-backup/mysql2txt.py Added: branches/ariel/xmldumps-backup/mysql2txt.py =================================================================== --- branches/ariel/xmldumps-backup/mysql2txt.py (rev 0) +++ branches/ariel/xmldumps-backup/mysql2txt.py 2011-11-04 09:46:30 UTC (rev 101971) @@ -0,0 +1,405 @@ +# this script reads from stdin a sql file created by mysqldump, grabs the requested columns from +# the requested table from each tuple, and writes them out one tuple per line +# with a comma between columns, keeping the original escaping of values as done by mysql. + +import getopt +import os +import re +import sys + +class ConverterError(Exception): + pass + +class MysqlFile: + def __init__(self, f, tableRequested, columnsRequested, valuesRequestedCols, valuesRequestedVals, fieldSeparator): + self.file = f + self.tableRequested = tableRequested + self.columnsRequested = columnsRequested + self.valuesRequestedCols = valuesRequestedCols + self.valuesRequestedVals = valuesRequestedVals + self.fieldSeparator = fieldSeparator + + self.buffer = "" + self.bufferInd = 0 + self.eof = False + self.rowsDone = False + self.GET = 1 + self.CHECK = 2 + self.SKIP = 0 + + def findCreateStatement(self): + tableFound = False + toFind = "CREATE TABLE `%s` (\n" % self.tableRequested + line = self.getLine(len(toFind)) + if (not line.endswith("\n")): + self.skipLineRemainder() + while line != "": + if line == toFind: + tableFound = True + break + line = self.getLine(len(toFind)) + if (not line.endswith("\n")): + self.skipLineRemainder() + if not tableFound: + raise ConverterError("create statement for requested table not found in file") + + def getLine(self, maxbytes = 0): + """returns line including the \n, up to maxbytes""" + line = "" + length = 0 + if self.eof: + return False + while self.buffer[self.bufferInd] != '\n': + line = line + self.buffer[self.bufferInd] + if not self.incrementBufferPtr(): + return False + length = length + 1 + if maxbytes and length == maxbytes: + return line + + if not self.skipChar('\n'): + return False + return line + "\n" + + def skipLineRemainder(self): + # skip up to the newline... + while self.buffer[self.bufferInd] != '\n': + if not self.incrementBufferPtr(): + return False + # and now the newline. + return self.incrementBufferPtr() + + def findInsertStatement(self): + """leave the file contents at the line immediately following + an INSERT statement""" + if m.eof: + return False + insertFound = False + toFind = "INSERT INTO `%s` VALUES " % self.tableRequested + line = self.getLine(len(toFind)) + while line and not self.eof: + if line.startswith(toFind): + insertFound = True + break + if (not line.endswith("\n")): + self.skipLineRemainder() + line = self.getLine(len(toFind)) + return insertFound + + def setupColumnRetrieval(self): + self.columnsInTable = [] + columnNameExpr = re.compile('\s+`([^`]+)`') + line = self.getLine() + while (line and not self.eof and line[0] != ')' ): + columnNameMatch = columnNameExpr.match(line) + if (columnNameMatch): + self.columnsInTable.append(columnNameMatch.group(1)) + line = self.getLine() + + for c in self.columnsRequested: + if not c in self.columnsInTable: + raise ConverterError("requested column %s not found in table" % c) + +# print "columns in table: ", self.columnsInTable +# print "columnsRequested: ", self.columnsRequested + + self.columnsToGet = [] + for c in self.columnsInTable: + v = self.SKIP + if c in self.columnsRequested: + v = v | self.GET + if c in self.valuesRequestedCols: + v = v | self.CHECK + self.columnsToGet.append( v ) + +# print "columns to get: ", self.columnsToGet + + self.columnOrder = [] + # we want here a list which tells us to + # write the ith column we read from tuple first, + # the jth one second, the kth one third etc. + columnsToGetTrue = [] + for i in range(0,len(self.columnsToGet)): + if self.columnsToGet[i] & self.GET: + columnsToGetTrue.append(self.columnsInTable[i]) + for c in self.columnsRequested: + self.columnOrder.append(columnsToGetTrue.index(c)) + +# print "column order: ", self.columnOrder + + def whine(self, message = None): + if (message): + raise ConverterError("whine whine whine: " + message ) + else: + raise ConverterError("whine whine whine. failed to parse a row.") + + def getColumnsFromRow(self): + """returns a list of column values extracted from a row. + f is an open input file positioned at the beginning of a + tuple representing a row in mysql output format, + colsToGet is a list of True/False correspnding to which + elements in the tuple we want to retrieve and return""" + +# print "buffer is ", self.buffer[self.bufferInd:self.bufferInd+80], "..." + if not self.skipStartOfRow(): + self.whine("couldn't find start of row") + cols = [] + ind = 0 + skip = False + for c in self.columnsToGet: + if skip: + self.skipColValue() + elif c & self.GET: + cols.append(self.getColValue()) + if c & self.CHECK: + colName = self.columnsInTable[ind] + j = self.valuesRequestedCols.index(colName) + if self.getColValue() != self.valuesRequestedVals[j]: + skip = True + cols = None + elif c & self.CHECK: + colName = self.columnsInTable[ind] + j = self.valuesRequestedCols.index(colName) + if self.getColValue() != self.valuesRequestedVals[j]: + skip = True + cols = None + else: + self.skipColValue() + ind = ind + 1 + + self.skipEndOfRow() + return(cols) + + def skipStartOfRow(self): + # expect ( + if not self.skipChar('('): + return False + return True + + def skipEndOfRow(self): + # expect... what do we expect? ); or ), + # the first means end of row with no more rows after, the second means end of + # specific row only + if not self.skipChar(')'): + self.whine() + if not self.skipChar(','): + if self.skipChar(';'): + self.rowsDone = True + else: + self.whine() + self.skipChar('\n') + + def getColValue(self): + #expect: a string of digits + # or: ' some stuff, ' + value="" + if (self.buffer[self.bufferInd].isdigit()): + while self.buffer[self.bufferInd].isdigit(): + value=value + self.buffer[self.bufferInd] + if not self.incrementBufferPtr(): + return False + # there will be a comma before the next + # column if we aren't at the end of the row. + self.skipChar(',') + return value + elif (self.skipChar("'")): + value = "'" + done = False + escaped = False + while not done: + if self.buffer[self.bufferInd] != "'" and self.buffer[self.bufferInd] != '\\': + value=value + self.buffer[self.bufferInd] + if not self.incrementBufferPtr(): + return False + escaped = False + elif self.buffer[self.bufferInd] == "'": + value=value + self.buffer[self.bufferInd] + if not self.incrementBufferPtr(): + return False + if not escaped: + done = True + else: + escaped = False + else: # escape char \ found + value=value + self.buffer[self.bufferInd] + if not self.incrementBufferPtr(): + return False + if escaped: + escaped = False + else: + escaped = True + if done: + # there will be a comma before the next + # column if we aren't at the end of the row. + self.skipChar(',') + return value + else: + self.whine() + + def skipColValue(self): + #expect: a string of digits with possibly a . in there + # or: ' some stuff, ' + if (self.buffer[self.bufferInd].isdigit()): + # might have a float so... crudely... + while self.buffer[self.bufferInd].isdigit() or self.buffer[self.bufferInd] == '.' or self.buffer[self.bufferInd] == 'e' or self.buffer[self.bufferInd] == '-': + if not self.incrementBufferPtr(): + return False + # there will be a comma before the next + # column if we aren't at the end of the row. + self.skipChar(',') + elif (self.skipChar("'")): + done = False + escaped = False + while not done: + if self.buffer[self.bufferInd] != "'" and self.buffer[self.bufferInd] != '\\': + if not self.incrementBufferPtr(): + return False + escaped = False + elif self.buffer[self.bufferInd] == "'": + if not self.incrementBufferPtr(): + return False + if not escaped: + done = True + else: + escaped = False + else: # escape char \ found + if not self.incrementBufferPtr(): + return False + if escaped: + escaped = False + else: + escaped = True + if done: + # there will be a comma before the next + # column if we aren't at the end of the row. + self.skipChar(',') + else: +# print "buffer is ", self.buffer[self.bufferInd:self.bufferInd+80], "..." + self.whine("failed to parse a value, found start character " + self.buffer[self.bufferInd]) + + def skipChar(self, c): + if self.buffer[self.bufferInd] == c: + if not self.incrementBufferPtr(): + return False + return True + else: + return False + + def incrementBufferPtr(self): + self.bufferInd = self.bufferInd + 1 + if self.bufferInd == len(self.buffer): + return self.fillBuffer() # this will move the index accordingly + return True + + def fillBuffer(self): + if self.bufferInd == len(self.buffer) and not self.rowsDone: + # we are out of data in the buffer, and there's more + # rows to be gotten + + # fixme this should be a constant someplace configurable + self.buffer = self.file.read(8192) + if (self.buffer == ""): + self.rowsDone = True + self.eof = True + return False + else: + self.bufferInd = 0 + return True + + def formatColumn(self, column): + """for now we do nothing. maybe we want this in the future.""" + return column + + def writeColumns(self, columns, outFile): + """takes a list of column values without names. + must find the names these correspond to, figure out the right + order (or alternatively maybe we have a map that tells us the order) + and write the values out in the new order.""" + if columns: + ind = 0 + for i in self.columnOrder: + outFile.write(self.formatColumn(columns[i])) + if ind < len(self.columnOrder)-1: + outFile.write(self.fieldSeparator) + ind = ind + 1 + outFile.write('\n') + +def usage(message = None): + if message: + print message + print "Usage: python mysql2txt.py --table=tablename --columns=col1,col2... " + print " [--values=col1=val1,col2=val2...] [--separator=<string>]" + print "" + print "This script reads a table dump in mysql format from stdin and writes" + print "specified columns from desired rows to stdout, one line per row." + print "" + print "--table: the name of the table from which we want to extract values" + print "--columns: the names of the columns from the table, separated by commas," + print " the values of which we want to retrieve, in the order we want" + print " them to be written on each line of the output" + print "--values: pairs of column names and values we want the column to have, for" + print " each row to be printed; in each pair the column name and" + print " the value are separated by an equals sign, and these pairs are" + print " separated from each other by commas" + print "--separator: the string which will be written after each value in a row" + print " to separate it from the next value, by default a space" + print "" + print "Example: zcat elwikidb-20111102-page.sql.gz | python mysql2txt.py --table=page \\" + print " --columns=page_title,page_id --values=page_namespace=15 --separator=' | '" + sys.exit(1) + +if __name__ == "__main__": + tableRequested = None + columnsRequested = None + valuesRequestedCols = [] + valuesRequestedVals = [] + fieldSeparator = ' ' + + try: + (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "", ['table=', 'columns=', 'values=', 'separator=' ]) + except: + usage("Unknown option specified") + + for (opt, val) in options: + if opt == "--table": + tableRequested = val + elif opt == "--columns": + if ',' in val: + columnsRequested = val.split(',') + else: + columnsRequested = [ val ] + elif opt == "--values": + if ',' in val: + vlist = val.split(',') + else: + vlist = [ val ] + valuesRequestedCols = [ v.split('=')[0] for v in vlist ] + valuesRequestedVals = [ v.split('=')[1] for v in vlist ] + elif opt == "--separator": + fieldSeparator = val + + if (len(remainder) > 0): + usage("Unknown option specified") + + if (not tableRequested or not columnsRequested): + usage("Missing required option") + + m = MysqlFile(sys.stdin, tableRequested, columnsRequested, valuesRequestedCols, valuesRequestedVals, fieldSeparator) + m.fillBuffer() + + m.findCreateStatement() + m.setupColumnRetrieval() + + if not m.findInsertStatement(): + raise ConverterError("insert statement for requested table not found in file") + while (not m.eof): + cols = m.getColumnsFromRow() + # write them out in the correct order... + m.writeColumns(cols, sys.stdout) + if m.rowsDone and not m.eof: + # could have multiple inserts for the same table + m.rowsDone = False + m.findInsertStatement() + + exit(0); + Property changes on: branches/ariel/xmldumps-backup/mysql2txt.py ___________________________________________________________________ Added: svn:eol-style + native _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs