On Thursday, 28 May 2020 02:40:49 UTC+8, Peter Otten wrote: > BBT wrote: > > > On Thursday, 28 May 2020 01:36:26 UTC+8, Peter Otten wrote: > >> BBT wrote: > >> > >> > I am trying to parse a word (.docx) for tables, then copy these tables > >> > over to excel using xlsxwriter. This is my code: > >> > > >> > from docx.api import Document > >> > import xlsxwriter > >> > > >> > document = Document('/Users/xxx/Documents/xxx/Clauses Sample - Copy v1 > >> > - for merge.docx') tables = document.tables > >> > > >> > wb = xlsxwriter.Workbook('C:/Users/xxx/Documents/xxx/test clause > >> > retrieval.xlsx') Sheet1 = wb.add_worksheet("Compliance") > >> > index_row = 0 > >> > > >> > print(len(tables)) > >> > > >> > for table in document.tables: > >> > data = [] > >> > keys = None > >> > for i, row in enumerate(table.rows): > >> > text = (cell.text for cell in row.cells) > >> > > >> > if i == 0: > >> > keys = tuple(text) > >> > continue > >> > row_data = dict(zip(keys, text)) > >> > data.append(row_data) > >> > #print (data) > >> > #big_data.append(data) > >> > Sheet1.write(index_row,0, str(row_data)) > >> > index_row = index_row + 1 > >> > > >> > print(row_data) > >> > > >> > wb.close() > >> > > >> > > >> > This is my desired output: https://i.stack.imgur.com/9qnbw.png > >> > > >> > However, here is my actual output: https://i.stack.imgur.com/vpXej.png > >> > > >> > I am aware that my current output produces a list of string instead. > >> > > >> > Is there anyway that I can get my desired output using xlsxwriter? > >> > >> I had to simulate docx.api. With that caveat the following seems to work: > >> > >> import xlsxwriter > >> > >> # begin simulation of > >> # from docx.api import Document > >> > >> class Cell: > >> def __init__(self, text): > >> self.text = text > >> > >> class Row: > >> def __init__(self, cells): > >> self.cells = [Cell(c) for c in cells] > >> > >> class Table: > >> def __init__(self, data): > >> self.rows = [ > >> Row(row) for row in data > >> ] > >> > >> class Document: > >> def __init__(self): > >> self.tables = [ > >> Table([ > >> ["Hello", "Test"], > >> ["est", "ing"], > >> ["gg", "ff"] > >> ]), > >> Table([ > >> ["Foo", "Bar", "Baz"], > >> ["ham", "spam", "jam"] > >> ]) > >> ] > >> > >> document = Document() > >> > >> # end simulation > >> > >> wb = xlsxwriter.Workbook("tmp.xlsx") > >> sheet = wb.add_worksheet("Compliance") > >> > >> offset = 0 > >> for table in document.tables: > >> for y, row in enumerate(table.rows): > >> for x, cell in enumerate(row.cells): > >> sheet.write(y + offset, x, cell.text) > >> offset += len(table.rows) + 1 # one empty row between tables > >> > >> wb.close() > > > > > > Hi Peter, thank you for your efforts :) > > > > However, what if there are many tables in the word document, it would be > > tedious to have to code the texts in the tables one by one. Can I instead, > > call on the word document and let Python do the parsing for tables and its > > contents? > > I don't understand. You have docx.api available, so you can replace > > the "simulation" part of my example with just these two lines: > > from docx.api import Document > document = Document('/Users/xxx/Documents/xxx/Clauses Sample - Copy v1 - for > merge.docx') > > I should work -- it's just that I cannot be sure it will work because I could > not test it.
I tried your code by replacing the Document portion: import xlsxwriter # begin simulation of # from docx.api import Document class Cell: def __init__(self, text): self.text = text class Row: def __init__(self, cells): self.cells = [Cell(c) for c in cells] class Table: def __init__(self, data): self.rows = [ Row(row) for row in data ] class Document: def __init__(self): self.tables = [ Table([ ["Hello", "Test"], ["est", "ing"], ["gg", "ff"] ]), Table([ ["Foo", "Bar", "Baz"], ["ham", "spam", "jam"] ]) ] #document = Document() # end simulation document = Document('/Users/Ai Shan/Documents/CPFB Work/Clauses Sample - Copy v1 - for merge.docx') wb = xlsxwriter.Workbook('C:/Users/Ai Shan/Documents/CPFB Work/test clause retrieval.xlsx') Sheet1 = wb.add_worksheet("Compliance") offset = 0 for table in document.tables: for y, row in enumerate(table.rows): for x, cell in enumerate(row.cells): sheet.write(y + offset, x, cell.text) offset += len(table.rows) + 1 # one empty row between tables wb.close() But I received an error: TypeError: __init__() takes 1 positional argument but 2 were given -- https://mail.python.org/mailman/listinfo/python-list