Author: jukka Date: Wed Mar 26 12:51:15 2008 New Revision: 641521 URL: http://svn.apache.org/viewvc?rev=641521&view=rev Log: TIKA-132: Refactor Excel extractor to parse per sheet and add hyperlink support - Improved formatting of internalProcessRecord
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=641521&r1=641520&r2=641521&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Wed Mar 26 12:51:15 2008 @@ -40,6 +40,7 @@ import org.apache.poi.hssf.record.ExtendedFormatRecord; import org.apache.poi.hssf.record.FormatRecord; import org.apache.poi.hssf.record.FormulaRecord; +import org.apache.poi.hssf.record.UnicodeString; //import org.apache.poi.hssf.record.HyperlinkRecord; // FIXME - requires POI release import org.apache.poi.hssf.record.LabelRecord; import org.apache.poi.hssf.record.LabelSSTRecord; @@ -213,89 +214,69 @@ private void internalProcessRecord(Record record) throws SAXException { switch (record.getSid()) { + case BOFRecord.sid: // start of workbook, worksheet etc. records + BOFRecord bof = (BOFRecord) record; + if (bof.getType() == BOFRecord.TYPE_WORKBOOK) { + currentSheetIndex = -1; + } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) { + currentSheetIndex++; + currentSheet.clear(); + insideWorksheet = true; + } + break; - /* BOFRecord: indicates start of workbook, worksheet etc. records */ - case BOFRecord.sid: - switch (((BOFRecord) record).getType()) { - case BOFRecord.TYPE_WORKBOOK: - currentSheetIndex = -1; - break; - case BOFRecord.TYPE_WORKSHEET: - currentSheetIndex++; - currentSheet.clear(); - insideWorksheet = true; - break; - } - break; - - /* EOFRecord: indicates end of workbook, worksheet etc. records */ - case EOFRecord.sid: - // ignore empty sheets - if (insideWorksheet && !currentSheet.isEmpty()) { - processSheet(); - } - insideWorksheet = false; - break; + case EOFRecord.sid: // end of workbook, worksheet etc. records + if (insideWorksheet && !currentSheet.isEmpty()) { + processSheet(); + } + insideWorksheet = false; + break; - /* SSTRecord: holds all the strings for LabelSSTRecords */ - case SSTRecord.sid: - sstRecord = (SSTRecord)record; - break; - - /* BoundSheetRecord: Worksheet index record */ - case BoundSheetRecord.sid: - BoundSheetRecord boundSheetRecord = (BoundSheetRecord)record; - String sheetName = boundSheetRecord.getSheetname(); - sheetNames.add(sheetName); - break; - - // FIXME - requires POI release - ///* HyperlinkRecord: holds a URL associated with a cell */ - //case HyperlinkRecord.sid: - // HyperlinkRecord hyperlinkRecord = (HyperlinkRecord)record; - // if (insideWorksheet) { - // int row = hyperlinkRecord.getFirstRow(); - // short column = hyperlinkRecord.getFirstColumn(); - // Point point = new Point(column, row); - // Cell cell = currentSheet.get(point); - // if (cell != null) { - // cell = new LinkedCell(cell, hyperlinkRecord.getAddress()); - // currentSheet.put(point, cell); - // } - // } - // break; - - /* FormulaRecord: Cell value from a formula */ - case FormulaRecord.sid: - FormulaRecord formula = (FormulaRecord) record; - addCell(record, new NumberCell(formula.getValue())); - break; - - /* LabelRecord: strings stored directly in the cell */ - case LabelRecord.sid: - LabelRecord label = (LabelRecord) record; - addCell(record, getTextCell(label.getValue())); - break; - - /* LabelSSTRecord: Ref. a string in the shared string table */ - case LabelSSTRecord.sid: - LabelSSTRecord labelSSTRecord = (LabelSSTRecord) record; - int sstIndex = labelSSTRecord.getSSTIndex(); - String sstLabel = sstRecord.getString(sstIndex).getString(); - addCell(record, getTextCell(sstLabel)); - break; - - /* NumberRecord: Contains a numeric cell value */ - case NumberRecord.sid: - NumberRecord number = (NumberRecord) record; - addCell(record, new NumberCell(number.getValue())); - break; - - /* RKRecord: Excel internal number record */ - case RKRecord.sid: - RKRecord rk = (RKRecord) record; - addCell(record, new NumberCell(rk.getRKNumber())); - break; + case BoundSheetRecord.sid: // Worksheet index record + BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record; + sheetNames.add(boundSheetRecord.getSheetname()); + break; + + case SSTRecord.sid: // holds all the strings for LabelSSTRecords + sstRecord = (SSTRecord) record; + break; + + case FormulaRecord.sid: // Cell value from a formula + FormulaRecord formula = (FormulaRecord) record; + addCell(record, new NumberCell(formula.getValue())); + break; + + case LabelRecord.sid: // strings stored directly in the cell + LabelRecord label = (LabelRecord) record; + addTextCell(record, label.getValue()); + break; + + case LabelSSTRecord.sid: // Ref. a string in the shared string table + LabelSSTRecord sst = (LabelSSTRecord) record; + UnicodeString unicode = sstRecord.getString(sst.getSSTIndex()); + addTextCell(record, unicode.getString()); + break; + + case NumberRecord.sid: // Contains a numeric cell value + NumberRecord number = (NumberRecord) record; + addCell(record, new NumberCell(number.getValue())); + break; + + case RKRecord.sid: // Excel internal number record + RKRecord rk = (RKRecord) record; + addCell(record, new NumberCell(rk.getRKNumber())); + break; + + // FIXME - requires POI release + // case HyperlinkRecord.sid: // holds a URL associated with a cell + // HyperlinkRecord link = (HyperlinkRecord) record; + // Point point = + // new Point(link.getFirstColumn(), link.getFirstRow()); + // Cell cell = currentSheet.get(point); + // if (cell != null) { + // addCell(record, new LinkedCell(cell, link.getAddress())); + // } + // break; } } @@ -320,20 +301,19 @@ } /** - * Returns a text cell with the given text comment. The given text + * Adds a text cell with the given text comment. The given text * is trimmed, and ignored if <code>null</code> or empty. * + * @param record record that holds the text value * @param text text content, may be <code>null</code> - * @return text cell, or <code>null</code> */ - private Cell getTextCell(String text) { + private void addTextCell(Record record, String text) { if (text != null) { text = text.trim(); if (text.length() > 0) { - return new TextCell(text); + addCell(record, new TextCell(text)); } } - return null; } /**