Below please find the latest version of the xlsx2csv example program.
This version deals with styles and formats reasonably well.  However,
I have never seen special items like an inline string, so I'm sure
problems are still lurking.

Note that the POI BuiltinFormats class is not in POI 3.5 beta 5,
so I copied that class directly to my package.  Either you can do the
same, or check out the whole POI project, or use a nightly build.
You may have to fix the imports.  But then this should work perfectly.

I inserted TWO classes below, the converter and the supporting class
ReadonlySharedStringTable.

HTH

chris...

--


package something.or.other;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.Package;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/**
* A rudimentary XLSX -> CSV processor modeled on the
* POI sample program XLS2CSVmra by Nick Burch from the
* package org.apache.poi.hssf.eventusermodel.examples.
* Unlike the HSSF version, this one completely ignores * missing rows. * * Data sheets are read using a SAX parser to keep the * memory footprint relatively small, so this should be
* able to read enormous workbooks.  The styles table and
* the shared-string table must be kept in memory.  The
* standard POI styles table class is used, but a custom
* (read-only) class is used for the shared string table
* because the standard POI SharedStringsTable grows very
* quickly with the number of unique strings.
* * @author Chris Lott
*/
public class XLSX2CSV {

        /**
* The type of the data value is indicated by an attribute on * the cell element; the value is in a "v" element within the cell.
         */
        enum xssfDataType {
                BOOL,
                ERROR,
                FORMULA,
                INLINESTR,
                SSTINDEX,
                NUMBER,
        }

        /**
         * Derived from 
http://poi.apache.org/spreadsheet/how-to.html#xssf_sax_api
* * Also see Standard ECMA-376, 1st edition, part 4, pages 1928ff, at
         * http://www.ecma-international.org/publications/standards/Ecma-376.htm
* * A web-friendly version is http://openiso.org/Ecma/376/Part4
         */
        class MyXSSFSheetHandler extends DefaultHandler {

                /** Table with styles */
                private StylesTable stylesTable;

                /** Table with unique strings */
                private ReadonlySharedStringsTable sharedStringsTable;

                /** Destination for data */
                private final PrintStream output;

                /** Number of columns to read starting with leftmost */
                private final int minColumnCount;

                // Set when V start element is seen
                private boolean vIsOpen;

                // Set when cell start element is seen;
                // used when cell close element is seen.
                private xssfDataType nextDataType;

                // Used to format numeric cell values.
                private short formatIndex;
                private String formatString;
                private final DataFormatter formatter;

                private int thisColumn = -1;
                // The last column printed to the output stream
                private int lastColumnNumber = -1;

                // Gathers characters as they are seen.
                private StringBuffer value;

                /**
                 * Accepts objects needed while parsing.
* * @param styles Table of styles
                 * @param strings Table of shared strings
                 * @param cols Minimum number of columns to show
                 * @param target Sink for output
                 */
                public MyXSSFSheetHandler(
                                StylesTable styles,
                                ReadonlySharedStringsTable strings,
                                int cols,
                                PrintStream target) {
                        this.stylesTable = styles;
                        this.sharedStringsTable = strings;
                        this.minColumnCount = cols;
                        this.output = target;
                        this.value = new StringBuffer();
                        this.nextDataType = xssfDataType.NUMBER;
                        this.formatter = new DataFormatter();
                }

                /*
                 * (non-Javadoc)
                 * @see 
org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String, 
java.lang.String, java.lang.String, org.xml.sax.Attributes)
                 */
                public void startElement(String uri, String localName, String 
name,
                                Attributes attributes) throws SAXException {

                        if ("inlineStr".equals(name) || "v".equals(name)) {
vIsOpen = true; // Clear contents cache
                                value.setLength(0);
                        }
                        // c => cell
                        else if ("c".equals(name)) {
                                // Get the cell reference
                                String r = attributes.getValue("r");
                                int firstDigit = -1;
                                for (int c = 0; c < r.length(); ++c) {
                                        if (Character.isDigit(r.charAt(c))) {
                                                firstDigit = c;
                                                break;
                                        }
                                }
                                thisColumn = nameToColumn(r.substring(0, 
firstDigit));

                                // Set up defaults.
                                this.nextDataType = xssfDataType.NUMBER;
                                this.formatIndex = -1;
                                this.formatString = null;
                                String cellType = attributes.getValue("t");
                                String cellStyleStr = attributes.getValue("s");
                                if ("b".equals(cellType))
                                        nextDataType = xssfDataType.BOOL;
                                else if ("e".equals(cellType))
                                        nextDataType = xssfDataType.ERROR;
                                else if ("inlineStr".equals(cellType))
                                        nextDataType = xssfDataType.INLINESTR;
                                else if ("s".equals(cellType))
                                        nextDataType = xssfDataType.SSTINDEX;
                                else if ("str".equals(cellType))
                                        nextDataType = xssfDataType.FORMULA;
                                else if (cellStyleStr != null) {
                                        /*
                                         * It's a number, but possibly has a 
style and/or special format.
* Nick Burch said to use org.apache.poi.ss.usermodel.BuiltinFormats, * and I see javadoc for that at apache.org, but it's not in the
                                         * POI 3.5 Beta 5 jars.  Scheduled to 
appear in 3.5 beta 6.
                                         */
                                        int styleIndex = 
Integer.parseInt(cellStyleStr);
                                        XSSFCellStyle style = 
stylesTable.getStyleAt(styleIndex);
                                        this.formatIndex = 
style.getDataFormat();
                                        this.formatString = 
style.getDataFormatString();
                                        if (this.formatString == null)
                                                this.formatString = 
BuiltinFormats.getBuiltinFormat(this.formatIndex);
                                }
                        }

                }

                /*
                 * (non-Javadoc)
                 * @see 
org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String, 
java.lang.String, java.lang.String)
                 */
                public void endElement(String uri, String localName, String 
name)
                throws SAXException {

                        String thisStr = null;

                        // v => contents of a cell
                        if ("v".equals(name)) {
                                // Process the value contents as required.
                                // Do now, as characters() may be called more 
than once
                                switch(nextDataType) {

                                case BOOL:
                                        char first = value.charAt(0);
                                        thisStr = first == '0' ? "FALSE" : 
"TRUE";
                                        break;

                                case ERROR:
                                        thisStr = "\"ERROR:" + value.toString() + 
'"';
                                        break;  

case FORMULA: // A formula could result in a string value,
                                        // so always add double-quote 
characters.
                                        thisStr = '"' + value.toString() + '"';
                                        break;

                                case INLINESTR:
                                        // TODO: have seen an example of this, 
so it's untested.
                                        XSSFRichTextString rtsi = new 
XSSFRichTextString(value.toString());
thisStr = '"' + rtsi.toString() + '"'; break;

                                case SSTINDEX:
                                        String sstIndex = value.toString();
                                        try {
                                                int idx = 
Integer.parseInt(sstIndex);
                                                XSSFRichTextString rtss = new 
XSSFRichTextString(sharedStringsTable.getEntryAt(idx));
thisStr = '"' + rtss.toString() + '"'; }
                                        catch (NumberFormatException ex) {
                                                output.println("Failed to parse SST index 
'" + sstIndex + "': " + ex.toString());
                                        }
                                        break;

                                case NUMBER:
                                        String n = value.toString();
                                        if (this.formatString != null)
                                                thisStr = 
formatter.formatRawCellContents(Double.parseDouble(n), this.formatIndex, 
this.formatString);
                                        else
                                                thisStr = n;
                                        break;

                                default:
                                        thisStr = "(TODO: Unexpected type: " + 
nextDataType + ")";
                                break;
                                }

                                // Output after we've seen the string contents
                                // Emit commas for any fields that were missing 
on this row
                                if(lastColumnNumber == -1) { lastColumnNumber = 
0; }
                                for (int i = lastColumnNumber; i < thisColumn; 
++i)
                                        output.print(',');

                                // Might be the empty string.
                                output.print(thisStr);

// Update column if(thisColumn > -1)
                                        lastColumnNumber = thisColumn;

                        }
                        else if("row".equals(name)) {

                                // Print out any missing commas if needed
                                if(minColumns > 0) {
                                        // Columns are 0 based
                                        if(lastColumnNumber == -1) { 
lastColumnNumber = 0; }
                                        for(int i=lastColumnNumber; 
i<(this.minColumnCount); i++) {
                                                output.print(',');
                                        }
                                }

                                // We're onto a new row
                                output.println();
                                lastColumnNumber = -1;
                        }

                }

                /**
                 * Captures characters only if a suitable element is open.
                 * Originally was just "v"; extended for inlineStr also.
                 */
                public void characters(char[] ch, int start, int length)
                throws SAXException {
                        if (vIsOpen)
                                value.append(ch, start, length);
                }

                /**
                 * Converts an Excel column name like "C" to a zero-based index.
                 * @param name
                 * @return Index corresponding to the specified name
                 */
                private int nameToColumn(String name) {
                        int column = -1;
                        for (int i = 0; i < name.length(); ++i) {
                                int c = name.charAt(i);
                                column = (column + 1) * 26 + c - 'A';
                        }
                        return column;
                }

        }

        ///////////////////////////////////////

        private Package xlsxPackage;
        private int minColumns;
        private PrintStream output;

        /**
         * Creates a new XLSX -> CSV converter
* Javadoc says I should use OPCPackage instead of Package, but OPCPackage * was not available in the POI 3.5-beta5 build I used. * * @param pkg The XLSX package to process
         * @param output The PrintStream to output the CSV to
         * @param minColumns The minimum number of columns to output, or -1 for 
no minimum
         */
        public XLSX2CSV(Package pkg, PrintStream output, int minColumns) {
                this.xlsxPackage = pkg;
                this.output = output;
                this.minColumns = minColumns;
        }

        /**
         * Parses and shows the content of one sheet
         * using the specified styles and shared-strings tables.
         * @param styles
         * @param strings
         * @param sheetInputStream
         */
        public void processSheet(
                        StylesTable styles,
ReadonlySharedStringsTable strings, InputStream sheetInputStream) throws IOException, ParserConfigurationException, SAXException {

                InputSource sheetSource = new InputSource(sheetInputStream);
                SAXParserFactory saxFactory = SAXParserFactory.newInstance();
                SAXParser saxParser = saxFactory.newSAXParser();
                XMLReader sheetParser = saxParser.getXMLReader();
                ContentHandler handler = new MyXSSFSheetHandler(styles, 
strings, this.minColumns, this.output);
                sheetParser.setContentHandler(handler);
                sheetParser.parse(sheetSource);
        }

        /**
         * Initiates the processing of the XLS workbook file to CSV.
         * @throws IOException
         * @throws OpenXML4JException
         * @throws ParserConfigurationException
* @throws SAXException */ public void process() throws IOException, OpenXML4JException, ParserConfigurationException, SAXException {

                ReadonlySharedStringsTable strings = new 
ReadonlySharedStringsTable(this.xlsxPackage);
                XSSFReader xssfReader = new XSSFReader(this.xlsxPackage);
StylesTable styles = xssfReader.getStylesTable(); XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator)xssfReader.getSheetsData();
                int index = 0;
                while (iter.hasNext()) {
                        InputStream stream = iter.next();
                        String sheetName = iter.getSheetName();
                        this.output.println();
this.output.println(sheetName + " [index=" + index + "]:"); processSheet(styles, strings, stream);
                        stream.close();
                        ++index;
                }               
        }

        public static void main(String[] args) throws Exception {
                if(args.length < 1) {
                        System.err.println("Use:");
                        System.err.println("  XLSX2CSV <xlsx file> [min 
columns]");
                        return;
                }

                File xlsxFile = new File(args[0]);
                if (! xlsxFile.exists()) {
                        System.err.println("Not found or not a file: " + 
xlsxFile.getPath());
                        return;
                }

                int minColumns = -1;
                if(args.length >= 2)
                        minColumns = Integer.parseInt(args[1]);

                // If no log4j configuration is provided, these messages appear:
                //   log4j:WARN No appenders could be found for logger 
(org.openxml4j.opc).
                //   log4j:WARN Please initialize the log4j system properly.
                // If only the BasicConfigurator.configure() is done, these 
messages appear:
                //   0 [main] DEBUG org.openxml4j.opc  - Parsing relationship: 
/xl/_rels/workbook.xml.rels
                //  46 [main] DEBUG org.openxml4j.opc  - Parsing relationship: 
/_rels/.rels
                // Added the call to setLevel() to turn these off, now I see 
nothing.

                BasicConfigurator.configure();
                Logger.getRootLogger().setLevel(Level.INFO);

                // The package open is instantaneous, as it should be.
                Package p = Package.open(xlsxFile.getPath(), 
PackageAccess.READ);
                XLSX2CSV xlsx2csv = new XLSX2CSV(p, System.out, minColumns);
                xlsx2csv.process();
                // Want to call close() here, but the package is open for read,
                // so it's not necessary, and it complains if I do call it!
                p.revert();
        }

}

--

package something.or.other;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.poi.openxml4j.opc.Package;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/**
* Many methods copied from org.apache.poi.POIXMLDocumentPart because
* I was not sure whether this class should extend that class.
*
*/
public class ReadonlySharedStringsTable extends DefaultHandler {

        /**
         * An integer representing the total count of strings in the workbook. 
This count does not
         * include any numbers, it counts only the total of text strings in the 
workbook.
         */
        private int count;

        /**
         * An integer representing the total count of unique strings in the 
Shared String Table.
         * A string is unique even if it is a copy of another string, but has 
different formatting applied
         * at the character level.
         */
        private int uniqueCount;

        /**
         * The shared strings table.
         */
        private String [] strings;

        /**
* * @param pkg
         * @throws IOException
         * @throws SAXException
         * @throws ParserConfigurationException
         */
        public ReadonlySharedStringsTable(Package pkg)
        throws IOException, SAXException, ParserConfigurationException {
ArrayList<PackagePart> parts = pkg.getPartsByContentType(XSSFRelation.SHARED_STRINGS.getContentType());
                PackagePart sstPart = parts.get(0);
                readFrom(sstPart.getInputStream());
        }

        /**
         * Like POIXMLDocumentPart constructor
* * @param part
         * @param rel_ignored
         * @throws IOException
         */
public ReadonlySharedStringsTable(PackagePart part, PackageRelationship rel_ignored) throws IOException, SAXException, ParserConfigurationException {
                readFrom(part.getInputStream());
        }

        /**
         * Read this shared strings table from an XML file.
* * @param is The input stream containing the XML document.
         * @throws IOException if an error occurs while reading.
* @throws SAXException * @throws ParserConfigurationException */
        public void readFrom(InputStream is) throws IOException, SAXException, 
ParserConfigurationException {
                InputSource sheetSource = new InputSource(is);
                SAXParserFactory saxFactory = SAXParserFactory.newInstance();
                SAXParser saxParser = saxFactory.newSAXParser();
                XMLReader sheetParser = saxParser.getXMLReader();
                sheetParser.setContentHandler(this);
                sheetParser.parse(sheetSource);
        }

        /**
         * Return an integer representing the total count of strings in the 
workbook. This count does not
         * include any numbers, it counts only the total of text strings in the 
workbook.
         *
         * @return the total count of strings in the workbook
         */
        public int getCount(){
                return this.count;
        }

        /**
         * Returns an integer representing the total count of unique strings in 
the Shared String Table.
         * A string is unique even if it is a copy of another string, but has 
different formatting applied
         * at the character level.
         *
         * @return the total count of unique strings in the workbook
         */
        public int getUniqueCount(){
                return this.uniqueCount;
        }

        /**
         * Return a string item by index
         *
         * @param idx index of item to return.
         * @return the item at the specified position in this Shared String 
table.
         */
        public String getEntryAt(int idx) {
                return strings[idx];
        }

        //// ContentHandler methods ////

        private StringBuffer characters;
        private boolean tIsOpen;
        private int index;

        /*
         * (non-Javadoc)
         * @see 
org.xml.sax.helpers.DefaultHandler#startElement(java.lang.String, 
java.lang.String, java.lang.String, org.xml.sax.Attributes)
         */
        public void startElement(String uri, String localName, String name,
                        Attributes attributes) throws SAXException {
                if ("sst".equals(name)) {
                        String count = attributes.getValue("count");
                        String uniqueCount = attributes.getValue("uniqueCount");
                        this.count = Integer.parseInt(count);
                        this.uniqueCount = Integer.parseInt(uniqueCount);
                        this.strings = new String[this.uniqueCount];
                        index = 0;
                        characters = new StringBuffer();
                }
                else if ("t".equals(name)) {
                        characters.setLength(0);
                        tIsOpen = true;
                }
        }

        /*
         * (non-Javadoc)
         * @see org.xml.sax.helpers.DefaultHandler#endElement(java.lang.String, 
java.lang.String, java.lang.String)
         */
        public void endElement(String uri, String localName, String name)
        throws SAXException {
                if ("t".equals(name)) {
                        strings[index] = characters.toString();
                        ++index;
                }
        }

        /**
         * Captures characters only if a t(ext?) element is open.
         */
        public void characters(char[] ch, int start, int length)
        throws SAXException {
                if (tIsOpen)
                        characters.append(ch, start, length);
        }

        ////////////// For testing //////////////

        /**
         * Dumps the shared string table from an XLSX package.
* * @param args
         * @throws Exception
         */
        public static void main(String [] args) throws Exception {

                BasicConfigurator.configure();
                Logger.getRootLogger().setLevel(Level.INFO);

                if (args.length != 1) {
                        System.err.println("Usage: ReadonlySharedStringsTable 
<file.xlsx>");
                        return;
                }

                File inputFile = new File(args[0]);
                if (! inputFile.exists()) {
                        System.err.println("Failed to find file: " + 
inputFile.getPath());
                        return;
                }

                Package pkg = Package.open(inputFile.getPath(), 
PackageAccess.READ);
                ReadonlySharedStringsTable sst = new 
ReadonlySharedStringsTable(pkg);
                int unique = sst.getUniqueCount();
                for (int s = 0; s < unique; ++s)
                        System.out.println("Index " + s + ": " + 
sst.getEntryAt(s));                
                // Close without saving any changes.
                pkg.revert();

        }
}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to