[ https://issues.apache.org/jira/browse/TIKA-679?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13059639#comment-13059639 ]
Troy Witthoeft edited comment on TIKA-679 at 7/4/11 11:49 PM: -------------------------------------------------------------- I've narrowed the encoding down to CP437. CP437 correctly identifies many of the engineering symbols, such as [±] "plus minus," [º] "degree," but fails on "diameter." PRT files actually store the diameter symbol as three characters, with the second one always being [φ] "lowercase phi." While not identical, the Nordic [Ø] "O with slash" is often accepted as the diameter symbol. You may find a more elegant solution looking at [http://en.wikipedia.org/wiki/Code_page_437]. I've simply been substituting: String str = new String(text, 0, text.length, "Cp437"); str = str.replace("\u03C6","\u00D8"); was (Author: runamok81): I've narrowed the encoding down to CP437. CP437 correctly identifies many of the engineering symbols, such as [±] "plus minus," [º] degree," but fails on "diameter" PRT files actually store the diameter symbol as three characters, with the second one always being [φ] "lowercase phi" While not identical, the Nordic [Ø] "O with slash" is often accepted as the diameter symbol. You may find a more elegant solution looking at [http://en.wikipedia.org/wiki/Code_page_437] I've simply been substituting. [code] String str = new String(text, 0, text.length, "Cp437"); str = str.replace("\u03C6","\u00D8"); [/code] > Proposal for PRT Parser > ----------------------- > > Key: TIKA-679 > URL: https://issues.apache.org/jira/browse/TIKA-679 > Project: Tika > Issue Type: Improvement > Components: mime, parser > Reporter: Troy Witthoeft > Priority: Minor > Labels: CAD, Mime, Parser, Prt, Tika > Attachments: TikaTest.prt > > Original Estimate: 672h > Remaining Estimate: 672h > > It would be nice if Tika had support for prt CAD files. > A preliminary prt text extractor has been created. > Any assistance further developing this code is appreciated. 
> {code:title=PRTParser.java|borderStyle=solid} > package org.apache.tika.parser.prt; > import java.io.BufferedInputStream; > import java.io.BufferedReader; > import java.io.IOException; > import java.io.InputStream; > import java.io.InputStreamReader; > import java.io.Reader; > import java.io.UnsupportedEncodingException; > import java.nio.charset.Charset; > import java.util.Collections; > import java.util.Set; > import org.apache.poi.util.IOUtils; > import org.apache.tika.exception.TikaException; > import org.apache.tika.metadata.Metadata; > import org.apache.tika.mime.MediaType; > import org.apache.tika.parser.ParseContext; > import org.apache.tika.parser.Parser; > import org.apache.tika.sax.XHTMLContentHandler; > import org.xml.sax.ContentHandler; > import org.xml.sax.SAXException; > /** > * Description: PRT (CAD Drawing) parser. This is a very basic parser. > * Searches for specific byte prefix, and outputs text from note entities > * Does not support special DRAFT-PAK characters. > */ > public class PRTParser implements Parser { > private static final Set<MediaType> SUPPORTED_TYPES = > Collections.singleton(MediaType.application("prt")); > public static final String PRT_MIME_TYPE = "application/prt"; > > public Set<MediaType> getSupportedTypes(ParseContext context) { > return SUPPORTED_TYPES; > } > > public void parse( > InputStream stream, ContentHandler handler, > Metadata metadata, ParseContext context) > throws IOException, SAXException, TikaException { > XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, > metadata); > int[] prefix = new int[] {227, 63}; > //Looking for a prefix set of bytes {E3, 3F} > int pos = 0; > > int read; > while( (read = stream.read()) > -1) { > // stream.read() moves to the next byte, and returns an integer value > of the byte. a value of -1 signals the EOF > if(read == prefix[pos]) { > // is the last byte read the same as the > first byte in the prefix? 
> pos++; > > if(pos == prefix.length) { > > stream.skip(11); > // skip the 13 bytes > of the prefix which can vary. > int length = stream.read(); > // Set the next byte equal to > the length of text in the user input field, see PRT schema > stream.skip(1); > > byte[] text = new byte[length]; > // a new byte array called text is > created. It should contain an array of integer values of the user inputted > text. > IOUtils.readFully(stream, text); > > String str = new String(text, 0, > text.length, "UTF-8"); // turn it into a string, but does not remove null > termination, assumes it's found to be utf-8 > xhtml.startElement("p"); > xhtml.characters(str); > xhtml.endElement("p"); > pos--; > } > } > else { > //Did not find the prefix. Reset the position > counter. > pos = 0; > } > } > } > > /** > * @deprecated This method will be removed in Apache Tika 1.0. > */ > public void parse( > InputStream stream, ContentHandler handler, Metadata > metadata) > throws IOException, SAXException, TikaException { > parse(stream, handler, metadata, new ParseContext()); > } > }{code} > -- This message is automatically generated by JIRA. For more information on JIRA, see: http://www.atlassian.com/software/jira