Kaleb Akalework created TIKA-2080: ------------------------------------- Summary: PDFParser tika-parsers-1.13.jar not parsing Japanese and Chinese Characters correctly Key: TIKA-2080 URL: https://issues.apache.org/jira/browse/TIKA-2080 Project: Tika Issue Type: Bug Components: parser Affects Versions: 1.13 Environment: Windows 8.1, jdk1.8.0_102 Reporter: Kaleb Akalework
I'm trying to use tika to parse PDF files that contain Japanese and Chinese characters, but for some reason it does not parse it correctly. Every character that is extracted is changed to the first letter in the line. For example, if the document contains 早上好, the extracted text will correctly know that it has 3 characters but all 3 characters will be 早早早; the last two characters are replaced by the first character. This same string is correctly parsed in a Word document. The following is what I am using as Java sample code (Don't forget to change the filename) package kaleb; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.StringWriter; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.ReaderInputStream; import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.pdf.PDFParser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; import org.apache.tika.parser.pdf.PDFParser; import org.xml.sax.SAXException; public class TestTika { /** character limit */ private static int parserCharLimit = 10 * 1024 * 1024; public static int getParserCharLimit() { return parserCharLimit; } public static void setParserCharLimit(int l) { parserCharLimit = l; } private static StringBuilder sb = null; private static ContentHandlerDecorator handler = new ContentHandlerDecorator() { private void ensureLimit() throws SAXException { if (sb.length() > parserCharLimit) { throw new 
MaxContentExceededException( "Your document contained more than " +parserCharLimit+" characters: "+sb.length()); } } @Override public void characters(char[] ch, int start, int length) throws SAXException { if (length == 5) length *=2; sb.append(ch, start, length ); ensureLimit(); } @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { if (sb.length() > 0) sb.append(ch, start, length); ensureLimit(); } }; public static class MaxContentExceededException extends SAXException { public MaxContentExceededException() { super(); } public MaxContentExceededException(Exception e) { super(e); } public MaxContentExceededException(String message, Exception e) { super(message, e); } public MaxContentExceededException(String message) {super(message);} } public static void myTika() throws Exception{ TikaConfig tikaConfig = null; try{ InputStream stream = new FileInputStream(new File(("C:\\Users\\kaleba\\workspace\\TestingStuff\\src\\kaleb\\tika-config.xml"))); try { tikaConfig = new TikaConfig(stream); } catch (IOException | SAXException | TikaException e) { tikaConfig = TikaConfig.getDefaultConfig(); } finally { try { stream.close(); } catch (IOException e) { } } }catch(Exception e){} /** default Tika detector */ Detector tikaDetector = tikaConfig.getDetector(); /** default Tika parser */ CompositeParser tikaParser = new CompositeParser(tikaConfig.getMediaTypeRegistry(), tikaConfig.getParser()); TemporaryResources tmp = new TemporaryResources(); InputStream stream = new FileInputStream(new File("C:\\Users\\kaleba\\Desktop\\Chin.docx")); TikaInputStream tis = TikaInputStream.get(stream, tmp); String type =""; // TODO: TIKA-216: Zip bomb prevention: use SecureContentHandler instead?? Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); context.set(org.apache.tika.parser.Parser.class, tikaParser); try { // TODO: limit by content type to reduce dependencies? 
// https://tika.apache.org/1.10/parser_guide.html type = tikaDetector.detect(tis, metadata).toString(); metadata.set(Metadata.CONTENT_TYPE, type); } catch(Exception e){} sb = new StringBuilder(); tikaParser.parse(tis, handler, metadata, context); String s = sb.toString(); int i= 1; } public static void main(String[] args) { // TODO Auto-generated method stub /*try{ File initialFile = new File("C:\\Users\\kaleba\\Desktop\\UnicodeTest.pdf"); InputStream targetStream = new FileInputStream(initialFile); String s = parse(targetStream,null, null); int i=1; } catch (Exception e){}*/ /* TestTika tk = new TestTika(); tk.setFilePath("C:\\Users\\kaleba\\Desktop\\Rus3.pdf"); try{ System.out.println(tk.ToText()); } catch(Exception e){}*/ try{ myTika(); } catch (Exception e){ System.out.print(e.getMessage()); } } } -- This message was sent by Atlassian JIRA (v6.3.4#6332)