[ https://issues.apache.org/jira/browse/TIKA-2080?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15494305#comment-15494305 ]
Kaleb Akalework commented on TIKA-2080:
---------------------------------------

Opened a ticket at PDFBox under Tim Allison's advice.

> PDFParser tika-parsers-1.13.jar not parsing Japanese and Chinese Characters correctly
> --------------------------------------------------------------------------------------
>
>                 Key: TIKA-2080
>                 URL: https://issues.apache.org/jira/browse/TIKA-2080
>             Project: Tika
>          Issue Type: Bug
>          Components: parser
>    Affects Versions: 1.13
>         Environment: Windows 8.1, jdk1.8.0_102
>            Reporter: Kaleb Akalework
>         Attachments: nihao2.pdf
>
>
> I'm trying to use Tika to parse PDF files that contain Japanese and Chinese characters,
> but for some reason it does not parse them correctly. Every character that is extracted
> is changed to the first letter of its line. For example, if the document contains 早上好,
> the extracted text correctly has 3 characters, but it comes out as 早早早: the last two
> characters are replaced by the first one. The same string is parsed correctly from a
> Word document. The following is the Java sample code I am using (don't forget to change
> the filename):
>
> package kaleb;
>
> import java.io.BufferedReader;
> import java.io.File;
> import java.io.FileInputStream;
> import java.io.IOException;
> import java.io.InputStream;
> import java.io.InputStreamReader;
> import java.io.StringWriter;
> import java.nio.charset.Charset;
> import java.nio.charset.CharsetEncoder;
>
> import org.apache.commons.io.IOUtils;
> import org.apache.commons.io.input.ReaderInputStream;
> import org.apache.tika.config.TikaConfig;
> import org.apache.tika.detect.Detector;
> import org.apache.tika.exception.TikaException;
> import org.apache.tika.io.TemporaryResources;
> import org.apache.tika.io.TikaInputStream;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.parser.CompositeParser;
> import org.apache.tika.parser.ParseContext;
> import org.apache.tika.parser.pdf.PDFParser;
> import org.apache.tika.sax.BodyContentHandler;
> import org.apache.tika.sax.ContentHandlerDecorator;
> import org.xml.sax.SAXException;
>
> public class TestTika {
>
>     /** character limit */
>     private static int parserCharLimit = 10 * 1024 * 1024;
>
>     public static int getParserCharLimit() {
>         return parserCharLimit;
>     }
>
>     public static void setParserCharLimit(int l) {
>         parserCharLimit = l;
>     }
>
>     private static StringBuilder sb = null;
>
>     private static ContentHandlerDecorator handler = new ContentHandlerDecorator() {
>
>         private void ensureLimit() throws SAXException {
>             if (sb.length() > parserCharLimit) {
>                 throw new MaxContentExceededException(
>                         "Your document contained more than "
>                         + parserCharLimit + " characters: " + sb.length());
>             }
>         }
>
>         @Override
>         public void characters(char[] ch, int start, int length) throws SAXException {
>             sb.append(ch, start, length);
>             ensureLimit();
>         }
>
>         @Override
>         public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
>             if (sb.length() > 0)
>                 sb.append(ch, start, length);
>             ensureLimit();
>         }
>     };
>
>     public static class MaxContentExceededException extends SAXException {
>         public MaxContentExceededException() { super(); }
>         public MaxContentExceededException(Exception e) { super(e); }
>         public MaxContentExceededException(String message, Exception e) { super(message, e); }
>         public MaxContentExceededException(String message) { super(message); }
>     }
>
>     public static void myTika() throws Exception {
>         TikaConfig tikaConfig = null;
>
>         try {
>             InputStream stream = new FileInputStream(new File(
>                     "C:\\Users\\kaleba\\workspace\\TestingStuff\\src\\kaleb\\tika-config.xml"));
>             try {
>                 tikaConfig = new TikaConfig(stream);
>             } catch (IOException | SAXException | TikaException e) {
>                 tikaConfig = TikaConfig.getDefaultConfig();
>             } finally {
>                 try { stream.close(); } catch (IOException e) { }
>             }
>         } catch (Exception e) { }
>
>         /** default Tika detector */
>         Detector tikaDetector = tikaConfig.getDetector();
>         /** default Tika parser */
>         CompositeParser tikaParser = new CompositeParser(
>                 tikaConfig.getMediaTypeRegistry(), tikaConfig.getParser());
>
>         TemporaryResources tmp = new TemporaryResources();
>         InputStream stream = new FileInputStream(new File("C:\\Users\\kaleba\\Desktop\\Chin.docx"));
>         TikaInputStream tis = TikaInputStream.get(stream, tmp);
>
>         String type = "";
>         // TODO: TIKA-216: Zip bomb prevention: use SecureContentHandler instead??
>         Metadata metadata = new Metadata();
>         ParseContext context = new ParseContext();
>         context.set(org.apache.tika.parser.Parser.class, tikaParser);
>         try {
>             // TODO: limit by content type to reduce dependencies?
>             // https://tika.apache.org/1.10/parser_guide.html
>             type = tikaDetector.detect(tis, metadata).toString();
>             metadata.set(Metadata.CONTENT_TYPE, type);
>         } catch (Exception e) { }
>
>         sb = new StringBuilder();
>         tikaParser.parse(tis, handler, metadata, context);
>
>         String s = sb.toString();
>         int i = 1;
>     }
>
>     public static void main(String[] args) {
>         // TODO Auto-generated method stub
>
>         /*try{
>             File initialFile = new File("C:\\Users\\kaleba\\Desktop\\UnicodeTest.pdf");
>             InputStream targetStream = new FileInputStream(initialFile);
>             String s = parse(targetStream, null, null);
>             int i = 1;
>         }
>         catch (Exception e){}*/
>
>         /* TestTika tk = new TestTika();
>         tk.setFilePath("C:\\Users\\kaleba\\Desktop\\Rus3.pdf");
>         try{
>             System.out.println(tk.ToText());
>         }
>         catch(Exception e){}*/
>
>         try {
>             myTika();
>         } catch (Exception e) {
>             System.out.print(e.getMessage());
>         }
>     }
> }

--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
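Since the follow-up ticket was opened against PDFBox (the library Tika's PDFParser delegates to), one way to narrow the problem down is to run the attached nihao2.pdf through PDFBox's own text stripper, bypassing Tika entirely. The sketch below is written against the PDFBox 2.x API; the class name and file path are placeholders, not part of the original report.

    import java.io.File;

    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.text.PDFTextStripper;

    public class PdfBoxCheck {
        public static void main(String[] args) throws Exception {
            // Load a local copy of the attached test file; adjust the path as needed.
            try (PDDocument doc = PDDocument.load(new File("nihao2.pdf"))) {
                PDFTextStripper stripper = new PDFTextStripper();
                // If every character still comes back as its line's first character
                // (早早早 instead of 早上好), the problem is in PDFBox's extraction
                // rather than in Tika's PDFParser wrapper.
                System.out.println(stripper.getText(doc));
            }
        }
    }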
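The extraction itself can also be reproduced without the custom ContentHandlerDecorator in the sample above, using Tika's AutoDetectParser facade. This is only a minimal sketch; the class name and path are illustrative.

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.AutoDetectParser;
    import org.apache.tika.sax.BodyContentHandler;

    public class MinimalRepro {
        public static void main(String[] args) throws Exception {
            // Open a local copy of the attached test file.
            try (InputStream in = Files.newInputStream(Paths.get("nihao2.pdf"))) {
                AutoDetectParser parser = new AutoDetectParser();
                // -1 disables BodyContentHandler's default character write limit.
                BodyContentHandler handler = new BodyContentHandler(-1);
                Metadata metadata = new Metadata();
                parser.parse(in, handler, metadata);
                // Prints the extracted text so the repeated-character symptom is visible.
                System.out.println(handler.toString());
            }
        }
    }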