Kaleb Akalework created TIKA-2080: ------------------------------------- Summary: PDFParser tika-parsers-1.13.jar not parsing Japanese and Chinese Characters correctly Key: TIKA-2080 URL: https://issues.apache.org/jira/browse/TIKA-2080 Project: Tika Issue Type: Bug Components: parser Affects Versions: 1.13 Environment: Windows 8.1, jdk1.8.0_102 Reporter: Kaleb Akalework
I'm trying to use tika to parse PDF files that contain Japanese and Chinese characters, but for some reason it does not parse it correctly. Every character that is extracted is changed to the first letter in the line. For example, if the document contains 早上好, the extracted text will correctly know that it has 3 characters but all 3 characters will be 早早早; the last two characters are replaced by the first character. This same string is correctly parsed in a Word document. The following is what I am using as Java sample code (Don't forget to change the filename) package kaleb; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.StringWriter; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.ReaderInputStream; import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.pdf.PDFParser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerDecorator; import org.apache.tika.parser.pdf.PDFParser; import org.xml.sax.SAXException; public class TestTika { /** character limit */ private static int parserCharLimit = 10 * 1024 * 1024; public static int getParserCharLimit() { return parserCharLimit; } public static void setParserCharLimit(int l) { parserCharLimit = l; } private static StringBuilder sb = null; private static ContentHandlerDecorator handler = new ContentHandlerDecorator() { private void ensureLimit() throws SAXException { if (sb.length() > parserCharLimit) { throw new 
MaxContentExceededException( "Your document contained more than " +parserCharLimit+" characters: "+sb.length()); } } @Override public void characters(char[] ch, int start, int length) throws SAXException { if (length == 5) length *=2; sb.append(ch, start, length ); ensureLimit(); } @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { if (sb.length() > 0) sb.append(ch, start, length); ensureLimit(); } }; public static class MaxContentExceededException extends SAXException { public MaxContentExceededException() { super(); } public MaxContentExceededException(Exception e) { super(e); } public MaxContentExceededException(String message, Exception e) { super(message, e); } public MaxContentExceededException(String message) {super(message);} } public static void myTika() throws Exception{ TikaConfig tikaConfig = null; try{ InputStream stream = new FileInputStream(new File(("C:\\Users\\kaleba\\workspace\\TestingStuff\\src\\kaleb\\tika-config.xml"))); try { tikaConfig = new TikaConfig(stream); } catch (IOException | SAXException | TikaException e) { tikaConfig = TikaConfig.getDefaultConfig(); } finally { try { stream.close(); } catch (IOException e) { } } }catch(Exception e){} /** default Tika detector */ Detector tikaDetector = tikaConfig.getDetector(); /** default Tika parser */ CompositeParser tikaParser = new CompositeParser(tikaConfig.getMediaTypeRegistry(), tikaConfig.getParser()); TemporaryResources tmp = new TemporaryResources(); InputStream stream = new FileInputStream(new File("C:\\Users\\kaleba\\Desktop\\Chin.docx")); TikaInputStream tis = TikaInputStream.get(stream, tmp); String type =""; // TODO: TIKA-216: Zip bomb prevention: use SecureContentHandler instead?? Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); context.set(org.apache.tika.parser.Parser.class, tikaParser); try { // TODO: limit by content type to reduce dependencies? 
// https://tika.apache.org/1.10/parser_guide.html type = tikaDetector.detect(tis, metadata).toString(); metadata.set(Metadata.CONTENT_TYPE, type); } catch(Exception e){} sb = new StringBuilder(); tikaParser.parse(tis, handler, metadata, context); String s = sb.toString(); int i= 1; } public static void main(String[] args) { // TODO Auto-generated method stub /*try{ File initialFile = new File("C:\\Users\\kaleba\\Desktop\\UnicodeTest.pdf"); InputStream targetStream = new FileInputStream(initialFile); String s = parse(targetStream,null, null); int i=1; } catch (Exception e){}*/ /* TestTika tk = new TestTika(); tk.setFilePath("C:\\Users\\kaleba\\Desktop\\Rus3.pdf"); try{ System.out.println(tk.ToText()); } catch(Exception e){}*/ try{ myTika(); } catch (Exception e){ System.out.print(e.getMessage()); } } } -- This message was sent by Atlassian JIRA (v6.3.4#6332)