[jira] [Comment Edited] (TIKA-2080) PDFParser tika-parsers-1.13.jar not parsing Japanese and Chinese Characters correctly

Kaleb Akalework (JIRA) Thu, 15 Sep 2016 10:46:40 -0700

    [ 
https://issues.apache.org/jira/browse/TIKA-2080?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15494056#comment-15494056
 ]


Kaleb Akalework edited comment on TIKA-2080 at 9/15/16 5:45 PM:
----------------------------------------------------------------

Thanks. I still see the problem with the new PDFBox 2.0.3 too. I have included 
the code I'm using below. I put a breakpoint after the text extraction call and 
saw that the extracted text contained the first parsed character repeated over 
and over. I can also provide the PDF file I'm using if needed.

import java.io.File;

import java.io.IOException;


import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;

import org.apache.pdfbox.text.PDFTextStripper;


public class PDFBoxTesting {

    
     private static PDFParser parser;
    private static PDFTextStripper pdfStripper;
    private static PDDocument pdDoc ;
    private static COSDocument cosDoc ;
    
    private static String Text ;
    private static String filePath;
    private static File file;

    public static String ToText() throws IOException
    {
        pdfStripper = null;
        pdDoc = null;
        cosDoc = null;
        filePath = "C:\\Users\\kaleba\\Desktop\\nihao2.pdf";
        file = new File(filePath);
        parser = new PDFParser(new RandomAccessFile(file,"r")); // update for 
PDFBox V 2.0
        
        parser.parse();
        
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        pdDoc.getNumberOfPages();
        
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(10);
        
        // reading text from page 1 to 10
        // if you want to get text from full pdf file use this code
        // pdfStripper.setEndPage(pdDoc.getNumberOfPages());
      
        Text = pdfStripper.getText(pdDoc);  // put breakpoint after executing 
getTtext.
        
        return Text;
    }
    public static void main(String[] args) {
        // TODO Auto-generated method stub
        try{
          ToText();
        }
        catch (Exception e){
            int i=1;
        }

    }

}


was (Author: kalebakale):
Thanks. I still see the problem with the new PDFBox 2.0.3 too. I have included 
the code I'm using below. I put a breakpoint after the text extraction call and 
saw that the extracted text contained the first parsed character repeated over 
and over. 

import java.io.File;

import java.io.IOException;


import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;

import org.apache.pdfbox.text.PDFTextStripper;


public class PDFBoxTesting {

    
     private static PDFParser parser;
    private static PDFTextStripper pdfStripper;
    private static PDDocument pdDoc ;
    private static COSDocument cosDoc ;
    
    private static String Text ;
    private static String filePath;
    private static File file;

    public static String ToText() throws IOException
    {
        pdfStripper = null;
        pdDoc = null;
        cosDoc = null;
        filePath = "C:\\Users\\kaleba\\Desktop\\nihao2.pdf";
        file = new File(filePath);
        parser = new PDFParser(new RandomAccessFile(file,"r")); // update for 
PDFBox V 2.0
        
        parser.parse();
        
        cosDoc = parser.getDocument();
        pdfStripper = new PDFTextStripper();
        pdDoc = new PDDocument(cosDoc);
        pdDoc.getNumberOfPages();
        
        pdfStripper.setStartPage(1);
        pdfStripper.setEndPage(10);
        
        // reading text from page 1 to 10
        // if you want to get text from full pdf file use this code
        // pdfStripper.setEndPage(pdDoc.getNumberOfPages());
      
        Text = pdfStripper.getText(pdDoc);  // put breakpoint after executing 
getTtext.
        
        return Text;
    }
    public static void main(String[] args) {
        // TODO Auto-generated method stub
        try{
          ToText();
        }
        catch (Exception e){
            int i=1;
        }

    }

}

> PDFParser tika-parsers-1.13.jar not parsing Japanese and Chinese Characters 
> correctly
> -------------------------------------------------------------------------------------
>
>                 Key: TIKA-2080
>                 URL: https://issues.apache.org/jira/browse/TIKA-2080
>             Project: Tika
>          Issue Type: Bug
>          Components: parser
>    Affects Versions: 1.13
>         Environment: Windows 8.1, jdk1.8.0_102
>            Reporter: Kaleb Akalework
>
> I'm trying to use Tika to parse PDF files that contain Japanese and Chinese 
> characters, but for some reason it does not parse them correctly. Every 
> character that is extracted is replaced by the first character in the line. 
> For example, if the document contains 早上好, the extracted text will correctly 
> have 3 characters, but all 3 characters will be 早早早: the last two characters 
> are replaced by the first character. The same string is parsed correctly 
> from a Word document. The following is the Java sample code I am using 
> (don't forget to change the filename):
> package kaleb;
> import java.io.BufferedReader;
> import java.io.File;
> import java.io.FileInputStream;
> import java.io.IOException;
> import java.io.InputStream;
> import java.io.InputStreamReader;
> import java.io.StringWriter;
> import java.nio.charset.Charset;
> import java.nio.charset.CharsetEncoder;
> import org.apache.commons.io.IOUtils;
> import org.apache.commons.io.input.ReaderInputStream;
> import org.apache.tika.config.TikaConfig;
> import org.apache.tika.detect.Detector;
> import org.apache.tika.exception.TikaException;
> import org.apache.tika.io.TemporaryResources;
> import org.apache.tika.io.TikaInputStream;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.parser.CompositeParser;
> import org.apache.tika.parser.ParseContext;
> import org.apache.tika.parser.pdf.PDFParser;
> import org.apache.tika.sax.BodyContentHandler;
> import org.apache.tika.sax.ContentHandlerDecorator;
> import org.apache.tika.parser.pdf.PDFParser;
> import org.xml.sax.SAXException;
> /**
>  * Reproduction harness for TIKA-2080: runs a document through Tika and collects
>  * the extracted characters in a buffer so they can be inspected in a debugger.
>  */
> public class TestTika {
>
>     /** Maximum number of characters the handler accepts before aborting the parse. */
>     private static int parserCharLimit = 10 * 1024 * 1024;
>
>     /** @return the current character limit */
>     public static int getParserCharLimit() {
>         return parserCharLimit;
>     }
>
>     /** @param l the new character limit */
>     public static void setParserCharLimit(int l) {
>         parserCharLimit = l;
>     }
>
>     /** Buffer the handler appends to; myTika() installs a fresh one before parsing. */
>     private static StringBuilder sb = null;
>
>     /** SAX handler that copies character data into the buffer and enforces the limit. */
>     private static ContentHandlerDecorator handler = new ContentHandlerDecorator() {
>         /** Aborts the parse once the buffer has grown past the character limit. */
>         private void ensureLimit() throws SAXException {
>             if (sb.length() > parserCharLimit) {
>                 throw new MaxContentExceededException(
>                         "Your document contained more than "
>                         +parserCharLimit+" characters: "+sb.length());
>             }
>         }
>
>         @Override
>         public void characters(char[] ch, int start, int length) throws SAXException {
>             // Append exactly the range reported by the parser. Widening the range
>             // would copy characters that are not part of this event, or run past
>             // the end of the array.
>             sb.append(ch, start, length);
>             ensureLimit();
>         }
>
>         @Override
>         public void ignorableWhitespace(char[] ch, int start,
>                                         int length) throws SAXException {
>             // Leading whitespace is dropped: only append once there is content.
>             if (sb.length() > 0) {
>                 sb.append(ch, start, length);
>             }
>             ensureLimit();
>         }
>     };
>
>     /** Thrown by the handler when the extracted text exceeds the character limit. */
>     public static class MaxContentExceededException extends SAXException {
>         public MaxContentExceededException() { super(); }
>         public MaxContentExceededException(Exception e) { super(e); }
>         public MaxContentExceededException(String message, Exception e) {
>             super(message, e);
>         }
>         public MaxContentExceededException(String message) { super(message); }
>     }
>
>     /**
>      * Loads the custom Tika configuration, falling back to the default
>      * configuration when the file is missing or cannot be parsed, so that the
>      * caller never receives {@code null}.
>      */
>     private static TikaConfig loadConfig() {
>         File configFile = new File(
>                 "C:\\Users\\kaleba\\workspace\\TestingStuff\\src\\kaleb\\tika-config.xml");
>         try (InputStream in = new FileInputStream(configFile)) {
>             return new TikaConfig(in);
>         } catch (IOException | SAXException | TikaException e) {
>             return TikaConfig.getDefaultConfig();
>         }
>     }
>
>     /**
>      * Detects the type of the sample document, parses it with Tika and leaves
>      * the extracted text in the buffer for inspection.
>      *
>      * @throws Exception if the document cannot be opened or parsed
>      */
>     public static void myTika() throws Exception {
>         TikaConfig tikaConfig = loadConfig();
>
>         Detector tikaDetector = tikaConfig.getDetector();
>         CompositeParser tikaParser = new CompositeParser(
>                 tikaConfig.getMediaTypeRegistry(), tikaConfig.getParser());
>
>         File input = new File("C:\\Users\\kaleba\\Desktop\\Chin.docx");
>
>         // All three resources are closed in reverse order, even if parsing fails.
>         try (TemporaryResources tmp = new TemporaryResources();
>              InputStream stream = new FileInputStream(input);
>              TikaInputStream tis = TikaInputStream.get(stream, tmp)) {
>
>             // TODO: TIKA-216: Zip bomb prevention: use SecureContentHandler instead??
>             Metadata metadata = new Metadata();
>             ParseContext context = new ParseContext();
>             context.set(org.apache.tika.parser.Parser.class, tikaParser);
>
>             try {
>                 // TODO: limit by content type to reduce dependencies?
>                 // https://tika.apache.org/1.10/parser_guide.html
>                 String type = tikaDetector.detect(tis, metadata).toString();
>                 metadata.set(Metadata.CONTENT_TYPE, type);
>             } catch (Exception ignored) {
>                 // Type detection is best effort; parsing proceeds without the hint.
>             }
>
>             sb = new StringBuilder();
>             tikaParser.parse(tis, handler, metadata, context);
>
>             // Put a breakpoint after this line to inspect the extracted text.
>             String extracted = sb.toString();
>         }
>     }
>
>     /**
>      * Runs the reproduction once and prints the message of any failure.
>      *
>      * @param args unused
>      */
>     public static void main(String[] args) {
>         try {
>             myTika();
>         } catch (Exception e) {
>             System.out.print(e.getMessage());
>         }
>     }
> }



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

[jira] [Comment Edited] (TIKA-2080) PDFParser tika-parsers-1.13.jar not parsing Japanese and Chinese Characters correctly

Reply via email to