[ https://issues.apache.org/jira/browse/TIKA-2080?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15494056#comment-15494056 ]
Kaleb Akalework edited comment on TIKA-2080 at 9/15/16 5:45 PM: ---------------------------------------------------------------- Thanks. I still see the problem with the new PDFBox2.0.3 too. I have attached the code I'm using. I put a break point and saw that the Text variable contained the first parsed character repeatedly. I can also provide the PDF file I'm using if needed. import java.io.File; import java.io.IOException; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.io.RandomAccessFile; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; public class PDFBoxTesting { private static PDFParser parser; private static PDFTextStripper pdfStripper; private static PDDocument pdDoc ; private static COSDocument cosDoc ; private static String Text ; private static String filePath; private static File file; public static String ToText() throws IOException { pdfStripper = null; pdDoc = null; cosDoc = null; filePath = "C:\\Users\\kaleba\\Desktop\\nihao2.pdf"; file = new File(filePath); parser = new PDFParser(new RandomAccessFile(file,"r")); // update for PDFBox V 2.0 parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); pdfStripper.setEndPage(10); // reading text from page 1 to 10 // if you want to get text from full pdf file use this code // pdfStripper.setEndPage(pdDoc.getNumberOfPages()); Text = pdfStripper.getText(pdDoc); // put breakpoint after executing getTtext. return Text; } public static void main(String[] args) { // TODO Auto-generated method stub try{ ToText(); } catch (Exception e){ int i=1; } } } was (Author: kalebakale): Thanks. I still see the problem with the new PDFBox2.0.3 too. I have attached the code I'm using. I put a break point and saw that the Text variable contained the first parsed character repeatedly. 
import java.io.File; import java.io.IOException; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.io.RandomAccessFile; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; public class PDFBoxTesting { private static PDFParser parser; private static PDFTextStripper pdfStripper; private static PDDocument pdDoc ; private static COSDocument cosDoc ; private static String Text ; private static String filePath; private static File file; public static String ToText() throws IOException { pdfStripper = null; pdDoc = null; cosDoc = null; filePath = "C:\\Users\\kaleba\\Desktop\\nihao2.pdf"; file = new File(filePath); parser = new PDFParser(new RandomAccessFile(file,"r")); // update for PDFBox V 2.0 parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdDoc.getNumberOfPages(); pdfStripper.setStartPage(1); pdfStripper.setEndPage(10); // reading text from page 1 to 10 // if you want to get text from full pdf file use this code // pdfStripper.setEndPage(pdDoc.getNumberOfPages()); Text = pdfStripper.getText(pdDoc); // put breakpoint after executing getTtext. return Text; } public static void main(String[] args) { // TODO Auto-generated method stub try{ ToText(); } catch (Exception e){ int i=1; } } } > PDFParser tika-parsers-1.13.jar not parsing Japanese and Chinese Characters > correctly > ------------------------------------------------------------------------------------- > > Key: TIKA-2080 > URL: https://issues.apache.org/jira/browse/TIKA-2080 > Project: Tika > Issue Type: Bug > Components: parser > Affects Versions: 1.13 > Environment: Windows 8.1, jdk1.8.0_102 > Reporter: Kaleb Akalework > > I'm trying to use tika to parse PDF files that contain Japanese and Chinese > characters, but for some reason it does not parse them correctly. Every character > that is extracted is changed to the first letter in the line. 
For example, if > the document contains 早上好, the extracted text will correctly report that > it has 3 characters, but all 3 characters will be 早早早; the last two characters > are replaced by the first character. This same string is correctly parsed > in a Word document. The following is what I am using as Java sample code > (Don't forget to change the filename) > package kaleb; > import java.io.BufferedReader; > import java.io.File; > import java.io.FileInputStream; > import java.io.IOException; > import java.io.InputStream; > import java.io.InputStreamReader; > import java.io.StringWriter; > import java.nio.charset.Charset; > import java.nio.charset.CharsetEncoder; > import org.apache.commons.io.IOUtils; > import org.apache.commons.io.input.ReaderInputStream; > import org.apache.tika.config.TikaConfig; > import org.apache.tika.detect.Detector; > import org.apache.tika.exception.TikaException; > import org.apache.tika.io.TemporaryResources; > import org.apache.tika.io.TikaInputStream; > import org.apache.tika.metadata.Metadata; > import org.apache.tika.parser.CompositeParser; > import org.apache.tika.parser.ParseContext; > import org.apache.tika.parser.pdf.PDFParser; > import org.apache.tika.sax.BodyContentHandler; > import org.apache.tika.sax.ContentHandlerDecorator; > import org.apache.tika.parser.pdf.PDFParser; > import org.xml.sax.SAXException; > public class TestTika { > > > > /** character limit */ > private static int parserCharLimit = 10 * 1024 * 1024; > public static int getParserCharLimit() { > return parserCharLimit; > } > public static void setParserCharLimit(int l) { > parserCharLimit = l; > } > private static StringBuilder sb = null; > > private static ContentHandlerDecorator handler = new > ContentHandlerDecorator() { > private void ensureLimit() throws SAXException { > if (sb.length() > parserCharLimit) { > throw new MaxContentExceededException( > "Your document contained more than " > +parserCharLimit+" characters: "+sb.length()); > } > } > 
@Override > public void characters(char[] ch, int start, int length) throws > SAXException { > if (length == 5) > length *=2; > sb.append(ch, start, length ); > ensureLimit(); > } > @Override > public void ignorableWhitespace(char[] ch, int start, > int length) throws SAXException { > if (sb.length() > 0) > sb.append(ch, start, length); > ensureLimit(); > } > }; > > public static class MaxContentExceededException extends SAXException { > public MaxContentExceededException() { super(); } > public MaxContentExceededException(Exception e) { super(e); } > public MaxContentExceededException(String message, Exception e) { > super(message, e); } > public MaxContentExceededException(String message) {super(message);} > } > > public static void myTika() throws Exception{ > TikaConfig tikaConfig = null; > > try{ > > InputStream stream = new FileInputStream(new > File(("C:\\Users\\kaleba\\workspace\\TestingStuff\\src\\kaleb\\tika-config.xml"))); > try { > tikaConfig = new TikaConfig(stream); > } catch (IOException | SAXException | TikaException e) { > tikaConfig = TikaConfig.getDefaultConfig(); > } finally { > try { stream.close(); } catch (IOException e) { } > } > }catch(Exception e){} > > /** default Tika detector */ > Detector tikaDetector = tikaConfig.getDetector(); > /** default Tika parser */ > CompositeParser tikaParser = new > CompositeParser(tikaConfig.getMediaTypeRegistry(), tikaConfig.getParser()); > TemporaryResources tmp = new TemporaryResources(); > InputStream stream = new FileInputStream(new > File("C:\\Users\\kaleba\\Desktop\\Chin.docx")); > > TikaInputStream tis = TikaInputStream.get(stream, tmp); > > String type =""; > // TODO: TIKA-216: Zip bomb prevention: use SecureContentHandler > instead?? > Metadata metadata = new Metadata(); > ParseContext context = new ParseContext(); > context.set(org.apache.tika.parser.Parser.class, tikaParser); > try { > // TODO: limit by content type to reduce dependencies? 
> // https://tika.apache.org/1.10/parser_guide.html > > type = tikaDetector.detect(tis, metadata).toString(); > metadata.set(Metadata.CONTENT_TYPE, type); > } > catch(Exception e){} > sb = new StringBuilder(); > tikaParser.parse(tis, handler, metadata, context); > > > > String s = sb.toString(); > > int i= 1; > } > public static void main(String[] args) { > // TODO Auto-generated method stub > > /*try{ > File initialFile = new > File("C:\\Users\\kaleba\\Desktop\\UnicodeTest.pdf"); > InputStream targetStream = new FileInputStream(initialFile); > String s = parse(targetStream,null, null); > int i=1; > } > catch (Exception e){}*/ > /* TestTika tk = new TestTika(); > tk.setFilePath("C:\\Users\\kaleba\\Desktop\\Rus3.pdf"); > try{ > System.out.println(tk.ToText()); > } > catch(Exception e){}*/ > try{ > myTika(); > } > catch (Exception e){ > System.out.print(e.getMessage()); > } > } > } -- This message was sent by Atlassian JIRA (v6.3.4#6332)