[ https://issues.apache.org/jira/browse/PDFBOX-5532?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Andreas Lehmkühler updated PDFBOX-5532: --------------------------------------- Description: Hello, I am reading a pdf document but in the COSString field non-ascii characters are being retrieved. What can be the motive? I am using version pdfbox-2.0.24.jar This would be an example of the pdf document parsed: {code} COSInt\{50} COSInt\{0} PDFOperator\{Td} COSString\{åÅÕãÁâ@} PDFOperator\{Tj} COSFloat\{770.18} COSInt\{0} PDFOperator\{Td} COSString\{×–Ž–©@} PDFOperator\{Tj} COSFloat\{520.21} COSInt\{0} {code} Function java: {code} public static PDDocument replaceText(PDDocument document, String searchString, String replacement) throws IOException { PDPageTree pages = document.getDocumentCatalog().getPages(); for (PDPage page : pages) { PDFStreamParser parser = new PDFStreamParser(page); parser.parse(); List tokens = parser.getTokens(); for (int j = 0; j < tokens.size(); j++) { Object next = tokens.get(j); if (next instanceof Operator) { Operator op = (Operator) next; if (op.getName().equals("Tj")) { COSString previous = (COSString) tokens.get(j - 1); String string = previous.getString(); System.out.println("previous:=" + string); if (string.equals(searchString)){ COSString sx = new COSString(replacement); previous.setValue(sx.getBytes()); } } } } // now that the tokens are updated we will replace the page content stream. PDStream updatedStream = new PDStream(document); OutputStream out = updatedStream.createOutputStream(); ContentStreamWriter tokenWriter = new ContentStreamWriter(out); tokenWriter.writeTokens(tokens); page.setContents(updatedStream); out.close(); } return document; } {code} was: Hello, I am reading a pdf document but in the COSString field non-ascii characters are being retrieved. What can be the motive? I am using version pdfbox-2.0.24.jar This would be an example of the pdf document parsed: COSInt\{50} COSInt\{0} PDFOperator\{Td} COSString\{åÅÕãÁâ@} PDFOperator\{Tj} COSFloat\{770.18} COSInt\{0} PDFOperator\{Td} COSString\{×–Ž–©@} PDFOperator\{Tj} COSFloat\{520.21} COSInt\{0} Function java: public static PDDocument replaceText(PDDocument document, String searchString, String replacement) throws IOException { PDPageTree pages = document.getDocumentCatalog().getPages(); for (PDPage page : pages) { PDFStreamParser parser = new PDFStreamParser(page); parser.parse(); List tokens = parser.getTokens(); for (int j = 0; j < tokens.size(); j++) { Object next = tokens.get(j); if (next instanceof Operator) { Operator op = (Operator) next; if (op.getName().equals("Tj")) { COSString previous = (COSString) tokens.get(j - 1); String string = previous.getString(); System.out.println("previous:=" + string); if (string.equals(searchString)){ COSString sx = new COSString(replacement); previous.setValue(sx.getBytes()); } } } } // now that the tokens are updated we will replace the page content stream. PDStream updatedStream = new PDStream(document); OutputStream out = updatedStream.createOutputStream(); ContentStreamWriter tokenWriter = new ContentStreamWriter(out); tokenWriter.writeTokens(tokens); page.setContents(updatedStream); out.close(); } return document; } > COSString field non-ascii characters > ------------------------------------ > > Key: PDFBOX-5532 > URL: https://issues.apache.org/jira/browse/PDFBOX-5532 > Project: PDFBox > Issue Type: Bug > Reporter: David > Priority: Major > > > Hello, > I am reading a pdf document but in the COSString field non-ascii characters > are being retrieved. What can be the motive? I am using version > pdfbox-2.0.24.jar > This would be an example of the pdf document parsed: > {code} > COSInt\{50} > COSInt\{0} > PDFOperator\{Td} > COSString\{åÅÕãÁâ@} > PDFOperator\{Tj} > COSFloat\{770.18} > COSInt\{0} > PDFOperator\{Td} > COSString\{×–Ž–©@} > PDFOperator\{Tj} > COSFloat\{520.21} > COSInt\{0} > {code} > Function java: > {code} > public static PDDocument replaceText(PDDocument document, String > searchString, String replacement) throws IOException { > > PDPageTree pages = document.getDocumentCatalog().getPages(); > for (PDPage page : pages) { > > PDFStreamParser parser = new PDFStreamParser(page); > parser.parse(); > List tokens = parser.getTokens(); > for (int j = 0; j < tokens.size(); j++) { > Object next = tokens.get(j); > > if (next instanceof Operator) { > Operator op = (Operator) next; > > if (op.getName().equals("Tj")) { > COSString previous = (COSString) > tokens.get(j - 1); > String string = previous.getString(); > System.out.println("previous:=" + string); > > > if (string.equals(searchString)){ > COSString sx = new > COSString(replacement); > previous.setValue(sx.getBytes()); > > } > } > } > } > // now that the tokens are updated we will replace the > page content stream. > PDStream updatedStream = new PDStream(document); > OutputStream out = updatedStream.createOutputStream(); > ContentStreamWriter tokenWriter = new > ContentStreamWriter(out); > tokenWriter.writeTokens(tokens); > page.setContents(updatedStream); > out.close(); > > > } > return document; > } > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@pdfbox.apache.org For additional commands, e-mail: dev-h...@pdfbox.apache.org