Author: lehmi Date: Mon Mar 21 18:57:59 2011 New Revision: 1083901 URL: http://svn.apache.org/viewvc?rev=1083901&view=rev Log: PDFBOX-966: escape characters as proposed by Shinya Kasatani
Added: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java?rev=1083901&r1=1083900&r2=1083901&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java Mon Mar 21 18:57:59 2011 @@ -67,7 +67,7 @@ public class PDFText2HTML extends PDFTex buf.append("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"" + "\n" + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"); buf.append("<html><head>"); - buf.append("<title>" + getTitle() + "</title>\n"); + buf.append("<title>" + escape(getTitle()) + "</title>\n"); if(outputEncoding != null) { buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" @@ -188,6 +188,18 @@ public class PDFText2HTML extends PDFTex */ protected void writeString(String chars) throws IOException { + super.writeString(escape(chars)); + } + + /** + * Escape some HTML characters. + * + * @param chars String to be escaped + * @return returns escaped String. 
+ */ + private String escape(String chars) + { + StringBuilder builder = new StringBuilder(chars.length()); for (int i = 0; i < chars.length(); i++) { char c = chars.charAt(i); @@ -195,28 +207,29 @@ public class PDFText2HTML extends PDFTex if ((c < 32) || (c > 126)) { int charAsInt = c; - super.writeString("&#" + charAsInt + ";"); + builder.append("&#").append(charAsInt).append(";"); } else { switch (c) { case 34: - super.writeString("&quot;"); + builder.append("&quot;"); break; case 38: - super.writeString("&amp;"); + builder.append("&amp;"); break; case 60: - super.writeString("&lt;"); + builder.append("&lt;"); break; case 62: - super.writeString("&gt;"); + builder.append("&gt;"); break; default: - super.writeString(String.valueOf(c)); + builder.append(String.valueOf(c)); } } } + return builder.toString(); } } Added: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java?rev=1083901&view=auto ============================================================================== --- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java (added) +++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java Mon Mar 21 18:57:59 2011 @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.pdfbox.util; + +import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.edit.PDPageContentStream; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.font.PDType1Font; + +import junit.framework.TestCase; + +public class TestPDFText2HTML extends TestCase { + + private PDDocument createDocument() throws IOException { + PDDocument doc = new PDDocument(); + PDPage page = new PDPage(); + doc.addPage(page); + PDFont font = PDType1Font.HELVETICA; + PDPageContentStream contentStream = new PDPageContentStream(doc, page); + contentStream.beginText(); + contentStream.setFont(font, 12); + contentStream.moveTextPositionByAmount(100, 700); + contentStream.drawString("<foo>"); + contentStream.endText(); + contentStream.close(); + return doc; + } + + public void testEscapeTitle() throws IOException { + PDFTextStripper stripper = new PDFText2HTML("UTF-8"); + PDDocument doc = createDocument(); + doc.getDocumentInformation().setTitle("<script>\u3042"); + String text = stripper.getText(doc); + + Matcher m = Pattern.compile("<title>(.*?)</title>").matcher(text); + assertTrue(m.find()); + assertEquals("&lt;script&gt;&#12354;", m.group(1)); + + assertTrue(text.indexOf("&lt;foo&gt;") >= 0); + + } +}