Author: lehmi
Date: Mon Mar 21 18:57:59 2011
New Revision: 1083901

URL: http://svn.apache.org/viewvc?rev=1083901&view=rev
Log:
PDFBOX-966: escape characters as proposed by Shinya Kasatani

Added:
    pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java
Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java?rev=1083901&r1=1083900&r2=1083901&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java Mon Mar 21 18:57:59 2011
@@ -67,7 +67,7 @@ public class PDFText2HTML extends PDFTex
         buf.append("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"" + "\n" 
                 + "\"http://www.w3.org/TR/html4/loose.dtd\">\n");
         buf.append("<html><head>");
-        buf.append("<title>" + getTitle() + "</title>\n");
+        buf.append("<title>" + escape(getTitle()) + "</title>\n");
         if(outputEncoding != null)
         {
             buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" 
@@ -188,6 +188,18 @@ public class PDFText2HTML extends PDFTex
      */
     protected void writeString(String chars) throws IOException 
     {
+        super.writeString(escape(chars));
+    }
+    
+    /**
+     * Escape some HTML characters.
+     *
+     * @param chars String to be escaped
+     * @return returns escaped String.
+     */
+    private String escape(String chars)
+    {
+       StringBuilder builder = new StringBuilder(chars.length());
         for (int i = 0; i < chars.length(); i++) 
         {
             char c = chars.charAt(i);
@@ -195,28 +207,29 @@ public class PDFText2HTML extends PDFTex
             if ((c < 32) || (c > 126)) 
             {
                 int charAsInt = c;
-                super.writeString("&#" + charAsInt + ";");
+                builder.append("&#").append(charAsInt).append(";");
             } 
             else 
             {
                 switch (c) 
                 {
                 case 34:
-                    super.writeString("&quot;");
+                    builder.append("&quot;");
                     break;
                 case 38:
-                    super.writeString("&amp;");
+                    builder.append("&amp;");
                     break;
                 case 60:
-                    super.writeString("&lt;");
+                    builder.append("&lt;");
                     break;
                 case 62:
-                    super.writeString("&gt;");
+                    builder.append("&gt;");
                     break;
                 default:
-                    super.writeString(String.valueOf(c));
+                    builder.append(String.valueOf(c));
                 }
             }
         }
+        return builder.toString();
     }
 }

Added: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java?rev=1083901&view=auto
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java (added)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java Mon Mar 21 18:57:59 2011
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.util;
+
+import java.io.IOException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.edit.PDPageContentStream;
+import org.apache.pdfbox.pdmodel.font.PDFont;
+import org.apache.pdfbox.pdmodel.font.PDType1Font;
+
+import junit.framework.TestCase;
+
+public class TestPDFText2HTML extends TestCase {
+
+    private PDDocument createDocument() throws IOException {
+        PDDocument doc = new PDDocument();
+        PDPage page = new PDPage();
+        doc.addPage(page);
+        PDFont font = PDType1Font.HELVETICA;
+        PDPageContentStream contentStream = new PDPageContentStream(doc, page);
+        contentStream.beginText();
+        contentStream.setFont(font, 12);
+        contentStream.moveTextPositionByAmount(100, 700);
+        contentStream.drawString("<foo>");
+        contentStream.endText();
+        contentStream.close();
+        return doc;
+    }
+
+    public void testEscapeTitle() throws IOException {
+        PDFTextStripper stripper = new PDFText2HTML("UTF-8");
+        PDDocument doc = createDocument();
+        doc.getDocumentInformation().setTitle("<script>\u3042");
+        String text = stripper.getText(doc);
+       
+        Matcher m = Pattern.compile("<title>(.*?)</title>").matcher(text);
+        assertTrue(m.find());
+        assertEquals("&lt;script&gt;&#12354;", m.group(1));
+        
+        assertTrue(text.indexOf("&lt;foo&gt;") >= 0);
+        
+    }
+}


Reply via email to