officeconverter

Vincent Massol Tue, 28 Oct 2008 07:24:11 -0700

Hi Asiri,

I think I'd really prefer one filter per class. Same as what is done  
in the HTML cleaner. Also, please don't use any *Utils class and no  
static please (these are both anti-patterns).


Thanks
-Vincent

On Oct 28, 2008, at 2:54 PM, asiri (SVN) wrote:

> Author: asiri
> Date: 2008-10-28 14:54:04 +0100 (Tue, 28 Oct 2008)
> New Revision: 13868
>
> Removed:
>   sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/ 
> plugin/officeimporter/filter/
> Modified:
>   sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/ 
> plugin/officeimporter/OfficeImporterPlugin.java
>   sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/ 
> plugin/officeimporter/utils/HtmlFilterUtils.java
>   sandbox/xwiki-plugin-officeimporter/src/test/java/com/xpn/xwiki/ 
> plugin/officeconverter/CleanHTMLTest.java
> Log:
> Moved all html filter code into a single utility class called  
> HtmlFilterUtils. I thought of introducing some sort of a filter  
> chain (maybe a chain-of-responsibility pattern) but it seemed like  
> overkill for this scenario.
>
> Modified: sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/ 
> xwiki/plugin/officeimporter/OfficeImporterPlugin.java
> ===================================================================
> --- sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/ 
> plugin/officeimporter/OfficeImporterPlugin.java       2008-10-28 11:33:41  
> UTC (rev 13867)
> +++ sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/ 
> plugin/officeimporter/OfficeImporterPlugin.java       2008-10-28 13:54:04  
> UTC (rev 13868)
> @@ -57,14 +57,9 @@
> import com.xpn.xwiki.doc.XWikiDocument;
> import com.xpn.xwiki.plugin.XWikiDefaultPlugin;
> import com.xpn.xwiki.plugin.XWikiPluginInterface;
> -import com.xpn.xwiki.plugin.officeimporter.filter.EmptyLinkFilter;
> -import com.xpn.xwiki.plugin.officeimporter.filter.ImageTagFilter;
> -import com.xpn.xwiki.plugin.officeimporter.filter.PinLiFilter;
> -import com.xpn.xwiki.plugin.officeimporter.filter.TagRemoveFilter;
> -import  
> com.xpn.xwiki.plugin.officeimporter.filter.UnderlineLinkFilter;
> -import  
> com.xpn.xwiki.plugin.officeimporter.filter.XWikiSyntaxEscapeFilter;
> -import com.xpn.xwiki.plugin.officeimporter.utils.ImporterException;
> import com.xpn.xwiki.plugin.officeimporter.utils.DocumentType;
> +import com.xpn.xwiki.plugin.officeimporter.utils.HtmlFilterUtils;
> +import com.xpn.xwiki.plugin.officeimporter.utils.ImporterException;
> import com.xpn.xwiki.web.Utils;
>
> /**
> @@ -471,9 +466,7 @@
>                 HTMLCleaner.ROLE), e);
>         }
>         Document document = htmlCleaner.clean(new  
> StringReader(inputHTML));
> -
> -        new UnderlineLinkFilter().filter(document);
> -
> +        HtmlFilterUtils.filterUnderlinedLinks(document);
>         XMLUtils.stripHTMLEnvelope(document);
>         String cleanedHTML = XMLUtils.toString(document);
>         return cleanedHTML;
> @@ -499,14 +492,12 @@
>                 HTMLCleaner.ROLE), e);
>         }
>         Document document = htmlCleaner.clean(new  
> StringReader(inputHTML));
> -
> -        new TagRemoveFilter().filter(document);
> -        new UnderlineLinkFilter().filter(document);
> -        new XWikiSyntaxEscapeFilter().filter(document);
> -        new ImageTagFilter().filter(document);
> -        new PinLiFilter().filter(document);
> -        new EmptyLinkFilter().filter(document);
> -
> +        HtmlFilterUtils.filterTags(document, new String[]{"style",  
> "script"});
> +        HtmlFilterUtils.filterUnderlinedLinks(document);
> +        HtmlFilterUtils.filterSytaxChars(document);
> +        HtmlFilterUtils.filterImageLinks(document);
> +        HtmlFilterUtils.filterParagraphTagsInLineItemTags(document);
> +        HtmlFilterUtils.filterEmptyLinks(document);
>         XMLUtils.stripHTMLEnvelope(document);
>         String cleanedHTML = XMLUtils.toString(document);
>         return cleanedHTML;
>
> Modified: sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/ 
> xwiki/plugin/officeimporter/utils/HtmlFilterUtils.java
> ===================================================================
> --- sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/ 
> plugin/officeimporter/utils/HtmlFilterUtils.java      2008-10-28 11:33:41  
> UTC (rev 13867)
> +++ sandbox/xwiki-plugin-officeimporter/src/main/java/com/xpn/xwiki/ 
> plugin/officeimporter/utils/HtmlFilterUtils.java      2008-10-28 13:54:04  
> UTC (rev 13868)
> @@ -1,12 +1,247 @@
> package com.xpn.xwiki.plugin.officeimporter.utils;
>
> +import java.util.ArrayList;
> +import java.util.List;
> +
> +import org.w3c.dom.Document;
> +import org.w3c.dom.Element;
> +import org.w3c.dom.NamedNodeMap;
> +import org.w3c.dom.Node;
> +import org.w3c.dom.NodeList;
> +import org.w3c.dom.Text;
> +
> /**
>  * A utility class containing a suite of filter methods used to  
> manipulate Html documents.
>  *
>  * @version $Id$
>  * @since 1.7M1
>  */
> -public class HtmlFilterUtils
> +public abstract class HtmlFilterUtils
> {
> +    /**
> +     * Characters that need to be escaped when jumping from html to  
> xwiki syntax.
> +     */
> +    private static final List<String> escapeChars = new  
> ArrayList<String>();
>
> +    /**
> +     * Static initializer for escape chars.
> +     */
> +    static {
> +        escapeChars.add("[");
> +        escapeChars.add("]");
> +        escapeChars.add("{");
> +        escapeChars.add("}");
> +        escapeChars.add("*");
> +        escapeChars.add("~");
> +        escapeChars.add("_");
> +        escapeChars.add("-");
> +        escapeChars.add("1");
> +        escapeChars.add("#");
> +        escapeChars.add("$");
> +    }
> +
> +    /**
> +     * Removes empty links from html documents. If the label of the  
> link is empty, simply remove the
> +     * tag as in {@code <a/>} or {@code <a href=""/>}.  
> If the label is not null but the href
> +     * attribute is missing, replace the tag with its label. Like  
> changing {@code <a>something</a>}
> +     * to {@code something}.
> +     *
> +     * @param document The html document.
> +     */
> +    public static void filterEmptyLinks(Document document)
> +    {
> +        Element root = document.getDocumentElement();
> +        NodeList links = root.getElementsByTagName("a");
> +        for (int i = 0; i < links.getLength(); i++) {
> +            Node link = links.item(i);
> +            if (link.getTextContent() == null ||  
> link.getTextContent().trim().equals("")) {
> +                link.getParentNode().removeChild(link);
> +                i--;
> +                continue;
> +            }
> +
> +            Node hrefAttr =  
> link.getAttributes().getNamedItem("href");
> +            if (hrefAttr == null ||  
> hrefAttr.getTextContent().trim().equals("")) {
> +                NodeList children = link.getChildNodes();
> +                while (children.getLength() > 0) {
> +                     
> link.getParentNode().insertBefore(children.item(0), link);
> +                }
> +                link.getParentNode().removeChild(link);
> +                i--;
> +            }
> +        }
> +    }
> +
> +    /**
> +     * Replaces the {@code <img>} tags with corresponding {image}  
> macro elements which are
> +     * recognized by xwiki syntax 1.0. Handles image attributes  
> like src, width, height, alt, align.
> +     *
> +     * @param document The html document.
> +     */
> +    public static void filterImageLinks(Document document)
> +    {
> +        Element root = document.getDocumentElement();
> +        NodeList imgs = root.getElementsByTagName("img");
> +        while (imgs.getLength() > 0) {
> +            Node image = imgs.item(0);
> +            String imageCode = generateImageMacroString(image);
> +            Node parent = image.getParentNode();
> +            Text newImg = document.createTextNode(imageCode);
> +            parent.replaceChild(newImg, image);
> +        }
> +    }
> +
> +    /**
> +     * Converts a {@code <img>} element into a xwiki syntax 1.0  
> {image} macro element.
> +     *
> +     * @param imageLink Node representing the image link.
> +     * @return Converted {image} macro string.
> +     */
> +    private static String generateImageMacroString(Node imageLink)
> +    {
> +        NamedNodeMap attrs = imageLink.getAttributes();
> +        if (attrs == null) {
> +            return null;
> +        }
> +        StringBuffer sb = new StringBuffer();
> +        sb.append("{image:");
> +        if (attrs.getNamedItem("src") != null) {
> +            String src = attrs.getNamedItem("src").getTextContent();
> +            sb.append(src);
> +        }
> +        if (attrs.getNamedItem("width") != null) {
> +            String width =  
> attrs.getNamedItem("width").getTextContent();
> +            sb.append("|width=" + width);
> +        }
> +        if (attrs.getNamedItem("height") != null) {
> +            String height =  
> attrs.getNamedItem("height").getTextContent();
> +            sb.append("|height=" + height);
> +        }
> +        if (attrs.getNamedItem("alt") != null) {
> +            String alt = attrs.getNamedItem("alt").getTextContent();
> +            sb.append("|alt=" + alt);
> +        }
> +        if (attrs.getNamedItem("align") != null) {
> +            String align =  
> attrs.getNamedItem("align").getTextContent();
> +            sb.append("|align=" + align);
> +        }
> +        sb.append("}");
> +        return sb.toString();
> +    }
> +
> +    /**
> +     * Removes the starting {@code <p>} tags found within {@code <li>}  
> tags. This is useful since
> +     * such formations are not properly handled in xwiki 1.0 syntax.
> +     *
> +     * @param document The html document.
> +     */
> +    public static void filterParagraphTagsInLineItemTags(Document  
> document)
> +    {
> +        Element root = document.getDocumentElement();
> +        NodeList lists = root.getElementsByTagName("li");
> +        for (int i = 0; i < lists.getLength(); i++) {
> +            Node list = lists.item(i);
> +            Node firstChild = list.getFirstChild();
> +            if (firstChild.getNodeName() != null &&  
> firstChild.getNodeName().equals("p")) {
> +                NodeList childchildren = firstChild.getChildNodes();
> +                while (childchildren.getLength() > 0) {
> +                    list.insertBefore(childchildren.item(0),  
> firstChild);
> +                }
> +                list.removeChild(firstChild);
> +            }
> +        }
> +    }
> +
> +    /**
> +     * Removes all listed tags from the given html document.
> +     *
> +     * @param document The html document.
> +     * @param tags Tags to be removed.
> +     */
> +    public static void filterTags(Document document, String[] tags)
> +    {
> +        Element root = document.getDocumentElement();
> +        for (String tag : tags) {
> +            NodeList toBeRemovedTags =  
> root.getElementsByTagName(tag);
> +            while (toBeRemovedTags.getLength() > 0) {
> +                Node t = toBeRemovedTags.item(0);
> +                t.getParentNode().removeChild(t);
> +            }
> +        }
> +    }
> +
> +    /**
> +     * Strips off underline tags surrounding links like {@code  
> <u><a href="something">link</a></u>}.
> +     *
> +     * @param document The html document.
> +     */
> +    public static void filterUnderlinedLinks(Document document)
> +    {
> +        Element root = document.getDocumentElement();
> +        NodeList links = root.getElementsByTagName("a");
> +        for (int i = 0; i < links.getLength(); i++) {
> +            Node link = links.item(i);
> +            Node parent = link.getParentNode();
> +            String parentName = parent.getNodeName();
> +            if (parentName != null && (parentName.equals("u") ||  
> parentName.equals("del"))) {
> +                parent.getParentNode().insertBefore(link, parent);
> +                parent.getParentNode().removeChild(parent);
> +            }
> +        }
> +    }
> +
> +    /**
> +     * Escapes the xwiki syntax characters from the given html  
> document. Example : {@code [} will be
> +     * replaced by {@code \[}.
> +     *
> +     * @param document The html document.
> +     */
> +    public static void filterSytaxChars(Document document)
> +    {
> +        Element root = document.getDocumentElement();
> +        escapeNode(root);
> +    }
> +
> +    /**
> +     * Escapes xwiki syntax characters within the given node's  
> content.
> +     *
> +     * @param node The node which is to be examined.
> +     */
> +    private static void escapeNode(Node node)
> +    {
> +        NodeList nodes = node.getChildNodes();
> +        for (int i = 0; i < nodes.getLength(); i++) {
> +            Node next = nodes.item(i);
> +            if (next instanceof Text) {
> +                String text = next.getTextContent();
> +                text = escapeText(text);
> +                next.setTextContent(text);
> +            } else {
> +                if (next.hasChildNodes()) {
> +                    escapeNode(next);
> +                }
> +            }
> +        }
> +    }
> +
> +    /**
> +     * Escapes xwiki syntax characters within the given string.
> +     *
> +     * @param text The string to be examined.
> +     * @return The syntax escaped string.
> +     */
> +    private static String escapeText(String text)
> +    {
> +        StringBuffer sb = new StringBuffer();
> +        for (int i = 0; i < text.length(); i++) {
> +            char x = text.charAt(i);
> +            if (escapeChars.contains(String.valueOf(x))) {
> +                sb.append("\\");
> +                sb.append(String.valueOf(x));
> +            } else {
> +                sb.append(x);
> +            }
> +        }
> +        return sb.toString();
> +    }
> }
>
> Modified: sandbox/xwiki-plugin-officeimporter/src/test/java/com/xpn/ 
> xwiki/plugin/officeconverter/CleanHTMLTest.java
> ===================================================================
> --- sandbox/xwiki-plugin-officeimporter/src/test/java/com/xpn/xwiki/ 
> plugin/officeconverter/CleanHTMLTest.java     2008-10-28 11:33:41 UTC  
> (rev 13867)
> +++ sandbox/xwiki-plugin-officeimporter/src/test/java/com/xpn/xwiki/ 
> plugin/officeconverter/CleanHTMLTest.java     2008-10-28 13:54:04 UTC  
> (rev 13868)
> @@ -27,13 +27,7 @@
> import org.xwiki.xml.XMLUtils;
> import org.xwiki.xml.html.HTMLCleaner;
>
> -import com.xpn.xwiki.plugin.officeimporter.filter.EmptyLinkFilter;
> -import com.xpn.xwiki.plugin.officeimporter.filter.HTMLFilter;
> -import com.xpn.xwiki.plugin.officeimporter.filter.ImageTagFilter;
> -import com.xpn.xwiki.plugin.officeimporter.filter.PinLiFilter;
> -import com.xpn.xwiki.plugin.officeimporter.filter.TagRemoveFilter;
> -import  
> com.xpn.xwiki.plugin.officeimporter.filter.UnderlineLinkFilter;
> -import  
> com.xpn.xwiki.plugin.officeimporter.filter.XWikiSyntaxEscapeFilter;
> +import com.xpn.xwiki.plugin.officeimporter.utils.HtmlFilterUtils;
> import com.xpn.xwiki.plugin.officeimporter.utils.ImporterException;
> import com.xpn.xwiki.test.AbstractXWikiComponentTestCase;
>
> @@ -121,14 +115,12 @@
>     private void test(String input, String expected) throws  
> ImporterException
>     {
>         Document document = cleaner.clean(new StringReader(input));
> -
> -        new TagRemoveFilter().filter(document);
> -        new UnderlineLinkFilter().filter(document);
> -        new XWikiSyntaxEscapeFilter().filter(document);
> -        new ImageTagFilter().filter(document);
> -        new PinLiFilter().filter(document);
> -        new EmptyLinkFilter().filter(document);
> -
> +        HtmlFilterUtils.filterTags(document, new String[]{"style",  
> "script"});
> +        HtmlFilterUtils.filterUnderlinedLinks(document);
> +        HtmlFilterUtils.filterSytaxChars(document);
> +        HtmlFilterUtils.filterImageLinks(document);
> +        HtmlFilterUtils.filterParagraphTagsInLineItemTags(document);
> +        HtmlFilterUtils.filterEmptyLinks(document);
>         XMLUtils.stripHTMLEnvelope(document);
>         String actual = XMLUtils.toString(document);
>         assertEquals(HEAD + expected + FOOT, actual);
>
> _______________________________________________
> notifications mailing list
> [EMAIL PROTECTED]
> http://lists.xwiki.org/mailman/listinfo/notifications

_______________________________________________
devs mailing list
[email protected]
http://lists.xwiki.org/mailman/listinfo/devs

Re: [xwiki-devs] [xwiki-notifications] r13868 - in sandbox/xwiki-plugin-officeimporter/src: main/java/com/xpn/xwiki/plugin/officeimporter main/java/com/xpn/xwiki/plugin/officeimporter/utils test/java/com/xpn/xwiki/plugin/officeconverter

Reply via email to