Re: Cleaning stored text to get valid XML

Claude Schneegans Sat, 05 May 2007 09:15:39 -0700

 >>Finally found a function that seems to clean most of the MS Word control
characters and other crap out that was causing me problems. Using two
filters on the body text seems to be taking care of my problems now..


This will clean only about 1% of the crap, maybe not even that...

Here is a function that will clean up more, and I'm still improving it ;-)

// Cleans HTML that was pasted from Microsoft Word.
//
// html    - markup string to clean; null or undefined yields "".
// returns - the cleaned markup string.
//
// The function is a fixed sequence of regular-expression passes. The order
// matters: later passes rely on attributes and wrapper tags having been
// stripped by earlier ones, so do not reorder them casually.
function cleanWord (html)
    {
    // Repeats a replacement until the text stops changing. It is only used
    // with replacements that shrink the text, so the loop always terminates.
    function replaceUntilStable (text, pattern, replacement)
        {
        var previous ;
        do
            {
            previous = text ;
            text = text.replace( pattern, replacement ) ;
            }
        while (text !== previous) ;
        return text ;
        }

    // Nothing to clean; avoid a TypeError on html.replace below.
    if (html == null)
        return "" ;

    // Remove Word's <o:p> paragraph markers (empty ones first, then any
    // single-line content they wrap).
    html = html.replace( /<o:p>\s*<\/o:p>/g, "" ) ;
    html = html.replace( /<o:p>.*?<\/o:p>/g, "" ) ;

    // Remove mso-xxx styles.
    html = html.replace( /\s*mso-[^:]+:[^;"]+;?/gi, "" ) ;

    // Remove margin styles.
    html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*;/gi, "" ) ;
    html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*"/gi, "\"" ) ;

    // Remove zero text indents.
    html = html.replace( /\s*TEXT-INDENT: 0cm\s*;/gi, "" ) ;
    html = html.replace( /\s*TEXT-INDENT: 0cm\s*"/gi, "\"" ) ;

    // Remove these declarations when they are the last one in the attribute
    // (the pattern requires the closing quote right after them).
    html = html.replace( /\s*TEXT-ALIGN: [^\s;]+;?"/gi, "\"" ) ;
    html = html.replace( /\s*PAGE-BREAK-BEFORE: [^\s;]+;?"/gi, "\"" ) ;
    html = html.replace( /\s*FONT-VARIANT: [^\s;]+;?"/gi, "\"" ) ;

    // Remove tab stops and font families.
    html = html.replace( /\s*tab-stops:[^;"]*;?/gi, "" ) ;
    html = html.replace( /\s*tab-stops:[^"]*/gi, "" ) ;
    html = html.replace( /\s*FONT-FAMILY:[^;"]*;?/gi, "" ) ;

    // Remove Class attributes.
    html = html.replace( /<(\w[^>]*)\s*class=([^ |>]*)([^>]*)/gi, "<$1$3" ) ;

    // Remove styles.
    html = html.replace( /<(\w[^>]*)style="([^\"]*)"([^>]*)/gi, "<$1$3" ) ;

    // Remove empty styles.
    html = html.replace( /\s*style="\s*"/gi, '' ) ;

    // Collapse spans that hold only a non-breaking space, drop empty spans.
    html = html.replace( /<SPAN[^>]*>\s*&nbsp;\s*<\/SPAN>/gi, '&nbsp;' ) ;
    html = html.replace( /<SPAN[^>]*>\s*<\/SPAN>/gi, '' ) ;

    // Remove Lang attributes.
    html = html.replace( /<(\w[^>]*) lang=([^ |>]*)([^>]*)/gi, "<$1$3" ) ;

    // Unwrap attribute-less spans. Each pass peels one nesting level, so
    // repeat until none are left instead of a fixed number of passes.
    html = replaceUntilStable( html, /<SPAN\s*>([\s\S]*?)<\/SPAN>/gi, '$1' ) ;

    // Remove all font and div tags (their content is kept).
    html = replaceUntilStable( html, /<\/?FONT[^>]*>/gi, '' ) ;
    html = html.replace( /<\/?DIV([^>]*)>/gi, '' ) ;

    // Remove XML elements and declarations.
    html = html.replace( /<\\?\?xml[^>]*>/gi, "" ) ;

    // Remove tags with XML namespace prefixes: <o:p></o:p>
    html = html.replace( /<\/?\w+:[^>]*>/gi, "" ) ;

    // Remove empty headings.
    html = html.replace( /<H\d>\s*<\/H\d>/gi, '' ) ;

    // Strip attributes from H1-H6, P and BR opening tags and upper-case the
    // tag name. The \b keeps longer tag names such as <PRE> from being
    // mistaken for <P>.
    html = html.replace( /<(H[1-6])\b[^>]*>/gi,
        function (match, tag) { return '<' + tag.toUpperCase() + '>' ; } ) ;
    html = html.replace( /<P\b[^>]*>/gi, '<P>' ) ;
    html = html.replace( /<BR\b[^>]*>/gi, '<BR>' ) ;
    html = html.replace( /<P>\s*(<P>)+<\/P>/gi, '<P>' ) ;
    html = html.replace( /<\/P>\s*(<\/P>)+<\/P>/gi, '</P>' ) ;

    html = html.replace( /<(U|I|STRIKE)>&nbsp;<\/\1>/g, '&nbsp;' ) ;

    // Remove HTML comments.
    html = html.replace( /<!--[\s\S]*?-->/gi, '' ) ;

    // Transform bullet lists. The bullet characters are written as escapes
    // (\u00A7 = section sign, \u00B7 = middle dot) so the patterns do not
    // depend on the encoding this file is saved in.
    html = html.replace( /<P>\u00B7<SPAN>(&nbsp;| )*<\/SPAN>([\s\S]*?)<\/P>/gi,
        "<LI>$2</LI>" ) ;
    // Group 3 is the item text; group 2 is only the padding after the bullet.
    html = html.replace( /(<BR>|<P>)[\u00A7\u00B7-](&nbsp;| )*([\s\S]*?)<\/P>/gi,
        "<LI>$3</LI>" ) ;

    // Remove spaces at the beginning.
    html = html.replace( /^(&nbsp;| )*\s*/, '' ) ;

    // Replace <P align=center>...</P>, because the alignment is overridden
    // by higher style declarations like justify, etc.
    // NOTE(review): paragraph attributes were already stripped above, so
    // this pass looks unreachable for normal input - confirm the ordering.
    html = html.replace( /<P\s*align=center>([\s\S]*?)<\/P>/gi,
        '<BR><CENTER>$1</CENTER>' ) ;
    // Remove useless </CENTER><CENTER>.
    html = html.replace( /<\/CENTER>(\s*<BR>\s*)<CENTER>/gi, '$1' ) ;
    // Remove useless <BR> in <TD>.
    html = html.replace( /(<TD[^>]*>)\s*<BR>\s*/gi, '$1' ) ;
    // Replace <CENTER>...</CENTER> inside of TDs.
    html = html.replace( /(<TD[^>]*)>\s*<CENTER>([\s\S]*?)<\/CENTER>\s*<\/TD>/gi,
        '$1 align=center>$2</TD>' ) ;
    // Remove paragraphs inside TD.
    html = html.replace( /(<TD[^>]*>)\s*<P[^>]*>([\s\S]*?)\s*<\/P>\s*([\s\S]*?<\/TD>)/gi,
        '$1$2$3' ) ;

    // Remove empty tags. Removing one can leave its parent empty, so repeat
    // until the text is stable.
    html = replaceUntilStable( html, /<([^\s>]+)[^>]*>\s*<\/\1>/g, '' ) ;

    // Start every <P> and <BR> on its own line. The character in front of
    // the tag is captured and written back so no content is lost.
    // NOTE(review): inserting the line break is the presumed intent of the
    // "not already preceded by a newline" pattern - confirm.
    html = html.replace( /([^\n\r])<P>/gi, '$1\n<P>' ) ;
    html = html.replace( /([^\n\r])<BR>/gi, '$1\n<BR>' ) ;

    return html ;
    }

-- 
_______________________________________
REUSE CODE! Use custom tags;
See http://www.contentbox.com/claude/customtags/tagstore.cfm
(Please send any spam to this address: [EMAIL PROTECTED])
Thanks.



~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
Create Web Applications With ColdFusion MX7 & Flex 2. 
Build powerful, scalable RIAs. Free Trial
http://www.adobe.com/products/coldfusion/flex2/?sdid=RVJS 

Archive: 
http://www.houseoffusion.com/groups/CF-Talk/message.cfm/messageid:277080
Subscription: http://www.houseoffusion.com/groups/CF-Talk/subscribe.cfm
Unsubscribe: 
http://www.houseoffusion.com/cf_lists/unsubscribe.cfm?user=11502.10531.4

Re: Cleaning stored text to get valid XML

Reply via email to