Re: Cleaning stored text to get valid XML

Les Mizzell Sat, 05 May 2007 11:09:24 -0700

  > This will clean only about 1% of the crap, maybe not even...


The data in question has been entered using fckeditor, which has taken 
care of a good bit of the "problem" stuff for me. It was the few things 
left over that fck didn't deal with that were giving me fits.

I see some great potential with cleanWord! Thanks for sharing!!! I'll 
certainly add it to my toolbox and may pull some bits and pieces as 
needed for the current project as well!!!


> Here is a function that will clean up more, and I'm still improving it ;-)
> 
> function cleanWord (html)
>     // cleans pasted text from Word
>     {
>     //alert(html)
>     html = html.replace(/<o:p>\s*<\/o:p>/g, "") ;
>     html = html.replace(/<o:p>.*?<\/o:p>/g, "") ;
>    
>     // Remove mso-xxx styles.
>     html = html.replace( /\s*mso-[^:]+:[^;"]+;?/gi, "" ) ;
> 
>     // Remove margin styles.
>     html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*;/gi, "" ) ;
>     html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*"/gi, "\"" ) ;
> 
>     html = html.replace( /\s*TEXT-INDENT: 0cm\s*;/gi, "" ) ;
>     html = html.replace( /\s*TEXT-INDENT: 0cm\s*"/gi, "\"" ) ;
> 
>     html = html.replace( /\s*TEXT-ALIGN: [^\s;]+;?"/gi, "\"" ) ;
> 
>     html = html.replace( /\s*PAGE-BREAK-BEFORE: [^\s;]+;?"/gi, "\"" ) ;
> 
>     html = html.replace( /\s*FONT-VARIANT: [^\s;]+;?"/gi, "\"" ) ;
> 
>     html = html.replace( /\s*tab-stops:[^;"]*;?/gi, "" ) ;
>     html = html.replace( /\s*tab-stops:[^"]*/gi, "" ) ;
> 
>     html = html.replace( /\s*FONT-FAMILY:[^;"]*;?/gi, "" ) ;
>    
>     // Remove Class attributes
>     html = html.replace(/<(\w[^>]*)\s*class=([^ |>]*)([^>]*)/gi, "<$1$3") ;
> 
>     // Remove styles.
>     html = html.replace( /<(\w[^>]*)style="([^\"]*)"([^>]*)/gi, "<$1$3" ) ;
> 
>     // Remove empty styles.
>     html =  html.replace( /\s*style="\s*"/gi, '' ) ;
>    
>     html = html.replace( /<SPAN[^>]*>\s*&nbsp;\s*<\/SPAN>/gi, '&nbsp;' ) ;
>    
>     html = html.replace( /<SPAN[^>]*>\s*<\/SPAN>/gi, '' ) ;
>    
>     // Remove Lang attributes
>     html = html.replace(/<(\w[^>]*) lang=([^ |>]*)([^>]*)/gi, "<$1$3") ;
>    
>     html = html.replace( /<SPAN\s*>([\s\S]*?)<\/SPAN>/gi, '$1' ) ;
>     html = html.replace( /<SPAN\s*>([\s\S]*?)<\/SPAN>/gi, '$1' ) ;
>     html = html.replace( /<SPAN\s*>([\s\S]*?)<\/SPAN>/gi, '$1' ) ;
>    
>     // remove all font tags
>     html = html.replace( /<\/?FONT[^>]*>/gi, '' ) ;
>     html = html.replace( /<\/?FONT[^>]*>/gi, '' ) ;
>     html = html.replace( /<\/?FONT[^>]*>/gi, '' ) ;
>     html = html.replace( /<\/?DIV([^>]*)>/gi, '' ) ;
> 
>     // Remove XML elements and declarations
>     html = html.replace(/<\\?\?xml[^>]*>/gi, "") ;
>    
>     // Remove Tags with XML namespace declarations: <o:p></o:p>
>     html = html.replace(/<\/?\w+:[^>]*>/gi, "") ;
>    
>     html = html.replace( /<H\d>\s*<\/H\d>/gi, '' ) ;
> 
>     //clean up H tags   
>     html = html.replace( /<H1([^>]*)>/gi, '<H1>' ) ;
>     html = html.replace( /<H2([^>]*)>/gi, '<H2>' ) ;
>     html = html.replace( /<H3([^>]*)>/gi, '<H3>' ) ;
>     html = html.replace( /<H4([^>]*)>/gi, '<H4>' ) ;
>     html = html.replace( /<H5([^>]*)>/gi, '<H5>' ) ;
>     html = html.replace( /<H6([^>]*)>/gi, '<H6>' ) ;
>     html = html.replace( /<P([^>]*)>/gi,  '<P>' ) ;
>     html = html.replace( /<BR([^>]*)>/gi, '<BR>' ) ;
>     html = html.replace( /<P>\s*(<P>)+<\/P>/gi,   '<P>' ) ;
>     html = html.replace( /<\/P>\s*(<\/P>)+<\/P>/gi, '</P>' ) ;
>    
>     html = html.replace( /<(U|I|STRIKE)>&nbsp;<\/\1>/g, '&nbsp;' ) ;
> 
>     // no comment...
>     html = html.replace( /<!--[\s\S]*?-->/gi, '' ) ;
>    
>     // transform bullet lists
>     var re = new RegExp("<P>Â·<SPAN>(&nbsp;| )*</SPAN>([\\s\\S]*?)</P>", 
> "gi");
>     html = html.replace( re, "<LI>$2</LI>" ) ;
>     re = new RegExp("<P>Â·(&nbsp;| )*([\\s\\S]*?)</P>", "gi");
>     html = html.replace( /(<BR>|<P>)[Â§Â·-](&nbsp;| )*([\s\S]*?)<\/P>/gi, 
> "<LI>$2</LI>" ) ;
>     // remove spaces at beginning
>     html = html.replace( /^(&nbsp;| )*\s*/, '') ;
>     // replace all stupid <P align=center>...</P> because they are 
> overridden by higher
>     // style declarations like justify, etc.
>     html = html.replace( /<P\s*align=center>([\s\S]*?)<\/P>/gi, 
> '<BR><CENTER>$1</CENTER>' ) ;
>     // remove useless </CENTER><CENTER>
>     html = html.replace( /<\/CENTER>(\s*<BR>\s*)<CENTER>/gi, '$1' ) ;
>     // remove useless <BR> in <TD>
>     html = html.replace( /(<TD[^>]*>)\s*<BR>\s*/gi, '$1' ) ;
>     // replace <CENTER>...</CENTER> inside of TDs
>     html = html.replace( 
> /(<TD[^>]*)>\s*<CENTER>([\s\S]*?)<\/CENTER>\s*<\/TD>/gi,
>         '$1 align=center>$2</TD>' ) ;
>     // remove Paragraphs inside TD
>     html = 
> html.replace(/(<TD[^>]*>)\s*<P[^>]*>([\s\S]*?)\s*<\/P>\s*([\s\S]*?<\/TD>)/gi, 
> 
>         '$1$2$3');
>   
>     // Remove empty tags (three times, just to make sure).
>     html = html.replace( /<([^\s>]+)[^>]*>\s*<\/\1>/g, '' ) ;
>     html = html.replace( /<([^\s>]+)[^>]*>\s*<\/\1>/g, '' ) ;
>     html = html.replace( /<([^\s>]+)[^>]*>\s*<\/\1>/g, '' ) ;
>     html = html.replace( /[^\n\r]<P>/gi, '<P>' ) ;
>     html = html.replace( /[^\n\r]<BR>/gi, '<BR>' ) ;
> 
>     //alert(html)
>   return (html);
>     }
> 


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
ColdFusion MX7 and Flex 2 
Build sales & marketing dashboard RIA's for your business. Upgrade now
http://www.adobe.com/products/coldfusion/flex2?sdid=RVJT

Archive: 
http://www.houseoffusion.com/groups/CF-Talk/message.cfm/messageid:277084
Subscription: http://www.houseoffusion.com/groups/CF-Talk/subscribe.cfm
Unsubscribe: http://www.houseoffusion.com/cf_lists/unsubscribe.cfm?user=89.70.4

Re: Cleaning stored text to get valid XML

Reply via email to