filter/Library_xmlfd.mk                        |    2 
 filter/source/xmlfilterdetect/filterdetect.cxx |   86 +++++++++----------------
 2 files changed, 34 insertions(+), 54 deletions(-)

New commits:
commit bd1461e69330a5265dc8cb395cf1b554d31c4bea
Author: Maxim Monastirsky <momonas...@gmail.com>
Date:   Mon Dec 23 16:40:03 2013 +0200

    Detect UTF-16 encoded XML files
    
    This code doesn't support leading blank characters, since they're
    invalid, and the filter doesn't handle them anyway.
    
    Change-Id: I494e9f85351539d27577dc7df8be420c0c66570e
    Reviewed-on: https://gerrit.libreoffice.org/7204
    Reviewed-by: David Tardon <dtar...@redhat.com>
    Tested-by: David Tardon <dtar...@redhat.com>

diff --git a/filter/Library_xmlfd.mk b/filter/Library_xmlfd.mk
index 39f10c2..b55f06a 100644
--- a/filter/Library_xmlfd.mk
+++ b/filter/Library_xmlfd.mk
@@ -28,6 +28,8 @@ $(eval $(call gb_Library_use_libraries,xmlfd,\
        cppuhelper \
        cppu \
        sal \
+       utl \
+       tl \
        $(gb_UWINAPI) \
 ))
 
diff --git a/filter/source/xmlfilterdetect/filterdetect.cxx b/filter/source/xmlfilterdetect/filterdetect.cxx
index 0b36b3d..c409d82 100644
--- a/filter/source/xmlfilterdetect/filterdetect.cxx
+++ b/filter/source/xmlfilterdetect/filterdetect.cxx
@@ -42,6 +42,8 @@
 #include <com/sun/star/container/XNameAccess.hpp>
 #include <com/sun/star/beans/PropertyState.hpp>
 #include <ucbhelper/content.hxx>
+#include <unotools/ucbstreamhelper.hxx>
+#include <boost/scoped_ptr.hpp>
 
 using com::sun::star::uno::Sequence;
 using com::sun::star::uno::Reference;
@@ -72,57 +74,12 @@ using namespace com::sun::star::beans;
 
 namespace {
 
-bool isXMLStream(const OString& aHeaderStrm)
-{
-    const char* p = aHeaderStrm.getStr();
-    size_t n = aHeaderStrm.getLength();
-    size_t i = 0;
-
-    // Skip UTF-8 BOM
-    const unsigned char sBOM[] = {0xEF, 0xBB, 0xBF};
-    for (i = 0; i < n; ++i, ++p)
-    {
-        if (i < 3 && (unsigned char)(*p) == sBOM[i])
-            continue;
-        else if (i == 3 || i == 0)
-            break;
-        else if (i > 0)
-            return false;
-    }
-
-    n -= i;
-
-    // Skip all preceding blank characters.
-    for (i = 0; i < n; ++i, ++p)
-    {
-        char c = *p;
-        if (c == ' ' || c == '\n' || c == '\t')
-            continue;
-        break;
-    }
-
-    n -= i;
-
-    // First text must be '<?xml', else it's not a valid XML file stream.
-    const char* sInitChars = "<?xml";
-    const size_t nInitCharLen = std::strlen(sInitChars);
-    for (i = 0; i < n; ++i, ++p)
-    {
-        if (i < nInitCharLen)
-        {
-            if (*p != sInitChars[i])
-                return false;
-        }
-    }
-    return true;
-}
-
-OUString supportedByType( const OUString clipBoardFormat ,  const OString 
resultString, const OUString checkType)
+OUString supportedByType( const OUString clipBoardFormat ,  const OUString 
resultString, const OUString checkType)
 {
     OUString sTypeName;
     if ( clipBoardFormat.match("doctype:") )
     {
-        OString tryStr = 
OUStringToOString(clipBoardFormat.copy(8),RTL_TEXTENCODING_ASCII_US).getStr();
+        OUString tryStr = clipBoardFormat.copy(8);
         if (resultString.indexOf(tryStr) >= 0)
         {
             sTypeName = checkType;
@@ -142,7 +99,7 @@ OUString SAL_CALL FilterDetect::detect( com::sun::star::uno::Sequence< com::sun:
     com::sun::star::uno::Reference< com::sun::star::io::XInputStream > 
xInStream;
     const PropertyValue * pValue = aArguments.getConstArray();
     sal_Int32 nLength;
-    OString resultString;
+    OUString resultString;
 
     nLength = aArguments.getLength();
     sal_Int32 location=nLength;
@@ -174,13 +131,34 @@ OUString SAL_CALL FilterDetect::detect( com::sun::star::uno::Sequence< com::sun:
                 return sTypeName;
             }
         }
-        com::sun::star::uno::Sequence< sal_Int8 > aData;
-        /* long nBytesToRead= */ xInStream->available();
-        xInStream->skipBytes (0);
-        long bytestRead =xInStream->readBytes (aData,  4000);
-        resultString=OString((const sal_Char 
*)aData.getConstArray(),bytestRead) ;
 
-        if (!isXMLStream(resultString))
+        ::boost::scoped_ptr< SvStream > pInStream( 
::utl::UcbStreamHelper::CreateStream( xInStream ) );
+        pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
+        sal_Size nUniPos = pInStream->Tell();
+
+        const sal_uInt16 nSize = 4000;
+        bool  bTryUtf16 = false;
+
+        if ( nUniPos == 0 ) // No BOM detected, try to guess UTF-16 endianness
+        {
+            sal_uInt16 sHeader = 0;
+            *pInStream >> sHeader;
+            if ( sHeader == 0x003C )
+                bTryUtf16 = true;
+            else if ( sHeader == 0x3C00 )
+            {
+                bTryUtf16 = true;
+                pInStream->SetEndianSwap( !pInStream->IsEndianSwap() );
+            }
+            pInStream->Seek( STREAM_SEEK_TO_BEGIN );
+        }
+
+        if ( nUniPos == 3 || ( nUniPos == 0 && !bTryUtf16 ) ) // UTF-8 or 
non-Unicode
+            resultString = OStringToOUString( read_uInt8s_ToOString( 
*pInStream, nSize ), RTL_TEXTENCODING_UTF8 );
+        else if ( nUniPos == 2 || bTryUtf16 ) // UTF-16
+            resultString = read_uInt16s_ToOUString( *pInStream, nSize );
+
+        if ( !resultString.startsWith( "<?xml" ) )
             // This is not an XML stream.  It makes no sense to try to detect
             // a non-XML file type here.
             return OUString();
_______________________________________________
Libreoffice-commits mailing list
libreoffice-comm...@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits

Reply via email to