 filter/Library_xmlfd.mk                        |    2
 filter/source/xmlfilterdetect/filterdetect.cxx |   86 +++++++++----------------
 2 files changed, 34 insertions(+), 54 deletions(-)
New commits: commit bd1461e69330a5265dc8cb395cf1b554d31c4bea Author: Maxim Monastirsky <momonas...@gmail.com> Date: Mon Dec 23 16:40:03 2013 +0200 Detect UTF-16 encoded XML files This code doesn't support leading blank characters, since it's invalid, and the filter doesn't handle it anyway. Change-Id: I494e9f85351539d27577dc7df8be420c0c66570e Reviewed-on: https://gerrit.libreoffice.org/7204 Reviewed-by: David Tardon <dtar...@redhat.com> Tested-by: David Tardon <dtar...@redhat.com> diff --git a/filter/Library_xmlfd.mk b/filter/Library_xmlfd.mk index 39f10c2..b55f06a 100644 --- a/filter/Library_xmlfd.mk +++ b/filter/Library_xmlfd.mk @@ -28,6 +28,8 @@ $(eval $(call gb_Library_use_libraries,xmlfd,\ cppuhelper \ cppu \ sal \ + utl \ + tl \ $(gb_UWINAPI) \ )) diff --git a/filter/source/xmlfilterdetect/filterdetect.cxx b/filter/source/xmlfilterdetect/filterdetect.cxx index 0b36b3d..c409d82 100644 --- a/filter/source/xmlfilterdetect/filterdetect.cxx +++ b/filter/source/xmlfilterdetect/filterdetect.cxx @@ -42,6 +42,8 @@ #include <com/sun/star/container/XNameAccess.hpp> #include <com/sun/star/beans/PropertyState.hpp> #include <ucbhelper/content.hxx> +#include <unotools/ucbstreamhelper.hxx> +#include <boost/scoped_ptr.hpp> using com::sun::star::uno::Sequence; using com::sun::star::uno::Reference; @@ -72,57 +74,12 @@ using namespace com::sun::star::beans; namespace { -bool isXMLStream(const OString& aHeaderStrm) -{ - const char* p = aHeaderStrm.getStr(); - size_t n = aHeaderStrm.getLength(); - size_t i = 0; - - // Skip UTF-8 BOM - const unsigned char sBOM[] = {0xEF, 0xBB, 0xBF}; - for (i = 0; i < n; ++i, ++p) - { - if (i < 3 && (unsigned char)(*p) == sBOM[i]) - continue; - else if (i == 3 || i == 0) - break; - else if (i > 0) - return false; - } - - n -= i; - - // Skip all preceding blank characters. 
- for (i = 0; i < n; ++i, ++p) - { - char c = *p; - if (c == ' ' || c == '\n' || c == '\t') - continue; - break; - } - - n -= i; - - // First text must be '<?xml', else it's not a valid XML file stream. - const char* sInitChars = "<?xml"; - const size_t nInitCharLen = std::strlen(sInitChars); - for (i = 0; i < n; ++i, ++p) - { - if (i < nInitCharLen) - { - if (*p != sInitChars[i]) - return false; - } - } - return true; -} - -OUString supportedByType( const OUString clipBoardFormat , const OString resultString, const OUString checkType) +OUString supportedByType( const OUString clipBoardFormat , const OUString resultString, const OUString checkType) { OUString sTypeName; if ( clipBoardFormat.match("doctype:") ) { - OString tryStr = OUStringToOString(clipBoardFormat.copy(8),RTL_TEXTENCODING_ASCII_US).getStr(); + OUString tryStr = clipBoardFormat.copy(8); if (resultString.indexOf(tryStr) >= 0) { sTypeName = checkType; @@ -142,7 +99,7 @@ OUString SAL_CALL FilterDetect::detect( com::sun::star::uno::Sequence< com::sun: com::sun::star::uno::Reference< com::sun::star::io::XInputStream > xInStream; const PropertyValue * pValue = aArguments.getConstArray(); sal_Int32 nLength; - OString resultString; + OUString resultString; nLength = aArguments.getLength(); sal_Int32 location=nLength; @@ -174,13 +131,34 @@ OUString SAL_CALL FilterDetect::detect( com::sun::star::uno::Sequence< com::sun: return sTypeName; } } - com::sun::star::uno::Sequence< sal_Int8 > aData; - /* long nBytesToRead= */ xInStream->available(); - xInStream->skipBytes (0); - long bytestRead =xInStream->readBytes (aData, 4000); - resultString=OString((const sal_Char *)aData.getConstArray(),bytestRead) ; - if (!isXMLStream(resultString)) + ::boost::scoped_ptr< SvStream > pInStream( ::utl::UcbStreamHelper::CreateStream( xInStream ) ); + pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW ); + sal_Size nUniPos = pInStream->Tell(); + + const sal_uInt16 nSize = 4000; + bool bTryUtf16 = false; + + if ( nUniPos 
== 0 ) // No BOM detected, try to guess UTF-16 endianness + { + sal_uInt16 sHeader = 0; + *pInStream >> sHeader; + if ( sHeader == 0x003C ) + bTryUtf16 = true; + else if ( sHeader == 0x3C00 ) + { + bTryUtf16 = true; + pInStream->SetEndianSwap( !pInStream->IsEndianSwap() ); + } + pInStream->Seek( STREAM_SEEK_TO_BEGIN ); + } + + if ( nUniPos == 3 || ( nUniPos == 0 && !bTryUtf16 ) ) // UTF-8 or non-Unicode + resultString = OStringToOUString( read_uInt8s_ToOString( *pInStream, nSize ), RTL_TEXTENCODING_UTF8 ); + else if ( nUniPos == 2 || bTryUtf16 ) // UTF-16 + resultString = read_uInt16s_ToOUString( *pInStream, nSize ); + + if ( !resultString.startsWith( "<?xml" ) ) // This is not an XML stream. It makes no sense to try to detect // a non-XML file type here. return OUString(); _______________________________________________ Libreoffice-commits mailing list libreoffice-comm...@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits