[ https://issues.apache.org/jira/browse/XERCESC-1936?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12894980#action_12894980 ]
kirby zhou edited comment on XERCESC-1936 at 9/6/10 10:12 AM: -------------------------------------------------------------- The following 2 lines are more suitable for UTF-8 locale users to debug. ]# ( echo '<?xml version="1.0" encoding="GBK" ?>'; echo '<data>'; for ((i=0;i<2;++i)); do echo -en '\xd6\xd0\xce\xc4\xba\xba\xd7\xd6A'; done ; echo; echo '</data>' ) > /small.xml ]# ( echo '<?xml version="1.0" encoding="GBK" ?>'; echo '<data>'; for ((i=0;i<100000;++i)); do echo -en '\xd6\xd0\xce\xc4\xba\xba\xd7\xd6A'; done ; echo; echo '</data>' ) > ~/big.xml diff -x .svn -x CVS -ru --show-c-function xerces-c-3.1.1.bak/src/xercesc/util/Transcoders/IconvGNU/IconvGNUTransService.cpp xerces-c-3.1.1/src/xercesc/util/Transcoders/IconvGNU/IconvGNUTransService.cpp --- xerces-c-3.1.1.bak/src/xercesc/util/Transcoders/IconvGNU/IconvGNUTransService.cpp 2010-01-20 16:45:02.000000000 +0800 +++ xerces-c-3.1.1/src/xercesc/util/Transcoders/IconvGNU/IconvGNUTransService.cpp 2010-08-04 02:07:06.000000000 +0800 @@ -1049,6 +1049,9 @@ XMLSize_t IconvGNUTranscoder::transco for (size_t cnt = 0; cnt < maxChars && srcLen; cnt++) { size_t rc = iconvFrom(startSrc, &srcLen, &orgTarget, uChSize()); if (rc == (size_t)-1) { + if (errno == EINVAL) { + break; + } if (errno != E2BIG || prevSrcLen == srcLen) { ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq, getMemoryManager()); } diff -x .svn -x CVS -ru --show-c-function xerces-c-3.1.1.bak/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp xerces-c-3.1.1/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp --- xerces-c-3.1.1.bak/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp 2010-01-20 16:45:02.000000000 +0800 +++ xerces-c-3.1.1/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp 2010-08-04 02:28:46.000000000 +0800 @@ -666,7 +666,7 @@ ICUTranscoder::transcodeTo( const XMLC ); // Rememember the status before we possibly overite the error code - const bool res = (err == U_ZERO_ERROR); + const bool res = (err == U_ZERO_ERROR || 
(err == U_BUFFER_OVERFLOW_ERROR && startSrc > srcPtr)); // Put the old handler back err = U_ZERO_ERROR; was (Author: kirbyzhou): The following 2 lines are more suitable for UTF-8 locale users to debug. ]# ( echo '<?xml version="1.0" encoding="GBK" ?>'; echo '<data>'; for ((i=0;i<2;++i)); do echo -en '\xd6\xd0\xce\xc4\xba\xba\xd7\xd6A'; done ; echo; echo '</data>' ) > /small.xml ]# ( echo '<?xml version="1.0" encoding="GBK" ?>'; echo '<data>'; for ((i=0;i<100000;++i)); do echo -en '\xd6\xd0\xce\xc4\xba\xba\xd7\xd6A'; done ; echo; echo '</data>' ) > ~/big.xml diff -x .svn -x CVS -ru --show-c-function xerces-c-3.1.1.bak/src/xercesc/util/Transcoders/IconvGNU/IconvGNUTransService.cpp xerces-c-3.1.1/src/xercesc/util/Transcoders/IconvGNU/IconvGNUTransService.cpp --- xerces-c-3.1.1.bak/src/xercesc/util/Transcoders/IconvGNU/IconvGNUTransService.cpp 2010-01-20 16:45:02.000000000 +0800 +++ xerces-c-3.1.1/src/xercesc/util/Transcoders/IconvGNU/IconvGNUTransService.cpp 2010-08-04 02:07:06.000000000 +0800 @@ -1049,6 +1049,9 @@ XMLSize_t IconvGNUTranscoder::transco for (size_t cnt = 0; cnt < maxChars && srcLen; cnt++) { size_t rc = iconvFrom(startSrc, &srcLen, &orgTarget, uChSize()); if (rc == (size_t)-1) { + if (errno == EINVAL) { + break; + } if (errno != E2BIG || prevSrcLen == srcLen) { ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq, getMemoryManager()); } diff -x .svn -x CVS -ru --show-c-function xerces-c-3.1.1.bak/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp xerces-c-3.1.1/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp --- xerces-c-3.1.1.bak/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp 2010-01-20 16:45:02.000000000 +0800 +++ xerces-c-3.1.1/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp 2010-08-04 02:28:46.000000000 +0800 @@ -666,7 +666,7 @@ ICUTranscoder::transcodeTo( const XMLC ); // Rememember the status before we possibly overite the error code - const bool res = (err == U_ZERO_ERROR); + const bool res = (err == 
U_ZERO_ERROR || (err == U_BUFFER_OVERFLOW_ERROR && startSrc > srcPtr)); // Put the old handler back err = U_ZERO_ERROR; [ > ICUTransService and IconvGNUransService CAN NOT deal with huge file. > -------------------------------------------------------------------- > > Key: XERCESC-1936 > URL: https://issues.apache.org/jira/browse/XERCESC-1936 > Project: Xerces-C++ > Issue Type: Bug > Components: Utilities > Affects Versions: 2.8.0, 3.1.1 > Environment: RHEL-5.5 > glibc-2.5-49.el5_5.2 > libicu-3.6-5.11.4 > Reporter: kirby zhou > > If a huge file is passed to XMLReader, it will call TransService multiple times, > and split the file content into several fragments. > Unfortunately, the fragments may contain incomplete multi-byte characters. > But neither ICUTransService nor IconvGNUTransService deals with it. > ICUTransService does not deal with U_TRUNCATED_CHAR_FOUND, and > IconvGNUTransService does not deal with EINVAL. > Both 2.8.0 and 3.1.1 have the same bug. > For example, create two XML files like this: > ]# ( echo '<?xml version="1.0" encoding="GBK" ?>'; echo '<data>'; for > ((i=0;i<2;++i)); do echo -n '中文汉字A'; done ; echo; echo '</data>' ) > > ~/small.xml > ]# ( echo '<?xml version="1.0" encoding="GBK" ?>'; echo '<data>'; for > ((i=0;i<100000;++i)); do echo -n '中文汉字A'; done ; echo; echo '</data>' ) > > ~/big.xml > # small.xml and big.xml are analogous. > ]# samples/SAXPrint -x=gbk ~/small.xml > <?xml version="1.0" encoding="gbk"?> > <data> > 中文汉字A中文汉字A > </data> > # with icu > ]# samples/SAXPrint -x=gbk ~/big.xml > <?xml version="1.0" encoding="gbk"?> > <data> > Fatal Error at file /root/big.xml, line 3, char 16377 > Message: char 0x6C49 is not representable in 'gbk' encoding > # with iconvgnu > ]# samples/SAXPrint -x=gbk ~/big.xml > ]# samples/SAXPrint -x=gbk ~/big.xml > <?xml version="1.0" encoding="gbk"?> > <data> > Fatal Error at file /root/big.xml, line 3, char 16377 > Message: invalid multi-byte sequence -- This message is automatically generated by JIRA.
- You can reply to this email to add a comment to the issue online. --------------------------------------------------------------------- To unsubscribe, e-mail: c-dev-unsubscr...@xerces.apache.org For additional commands, e-mail: c-dev-h...@xerces.apache.org