Paul J. Lucas has proposed merging lp:~paul-lucas/zorba/bug-898075 into lp:zorba.

Requested reviews:
  Paul J. Lucas (paul-lucas)
Related bugs:
  Bug #898075 in Zorba: "fn:string-to-codepoints() doesn't stream"
  https://bugs.launchpad.net/zorba/+bug/898075

For more details, see:
https://code.launchpad.net/~paul-lucas/zorba/bug-898075/+merge/85410

Applied William's patch; patched William's patch to handle UTF-8 properly.
--
https://code.launchpad.net/~paul-lucas/zorba/bug-898075/+merge/85410
Your team Zorba Coders is subscribed to branch lp:zorba.

=== modified file 'include/zorba/pregenerated/diagnostic_list.h' --- include/zorba/pregenerated/diagnostic_list.h 2011-11-15 08:23:20 +0000 +++ include/zorba/pregenerated/diagnostic_list.h 2011-12-12 23:19:26 +0000 @@ -458,6 +458,8 @@ extern ZORBA_DLL_PUBLIC ZorbaErrorCode ZXQD0005_INVALID_KEY_FOR_MAP; +extern ZORBA_DLL_PUBLIC ZorbaErrorCode ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE; + extern ZORBA_DLL_PUBLIC ZorbaErrorCode ZAPI0002_XQUERY_COMPILATION_FAILED; extern ZORBA_DLL_PUBLIC ZorbaErrorCode ZAPI0003_XQUERY_NOT_COMPILED; === modified file 'modules/com/zorba-xquery/www/modules/pregenerated/errors.xq' --- modules/com/zorba-xquery/www/modules/pregenerated/errors.xq 2011-11-15 08:23:20 +0000 +++ modules/com/zorba-xquery/www/modules/pregenerated/errors.xq 2011-12-12 23:19:26 +0000 @@ -217,6 +217,10 @@ (:~ :) +declare variable $zerr:ZXQD0006 as xs:QName := fn:QName($zerr:NS, "zerr:ZXQD0006"); + +(:~ +:) declare variable $zerr:ZAPI0002 as xs:QName := fn:QName($zerr:NS, "zerr:ZAPI0002"); (:~ === modified file 'src/diagnostics/diagnostic_en.xml' --- src/diagnostics/diagnostic_en.xml 2011-12-07 20:46:23 +0000 +++ src/diagnostics/diagnostic_en.xml 2011-12-12 23:19:26 +0000 @@ -1722,6 +1722,10 @@ <value>key with type $1 not subtype or castable to target type $2 of map ($3)</value> </diagnostic> + <diagnostic code="ZXQD0006" name="INVALID_UTF8_BYTE_SEQUENCE"> + <value>"$1": invalid UTF-8 byte sequence</value> + </diagnostic> + <!--////////// Zorba API Errors ////////////////////////////////////////--> <diagnostic code="ZAPI0002" name="XQUERY_COMPILATION_FAILED"> === modified file 'src/diagnostics/pregenerated/diagnostic_list.cpp' --- src/diagnostics/pregenerated/diagnostic_list.cpp 2011-11-15 08:23:20 +0000 +++ src/diagnostics/pregenerated/diagnostic_list.cpp 2011-12-12 23:19:26 +0000 @@ -666,6 +666,9 @@ ZorbaErrorCode ZXQD0005_INVALID_KEY_FOR_MAP( "ZXQD0005" ); +ZorbaErrorCode ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE( "ZXQD0006" ); + + ZorbaErrorCode 
ZAPI0002_XQUERY_COMPILATION_FAILED( "ZAPI0002" ); === modified file 'src/diagnostics/pregenerated/dict_en.cpp' --- src/diagnostics/pregenerated/dict_en.cpp 2011-12-01 16:19:52 +0000 +++ src/diagnostics/pregenerated/dict_en.cpp 2011-12-12 23:19:26 +0000 @@ -365,6 +365,7 @@ { "ZXQD0003", "inconsistent options to the parse-xml-fragment() function: $1" }, { "ZXQD0004", "invalid parameter: $1" }, { "ZXQD0005", "key with type $1 not subtype or castable to target type $2 of map ($3)" }, + { "ZXQD0006", "\"$1\": invalid UTF-8 byte sequence" }, { "ZXQP0000", "no error" }, { "ZXQP0001", "dynamic runtime error${: 1}" }, { "ZXQP0002", "\"$1\": assertion failed" }, === modified file 'src/runtime/spec/strings/strings.xml' --- src/runtime/spec/strings/strings.xml 2011-12-01 11:02:25 +0000 +++ src/runtime/spec/strings/strings.xml 2011-12-12 23:19:26 +0000 @@ -57,7 +57,8 @@ <zorba:member type="xs_unsignedInt" name="theIterator" brief="the current iterator"/> <zorba:member type="checked_vector<xs_unsignedInt>" name="theResult" - brief="the resulting vector"/> + brief="the resulting vector"/> + <zorba:member type="std::istream*" name="theStream" /> </zorba:state> </zorba:iterator> === modified file 'src/runtime/strings/pregenerated/strings.h' --- src/runtime/strings/pregenerated/strings.h 2011-12-01 11:02:25 +0000 +++ src/runtime/strings/pregenerated/strings.h 2011-12-12 23:19:26 +0000 @@ -82,6 +82,7 @@ public: xs_unsignedInt theIterator; //the current iterator checked_vector<xs_unsignedInt> theResult; //the resulting vector + std::istream* theStream; // StringToCodepointsIteratorState(); === modified file 'src/runtime/strings/strings_impl.cpp' --- src/runtime/strings/strings_impl.cpp 2011-12-01 16:19:52 +0000 +++ src/runtime/strings/strings_impl.cpp 2011-12-12 23:19:26 +0000 @@ -120,22 +120,76 @@ if (consumeNext(item, theChildren [0].getp(), planState )) { - item->getStringValue2(inputStr); - - if (!inputStr.empty()) - { - utf8::to_codepoints(inputStr, &state->theResult); - - while 
(state->theIterator < state->theResult.size()) - { - GENV_ITEMFACTORY->createInteger( - result, - Integer(state->theResult[state->theIterator]) - ); - - STACK_PUSH(true, state ); - state->theIterator = state->theIterator + 1; - } + if(!item->isStreamable()) + { + item->getStringValue2(inputStr); + } + else + { + state->theStream = &item->getStream(); + } + } + + if ( state->theStream ) + { + while ( !state->theStream->eof() ) + { + utf8::encoded_char_type ec; + ::bzero( ec, sizeof( ec ) ); + utf8::storage_type *p; + p = ec; + + if ( utf8::read( *state->theStream, ec ) == utf8::npos ) + if ( state->theStream->good() ) { + // + // If read() failed but the stream state is good, it means that an + // invalid byte was encountered. + // + char buf[ 6 /* bytes at most */ * 5 /* chars per byte */ ], *b = buf; + bool first = true; + for ( ; *p; ++p ) { + if ( first ) + first = false; + else + *b++ = ','; + ::strcpy( b, "0x" ); b += 2; + ::sprintf( b, "%0hhX", *p ); b += 2; + } + throw XQUERY_EXCEPTION( + zerr::ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE, + ERROR_PARAMS( buf ), + ERROR_LOC( loc ) + ); + } else { + throw XQUERY_EXCEPTION( + zerr::ZOSE0003_STREAM_READ_FAILURE, ERROR_LOC( loc ) + ); + } + state->theResult.clear(); + state->theResult.push_back( utf8::next_char( p ) ); + + GENV_ITEMFACTORY->createInteger( + result, + Integer(state->theResult[0]) + ); + + STACK_PUSH(true, state ); + state->theIterator = state->theIterator + 1; + } + } + else if (!inputStr.empty()) + { + utf8::to_codepoints(inputStr, &state->theResult); + + while (state->theIterator < state->theResult.size()) + { + GENV_ITEMFACTORY->createInteger( + result, + Integer(state->theResult[state->theIterator]) + ); + + STACK_PUSH(true, state ); + state->theIterator = state->theIterator + 1; } } STACK_END (state); @@ -146,6 +200,7 @@ { PlanIteratorState::init(planState); theIterator = 0; + theStream = 0; theResult.clear(); } === modified file 'src/util/utf8_util.cpp' --- src/util/utf8_util.cpp 2011-07-17 00:10:56 
+0000 +++ src/util/utf8_util.cpp 2011-12-12 23:19:26 +0000 @@ -22,6 +22,7 @@ #include "cxx_util.h" #include "utf8_util.h" +using namespace std; #ifndef ZORBA_NO_UNICODE U_NAMESPACE_USE #endif /* ZORBA_NO_UNICODE */ @@ -152,6 +153,22 @@ return len; } +size_type read( istream &i, storage_type **ps ) { + char c = i.get(); + if ( !i.good() || !is_start_byte( c ) ) + return npos; + storage_type *&p = *ps; + *p++ = c; + size_type const len = char_length( c ); + for ( size_type n = 1; n < len; ++n ) { + c = i.get(); + if ( !i.good() || !is_continuation_byte( c ) ) + return npos; + *p++ = c; + } + return len; +} + #ifndef ZORBA_NO_UNICODE bool to_string( unicode::char_type const *in, unicode::size_type in_len, === modified file 'src/util/utf8_util_base.h' --- src/util/utf8_util_base.h 2011-12-01 16:19:52 +0000 +++ src/util/utf8_util_base.h 2011-12-12 23:19:26 +0000 @@ -18,6 +18,7 @@ #define ZORBA_UTF8_UTIL_BASE_H #include <cstddef> +#include <iostream> #include <iterator> #include <stdexcept> @@ -164,6 +165,32 @@ template<class OctetIterator> unicode::code_point prev_char( OctetIterator &i ); +/** + * Reads bytes from an istream until an entire UTF-8 character has been read. + * + * @param i The istream to read from. + * @param ps A pointer to a pointer to what will be the first byte of a UTF-8 + * byte sequence. The pointer is advanced to one byte past the newly read + * character. + * @return Returns the number of bytes comprising the UTF-8 character (which + * equals the number of bytes read) or \c npos if either EOF was reached or the + * bytes read are an invalid UTF-8 byte sequence. + */ +size_type read( std::istream &i, storage_type **ps ); + +/** + * Reads bytes from an istream until an entire UTF-8 character has been read. + * + * @param i The istream to read from. + * @param p A pointer to what will be the first byte of a UTF-8 byte sequence. 
+ * @return Returns the number of bytes comprising the UTF-8 character (which + * equals the number of bytes read) or \c npos if either EOF was reached or the + * bytes read are an invalid UTF-8 byte sequence. + */ +inline size_type read( std::istream &i, storage_type *p ) { + return read( i, &p ); +} + ////////// Character access /////////////////////////////////////////////////// /**
--
Mailing list: https://launchpad.net/~zorba-coders
Post to     : zorba-coders@lists.launchpad.net
Unsubscribe : https://launchpad.net/~zorba-coders
More help   : https://help.launchpad.net/ListHelp