Paul J. Lucas has proposed merging lp:~paul-lucas/zorba/bug-898075 into 
lp:zorba.

Requested reviews:
  Paul J. Lucas (paul-lucas)
Related bugs:
  Bug #898075 in Zorba: "fn:string-to-codepoints() doesn't stream"
  https://bugs.launchpad.net/zorba/+bug/898075

For more details, see:
https://code.launchpad.net/~paul-lucas/zorba/bug-898075/+merge/85410

Applied William's patch, then amended it to handle UTF-8 properly.
-- 
https://code.launchpad.net/~paul-lucas/zorba/bug-898075/+merge/85410
Your team Zorba Coders is subscribed to branch lp:zorba.
=== modified file 'include/zorba/pregenerated/diagnostic_list.h'
--- include/zorba/pregenerated/diagnostic_list.h	2011-11-15 08:23:20 +0000
+++ include/zorba/pregenerated/diagnostic_list.h	2011-12-12 23:19:26 +0000
@@ -458,6 +458,8 @@
 
 extern ZORBA_DLL_PUBLIC ZorbaErrorCode ZXQD0005_INVALID_KEY_FOR_MAP;
 
+extern ZORBA_DLL_PUBLIC ZorbaErrorCode ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE;
+
 extern ZORBA_DLL_PUBLIC ZorbaErrorCode ZAPI0002_XQUERY_COMPILATION_FAILED;
 
 extern ZORBA_DLL_PUBLIC ZorbaErrorCode ZAPI0003_XQUERY_NOT_COMPILED;

=== modified file 'modules/com/zorba-xquery/www/modules/pregenerated/errors.xq'
--- modules/com/zorba-xquery/www/modules/pregenerated/errors.xq	2011-11-15 08:23:20 +0000
+++ modules/com/zorba-xquery/www/modules/pregenerated/errors.xq	2011-12-12 23:19:26 +0000
@@ -217,6 +217,10 @@
 
 (:~
 :)
+declare variable $zerr:ZXQD0006 as xs:QName := fn:QName($zerr:NS, "zerr:ZXQD0006");
+
+(:~
+:)
 declare variable $zerr:ZAPI0002 as xs:QName := fn:QName($zerr:NS, "zerr:ZAPI0002");
 
 (:~

=== modified file 'src/diagnostics/diagnostic_en.xml'
--- src/diagnostics/diagnostic_en.xml	2011-12-07 20:46:23 +0000
+++ src/diagnostics/diagnostic_en.xml	2011-12-12 23:19:26 +0000
@@ -1722,6 +1722,10 @@
       <value>key with type $1 not subtype or castable to target type $2 of map ($3)</value>
     </diagnostic>
 
+    <diagnostic code="ZXQD0006" name="INVALID_UTF8_BYTE_SEQUENCE">
+      <value>"$1": invalid UTF-8 byte sequence</value>
+    </diagnostic>
+
     <!--////////// Zorba API Errors ////////////////////////////////////////-->
 
     <diagnostic code="ZAPI0002" name="XQUERY_COMPILATION_FAILED">

=== modified file 'src/diagnostics/pregenerated/diagnostic_list.cpp'
--- src/diagnostics/pregenerated/diagnostic_list.cpp	2011-11-15 08:23:20 +0000
+++ src/diagnostics/pregenerated/diagnostic_list.cpp	2011-12-12 23:19:26 +0000
@@ -666,6 +666,9 @@
 ZorbaErrorCode ZXQD0005_INVALID_KEY_FOR_MAP( "ZXQD0005" );
 
 
+ZorbaErrorCode ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE( "ZXQD0006" );
+
+
 ZorbaErrorCode ZAPI0002_XQUERY_COMPILATION_FAILED( "ZAPI0002" );
 
 

=== modified file 'src/diagnostics/pregenerated/dict_en.cpp'
--- src/diagnostics/pregenerated/dict_en.cpp	2011-12-01 16:19:52 +0000
+++ src/diagnostics/pregenerated/dict_en.cpp	2011-12-12 23:19:26 +0000
@@ -365,6 +365,7 @@
   { "ZXQD0003", "inconsistent options to the parse-xml-fragment() function: $1" },
   { "ZXQD0004", "invalid parameter: $1" },
   { "ZXQD0005", "key with type $1 not subtype or castable to target type $2 of map ($3)" },
+  { "ZXQD0006", "\"$1\": invalid UTF-8 byte sequence" },
   { "ZXQP0000", "no error" },
   { "ZXQP0001", "dynamic runtime error${: 1}" },
   { "ZXQP0002", "\"$1\": assertion failed" },

=== modified file 'src/runtime/spec/strings/strings.xml'
--- src/runtime/spec/strings/strings.xml	2011-12-01 11:02:25 +0000
+++ src/runtime/spec/strings/strings.xml	2011-12-12 23:19:26 +0000
@@ -57,7 +57,8 @@
     <zorba:member type="xs_unsignedInt" name="theIterator"
                   brief="the current iterator"/>
     <zorba:member type="checked_vector&lt;xs_unsignedInt&gt;" name="theResult"
-                  brief="the resulting vector"/>
+      brief="the resulting vector"/>
+    <zorba:member type="std::istream*" name="theStream" />
   </zorba:state>
 
 </zorba:iterator>

=== modified file 'src/runtime/strings/pregenerated/strings.h'
--- src/runtime/strings/pregenerated/strings.h	2011-12-01 11:02:25 +0000
+++ src/runtime/strings/pregenerated/strings.h	2011-12-12 23:19:26 +0000
@@ -82,6 +82,7 @@
 public:
   xs_unsignedInt theIterator; //the current iterator
   checked_vector<xs_unsignedInt> theResult; //the resulting vector
+  std::istream* theStream; //
 
   StringToCodepointsIteratorState();
 

=== modified file 'src/runtime/strings/strings_impl.cpp'
--- src/runtime/strings/strings_impl.cpp	2011-12-01 16:19:52 +0000
+++ src/runtime/strings/strings_impl.cpp	2011-12-12 23:19:26 +0000
@@ -120,22 +120,76 @@
 
   if (consumeNext(item, theChildren [0].getp(), planState ))
   {
-    item->getStringValue2(inputStr);
-
-    if (!inputStr.empty())
-    {
-      utf8::to_codepoints(inputStr, &state->theResult);
-
-      while (state->theIterator < state->theResult.size())
-      {
-        GENV_ITEMFACTORY->createInteger(
-          result,
-          Integer(state->theResult[state->theIterator])
-        );
-
-        STACK_PUSH(true, state );
-        state->theIterator = state->theIterator + 1;
-      }
+    if(!item->isStreamable())
+    {
+      item->getStringValue2(inputStr);
+    }
+    else
+    {
+      state->theStream = &item->getStream();
+    }
+  }
+
+  if ( state->theStream )
+  {
+    while ( !state->theStream->eof() )
+    {
+      utf8::encoded_char_type ec;
+      ::bzero( ec, sizeof( ec ) );
+      utf8::storage_type *p;
+      p = ec;
+
+      if ( utf8::read( *state->theStream, ec ) == utf8::npos )
+        if ( state->theStream->good() ) {
+          //
+          // If read() failed but the stream state is good, it means that an
+          // invalid byte was encountered.
+          //
+          char buf[ 6 /* bytes at most */ * 5 /* chars per byte */ ], *b = buf;
+          bool first = true;
+          for ( ; *p; ++p ) {
+            if ( first )
+              first = false;
+            else
+              *b++ = ',';
+            ::strcpy( b, "0x" );          b += 2;
+            ::sprintf( b, "%0hhX", *p );  b += 2;
+          }
+          throw XQUERY_EXCEPTION(
+            zerr::ZXQD0006_INVALID_UTF8_BYTE_SEQUENCE,
+            ERROR_PARAMS( buf ),
+            ERROR_LOC( loc )
+          );
+        } else {
+          throw XQUERY_EXCEPTION(
+            zerr::ZOSE0003_STREAM_READ_FAILURE, ERROR_LOC( loc )
+          );
+        }
+      state->theResult.clear();
+      state->theResult.push_back( utf8::next_char( p ) );
+      
+      GENV_ITEMFACTORY->createInteger(
+        result,
+        Integer(state->theResult[0])
+      );
+
+      STACK_PUSH(true, state );
+      state->theIterator = state->theIterator + 1;
+    }
+  }
+  else if (!inputStr.empty())
+  {
+    utf8::to_codepoints(inputStr, &state->theResult);
+
+    while (state->theIterator < state->theResult.size())
+    {
+      GENV_ITEMFACTORY->createInteger(
+        result,
+        Integer(state->theResult[state->theIterator])
+      );
+
+      STACK_PUSH(true, state );
+      state->theIterator = state->theIterator + 1;
     }
   }
   STACK_END (state);
@@ -146,6 +200,7 @@
 {
   PlanIteratorState::init(planState);
   theIterator = 0;
+  theStream   = 0;
   theResult.clear();
 }
 

=== modified file 'src/util/utf8_util.cpp'
--- src/util/utf8_util.cpp	2011-07-17 00:10:56 +0000
+++ src/util/utf8_util.cpp	2011-12-12 23:19:26 +0000
@@ -22,6 +22,7 @@
 #include "cxx_util.h"
 #include "utf8_util.h"
 
+using namespace std;
 #ifndef ZORBA_NO_UNICODE
 U_NAMESPACE_USE
 #endif /* ZORBA_NO_UNICODE */
@@ -152,6 +153,22 @@
   return len;
 }
 
+size_type read( istream &i, storage_type **ps ) {
+  storage_type *&p = *ps;               // advance the caller's pointer
+  char c = i.get();
+  if ( !i.good() ) return npos;         // EOF or I/O error: nothing stored
+  *p++ = c;                             // store first so callers can report it
+  if ( !is_start_byte( c ) ) return npos;
+  size_type const len = char_length( c );
+  for ( size_type n = 1; n < len; ++n ) {
+    c = i.get();
+    if ( !i.good() ) return npos;       // truncated sequence
+    *p++ = c;                           // stored even if invalid, see above
+    if ( !is_continuation_byte( c ) ) return npos;
+  }
+  return len;
+}
+
 #ifndef ZORBA_NO_UNICODE
 
 bool to_string( unicode::char_type const *in, unicode::size_type in_len,

=== modified file 'src/util/utf8_util_base.h'
--- src/util/utf8_util_base.h	2011-12-01 16:19:52 +0000
+++ src/util/utf8_util_base.h	2011-12-12 23:19:26 +0000
@@ -18,6 +18,7 @@
 #define ZORBA_UTF8_UTIL_BASE_H
 
 #include <cstddef>
+#include <iostream>
 #include <iterator>
 #include <stdexcept>
 
@@ -164,6 +165,32 @@
 template<class OctetIterator>
 unicode::code_point prev_char( OctetIterator &i );
 
+/**
+ * Reads bytes from an istream until an entire UTF-8 character has been read.
+ *
+ * @param i The istream to read from.
+ * @param ps A pointer to a pointer to what will be the first byte of a UTF-8
+ * byte sequence.  The pointer is advanced to one byte past the newly read
+ * character.
+ * @return Returns the number of bytes comprising the UTF-8 character (which
+ * equals the number of bytes read) or \c npos if either EOF was reached or the
+ * bytes read are an invalid UTF-8 byte sequence.
+ */
+size_type read( std::istream &i, storage_type **ps );
+
+/**
+ * Reads bytes from an istream until an entire UTF-8 character has been read.
+ * The pointer is passed by value, so the caller's pointer is not advanced.
+ *
+ * @param i The istream to read from.
+ * @param p A pointer to where the first byte of the UTF-8 byte sequence goes.
+ * @return Returns the number of bytes comprising the UTF-8 character (which
+ * equals the number of bytes read) or \c npos if either EOF was reached or the
+ * bytes read are an invalid UTF-8 byte sequence.
+ */
+inline size_type read( std::istream &i, storage_type *p ) {
+  return read( i, &p );
+}
+
 ////////// Character access ///////////////////////////////////////////////////
 
 /**

-- 
Mailing list: https://launchpad.net/~zorba-coders
Post to     : zorba-coders@lists.launchpad.net
Unsubscribe : https://launchpad.net/~zorba-coders
More help   : https://help.launchpad.net/ListHelp

Reply via email to