Paul J. Lucas has proposed merging lp:~zorba-coders/zorba/feature-ft_module 
into lp:zorba.

Requested reviews:
  Matthias Brantner (matthias-brantner)
  Paul J. Lucas (paul-lucas)
Related bugs:
  Bug #944795 in Zorba: "XQDoc doesn't handle & in URLs"
  https://bugs.launchpad.net/zorba/+bug/944795

For more details, see:
https://code.launchpad.net/~zorba-coders/zorba/feature-ft_module/+merge/105913

Renamed Tokenizer::Numbers to Tokenizer::State now (just prior to the 2.5
release) so that it already has a fitting name when the ability to tokenize
using include/exclude Item lists is added.  At that point, State will most
likely be expanded to hold additional state information beyond just the
numbers, hence the name change.
-- 
https://code.launchpad.net/~zorba-coders/zorba/feature-ft_module/+merge/105913
Your team Zorba Coders is subscribed to branch lp:zorba.
=== modified file 'include/zorba/tokenizer.h'
--- include/zorba/tokenizer.h	2012-05-03 12:31:51 +0000
+++ include/zorba/tokenizer.h	2012-05-16 00:57:21 +0000
@@ -48,9 +48,10 @@
   /////////////////////////////////////////////////////////////////////////////
 
   /**
-   * A %Numbers contains the current token, sentence, and paragraph numbers.
+   * A %State contains inter-Tokenizer state, currently the current token,
+   * sentence, and paragraph numbers.
    */
-  struct Numbers {
+  struct State {
     typedef Tokenizer::size_type value_type;
 
     value_type token; ///< Token number.
@@ -60,7 +61,7 @@
     /**
      * Default constructor.
      */
-    Numbers();
+    State();
   };
 
   /////////////////////////////////////////////////////////////////////////////
@@ -125,7 +126,7 @@
 
     /**
      * If \c true, XML processing instructions separate tokens.  For example,
-     * <code>net<?PI pi?>work</code> would be 2 tokens instead of 1.
+     * <code>net&lt;?PI pi?&gt;work</code> would be 2 tokens instead of 1.
      */
     bool processing_instructions_separate_tokens;
 
@@ -162,18 +163,18 @@
   virtual void destroy() const = 0;
 
   /**
-   * Gets this %Tokenizer's associated Numbers.
+   * Gets this %Tokenizer's associated State.
    *
-   * @return Returns said Numbers.
+   * @return Returns said State.
    */
-  Numbers& numbers();
+  State& state();
 
   /**
-   * Gets this %Tokenizer's associated Numbers.
+   * Gets this %Tokenizer's associated State.
    *
-   * @return Returns said Numbers.
+   * @return Returns said State.
    */
-  Numbers const& numbers() const;
+  State const& state() const;
 
   /**
    * Tokenizes the given node.
@@ -207,9 +208,9 @@
   /**
    * Constructs a %Tokenizer.
    *
-   * @param numbers the Numbers to use.
+   * @param state the State to use.
    */
-  Tokenizer( Numbers &numbers );
+  Tokenizer( State &state );
 
   /**
    * Destroys a %Tokenizer.
@@ -255,18 +256,18 @@
                                    Callback &callback, bool tokenize_acp );
 
 private:
-  Numbers *numbers_;
+  State *state_;
 };
 
-inline Tokenizer::Tokenizer( Numbers &numbers ) : numbers_( &numbers ) {
-}
-
-inline Tokenizer::Numbers& Tokenizer::numbers() {
-  return *numbers_;
-}
-
-inline Tokenizer::Numbers const& Tokenizer::numbers() const {
-  return *numbers_;
+inline Tokenizer::Tokenizer( State &state ) : state_( &state ) {
+}
+
+inline Tokenizer::State& Tokenizer::state() {
+  return *state_;
+}
+
+inline Tokenizer::State const& Tokenizer::state() const {
+  return *state_;
 }
 
 inline void Tokenizer::tokenize_node( Item const &item,
@@ -288,13 +289,13 @@
    * Creates a new %Tokenizer.
    *
    * @param lang The language of the text that the tokenizer will tokenize.
-   * @param numbers The Numbers to use.  If \c null, \a t is not set.
+   * @param state The State to use.  If \c null, \a t is not set.
    * @param t If not \c null, set to point to a Tokenizer for \a lang.
    * @return Returns \c true only if this provider can provide a tokenizer for
    * \a lang.
    */
   virtual bool getTokenizer( locale::iso639_1::type lang,
-                             Tokenizer::Numbers *numbers = 0,
+                             Tokenizer::State *state = 0,
                              Tokenizer::ptr *t = 0 ) const = 0;
 };
 

=== modified file 'src/runtime/full_text/apply.cpp'
--- src/runtime/full_text/apply.cpp	2012-05-03 12:31:51 +0000
+++ src/runtime/full_text/apply.cpp	2012-05-16 00:57:21 +0000
@@ -1251,11 +1251,11 @@
   FTTokenSeqIterator::FTTokens synonyms;
   thesaurus_callback cb( qt0.pos(), qt0.lang(), synonyms );
 
-  Tokenizer::Numbers t_num;
+  Tokenizer::State t_state;
   TokenizerProvider const *const provider = GENV_STORE.getTokenizerProvider();
   ZORBA_ASSERT( provider );
   Tokenizer::ptr tokenizer;
-  if ( !provider->getTokenizer( qt0.lang(), &t_num, &tokenizer ) )
+  if ( !provider->getTokenizer( qt0.lang(), &t_state, &tokenizer ) )
     throw XQUERY_EXCEPTION(
       err::FTST0009,
       ERROR_PARAMS(

=== modified file 'src/runtime/full_text/ft_module_impl.cpp'
--- src/runtime/full_text/ft_module_impl.cpp	2012-05-15 21:13:21 +0000
+++ src/runtime/full_text/ft_module_impl.cpp	2012-05-16 00:57:21 +0000
@@ -552,7 +552,7 @@
   zstring base_uri;
   store::Item_t item;
   iso639_1::type lang;
-  Tokenizer::Numbers no;
+  Tokenizer::State t_state;
   store::NsBindings const ns_bindings;
   TokenizerProvider const *tokenizer_provider;
   store::Item_t type_name;
@@ -574,7 +574,7 @@
     tokenizer_provider = GENV_STORE.getTokenizerProvider();
     ZORBA_ASSERT( tokenizer_provider );
     state->doc_tokens_ =
-      state->doc_item_->getTokens( *tokenizer_provider, no, lang );
+      state->doc_item_->getTokens( *tokenizer_provider, t_state, lang );
 
     while ( state->doc_tokens_->hasNext() ) {
       FTToken const *token;
@@ -667,7 +667,7 @@
   store::Item_t element, item, junk, name;
   zstring base_uri;
   iso639_1::type lang;
-  Tokenizer::Numbers no;
+  Tokenizer::State t_state;
   store::NsBindings const ns_bindings;
   Tokenizer::ptr tokenizer;
   store::Item_t type_name;
@@ -689,7 +689,7 @@
 
   tokenizer_provider = GENV_STORE.getTokenizerProvider();
   ZORBA_ASSERT( tokenizer_provider );
-  if ( !tokenizer_provider->getTokenizer( lang, &no, &tokenizer ) )
+  if ( !tokenizer_provider->getTokenizer( lang, &t_state, &tokenizer ) )
     throw XQUERY_EXCEPTION(
       err::FTST0009 /* lang not supported */,
       ERROR_PARAMS(
@@ -826,9 +826,9 @@
     TokenizerProvider const *const tokenizer_provider =
       GENV_STORE.getTokenizerProvider();
     ZORBA_ASSERT( tokenizer_provider );
-    Tokenizer::Numbers no;
+    Tokenizer::State t_state;
     Tokenizer::ptr tokenizer;
-    if ( !tokenizer_provider->getTokenizer( lang, &no, &tokenizer ) )
+    if ( !tokenizer_provider->getTokenizer( lang, &t_state, &tokenizer ) )
       throw XQUERY_EXCEPTION(
         err::FTST0009 /* lang not supported */,
         ERROR_PARAMS(

=== modified file 'src/runtime/full_text/ftcontains_visitor.cpp'
--- src/runtime/full_text/ftcontains_visitor.cpp	2012-05-03 12:31:51 +0000
+++ src/runtime/full_text/ftcontains_visitor.cpp	2012-05-16 00:57:21 +0000
@@ -426,9 +426,9 @@
     // actual query.
     //
     while ( PlanIterator::consumeNext( item, plan_iter, plan_state_ ) ) {
-      Tokenizer::Numbers no;
+      Tokenizer::State t_state;
       query_item_t const qi(
-        item->getTokens( tokenizer_provider, no, lang, wildcards )
+        item->getTokens( tokenizer_provider, t_state, lang, wildcards )
       );
       if ( qi->hasNext() )
         query_items.push_back( qi );

=== modified file 'src/runtime/full_text/full_text_impl.cpp'
--- src/runtime/full_text/full_text_impl.cpp	2012-05-03 12:31:51 +0000
+++ src/runtime/full_text/full_text_impl.cpp	2012-05-16 00:57:21 +0000
@@ -84,9 +84,9 @@
   tokenizer_provider = GENV_STORE.getTokenizerProvider();
 
   while ( !ftcontains && consumeNext( doc_item, search_ctx, plan_state ) ) {
-    Tokenizer::Numbers no;
+    Tokenizer::State t_state;
     FTTokenIterator_t doc_tokens(
-      doc_item->getTokens( *tokenizer_provider, no, lang )
+      doc_item->getTokens( *tokenizer_provider, t_state, lang )
     );
     store::Item_t ignore_item;
     if ( ftignore )

=== modified file 'src/runtime/full_text/icu_tokenizer.cpp'
--- src/runtime/full_text/icu_tokenizer.cpp	2012-05-03 12:31:51 +0000
+++ src/runtime/full_text/icu_tokenizer.cpp	2012-05-16 00:57:21 +0000
@@ -130,8 +130,8 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
-ICU_Tokenizer::ICU_Tokenizer( iso639_1::type lang, Numbers &no ) :
-  Tokenizer( no ),
+ICU_Tokenizer::ICU_Tokenizer( iso639_1::type lang, State &state ) :
+  Tokenizer( state ),
   lang_( lang )
 {
   Locale const &icu_locale = get_icu_locale_for( lang );
@@ -381,9 +381,9 @@
         cout << "  setting token" << endl;
 #       endif
         t.set(
-          utf8_buf, utf8_len, numbers().token, numbers().sent, numbers().para
+          utf8_buf, utf8_len, state().token, state().sent, state().para
         );
-        ++numbers().token;
+        ++state().token;
       }
     }
 
@@ -408,7 +408,7 @@
       // The addition of the "if" fixes:
       // https://bugs.launchpad.net/bugs/863320
       if ( sent_end != BreakIterator::DONE )
-        ++numbers().sent;
+        ++state().sent;
     }
   } // while
 
@@ -419,7 +419,7 @@
   t.send( item, callback );
   // Incrementing "sent" here fixes:
   // https://bugs.launchpad.net/bugs/897800
-  ++numbers().sent;
+  ++state().sent;
 #if DEBUG_TOKENIZER
   cout << "--------------------\n";
 #endif /* DEBUG_TOKENIZER */
@@ -428,13 +428,13 @@
 ///////////////////////////////////////////////////////////////////////////////
 
 bool ICU_TokenizerProvider::getTokenizer( iso639_1::type lang,
-                                          Tokenizer::Numbers *num,
+                                          Tokenizer::State *state,
                                           Tokenizer::ptr *t ) const {
   for ( int32_t n = ubrk_countAvailable(), i = 0; i < n; ++i ) {
     if ( char const *const icu_locale = ubrk_getAvailable( i ) )
       if ( lang == find_lang( icu_locale ) ) {
-        if ( num && t )
-          t->reset( new ICU_Tokenizer( lang, *num ) );
+        if ( state && t )
+          t->reset( new ICU_Tokenizer( lang, *state ) );
         return true;
       }
   }

=== modified file 'src/runtime/full_text/icu_tokenizer.h'
--- src/runtime/full_text/icu_tokenizer.h	2012-05-03 12:31:51 +0000
+++ src/runtime/full_text/icu_tokenizer.h	2012-05-16 00:57:21 +0000
@@ -40,9 +40,9 @@
    * Constructs an %ICU_Tokenizer.
    *
    * @param lang The language of the text that the tokenizer will tokenize.
-   * @param no The Numbers to use.
+   * @param state The State to use.
    */
-  ICU_Tokenizer( locale::iso639_1::type lang, Numbers &no );
+  ICU_Tokenizer( locale::iso639_1::type lang, State &state );
 
   ~ICU_Tokenizer();
 
@@ -67,7 +67,7 @@
   ICU_TokenizerProvider() { }           // needed to work-around compiler bug
 
   // inherited
-  bool getTokenizer( locale::iso639_1::type, Tokenizer::Numbers* = 0,
+  bool getTokenizer( locale::iso639_1::type, Tokenizer::State* = 0,
                      Tokenizer::ptr* = 0 ) const;
 };
 

=== modified file 'src/runtime/full_text/latin_tokenizer.cpp'
--- src/runtime/full_text/latin_tokenizer.cpp	2012-05-03 12:31:51 +0000
+++ src/runtime/full_text/latin_tokenizer.cpp	2012-05-16 00:57:21 +0000
@@ -242,12 +242,12 @@
 ///////////////////////////////////////////////////////////////////////////////
 
 bool LatinTokenizerProvider::getTokenizer( iso639_1::type lang,
-                                           Tokenizer::Numbers *num,
+                                           Tokenizer::State *state,
                                            Tokenizer::ptr *t ) const {
   switch ( lang ) {
     case iso639_1::en:
-      if ( num && t )
-        t->reset( new LatinTokenizer( *num ) );
+      if ( state && t )
+        t->reset( new LatinTokenizer( *state ) );
       return true;
     default:
       return false;

=== modified file 'src/runtime/full_text/latin_tokenizer.h'
--- src/runtime/full_text/latin_tokenizer.h	2012-05-03 12:31:51 +0000
+++ src/runtime/full_text/latin_tokenizer.h	2012-05-16 00:57:21 +0000
@@ -34,7 +34,7 @@
  */
 class LatinTokenizer : public Tokenizer {
 public:
-  LatinTokenizer( Numbers &num ) : Tokenizer( num ) { }
+  LatinTokenizer( State &state ) : Tokenizer( state ) { }
 
   // inherited
   void destroy() const;
@@ -66,7 +66,7 @@
 class LatinTokenizerProvider : public TokenizerProvider {
 public:
   // inherited
-  bool getTokenizer( locale::iso639_1::type, Tokenizer::Numbers* = 0,
+  bool getTokenizer( locale::iso639_1::type, Tokenizer::State* = 0,
                      Tokenizer::ptr* = 0 ) const;
 };
 

=== modified file 'src/runtime/full_text/tokenizer.cpp'
--- src/runtime/full_text/tokenizer.cpp	2012-05-03 12:31:51 +0000
+++ src/runtime/full_text/tokenizer.cpp	2012-05-16 00:57:21 +0000
@@ -59,7 +59,7 @@
 void Tokenizer::item( Item const &item, bool entering ) {
   if ( entering && item.isNode() &&
        item.getNodeKind() == store::StoreConsts::elementNode ) {
-    ++numbers().para;
+    ++state().para;
   }
 }
 
@@ -78,7 +78,7 @@
         if ( find_lang_attribute( item, &lang ) ) {
           TokenizerProvider const *const p = GENV_STORE.getTokenizerProvider();
           ZORBA_ASSERT( p );
-          if ( !p->getTokenizer( lang, numbers_, &t_ptr ) )
+          if ( !p->getTokenizer( lang, state_, &t_ptr ) )
             break;
           t_raw = t_ptr.get();
         }
@@ -109,7 +109,7 @@
   }
 }
 
-Tokenizer::Numbers::Numbers() {
+Tokenizer::State::State() {
   token = para = 0;
   sent = 1;
 }

=== modified file 'src/store/api/item.h'
--- src/store/api/item.h	2012-05-03 12:31:51 +0000
+++ src/store/api/item.h	2012-05-16 00:57:21 +0000
@@ -838,13 +838,13 @@
    * Gets the tokens for this item.
    *
    * @param provider The TokenizerProvider to use.
-   * @param numbers The Tokenizer::Numbers to use.
+   * @param state The Tokenizer::State to use.
    * @param lang The language to use for tokenization.
    * @param wildcards If \c true, allow XQuery wildcard syntax.
    * @return Returns an iterator over the tokens.
    */
   virtual FTTokenIterator_t
-  getTokens(TokenizerProvider const &provider, Tokenizer::Numbers &numbers,
+  getTokens(TokenizerProvider const &provider, Tokenizer::State &state,
             locale::iso639_1::type lang, bool wildcards = false) const;
 #endif /* ZORBA_NO_FULL_TEXT */
 

=== modified file 'src/store/naive/atomic_items.cpp'
--- src/store/naive/atomic_items.cpp	2012-05-15 21:12:27 +0000
+++ src/store/naive/atomic_items.cpp	2012-05-16 00:57:21 +0000
@@ -1651,7 +1651,7 @@
 #ifndef ZORBA_NO_FULL_TEXT
 FTTokenIterator_t StringItem::getTokens( 
     TokenizerProvider const &provider,
-    Tokenizer::Numbers &numbers,
+    Tokenizer::State &state,
     iso639_1::type lang,
     bool wildcards ) const
 {
@@ -1660,7 +1660,7 @@
   AtomicItemTokenizerCallback callback( *tokens );
 
   Tokenizer::ptr tokenizer;
-  if ( provider.getTokenizer( lang, &numbers, &tokenizer ) )
+  if ( provider.getTokenizer( lang, &state, &tokenizer ) )
     tokenizer->tokenize_string(
       theValue.data(), theValue.size(), lang, wildcards, callback
     );

=== modified file 'src/store/naive/atomic_items.h'
--- src/store/naive/atomic_items.h	2012-05-08 01:09:52 +0000
+++ src/store/naive/atomic_items.h	2012-05-16 00:57:21 +0000
@@ -852,7 +852,7 @@
 #ifndef ZORBA_NO_FULL_TEXT
   FTTokenIterator_t getTokens( 
       TokenizerProvider const&,
-      Tokenizer::Numbers&,
+      Tokenizer::State&,
       locale::iso639_1::type,
       bool = false ) const;
 #endif /* ZORBA_NO_FULL_TEXT */

=== modified file 'src/store/naive/item.cpp'
--- src/store/naive/item.cpp	2012-05-03 12:31:51 +0000
+++ src/store/naive/item.cpp	2012-05-16 00:57:21 +0000
@@ -354,7 +354,7 @@
 
 #ifndef ZORBA_NO_FULL_TEXT
 FTTokenIterator_t
-Item::getTokens( TokenizerProvider const&, Tokenizer::Numbers&,
+Item::getTokens( TokenizerProvider const&, Tokenizer::State&,
                  locale::iso639_1::type, bool ) const
 {
   throw ZORBA_EXCEPTION(

=== modified file 'src/store/naive/node_items.cpp'
--- src/store/naive/node_items.cpp	2012-05-08 23:31:37 +0000
+++ src/store/naive/node_items.cpp	2012-05-16 00:57:21 +0000
@@ -4822,7 +4822,7 @@
 
 FTTokenIterator_t
 AttributeNode::getTokens( TokenizerProvider const &provider,
-                          Tokenizer::Numbers &numbers, iso639_1::type lang,
+                          Tokenizer::State &state, iso639_1::type lang,
                           bool ) const
 {
   FTTokenStore &token_store = getTree()->getTokenStore();
@@ -4838,7 +4838,7 @@
 
     zorba::Item const api_attr( this );
     Tokenizer::ptr tokenizer;
-    if ( provider.getTokenizer( lang, &numbers, &tokenizer ) ) {
+    if ( provider.getTokenizer( lang, &state, &tokenizer ) ) {
       tokenizer->tokenize_node( api_attr, lang, callback );
       token_store.putAttr( this, att_tokens );
     }
@@ -4907,7 +4907,7 @@
 
 FTTokenIterator_t
 XmlNode::getTokens( TokenizerProvider const &provider,
-                    Tokenizer::Numbers &numbers, iso639_1::type lang,
+                    Tokenizer::State &state, iso639_1::type lang,
                     bool ) const
 {
   FTTokenStore &token_store = getTree()->getTokenStore();
@@ -4918,7 +4918,7 @@
     zorba::Item const api_root( getRoot() );
     XmlNodeTokenizerCallback callback( token_store );
     Tokenizer::ptr tokenizer;
-    if ( provider.getTokenizer( lang, &numbers, &tokenizer ) )
+    if ( provider.getTokenizer( lang, &state, &tokenizer ) )
       tokenizer->tokenize_node( api_root, lang, callback );
   }
 

=== modified file 'src/store/naive/node_items.h'
--- src/store/naive/node_items.h	2012-05-03 12:31:51 +0000
+++ src/store/naive/node_items.h	2012-05-16 00:57:21 +0000
@@ -555,7 +555,7 @@
 #ifndef ZORBA_NO_FULL_TEXT
   FTTokenIterator_t getTokens( 
       TokenizerProvider const&,
-      Tokenizer::Numbers&,
+      Tokenizer::State&,
       locale::iso639_1::type,
       bool = false ) const;
 #endif /* ZORBA_NO_FULL_TEXT */
@@ -1233,7 +1233,7 @@
   isPrecedingSibling(const store::Item_t&) const { return false; }
 
 #ifndef ZORBA_NO_FULL_TEXT
-  FTTokenIterator_t getTokens( TokenizerProvider const&, Tokenizer::Numbers&,
+  FTTokenIterator_t getTokens( TokenizerProvider const&, Tokenizer::State&,
                                locale::iso639_1::type,
                                bool wildcards = false ) const;
 #endif /* ZORBA_NO_FULL_TEXT */

=== modified file 'src/unit_tests/tokenizer.cpp'
--- src/unit_tests/tokenizer.cpp	2012-05-03 12:31:51 +0000
+++ src/unit_tests/tokenizer.cpp	2012-05-16 00:57:21 +0000
@@ -60,7 +60,7 @@
 
 class TestTokenizer : public Tokenizer {
 public:
-  TestTokenizer( Numbers &num ) : Tokenizer( num ) { }
+  TestTokenizer( State &state ) : Tokenizer( state ) { }
   ~TestTokenizer();
 
   // inherited
@@ -125,7 +125,7 @@
     item.getNodeName( qname );
     if ( ::binary_search( block_elements, end, qname.getLocalName().c_str(),
                           less<char const*>() ) ) {
-      ++numbers().para;
+      ++state().para;
     }
   }
 }
@@ -291,7 +291,7 @@
           // no break;
         case '!':
         case '?':
-          ++numbers().sent;
+          ++state().sent;
       }
   } // for
 
@@ -324,19 +324,19 @@
                                 Callback &callback, Item const *item ) {
   if ( !token.empty() ) {
 #if PRINT_TOKENS
-    cout <<   "t=" << setw(2) << numbers().token
-         << ", s=" << setw(2) << numbers().sent
-         << ", p=" << setw(2) << numbers().para
+    cout <<   "t=" << setw(2) << state().token
+         << ", s=" << setw(2) << state().sent
+         << ", p=" << setw(2) << state().para
          << ": \"" << token << "\"\n";
 #endif /* PRINT_TOKENS */
 
-    check_token( token.c_str(), numbers().token );
+    check_token( token.c_str(), state().token );
 
     callback.token(
       token.data(), token.size(), lang,
-      numbers().token, numbers().sent, numbers().para, item
+      state().token, state().sent, state().para, item
     );
-    ++numbers().token;
+    ++state().token;
     return true;
   }
   return false;
@@ -347,15 +347,15 @@
 class TestTokenizerProvider : public TokenizerProvider {
 public:
   // inherited
-  bool getTokenizer( iso639_1::type, Tokenizer::Numbers* = 0,
+  bool getTokenizer( iso639_1::type, Tokenizer::State* = 0,
                      Tokenizer::ptr* = 0 ) const;
 };
 
 bool TestTokenizerProvider::getTokenizer( iso639_1::type lang,
-                                          Tokenizer::Numbers *num,
+                                          Tokenizer::State *state,
                                           Tokenizer::ptr *t ) const {
-  if ( num && t )
-    t->reset( new TestTokenizer( *num ) );
+  if ( state && t )
+    t->reset( new TestTokenizer( *state ) );
   return true;
 }
 

-- 
Mailing list: https://launchpad.net/~zorba-coders
Post to     : zorba-coders@lists.launchpad.net
Unsubscribe : https://launchpad.net/~zorba-coders
More help   : https://help.launchpad.net/ListHelp

Reply via email to