Source: ucto
Source-Version: 0.9.6-1
Severity: important
Tags: patch
Usertags: icu63

Dear Maintainer,

ICU 63.1 was recently released, packaged and uploaded to experimental.
Its transition is going to start soon. However, your package fails to
build with this version. I attach a patch which fixes the problem.
Please check that it works with the version in Sid and upload the
package when it's feasible for you.

Thanks,
Laszlo/GCS
Description: fix FTBFS with ICU 63.1
 Add icu namespace.
Author: Laszlo Boszormenyi (GCS) <gcs@debian.org>
Last-Update: 2018-11-09

---

--- ucto-0.9.6.orig/include/ucto/setting.h
+++ ucto-0.9.6/include/ucto/setting.h
@@ -38,15 +38,15 @@ namespace Tokenizer {
   public:
   Rule(): regexp(0){
     };
-    Rule( const UnicodeString& id, const UnicodeString& pattern);
+    Rule( const icu::UnicodeString& id, const icu::UnicodeString& pattern);
     ~Rule();
-    UnicodeString id;
-    UnicodeString pattern;
+    icu::UnicodeString id;
+    icu::UnicodeString pattern;
     UnicodeRegexMatcher *regexp;
-    bool matchAll( const UnicodeString&,
-		   UnicodeString&,
-		   UnicodeString&,
-		   std::vector<UnicodeString>& );
+    bool matchAll( const icu::UnicodeString&,
+		   icu::UnicodeString&,
+		   icu::UnicodeString&,
+		   std::vector<icu::UnicodeString>& );
   private:
     Rule( const Rule& ); // inhibit copies
     Rule& operator=( const Rule& ); // inhibit copies
@@ -56,17 +56,17 @@ namespace Tokenizer {
   class Quoting {
     friend std::ostream& operator<<( std::ostream&, const Quoting& );
     struct QuotePair {
-      UnicodeString openQuote;
-      UnicodeString closeQuote;
+      icu::UnicodeString openQuote;
+      icu::UnicodeString closeQuote;
     };
   public:
-    void add( const UnicodeString&, const UnicodeString& );
-    UnicodeString lookupOpen( const UnicodeString &) const;
-    UnicodeString lookupClose( const UnicodeString & ) const;
+    void add( const icu::UnicodeString&, const icu::UnicodeString& );
+    icu::UnicodeString lookupOpen( const icu::UnicodeString &) const;
+    icu::UnicodeString lookupClose( const icu::UnicodeString & ) const;
     bool empty() const { return _quotes.empty(); };
     bool emptyStack() const { return quotestack.empty(); };
     void clearStack() { quoteindexstack.clear(); quotestack.clear(); };
-    int lookup( const UnicodeString&, int& );
+    int lookup( const icu::UnicodeString&, int& );
     void eraseAtPos( int pos ) {
       quotestack.erase( quotestack.begin()+pos );
       quoteindexstack.erase( quoteindexstack.begin()+pos );
@@ -90,14 +90,14 @@ namespace Tokenizer {
     bool readfilters( const std::string& );
     bool readquotes( const std::string& );
     bool readeosmarkers( const std::string& );
-    bool readabbreviations( const std::string&,  UnicodeString& );
-    void add_rule( const UnicodeString&, const std::vector<UnicodeString>& );
-    void sortRules( std::map<UnicodeString, Rule *>&,
-		    const std::vector<UnicodeString>& );
-    UnicodeString eosmarkers;
+    bool readabbreviations( const std::string&,  icu::UnicodeString& );
+    void add_rule( const icu::UnicodeString&, const std::vector<icu::UnicodeString>& );
+    void sortRules( std::map<icu::UnicodeString, Rule *>&,
+		    const std::vector<icu::UnicodeString>& );
+    icu::UnicodeString eosmarkers;
     std::vector<Rule *> rules;
-    std::map<UnicodeString, Rule *> rulesmap;
-    std::map<UnicodeString, int> rules_index;
+    std::map<icu::UnicodeString, Rule *> rulesmap;
+    std::map<icu::UnicodeString, int> rules_index;
     Quoting quotes;
     UnicodeFilter filter;
     std::string set_file; // the name of the settingsfile
--- ucto-0.9.6.orig/include/ucto/tokenize.h
+++ ucto-0.9.6/include/ucto/tokenize.h
@@ -78,11 +78,11 @@ namespace Tokenizer {
   class Token {
     friend std::ostream& operator<< (std::ostream&, const Token& );
   public:
-    UnicodeString type;
-    UnicodeString us;
+    icu::UnicodeString type;
+    icu::UnicodeString us;
     TokenRole role;
-    Token( const UnicodeString&,
-	   const UnicodeString&,
+    Token( const icu::UnicodeString&,
+	   const icu::UnicodeString&,
 	   TokenRole role = NOROLE,
 	   const std::string& = "" );
     std::string lc;                // ISO 639-3 language code
@@ -130,12 +130,12 @@ namespace Tokenizer {
 
     // Tokenize a line (a line is NOT a sentence, but an arbitrary string
     //                  of characters, inclusive EOS markers, Newlines etc.)
-    int tokenizeLine( const UnicodeString&,
+    int tokenizeLine( const icu::UnicodeString&,
 		      const std::string& = "default" ); // Unicode chars
     int tokenizeLine( const std::string&,
 		      const std::string& = "default" ); // UTF8 chars
 
-    void passthruLine( const UnicodeString&, bool& );
+    void passthruLine( const icu::UnicodeString&, bool& );
     void passthruLine( const std::string&, bool& );
 
     //Processes tokens and initialises the sentence buffer. Returns the amount of sentences found
@@ -209,8 +209,8 @@ namespace Tokenizer {
     void setLanguage( const std::string& l ){ default_language = l; };
 
     // set eos marker
-    UnicodeString setEosMarker( const std::string& s = "<utt>") { UnicodeString t = eosmark; eosmark =  folia::UTF8ToUnicode(s); return t; };
-    UnicodeString getEosMarker( ) const { return eosmark; }
+    icu::UnicodeString setEosMarker( const std::string& s = "<utt>") { icu::UnicodeString t = eosmark; eosmark =  folia::UTF8ToUnicode(s); return t; };
+    icu::UnicodeString getEosMarker( ) const { return eosmark; }
 
     bool setNormSet( const std::string& );
 
@@ -255,14 +255,14 @@ namespace Tokenizer {
   private:
     TokenizerClass( const TokenizerClass& ); // inhibit copies
     TokenizerClass& operator=( const TokenizerClass& ); // inhibit copies
-    void add_rule( const UnicodeString&,
-		   const std::vector<UnicodeString>& );
-    void tokenizeWord( const UnicodeString&,
+    void add_rule( const icu::UnicodeString&,
+		   const std::vector<icu::UnicodeString>& );
+    void tokenizeWord( const icu::UnicodeString&,
 		       bool,
 		       const std::string&,
-		       const UnicodeString& ="" );
+		       const icu::UnicodeString& ="" );
 
-    bool detectEos( size_t, const UnicodeString&, const Quoting& ) const;
+    bool detectEos( size_t, const icu::UnicodeString&, const Quoting& ) const;
     void detectSentenceBounds( const int offset,
 			       const std::string& = "default" );
     void detectQuotedSentenceBounds( const int offset,
@@ -272,7 +272,7 @@ namespace Tokenizer {
     //Signal the tokeniser that a paragraph is detected
     void signalParagraph( bool b=true ) { paragraphsignal = b; };
 
-    bool resolveQuote( int, const UnicodeString&, Quoting& );
+    bool resolveQuote( int, const icu::UnicodeString&, Quoting& );
     bool u_isquote( UChar32,
 		    const Quoting& ) const;
     std::string checkBOM( std::istream& );
@@ -289,9 +289,9 @@ namespace Tokenizer {
     UnicodeNormalizer normalizer;
     std::string inputEncoding;
 
-    UnicodeString eosmark;
+    icu::UnicodeString eosmark;
     std::vector<Token> tokens;
-    std::set<UnicodeString> norm_set;
+    std::set<icu::UnicodeString> norm_set;
     TiCC::LogStream *theErrLog;
 
     std::string default_language;
--- ucto-0.9.6.orig/include/ucto/unicode.h
+++ ucto-0.9.6/include/ucto/unicode.h
@@ -42,7 +42,7 @@ namespace Tokenizer {
   class UnicodeNormalizer {
   public:
   UnicodeNormalizer(): mode(UNORM_NFC){};
-    UnicodeString normalize( const UnicodeString& );
+    icu::UnicodeString normalize( const icu::UnicodeString& );
     std::string getMode( ) const;
     std::string setMode( const std::string& );
   private:
@@ -52,34 +52,34 @@ namespace Tokenizer {
   class UnicodeFilter {
     friend std::ostream& operator<<( std::ostream&, const UnicodeFilter& );
   public:
-    UnicodeString filter( const UnicodeString& );
+    icu::UnicodeString filter( const icu::UnicodeString& );
     bool fill( const std::string& );
-    bool add( const UnicodeString& );
+    bool add( const icu::UnicodeString& );
     bool add( const std::string& );
     bool empty() const { return the_map.empty(); };
   private:
-    void add( UChar uc, const UnicodeString& us ) { the_map[uc] = us; };
-    std::map<UChar, UnicodeString> the_map;
+    void add( UChar uc, const icu::UnicodeString& us ) { the_map[uc] = us; };
+    std::map<UChar, icu::UnicodeString> the_map;
   };
 
   class UnicodeRegexMatcher {
   public:
-    UnicodeRegexMatcher( const UnicodeString&, const UnicodeString& name="" );
+    UnicodeRegexMatcher( const icu::UnicodeString&, const icu::UnicodeString& name="" );
     ~UnicodeRegexMatcher();
-    bool match_all( const UnicodeString&, UnicodeString&, UnicodeString&  );
-    const UnicodeString get_match( unsigned int ) const;
+    bool match_all( const icu::UnicodeString&, icu::UnicodeString&, icu::UnicodeString&  );
+    const icu::UnicodeString get_match( unsigned int ) const;
     int NumOfMatches() const;
-    int split( const UnicodeString&, std::vector<UnicodeString>& );
-    UnicodeString Pattern() const;
+    int split( const icu::UnicodeString&, std::vector<icu::UnicodeString>& );
+    icu::UnicodeString Pattern() const;
   private:
     UnicodeRegexMatcher( const UnicodeRegexMatcher& );  // inhibit copies
     UnicodeRegexMatcher& operator=( const UnicodeRegexMatcher& ); // inhibit copies
     std::string failString;
-    RegexPattern *pattern;
-    RegexMatcher *matcher;
+    icu::RegexPattern *pattern;
+    icu::RegexMatcher *matcher;
     UnicodeRegexMatcher();
-    std::vector<UnicodeString> results;
-    const UnicodeString _name;
+    std::vector<icu::UnicodeString> results;
+    const icu::UnicodeString _name;
   };
 
 } // namespace
--- ucto-0.9.6.orig/src/setting.cxx
+++ ucto-0.9.6/src/setting.cxx
@@ -54,7 +54,7 @@ namespace Tokenizer {
 		    ORDINALS, EOSMARKERS, QUOTES, CURRENCY,
 		    FILTER, RULEORDER, METARULES };
 
-  ConfigMode getMode( const UnicodeString& line ) {
+  ConfigMode getMode( const icu::UnicodeString& line ) {
     ConfigMode mode = NONE;
     if (line == "[RULES]") {
       mode = RULES;
@@ -111,7 +111,7 @@ namespace Tokenizer {
   public:
     uConfigError( const string& s, const string& f ):
       invalid_argument( "ucto: " + s + " (" + f + ")"  ){};
-    uConfigError( const UnicodeString& us, const string& f ):
+    uConfigError( const icu::UnicodeString& us, const string& f ):
       uConfigError( folia::UnicodeToUTF8(us), f ){};
   };
 
@@ -143,14 +143,14 @@ namespace Tokenizer {
     }
   }
 
-  void Quoting::add( const UnicodeString& o, const UnicodeString& c ){
+  void Quoting::add( const icu::UnicodeString& o, const icu::UnicodeString& c ){
     QuotePair quote;
     quote.openQuote = o;
     quote.closeQuote = c;
     _quotes.push_back( quote );
   }
 
-  int Quoting::lookup( const UnicodeString& open, int& stackindex ){
+  int Quoting::lookup( const icu::UnicodeString& open, int& stackindex ){
     if (quotestack.empty() || (quotestack.size() != quoteindexstack.size())) return -1;
     auto it = quotestack.crbegin();
     size_t i = quotestack.size();
@@ -165,7 +165,7 @@ namespace Tokenizer {
     return -1;
   }
 
-  UnicodeString Quoting::lookupOpen( const UnicodeString &q ) const {
+  icu::UnicodeString Quoting::lookupOpen( const icu::UnicodeString &q ) const {
     for ( const auto& quote : _quotes ){
       if ( quote.openQuote.indexOf(q) >=0 )
 	return quote.closeQuote;
@@ -173,8 +173,8 @@ namespace Tokenizer {
     return "";
   }
 
-  UnicodeString Quoting::lookupClose( const UnicodeString &q ) const {
-    UnicodeString res;
+  icu::UnicodeString Quoting::lookupClose( const icu::UnicodeString &q ) const {
+    icu::UnicodeString res;
     for ( const auto& quote : _quotes ){
       if ( quote.closeQuote.indexOf(q) >= 0 )
 	return quote.openQuote;
@@ -186,7 +186,7 @@ namespace Tokenizer {
     delete regexp;
   }
 
-  Rule::Rule( const UnicodeString& _id, const UnicodeString& _pattern):
+  Rule::Rule( const icu::UnicodeString& _id, const icu::UnicodeString& _pattern):
     id(_id), pattern(_pattern) {
     regexp = new UnicodeRegexMatcher( pattern, id );
   }
@@ -200,10 +200,10 @@ namespace Tokenizer {
     return os;
   }
 
-  bool Rule::matchAll( const UnicodeString& line,
-		       UnicodeString& pre,
-		       UnicodeString& post,
-		       vector<UnicodeString>& matches ){
+  bool Rule::matchAll( const icu::UnicodeString& line,
+		       icu::UnicodeString& pre,
+		       icu::UnicodeString& post,
+		       vector<icu::UnicodeString>& matches ){
     matches.clear();
     pre = "";
     post = "";
@@ -244,7 +244,7 @@ namespace Tokenizer {
     else {
       string rawline;
       while ( getline(f,rawline) ){
-	UnicodeString line = folia::UTF8ToUnicode(rawline);
+	icu::UnicodeString line = folia::UTF8ToUnicode(rawline);
 	line.trim();
 	if ((line.length() > 0) && (line[0] != '#')) {
 	  if ( tokDebug >= 5 ){
@@ -255,8 +255,8 @@ namespace Tokenizer {
 	    throw uConfigError( "invalid RULES entry: " + line,
 				fname );
 	  }
-	  UnicodeString id = UnicodeString( line, 0,splitpoint);
-	  UnicodeString pattern = UnicodeString( line, splitpoint+1);
+	  icu::UnicodeString id = icu::UnicodeString( line, 0,splitpoint);
+	  icu::UnicodeString pattern = icu::UnicodeString( line, splitpoint+1);
 	  rulesmap[id] = new Rule( id, pattern);
 	}
       }
@@ -282,7 +282,7 @@ namespace Tokenizer {
     else {
       string rawline;
       while ( getline(f,rawline) ){
-	UnicodeString line = folia::UTF8ToUnicode(rawline);
+	icu::UnicodeString line = folia::UTF8ToUnicode(rawline);
 	line.trim();
 	if ((line.length() > 0) && (line[0] != '#')) {
 	  if ( tokDebug >= 5 ){
@@ -296,8 +296,8 @@ namespace Tokenizer {
 				+ " (missing whitespace)",
 				fname );
 	  }
-	  UnicodeString open = UnicodeString( line, 0,splitpoint);
-	  UnicodeString close = UnicodeString( line, splitpoint+1);
+	  icu::UnicodeString open = icu::UnicodeString( line, 0,splitpoint);
+	  icu::UnicodeString close = icu::UnicodeString( line, splitpoint+1);
 	  open = open.trim().unescape();
 	  close = close.trim().unescape();
 	  if ( open.isEmpty() || close.isEmpty() ){
@@ -323,7 +323,7 @@ namespace Tokenizer {
     else {
       string rawline;
       while ( getline(f,rawline) ){
-	UnicodeString line = folia::UTF8ToUnicode(rawline);
+	icu::UnicodeString line = folia::UTF8ToUnicode(rawline);
 	line.trim();
 	if ((line.length() > 0) && (line[0] != '#')) {
 	  if ( tokDebug >= 5 ){
@@ -331,7 +331,7 @@ namespace Tokenizer {
 	  }
 	  if ( ( line.startsWith("\\u") && line.length() == 6 ) ||
 	       ( line.startsWith("\\U") && line.length() == 10 ) ){
-	    UnicodeString uit = line.unescape();
+	    icu::UnicodeString uit = line.unescape();
 	    if ( uit.isEmpty() ){
 	      throw uConfigError( "Invalid EOSMARKERS entry: " + line, fname );
 	    }
@@ -344,7 +344,7 @@ namespace Tokenizer {
   }
 
   bool Setting::readabbreviations( const string& fname,
-				   UnicodeString& abbreviations ){
+				   icu::UnicodeString& abbreviations ){
     if ( tokDebug > 0 ){
       *theErrLog << "%include " << fname << endl;
     }
@@ -355,7 +355,7 @@ namespace Tokenizer {
     else {
       string rawline;
       while ( getline(f,rawline) ){
-	UnicodeString line = folia::UTF8ToUnicode(rawline);
+	icu::UnicodeString line = folia::UTF8ToUnicode(rawline);
 	line.trim();
 	if ((line.length() > 0) && (line[0] != '#')) {
 	  if ( tokDebug >= 5 ){
@@ -370,17 +370,17 @@ namespace Tokenizer {
     return true;
   }
 
-  void Setting::add_rule( const UnicodeString& name,
-			  const vector<UnicodeString>& parts ){
-    UnicodeString pat;
+  void Setting::add_rule( const icu::UnicodeString& name,
+			  const vector<icu::UnicodeString>& parts ){
+    icu::UnicodeString pat;
     for ( auto const& part : parts ){
       pat += part;
     }
     rulesmap[name] = new Rule( name, pat );
   }
 
-  void Setting::sortRules( map<UnicodeString, Rule *>& rulesmap,
-			   const vector<UnicodeString>& sort ){
+  void Setting::sortRules( map<icu::UnicodeString, Rule *>& rulesmap,
+			   const vector<icu::UnicodeString>& sort ){
     // LOG << "rules voor sort : " << endl;
     // for ( size_t i=0; i < rules.size(); ++i ){
     //   LOG << "rule " << i << " " << *rules[i] << endl;
@@ -432,14 +432,14 @@ namespace Tokenizer {
     return result;
   }
 
-  void addOrder( vector<UnicodeString>& order,
-		 map<UnicodeString,int>& reverse_order,
+  void addOrder( vector<icu::UnicodeString>& order,
+		 map<icu::UnicodeString,int>& reverse_order,
 		 int& index,
-		 UnicodeString &line,
+		 icu::UnicodeString &line,
 		 const string& fn ){
     try {
       UnicodeRegexMatcher m( "\\s+" );
-      vector<UnicodeString> usv;
+      vector<icu::UnicodeString> usv;
       m.split( line, usv );
       for ( const auto& us : usv  ){
 	if ( reverse_order.find( us ) != reverse_order.end() ){
@@ -500,7 +500,7 @@ namespace Tokenizer {
 		      int dbg, LogStream* ls ) {
     tokDebug = dbg;
     theErrLog = ls;
-    map<ConfigMode, UnicodeString> pattern = { { ABBREVIATIONS, "" },
+    map<ConfigMode, icu::UnicodeString> pattern = { { ABBREVIATIONS, "" },
 					       { TOKENS, "" },
 					       { PREFIXES, "" },
 					       { SUFFIXES, "" },
@@ -508,7 +508,7 @@ namespace Tokenizer {
 					       { ATTACHEDSUFFIXES, "" },
 					       { UNITS, "" },
 					       { ORDINALS, "" } };
-    vector<UnicodeString> rules_order;
+    vector<icu::UnicodeString> rules_order;
     vector<string> meta_rules;
     string conffile = get_filename( settings_name );
 
@@ -572,7 +572,7 @@ namespace Tokenizer {
 	  continue;
 	}
 
-	UnicodeString line = folia::UTF8ToUnicode(rawline);
+	icu::UnicodeString line = folia::UTF8ToUnicode(rawline);
 	line.trim();
 	if ((line.length() > 0) && (line[0] != '#')) {
 	  if (line[0] == '[') {
@@ -580,7 +580,7 @@ namespace Tokenizer {
 	  }
 	  else {
 	    if ( line[0] == '\\' && line.length() > 1 && line[1] == '[' ){
-	      line = UnicodeString( line, 1 );
+	      line = icu::UnicodeString( line, 1 );
 	    }
 	    switch( mode ){
 	    case RULES: {
@@ -589,8 +589,8 @@ namespace Tokenizer {
 		throw uConfigError( "invalid RULES entry: " + line,
 				    set_file );
 	      }
-	      UnicodeString id = UnicodeString( line, 0,splitpoint);
-	      UnicodeString pattern = UnicodeString( line, splitpoint+1);
+	      icu::UnicodeString id = icu::UnicodeString( line, 0,splitpoint);
+	      icu::UnicodeString pattern = icu::UnicodeString( line, splitpoint+1);
 	      rulesmap[id] = new Rule( id, pattern);
 	    }
 	      break;
@@ -617,7 +617,7 @@ namespace Tokenizer {
 	    case EOSMARKERS:
 	      if ( ( line.startsWith("\\u") && line.length() == 6 ) ||
 		   ( line.startsWith("\\U") && line.length() == 10 ) ){
-		UnicodeString uit = line.unescape();
+		icu::UnicodeString uit = line.unescape();
 		if ( uit.isEmpty() ){
 		  throw uConfigError( "Invalid EOSMARKERS entry: " + line,
 				      set_file );
@@ -634,8 +634,8 @@ namespace Tokenizer {
 				    + " (missing whitespace)",
 				    set_file );
 	      }
-	      UnicodeString open = UnicodeString( line, 0,splitpoint);
-	      UnicodeString close = UnicodeString( line, splitpoint+1);
+	      icu::UnicodeString open = icu::UnicodeString( line, 0,splitpoint);
+	      icu::UnicodeString close = icu::UnicodeString( line, splitpoint+1);
 	      open = open.trim().unescape();
 	      close = close.trim().unescape();
 	      if ( open.isEmpty() || close.isEmpty() ){
@@ -702,7 +702,7 @@ namespace Tokenizer {
 	  }
 	  continue;
 	}
-	UnicodeString name = folia::UTF8ToUnicode( nam );
+	icu::UnicodeString name = folia::UTF8ToUnicode( nam );
 	string rule = mr.substr( pos+1 );
 	if ( tokDebug > 5 ){
 	  LOG << "SPLIT using: '" << split << "'" << endl;
@@ -712,11 +712,11 @@ namespace Tokenizer {
 	for ( auto& str : parts ){
 	  str = TiCC::trim( str );
 	}
-	vector<UnicodeString> new_parts;
-	vector<UnicodeString> undef_parts;
+	vector<icu::UnicodeString> new_parts;
+	vector<icu::UnicodeString> undef_parts;
 	bool skip_rule = false;
 	for ( const auto& part : parts ){
-	  UnicodeString meta = folia::UTF8ToUnicode( part );
+	  icu::UnicodeString meta = folia::UTF8ToUnicode( part );
 	  ConfigMode mode = getMode( "[" + meta + "]" );
 	  switch ( mode ){
 	  case ORDINALS:
--- ucto-0.9.6.orig/src/tokenize.cxx
+++ ucto-0.9.6/src/tokenize.cxx
@@ -88,11 +88,11 @@ namespace Tokenizer {
   };
 
 
-  UnicodeString convert( const string& line,
+  icu::UnicodeString convert( const string& line,
 			 const string& inputEncoding ){
-    UnicodeString result;
+    icu::UnicodeString result;
     try {
-      result = UnicodeString( line.c_str(),
+      result = icu::UnicodeString( line.c_str(),
 			      line.length(),
 			      inputEncoding.c_str() );
     }
@@ -108,17 +108,17 @@ namespace Tokenizer {
     return result;
   }
 
-  const UnicodeString type_space = "SPACE";
-  const UnicodeString type_currency = "CURRENCY";
-  const UnicodeString type_emoticon = "EMOTICON";
-  const UnicodeString type_word = "WORD";
-  const UnicodeString type_symbol = "SYMBOL";
-  const UnicodeString type_punctuation = "PUNCTUATION";
-  const UnicodeString type_number = "NUMBER";
-  const UnicodeString type_unknown = "UNKNOWN";
+  const icu::UnicodeString type_space = "SPACE";
+  const icu::UnicodeString type_currency = "CURRENCY";
+  const icu::UnicodeString type_emoticon = "EMOTICON";
+  const icu::UnicodeString type_word = "WORD";
+  const icu::UnicodeString type_symbol = "SYMBOL";
+  const icu::UnicodeString type_punctuation = "PUNCTUATION";
+  const icu::UnicodeString type_number = "NUMBER";
+  const icu::UnicodeString type_unknown = "UNKNOWN";
 
-  Token::Token( const UnicodeString& _type,
-		const UnicodeString& _s,
+  Token::Token( const icu::UnicodeString& _type,
+		const icu::UnicodeString& _s,
 		TokenRole _role, const string& _lc ):
     type(_type), us(_s), role(_role), lc(_lc) {}
 
@@ -226,7 +226,7 @@ namespace Tokenizer {
 			<< "'" << endl;
       }
       stripCR( line );
-      UnicodeString input_line;
+      icu::UnicodeString input_line;
       if ( line.size() > 0 && line[0] == 0 ){
 	// when processing UTF16LE, '0' bytes show up at pos 0
 	// we discard them, not for UTF16BE!
@@ -273,7 +273,7 @@ namespace Tokenizer {
 	      LOG << "use textCat to guess language from: "
 		  << input_line << endl;
 	    }
-	    UnicodeString temp = input_line;
+	    icu::UnicodeString temp = input_line;
 	    temp.toLower();
 	    string lan = tc->get_language( folia::UnicodeToUTF8(temp) );
 	    if ( settings.find( lan ) != settings.end() ){
@@ -531,7 +531,7 @@ namespace Tokenizer {
     if ( root->hastext( outputclass ) ){
       return;
     }
-    UnicodeString utxt = root->text( outputclass, false, false );
+    icu::UnicodeString utxt = root->text( outputclass, false, false );
     // cerr << "untok: '" << utxt << "'" << endl;
     // UnicodeString txt = root->text( outputclass, true );
     // cerr << "  tok: '" << txt << "'" << endl;
@@ -664,7 +664,7 @@ namespace Tokenizer {
     if  ( tokDebug > 0 ){
       cerr << "tokenize sentence element: " << element->id() << endl;
     }
-    UnicodeString line = element->stricttext( inputclass );
+    icu::UnicodeString line = element->stricttext( inputclass );
     if ( line.isEmpty() ){
       // so no usefull text in this element. skip it
       return;
@@ -837,7 +837,7 @@ namespace Tokenizer {
 	args["space"]= "no";
       }
       folia::FoliaElement *w = new folia::Word( args, root->doc() );
-      UnicodeString out = token.us;
+      icu::UnicodeString out = token.us;
       if (lowercase) {
 	out.toLower();
       }
@@ -898,7 +898,7 @@ namespace Tokenizer {
 	  OUT << endl << endl;
 	}
       }
-      UnicodeString s = token.us;
+      icu::UnicodeString s = token.us;
       if (lowercase) {
 	s = s.toLower();
       }
@@ -1118,12 +1118,12 @@ namespace Tokenizer {
       quote = true;
     }
     else {
-      UnicodeString opening = quotes.lookupOpen( c );
+      icu::UnicodeString opening = quotes.lookupOpen( c );
       if (!opening.isEmpty()) {
 	quote = true;
       }
       else {
-	UnicodeString closing = quotes.lookupClose( c );
+	icu::UnicodeString closing = quotes.lookupClose( c );
 	if (!closing.isEmpty()) {
 	  quote = true;
 	}
@@ -1151,7 +1151,7 @@ namespace Tokenizer {
   }
 
   bool TokenizerClass::resolveQuote( int endindex,
-				     const UnicodeString& open,
+				     const icu::UnicodeString& open,
 				     Quoting& quotes ) {
     //resolve a quote
     int stackindex = -1;
@@ -1250,7 +1250,7 @@ namespace Tokenizer {
   }
 
   bool TokenizerClass::detectEos( size_t i,
-				  const UnicodeString& eosmarkers,
+				  const icu::UnicodeString& eosmarkers,
 				  const Quoting& quotes ) const {
     bool is_eos = false;
     UChar32 c = tokens[i].us.char32At(0);
@@ -1288,7 +1288,7 @@ namespace Tokenizer {
 					  Quoting& quotes ) {
     UChar32 c = tokens[i].us.char32At(0);
     //Detect Quotation marks
-    if ((c == '"') || ( UnicodeString(c) == """) ) {
+    if ((c == '"') || ( icu::UnicodeString(c) == """) ) {
       if (tokDebug > 1 ){
 	LOG << "[detectQuoteBounds] Standard double-quote (ambiguous) found @i="<< i << endl;
       }
@@ -1311,7 +1311,7 @@ namespace Tokenizer {
       }
     }
     else {
-      UnicodeString close = quotes.lookupOpen( c );
+      icu::UnicodeString close = quotes.lookupOpen( c );
       if ( !close.isEmpty() ){ // we have a opening quote
 	if ( tokDebug > 1 ) {
 	  LOG << "[detectQuoteBounds] Opening quote found @i="<< i << ", pushing to stack for resolution later..." << endl;
@@ -1319,7 +1319,7 @@ namespace Tokenizer {
 	quotes.push( i, c ); // remember it
       }
       else {
-	UnicodeString open = quotes.lookupClose( c );
+	icu::UnicodeString open = quotes.lookupClose( c );
 	if ( !open.isEmpty() ) { // we have a closeing quote
 	  if (tokDebug > 1 ) {
 	    LOG << "[detectQuoteBounds] Closing quote found @i="<< i << ", attempting to resolve..." << endl;
@@ -1484,17 +1484,17 @@ namespace Tokenizer {
 
   void TokenizerClass::passthruLine( const string& s, bool& bos ) {
     // string wrapper
-    UnicodeString us = convert( s, inputEncoding );;
+    icu::UnicodeString us = convert( s, inputEncoding );;
     passthruLine( us, bos );
   }
 
-  void TokenizerClass::passthruLine( const UnicodeString& input, bool& bos ) {
+  void TokenizerClass::passthruLine( const icu::UnicodeString& input, bool& bos ) {
     if (tokDebug) {
       LOG << "[passthruLine] input: line=[" << input << "]" << endl;
     }
     bool alpha = false, num = false, punct = false;
-    UnicodeString word;
-    StringCharacterIterator sit(input);
+    icu::UnicodeString word;
+    icu::StringCharacterIterator sit(input);
     while ( sit.hasNext() ){
       UChar32 c = sit.current32();
       if ( u_isspace(c)) {
@@ -1514,7 +1514,7 @@ namespace Tokenizer {
 	  bos = true;
 	}
 	else {
-	  UnicodeString type;
+	  icu::UnicodeString type;
 	  if (alpha && !num && !punct) {
 	    type = type_word;
 	  }
@@ -1577,7 +1577,7 @@ namespace Tokenizer {
 	  tokens.back().role |= ENDOFSENTENCE;
       }
       else {
-	UnicodeString type;
+	icu::UnicodeString type;
 	if (alpha && !num && !punct) {
 	  type = type_word;
 	}
@@ -1653,7 +1653,7 @@ namespace Tokenizer {
   // string wrapper
   int TokenizerClass::tokenizeLine( const string& s,
 				    const string& lang ){
-    UnicodeString uinputstring = convert( s, inputEncoding );
+    icu::UnicodeString uinputstring = convert( s, inputEncoding );
     return tokenizeLine( uinputstring, lang );
   }
 
@@ -1673,7 +1673,7 @@ namespace Tokenizer {
       || u_charType( c ) == U_OTHER_SYMBOL;
   }
 
-  const UnicodeString& detect_type( UChar32 c ){
+  const icu::UnicodeString& detect_type( UChar32 c ){
     if ( u_isspace(c)) {
       return type_space;
     }
@@ -1768,7 +1768,7 @@ namespace Tokenizer {
     }
   }
 
-  int TokenizerClass::tokenizeLine( const UnicodeString& originput,
+  int TokenizerClass::tokenizeLine( const icu::UnicodeString& originput,
 				    const string& _lang ){
     string lang = _lang;
     if ( lang.empty() ){
@@ -1786,7 +1786,7 @@ namespace Tokenizer {
       LOG << "[tokenizeLine] input: line=["
 	  << originput << "] (" << lang << ")" << endl;
     }
-    UnicodeString input = normalizer.normalize( originput );
+    icu::UnicodeString input = normalizer.normalize( originput );
     if ( doFilter ){
       input = settings[lang]->filter.filter( input );
     }
@@ -1808,13 +1808,13 @@ namespace Tokenizer {
     bool tokenizeword = false;
     bool reset = false;
     //iterate over all characters
-    UnicodeString word;
-    StringCharacterIterator sit(input);
+    icu::UnicodeString word;
+    icu::StringCharacterIterator sit(input);
     long int i = 0;
     while ( sit.hasNext() ){
       UChar32 c = sit.current32();
       if ( tokDebug > 8 ){
-	UnicodeString s = c;
+	icu::UnicodeString s = c;
 	int8_t charT = u_charType( c );
 	LOG << "examine character: " << s << " type= "
 			<< toString( charT  ) << endl;
@@ -1855,7 +1855,7 @@ namespace Tokenizer {
 	    }
 	    int eospos = tokens.size()-1;
 	    if (expliciteosfound > 0) {
-	      UnicodeString realword;
+	      icu::UnicodeString realword;
 	      word.extract(0,expliciteosfound,realword);
 	      if (tokDebug >= 2) {
 		LOG << "[tokenizeLine] Prefix before EOS: "
@@ -1865,7 +1865,7 @@ namespace Tokenizer {
 	      eospos++;
 	    }
 	    if ( expliciteosfound + eosmark.length() < word.length() ){
-	      UnicodeString realword;
+	      icu::UnicodeString realword;
 	      word.extract( expliciteosfound+eosmark.length(),
 			    word.length() - expliciteosfound - eosmark.length(),
 			    realword );
@@ -1941,10 +1941,10 @@ namespace Tokenizer {
     return numNewTokens;
   }
 
-  void TokenizerClass::tokenizeWord( const UnicodeString& input,
+  void TokenizerClass::tokenizeWord( const icu::UnicodeString& input,
 				     bool space,
 				     const string& lang,
-				     const UnicodeString& assigned_type ) {
+				     const icu::UnicodeString& assigned_type ) {
     bool recurse = !assigned_type.isEmpty();
 
     int32_t inpLen = input.countChar32();
@@ -1977,7 +1977,7 @@ namespace Tokenizer {
     if ( inpLen == 1) {
       //single character, no need to process all rules, do some simpler (faster) detection
       UChar32 c = input.char32At(0);
-      UnicodeString type = detect_type( c );
+      icu::UnicodeString type = detect_type( c );
       if ( type == type_space ){
 	return;
       }
@@ -1993,7 +1993,7 @@ namespace Tokenizer {
 	}
       }
       else {
-	UnicodeString word = input;
+	icu::UnicodeString word = input;
 	if ( norm_set.find( type ) != norm_set.end() ){
 	  word = "{{" + type + "}}";
 	}
@@ -2010,10 +2010,10 @@ namespace Tokenizer {
 	if ( tokDebug >= 4){
 	  LOG << "\tTESTING " << rule->id << endl;
 	}
-	UnicodeString type = rule->id;
+	icu::UnicodeString type = rule->id;
 	//Find first matching rule
-	UnicodeString pre, post;
-	vector<UnicodeString> matches;
+	icu::UnicodeString pre, post;
+	vector<icu::UnicodeString> matches;
 	if ( rule->matchAll( input, pre, post, matches ) ){
 	  a_rule_matched = true;
 	  if ( tokDebug >= 4 ){
@@ -2083,7 +2083,7 @@ namespace Tokenizer {
 		if ( post.length() > 0 ) {
 		  internal_space = false;
 		}
-		UnicodeString word = matches[m];
+		icu::UnicodeString word = matches[m];
 		if ( norm_set.find( type ) != norm_set.end() ){
 		  word = "{{" + type + "}}";
 		  tokens.push_back( Token( type, word, internal_space ? NOROLE : NOSPACE, lang ) );
--- ucto-0.9.6.orig/src/unicode.cxx
+++ ucto-0.9.6/src/unicode.cxx
@@ -84,10 +84,10 @@ namespace Tokenizer {
     return res;
   }
 
-  UnicodeString UnicodeNormalizer::normalize( const UnicodeString& us ){
-    UnicodeString r;
+  icu::UnicodeString UnicodeNormalizer::normalize( const icu::UnicodeString& us ){
+    icu::UnicodeString r;
     UErrorCode status=U_ZERO_ERROR;
-    Normalizer::normalize( us, mode, 0, r, status );
+    icu::Normalizer::normalize( us, mode, 0, r, status );
     if (U_FAILURE(status)){
       throw std::invalid_argument("Normalizer");
     }
@@ -101,18 +101,18 @@ namespace Tokenizer {
     else {
       auto it=q.the_map.cbegin();
       while ( it != q.the_map.cend() ){
-	os << folia::UnicodeToUTF8(UnicodeString(it->first)) << "\t" << it->second << endl;
+	os << folia::UnicodeToUTF8(icu::UnicodeString(it->first)) << "\t" << it->second << endl;
 	++it;
       }
     }
     return os;
   }
 
-  UnicodeString UnicodeFilter::filter( const UnicodeString& s ){
+  icu::UnicodeString UnicodeFilter::filter( const icu::UnicodeString& s ){
     if ( empty() )
       return s;
     else {
-      UnicodeString result;
+      icu::UnicodeString result;
       for ( int i=0; i < s.length(); ++i ){
 	auto it=the_map.find(s[i]);
 	if ( it != the_map.cend() )
@@ -125,16 +125,16 @@ namespace Tokenizer {
   }
 
   bool UnicodeFilter::add( const string& s ){
-    UnicodeString line = folia::UTF8ToUnicode(s);
+    icu::UnicodeString line = folia::UTF8ToUnicode(s);
     return add( line );
   }
 
-  bool UnicodeFilter::add( const UnicodeString& s ){
-    UnicodeString line = s;
+  bool UnicodeFilter::add( const icu::UnicodeString& s ){
+    icu::UnicodeString line = s;
     line.trim();
     if ((line.length() > 0) && (line[0] != '#')) {
-      UnicodeString open = "";
-      UnicodeString close = "";
+      icu::UnicodeString open = "";
+      icu::UnicodeString close = "";
       int splitpoint = line.indexOf(" ");
       if ( splitpoint == -1 )
 	splitpoint = line.indexOf("\t");
@@ -142,8 +142,8 @@ namespace Tokenizer {
 	open = line;
       }
       else {
-	open = UnicodeString( line, 0,splitpoint);
-	close = UnicodeString( line, splitpoint+1);
+	open = icu::UnicodeString( line, 0,splitpoint);
+	close = icu::UnicodeString( line, splitpoint+1);
       }
       open = open.trim().unescape();
       close = close.trim().unescape();
@@ -175,29 +175,29 @@ namespace Tokenizer {
   class uConfigError: public std::invalid_argument {
   public:
     uConfigError( const string& s ): invalid_argument( "ucto: config file:" + s ){};
-    uConfigError( const UnicodeString& us ): invalid_argument( "ucto: config file:" + folia::UnicodeToUTF8(us) ){};
+    uConfigError( const icu::UnicodeString& us ): invalid_argument( "ucto: config file:" + folia::UnicodeToUTF8(us) ){};
   };
 
 
-  UnicodeString UnicodeRegexMatcher::Pattern() const{
+  icu::UnicodeString UnicodeRegexMatcher::Pattern() const{
     return pattern->pattern();
   }
 
-  UnicodeRegexMatcher::UnicodeRegexMatcher( const UnicodeString& pat,
-					    const UnicodeString& name ):
+  UnicodeRegexMatcher::UnicodeRegexMatcher( const icu::UnicodeString& pat,
+					    const icu::UnicodeString& name ):
     _name(name)
   {
     failString.clear();
     matcher = NULL;
     UErrorCode u_stat = U_ZERO_ERROR;
     UParseError errorInfo;
-    pattern = RegexPattern::compile( pat, 0, errorInfo, u_stat );
+    pattern = icu::RegexPattern::compile( pat, 0, errorInfo, u_stat );
     if ( U_FAILURE(u_stat) ){
       string spat = folia::UnicodeToUTF8(pat);
       failString = folia::UnicodeToUTF8(_name);
       if ( errorInfo.offset >0 ){
 	failString += " Invalid regular expression at position " + TiCC::toString( errorInfo.offset ) + "\n";
-	UnicodeString pat1 = UnicodeString( pat, 0, errorInfo.offset -1 );
+	icu::UnicodeString pat1 = icu::UnicodeString( pat, 0, errorInfo.offset -1 );
 	failString += folia::UnicodeToUTF8(pat1) + " <== HERE\n";
       }
       else {
@@ -222,9 +222,9 @@ namespace Tokenizer {
 
   //#define MATCH_DEBUG 1
 
-  bool UnicodeRegexMatcher::match_all( const UnicodeString& line,
-				       UnicodeString& pre,
-				       UnicodeString& post ){
+  bool UnicodeRegexMatcher::match_all( const icu::UnicodeString& line,
+				       icu::UnicodeString& pre,
+				       icu::UnicodeString& post ){
     UErrorCode u_stat = U_ZERO_ERROR;
     pre = "";
     post = "";
@@ -243,21 +243,21 @@ namespace Tokenizer {
 #endif
 	if ( matcher->groupCount() == 0 ){
 	  // case 1: a rule without capture groups matches
-	  UnicodeString us = matcher->group(0,u_stat) ;
+	  icu::UnicodeString us = matcher->group(0,u_stat) ;
 #ifdef MATCH_DEBUG
 	  cerr << "case 1, result = " << us << endl;
 #endif
 	  results.push_back( us );
 	  int start = matcher->start( 0, u_stat );
 	  if ( start > 0 ){
-	    pre = UnicodeString( line, 0, start );
+	    pre = icu::UnicodeString( line, 0, start );
 #ifdef MATCH_DEBUG
 	    cerr << "found pre " << folia::UnicodeToUTF8(pre) << endl;
 #endif
 	  }
 	  int end = matcher->end( 0, u_stat );
 	  if ( end < line.length() ){
-	    post = UnicodeString( line, end );
+	    post = icu::UnicodeString( line, end );
 #ifdef MATCH_DEBUG
 	    cerr << "found post " << folia::UnicodeToUTF8(post) << endl;
 #endif
@@ -268,20 +268,20 @@ namespace Tokenizer {
 	  // case 2: a rule with one capture group matches
 	  int start = matcher->start( 1, u_stat );
 	  if ( start >= 0 ){
-	    UnicodeString us = matcher->group(1,u_stat) ;
+	    icu::UnicodeString us = matcher->group(1,u_stat) ;
 #ifdef MATCH_DEBUG
 	    cerr << "case 2a , result = " << us << endl;
 #endif
 	    results.push_back( us );
 	    if ( start > 0 ){
-	      pre = UnicodeString( line, 0, start );
+	      pre = icu::UnicodeString( line, 0, start );
 #ifdef MATCH_DEBUG
 	      cerr << "found pre " << pre << endl;
 #endif
 	    }
 	    int end = matcher->end( 1, u_stat );
 	    if ( end < line.length() ){
-	      post = UnicodeString( line, end );
+	      post = icu::UnicodeString( line, end );
 #ifdef MATCH_DEBUG
 	      cerr << "found post " << post << endl;
 #endif
@@ -289,21 +289,21 @@ namespace Tokenizer {
 	  }
 	  else {
 	    // group 1 is empty, return group 0
-	    UnicodeString us = matcher->group(0,u_stat) ;
+	    icu::UnicodeString us = matcher->group(0,u_stat) ;
 #ifdef MATCH_DEBUG
 	    cerr << "case 2b , result = " << us << endl;
 #endif
 	    results.push_back( us );
 	    start = matcher->start( 0, u_stat );
 	    if ( start > 0 ){
-	      pre = UnicodeString( line, 0, start );
+	      pre = icu::UnicodeString( line, 0, start );
 #ifdef MATCH_DEBUG
 	      cerr << "found pre " << pre << endl;
 #endif
 	    }
 	    int end = matcher->end( 0, u_stat );
 	    if ( end < line.length() ){
-	      post = UnicodeString( line, end );
+	      post = icu::UnicodeString( line, end );
 #ifdef MATCH_DEBUG
 	      cerr << "found post " << post << endl;
 #endif
@@ -332,7 +332,7 @@ namespace Tokenizer {
 	    else
 	      break;
 	    if ( start > end ){
-	      pre = UnicodeString( line, end, start );
+	      pre = icu::UnicodeString( line, end, start );
 #ifdef MATCH_DEBUG
 	      cerr << "found pre " << folia::UnicodeToUTF8(pre) << endl;
 #endif
@@ -342,7 +342,7 @@ namespace Tokenizer {
 	    cerr << "end = " << end << endl;
 #endif
 	    if (!U_FAILURE(u_stat)){
-	      results.push_back( UnicodeString( line, start, end - start ) );
+	      results.push_back( icu::UnicodeString( line, start, end - start ) );
 #ifdef MATCH_DEBUG
 	      cerr << "added result " << folia::UnicodeToUTF8( results.back() ) << endl;
 #endif
@@ -351,7 +351,7 @@ namespace Tokenizer {
 	      break;
 	  }
 	  if ( end < line.length() ){
-	    post = UnicodeString( line, end );
+	    post = icu::UnicodeString( line, end );
 #ifdef MATCH_DEBUG
 	    cerr << "found post " << folia::UnicodeToUTF8(post) << endl;
 #endif
@@ -364,7 +364,7 @@ namespace Tokenizer {
     return false;
   }
 
-  const UnicodeString UnicodeRegexMatcher::get_match( unsigned int n ) const{
+  const icu::UnicodeString UnicodeRegexMatcher::get_match( unsigned int n ) const{
     if ( n < results.size() )
       return results[n];
     else
@@ -378,11 +378,11 @@ namespace Tokenizer {
       return 0;
   }
 
-  int UnicodeRegexMatcher::split( const UnicodeString& us,
-				  vector<UnicodeString>& result ){
+  int UnicodeRegexMatcher::split( const icu::UnicodeString& us,
+				  vector<icu::UnicodeString>& result ){
     result.clear();
     const int maxWords = 256;
-    UnicodeString words[maxWords];
+    icu::UnicodeString words[maxWords];
     UErrorCode status = U_ZERO_ERROR;
     int numWords = matcher->split( us, words, maxWords, status );
     for ( int i = 0; i < numWords; ++i )

Reply via email to