Source: ucto Source-Version: 0.9.6-1 Severity: important Tags: patch Usertags: icu63
Dear Maintainer, ICU 63.1 was recently released, packaged and uploaded to experimental. Its transition is going to start soon. However, your package fails to build with this version. I attach a patch which fixes the problem. Please check if it works with the version in Sid and upload the package when it's feasible for you. Thanks, Laszlo/GCS
Description: fix FTBFS with ICU 63.1 Add icu namespace. Author: Laszlo Boszormenyi (GCS) <gcs@debian.org> Last-Update: 2018-11-09 --- --- ucto-0.9.6.orig/include/ucto/setting.h +++ ucto-0.9.6/include/ucto/setting.h @@ -38,15 +38,15 @@ namespace Tokenizer { public: Rule(): regexp(0){ }; - Rule( const UnicodeString& id, const UnicodeString& pattern); + Rule( const icu::UnicodeString& id, const icu::UnicodeString& pattern); ~Rule(); - UnicodeString id; - UnicodeString pattern; + icu::UnicodeString id; + icu::UnicodeString pattern; UnicodeRegexMatcher *regexp; - bool matchAll( const UnicodeString&, - UnicodeString&, - UnicodeString&, - std::vector<UnicodeString>& ); + bool matchAll( const icu::UnicodeString&, + icu::UnicodeString&, + icu::UnicodeString&, + std::vector<icu::UnicodeString>& ); private: Rule( const Rule& ); // inhibit copies Rule& operator=( const Rule& ); // inhibit copies @@ -56,17 +56,17 @@ namespace Tokenizer { class Quoting { friend std::ostream& operator<<( std::ostream&, const Quoting& ); struct QuotePair { - UnicodeString openQuote; - UnicodeString closeQuote; + icu::UnicodeString openQuote; + icu::UnicodeString closeQuote; }; public: - void add( const UnicodeString&, const UnicodeString& ); - UnicodeString lookupOpen( const UnicodeString &) const; - UnicodeString lookupClose( const UnicodeString & ) const; + void add( const icu::UnicodeString&, const icu::UnicodeString& ); + icu::UnicodeString lookupOpen( const icu::UnicodeString &) const; + icu::UnicodeString lookupClose( const icu::UnicodeString & ) const; bool empty() const { return _quotes.empty(); }; bool emptyStack() const { return quotestack.empty(); }; void clearStack() { quoteindexstack.clear(); quotestack.clear(); }; - int lookup( const UnicodeString&, int& ); + int lookup( const icu::UnicodeString&, int& ); void eraseAtPos( int pos ) { quotestack.erase( quotestack.begin()+pos ); quoteindexstack.erase( quoteindexstack.begin()+pos ); @@ -90,14 +90,14 @@ namespace Tokenizer { bool 
readfilters( const std::string& ); bool readquotes( const std::string& ); bool readeosmarkers( const std::string& ); - bool readabbreviations( const std::string&, UnicodeString& ); - void add_rule( const UnicodeString&, const std::vector<UnicodeString>& ); - void sortRules( std::map<UnicodeString, Rule *>&, - const std::vector<UnicodeString>& ); - UnicodeString eosmarkers; + bool readabbreviations( const std::string&, icu::UnicodeString& ); + void add_rule( const icu::UnicodeString&, const std::vector<icu::UnicodeString>& ); + void sortRules( std::map<icu::UnicodeString, Rule *>&, + const std::vector<icu::UnicodeString>& ); + icu::UnicodeString eosmarkers; std::vector<Rule *> rules; - std::map<UnicodeString, Rule *> rulesmap; - std::map<UnicodeString, int> rules_index; + std::map<icu::UnicodeString, Rule *> rulesmap; + std::map<icu::UnicodeString, int> rules_index; Quoting quotes; UnicodeFilter filter; std::string set_file; // the name of the settingsfile --- ucto-0.9.6.orig/include/ucto/tokenize.h +++ ucto-0.9.6/include/ucto/tokenize.h @@ -78,11 +78,11 @@ namespace Tokenizer { class Token { friend std::ostream& operator<< (std::ostream&, const Token& ); public: - UnicodeString type; - UnicodeString us; + icu::UnicodeString type; + icu::UnicodeString us; TokenRole role; - Token( const UnicodeString&, - const UnicodeString&, + Token( const icu::UnicodeString&, + const icu::UnicodeString&, TokenRole role = NOROLE, const std::string& = "" ); std::string lc; // ISO 639-3 language code @@ -130,12 +130,12 @@ namespace Tokenizer { // Tokenize a line (a line is NOT a sentence, but an arbitrary string // of characters, inclusive EOS markers, Newlines etc.) 
- int tokenizeLine( const UnicodeString&, + int tokenizeLine( const icu::UnicodeString&, const std::string& = "default" ); // Unicode chars int tokenizeLine( const std::string&, const std::string& = "default" ); // UTF8 chars - void passthruLine( const UnicodeString&, bool& ); + void passthruLine( const icu::UnicodeString&, bool& ); void passthruLine( const std::string&, bool& ); //Processes tokens and initialises the sentence buffer. Returns the amount of sentences found @@ -209,8 +209,8 @@ namespace Tokenizer { void setLanguage( const std::string& l ){ default_language = l; }; // set eos marker - UnicodeString setEosMarker( const std::string& s = "<utt>") { UnicodeString t = eosmark; eosmark = folia::UTF8ToUnicode(s); return t; }; - UnicodeString getEosMarker( ) const { return eosmark; } + icu::UnicodeString setEosMarker( const std::string& s = "<utt>") { icu::UnicodeString t = eosmark; eosmark = folia::UTF8ToUnicode(s); return t; }; + icu::UnicodeString getEosMarker( ) const { return eosmark; } bool setNormSet( const std::string& ); @@ -255,14 +255,14 @@ namespace Tokenizer { private: TokenizerClass( const TokenizerClass& ); // inhibit copies TokenizerClass& operator=( const TokenizerClass& ); // inhibit copies - void add_rule( const UnicodeString&, - const std::vector<UnicodeString>& ); - void tokenizeWord( const UnicodeString&, + void add_rule( const icu::UnicodeString&, + const std::vector<icu::UnicodeString>& ); + void tokenizeWord( const icu::UnicodeString&, bool, const std::string&, - const UnicodeString& ="" ); + const icu::UnicodeString& ="" ); - bool detectEos( size_t, const UnicodeString&, const Quoting& ) const; + bool detectEos( size_t, const icu::UnicodeString&, const Quoting& ) const; void detectSentenceBounds( const int offset, const std::string& = "default" ); void detectQuotedSentenceBounds( const int offset, @@ -272,7 +272,7 @@ namespace Tokenizer { //Signal the tokeniser that a paragraph is detected void signalParagraph( bool b=true ) { 
paragraphsignal = b; }; - bool resolveQuote( int, const UnicodeString&, Quoting& ); + bool resolveQuote( int, const icu::UnicodeString&, Quoting& ); bool u_isquote( UChar32, const Quoting& ) const; std::string checkBOM( std::istream& ); @@ -289,9 +289,9 @@ namespace Tokenizer { UnicodeNormalizer normalizer; std::string inputEncoding; - UnicodeString eosmark; + icu::UnicodeString eosmark; std::vector<Token> tokens; - std::set<UnicodeString> norm_set; + std::set<icu::UnicodeString> norm_set; TiCC::LogStream *theErrLog; std::string default_language; --- ucto-0.9.6.orig/include/ucto/unicode.h +++ ucto-0.9.6/include/ucto/unicode.h @@ -42,7 +42,7 @@ namespace Tokenizer { class UnicodeNormalizer { public: UnicodeNormalizer(): mode(UNORM_NFC){}; - UnicodeString normalize( const UnicodeString& ); + icu::UnicodeString normalize( const icu::UnicodeString& ); std::string getMode( ) const; std::string setMode( const std::string& ); private: @@ -52,34 +52,34 @@ namespace Tokenizer { class UnicodeFilter { friend std::ostream& operator<<( std::ostream&, const UnicodeFilter& ); public: - UnicodeString filter( const UnicodeString& ); + icu::UnicodeString filter( const icu::UnicodeString& ); bool fill( const std::string& ); - bool add( const UnicodeString& ); + bool add( const icu::UnicodeString& ); bool add( const std::string& ); bool empty() const { return the_map.empty(); }; private: - void add( UChar uc, const UnicodeString& us ) { the_map[uc] = us; }; - std::map<UChar, UnicodeString> the_map; + void add( UChar uc, const icu::UnicodeString& us ) { the_map[uc] = us; }; + std::map<UChar, icu::UnicodeString> the_map; }; class UnicodeRegexMatcher { public: - UnicodeRegexMatcher( const UnicodeString&, const UnicodeString& name="" ); + UnicodeRegexMatcher( const icu::UnicodeString&, const icu::UnicodeString& name="" ); ~UnicodeRegexMatcher(); - bool match_all( const UnicodeString&, UnicodeString&, UnicodeString& ); - const UnicodeString get_match( unsigned int ) const; + bool 
match_all( const icu::UnicodeString&, icu::UnicodeString&, icu::UnicodeString& ); + const icu::UnicodeString get_match( unsigned int ) const; int NumOfMatches() const; - int split( const UnicodeString&, std::vector<UnicodeString>& ); - UnicodeString Pattern() const; + int split( const icu::UnicodeString&, std::vector<icu::UnicodeString>& ); + icu::UnicodeString Pattern() const; private: UnicodeRegexMatcher( const UnicodeRegexMatcher& ); // inhibit copies UnicodeRegexMatcher& operator=( const UnicodeRegexMatcher& ); // inhibit copies std::string failString; - RegexPattern *pattern; - RegexMatcher *matcher; + icu::RegexPattern *pattern; + icu::RegexMatcher *matcher; UnicodeRegexMatcher(); - std::vector<UnicodeString> results; - const UnicodeString _name; + std::vector<icu::UnicodeString> results; + const icu::UnicodeString _name; }; } // namespace --- ucto-0.9.6.orig/src/setting.cxx +++ ucto-0.9.6/src/setting.cxx @@ -54,7 +54,7 @@ namespace Tokenizer { ORDINALS, EOSMARKERS, QUOTES, CURRENCY, FILTER, RULEORDER, METARULES }; - ConfigMode getMode( const UnicodeString& line ) { + ConfigMode getMode( const icu::UnicodeString& line ) { ConfigMode mode = NONE; if (line == "[RULES]") { mode = RULES; @@ -111,7 +111,7 @@ namespace Tokenizer { public: uConfigError( const string& s, const string& f ): invalid_argument( "ucto: " + s + " (" + f + ")" ){}; - uConfigError( const UnicodeString& us, const string& f ): + uConfigError( const icu::UnicodeString& us, const string& f ): uConfigError( folia::UnicodeToUTF8(us), f ){}; }; @@ -143,14 +143,14 @@ namespace Tokenizer { } } - void Quoting::add( const UnicodeString& o, const UnicodeString& c ){ + void Quoting::add( const icu::UnicodeString& o, const icu::UnicodeString& c ){ QuotePair quote; quote.openQuote = o; quote.closeQuote = c; _quotes.push_back( quote ); } - int Quoting::lookup( const UnicodeString& open, int& stackindex ){ + int Quoting::lookup( const icu::UnicodeString& open, int& stackindex ){ if (quotestack.empty() || 
(quotestack.size() != quoteindexstack.size())) return -1; auto it = quotestack.crbegin(); size_t i = quotestack.size(); @@ -165,7 +165,7 @@ namespace Tokenizer { return -1; } - UnicodeString Quoting::lookupOpen( const UnicodeString &q ) const { + icu::UnicodeString Quoting::lookupOpen( const icu::UnicodeString &q ) const { for ( const auto& quote : _quotes ){ if ( quote.openQuote.indexOf(q) >=0 ) return quote.closeQuote; @@ -173,8 +173,8 @@ namespace Tokenizer { return ""; } - UnicodeString Quoting::lookupClose( const UnicodeString &q ) const { - UnicodeString res; + icu::UnicodeString Quoting::lookupClose( const icu::UnicodeString &q ) const { + icu::UnicodeString res; for ( const auto& quote : _quotes ){ if ( quote.closeQuote.indexOf(q) >= 0 ) return quote.openQuote; @@ -186,7 +186,7 @@ namespace Tokenizer { delete regexp; } - Rule::Rule( const UnicodeString& _id, const UnicodeString& _pattern): + Rule::Rule( const icu::UnicodeString& _id, const icu::UnicodeString& _pattern): id(_id), pattern(_pattern) { regexp = new UnicodeRegexMatcher( pattern, id ); } @@ -200,10 +200,10 @@ namespace Tokenizer { return os; } - bool Rule::matchAll( const UnicodeString& line, - UnicodeString& pre, - UnicodeString& post, - vector<UnicodeString>& matches ){ + bool Rule::matchAll( const icu::UnicodeString& line, + icu::UnicodeString& pre, + icu::UnicodeString& post, + vector<icu::UnicodeString>& matches ){ matches.clear(); pre = ""; post = ""; @@ -244,7 +244,7 @@ namespace Tokenizer { else { string rawline; while ( getline(f,rawline) ){ - UnicodeString line = folia::UTF8ToUnicode(rawline); + icu::UnicodeString line = folia::UTF8ToUnicode(rawline); line.trim(); if ((line.length() > 0) && (line[0] != '#')) { if ( tokDebug >= 5 ){ @@ -255,8 +255,8 @@ namespace Tokenizer { throw uConfigError( "invalid RULES entry: " + line, fname ); } - UnicodeString id = UnicodeString( line, 0,splitpoint); - UnicodeString pattern = UnicodeString( line, splitpoint+1); + icu::UnicodeString id = 
icu::UnicodeString( line, 0,splitpoint); + icu::UnicodeString pattern = icu::UnicodeString( line, splitpoint+1); rulesmap[id] = new Rule( id, pattern); } } @@ -282,7 +282,7 @@ namespace Tokenizer { else { string rawline; while ( getline(f,rawline) ){ - UnicodeString line = folia::UTF8ToUnicode(rawline); + icu::UnicodeString line = folia::UTF8ToUnicode(rawline); line.trim(); if ((line.length() > 0) && (line[0] != '#')) { if ( tokDebug >= 5 ){ @@ -296,8 +296,8 @@ namespace Tokenizer { + " (missing whitespace)", fname ); } - UnicodeString open = UnicodeString( line, 0,splitpoint); - UnicodeString close = UnicodeString( line, splitpoint+1); + icu::UnicodeString open = icu::UnicodeString( line, 0,splitpoint); + icu::UnicodeString close = icu::UnicodeString( line, splitpoint+1); open = open.trim().unescape(); close = close.trim().unescape(); if ( open.isEmpty() || close.isEmpty() ){ @@ -323,7 +323,7 @@ namespace Tokenizer { else { string rawline; while ( getline(f,rawline) ){ - UnicodeString line = folia::UTF8ToUnicode(rawline); + icu::UnicodeString line = folia::UTF8ToUnicode(rawline); line.trim(); if ((line.length() > 0) && (line[0] != '#')) { if ( tokDebug >= 5 ){ @@ -331,7 +331,7 @@ namespace Tokenizer { } if ( ( line.startsWith("\\u") && line.length() == 6 ) || ( line.startsWith("\\U") && line.length() == 10 ) ){ - UnicodeString uit = line.unescape(); + icu::UnicodeString uit = line.unescape(); if ( uit.isEmpty() ){ throw uConfigError( "Invalid EOSMARKERS entry: " + line, fname ); } @@ -344,7 +344,7 @@ namespace Tokenizer { } bool Setting::readabbreviations( const string& fname, - UnicodeString& abbreviations ){ + icu::UnicodeString& abbreviations ){ if ( tokDebug > 0 ){ *theErrLog << "%include " << fname << endl; } @@ -355,7 +355,7 @@ namespace Tokenizer { else { string rawline; while ( getline(f,rawline) ){ - UnicodeString line = folia::UTF8ToUnicode(rawline); + icu::UnicodeString line = folia::UTF8ToUnicode(rawline); line.trim(); if ((line.length() > 0) && 
(line[0] != '#')) { if ( tokDebug >= 5 ){ @@ -370,17 +370,17 @@ namespace Tokenizer { return true; } - void Setting::add_rule( const UnicodeString& name, - const vector<UnicodeString>& parts ){ - UnicodeString pat; + void Setting::add_rule( const icu::UnicodeString& name, + const vector<icu::UnicodeString>& parts ){ + icu::UnicodeString pat; for ( auto const& part : parts ){ pat += part; } rulesmap[name] = new Rule( name, pat ); } - void Setting::sortRules( map<UnicodeString, Rule *>& rulesmap, - const vector<UnicodeString>& sort ){ + void Setting::sortRules( map<icu::UnicodeString, Rule *>& rulesmap, + const vector<icu::UnicodeString>& sort ){ // LOG << "rules voor sort : " << endl; // for ( size_t i=0; i < rules.size(); ++i ){ // LOG << "rule " << i << " " << *rules[i] << endl; @@ -432,14 +432,14 @@ namespace Tokenizer { return result; } - void addOrder( vector<UnicodeString>& order, - map<UnicodeString,int>& reverse_order, + void addOrder( vector<icu::UnicodeString>& order, + map<icu::UnicodeString,int>& reverse_order, int& index, - UnicodeString &line, + icu::UnicodeString &line, const string& fn ){ try { UnicodeRegexMatcher m( "\\s+" ); - vector<UnicodeString> usv; + vector<icu::UnicodeString> usv; m.split( line, usv ); for ( const auto& us : usv ){ if ( reverse_order.find( us ) != reverse_order.end() ){ @@ -500,7 +500,7 @@ namespace Tokenizer { int dbg, LogStream* ls ) { tokDebug = dbg; theErrLog = ls; - map<ConfigMode, UnicodeString> pattern = { { ABBREVIATIONS, "" }, + map<ConfigMode, icu::UnicodeString> pattern = { { ABBREVIATIONS, "" }, { TOKENS, "" }, { PREFIXES, "" }, { SUFFIXES, "" }, @@ -508,7 +508,7 @@ namespace Tokenizer { { ATTACHEDSUFFIXES, "" }, { UNITS, "" }, { ORDINALS, "" } }; - vector<UnicodeString> rules_order; + vector<icu::UnicodeString> rules_order; vector<string> meta_rules; string conffile = get_filename( settings_name ); @@ -572,7 +572,7 @@ namespace Tokenizer { continue; } - UnicodeString line = folia::UTF8ToUnicode(rawline); + 
icu::UnicodeString line = folia::UTF8ToUnicode(rawline); line.trim(); if ((line.length() > 0) && (line[0] != '#')) { if (line[0] == '[') { @@ -580,7 +580,7 @@ namespace Tokenizer { } else { if ( line[0] == '\\' && line.length() > 1 && line[1] == '[' ){ - line = UnicodeString( line, 1 ); + line = icu::UnicodeString( line, 1 ); } switch( mode ){ case RULES: { @@ -589,8 +589,8 @@ namespace Tokenizer { throw uConfigError( "invalid RULES entry: " + line, set_file ); } - UnicodeString id = UnicodeString( line, 0,splitpoint); - UnicodeString pattern = UnicodeString( line, splitpoint+1); + icu::UnicodeString id = icu::UnicodeString( line, 0,splitpoint); + icu::UnicodeString pattern = icu::UnicodeString( line, splitpoint+1); rulesmap[id] = new Rule( id, pattern); } break; @@ -617,7 +617,7 @@ namespace Tokenizer { case EOSMARKERS: if ( ( line.startsWith("\\u") && line.length() == 6 ) || ( line.startsWith("\\U") && line.length() == 10 ) ){ - UnicodeString uit = line.unescape(); + icu::UnicodeString uit = line.unescape(); if ( uit.isEmpty() ){ throw uConfigError( "Invalid EOSMARKERS entry: " + line, set_file ); @@ -634,8 +634,8 @@ namespace Tokenizer { + " (missing whitespace)", set_file ); } - UnicodeString open = UnicodeString( line, 0,splitpoint); - UnicodeString close = UnicodeString( line, splitpoint+1); + icu::UnicodeString open = icu::UnicodeString( line, 0,splitpoint); + icu::UnicodeString close = icu::UnicodeString( line, splitpoint+1); open = open.trim().unescape(); close = close.trim().unescape(); if ( open.isEmpty() || close.isEmpty() ){ @@ -702,7 +702,7 @@ namespace Tokenizer { } continue; } - UnicodeString name = folia::UTF8ToUnicode( nam ); + icu::UnicodeString name = folia::UTF8ToUnicode( nam ); string rule = mr.substr( pos+1 ); if ( tokDebug > 5 ){ LOG << "SPLIT using: '" << split << "'" << endl; @@ -712,11 +712,11 @@ namespace Tokenizer { for ( auto& str : parts ){ str = TiCC::trim( str ); } - vector<UnicodeString> new_parts; - vector<UnicodeString> 
undef_parts; + vector<icu::UnicodeString> new_parts; + vector<icu::UnicodeString> undef_parts; bool skip_rule = false; for ( const auto& part : parts ){ - UnicodeString meta = folia::UTF8ToUnicode( part ); + icu::UnicodeString meta = folia::UTF8ToUnicode( part ); ConfigMode mode = getMode( "[" + meta + "]" ); switch ( mode ){ case ORDINALS: --- ucto-0.9.6.orig/src/tokenize.cxx +++ ucto-0.9.6/src/tokenize.cxx @@ -88,11 +88,11 @@ namespace Tokenizer { }; - UnicodeString convert( const string& line, + icu::UnicodeString convert( const string& line, const string& inputEncoding ){ - UnicodeString result; + icu::UnicodeString result; try { - result = UnicodeString( line.c_str(), + result = icu::UnicodeString( line.c_str(), line.length(), inputEncoding.c_str() ); } @@ -108,17 +108,17 @@ namespace Tokenizer { return result; } - const UnicodeString type_space = "SPACE"; - const UnicodeString type_currency = "CURRENCY"; - const UnicodeString type_emoticon = "EMOTICON"; - const UnicodeString type_word = "WORD"; - const UnicodeString type_symbol = "SYMBOL"; - const UnicodeString type_punctuation = "PUNCTUATION"; - const UnicodeString type_number = "NUMBER"; - const UnicodeString type_unknown = "UNKNOWN"; + const icu::UnicodeString type_space = "SPACE"; + const icu::UnicodeString type_currency = "CURRENCY"; + const icu::UnicodeString type_emoticon = "EMOTICON"; + const icu::UnicodeString type_word = "WORD"; + const icu::UnicodeString type_symbol = "SYMBOL"; + const icu::UnicodeString type_punctuation = "PUNCTUATION"; + const icu::UnicodeString type_number = "NUMBER"; + const icu::UnicodeString type_unknown = "UNKNOWN"; - Token::Token( const UnicodeString& _type, - const UnicodeString& _s, + Token::Token( const icu::UnicodeString& _type, + const icu::UnicodeString& _s, TokenRole _role, const string& _lc ): type(_type), us(_s), role(_role), lc(_lc) {} @@ -226,7 +226,7 @@ namespace Tokenizer { << "'" << endl; } stripCR( line ); - UnicodeString input_line; + icu::UnicodeString 
input_line; if ( line.size() > 0 && line[0] == 0 ){ // when processing UTF16LE, '0' bytes show up at pos 0 // we discard them, not for UTF16BE! @@ -273,7 +273,7 @@ namespace Tokenizer { LOG << "use textCat to guess language from: " << input_line << endl; } - UnicodeString temp = input_line; + icu::UnicodeString temp = input_line; temp.toLower(); string lan = tc->get_language( folia::UnicodeToUTF8(temp) ); if ( settings.find( lan ) != settings.end() ){ @@ -531,7 +531,7 @@ namespace Tokenizer { if ( root->hastext( outputclass ) ){ return; } - UnicodeString utxt = root->text( outputclass, false, false ); + icu::UnicodeString utxt = root->text( outputclass, false, false ); // cerr << "untok: '" << utxt << "'" << endl; // UnicodeString txt = root->text( outputclass, true ); // cerr << " tok: '" << txt << "'" << endl; @@ -664,7 +664,7 @@ namespace Tokenizer { if ( tokDebug > 0 ){ cerr << "tokenize sentence element: " << element->id() << endl; } - UnicodeString line = element->stricttext( inputclass ); + icu::UnicodeString line = element->stricttext( inputclass ); if ( line.isEmpty() ){ // so no usefull text in this element. 
skip it return; @@ -837,7 +837,7 @@ namespace Tokenizer { args["space"]= "no"; } folia::FoliaElement *w = new folia::Word( args, root->doc() ); - UnicodeString out = token.us; + icu::UnicodeString out = token.us; if (lowercase) { out.toLower(); } @@ -898,7 +898,7 @@ namespace Tokenizer { OUT << endl << endl; } } - UnicodeString s = token.us; + icu::UnicodeString s = token.us; if (lowercase) { s = s.toLower(); } @@ -1118,12 +1118,12 @@ namespace Tokenizer { quote = true; } else { - UnicodeString opening = quotes.lookupOpen( c ); + icu::UnicodeString opening = quotes.lookupOpen( c ); if (!opening.isEmpty()) { quote = true; } else { - UnicodeString closing = quotes.lookupClose( c ); + icu::UnicodeString closing = quotes.lookupClose( c ); if (!closing.isEmpty()) { quote = true; } @@ -1151,7 +1151,7 @@ namespace Tokenizer { } bool TokenizerClass::resolveQuote( int endindex, - const UnicodeString& open, + const icu::UnicodeString& open, Quoting& quotes ) { //resolve a quote int stackindex = -1; @@ -1250,7 +1250,7 @@ namespace Tokenizer { } bool TokenizerClass::detectEos( size_t i, - const UnicodeString& eosmarkers, + const icu::UnicodeString& eosmarkers, const Quoting& quotes ) const { bool is_eos = false; UChar32 c = tokens[i].us.char32At(0); @@ -1288,7 +1288,7 @@ namespace Tokenizer { Quoting& quotes ) { UChar32 c = tokens[i].us.char32At(0); //Detect Quotation marks - if ((c == '"') || ( UnicodeString(c) == """) ) { + if ((c == '"') || ( icu::UnicodeString(c) == """) ) { if (tokDebug > 1 ){ LOG << "[detectQuoteBounds] Standard double-quote (ambiguous) found @i="<< i << endl; } @@ -1311,7 +1311,7 @@ namespace Tokenizer { } } else { - UnicodeString close = quotes.lookupOpen( c ); + icu::UnicodeString close = quotes.lookupOpen( c ); if ( !close.isEmpty() ){ // we have a opening quote if ( tokDebug > 1 ) { LOG << "[detectQuoteBounds] Opening quote found @i="<< i << ", pushing to stack for resolution later..." 
<< endl; @@ -1319,7 +1319,7 @@ namespace Tokenizer { quotes.push( i, c ); // remember it } else { - UnicodeString open = quotes.lookupClose( c ); + icu::UnicodeString open = quotes.lookupClose( c ); if ( !open.isEmpty() ) { // we have a closeing quote if (tokDebug > 1 ) { LOG << "[detectQuoteBounds] Closing quote found @i="<< i << ", attempting to resolve..." << endl; @@ -1484,17 +1484,17 @@ namespace Tokenizer { void TokenizerClass::passthruLine( const string& s, bool& bos ) { // string wrapper - UnicodeString us = convert( s, inputEncoding );; + icu::UnicodeString us = convert( s, inputEncoding );; passthruLine( us, bos ); } - void TokenizerClass::passthruLine( const UnicodeString& input, bool& bos ) { + void TokenizerClass::passthruLine( const icu::UnicodeString& input, bool& bos ) { if (tokDebug) { LOG << "[passthruLine] input: line=[" << input << "]" << endl; } bool alpha = false, num = false, punct = false; - UnicodeString word; - StringCharacterIterator sit(input); + icu::UnicodeString word; + icu::StringCharacterIterator sit(input); while ( sit.hasNext() ){ UChar32 c = sit.current32(); if ( u_isspace(c)) { @@ -1514,7 +1514,7 @@ namespace Tokenizer { bos = true; } else { - UnicodeString type; + icu::UnicodeString type; if (alpha && !num && !punct) { type = type_word; } @@ -1577,7 +1577,7 @@ namespace Tokenizer { tokens.back().role |= ENDOFSENTENCE; } else { - UnicodeString type; + icu::UnicodeString type; if (alpha && !num && !punct) { type = type_word; } @@ -1653,7 +1653,7 @@ namespace Tokenizer { // string wrapper int TokenizerClass::tokenizeLine( const string& s, const string& lang ){ - UnicodeString uinputstring = convert( s, inputEncoding ); + icu::UnicodeString uinputstring = convert( s, inputEncoding ); return tokenizeLine( uinputstring, lang ); } @@ -1673,7 +1673,7 @@ namespace Tokenizer { || u_charType( c ) == U_OTHER_SYMBOL; } - const UnicodeString& detect_type( UChar32 c ){ + const icu::UnicodeString& detect_type( UChar32 c ){ if ( u_isspace(c)) { 
return type_space; } @@ -1768,7 +1768,7 @@ namespace Tokenizer { } } - int TokenizerClass::tokenizeLine( const UnicodeString& originput, + int TokenizerClass::tokenizeLine( const icu::UnicodeString& originput, const string& _lang ){ string lang = _lang; if ( lang.empty() ){ @@ -1786,7 +1786,7 @@ namespace Tokenizer { LOG << "[tokenizeLine] input: line=[" << originput << "] (" << lang << ")" << endl; } - UnicodeString input = normalizer.normalize( originput ); + icu::UnicodeString input = normalizer.normalize( originput ); if ( doFilter ){ input = settings[lang]->filter.filter( input ); } @@ -1808,13 +1808,13 @@ namespace Tokenizer { bool tokenizeword = false; bool reset = false; //iterate over all characters - UnicodeString word; - StringCharacterIterator sit(input); + icu::UnicodeString word; + icu::StringCharacterIterator sit(input); long int i = 0; while ( sit.hasNext() ){ UChar32 c = sit.current32(); if ( tokDebug > 8 ){ - UnicodeString s = c; + icu::UnicodeString s = c; int8_t charT = u_charType( c ); LOG << "examine character: " << s << " type= " << toString( charT ) << endl; @@ -1855,7 +1855,7 @@ namespace Tokenizer { } int eospos = tokens.size()-1; if (expliciteosfound > 0) { - UnicodeString realword; + icu::UnicodeString realword; word.extract(0,expliciteosfound,realword); if (tokDebug >= 2) { LOG << "[tokenizeLine] Prefix before EOS: " @@ -1865,7 +1865,7 @@ namespace Tokenizer { eospos++; } if ( expliciteosfound + eosmark.length() < word.length() ){ - UnicodeString realword; + icu::UnicodeString realword; word.extract( expliciteosfound+eosmark.length(), word.length() - expliciteosfound - eosmark.length(), realword ); @@ -1941,10 +1941,10 @@ namespace Tokenizer { return numNewTokens; } - void TokenizerClass::tokenizeWord( const UnicodeString& input, + void TokenizerClass::tokenizeWord( const icu::UnicodeString& input, bool space, const string& lang, - const UnicodeString& assigned_type ) { + const icu::UnicodeString& assigned_type ) { bool recurse = 
!assigned_type.isEmpty(); int32_t inpLen = input.countChar32(); @@ -1977,7 +1977,7 @@ namespace Tokenizer { if ( inpLen == 1) { //single character, no need to process all rules, do some simpler (faster) detection UChar32 c = input.char32At(0); - UnicodeString type = detect_type( c ); + icu::UnicodeString type = detect_type( c ); if ( type == type_space ){ return; } @@ -1993,7 +1993,7 @@ namespace Tokenizer { } } else { - UnicodeString word = input; + icu::UnicodeString word = input; if ( norm_set.find( type ) != norm_set.end() ){ word = "{{" + type + "}}"; } @@ -2010,10 +2010,10 @@ namespace Tokenizer { if ( tokDebug >= 4){ LOG << "\tTESTING " << rule->id << endl; } - UnicodeString type = rule->id; + icu::UnicodeString type = rule->id; //Find first matching rule - UnicodeString pre, post; - vector<UnicodeString> matches; + icu::UnicodeString pre, post; + vector<icu::UnicodeString> matches; if ( rule->matchAll( input, pre, post, matches ) ){ a_rule_matched = true; if ( tokDebug >= 4 ){ @@ -2083,7 +2083,7 @@ namespace Tokenizer { if ( post.length() > 0 ) { internal_space = false; } - UnicodeString word = matches[m]; + icu::UnicodeString word = matches[m]; if ( norm_set.find( type ) != norm_set.end() ){ word = "{{" + type + "}}"; tokens.push_back( Token( type, word, internal_space ? 
NOROLE : NOSPACE, lang ) ); --- ucto-0.9.6.orig/src/unicode.cxx +++ ucto-0.9.6/src/unicode.cxx @@ -84,10 +84,10 @@ namespace Tokenizer { return res; } - UnicodeString UnicodeNormalizer::normalize( const UnicodeString& us ){ - UnicodeString r; + icu::UnicodeString UnicodeNormalizer::normalize( const icu::UnicodeString& us ){ + icu::UnicodeString r; UErrorCode status=U_ZERO_ERROR; - Normalizer::normalize( us, mode, 0, r, status ); + icu::Normalizer::normalize( us, mode, 0, r, status ); if (U_FAILURE(status)){ throw std::invalid_argument("Normalizer"); } @@ -101,18 +101,18 @@ namespace Tokenizer { else { auto it=q.the_map.cbegin(); while ( it != q.the_map.cend() ){ - os << folia::UnicodeToUTF8(UnicodeString(it->first)) << "\t" << it->second << endl; + os << folia::UnicodeToUTF8(icu::UnicodeString(it->first)) << "\t" << it->second << endl; ++it; } } return os; } - UnicodeString UnicodeFilter::filter( const UnicodeString& s ){ + icu::UnicodeString UnicodeFilter::filter( const icu::UnicodeString& s ){ if ( empty() ) return s; else { - UnicodeString result; + icu::UnicodeString result; for ( int i=0; i < s.length(); ++i ){ auto it=the_map.find(s[i]); if ( it != the_map.cend() ) @@ -125,16 +125,16 @@ namespace Tokenizer { } bool UnicodeFilter::add( const string& s ){ - UnicodeString line = folia::UTF8ToUnicode(s); + icu::UnicodeString line = folia::UTF8ToUnicode(s); return add( line ); } - bool UnicodeFilter::add( const UnicodeString& s ){ - UnicodeString line = s; + bool UnicodeFilter::add( const icu::UnicodeString& s ){ + icu::UnicodeString line = s; line.trim(); if ((line.length() > 0) && (line[0] != '#')) { - UnicodeString open = ""; - UnicodeString close = ""; + icu::UnicodeString open = ""; + icu::UnicodeString close = ""; int splitpoint = line.indexOf(" "); if ( splitpoint == -1 ) splitpoint = line.indexOf("\t"); @@ -142,8 +142,8 @@ namespace Tokenizer { open = line; } else { - open = UnicodeString( line, 0,splitpoint); - close = UnicodeString( line, splitpoint+1); 
+ open = icu::UnicodeString( line, 0,splitpoint); + close = icu::UnicodeString( line, splitpoint+1); } open = open.trim().unescape(); close = close.trim().unescape(); @@ -175,29 +175,29 @@ namespace Tokenizer { class uConfigError: public std::invalid_argument { public: uConfigError( const string& s ): invalid_argument( "ucto: config file:" + s ){}; - uConfigError( const UnicodeString& us ): invalid_argument( "ucto: config file:" + folia::UnicodeToUTF8(us) ){}; + uConfigError( const icu::UnicodeString& us ): invalid_argument( "ucto: config file:" + folia::UnicodeToUTF8(us) ){}; }; - UnicodeString UnicodeRegexMatcher::Pattern() const{ + icu::UnicodeString UnicodeRegexMatcher::Pattern() const{ return pattern->pattern(); } - UnicodeRegexMatcher::UnicodeRegexMatcher( const UnicodeString& pat, - const UnicodeString& name ): + UnicodeRegexMatcher::UnicodeRegexMatcher( const icu::UnicodeString& pat, + const icu::UnicodeString& name ): _name(name) { failString.clear(); matcher = NULL; UErrorCode u_stat = U_ZERO_ERROR; UParseError errorInfo; - pattern = RegexPattern::compile( pat, 0, errorInfo, u_stat ); + pattern = icu::RegexPattern::compile( pat, 0, errorInfo, u_stat ); if ( U_FAILURE(u_stat) ){ string spat = folia::UnicodeToUTF8(pat); failString = folia::UnicodeToUTF8(_name); if ( errorInfo.offset >0 ){ failString += " Invalid regular expression at position " + TiCC::toString( errorInfo.offset ) + "\n"; - UnicodeString pat1 = UnicodeString( pat, 0, errorInfo.offset -1 ); + icu::UnicodeString pat1 = icu::UnicodeString( pat, 0, errorInfo.offset -1 ); failString += folia::UnicodeToUTF8(pat1) + " <== HERE\n"; } else { @@ -222,9 +222,9 @@ namespace Tokenizer { //#define MATCH_DEBUG 1 - bool UnicodeRegexMatcher::match_all( const UnicodeString& line, - UnicodeString& pre, - UnicodeString& post ){ + bool UnicodeRegexMatcher::match_all( const icu::UnicodeString& line, + icu::UnicodeString& pre, + icu::UnicodeString& post ){ UErrorCode u_stat = U_ZERO_ERROR; pre = ""; post = ""; @@ 
-243,21 +243,21 @@ namespace Tokenizer { #endif if ( matcher->groupCount() == 0 ){ // case 1: a rule without capture groups matches - UnicodeString us = matcher->group(0,u_stat) ; + icu::UnicodeString us = matcher->group(0,u_stat) ; #ifdef MATCH_DEBUG cerr << "case 1, result = " << us << endl; #endif results.push_back( us ); int start = matcher->start( 0, u_stat ); if ( start > 0 ){ - pre = UnicodeString( line, 0, start ); + pre = icu::UnicodeString( line, 0, start ); #ifdef MATCH_DEBUG cerr << "found pre " << folia::UnicodeToUTF8(pre) << endl; #endif } int end = matcher->end( 0, u_stat ); if ( end < line.length() ){ - post = UnicodeString( line, end ); + post = icu::UnicodeString( line, end ); #ifdef MATCH_DEBUG cerr << "found post " << folia::UnicodeToUTF8(post) << endl; #endif @@ -268,20 +268,20 @@ namespace Tokenizer { // case 2: a rule with one capture group matches int start = matcher->start( 1, u_stat ); if ( start >= 0 ){ - UnicodeString us = matcher->group(1,u_stat) ; + icu::UnicodeString us = matcher->group(1,u_stat) ; #ifdef MATCH_DEBUG cerr << "case 2a , result = " << us << endl; #endif results.push_back( us ); if ( start > 0 ){ - pre = UnicodeString( line, 0, start ); + pre = icu::UnicodeString( line, 0, start ); #ifdef MATCH_DEBUG cerr << "found pre " << pre << endl; #endif } int end = matcher->end( 1, u_stat ); if ( end < line.length() ){ - post = UnicodeString( line, end ); + post = icu::UnicodeString( line, end ); #ifdef MATCH_DEBUG cerr << "found post " << post << endl; #endif @@ -289,21 +289,21 @@ namespace Tokenizer { } else { // group 1 is empty, return group 0 - UnicodeString us = matcher->group(0,u_stat) ; + icu::UnicodeString us = matcher->group(0,u_stat) ; #ifdef MATCH_DEBUG cerr << "case 2b , result = " << us << endl; #endif results.push_back( us ); start = matcher->start( 0, u_stat ); if ( start > 0 ){ - pre = UnicodeString( line, 0, start ); + pre = icu::UnicodeString( line, 0, start ); #ifdef MATCH_DEBUG cerr << "found pre " << pre << 
endl; #endif } int end = matcher->end( 0, u_stat ); if ( end < line.length() ){ - post = UnicodeString( line, end ); + post = icu::UnicodeString( line, end ); #ifdef MATCH_DEBUG cerr << "found post " << post << endl; #endif @@ -332,7 +332,7 @@ namespace Tokenizer { else break; if ( start > end ){ - pre = UnicodeString( line, end, start ); + pre = icu::UnicodeString( line, end, start ); #ifdef MATCH_DEBUG cerr << "found pre " << folia::UnicodeToUTF8(pre) << endl; #endif @@ -342,7 +342,7 @@ namespace Tokenizer { cerr << "end = " << end << endl; #endif if (!U_FAILURE(u_stat)){ - results.push_back( UnicodeString( line, start, end - start ) ); + results.push_back( icu::UnicodeString( line, start, end - start ) ); #ifdef MATCH_DEBUG cerr << "added result " << folia::UnicodeToUTF8( results.back() ) << endl; #endif @@ -351,7 +351,7 @@ namespace Tokenizer { break; } if ( end < line.length() ){ - post = UnicodeString( line, end ); + post = icu::UnicodeString( line, end ); #ifdef MATCH_DEBUG cerr << "found post " << folia::UnicodeToUTF8(post) << endl; #endif @@ -364,7 +364,7 @@ namespace Tokenizer { return false; } - const UnicodeString UnicodeRegexMatcher::get_match( unsigned int n ) const{ + const icu::UnicodeString UnicodeRegexMatcher::get_match( unsigned int n ) const{ if ( n < results.size() ) return results[n]; else @@ -378,11 +378,11 @@ namespace Tokenizer { return 0; } - int UnicodeRegexMatcher::split( const UnicodeString& us, - vector<UnicodeString>& result ){ + int UnicodeRegexMatcher::split( const icu::UnicodeString& us, + vector<icu::UnicodeString>& result ){ result.clear(); const int maxWords = 256; - UnicodeString words[maxWords]; + icu::UnicodeString words[maxWords]; UErrorCode status = U_ZERO_ERROR; int numWords = matcher->split( us, words, maxWords, status ); for ( int i = 0; i < numWords; ++i )