reassign 504030 apertium thanks Hi, The problem seems to be that MB_CUR_MAX is not defined correctly. Just adding a #define MB_CUR_MAX 1000 to input2.txt fixes the error (see attached). So, something changed in the definition of MB_CUR_MAX, but flex does not and should not define it -- and thus can't fix it.
manoj ,---- | __> /usr/bin/flex -Cfer -t >|apertium_deshtml.cc < ~/input2.txt | __> wc -l apertium_deshtml.cc | 4652 apertium_deshtml.cc | __> `----
%{
/*
 * Flex specification (with embedded C++) for an HTML format processor.
 *
 * The scanner copies markup (tags, comments, <script>/<style> bodies and
 * unrecognised character references) into a wide-character buffer that
 * printBuffer() later emits wrapped in '[' ... ']'.  All other input is
 * written to yyout, with some characters preceded by a backslash and
 * entries of S1_substitution replaced by their mapped text.
 */
#include <cstdlib>
#include <iostream>
#include <map>
#include <vector>
#include <regex.h>
#include <string>
#include <lttoolbox/lt_locale.h>
#include <lttoolbox/ltstr.h>

using namespace std;

// Pending markup, flushed by printBuffer().
wstring buffer;
// Bytes that have not yet been decoded into a complete wide character.
string symbuf = "";
// isDot: when set, printBuffer() writes ".[]" before the buffered block.
bool isDot, hasWrite_dot, hasWrite_white;
FILE *formatfile;
// Compared against "open_tag" by the whitespace rule; only ever assigned
// "" or "buffer" in this file.
string last;
int current;
// Advanced by the number of wide characters written directly to yyout.
long int offset;

// Workaround: overrides whatever <cstdlib> provides for MB_CUR_MAX with a
// fixed value.  It is used below as the byte limit handed to mbtowc(), as
// the size of a conversion array and as the threshold for giving up on an
// undecodable sequence.
// NOTE(review): redefining a standard library macro is fragile and a limit
// of 1000 delays the '?' fallback considerably - confirm this is still
// wanted once the underlying build problem is fixed.
#define MB_CUR_MAX 1000

vector<long int> offsets;
vector<wstring> tags;
vector<int> orders;

regex_t escape_chars;
regex_t names_regexp;

/**
 * Decode multibyte text and append it to a wide string.
 *
 * str is first appended to the global symbuf, and decoding runs over the
 * whole of symbuf.  A sequence that mbtowc() rejects is replaced by L'?'
 * (consuming one byte) when more than MB_CUR_MAX bytes follow its start;
 * otherwise the undecoded tail is kept in symbuf for the next call.
 *
 * @param buf destination wide string
 * @param str multibyte text to decode
 */
void bufferAppend(wstring &buf, string const &str)
{
  symbuf.append(str);

  for(size_t i = 0, limit = symbuf.size(); i < limit;)
  {
    wchar_t symbol;
    int gap = mbtowc(&symbol, symbuf.c_str() + i, MB_CUR_MAX);
    if(gap == -1)
    {
      if(i + MB_CUR_MAX < limit)
      {
        buf += L'?';
        gap = 1;
      }
      else
      {
        // Possibly an incomplete character: keep the tail for later.
        symbuf = symbuf.substr(i);
        return;
      }
    }
    else
    {
      buf += symbol;
    }

    i += gap;
  }

  symbuf = "";
  return;
}

/**
 * Compile the regular expression used by escape() into escape_chars.
 * Prints an error and exits if compilation fails.
 *
 * NOTE(review): the pattern text looks like an address-obfuscation
 * artefact rather than the intended character class - confirm against
 * the upstream source.
 */
void init_escape()
{
  if(regcomp(&escape_chars, "[EMAIL PROTECTED]/]", REG_EXTENDED))
  {
    cerr << "ERROR: Illegal regular expression for escape characters" << endl;
    exit(EXIT_FAILURE);
  }
}

/**
 * Compile the tag-name regular expression into names_regexp.
 * Prints an error and exits if compilation fails.
 */
void init_tagNames()
{
  if(regcomp(&names_regexp, "[a-zA-Z]+", REG_EXTENDED))
  {
    cerr << "ERROR: Illegal regular expression for tag-names" << endl;
    exit(EXIT_FAILURE);
  }
}

/**
 * Return a copy of str in which every backslash is doubled.
 */
string backslash(string const &str)
{
  string new_str = "";

  for(unsigned int i = 0; i < str.size(); i++)
  {
    if(str[i] == '\\')
    {
      new_str += str[i];
    }
    new_str += str[i];
  }

  return new_str;
}

/**
 * Convert a multibyte string to a wide string, inserting a backslash in
 * front of every match of escape_chars.
 *
 * Prints "Uno" / "Encoding error." and exits if the matched character
 * cannot be decoded.
 *
 * @param str multibyte text
 * @return the escaped wide string
 */
wstring escape(string const &str)
{
  regmatch_t pmatch;

  char const *mystring = str.c_str();
  int base = 0;
  wstring result = L"";

  while(!regexec(&escape_chars, mystring + base, 1, &pmatch, 0))
  {
    // Text before the match (match offsets are relative to base).
    bufferAppend(result, str.substr(base, pmatch.rm_so));
    result += L'\\';
    wchar_t micaracter;
    int pos = mbtowc(&micaracter, str.c_str() + base + pmatch.rm_so, MB_CUR_MAX);
    if(pos == -1)
    {
      wcerr << L"Uno" << endl;
      wcerr << L"Encoding error."
            << endl;
      exit(EXIT_FAILURE);
    }

    result += micaracter;
    base += pmatch.rm_eo;
  }

  bufferAppend(result, str.substr(base));
  return result;
}

/**
 * Wide-string overload: converts str to multibyte text (characters that
 * wctomb() rejects become '?') and forwards to escape(string const &).
 */
wstring escape(wstring const &str)
{
  string dest = "";

  for(size_t i = 0, limit = str.size(); i < limit; i++)
  {
    char symbol[MB_CUR_MAX+1];
    int pos = wctomb(symbol, str[i]);
    if(pos == -1)
    {
      symbol[0]='?';
      pos = 1;
    }
    symbol[pos] = 0;
    dest.append(symbol);
  }

  return escape(dest);
}

/**
 * Return the first match of names_regexp (a run of ASCII letters) found
 * in tag, or an empty string when there is none.
 */
string get_tagName(string tag){
  regmatch_t pmatch;

  char const *mystring = tag.c_str();
  string result = "";
  if(!regexec(&names_regexp, mystring, 1, &pmatch, 0))
  {
    result=tag.substr(pmatch.rm_so, pmatch.rm_eo - pmatch.rm_so);
    return result;
  }

  return "";
}

// Lookup table consulted by the "&...;" scanner rule.
map<string, wstring, Ltstr> S1_substitution;

/**
 * Fill S1_substitution.
 *
 * NOTE(review): every key below is a literal character, while the only
 * rule that consults this table matches text that starts with '&' and
 * ends with ';', so none of these keys can be found by it.  Presumably
 * the original keys were character references that were decoded somewhere
 * in transit - confirm against the upstream source.  The statements are
 * kept exactly as received, including the repeated assignments.
 */
void S1_init()
{
  S1_substitution["À"] = L"À"; S1_substitution["À"] = L"À"; S1_substitution["À"] = L"À"; S1_substitution["À"] = L"À";
  S1_substitution["Á"] = L"Á"; S1_substitution["Á"] = L"Á"; S1_substitution["Á"] = L"Á"; S1_substitution["Á"] = L"Á";
  S1_substitution["Â"] = L"Â"; S1_substitution["Â"] = L"Â"; S1_substitution["Â"] = L"Â"; S1_substitution["Â"] = L"Â";
  S1_substitution["Ã"] = L"Ã"; S1_substitution["Ã"] = L"Ã"; S1_substitution["Ã"] = L"Ã"; S1_substitution["Ã"] = L"Ã";
  S1_substitution["Ä"] = L"Ä"; S1_substitution["Ä"] = L"Ä"; S1_substitution["Ä"] = L"Ä"; S1_substitution["Ä"] = L"Ä";
  S1_substitution["Å"] = L"Å"; S1_substitution["Å"] = L"Å"; S1_substitution["Å"] = L"Å"; S1_substitution["Å"] = L"Å";
  S1_substitution["Æ"] = L"Æ"; S1_substitution["Æ"] = L"Æ"; S1_substitution["Æ"] = L"Æ"; S1_substitution["Æ"] = L"Æ";
  S1_substitution["Ç"] = L"Ç"; S1_substitution["Ç"] = L"Ç"; S1_substitution["Ç"] = L"Ç"; S1_substitution["Ç"] = L"Ç";
  S1_substitution["È"] = L"È"; S1_substitution["È"] = L"È"; S1_substitution["È"] = L"È"; S1_substitution["È"] = L"È";
  S1_substitution["É"] = L"É"; S1_substitution["É"] = L"É"; S1_substitution["É"] = L"É"; S1_substitution["É"] = L"É";
  S1_substitution["Ê"] = L"Ê"; S1_substitution["Ê"] = L"Ê"; S1_substitution["Ê"] = L"Ê"; S1_substitution["Ê"] = L"Ê";
  S1_substitution["Ë"] = L"Ë"; S1_substitution["Ë"] = L"Ë"; S1_substitution["Ë"] = L"Ë"; S1_substitution["Ë"] = L"Ë";
  S1_substitution["Ì"] = L"Ì"; S1_substitution["Ì"] = L"Ì"; S1_substitution["Ì"] = L"Ì"; S1_substitution["Ì"] = L"Ì";
  S1_substitution["Í"] = L"Í"; S1_substitution["Í"] = L"Í"; S1_substitution["Í"] = L"Í"; S1_substitution["Í"] = L"Í";
  S1_substitution["Î"] = L"Î"; S1_substitution["Î"] = L"Î"; S1_substitution["Î"] = L"Î"; S1_substitution["Î"] = L"Î";
  S1_substitution["Ï"] = L"Ï"; S1_substitution["Ï"] = L"Ï"; S1_substitution["Ï"] = L"Ï"; S1_substitution["Ï"] = L"Ï";
  S1_substitution["Ð"] = L"Ð"; S1_substitution["Ð"] = L"Ð"; S1_substitution["Ð"] = L"Ð"; S1_substitution["Ð"] = L"Ð";
  S1_substitution["Ñ"] = L"Ñ"; S1_substitution["Ñ"] = L"Ñ"; S1_substitution["Ñ"] = L"Ñ"; S1_substitution["Ñ"] = L"Ñ";
  S1_substitution["Ò"] = L"Ò"; S1_substitution["Ò"] = L"Ò"; S1_substitution["Ò"] = L"Ò"; S1_substitution["Ò"] = L"Ò";
  S1_substitution["Ó"] = L"Ó"; S1_substitution["Ó"] = L"Ó"; S1_substitution["Ó"] = L"Ó"; S1_substitution["Ó"] = L"Ó";
  S1_substitution["Ô"] = L"Ô"; S1_substitution["Ô"] = L"Ô"; S1_substitution["Ô"] = L"Ô"; S1_substitution["Ô"] = L"Ô";
  S1_substitution["Õ"] = L"Õ"; S1_substitution["Õ"] = L"Õ"; S1_substitution["Õ"] = L"Õ"; S1_substitution["Õ"] = L"Õ";
  S1_substitution["Ö"] = L"Ö"; S1_substitution["Ö"] = L"Ö"; S1_substitution["Ö"] = L"Ö"; S1_substitution["Ö"] = L"Ö";
  S1_substitution["Ø"] = L"Ø"; S1_substitution["Ø"] = L"Ø"; S1_substitution["Ø"] = L"Ø"; S1_substitution["Ø"] = L"Ø";
  S1_substitution["Ù"] = L"Ù"; S1_substitution["Ù"] = L"Ù"; S1_substitution["Ù"] = L"Ù"; S1_substitution["Ù"] = L"Ù";
  S1_substitution["Ú"] = L"Ú"; S1_substitution["Ú"] = L"Ú"; S1_substitution["Ú"] = L"Ú"; S1_substitution["Ú"] = L"Ú";
  S1_substitution["Û"] = L"Û"; S1_substitution["Û"] = L"Û"; S1_substitution["Û"] = L"Û"; S1_substitution["Û"] = L"Û";
  S1_substitution["Ü"] = L"Ü"; S1_substitution["Ü"] = L"Ü"; S1_substitution["Ü"] = L"Ü"; S1_substitution["Ü"] = L"Ü";
  S1_substitution["Ý"] = L"Ý"; S1_substitution["Ý"] = L"Ý"; S1_substitution["Ý"] = L"Ý"; S1_substitution["Ý"] = L"Ý";
  S1_substitution["Þ"] = L"Þ"; S1_substitution["Þ"] = L"Þ"; S1_substitution["Þ"] = L"Þ"; S1_substitution["Þ"] = L"Þ";
  S1_substitution["ß"] = L"ß"; S1_substitution["ß"] = L"ß"; S1_substitution["ß"] = L"ß"; S1_substitution["ß"] = L"ß";
  S1_substitution["à"] = L"à"; S1_substitution["à"] = L"à"; S1_substitution["à"] = L"à"; S1_substitution["à"] = L"à";
  S1_substitution["á"] = L"á"; S1_substitution["á"] = L"á"; S1_substitution["á"] = L"á"; S1_substitution["á"] = L"á";
  S1_substitution["â"] = L"â"; S1_substitution["â"] = L"â"; S1_substitution["â"] = L"â"; S1_substitution["â"] = L"â";
  S1_substitution["ã"] = L"ã"; S1_substitution["ã"] = L"ã"; S1_substitution["ã"] = L"ã"; S1_substitution["ã"] = L"ã";
  S1_substitution["ä"] = L"ä"; S1_substitution["ä"] = L"ä"; S1_substitution["ä"] = L"ä"; S1_substitution["ä"] = L"ä";
  S1_substitution["å"] = L"å"; S1_substitution["å"] = L"å"; S1_substitution["å"] = L"å"; S1_substitution["å"] = L"å";
  S1_substitution["æ"] = L"æ"; S1_substitution["æ"] = L"æ"; S1_substitution["æ"] = L"æ"; S1_substitution["æ"] = L"æ";
  S1_substitution["ç"] = L"ç"; S1_substitution["ç"] = L"ç"; S1_substitution["ç"] = L"ç"; S1_substitution["ç"] = L"ç";
  S1_substitution["è"] = L"è"; S1_substitution["è"] = L"è"; S1_substitution["è"] = L"è"; S1_substitution["è"] = L"è";
  S1_substitution["é"] = L"é"; S1_substitution["é"] = L"é"; S1_substitution["é"] = L"é"; S1_substitution["é"] = L"é";
  S1_substitution["ê"] = L"ê"; S1_substitution["ê"] = L"ê"; S1_substitution["ê"] = L"ê"; S1_substitution["ê"] = L"ê";
  S1_substitution["ë"] = L"ë"; S1_substitution["ë"] = L"ë"; S1_substitution["ë"] = L"ë"; S1_substitution["ë"] = L"ë";
  S1_substitution["ì"] = L"ì"; S1_substitution["ì"] = L"ì"; S1_substitution["ì"] = L"ì"; S1_substitution["ì"] = L"ì";
  S1_substitution["í"] = L"í"; S1_substitution["í"] = L"í"; S1_substitution["í"] = L"í"; S1_substitution["í"] = L"í";
  S1_substitution["î"] = L"î"; S1_substitution["î"] = L"î"; S1_substitution["î"] = L"î"; S1_substitution["î"] = L"î";
  S1_substitution["ï"] = L"ï"; S1_substitution["ï"] = L"ï"; S1_substitution["ï"] = L"ï"; S1_substitution["ï"] = L"ï";
  S1_substitution["ð"] = L"ð"; S1_substitution["ð"] = L"ð"; S1_substitution["ð"] = L"ð"; S1_substitution["ð"] = L"ð";
  S1_substitution["ñ"] = L"ñ"; S1_substitution["ñ"] = L"ñ"; S1_substitution["ñ"] = L"ñ"; S1_substitution["ñ"] = L"ñ";
  S1_substitution["ò"] = L"ò"; S1_substitution["ò"] = L"ò"; S1_substitution["ò"] = L"ò"; S1_substitution["ò"] = L"ò";
  S1_substitution["ó"] = L"ó"; S1_substitution["ó"] = L"ó"; S1_substitution["ó"] = L"ó"; S1_substitution["ó"] = L"ó";
  S1_substitution["ô"] = L"ô"; S1_substitution["ô"] = L"ô"; S1_substitution["ô"] = L"ô"; S1_substitution["ô"] = L"ô";
  S1_substitution["õ"] = L"õ"; S1_substitution["õ"] = L"õ"; S1_substitution["õ"] = L"õ"; S1_substitution["õ"] = L"õ";
  S1_substitution["ö"] = L"ö"; S1_substitution["ö"] = L"ö"; S1_substitution["ö"] = L"ö"; S1_substitution["ö"] = L"ö";
  S1_substitution["ø"] = L"ø"; S1_substitution["ø"] = L"ø"; S1_substitution["ø"] = L"ø"; S1_substitution["ø"] = L"ø";
  S1_substitution["ù"] = L"ù"; S1_substitution["ù"] = L"ù"; S1_substitution["ù"] = L"ù"; S1_substitution["ù"] = L"ù";
  S1_substitution["ú"] = L"ú"; S1_substitution["ú"] = L"ú"; S1_substitution["ú"] = L"ú"; S1_substitution["ú"] = L"ú";
  S1_substitution["û"] = L"û"; S1_substitution["û"] = L"û"; S1_substitution["û"] = L"û"; S1_substitution["û"] = L"û";
  S1_substitution["ü"] = L"ü"; S1_substitution["ü"] = L"ü"; S1_substitution["ü"] = L"ü"; S1_substitution["ü"] = L"ü";
  S1_substitution["ý"] = L"ý"; S1_substitution["ý"] = L"ý"; S1_substitution["ý"] = L"ý"; S1_substitution["ý"] = L"ý";
  S1_substitution["þ"] = L"þ"; S1_substitution["þ"] = L"þ"; S1_substitution["þ"] = L"þ"; S1_substitution["þ"] = L"þ";
  S1_substitution["ÿ"] = L"ÿ"; S1_substitution["ÿ"] = L"ÿ"; S1_substitution["ÿ"] = L"ÿ"; S1_substitution["ÿ"] = L"ÿ";
  S1_substitution["·"] = L"·"; S1_substitution["·"] = L"·"; S1_substitution["·"] = L"·"; S1_substitution["·"] = L"·";
  S1_substitution["’"] = L"'";
}

/**
 * Flush the pending markup buffer to yyout and clear it.
 *
 * If isDot is set, ".[]" is written first and the flag is cleared.  Then:
 *  - more than 8192 wide characters: the buffer is written to a temporary
 *    file and "[@<file name>]" is emitted instead of the contents;
 *  - more than one character, or a single character other than a space:
 *    "[" + escape(buffer) + "]" is emitted, with an extra backslash when
 *    the escaped text begins with '@';
 *  - otherwise (empty or a single space) the buffer is written unchanged.
 *
 * Exits with an error message if the temporary file cannot be created or
 * its name cannot be converted to wide characters.
 */
void printBuffer()
{
  if(isDot)
  {
    fputws_unlocked(L".[]", yyout);
    isDot = false;
  }

  if(buffer.size() > 8192)
  {
    // Both tmpnam() and fopen() can fail; the results used to be
    // dereferenced unchecked.
    char const *tmpname = tmpnam(NULL);
    FILE *largeblock = (tmpname != NULL) ? fopen(tmpname, "w") : NULL;
    if(largeblock == NULL)
    {
      wcerr << L"ERROR: Cannot create temporary file for large block." << endl;
      exit(EXIT_FAILURE);
    }
    string filename = tmpname;

    fputws_unlocked(buffer.c_str(), largeblock);
    fclose(largeblock);
    fputwc_unlocked(L'[', yyout);
    fputwc_unlocked(L'@', yyout);

    // One extra element for the terminator: mbstowcs() may convert
    // filename.size() characters, and writing cad[pos] then used to land
    // one element past the end of an array of exactly that size.
    vector<wchar_t> cad(filename.size() + 1);
    size_t pos = mbstowcs(&cad[0], filename.c_str(), filename.size());
    if(pos == (size_t) -1)
    {
      wcerr << L"Tres" << endl;
      wcerr << L"Encoding error." << endl;
      exit(EXIT_FAILURE);
    }
    cad[pos] = 0;
    fputws_unlocked(&cad[0], yyout);
    fputwc_unlocked(L']', yyout);
  }
  else if(buffer.size() > 1)
  {
    fputwc_unlocked(L'[', yyout);
    wstring const tmp = escape(buffer);
    if(tmp[0] == L'@')
    {
      // '@' right after '[' introduces a file name (see above), so a
      // literal leading '@' is escaped.
      fputwc_unlocked(L'\\', yyout);
    }
    fputws_unlocked(tmp.c_str(), yyout);
    fputwc_unlocked(L']', yyout);
  }
  else if(buffer.size() == 1 && buffer[0] != L' ')
  {
    fputwc_unlocked(L'[', yyout);
    wstring const tmp = escape(buffer);
    if(tmp[0] == L'@')
    {
      fputwc_unlocked(L'\\', yyout);
    }
    fputws_unlocked(tmp.c_str(), yyout);
    fputwc_unlocked(L']', yyout);
  }
  else
  {
    fputws_unlocked(buffer.c_str(), yyout);
  }

  buffer = L"";
}

%}

%x C1 C2 C3
%option nounput
%option noyywrap
%option caseless
%option stack

%%

  /* Exclusive states: C1 = inside <!-- -->, C2 = inside <script>,
     C3 = inside <style>.  Everything read in these states is buffered. */

<C1>{
  "-->" {
    last = "buffer";
    bufferAppend(buffer, yytext);
    yy_pop_state();
  }
  \n|. {
    last = "buffer";
    bufferAppend(buffer, yytext);
  }
}

<C2>{
  "<!--" {
    bufferAppend(buffer, yytext);
    yy_push_state(C1);
  }
  "</script"(" "[^>]*)?">" {
    last = "buffer";
    bufferAppend(buffer, yytext);
    yy_pop_state();
  }
  \n|. {
    last = "buffer";
    bufferAppend(buffer, yytext);
  }
}

<C3>{
  "<!--" {
    bufferAppend(buffer, yytext);
    yy_push_state(C1);
  }
  "</style"(" "[^>]*)?">" {
    last = "buffer";
    bufferAppend(buffer, yytext);
    yy_pop_state();
  }
  \n|. {
    last = "buffer";
    bufferAppend(buffer, yytext);
  }
}

"<!--" {
  bufferAppend(buffer, yytext);
  yy_push_state(C1);
}

"<script"(" "[^>]*)?">" {
  bufferAppend(buffer, yytext);
  yy_push_state(C2);
}

"<style"(" "[^>]*)?">" {
  bufferAppend(buffer, yytext);
  yy_push_state(C3);
}

"<br"(" "[^>]*)?">"|"<hr"(" "[^>]*)?">"|"<p"(" "[^>]*)?">" {
  /* The tags in this and the following seven rules set isDot, so the
     next printBuffer() call writes ".[]" before the buffered block. */
  isDot = true;
  bufferAppend(buffer, yytext);
}

"<li"(" "[^>]*)?">"|"<ul"(" "[^>]*)?">"|"<ol"(" "[^>]*)?">" {
  isDot = true;
  bufferAppend(buffer, yytext);
}

"<tr"(" "[^>]*)?">"|"<td"(" "[^>]*)?">"|"<th"(" "[^>]*)?">" {
  isDot = true;
  bufferAppend(buffer, yytext);
}

"</br"(" "[^>]*)?">"|"</hr"(" "[^>]*)?">"|"</p"(" "[^>]*)?">" {
  isDot = true;
  bufferAppend(buffer, yytext);
}

"</li"(" "[^>]*)?">"|"</ul"(" "[^>]*)?">"|"</ol"(" "[^>]*)?">" {
  isDot = true;
  bufferAppend(buffer, yytext);
}

"</tr"(" "[^>]*)?">"|"</td"(" "[^>]*)?">"|"</th"(" "[^>]*)?">" {
  isDot = true;
  bufferAppend(buffer, yytext);
}

"<title"(" "[^>]*)?">"|"<div"(" "[^>]*)?">"|"<option"(" "[^>]*)?">"|"<h"[1-6](" "[^>]*)?">" {
  isDot = true;
  bufferAppend(buffer, yytext);
}

"</title"(" "[^>]*)?">"|"</div"(" "[^>]*)?">"|"</option"(" "[^>]*)?">"|"</h"[1-6](" "[^>]*)?">" {
  isDot = true;
  bufferAppend(buffer, yytext);
}

"<"("img"|"link")(" "[^>]*)?">" {
  bufferAppend(buffer, yytext);
}

("<!"|"<?")[a-zA-Z][^>]*">" {
  bufferAppend(buffer, yytext);
}

"<"[a-zA-Z][^>]*">" {
  bufferAppend(buffer, yytext);
}

"</"[a-zA-Z][^>]*">" {
  bufferAppend(buffer, yytext);
}

"&"([a-zA-Z]+|"#x"[0-9a-fA-F]{1,4}|"#"[0-9]{1,8}); {
  /* Character reference: emit the mapped text when the table has an
     entry for it, otherwise treat it as markup and buffer it. */
  if(S1_substitution.find(yytext) != S1_substitution.end())
  {
    printBuffer();
    fputws_unlocked(S1_substitution[yytext].c_str(), yyout);
    offset+=S1_substitution[yytext].size();
    hasWrite_dot = hasWrite_white = true;
  }
  else
  {
    last="buffer";
    bufferAppend(buffer, yytext);
  }
}

[ \n\t\r$*<>] {
  if (last == "open_tag")
    bufferAppend(tags.back(), yytext);
  else
    bufferAppend(buffer, yytext);
}

[EMAIL PROTECTED]/] {
  /* Matched character is written preceded by a backslash.
     NOTE(review): the pattern above looks like an address-obfuscation
     artefact rather than the intended character class - confirm against
     the upstream source. */
  printBuffer();
  fputwc_unlocked(L'\\', yyout);
  offset++;
  wchar_t symbol;
  int pos = mbtowc(&symbol,
                   yytext, MB_CUR_MAX);
  if(pos == -1)
  {
    wcerr << L"Cuatro" << endl;
    wcerr << L"Encoding error." << endl;
    exit(EXIT_FAILURE);
  }

  fputwc_unlocked(symbol, yyout);
  offset++;
  hasWrite_dot = hasWrite_white = true;
}

. {
  /* Any other byte: accumulate in symbuf until it decodes to a complete
     wide character, then write that character. */
  printBuffer();
  symbuf += yytext;
  wchar_t symbol;
  int pos = mbtowc(&symbol, symbuf.c_str(), MB_CUR_MAX);
  if(pos == -1)
  {
    if(symbuf.size() > MB_CUR_MAX)
    {
      // unknown character
      symbuf = "";
      fputwc_unlocked(L'?', yyout);
      offset++;
      hasWrite_dot = hasWrite_white = true;
    }
  }
  else
  {
    symbuf = "";
    fputwc_unlocked(symbol, yyout);
    offset++;
    hasWrite_dot = hasWrite_white = true;
  }
}

<<EOF>> {
  isDot = true;
  printBuffer();
  return 0;
}

%%

/**
 * Print the command-line synopsis to cerr and exit with EXIT_SUCCESS.
 *
 * @param progname program name shown in the synopsis
 */
void usage(string const &progname)
{
  cerr << "USAGE: " << progname << " [input_file [output_file]" << ']' << endl;
  cerr << "html format processor " << endl;
  exit(EXIT_SUCCESS);
}

/**
 * Entry point: optionally opens argv[1] as input and argv[2] as output,
 * initialises the tables and regular expressions, and runs the scanner.
 * Calls usage() when more than two arguments are given or a file cannot
 * be opened.
 */
int main(int argc, char *argv[])
{
  LtLocale::tryToSetLocale();

  if(argc > 3)
  {
    usage(argv[0]);
  }

  switch(argc)
  {
    case 3:
      yyout = fopen(argv[2], "w");
      if(!yyout)
      {
        usage(argv[0]);
      }
      // no break: with an output file given, the input file is opened too
    case 2:
      yyin = fopen(argv[1], "r");
      if(!yyin)
      {
        usage(argv[0]);
      }
      break;
    default:
      break;
  }

  // prevent warning message
  yy_push_state(1);
  yy_top_state();
  yy_pop_state();

  S1_init();

  last = "";
  buffer = L"";
  isDot = hasWrite_dot = hasWrite_white = false;
  current=0;
  offset = 0;
  init_escape();
  init_tagNames();
  yylex();

  fclose(yyin);
  fclose(yyout);
}
-- Do you think that when they asked George Washington for ID that he just whipped out a quarter? -- Steven Wright Manoj Srivastava <[EMAIL PROTECTED]> <http://www.golden-gryphon.com/> 1024D/BF24424C print 4966 F272 D093 B493 410B 924B 21BA DABB BF24 424C