Package: flex
Version: 2.5.35-3
Severity: important

flex 2.5.35-3 fails to process input that worked in 2.5.35-2. This is causing another FTBFS (fails to build from source) issue in apertium, #504028.
Attached is the input file generated during apertium's build. With flex_2.5.35-2 this succeeds (even on ia64), but flex_2.5.35-3 fails on both of the architectures I tested (i386 and ia64):

$ /usr/bin/flex -Cfer -t >apertium_deshtml.cc < input2.txt
/usr/bin/m4:stdin:9: ERROR: end of file in string

-- dann frazier
%{
/*
 * Flex scanner that reads HTML from yyin and writes a transformed stream to
 * yyout: markup (comments, tags, <script>/<style> bodies, unknown entities)
 * is accumulated in `buffer` and emitted inside square brackets by
 * printBuffer(), while ordinary text characters are written through
 * directly.
 *
 * NOTE(review): this copy of the file passed through a mail archive.  Two
 * patterns read "[EMAIL PROTECTED]/]" and the S1_substitution table has
 * lost its original keys and values (see the notes at each site).  Those
 * strings are reproduced exactly as found and must be restored from the
 * original generator input before the scanner can be trusted.
 */
#include <cstdlib>
#include <iostream>
#include <map>
#include <vector>
#include <regex.h>
#include <string>
#include <lttoolbox/lt_locale.h>
#include <lttoolbox/ltstr.h>

using namespace std;

// Pending markup waiting to be flushed by printBuffer().
wstring buffer;
// Bytes of a multibyte character that has not been completely read yet.
string symbuf = "";
// isDot: when set, the next printBuffer() call first emits ".[]".
bool isDot, hasWrite_dot, hasWrite_white;
FILE *formatfile;
string last;
int current;
// Incremented for every character written outside of brackets.
long int offset;

vector<long int> offsets;
vector<wstring> tags;
vector<int> orders;

regex_t escape_chars;
regex_t names_regexp;

/**
 * Decode `str` (multibyte, current locale) and append the resulting wide
 * characters to `buf`.
 *
 * Bytes are first appended to the global `symbuf`, so a multibyte sequence
 * split across two calls is completed by the second call.  An undecodable
 * byte that is followed by more than MB_CUR_MAX further bytes is replaced by
 * L'?'; an undecodable tail is kept in `symbuf` for the next call.
 */
void bufferAppend(wstring &buf, string const &str)
{
  symbuf.append(str);

  for(size_t i = 0, limit = symbuf.size(); i < limit;)
  {
    wchar_t symbol;
    int gap = mbtowc(&symbol, symbuf.c_str() + i, MB_CUR_MAX);
    if(gap == -1)
    {
      if(i + MB_CUR_MAX < limit)
      {
        // Enough bytes follow for any complete character: this byte is bad.
        buf += L'?';
        gap = 1;
      }
      else
      {
        // Possibly an incomplete sequence: keep it until more input arrives.
        symbuf = symbuf.substr(i);
        return;
      }
    }
    else
    {
      buf += symbol;
      if(gap == 0)
      {
        // mbtowc() returns 0 for a null byte; still consume it so the loop
        // always makes progress.
        gap = 1;
      }
    }
    i += gap;
  }

  symbuf = "";
  return;
}

/**
 * Compile the regular expression describing characters that escape() must
 * prefix with a backslash.  Exits with EXIT_FAILURE if it does not compile.
 */
void init_escape()
{
  // NOTE(review): "[EMAIL PROTECTED]/]" looks like an address-obfuscation
  // artifact from the mail archive, not the intended character class.
  // Kept byte-for-byte; restore the real pattern from the original file.
  if(regcomp(&escape_chars, "[EMAIL PROTECTED]/]", REG_EXTENDED))
  {
    cerr << "ERROR: Illegal regular expression for escape characters" << endl;
    exit(EXIT_FAILURE);
  }
}

/**
 * Compile the regular expression used by get_tagName().  Exits with
 * EXIT_FAILURE if it does not compile.
 */
void init_tagNames()
{
  if(regcomp(&names_regexp, "[a-zA-Z]+", REG_EXTENDED))
  {
    cerr << "ERROR: Illegal regular expression for tag-names" << endl;
    exit(EXIT_FAILURE);
  }
}

/**
 * Return a copy of `str` in which every backslash is doubled.
 */
string backslash(string const &str)
{
  string new_str = "";

  for(unsigned int i = 0; i < str.size(); i++)
  {
    if(str[i] == '\\')
    {
      new_str += str[i];
    }
    new_str += str[i];
  }

  return new_str;
}

/**
 * Convert the multibyte string `str` to a wide string, inserting a backslash
 * before every character matched by `escape_chars`.
 *
 * Exits with EXIT_FAILURE if a matched character cannot be decoded.
 * Requires init_escape() to have been called.
 */
wstring escape(string const &str)
{
  regmatch_t pmatch;

  char const *mystring = str.c_str();
  int base = 0;
  wstring result = L"";

  while(!regexec(&escape_chars, mystring + base, 1, &pmatch, 0))
  {
    // Text preceding the match is copied unchanged.
    bufferAppend(result, str.substr(base, pmatch.rm_so));
    result += L'\\';
    wchar_t micaracter;
    int pos = mbtowc(&micaracter, str.c_str() + base + pmatch.rm_so, MB_CUR_MAX);
    if(pos == -1)
    {
      wcerr << L"Uno" << endl;
      wcerr << L"Encoding error." << endl;
      exit(EXIT_FAILURE);
    }

    result += micaracter;
    base += pmatch.rm_eo;
  }

  bufferAppend(result, str.substr(base));
  return result;
}

/**
 * Wide-string overload: encode `str` to multibyte (replacing characters that
 * cannot be encoded with '?') and delegate to escape(string const &).
 */
wstring escape(wstring const &str)
{
  string dest = "";

  // One scratch buffer for the whole loop.  MB_CUR_MAX need not be a
  // compile-time constant, so a vector is used instead of a variable-length
  // array (which standard C++ does not have).
  vector<char> symbol(MB_CUR_MAX + 1);

  for(size_t i = 0, limit = str.size(); i < limit; i++)
  {
    int pos = wctomb(&symbol[0], str[i]);
    if(pos == -1)
    {
      symbol[0] = '?';
      pos = 1;
    }
    symbol[pos] = 0;
    dest.append(&symbol[0]);
  }

  return escape(dest);
}

/**
 * Return the first run of ASCII letters found in `tag`, or "" if there is
 * none.  Requires init_tagNames() to have been called.
 */
string get_tagName(string tag)
{
  regmatch_t pmatch;

  char const *mystring = tag.c_str();
  string result = "";
  if(!regexec(&names_regexp, mystring, 1, &pmatch, 0))
  {
    result = tag.substr(pmatch.rm_so, pmatch.rm_eo - pmatch.rm_so);
    return result;
  }

  return "";
}

// Maps the text of an entity reference to the text written in its place.
map<string, wstring, Ltstr> S1_substitution;

/**
 * Fill S1_substitution.
 *
 * NOTE(review): in this copy every key below was assigned four times in a
 * row with an identical key and value; the repeated assignments were no-ops
 * and have been removed.  Presumably the four originally differed (e.g.
 * named, decimal and hexadecimal spellings of the same entity) before the
 * archive decoded them -- verify against the original.  As written, none of
 * these keys starts with '&', so the entity rule in the rules section can
 * never find them, and the L"??" values look like lost non-ASCII characters.
 */
void S1_init()
{
  S1_substitution["À"] = L"??";
  S1_substitution["Á"] = L"??";
  S1_substitution["Â"] = L"??";
  S1_substitution["Ã"] = L"??";
  S1_substitution["Ä"] = L"??";
  S1_substitution["Å"] = L"??";
  S1_substitution["Æ"] = L"??";
  S1_substitution["Ç"] = L"??";
  S1_substitution["È"] = L"??";
  S1_substitution["É"] = L"??";
  S1_substitution["Ê"] = L"??";
  S1_substitution["Ë"] = L"??";
  S1_substitution["Ì"] = L"??";
  S1_substitution["Í"] = L"??";
  S1_substitution["Î"] = L"??";
  S1_substitution["Ï"] = L"??";
  S1_substitution["Ð"] = L"??";
  S1_substitution["Ñ"] = L"??";
  S1_substitution["Ò"] = L"??";
  S1_substitution["Ó"] = L"??";
  S1_substitution["Ô"] = L"??";
  S1_substitution["Õ"] = L"??";
  S1_substitution["Ö"] = L"??";
  S1_substitution["Ø"] = L"??";
  S1_substitution["Ù"] = L"??";
  S1_substitution["Ú"] = L"??";
  S1_substitution["Û"] = L"??";
  S1_substitution["Ü"] = L"??";
  S1_substitution["Ý"] = L"??";
  S1_substitution["Þ"] = L"??";
  S1_substitution["ß"] = L"??";
  S1_substitution["à"] = L"??";
  S1_substitution["á"] = L"??";
  S1_substitution["â"] = L"??";
  S1_substitution["ã"] = L"??";
  S1_substitution["ä"] = L"??";
  S1_substitution["å"] = L"??";
  S1_substitution["æ"] = L"??";
  S1_substitution["ç"] = L"??";
  S1_substitution["è"] = L"??";
  S1_substitution["é"] = L"??";
  S1_substitution["ê"] = L"??";
  S1_substitution["ë"] = L"??";
  S1_substitution["ì"] = L"??";
  S1_substitution["í"] = L"??";
  S1_substitution["î"] = L"??";
  S1_substitution["ï"] = L"??";
  S1_substitution["ð"] = L"??";
  S1_substitution["ñ"] = L"??";
  S1_substitution["ò"] = L"??";
  S1_substitution["ó"] = L"??";
  S1_substitution["ô"] = L"??";
  S1_substitution["õ"] = L"??";
  S1_substitution["ö"] = L"??";
  S1_substitution["ø"] = L"??";
  S1_substitution["ù"] = L"??";
  S1_substitution["ú"] = L"??";
  S1_substitution["û"] = L"??";
  S1_substitution["ü"] = L"??";
  S1_substitution["ý"] = L"??";
  S1_substitution["þ"] = L"??";
  S1_substitution["ÿ"] = L"??";
  S1_substitution["·"] = L"??";
  S1_substitution["’"] = L"'";
}

/**
 * Flush `buffer` to yyout and clear it.
 *
 *  - If isDot is set, ".[]" is written first and the flag is cleared.
 *  - More than 8192 characters: the buffer is written to a temporary file
 *    and "[@<filename>]" is emitted in its place.
 *  - Otherwise, unless the buffer is empty or a single space, it is emitted
 *    as "[<escaped buffer>]"; a leading '@' gets an extra backslash so it
 *    cannot be confused with the temporary-file form.
 *  - An empty buffer or a single space is written as is.
 *
 * Exits with EXIT_FAILURE if the temporary file cannot be created or its
 * name cannot be converted to wide characters.
 */
void printBuffer()
{
  if(isDot)
  {
    fputws_unlocked(L".[]", yyout);
    isDot = false;
  }

  if(buffer.size() > 8192)
  {
    // NOTE(review): tmpnam() is race-prone (the name can be taken between
    // generation and fopen()); consider mkstemp().  Left in place so the
    // emitted file names keep their current form.
    char const *tmpname = tmpnam(NULL);
    if(tmpname == NULL)
    {
      wcerr << L"ERROR: cannot generate a temporary file name." << endl;
      exit(EXIT_FAILURE);
    }
    string filename = tmpname;

    FILE *largeblock = fopen(filename.c_str(), "w");
    if(largeblock == NULL)
    {
      wcerr << L"ERROR: cannot open temporary file for writing." << endl;
      exit(EXIT_FAILURE);
    }
    fputws_unlocked(buffer.c_str(), largeblock);
    fclose(largeblock);

    fputwc_unlocked(L'[', yyout);
    fputwc_unlocked(L'@', yyout);

    // One extra element: mbstowcs() may convert filename.size() characters,
    // and the terminator written below needs a slot of its own.
    vector<wchar_t> cad(filename.size() + 1);
    size_t pos = mbstowcs(&cad[0], filename.c_str(), filename.size());
    if(pos == (size_t) -1)
    {
      wcerr << L"Tres" << endl;
      wcerr << L"Encoding error." << endl;
      exit(EXIT_FAILURE);
    }
    cad[pos] = 0;

    fputws_unlocked(&cad[0], yyout);
    fputwc_unlocked(L']', yyout);
  }
  else if(buffer.size() > 1 || (buffer.size() == 1 && buffer[0] != L' '))
  {
    fputwc_unlocked(L'[', yyout);
    wstring const tmp = escape(buffer);
    if(tmp[0] == L'@')
    {
      fputwc_unlocked(L'\\', yyout);
    }
    fputws_unlocked(tmp.c_str(), yyout);
    fputwc_unlocked(L']', yyout);
  }
  else
  {
    fputws_unlocked(buffer.c_str(), yyout);
  }

  buffer = L"";
}

%}

%x C1 C2 C3
%option nounput
%option noyywrap
%option caseless
%option stack

%%

<C1>{
  "-->" {
    /* C1: inside an HTML comment; "-->" returns to the enclosing state. */
    last = "buffer";
    bufferAppend(buffer, yytext);
    yy_pop_state();
  }
  \n|. {
    last = "buffer";
    bufferAppend(buffer, yytext);
  }
}

<C2>{
  "<!--" {
    /* C2: inside <script>; comments nest through C1. */
    bufferAppend(buffer, yytext);
    yy_push_state(C1);
  }
  "</script"(" "[^>]*)?">" {
    last = "buffer";
    bufferAppend(buffer, yytext);
    yy_pop_state();
  }
  \n|. {
    last = "buffer";
    bufferAppend(buffer, yytext);
  }
}

<C3>{
  "<!--" {
    /* C3: inside <style>; comments nest through C1. */
    bufferAppend(buffer, yytext);
    yy_push_state(C1);
  }
  "</style"(" "[^>]*)?">" {
    last = "buffer";
    bufferAppend(buffer, yytext);
    yy_pop_state();
  }
  \n|. {
    last = "buffer";
    bufferAppend(buffer, yytext);
  }
}

"<!--" {
  bufferAppend(buffer, yytext);
  yy_push_state(C1);
}

"<script"(" "[^>]*)?">" {
  bufferAppend(buffer, yytext);
  yy_push_state(C2);
}

"<style"(" "[^>]*)?">" {
  bufferAppend(buffer, yytext);
  yy_push_state(C3);
}

"<br"(" "[^>]*)?">"|"<hr"(" "[^>]*)?">"|"<p"(" "[^>]*)?">" {
  /* This and the following seven rules buffer the tag and set isDot. */
  isDot = true;
  bufferAppend(buffer, yytext);
}

"<li"(" "[^>]*)?">"|"<ul"(" "[^>]*)?">"|"<ol"(" "[^>]*)?">" {
  isDot = true;
  bufferAppend(buffer, yytext);
}

"<tr"(" "[^>]*)?">"|"<td"(" "[^>]*)?">"|"<th"(" "[^>]*)?">" {
  isDot = true;
  bufferAppend(buffer, yytext);
}

"</br"(" "[^>]*)?">"|"</hr"(" "[^>]*)?">"|"</p"(" "[^>]*)?">" {
  isDot = true;
  bufferAppend(buffer, yytext);
}

"</li"(" "[^>]*)?">"|"</ul"(" "[^>]*)?">"|"</ol"(" "[^>]*)?">" {
  isDot = true;
  bufferAppend(buffer, yytext);
}

"</tr"(" "[^>]*)?">"|"</td"(" "[^>]*)?">"|"</th"(" "[^>]*)?">" {
  isDot = true;
  bufferAppend(buffer, yytext);
}

"<title"(" "[^>]*)?">"|"<div"(" "[^>]*)?">"|"<option"(" "[^>]*)?">"|"<h"[1-6](" "[^>]*)?">" {
  isDot = true;
  bufferAppend(buffer, yytext);
}

"</title"(" "[^>]*)?">"|"</div"(" "[^>]*)?">"|"</option"(" "[^>]*)?">"|"</h"[1-6](" "[^>]*)?">" {
  isDot = true;
  bufferAppend(buffer, yytext);
}

"<"("img"|"link")(" "[^>]*)?">" {
  bufferAppend(buffer, yytext);
}

("<!"|"<?")[a-zA-Z][^>]*">" {
  bufferAppend(buffer, yytext);
}

"<"[a-zA-Z][^>]*">" {
  bufferAppend(buffer, yytext);
}

"</"[a-zA-Z][^>]*">" {
  bufferAppend(buffer, yytext);
}

"&"([a-zA-Z]+|"#x"[0-9a-fA-F]{1,4}|"#"[0-9]{1,8}); {
  /* Entity reference: write its substitution if one is known, otherwise
     treat it as markup.  The table is searched once and the iterator is
     reused. */
  map<string, wstring, Ltstr>::const_iterator it = S1_substitution.find(yytext);
  if(it != S1_substitution.end())
  {
    printBuffer();
    fputws_unlocked(it->second.c_str(), yyout);
    offset += it->second.size();
    hasWrite_dot = hasWrite_white = true;
  }
  else
  {
    last = "buffer";
    bufferAppend(buffer, yytext);
  }
}

[ \n\t\r$*<>] {
  if(last == "open_tag")
  {
    bufferAppend(tags.back(), yytext);
  }
  else
  {
    bufferAppend(buffer, yytext);
  }
}

[EMAIL PROTECTED]/] {
  /* NOTE(review): the pattern above looks like an address-obfuscation
     artifact from the mail archive; it is kept byte-for-byte and must be
     restored from the original.  The action writes a backslash followed by
     the matched character. */
  printBuffer();
  fputwc_unlocked(L'\\', yyout);
  offset++;
  wchar_t symbol;
  int pos = mbtowc(&symbol, yytext, MB_CUR_MAX);
  if(pos == -1)
  {
    wcerr << L"Cuatro" << endl;
    wcerr << L"Encoding error." << endl;
    exit(EXIT_FAILURE);
  }

  fputwc_unlocked(symbol, yyout);
  offset++;
  hasWrite_dot = hasWrite_white = true;
}

. {
  /* Any other byte: flush pending markup, then collect bytes in symbuf
     until they decode to one wide character. */
  printBuffer();
  symbuf += yytext;
  wchar_t symbol;
  int pos = mbtowc(&symbol, symbuf.c_str(), MB_CUR_MAX);
  if(pos == -1)
  {
    if(symbuf.size() > MB_CUR_MAX)
    {
      // unknown character
      symbuf = "";
      fputwc_unlocked(L'?', yyout);
      offset++;
      hasWrite_dot = hasWrite_white = true;
    }
  }
  else
  {
    symbuf = "";
    fputwc_unlocked(symbol, yyout);
    offset++;
    hasWrite_dot = hasWrite_white = true;
  }
}

<<EOF>> {
  isDot = true;
  printBuffer();
  return 0;
}

%%

/**
 * Print the command-line synopsis to stderr and terminate the program.
 *
 * NOTE(review): exits with EXIT_SUCCESS even when called because of an
 * error; left unchanged in case callers depend on the exit status.
 */
void usage(string const &progname)
{
  cerr << "USAGE: " << progname << " [input_file [output_file]" << ']' << endl;
  cerr << "html format processor " << endl;
  exit(EXIT_SUCCESS);
}

/**
 * Entry point: optionally open the input and output files named on the
 * command line, initialise the tables and regular expressions, and run the
 * scanner once over the whole input.
 */
int main(int argc, char *argv[])
{
  LtLocale::tryToSetLocale();

  if(argc > 3)
  {
    usage(argv[0]);
  }

  switch(argc)
  {
    case 3:
      yyout = fopen(argv[2], "w");
      if(!yyout)
      {
        usage(argv[0]);
      }
      // fall through: with an output file given, the input file is opened too
    case 2:
      yyin = fopen(argv[1], "r");
      if(!yyin)
      {
        usage(argv[0]);
      }
      break;
    default:
      break;
  }

  // prevent warning message
  yy_push_state(1);
  yy_top_state();
  yy_pop_state();

  S1_init();

  last = "";
  buffer = L"";
  isDot = hasWrite_dot = hasWrite_white = false;
  current = 0;
  offset = 0;
  init_escape();
  init_tagNames();
  yylex();

  fclose(yyin);
  fclose(yyout);
  return 0;
}