Hi Paul, > Le 9 sept. 2019 à 21:10, Paul Eggert <[email protected]> a écrit : > > On 9/9/19 9:46 AM, Akim Demaille wrote: >> I personally prefer treating a lone \r as a regular character, as it's more >> consistent with what my tools show me. And I think it's a problem that GCC >> and Emacs disagree, so maybe the GCS should decide. >> But in the case of Bison, WDYT today? > > I vaguely recall thinking that since Bison is a preprocessor for C/C++, > consistency with GCC was more important than consistency with other GNU tools.
Agreed. > The GCC tradition was because of Classic Mac OS, which used plain CR to > terminate lines. Aaaah! Of course! I had completely forgotten about that! > Since Apple hasn't supported Classic Mac OS since 2002 (arund the time of my > Bison patch you mentioned), this old tradition is now completely obsolete, > and I'd support having Bison and GCC be consistent with everybody else now. Great! > So how about this idea: Change Bison to be like most other GNU tools, and > file a bug report with the GCC folks. I'm all for it! The GCC's PR is here: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91733 My proposal to address this in Bison is below. I saw two paths: change no_cr_read into no_crnl_read, or get rid of it leaving the handling of EOL to Flex. I chose the latter because it seemed more natural to me, but you might have a different opinion. Cheers! commit cac45485e14bb3e59761ee7e85b763c0a18bb89a Author: Akim Demaille <[email protected]> Date: Tue Sep 10 18:51:25 2019 +0200 input: \r is lone char We used to treat lone CRs (\r) as regular NLs (\n), probably to please classic MacOS. As of today, it makes more sense to treat \r like a plain white space character. https://lists.gnu.org/archive/html/bison-patches/2019-09/msg00027.html * src/scan-gram.l (no_cr_read): Remove. Instead, use... (eol): this new abbreviation denoting end-of-line. * src/location.c (caret_getc): New. (location_caret): Use it. * tests/diagnostics.at (Carriage return): Adjust expectations. (CR NL): New. diff --git a/src/location.c b/src/location.c index 40fbc04e..d2314a18 100644 --- a/src/location.c +++ b/src/location.c @@ -169,7 +169,7 @@ static struct } caret_info; void -caret_free () +caret_free (void) { if (caret_info.source) { @@ -178,6 +178,23 @@ caret_free () } } +/* Getc, but smash \r\n as \n. */ +static int +caret_getc (void) +{ + FILE *f = caret_info.source; + int res = getc (f); + if (res == '\r') + { + int c = getc (f); + if (c == '\n') + res = c; + else + ungetc (c, f); + } + return res; +} + void location_caret (location loc, const char *style, FILE *out) { @@ -230,7 +247,7 @@ location_caret (location loc, const char *style, FILE *out) /* Advance to the line's position, keeping track of the offset. */ while (caret_info.line < loc.start.line) { - int c = getc (caret_info.source); + int c = caret_getc (); if (c == EOF) /* Something is wrong, that line number does not exist. */ return; @@ -241,7 +258,7 @@ location_caret (location loc, const char *style, FILE *out) /* Read the actual line. Don't update the offset, so that we keep a pointer to the start of the line. */ { - int c = getc (caret_info.source); + int c = caret_getc (); if (c != EOF) { bool single_line = loc.start.line == loc.end.line; @@ -268,7 +285,7 @@ location_caret (location loc, const char *style, FILE *out) opened = true; } fputc (c, out); - c = getc (caret_info.source); + c = caret_getc (); ++byte; if (opened && (single_line diff --git a/src/scan-gram.l b/src/scan-gram.l index 66a8caa7..32bc69aa 100644 --- a/src/scan-gram.l +++ b/src/scan-gram.l @@ -49,9 +49,6 @@ static boundary scanner_cursor; #define YY_USER_ACTION location_compute (loc, &scanner_cursor, yytext, yyleng); -static size_t no_cr_read (FILE *, char *, size_t); -#define YY_INPUT(buf, result, size) ((result) = no_cr_read (yyin, buf, size)) - /* Report that yytext is an extension, and evaluate to its token type. */ #define BISON_DIRECTIVE(Directive) \ (bison_directive (loc, yytext), PERCENT_ ## Directive) @@ -139,12 +136,14 @@ id {letter}({letter}|[-0-9])* int [0-9]+ xint 0[xX][0-9abcdefABCDEF]+ +eol \n|\r\n + /* UTF-8 Encoded Unicode Code Point, from Flex's documentation. */ mbchar [\x09\x0A\x0D\x20-\x7E]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEC\xEE\xEF]([\x80-\xBF]{2})|\xED[\x80-\x9F][\x80-\xBF]|\xF0[\x\90-\xBF]([\x80-\xBF]{2})|[\xF1-\xF3]([\x80-\xBF]{3})|\xF4[\x80-\x8F]([\x80-\xBF]{2}) /* Zero or more instances of backslash-newline. Following GCC, allow white space between the backslash and the newline. */ -splice (\\[ \f\t\v]*\n)* +splice (\\[ \f\t\v]*{eol})* /* An equal sign, with optional leading whitespaces. This is used in some deprecated constructs. */ @@ -193,7 +192,7 @@ eqopt ({sp}=)? "," { complain (loc, Wother, _("stray ',' treated as white space")); } - [ \f\n\t\v] | + [ \f\t\v\r]|{eol} | "//".* continue; "/*" { token_start = loc->start; @@ -201,9 +200,7 @@ eqopt ({sp}=)? BEGIN SC_YACC_COMMENT; } - /* #line directives are not documented, and may be withdrawn or - modified in future versions of Bison. */ - ^"#line "{int}(" \"".*"\"")?"\n" { + ^"#line "{int}(" \"".*"\"")?{eol} { handle_syncline (yytext + sizeof "#line " - 1, *loc); } } @@ -330,7 +327,7 @@ eqopt ({sp}=)? } /* Semantic predicate. */ - "%?"[ \f\n\t\v]*"{" { + "%?"([ \f\t\v]|{eol})*"{" { nesting = 0; code_start = loc->start; BEGIN SC_PREDICATE; @@ -359,7 +356,7 @@ eqopt ({sp}=)? BEGIN SC_BRACKETED_ID; } - [^\[%A-Za-z0-9_<>{}\"\'*;|=/, \f\n\t\v]+|. { + [^\[%A-Za-z0-9_<>{}\"\'*;|=/, \f\r\n\t\v]+|. { complain (loc, complaint, "%s: %s", ngettext ("invalid character", "invalid characters", yyleng), quote_mem (yytext, yyleng)); @@ -458,7 +455,7 @@ eqopt ({sp}=)? complain (loc, complaint, _("an identifier expected")); } - [^\].A-Za-z0-9_/ \f\n\t\v]+|. { + [^\].A-Za-z0-9_/ \f\r\n\t\v]+|. { complain (loc, complaint, "%s: %s", ngettext ("invalid character in bracketed name", "invalid characters in bracketed name", yyleng), @@ -491,7 +488,7 @@ eqopt ({sp}=)? <SC_YACC_COMMENT> { "*/" BEGIN context_state; - .|\n continue; + .|{eol} continue; <<EOF>> unexpected_eof (token_start, "*/"); BEGIN context_state; } @@ -513,7 +510,7 @@ eqopt ({sp}=)? <SC_LINE_COMMENT> { - "\n" STRING_GROW; BEGIN context_state; + {eol} STRING_GROW; BEGIN context_state; {splice} STRING_GROW; <<EOF>> BEGIN context_state; } @@ -535,7 +532,7 @@ eqopt ({sp}=)? RETURN_VALUE (STRING, last_string); } <<EOF>> unexpected_eof (token_start, "\""); - "\n" unexpected_newline (token_start, "\""); + {eol} unexpected_newline (token_start, "\""); } /*----------------------------------------------------------. @@ -564,7 +561,7 @@ eqopt ({sp}=)? BEGIN INITIAL; return CHAR; } - "\n" unexpected_newline (token_start, "'"); + {eol} unexpected_newline (token_start, "'"); <<EOF>> unexpected_eof (token_start, "'"); } @@ -641,7 +638,7 @@ eqopt ({sp}=)? else obstack_1grow (&obstack_for_string, c); } - \\(.|\n) { + \\(.|{eol}) { char const *p = yytext + 1; /* Quote only if escaping won't make the character visible. */ if (c_isspace ((unsigned char) *p) && c_isprint ((unsigned char) *p)) @@ -665,14 +662,14 @@ eqopt ({sp}=)? <SC_CHARACTER> { "'" STRING_GROW; BEGIN context_state; - \n unexpected_newline (token_start, "'"); + {eol} unexpected_newline (token_start, "'"); <<EOF>> unexpected_eof (token_start, "'"); } <SC_STRING> { "\"" STRING_GROW; BEGIN context_state; - \n unexpected_newline (token_start, "\""); + {eol} unexpected_newline (token_start, "\""); <<EOF>> unexpected_eof (token_start, "\""); } @@ -809,53 +806,6 @@ eqopt ({sp}=)? %% -/* Read bytes from FP into buffer BUF of size SIZE. Return the - number of bytes read. Remove '\r' from input, treating \r\n - and isolated \r as \n. */ - -static size_t -no_cr_read (FILE *fp, char *buf, size_t size) -{ - size_t bytes_read = fread (buf, 1, size, fp); - if (bytes_read) - { - char *w = memchr (buf, '\r', bytes_read); - if (w) - { - char const *r = ++w; - char const *lim = buf + bytes_read; - - for (;;) - { - /* Found an '\r'. Treat it like '\n', but ignore any - '\n' that immediately follows. */ - w[-1] = '\n'; - if (r == lim) - { - int ch = getc (fp); - if (ch != '\n' && ungetc (ch, fp) != ch) - break; - } - else if (*r == '\n') - r++; - - /* Copy until the next '\r'. */ - do - { - if (r == lim) - return w - buf; - } - while ((*w++ = *r++) != '\r'); - } - - return w - buf; - } - } - - return bytes_read; -} - - /*------------------------------------------------------. | Scan NUMBER for a base-BASE integer at location LOC. | diff --git a/tests/diagnostics.at b/tests/diagnostics.at index d9398dd5..d89b5f64 100644 --- a/tests/diagnostics.at +++ b/tests/diagnostics.at @@ -274,11 +274,43 @@ AT_TEST([[Carriage return]], %% ]], [1], -[[input.y:37.8-38.0: <error>error:</error> missing '"' at end of line -input.y:37.8-38.0: <error>error:</error> syntax error, unexpected string, expecting char or identifier or <tag> +[[input.y:10.8-11.0: <error>error:</error> missing '"' at end of line + 10 | %token <error>"</error> + | <error>^</error> +input.y:10.8-11.0: <error>error:</error> syntax error, unexpected string, expecting char or identifier or <tag> + 10 | %token <error>"</error> + | <error>^</error> ]]) +## ------- ## +## CR NL. ## +## ------- ## + +# Check Windows EOLs. + +AT_TEST([[CR NL]], +[[^M +%token ^M FOO^M +%token ^M FOO^M +%%^M +exp:^M +]], +[0], +[[input.y:11.9-11: <warning>warning:</warning> symbol FOO redeclared [<warning>-Wother</warning>] + 11 | %token <warning>FOO</warning> + | <warning>^~~</warning> +input.y:10.9-11: previous declaration + 10 | %token <note>FOO</note> + | <note>^~~</note> +input.y:13.5: <warning>warning:</warning> empty rule without %empty [<warning>-Wempty-rule</warning>] + 13 | exp: + | <warning>^</warning> +input.y: <warning>warning:</warning> fix-its can be applied. Rerun with option '--update'. [<warning>-Wother</warning>] +]]) + + + m4_popdef([AT_TEST])
