Author: mdiep Date: Fri May 27 08:53:55 2005 New Revision: 8181 Modified: trunk/languages/tcl/classes/tclparser.pmc Log: - Add Unicode, Hex, and Octal escapes to Tcl (#35976) - Initial refactor of tclparser.pmc to be less like PIR and more like C.
Modified: trunk/languages/tcl/classes/tclparser.pmc ============================================================================== --- trunk/languages/tcl/classes/tclparser.pmc (original) +++ trunk/languages/tcl/classes/tclparser.pmc Fri May 27 08:53:55 2005 @@ -35,19 +35,21 @@ pmclass TclParser dynpmc group tcl_group PMC *string_anchor; INTVAL fixedstringarray_typenum; if (pass) { - TclList = Parrot_PMC_typenum(INTERP, "TclList"); - StringClass = Parrot_PMC_typenum(INTERP, "String"); - IntegerClass = Parrot_PMC_typenum(INTERP, "Integer"); + TclList = Parrot_PMC_typenum(INTERP, "TclList"); + StringClass = Parrot_PMC_typenum(INTERP, "String"); + IntegerClass = Parrot_PMC_typenum(INTERP, "Integer"); + bs_nl = string_from_const_cstring(INTERP, "\\\n",2); nl = string_from_const_cstring(INTERP, "\n",1); - cb = string_from_const_cstring(INTERP, "\175",1); + cb = string_from_const_cstring(INTERP, "}",1); cp = string_from_const_cstring(INTERP, ")",1); space = string_from_const_cstring(INTERP, " ",1); - ConcatWords = string_from_const_cstring(INTERP, "concat_words",12); - ConcatConst = string_from_const_cstring(INTERP, "concat_const",12); + + ConcatWords = string_from_const_cstring(INTERP, "concat_words",12); + ConcatConst = string_from_const_cstring(INTERP, "concat_const",12); ConcatVariable = string_from_const_cstring(INTERP, "concat_variable",15); - ConcatChar = string_from_const_cstring(INTERP, "concat_char",11); - ConcatCommand = string_from_const_cstring(INTERP, "concat_command", 14); + ConcatChar = string_from_const_cstring(INTERP, "concat_char",11); + ConcatCommand = string_from_const_cstring(INTERP, "concat_command", 14); /* Hack to avoid having these ``constant'' strings GC'd. */ fixedstringarray_typenum = @@ -106,9 +108,7 @@ XXX: Skip the evaluate step, and just re PMC *command; /* The current command */ INTVAL bs_marker = 0; INTVAL bs_pos; - INTVAL bs_retval; INTVAL bs_diff; - INTVAL bs_a_byte; INTVAL buffer_length; INTVAL old_length,chunk_start,escape_length,temppos; @@ -128,23 +128,15 @@ XXX: Skip the evaluate step, and just re /* Possible regression here if we do the bs_nl substitution >1 */ bs_pos = string_str_index(INTERP, buffer, bs_nl, bs_marker); - - /* bs_loop_outer: */ while (bs_marker <= buffer_length && bs_pos != -1) { bs_marker = bs_pos; - - bs_pos+=1; - /* bs_loop_inner */ - bs_retval = 1; - while (bs_retval) { - bs_a_byte = string_index(INTERP,buffer,bs_pos++); - if (bs_a_byte < 33 && bs_a_byte != 10) { - bs_retval = 1; - } else { - bs_retval = 0; - } + + bs_pos++; + INTVAL chr = string_index(INTERP, buffer, bs_pos++); + while (chr < 33 && chr != '\n') { + chr = string_index(INTERP, buffer, bs_pos++); } - + /* delete the items from marker to pos */ bs_diff = bs_pos - bs_marker; string_replace(INTERP,buffer,bs_marker,bs_diff,space,NULL); @@ -173,27 +165,18 @@ begin_word: end_of_word = 0; /* Skip any leading whitespace, unless we're preserving. */ - if (preserve_whitespace == 1) { + if (preserve_whitespace) goto pre_middle; - } space_loop: - if (start_word >= buffer_length) { + if (start_word >= buffer_length) goto end_scope; - } - + I0 = string_index(INTERP, buffer, start_word); - if (I0 < 33 && I0 != 10) { - I1 = 1; - } else { - I1 = 0; - } - - if (I1 == 0) { - goto space_loop_end; + if (I0 < 33 && I0 != '\n') { + start_word++; + goto space_loop; } - start_word++; - goto space_loop; space_loop_end: /* @@ -205,13 +188,13 @@ space_loop_end: character = string_index(INTERP, buffer, start_word); - if (character == 123) { + if (character == '{') { /* brace-encrusted block */ word_length = Parrot_TclParser_match_close(INTERP, SELF, buffer,start_word); - if (word_length < 0) { + if (word_length < 0) real_exception(INTERP, NULL, E_Exception, "missing close-brace"); - } + end_of_word = 1; /* figure out where the new word is going to start.*/ @@ -227,14 +210,13 @@ space_loop_end: start_word = I1; word_length = 0; goto middle_word; - } else if (character == 34) { + } else if (character == '"') { /* "string" */ word_length = Parrot_TclParser_match_close(INTERP, SELF, buffer,start_word); - - if (word_length < 0) { + if (word_length < 0) real_exception(INTERP, NULL, E_Exception, "missing \""); - } + end_of_word = 1 ; /*figure out where the new word is going to start.*/ @@ -271,8 +253,8 @@ space_loop_end: } /* #comment */ - if (character == 35 && Parrot_PMC_get_intval(INTERP,command) == 0) { - I0 = string_str_index(INTERP, buffer, nl, start_word); + if (character == '#' && Parrot_PMC_get_intval(INTERP,command) == 0) { + INTVAL I0 = string_str_index(INTERP, buffer, nl, start_word); if (I0 == -1) { /* If the comment is all that's left, quit. */ goto end_command; @@ -289,28 +271,20 @@ pre_middle: middle_word: I0 = start_word+word_length; - if (I0 >= buffer_length) { + if (I0 >= buffer_length) goto end_command; - } character = string_index(INTERP, buffer, I0); - if (preserve_whitespace == 1) { + if (preserve_whitespace) goto middle_word_2; - } - if (character < 33 && character != 10) { - I0 = 1; - } else { - I0 = 0; - } - if (I0 == 1) { + + if (character < 33 && character != '\n') goto end_word; - } /* if this isn't a newline, skip */ - if (character != 10) { + if (character != '\n') goto middle_word_2; - } middle_word_1: word_trailing_length=1; @@ -318,30 +292,26 @@ middle_word_1: middle_word_2: /* wait, this was a semicolon - go back and deal with it.*/ - if (character == 59) { + if (character == ';') goto middle_word_1; - } /* If we've gotten this far and we're in a brace-block, die */ /* XXX gives wrong exception on quoted string */ - if (end_of_word == 1) { + if (end_of_word == 1) real_exception(INTERP, NULL, E_Exception, "extra characters after close-brace"); - } - if (block_interpolation) { + if (block_interpolation) goto check_bs; - } - if (character == 91) { + if (character == '[') { /* command block */ old_length = word_length; chunk_start = start_word + word_length; word_length = Parrot_TclParser_match_close(INTERP, SELF, buffer,chunk_start); - if (word_length < 0) { + if (word_length < 0) real_exception(INTERP, NULL, E_Exception, "missing close-bracket"); - } if (old_length) { /* if there's any word so far, save it*/ @@ -363,16 +333,13 @@ middle_word_2: goto middle_word; } - /* $ */ - if (character == 36) { + if (character == '$') goto handle_variable; - } check_bs: /* backslash */ - if (character == 92) { + if (character == '\\') goto handle_backslash; - } last_character = character; word_length++; @@ -396,9 +363,8 @@ end_command: * if we're preserving whitespace and we're at the end of the command, * we must really be done. */ - if (preserve_whitespace==1) { + if (preserve_whitespace) goto end_command_0; - } /* * If this was an empty line, we'll have a 0 word length but @@ -406,41 +372,38 @@ end_command: */ /* unless we finished the word, in which case... */ - if (end_of_word == 1) { + if (end_of_word == 1) goto end_command_0; - } /* * Because of how we grabbed the command, we could have a word_length * of zero, but have a word object in process. */ I0 = Parrot_PMC_get_intval(INTERP, word); - if (I0 != 0) { + if (I0 != 0) goto end_command_0; - } /* If word_length is zero now, don't bother adding the word. */ /* (unless we're preserving whitespace...) */ - if (word_length == 0) { + if (word_length == 0) goto end_command_1; - } end_command_0: S0 = string_substr(INTERP, buffer, start_word, word_length, NULL, 0); P1 = VTABLE_find_method(INTERP, word, ConcatConst); Parrot_call_method(INTERP, P1, word, ConcatConst, "vS", S0); VTABLE_push_pmc(INTERP, command, word); - if (preserve_whitespace == 1) { + if (preserve_whitespace) goto preserve_end_scope; - } + end_command_1: /* go to the next word*/ I0 = Parrot_PMC_get_intval(INTERP, command); - if (I0 == 0) { + if (I0 == 0) goto end_command_2; - } + VTABLE_push_pmc(INTERP, commands, command); end_command_2: @@ -449,7 +412,6 @@ end_command_2: goto begin_command; - handle_variable: /* First, save off anything we've gotten so far as a constant.*/ old_length = word_length; @@ -457,24 +419,23 @@ handle_variable: word_length = Parrot_TclParser_match_close(INTERP, SELF, buffer,chunk_start); /* if there's any word so far, save it*/ - if (old_length == 0) { + if (old_length == 0) goto handle_variable_1; - } + S0 = string_substr(INTERP, buffer, start_word, old_length, NULL, 0); P1 = VTABLE_find_method(INTERP, word, ConcatConst); Parrot_call_method(INTERP, P1, word, ConcatConst, "vS", S0); handle_variable_1: /* Are we dealing with a braced-var name? */ - if (string_index(INTERP, buffer, ++chunk_start) != 123) { + if (string_index(INTERP, buffer, ++chunk_start) != '{') goto handle_variable_nobrace; - } /* Get the closing brace, but we don't have to have matched pairs here. */ I1 = string_str_index(INTERP, buffer, cb, chunk_start); - if (I1 > buffer_length) { + if (I1 > buffer_length) real_exception(INTERP, NULL, E_Exception, "XXX invalid variable"); - } + /* Save the new start word for when we go to middle word.*/ I1++; start_word = I1; @@ -492,16 +453,15 @@ handle_variable_nobrace: temppos = chunk_start; inner: - if (++temppos >= buffer_length) { + if (++temppos >= buffer_length) goto inner_done; - } I0 = string_index(INTERP, buffer, temppos); if ( - (I0 >= 65 && I0 <= 90) || /* A-Z */ - (I0 >= 97 && I0 <= 122) || /* a-z */ - (I0 >= 48 && I0 <= 57) || /* 0-9 */ - (I0 == 95) /* _ */ + (I0 >= 'A' && I0 <= 'Z') || + (I0 >= 'a' && I0 <= 'z') || + (I0 >= '0' && I0 <= '9') || + (I0 == '_') ) { goto inner; } @@ -539,9 +499,8 @@ gotword: handle_backslash: /* dump anything so far as a constant.*/ - if (word_length == 0) { + if (word_length == 0) goto backslash_escape; - } S0 = string_substr(INTERP, buffer, start_word, word_length, NULL, 0); P1 = VTABLE_find_method(INTERP, word, ConcatConst); @@ -561,56 +520,113 @@ backslash_escape: I0 = string_index(INTERP, buffer, start_word); /* add this escape as a sequence.*/ P1 = VTABLE_find_method(INTERP, word, ConcatChar); - if (I0 != 97) { - goto backslash_escape_b; - } - Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", 7); - goto escape_done; -backslash_escape_b: - if (I0 != 98) { - goto backslash_escape_f; - } - Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", 8); - goto escape_done; -backslash_escape_f: - if (I0 != 102) { - goto backslash_escape_n; - } - Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", 12); - goto escape_done; -backslash_escape_n: - if (I0 != 110) { - goto backslash_escape_r; - } - Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", 10); - goto escape_done; -backslash_escape_r: - if (I0 != 114) { - goto backslash_escape_t; - } - Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", 13); - goto escape_done; -backslash_escape_t: - if (I0 != 116) { - goto backslash_escape_v; - } - Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", 9); - goto escape_done; -backslash_escape_v: - if (I0 != 118) { - goto backslash_escape_not; + switch (I0) { + case 'a': + Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", 7); + break; + case 'b': + Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", 8); + break; + case 'f': + Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", 12); + break; + case 'n': + Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", 10); + break; + case 'r': + Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", 13); + break; + case 't': + Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", 9); + break; + case 'v': + Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", 11); + break; + /* + * Unicode Excape: + * Take 1-4 hex digits. + */ + case 'u': + I0 = 0; + while (escape_length <= 4) { + I1 = string_index(INTERP, buffer, start_word + escape_length); + if (I1 >='0' && I1 <='9') { + I0 = I0 * 16 + I1-'0'; + } else if (I1>='A' && I1<='F') { + I0 = I0 * 16 + (I1 - 'A' + 10); + } else if (I1>='a' && I1<='f') { + I0 = I0 * 16 + (I1 - 'a' + 10); + } else { + break; + } + escape_length++; + } + if (escape_length == 1) { + Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", 'u'); + } else { + Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", I0); + } + break; + /* + * Hex Excape: + * Take as many hex digit as we can get, but only the last + * two are used to generate the character. + */ + case 'x': + /* loop, getting characters. Keep a running total in I0 */ + I0 = 0; + while (1) { + I1 = string_index(INTERP, buffer, start_word + escape_length); + if (I1 >='0' && I1 <='9') { + I0 = I0 * 16 + I1-'0'; + } else if (I1>='A' && I1<='F') { + I0 = I0 * 16 + (I1 - 'A' + 10); + } else if (I1>='a' && I1<='f') { + I0 = I0 * 16 + (I1 - 'a' + 10); + } else { + break; + } + I0 = I0 % 256; /* Toss away anything bigger than two digits. */ + escape_length++; + } + if (escape_length == 1) { + Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", 120); + } else { + Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", I0); + } + break; + /* Octal Escape (XXX: Too ASCII?) */ + /* We can take one to three digits. */ + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + I0 -= '0'; + I1 = string_index(INTERP, buffer, start_word+1); + if (I1 >='0' && I1 <='9') { + escape_length++; + I0 = 8*I0 + I1-'0'; + I1 = string_index(INTERP, buffer, start_word+2); + if (I1 >='0' && I1 <='9') { + escape_length++; + I0 = 8*I0 + I1-'0'; + } + } + Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", I0); + break; + /* A backslash that didn't have any special meaning */ + default: + Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", I0); } - Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", 11); - goto escape_done; - -/* XXX - here is where the \o, \x, \u escapes will go, pending more - trans-charset joy */ - -backslash_escape_not: - Parrot_call_method(INTERP, P1, word, ConcatChar, "vI", I0); - + escape_done: - /* skip the escaped char*/ + /* skip the escaped char(s) */ start_word = start_word + escape_length; goto middle_word; @@ -649,21 +665,26 @@ end_scope: bufferlen = string_length(INTERP, buffer); opener = string_index(INTERP, buffer, index); - if (opener == 123) { /* {} */ - hierarchical = 1; - closer = 125; - } else if (opener == 91) { /* [] */ - hierarchical = 1; - closer = 93; - } else if (opener == 34) { /* "" */ - closer = opener; - } else if (opener == 40) { /* () */ - hierarchical = 1; - closer = 41; - } else { - return -1; - /* invalid matching char. */ - /* XXX - generate an exception? */ + switch (opener) { + case '{': + hierarchical = 1; + closer = '}'; + break; + case '[': + hierarchical = 1; + closer = ']'; + break; + case '(': + hierarchical = 1; + closer = ')'; + break; + case '"': + closer = '"'; + break; + default: + return -1; + /* invalid matching char. */ + /* XXX - generate an exception? */ } /* state == 1, normal; 2, saw a backslash */ @@ -685,7 +706,7 @@ end_scope: } } else if ((peekchar == opener) && hierarchical) { count++; - } else if (peekchar == 92) { + } else if (peekchar == '\\') { state = 2; } }