If you make True/False tokens, that generally means that they are reserved words in your language. That means that nobody can use them for other purposes.
If you want to let people use the words "True" and "False" for other purposes, then you probably don't want them to be tokens. In this case, you will probably end up doing a lookup at runtime, which might find some other value. In python 2.5, for example, you can use True and False for other purposes: def False(): print "hi mom!" is OK. But this becomes illegal in python 3.0. I've attached a scanner that does indenting ala python (i.e., the programmer can indent any amount so long as they line up). -bruce Alex_Gaynor wrote: > I'm looking to implement a boolean type in my language, where exactly > should I do this? Should I make True and False both be tokens, and > just set t.value = True/False. Or should the parser handle them? > > Also, how would I go about implementing a language that uses indent/ > dedent for blocks(ala python)? > > Alex > > > > --~--~---------~--~----~------------~-------~--~----~ You received this message because you are subscribed to the Google Groups "ply-hack" group. To post to this group, send email to [email protected] To unsubscribe from this group, send email to [EMAIL PROTECTED] For more options, visit this group at http://groups.google.com/group/ply-hack?hl=en -~----------~----~----~----~------~----~------~--~---
# $Id: scanner.py 150 2008-09-24 19:46:26Z mtnyogi $ # coding=utf-8 # # Copyright © 2007-2008 Bruce Frederiksen # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. """ See http://www.dabeaz.com/ply/ply.html for syntax of grammer definitions. """ from __future__ import with_statement import string from ply import lex debug=0 kfb_mode = False states = ( ('indent', 'exclusive'), ('code', 'exclusive'), ('checknl', 'exclusive'), ) kfb_keywords = frozenset(( 'False', 'None', 'True', )) keywords = frozenset(( 'as', 'assert', 'bc_extras', 'check', 'extending', 'False', 'fc_extras', 'first', 'forall', 'foreach', 'in', 'None', 'notany', 'plan_extras', 'python', 'require', 'step', 'taking', 'True', 'use', 'when', 'with', 'without', )) base_kfb_tokens = ( # 'DATE_TOK', # FIX: Add the definition for this! 
'IDENTIFIER_TOK', # 'LB_TOK', # 'LC_TOK', 'LP_TOK', 'NL_TOK', 'NUMBER_TOK', # 'RB_TOK', # 'RC_TOK', 'RP_TOK', 'STRING_TOK', ) base_krb_tokens = base_kfb_tokens + ( 'ANONYMOUS_VAR_TOK', 'CODE_TOK', 'DEINDENT_TOK', 'INDENT_TOK', 'NOT_NL_TOK', 'PATTERN_VAR_TOK', ) kfb_tokens = tuple(x.upper() + '_TOK' for x in kfb_keywords) + base_kfb_tokens tokens = tuple(x.upper() + '_TOK' for x in keywords) + base_krb_tokens literals = '*:,!.=' # FIX: delete ':' t_ignore = ' \t' t_ignore_comment = r'\#.*' def t_continuation(t): r'\\(\r)?\n' t.lexer.lineno += 1 def t_NL_TOK(t): # newline, followed by any number of empty or comment only lines r'(\r)?\n([ \t]*(\#.*)?(\r)?\n)*' t.lexer.lineno += t.value.count('\n') if kfb_mode: return t if nesting_level == 0: t.lexer.begin('indent') t.lexer.skip(-1) # put the final '\n' back for tp_indent_sp! return t indent_levels = [] # to prevent getting a warning... t_indent_ignore = '' def t_indent_sp(t): # ply doesn't like re's that can be empty, so we'll include the prior # newline char in the re and then skip over it when we count the indent # level. The tp_NL_TOK function does a skip(-1) to retain the final '\n' # for t_indent_sp. 
r'\n[ \t]*' indent = count_indent(t.value[1:])[0] current_indent = indent_levels[-1] if indent_levels else 0 if debug: print "t_indent_sp: t.value", repr(t.value), "indent", indent, \ "current_indent", current_indent, \ "indent_levels", indent_levels, \ "t.lexpos", t.lexpos, \ "t.lexer.lexpos", t.lexer.lexpos, \ "t.lexer.lexdata[]", repr(t.lexer.lexdata[t.lexpos]) if indent > current_indent: t.type = 'INDENT_TOK' indent_levels.append(indent) t.lexer.begin('INITIAL') if debug: print "INDENT_TOK: indent_levels", indent_levels return t if indent < current_indent: if indent > 0 and indent not in indent_levels: raise SyntaxError( "deindent doesn't match any previous indent level", syntaxerror_params(t.lexpos)) t.type = 'DEINDENT_TOK' del indent_levels[-1] if indent < (indent_levels[-1] if indent_levels else 0): if debug: print " -- pushing indent back" t.lexer.skip(-len(t.value)) else: if debug: print " -- doing begin('INITIAL')" t.lexer.begin('INITIAL') if debug: print "DEINDENT_TOK: indent_levels", indent_levels return t # else indent == current_indent t.lexer.begin('INITIAL') if debug: print "no indent: indent_levels", indent_levels t_checknl_ignore = ' \t' def t_checknl_nl(t): # optional comment followed by newline r'(\#.*)?(\r)?\n' t.lexer.lineno += 1 t.lexer.begin('indent') t.lexer.skip(-1) # put the final '\n' back for tp_indent_sp! t.type = 'NL_TOK' return t def t_checknl_other(t): # something other than newline r'[^\#\r\n]' t.lexer.skip(-1) # put the final char back! 
t.type = 'NOT_NL_TOK' return t def start_code(plan_name = None, multiline = False, var_format = "(context['%s'])"): global current_line, code, current_plan_name, code__level global pattern_var_format, plan_vars_needed, code_nesting_level global code_lineno, code_lexpos global code_indent_level pattern_var_format = var_format plan_vars_needed = [] current_line = '' code = [] if multiline: code_indent_level = indent_levels[-1] else: code_indent_level = 1000000000 current_plan_name = plan_name code_nesting_level = 0 code_lineno = code_lexpos = None lexer.begin('code') def mark(t): global code_lineno, code_lexpos if code_lineno is None: code_lineno = t.lexer.lineno code_lexpos = t.lexpos # to prevent getting a warning... t_code_ignore = '' def t_code_string(t): r"'''([^\\]|\\.)*?'''|" \ r'"""([^\\]|\\.)*?"""|' \ r"'([^'\\\n\r]|\\.|\\(\r)?\n)*?'|" \ r'"([^"\\\n\r]|\\.|\\(\r)?\n)*?"' global current_line current_line += t.value mark(t) if debug: print "scanner saw string:", t.value t.lexer.lineno += t.value.count('\n') def t_code_comment(t): r'[ \t\f\r]*\#.*' global current_line if debug: print "scanner saw comment:", t.value #current_line += t.value def t_code_plan(t): r'\$\$' global current_line mark(t) if debug: print "scanner saw '$$', current_plan_name is", current_plan_name if not current_plan_name: raise SyntaxError("'$$' only allowed in plan_specs within the " "'when' clause", syntaxerror_params(t.lexpos)) current_line += pattern_var_format % current_plan_name plan_vars_needed.append(current_plan_name) def t_code_pattern_var(t): r'\$[a-zA-Z_][a-zA-Z0-9_]*\b' global current_line mark(t) if not pattern_var_format: raise SyntaxError("$<name> only allowed in backward chaining rules", syntaxerror_params(t.lexpos)) current_line += pattern_var_format % t.value[1:] plan_vars_needed.append(t.value[1:]) if debug: print "scanner saw pattern_var:", t.value def t_code_continuation(t): r'\\(\r)?\n' global current_line t.lexer.lineno += 1 current_line += '\\' 
code.append(current_line) current_line = '' if debug: print "scanner saw continuation:", t.value def t_code_open(t): r'[{([]' global current_line, code_nesting_level mark(t) code_nesting_level += 1 current_line += t.value def t_code_close(t): r'[]})]' global current_line, code_nesting_level mark(t) if code_nesting_level <= 0: raise SyntaxError("unmatched %s" % repr(t.value), syntaxerror_params(t.lexpos)) code_nesting_level -= 1 current_line += t.value def t_code_symbol(t): r'''[0-9a-zA-Z_]+''' global current_line mark(t) current_line += t.value if debug: print "scanner saw symbol:", t.value def t_code_space(t): r'''[ \t]+''' global current_line current_line += t.value if debug: print "scanner saw space chars:", t.value def t_code_other(t): r'''[^][(){}$\\'"\r\n0-9a-zA-Z_ \t]+''' global current_line mark(t) current_line += t.value if debug: print "scanner saw other chars:", t.value def t_code_NL_TOK(t): r'(\r)?\n([ \t]*(\#.*)?(\r)?\n)*[ \t]*' global current_line if current_line: code.append(current_line) current_line = '' indent = count_indent(t.value[t.value.rindex('\n') + 1:])[0] if debug: print "scanner saw nl:", t.value, "new indent is", indent if indent < code_indent_level and code_nesting_level == 0: t.lexer.skip(-len(t.value)) t.type = 'CODE_TOK' t.value = tuple(code), tuple(plan_vars_needed), code_lineno, code_lexpos if debug: print "scanner begin('INITIAL')" t.lexer.begin('INITIAL') return t t.lexer.lineno += t.value.count('\n') current_line = ' ' * (indent - code_indent_level) # strings: def t_tsqstring(t): r"[uU]?[rR]?'''([^\\]|\\.)*?'''" #t.value = unquote(t.value[3:-3]) t.type = 'STRING_TOK' t.lexer.lineno += t.value.count('\n') return t def t_tdqstring(t): r'[uU]?[rR]?"""([^\\]|\\.)*?"""' #t.value = unquote(t.value[3:-3]) t.type = 'STRING_TOK' t.lexer.lineno += t.value.count('\n') return t def t_sqstring(t): r"[uU]?[rR]?'([^'\\\n\r]|\\.|\\(\r)?\n)*?'" #t.value = unquote(t.value[1:-1]) t.lexer.lineno += t.value.count('\n') t.type = 'STRING_TOK' return t 
def t_dqstring(t): r'[uU]?[rR]?"([^"\\\n\r]|\\.|\\(\r)?\n)*?"' #t.value = unquote(t.value[1:-1]) t.type = 'STRING_TOK' t.lexer.lineno += t.value.count('\n') return t # end strings def t_ANONYMOUS_VAR_TOK(t): r'\$_([a-zA-Z_][a-zA-Z0-9_]*)?' if kfb_mode: t_ANY_error(t) t.value = "'" + t.value[1:] + "'" return t def t_PATTERN_VAR_TOK(t): r'\$[a-zA-Z][a-zA-Z0-9_]*' if kfb_mode: t_ANY_error(t) t.value = "'" + t.value[1:] + "'" return t def t_IDENTIFIER_TOK(t): r'[a-zA-Z_][a-zA-Z0-9_]*' if kfb_mode and t.value in kfb_keywords or \ not kfb_mode and t.value in keywords: t.type = t.value.upper() + '_TOK' return t # numbers: def t_float(t): r'[-+]?([0-9]+(\.[0-9]*([eE][-+][0-9]+)?|[eE][-+][0-9]+)|\.[0-9]+([eE][-+][0-9]+)?)' t.value = float(t.value) t.type = 'NUMBER_TOK' return t def t_hexint(t): r'[-+]?0[xX][0-9a-fA-F]+' t.value = int(t.value, 16) t.type = 'NUMBER_TOK' return t def t_octalint(t): r'[-+]?0[0-7]*' t.value = int(t.value, 8) t.type = 'NUMBER_TOK' return t def t_int(t): r'[-+]?[1-9][0-9]*' t.value = int(t.value) t.type = 'NUMBER_TOK' return t # end numbers nesting_level = 0 def t_LB_TOK(t): r'\[' global nesting_level nesting_level += 1 #return t def t_LC_TOK(t): r'\{' global nesting_level nesting_level += 1 #return t def t_LP_TOK(t): r'\(' global nesting_level nesting_level += 1 return t def t_RB_TOK(t): r'\]' global nesting_level assert nesting_level > 0 nesting_level -= 1 #return t def t_RC_TOK(t): r'\}' global nesting_level assert nesting_level > 0 nesting_level -= 1 #return t def t_RP_TOK(t): r'\)' global nesting_level assert nesting_level > 0 nesting_level -= 1 return t def t_ANY_error(t): raise SyntaxError("illegal character %s" % repr(t.value[0]), syntaxerror_params(t.lexpos)) # helper functions: def count_indent(s, count_all=False): r''' >>> count_indent('') (0, 0) >>> count_indent(' ') (3, 3) >>> count_indent(' stuff') (3, 3) >>> count_indent('\t') (8, 1) >>> count_indent('\t ') (9, 2) >>> count_indent('\t\t') (16, 2) >>> count_indent(' \t') (8, 4) >>> 
count_indent(' \t') (8, 8) >>> count_indent(' \t') (16, 9) >>> count_indent(' a\t', True) (8, 3) >>> count_indent(' a ', True) (3, 3) ''' indent = 0 chars = 0 for c in s: if c == '\t': indent = (indent + 8) & ~7 elif c == ' ' or count_all: indent += 1 else: break chars += 1 return indent, chars escapes = { 'a': '\a', 'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', 'v': '\v', '\\': '\\', '\'': '\'', '\"': '\"', } def unquote(s): start = 0 ans = [] i = s.find('\\', start) while i >= 0: ans.append(s[start:i]) e = escapes.get(s[i+1]) if e: # single char escape code ans.append(e) start = i + 2 elif s[i+1] == '\n': # ignore \ at end of line start = i + 2 elif s[i+1] == '\r': # ignore \ at end of line if s[i+2] == '\n': start = i + 3 else: start = i + 2 elif s[i+1:i+3] == 'N{': end = s.index('}', i + 3) ans.append(unicodedata.lookup(s[i+3:end])) start = end + 1 elif s[i+1] == 'u': ans.append(unichr(int(s[i+2:i+6], 16))) start = i + 6 elif s[i+1] == 'U': ans.append(unichr(int(s[i+2:i+10], 16))) start = i + 10 elif s[i+1] in string.octdigits: if s[i+2] not in string.octdigits: ans.append(unichr(int(s[i+2:i+3], 8))) start = i + 3 elif s[i+3] not in string.octdigits: ans.append(unichr(int(s[i+2:i+4], 8))) start = i + 4 else: ans.append(unichr(int(s[i+2:i+5], 8))) start = i + 5 elif s[i+1] == 'x': if s[i+3] not in string.hexdigits: ans.append(unichr(int(s[i+2:i+3], 16))) start = i + 3 else: ans.append(unichr(int(s[i+2:i+4], 16))) start = i + 4 else: ans.append(s[i]) start = i + 1 i = s.find('\\', start) ans.append(s[start:]) return ''.join(ans) lexer = lex.lex(debug=0) class token_iterator(object): ''' This is only used for testing the scanner. ''' def __init__(self, input): lexer.lineno = 1 lexer.input(input) def __iter__(self): return self def next(self): t = lex.token() if t: return t raise StopIteration def tokenize(s): r''' >>> tokenize("# This is a comment\n# line 2 of comment\n\n" ... 
"# comment after blank line\n") LexToken(NL_TOK,'\n# line 2 of comment\n\n# comment after blank line\n',1,19) >>> tokenize('name1\n forall foreach\n \nname2') LexToken(IDENTIFIER_TOK,'name1',1,0) LexToken(NL_TOK,'\n',1,5) LexToken(INDENT_TOK,'\n ',2,5) LexToken(FORALL_TOK,'forall',2,10) LexToken(FOREACH_TOK,'foreach',2,19) LexToken(NL_TOK,'\n \n',2,26) LexToken(DEINDENT_TOK,'\n',4,38) LexToken(IDENTIFIER_TOK,'name2',4,39) ''' for t in token_iterator(s): print t def tokenize_file(filename = 'TEST/scan_test'): r""" Used for testing. >>> import os, os.path >>> tokenize_file('TEST/scan_test' ... if os.path.split(os.getcwd())[1] == 'krb_compiler' ... else 'krb_compiler/TEST/scan_test') LexToken(NL_TOK,'\n# line 2 of comment\n\n# comment after blank line\n',1,19) LexToken(IDENTIFIER_TOK,'name1',5,68) LexToken(:,':',5,73) LexToken(NL_TOK,'\n',5,74) LexToken(INDENT_TOK,'\n ',6,74) LexToken(FOREACH_TOK,'foreach',6,79) LexToken(NL_TOK,'\n',6,86) LexToken(INDENT_TOK,'\n\t',7,86) LexToken(LP_TOK,'(',7,88) LexToken(NUMBER_TOK,100,7,89) LexToken(NUMBER_TOK,64,7,93) LexToken(ANONYMOUS_VAR_TOK,"'_'",7,98) LexToken(PATTERN_VAR_TOK,"'foo'",7,101) LexToken(NUMBER_TOK,256,8,118) LexToken(NUMBER_TOK,0,8,124) LexToken(RP_TOK,')',8,125) LexToken(NL_TOK,'\n',8,126) LexToken(NUMBER_TOK,3.1400000000000001,9,129) LexToken(NUMBER_TOK,0.98999999999999999,9,134) LexToken(NUMBER_TOK,3.0,10,143) LexToken(NUMBER_TOK,0.29999999999999999,10,146) LexToken(NUMBER_TOK,3,10,149) LexToken(IDENTIFIER_TOK,'e6',10,150) LexToken(NUMBER_TOK,3.0000000000000001e-06,10,153) LexToken(NL_TOK,'\n',10,158) LexToken(DEINDENT_TOK,'\n ',11,158) LexToken(ASSERT_TOK,'assert',11,163) LexToken(NL_TOK,'\n',11,169) LexToken(INDENT_TOK,'\n\t',12,169) LexToken(STRING_TOK,"'this is a string'",12,172) LexToken(STRING_TOK,'"so is this"',12,191) LexToken(STRING_TOK,"'''\n\tand this \\t too'''",12,204) LexToken(STRING_TOK,"'should be\\\n able to do this too'",13,229) LexToken(TRUE_TOK,'True',15,278) LexToken(NL_TOK,'\n',15,283) 
LexToken(!,'!',16,292) LexToken(IDENTIFIER_TOK,'can',16,293) LexToken(IDENTIFIER_TOK,'I',17,311) LexToken(IDENTIFIER_TOK,'do',17,313) LexToken(IDENTIFIER_TOK,'this',17,316) LexToken(NL_TOK,'\n',17,320) LexToken(IDENTIFIER_TOK,'too',18,329) LexToken(NL_TOK,'\n',18,332) LexToken(DEINDENT_TOK,'\n',19,332) LexToken(DEINDENT_TOK,'\n',19,332) """ with open(filename) as f: tokenize(f.read()) def syntaxerror_params(pos = None, lineno = None): ''' Returns (filename, lineno, column, line) for use in as the second argument to SyntaxError exceptions. ''' if pos is None: pos = lexer.lexpos start = pos if lineno is None: lineno = lexer.lineno while start > 0 and (start >= len(lexer.lexdata) or lexer.lexdata[start] in '\r\n'): start -= 1 end = start if debug: print "pos", pos, "lineno", lineno, "start", start start = max(lexer.lexdata.rfind('\r', 0, start), lexer.lexdata.rfind('\n', 0, start)) + 1 column = pos - start + 1 end1 = lexer.lexdata.find('\r', end) end2 = lexer.lexdata.find('\n', end) if end1 < 0: end = end2 elif end2 < 0: end = end1 else: end = min(end1, end2) if debug: print "start", start, "column", column, "end", end return (lexer.filename, lineno, column, lexer.lexdata[start:end]) def init(kfb = False): global indent_levels, nesting_level, kfb_mode indent_levels = [] nesting_level = 0 kfb_mode = kfb def test(): import doctest import sys sys.exit(doctest.testmod()[0]) if __name__ == "__main__": test()
