On Tue, Feb 10, 2009 at 5:54 PM, Kent Johnson <[email protected]> wrote:
> Another attempt attached; it recognizes the n. separator and gets the last
> item.
And here is the actual attachment.
Kent
# Parser for legal citations, PLY version
# This version doesn't parse the names
from ply import lex, yacc
# Debug flag; passed as the debug= argument of parser.parse() in the
# __main__ block at the bottom of the file.
debug = 0
# Sample input for the parser: excerpts of court opinions in which legal
# citations ("<volume> <reporter> <page>[, <page>...] (<court/year>)") are
# mixed with party names and ordinary prose. The parser is expected to pick
# the citations out and skip everything else.
text = """Indemnified Capital Investments, S.A. v. R.J. O'Brien & Assoc., Inc., 12 F.3d 1406, 1409 (7th Cir.1993).
Hunt v. Washington Apple Advertising Commn., 432 U.S. 333, 343, 97 S.Ct. 2434, 2441, 53 L.Ed.2d 383 (1977)
Smith v. Wisconsin Dept. of Agriculture, 23 F.3d 1134, 1141 (7th Cir.1994)
see also Warth v. Seldin, 422 U.S. 490, 499 n. 10, 95 S.Ct. 2197, 2205 n. 10, 45 L.Ed.2d 343 (1975)
Idaho Conservation League v. Mumma, 956 F.2d 1508, 1517-18 (9th Cir.1992)
NFMA, NEPA, or MUSYA. Sierra Club v. Marita, 843 F.Supp. 1526 (E.D.Wis.1994) ("Nicolet ").
Page 500 Carter v. Jury Commission of Greene County, 396 U.S. 320, 90 S.Ct. 518, 24 L.Ed.2d 549 (1970);
Lathe Turner v. Fouche, 396 U.S. 346, 90 S.Ct. 532, 24 L.Ed.2d 567 (1970);
White v. Crook, 251 F.Supp. 401 (DCMD Ala.1966).
Moreover, the Court has also recognized that the exclusion of a discernible class from jury service
injures not only those defendants who belong to the excluded class,
but other defendants as well, in that it destroys the possibility
that the jury will reflect a representative cross section of the community.
In John Doggone Williams v. Florida, 399 U.S. 78, 90 S.Ct. 1893, 234, 26 L.Ed.2d 446 (1970)"""
# Names of the token types used by the lexer rules and the grammar below.
tokens = ('NUMBER', 'MIXED', 'N', 'YEAR')

# Single-character tokens: comma, parentheses and hyphen.
literals = ",()-"
# Regular expression rules for simple tokens
# NUMBER: one or more digits.
t_NUMBER = r'\d+'
# MIXED: a letter followed by at least one more letter, digit, dot or
# apostrophe -- reporter abbreviations (e.g. "F.3d", "U.S.") and name words
# after the first word. A lone single letter does not match this rule.
t_MIXED = r'[A-Za-z][A-Za-z.0-9\']+'
# YEAR: an entire parenthesised group. Note: despite the name, the "year" can
# contain multiple words and non-numeric text, e.g. "(7th Cir.1993)".
t_YEAR = r'\([^)]+\)'
# A string containing ignored characters (space, tab, carriage return, newline)
t_ignore = ' \t\r\n'
def t_N(t):
    r'n\.'
    # NOTE: the raw string above is the token's regular expression (PLY reads
    # it from the docstring), so it is code, not documentation.
    #
    # Matches the "n." footnote separator, as in "499 n. 10", and returns the
    # token unchanged.
    # NOTE(review): presumably written as a function so that PLY tries it
    # before the string-defined t_MIXED rule -- confirm against the PLY docs.
    return t
# Error handling rule
def t_error(t):
    # Called with a token `t` for input that no rule matches. The sample text
    # is full of ordinary prose, so the offending character is silently
    # skipped (one character at a time) instead of being reported.
    t.lexer.skip(1)
# Build the lexer. It is kept in a module-level name because test_lexer()
# below feeds input to it directly.
lexer = lex.lex()
def test_lexer(data):
    # Debugging helper: run the module-level `lexer` over the string `data`
    # and print every token it produces, one per line. Returns nothing.
    lexer.input(data)
    # Tokenize
    while True:
        tok = lexer.token()
        if not tok:
            break  # No more input
        # Single-argument call form prints the same on Python 2 and Python 3.
        print(tok)
# Parser productions
def p_Page(p):
    '''page : NUMBER
| NUMBER '-' NUMBER
| NUMBER N NUMBER'''
    # NOTE: the string above is the PLY grammar rule for this production, not
    # ordinary documentation; its text is read at runtime.
    #
    # Builds the page part of a citation and stores it in p[0]:
    #   NUMBER             -> "1409"
    #   NUMBER '-' NUMBER  -> "1517-18"    (page range, joined without spaces)
    #   NUMBER N NUMBER    -> "499 n. 10"  (page plus footnote number)
    # p[1], p[2], p[3] are the values of the matched symbols.
    if len(p) == 2:
        # Only one symbol on the right-hand side: the plain page number.
        p[0] = p[1]
    elif p[2] == 'n.':
        # Footnote form: re-insert the spaces around the "n." separator.
        p[0] = '%s n. %s' % (p[1], p[3])
    else:
        # Range form: concatenate number, hyphen, number.
        p[0] = p[1] + p[2] + p[3]
def p_Reference(p):
    '''reference : NUMBER MIXED page'''
    # NOTE: the string above is the PLY grammar rule, read at runtime.
    #
    # One reference is "<NUMBER> <MIXED> <page>", e.g. "12 F.3d 1406".
    # The three parts are joined with single spaces and stored in p[0].
    p[0] = '%s %s %s' % (p[1], p[2], p[3])
def p_Reference_List(p):
    '''reference_list : reference
| reference_list ',' page
| reference_list ',' reference
| reference_list ',' page ',' reference'''
    # NOTE: the string above is the PLY grammar rule, read at runtime.
    #
    # Accumulates the references of one citation as a list of strings in
    # p[0]. An extra page after a comma is appended to the text of the most
    # recent reference; a new reference after a comma becomes a new list item.
    # NOTE(review): the last alternative looks equivalent to applying the
    # second and then the third alternative -- confirm it is still needed.
    if len(p) == 2:
        p[0] = [p[1]]  # single reference starts the list
    elif len(p) == 4:
        # Alternatives 2 and 3 have the same length, so the grammar symbol
        # type of the third item tells them apart.
        if p.slice[3].type == 'reference':
            p[0] = p[1] + [p[3]]  # append new reference
        else:
            p[1][-1] += ', %s' % p[3]  # append page number to last reference
            p[0] = p[1]
    else:
        # page number and reference
        p[1][-1] += ', %s' % p[3]  # append page number to last reference
        p[0] = p[1] + [p[5]]  # append new reference
def p_Citation(p):
    '''citation : reference_list YEAR error
| reference_list YEAR'''
    # NOTE: the string above is the PLY grammar rule, read at runtime.
    #
    # A complete citation: its list of references (p[1]) followed by the
    # parenthesised court/year group (p[2]). Each reference is printed on its
    # own line with the year group appended, then a blank line ends the
    # citation. Output is the only result; p[0] is not set.
    # NOTE(review): the first alternative ends in PLY's `error` symbol,
    # presumably to absorb the prose that follows a citation -- confirm.
    for reference in p[1]:
        # Single-argument call form prints the same on Python 2 and Python 3.
        print('%s %s' % (reference, p[2]))
    # A bare `print` statement emits a blank line on Python 2 but is a silent
    # no-op expression on Python 3; print('') emits the blank line on both.
    print('')
def p_Citations(p):
    '''citations : citation
| citations citation'''
    # NOTE: the string above is the PLY grammar rule, read at runtime.
    #
    # Start symbol: one or more citations in sequence. Nothing is collected
    # here because each citation is printed by p_Citation as it is recognised.
    pass
def p_error(p):
    # Syntax-error hook for the parser. Errors are deliberately ignored: the
    # input contains names and free prose that the grammar does not describe,
    # so reporting every unexpected token would only produce noise.
    pass
# Start symbol of the grammar: the rule defined by p_Citations.
start = 'citations'
# Build the parser
parser = yacc.yacc()
if __name__ == '__main__':
    # Run as a script: parse the bundled sample text. The citations found are
    # printed by p_Citation during the parse.
    parser.parse(text, debug=debug)
_______________________________________________
Tutor maillist - [email protected]
http://mail.python.org/mailman/listinfo/tutor