# I ported part of the Geo Street Address from the perl module:

# >>>>>>>>address_regex.py
# Regular-expression building blocks for US street addresses.
#
# The composite patterns (STREET_REGEX, CITYSTATE_REGEX, PLACE_REGEX and
# ADDRESS_REGEX) contain layout whitespace and inline "#" comments, so they
# only work when compiled with re.VERBOSE.  All patterns are raw strings so
# that regex escapes such as \# and \/ are not (invalid) Python escapes.
from geocode.address_dicts import DIRECTIONS, STATE_CODES, STREET_TYPES


def _alternation(*word_groups):
    """Return a ``|``-joined alternation of every distinct word given.

    Longer words come first so that an unanchored match prefers "NORTHEAST"
    over its prefix "NORTH"; ties are ordered alphabetically so the resulting
    pattern is the same on every run.
    """
    words = set()
    for group in word_groups:
        words.update(group)
    return "|".join(sorted(words, key=lambda word: (-len(word), word)))


# Full names and their abbreviations are both accepted.
STREET_TYPE_REGEX = _alternation(STREET_TYPES.keys(), STREET_TYPES.values())
STATE_REGEX = _alternation(STATE_CODES.values())  # two-letter codes only
DIRECTIONS_REGEX = _alternation(DIRECTIONS.keys(), DIRECTIONS.values())
ZIP_REGEX = r"\d{5}(?:-*\d{4})?"
ZIP_LOOSE_REGEX = r"[\d-]+"  # To catch malformed zipcodes
CORNER_REGEX = r"(?:\bAND\b|\bAT\b|&|\@)"
UNIT_REGEX = r"(?:(?:su?i?te|p\W*[om]\W*b(?:ox)?|dept|apt|ro*m|fl|apt| unit|box)\W+|\#\W*)[\w-]+"
NUMBER_REGEX = r"\d+-?\d*"
FRACTION_REGEX = r"\d+\/\d+"

# Possible street combinations:
STREET_REGEX = r"""
    (?:
      # special cases like 100 South Street
      (?:
        (?P<street1>""" + DIRECTIONS_REGEX + r""")\W+
        (?P<street_type1>""" + STREET_TYPE_REGEX + r""")\b
      )
      |
      (?:(?P<pre_dir>""" + DIRECTIONS_REGEX + r""")\W+)?
      (?:
        (?P<street2>[^,]+)
        (?:[^\w,]+(?P<street_type2>""" + STREET_TYPE_REGEX + r""")\b)
        (?:[^\w,]+(?P<post_dir1>""" + DIRECTIONS_REGEX + r""")\b)?
        |
        (?P<street3>[^,]*\d)
        (?P<post_dir2>""" + DIRECTIONS_REGEX + r""")
        |
        (?P<street4>[^,]+?)
        (?:[^\w,]+(?P<street_type3>""" + STREET_TYPE_REGEX + r""")\b)?
        (?:[^\w,]+(?P<post_dir3>""" + DIRECTIONS_REGEX + r""")\b)?
      )
    )"""

CITYSTATE_REGEX = r"""
    (?:
      (?P<city>[^,]+?),\W+
      (?P<state>""" + STATE_REGEX + r""")\W*
    )?"""

PLACE_REGEX = CITYSTATE_REGEX + r"""(?:(?P<zip>""" + ZIP_REGEX + r"""))?"""

# ADDRESS_REGEX deliberately stops after the optional unit: parse_location()
# appends PLACE_REGEX and the closing "\W*$" anchor itself.
ADDRESS_REGEX = r"""^\W*
    (?P<number>""" + NUMBER_REGEX + r""")?\W*
    (?:""" + FRACTION_REGEX + r"""\W*)?   # We don't need to keep the fractional part
    """ + STREET_REGEX + r"""\W+
    (?:""" + UNIT_REGEX + r"""\W+)?       # We don't need to keep a unit part
    """

# NOTE(review): STREET_REGEX is embedded twice here, so every named group it
# defines appears twice; Python's re module rejects duplicate group names, so
# this pattern cannot be compiled as written.  Nothing in the code shown in
# this post uses it -- confirm before relying on it.
INTERSECTION_REGEX = (
    r"\W*" + STREET_REGEX
    + r"\W*?\s+" + CORNER_REGEX + r"\s+"
    + STREET_REGEX + r"\W+"
    + PLACE_REGEX
)
# <<<<<<<<<<<<<<<<<<<<<<<

# >>>>>>>>>>>>address_dicts.py
# Lookup tables mapping spelled-out (or misspelled) words to the abbreviation
# used in a normalized address.  All keys and values are upper case.

# Compass direction name -> abbreviation.
DIRECTIONS = {
    "NORTH": "N",
    "NORTHEAST": "NE",
    "EAST": "E",
    "SOUTHEAST": "SE",
    "SOUTH": "S",
    "SOUTHWEST": "SW",
    "WEST": "W",
    "NORTHWEST": "NW",
}

# State / territory name -> two-letter code.
STATE_CODES = {
    "ALABAMA": "AL", "ALASKA": "AK", "AMERICAN SAMOA": "AS",
    "ARIZONA": "AZ", "ARKANSAS": "AR", "CALIFORNIA": "CA",
    "COLORADO": "CO", "CONNECTICUT": "CT", "DELAWARE": "DE",
    "DISTRICT OF COLUMBIA": "DC", "FEDERATED STATES OF MICRONESIA": "FM",
    "FLORIDA": "FL", "GEORGIA": "GA", "GUAM": "GU", "HAWAII": "HI",
    "IDAHO": "ID", "ILLINOIS": "IL", "INDIANA": "IN", "IOWA": "IA",
    "KANSAS": "KS", "KENTUCKY": "KY", "LOUISIANA": "LA", "MAINE": "ME",
    "MARSHALL ISLANDS": "MH", "MARYLAND": "MD", "MASSACHUSETTS": "MA",
    "MICHIGAN": "MI", "MINNESOTA": "MN", "MISSISSIPPI": "MS",
    "MISSOURI": "MO", "MONTANA": "MT", "NEBRASKA": "NE", "NEVADA": "NV",
    "NEW HAMPSHIRE": "NH", "NEW JERSEY": "NJ", "NEW MEXICO": "NM",
    "NEW YORK": "NY", "NORTH CAROLINA": "NC", "NORTH DAKOTA": "ND",
    "NORTHERN MARIANA ISLANDS": "MP", "OHIO": "OH", "OKLAHOMA": "OK",
    "OREGON": "OR", "PALAU": "PW", "PENNSYLVANIA": "PA",
    "PUERTO RICO": "PR", "RHODE ISLAND": "RI", "SOUTH CAROLINA": "SC",
    "SOUTH DAKOTA": "SD", "TENNESSEE": "TN", "TEXAS": "TX", "UTAH": "UT",
    "VERMONT": "VT", "VIRGIN ISLANDS": "VI", "VIRGINIA": "VA",
    "WASHINGTON": "WA", "WEST VIRGINIA": "WV", "WISCONSIN": "WI",
    "WYOMING": "WY",
}

# Street-type spelling -> abbreviation.
STREET_TYPES = {
    "ALLEE": "ALY", "ALLEY": "ALY", "ALLY": "ALY", "ANEX": "ANX",
    "ANNEX": "ANX", "ANNX": "ANX", "ARCADE": "ARC", "AV": "AVE",
    "AVEN": "AVE", "AVENU": "AVE", "AVENUE": "AVE", "AVN": "AVE",
    "AVNUE": "AVE", "BAYOO": "BYU", "BAYOU": "BYU", "BEACH": "BCH",
    "BEND": "BND", "BLUF": "BLF", "BLUFF": "BLF", "BLUFFS": "BLFS",
    "BOT": "BTM", "BOTTM": "BTM", "BOTTOM": "BTM", "BOUL": "BLVD",
    "BOULEVARD": "BLVD", "BOULV": "BLVD", "BRANCH": "BR", "BRDGE": "BRG",
    "BRIDGE": "BRG", "BRNCH": "BR", "BROOK": "BRK", "BROOKS": "BRKS",
    "BURG": "BG", "BURGS": "BGS", "BYPA": "BYP", "BYPAS": "BYP",
    "BYPASS": "BYP", "BYPS": "BYP", "CAMP": "CP", "CANYN": "CYN",
    "CANYON": "CYN", "CAPE": "CPE", "CAUSEWAY": "CSWY", "CAUSWAY": "CSWY",
    "CEN": "CTR", "CENT": "CTR", "CENTER": "CTR", "CENTERS": "CTRS",
    "CENTR": "CTR", "CENTRE": "CTR", "CIRC": "CIR", "CIRCL": "CIR",
    "CIRCLE": "CIR", "CIRCLES": "CIRS", "CK": "CRK", "CLIFF": "CLF",
    "CLIFFS": "CLFS", "CLUB": "CLB", "CMP": "CP", "CNTER": "CTR",
    "CNTR": "CTR", "CNYN": "CYN", "COMMON": "CMN", "CORNER": "COR",
    "CORNERS": "CORS", "COURSE": "CRSE", "COURT": "CT", "COURTS": "CTS",
    "COVE": "CV", "COVES": "CVS", "CR": "CRK", "CRCL": "CIR",
    "CRCLE": "CIR", "CRECENT": "CRES", "CREEK": "CRK", "CRESCENT": "CRES",
    "CRESENT": "CRES", "CREST": "CRST", "CROSSING": "XING",
    "CROSSROAD": "XRD", "CRSCNT": "CRES", "CRSENT": "CRES",
    "CRSNT": "CRES", "CRSSING": "XING", "CRSSNG": "XING", "CRT": "CT",
    "CURVE": "CURV", "DALE": "DL", "DAM": "DM", "DIV": "DV",
    "DIVIDE": "DV", "DRIV": "DR", "DRIVE": "DR", "DRIVES": "DRS",
    "DRV": "DR", "DVD": "DV", "ESTATE": "EST", "ESTATES": "ESTS",
    "EXP": "EXPY", "EXPR": "EXPY", "EXPRESS": "EXPY", "EXPRESSWAY": "EXPY",
    "EXPW": "EXPY", "EXTENSION": "EXT", "EXTENSIONS": "EXTS",
    "EXTN": "EXT", "EXTNSN": "EXT", "FALLS": "FLS", "FERRY": "FRY",
    "FIELD": "FLD", "FIELDS": "FLDS", "FLAT": "FLT", "FLATS": "FLTS",
    "FORD": "FRD", "FORDS": "FRDS", "FOREST": "FRST", "FORESTS": "FRST",
    "FORG": "FRG", "FORGE": "FRG", "FORGES": "FRGS", "FORK": "FRK",
    "FORKS": "FRKS", "FORT": "FT", "FREEWAY": "FWY", "FREEWY": "FWY",
    "FRRY": "FRY", "FRT": "FT", "FRWAY": "FWY", "FRWY": "FWY",
    "GARDEN": "GDN", "GARDENS": "GDNS", "GARDN": "GDN", "GATEWAY": "GTWY",
    "GATEWY": "GTWY", "GATWAY": "GTWY", "GLEN": "GLN", "GLENS": "GLNS",
    "GRDEN": "GDN", "GRDN": "GDN", "GRDNS": "GDNS", "GREEN": "GRN",
    "GREENS": "GRNS", "GROV": "GRV", "GROVE": "GRV", "GROVES": "GRVS",
    "GTWAY": "GTWY", "HARB": "HBR", "HARBOR": "HBR", "HARBORS": "HBRS",
    "HARBR": "HBR", "HAVEN": "HVN", "HAVN": "HVN", "HEIGHT": "HTS",
    "HEIGHTS": "HTS", "HGTS": "HTS", "HIGHWAY": "HWY", "HIGHWY": "HWY",
    "HILL": "HL", "HILLS": "HLS", "HIWAY": "HWY", "HIWY": "HWY",
    "HLLW": "HOLW", "HOLLOW": "HOLW", "HOLLOWS": "HOLW", "HOLWS": "HOLW",
    "HRBOR": "HBR", "HT": "HTS", "HWAY": "HWY", "INLET": "INLT",
    "ISLAND": "IS", "ISLANDS": "ISS", "ISLES": "ISLE", "ISLND": "IS",
    "ISLNDS": "ISS", "JCTION": "JCT", "JCTN": "JCT", "JCTNS": "JCTS",
    "JUNCTION": "JCT", "JUNCTIONS": "JCTS", "JUNCTN": "JCT",
    "JUNCTON": "JCT", "KEY": "KY", "KEYS": "KYS", "KNOL": "KNL",
    "KNOLL": "KNL", "KNOLLS": "KNLS", "LA": "LN", "LAKE": "LK",
    "LAKES": "LKS", "LANDING": "LNDG", "LANE": "LN", "LANES": "LN",
    "LDGE": "LDG", "LIGHT": "LGT", "LIGHTS": "LGTS", "LNDNG": "LNDG",
    "LOAF": "LF", "LOCK": "LCK", "LOCKS": "LCKS", "LODG": "LDG",
    "LODGE": "LDG", "LOOPS": "LOOP", "MANOR": "MNR", "MANORS": "MNRS",
    "MEADOW": "MDW", "MEADOWS": "MDWS", "MEDOWS": "MDWS", "MILL": "ML",
    "MILLS": "MLS", "MISSION": "MSN", "MISSN": "MSN", "MNT": "MT",
    "MNTAIN": "MTN", "MNTN": "MTN", "MNTNS": "MTNS", "MOTORWAY": "MTWY",
    "MOUNT": "MT", "MOUNTAIN": "MTN", "MOUNTAINS": "MTNS",
    "MOUNTIN": "MTN", "MSSN": "MSN", "MTIN": "MTN", "NECK": "NCK",
    "ORCHARD": "ORCH", "ORCHRD": "ORCH", "OVERPASS": "OPAS", "OVL": "OVAL",
    "PARKS": "PARK", "PARKWAY": "PKWY", "PARKWAYS": "PKWY",
    "PARKWY": "PKWY", "PASSAGE": "PSGE", "PATHS": "PATH", "PIKES": "PIKE",
    "PINE": "PNE", "PINES": "PNES", "PK": "PARK", "PKWAY": "PKWY",
    "PKWYS": "PKWY", "PKY": "PKWY", "PLACE": "PL", "PLAIN": "PLN",
    "PLAINES": "PLNS", "PLAINS": "PLNS", "PLAZA": "PLZ", "PLZA": "PLZ",
    "POINT": "PT", "POINTS": "PTS", "PORT": "PRT", "PORTS": "PRTS",
    "PRAIRIE": "PR", "PRARIE": "PR", "PRK": "PARK", "PRR": "PR",
    "RAD": "RADL", "RADIAL": "RADL", "RADIEL": "RADL", "RANCH": "RNCH",
    "RANCHES": "RNCH", "RAPID": "RPD", "RAPIDS": "RPDS", "RDGE": "RDG",
    "REST": "RST", "RIDGE": "RDG", "RIDGES": "RDGS", "RIVER": "RIV",
    "RIVR": "RIV", "RNCHS": "RNCH", "ROAD": "RD", "ROADS": "RDS",
    "ROUTE": "RTE", "RVR": "RIV", "SHOAL": "SHL", "SHOALS": "SHLS",
    "SHOAR": "SHR", "SHOARS": "SHRS", "SHORE": "SHR", "SHORES": "SHRS",
    "SKYWAY": "SKWY", "SPNG": "SPG", "SPNGS": "SPGS", "SPRING": "SPG",
    "SPRINGS": "SPGS", "SPRNG": "SPG", "SPRNGS": "SPGS", "SPURS": "SPUR",
    "SQR": "SQ", "SQRE": "SQ", "SQRS": "SQS", "SQU": "SQ", "SQUARE": "SQ",
    "SQUARES": "SQS", "STATION": "STA", "STATN": "STA", "STN": "STA",
    "STR": "ST", "STRAV": "STRA", "STRAVE": "STRA", "STRAVEN": "STRA",
    "STRAVENUE": "STRA", "STRAVN": "STRA", "STREAM": "STRM",
    "STREET": "ST", "STREETS": "STS", "STREME": "STRM", "STRT": "ST",
    "STRVN": "STRA", "STRVNUE": "STRA", "SUMIT": "SMT", "SUMITT": "SMT",
    "SUMMIT": "SMT", "TERR": "TER", "TERRACE": "TER", "THROUGHWAY": "TRWY",
    "TPK": "TPKE", "TR": "TRL", "TRACE": "TRCE", "TRACES": "TRCE",
    "TRACK": "TRAK", "TRACKS": "TRAK", "TRAFFICWAY": "TRFY",
    "TRAIL": "TRL", "TRAILS": "TRL", "TRK": "TRAK", "TRKS": "TRAK",
    "TRLS": "TRL", "TRNPK": "TPKE", "TRPK": "TPKE", "TUNEL": "TUNL",
    "TUNLS": "TUNL", "TUNNEL": "TUNL", "TUNNELS": "TUNL", "TUNNL": "TUNL",
    "TURNPIKE": "TPKE", "TURNPK": "TPKE", "UNDERPASS": "UPAS",
    "UNION": "UN", "UNIONS": "UNS", "VALLEY": "VLY", "VALLEYS": "VLYS",
    "VALLY": "VLY", "VDCT": "VIA", "VIADCT": "VIA", "VIADUCT": "VIA",
    "VIEW": "VW", "VIEWS": "VWS", "VILL": "VLG", "VILLAG": "VLG",
    "VILLAGE": "VLG", "VILLAGES": "VLGS", "VILLE": "VL", "VILLG": "VLG",
    "VILLIAGE": "VLG", "VIST": "VIS", "VISTA": "VIS", "VLLY": "VLY",
    "VST": "VIS", "VSTA": "VIS", "WALKS": "WALK", "WELL": "WL",
    "WELLS": "WLS", "WY": "WAY",
}
# <<<<<<<<<<<<<<<<<<<<<<<

# >>>>>>>>>>>>>>>>>> address.py
import re
from geocode.address_dicts import *
from geocode.address_regex import *
from geocode.models import GeocodeCache, GeocodeError
from geocode import app_settings
from geopy import geocoders

# These are the possible order of pieces of an address. We are going to parse
# in reverse order.
class Address(object):
    """Hold the individual pieces of a parsed street address.

    Every piece is kept as a string attribute; a missing piece is ''.
    ``str(address)`` renders the street line and the place line
    ("city, state zip5") joined by ", ".
    """

    def __init__(self, zipcode='', state='', city='', unit='', post_dir='',
                 street_type='', street='', pre_dir='', fraction='',
                 number=''):
        # Store what the caller passed (previously every argument was
        # silently discarded).  None is treated as "missing".
        self.zipcode = zipcode or ''
        self.state = state or ''
        self.city = city or ''
        self.unit = unit or ''
        self.post_dir = post_dir or ''
        self.street_type = street_type or ''
        self.street = street or ''
        self.pre_dir = pre_dir or ''
        self.fraction = fraction or ''
        self.number = number or ''

    def __str__(self):
        # Attributes may have been set to None from outside; render as ''.
        street_pieces = (self.number, self.fraction, self.pre_dir,
                         self.street, self.street_type, self.post_dir,
                         self.unit)
        street_line = " ".join(piece or '' for piece in street_pieces)
        # convert multiple spaces into 1
        street_line = re.sub(" +", " ", street_line.strip())

        # Only the 5-digit part of the zip code is shown.
        place = "%s, %s %s" % (self.city or '', self.state or '',
                               (self.zipcode or '')[:5])
        place = re.sub(" +", " ", place)           # collapse spaces
        place = re.sub(",$", "", place.strip())    # trailing space/comma
        place = re.sub("^, ", "", place)           # leading comma space

        if street_line and place:
            return street_line + ", " + place
        return street_line + place


def _get_part(addr_parts, name):
    """Read one address piece from an Address or a mapping ('' if absent)."""
    if isinstance(addr_parts, Address):
        return getattr(addr_parts, name, '') or ''
    try:
        return addr_parts[name] or ''
    except KeyError:
        return ''


def _set_part(addr_parts, name, value):
    """Store one address piece on an Address or in a mapping."""
    if isinstance(addr_parts, Address):
        setattr(addr_parts, name, value)
    else:
        addr_parts[name] = value


def _match_piece(pattern, piece):
    """Return the text ``pattern`` matches at the start of ``piece``, or None.

    NOTE(review): re.match only anchors the start, so a piece such as
    "ALBANY" is accepted by is_state() through its "AL" prefix.  Confirm
    whether pieces are meant to be matched in full before tightening this.
    """
    found = re.match("(?:%s)" % pattern, piece, re.IGNORECASE | re.VERBOSE)
    if found is None:
        return None
    return found.group(0)


def _collapse_street_groups(groups):
    """Merge the numbered STREET_REGEX groups into one value per piece.

    ``groups`` is the groupdict() of an ADDRESS_REGEX match.  Returns a dict
    with the keys number, pre_dir, street, street_type and post_dir, with
    directions and street types replaced by their abbreviations when the
    matched text is a key of DIRECTIONS / STREET_TYPES.
    """
    def first(*names):
        for name in names:
            value = groups.get(name)
            if value:
                return value
        return ''

    pre_dir = first('pre_dir')
    post_dir = first('post_dir1', 'post_dir2', 'post_dir3')
    street_type = first('street_type1', 'street_type2', 'street_type3')
    return {
        'number': first('number'),
        'pre_dir': DIRECTIONS.get(pre_dir, pre_dir),
        'street': first('street1', 'street2', 'street3', 'street4'),
        'street_type': STREET_TYPES.get(street_type, street_type),
        'post_dir': DIRECTIONS.get(post_dir, post_dir),
    }


# Each is_* test below takes the text to examine and ``addr_parts``, which
# may be an Address object or a dict-like mapping of piece name -> text.  On
# success the recognised piece is stored in ``addr_parts`` and True is
# returned; otherwise ``addr_parts`` is left alone and False is returned.

def is_zipcode(piece, addr_parts):
    """Is the passed string a (possibly malformed) zipcode"""
    zipcode = _match_piece(ZIP_LOOSE_REGEX, piece)
    if zipcode is None:
        return False
    _set_part(addr_parts, 'zipcode', zipcode)
    return True


def is_state(piece, addr_parts):
    """Is the passed string a state"""
    state = _match_piece(STATE_REGEX, piece)
    if state is None:
        return False
    _set_part(addr_parts, 'state', state)
    return True


def is_city(piece, addr_parts):
    """Is the passed string a city"""
    # If the previous piece was a state, then we can safely assume that this
    # piece is a city. Otherwise, skip it.
    if not _get_part(addr_parts, 'state'):
        return False
    _set_part(addr_parts, 'city', piece)
    return True


def is_unit(piece, addr_parts):
    """Is the passed string a unit description"""
    unit = _match_piece(UNIT_REGEX, piece)
    if unit is None:
        return False
    _set_part(addr_parts, 'unit', unit)
    return True


def is_post_dir(piece, addr_parts):
    """Is the passed string a direction (stored as the post-direction)"""
    direction = _match_piece(DIRECTIONS_REGEX, piece)
    if direction is None:
        return False
    _set_part(addr_parts, 'post_dir', DIRECTIONS.get(direction, direction))
    return True


def is_pre_dir(piece, addr_parts):
    """Is the passed string a direction (stored as the pre-direction)"""
    direction = _match_piece(DIRECTIONS_REGEX, piece)
    if direction is None:
        return False
    _set_part(addr_parts, 'pre_dir', DIRECTIONS.get(direction, direction))
    return True


def is_street_type(piece, addr_parts):
    """Is the passed string a street descriptor"""
    street_type = _match_piece(STREET_TYPE_REGEX, piece)
    if street_type is None:
        return False
    _set_part(addr_parts, 'street_type',
              STREET_TYPES.get(street_type, street_type))
    return True


def is_street(piece, addr_parts):
    """Is the passed string a street. If it gets this far, we must assume
    it is."""
    _set_part(addr_parts, 'street', piece)
    return True


def is_fraction(piece, addr_parts):
    """Is the passed string a fraction"""
    fraction = _match_piece(FRACTION_REGEX, piece)
    if fraction is None:
        return False
    _set_part(addr_parts, 'fraction', fraction)
    return True


def is_number(piece, addr_parts):
    """Is the passed string a house number"""
    # If we are this far, we assume it is. Because there are strange numbers
    # such as W333 S405, the piece is appended (space separated) to whatever
    # number is already stored in addr_parts.
    number = _get_part(addr_parts, 'number')
    if number:
        number += " "
    _set_part(addr_parts, 'number', number + piece)
    return True


def is_address(piece, addr_parts):
    """Is the passed string an address line

    On a match the number, pre_dir, street, street_type and post_dir pieces
    are stored.  Mapping callers additionally receive every raw regex group
    (street1, street_type2, ...) as before.
    """
    found = re.match(ADDRESS_REGEX, piece, re.IGNORECASE | re.VERBOSE)
    if found is None:
        return False
    groups = found.groupdict()
    if not isinstance(addr_parts, Address):
        for key, value in groups.items():
            addr_parts[key] = value or ''
    # The regex only yields numbered groups, so collapse them into the
    # canonical piece names; street types are abbreviated via STREET_TYPES
    # (not DIRECTIONS, as was done before).
    for name, value in _collapse_street_groups(groups).items():
        _set_part(addr_parts, name, value)
    return True


def is_citystatezip(piece, addr_parts):
    """Is the passed string a "city, state zip" line

    An empty match does not count.  On success city, state and zipcode are
    all overwritten ('' for the parts that were not present).
    """
    found = re.match(PLACE_REGEX, piece, re.IGNORECASE | re.VERBOSE)
    if found is None or found.end() == 0:
        return False
    groups = found.groupdict()
    _set_part(addr_parts, 'city', groups.get('city') or '')
    _set_part(addr_parts, 'state', groups.get('state') or '')
    _set_part(addr_parts, 'zipcode', groups.get('zip') or '')
    return True


def addr_piece_test_generator():
    """A generator that will return the next test.

    Yields each test once, in reverse address order, and then keeps yielding
    is_number forever.
    """
    addr_piece_tests = [
        is_zipcode,
        is_state,
        is_city,
        is_unit,
        is_post_dir,
        is_street_type,
        is_street,
        is_pre_dir,
        is_fraction,
        is_number,
    ]
    for test in addr_piece_tests:
        yield test
    while True:
        # Always yield the number function when all done.  (Index -1, not
        # len(...), which is one past the end and raised IndexError.)
        yield addr_piece_tests[-1]


def parse_location(address):
    """Parse a location string into an Address.

    The text is upper-cased and periods are removed.  It is then tried, in
    order, as a full street address, as a "city, state zip" line, and as a
    bare zip code.  Returns the populated Address, or None when none of the
    three forms match.
    """
    # Upper-case and remove periods
    address = address.upper().replace(".", "")
    addr = Address()
    found = re.search(ADDRESS_REGEX + PLACE_REGEX + "\\W*$", address,
                      re.IGNORECASE | re.VERBOSE)
    if found is not None:
        groups = found.groupdict()
        for name, value in _collapse_street_groups(groups).items():
            setattr(addr, name, value)
        # The place groups are None when absent; store '' so that str(addr)
        # never has to concatenate None.
        addr.city = groups["city"] or ""
        addr.state = groups["state"] or ""
        addr.zipcode = (groups["zip"] or "")[:5]  # drop the plus 4, if there
        return addr
    if is_citystatezip(address, addr) or is_zipcode(address, addr):
        return addr
    return None


def normalize_location(location):
    """Return ``location`` in a standard upper-case format.

    Corners/intersections are only upper-cased.  Anything else is parsed as
    an address and re-rendered; text that cannot be parsed is returned
    upper-cased rather than as the string "None".
    """
    if re.search(".+" + CORNER_REGEX + ".+", location,
                 re.IGNORECASE | re.VERBOSE):
        return location.upper()
    addr = parse_location(location)
    if addr is None:
        return location.upper()
    return str(addr)

# --~--~---------~--~----~------------~-------~--~----~
# You received this message because you are subscribed to the Google Groups "Django users" group.
# To post to this group, send email to django-users@googlegroups.com
# To unsubscribe from this group, send email to [EMAIL PROTECTED]
# For more options, visit this group at http://groups.google.com/group/django-users?hl=en
# -~----------~----~----~----~------~----~------~--~---