Matthew Barnett added the comment: This is basically what the regex module does, written in Python:
def get_grapheme_cluster_break(codepoint):
    """Gets the "Grapheme Cluster Break" property of a codepoint.

    The properties are defined here:

    http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt

    The return value is meant to be one of:

        "Other", "CR", "LF", "Control", "Extend", "Prepend",
        "Regional_Indicator", "SpacingMark", "L", "V", "T", "LV", "LVT"
    """
    # The lookup itself was elided ("...") in the original message. As
    # written, this placeholder returns None for every codepoint until a real
    # property lookup is filled in.
    ...


def at_grapheme_boundary(string, index, get_break=None):
    """Checks whether the codepoint at 'index' is on a grapheme boundary.

    The rules are defined here:

    http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries

    Args:
        string: The text being examined.
        index: Position in 'string'; the candidate boundary lies between
            string[index - 1] and string[index].
        get_break: Optional callable mapping a single character to its
            "Grapheme Cluster Break" property name, using the spellings
            listed in get_grapheme_cluster_break(). Defaults to
            get_grapheme_cluster_break itself.

    Returns:
        True if a grapheme cluster may be broken at 'index', else False.
    """
    if get_break is None:
        # Resolved at call time so the module-level function can be replaced
        # by a real implementation after this definition.
        get_break = get_grapheme_cluster_break

    # Break at the start and end of the text.
    if index <= 0 or index >= len(string):
        return True

    prop = get_break(string[index])
    prop_m1 = get_break(string[index - 1])

    # Don't break within CRLF.
    if prop_m1 == "CR" and prop == "LF":
        return False

    # Otherwise break before and after controls (including CR and LF).
    controls = ("Control", "CR", "LF")
    if prop_m1 in controls or prop in controls:
        return True

    # Don't break Hangul syllable sequences.
    if prop_m1 == "L" and prop in ("L", "V", "LV", "LVT"):
        return False
    if prop_m1 in ("LV", "V") and prop in ("V", "T"):
        return False
    if prop_m1 in ("LVT", "T") and prop == "T":
        return False

    # Don't break between regional indicator symbols.
    # The name must match the "Regional_Indicator" spelling documented for
    # get_grapheme_cluster_break(); any other spelling never matches and the
    # rule is silently skipped.
    # NOTE(review): this joins any run of regional indicators; newer revisions
    # of UAX #29 only join them in pairs -- confirm which revision is targeted.
    if prop_m1 == "Regional_Indicator" and prop == "Regional_Indicator":
        return False

    # Don't break just before Extend characters.
    if prop == "Extend":
        return False

    # Don't break before SpacingMarks, or after Prepend characters.
    if prop == "SpacingMark":
        return False
    if prop_m1 == "Prepend":
        return False

    # Otherwise, break everywhere.
    return True


# ----------
# nosy: +mrabarnett
#
# Python tracker <rep...@bugs.python.org>
# <http://bugs.python.org/issue18406>
#
# Python-bugs-list mailing list
# Unsubscribe: http://mail.python.org/mailman/options/python-bugs-list/archive%40mail-archive.com