Hello, I have decided not to rely on the very kind help from David with his Windows tools, and I have written a (hopefully) completely platform-neutral, pure Python 3 script for checking paired characters. So far it has been used only for fixing https://gitlab.com/crosswire-bible-society/CzeCEP/-/issues/2 and I am quite sure it is pretty buggy, but it could prove useful to somebody.
Temporarily, the script lives in its own repo (https://gitlab.com/mcepl/bible-freq-counter) and is attached to this message, but I would like to submit it to sword-utils. How should I do that? Blessings, Matěj -- http://matej.ceplovi.cz/blog/, @mcepl@floss.social GPG Finger: 3C76 A027 CA45 AD70 98B5 BC1D 7920 5802 880B C9D8 Afraid to die alone? Become a bus driver. -- alleged easter egg in notepad++
#!/usr/bin/python3
"""Check that paired quotation marks in OSIS XML files are balanced.

Each file named on the command line is parsed with SAX.  Inside verses,
every configured quotation mark is counted and the nesting depth of single
and double quotes is tracked.  The script prints a message to stderr and
exits with status 1 as soon as a quote is closed without being opened,
opened twice without being closed, or left open at the end of a document.
On success the accumulated character frequency table is pretty-printed.
"""
import enum
import logging
import pprint
import sys
import xml.sax
from collections import Counter

logging.basicConfig(level=logging.INFO)
log = logging.getLogger()

QType = enum.Enum('QType', ['SINGLE', 'DOUBLE'])


class PairCheckerHandler(xml.sax.ContentHandler):
    """SAX handler that counts quotation marks and verifies their pairing.

    The set of characters to check is selected by the ``xml:lang``
    attribute of the ``osisText`` element and looked up in
    ``configuration``.  ``freq_table`` accumulates over every document
    parsed with the same handler instance.
    """

    # Per-language quotation marks.  OPEN/CLOSE split the characters by
    # role, SINGLE/DOUBLE split the very same characters by quote level.
    configuration = {
        'cs': {
            'OPEN': "‚„",
            'CLOSE': "‘“",
            'SINGLE': "‚‘",
            'DOUBLE': "„“",
        }
    }

    def __init__(self):
        super().__init__()
        self.OPEN_CH = None
        self.CLOSE_CH = None
        self.SINGLE_CH = None
        self.DOUBLE_CH = None
        self.PAIR_CH = None
        self.freq_table = Counter()
        self.balance = Counter()
        self.current_ref = None

    @staticmethod
    def _fail(message):
        """Print *message* to stderr and terminate with exit status 1."""
        print(message, file=sys.stderr)
        sys.exit(1)

    def __process_character(self, c):
        """Count quotation mark *c* and update the nesting balance.

        Exits the program (status 1) when any balance drops below zero or
        rises above one.
        """
        self.freq_table.update(c)

        if c in self.OPEN_CH:
            if c in self.SINGLE_CH:
                self.balance[QType.SINGLE] += 1
            elif c in self.DOUBLE_CH:
                self.balance[QType.DOUBLE] += 1
            log.debug("Opening character %s (balance %s)", c, self.balance)
        elif c in self.CLOSE_CH:
            if c in self.SINGLE_CH:
                self.balance[QType.SINGLE] -= 1
            elif c in self.DOUBLE_CH:
                self.balance[QType.DOUBLE] -= 1
            log.debug("Closing character %s (balance %s)", c, self.balance)

        depths = self.balance.values()
        if any(depth < 0 for depth in depths):
            self._fail(f"Balance for character {c} is below zero "
                       f"in {self.current_ref}")
        elif any(depth > 1 for depth in depths):
            self._fail(f"Balance for character {c} is over one "
                       f"in {self.current_ref}")

    def startDocument(self):
        """Forget the verse reference left over from a previous document.

        Without this, text preceding the first verse of a second file
        would be checked and reported against the last verse of the
        first file.
        """
        self.current_ref = None

    def endDocument(self):
        """Exit with status 1 if a quotation is still open at end of input."""
        unclosed = [qtype.name for qtype, depth in self.balance.items()
                    if depth != 0]
        if unclosed:
            kinds = ', '.join(unclosed)
            self._fail(f"Unclosed {kinds} quotation at the end of document "
                       f"(last verse seen: {self.current_ref})")

    def startElement(self, name, attrs):
        """Pick up the language from ``osisText`` and the ref from ``verse``.

        Exits the program (status 1) when ``xml:lang`` is missing or has no
        entry in ``configuration``.
        """
        if 'osisText' in name:
            lang = attrs.get('xml:lang')
            log.debug('lang = %s', lang)
            try:
                marks = self.configuration[lang]
            except KeyError:
                supported = ', '.join(sorted(self.configuration))
                self._fail(f"Unsupported or missing xml:lang {lang!r} "
                           f"on <{name}>; supported: {supported}")
            self.OPEN_CH = marks['OPEN']
            self.CLOSE_CH = marks['CLOSE']
            self.SINGLE_CH = marks['SINGLE']
            self.DOUBLE_CH = marks['DOUBLE']
            self.PAIR_CH = self.OPEN_CH + self.CLOSE_CH
            log.debug('self.PAIR_CH = %s', self.PAIR_CH)
        elif 'verse' in name:
            # Only the opening milestone (sID) starts a new reference;
            # other verse elements leave current_ref untouched.
            if 'sID' in attrs:
                log.debug('name = %s, sID = %s', name, attrs["sID"])
                self.current_ref = attrs['sID']

    def characters(self, content):
        """Check every configured quotation mark found in *content*.

        Text seen before the first verse start of a document is ignored.
        """
        # Lazy %-formatting: this runs for every text chunk, so the message
        # must not be built when DEBUG logging is disabled.
        log.debug('current_ref = %s, content: %s (%s)',
                  self.current_ref, content, type(content))
        if self.current_ref is None:
            return

        for c in content:
            if c in self.PAIR_CH:
                self.__process_character(c)


def main(argv=None):
    """Check every file in *argv* (default ``sys.argv[1:]``) and print counts."""
    paths = sys.argv[1:] if argv is None else argv

    parser = xml.sax.make_parser()
    handler = PairCheckerHandler()
    parser.setContentHandler(handler)

    for bible_file in paths:
        parser.parse(bible_file)

    pprint.pprint(dict(handler.freq_table))


if __name__ == "__main__":
    main()
signature.asc
Description: PGP signature
_______________________________________________ sword-devel mailing list: sword-devel@crosswire.org http://crosswire.org/mailman/listinfo/sword-devel Instructions to unsubscribe/change your settings at above page