Hello, I have decided not to rely on David's very kind help with his Windows tools, and I have written a (hopefully) completely platform-neutral, pure Python 3 script for checking paired characters. So far it has been used only for fixing https://gitlab.com/crosswire-bible-society/CzeCEP/-/issues/2 and I am quite sure it is pretty buggy, but it could prove useful to somebody.
For now, the script lives in its own repo (https://gitlab.com/mcepl/bible-freq-counter) and is attached to this message, but I would like to submit it to sword-utils. How should I do that? Blessings, Matěj -- http://matej.ceplovi.cz/blog/, @[email protected] GPG Finger: 3C76 A027 CA45 AD70 98B5 BC1D 7920 5802 880B C9D8 Afraid to die alone? Become a bus driver. -- alleged easter egg in notepad++
#!/usr/bin/python3
import enum
import logging
import pprint
import sys
import xml.sax
from collections import Counter
logging.basicConfig(level=logging.INFO)
log = logging.getLogger()

# The two kinds of quotation marks; nesting depth is tracked separately for each.
QType = enum.Enum('QType', ['SINGLE', 'DOUBLE'])


class PairCheckerHandler(xml.sax.ContentHandler):
    """SAX content handler that checks quotation marks for correct pairing.

    The set of quotation marks is chosen from ``configuration`` using the
    ``xml:lang`` attribute of the ``osisText`` element.  Only text that
    follows the first ``verse`` element carrying an ``sID`` attribute is
    examined.  For every examined quotation mark the handler

    * counts it in ``freq_table`` (accumulated over all parsed documents), and
    * updates the nesting depth in ``balance`` (reset for every document).

    As soon as a depth drops below zero or rises above one, or when a
    quotation is still open at the end of a document, a message naming the
    current verse reference is printed to stderr and the process exits with
    status 1.
    """

    # Per-language quotation marks.  'OPEN'/'CLOSE' classify a mark by
    # direction, 'SINGLE'/'DOUBLE' by kind; every mark is in one of each.
    configuration = {
        'cs': {
            'OPEN': "‚„",
            'CLOSE': "‘“",
            'SINGLE': "‚‘",
            'DOUBLE': "„“",
        }
    }

    def __init__(self):
        super().__init__()
        # Character sets; filled in by startElement() once the document
        # language is known.
        self.OPEN_CH = None
        self.CLOSE_CH = None
        self.SINGLE_CH = None
        self.DOUBLE_CH = None
        self.PAIR_CH = None
        self.freq_table = Counter()   # occurrences of each quotation mark
        self.balance = Counter()      # QType -> current nesting depth
        self.current_ref = None       # sID of the most recent verse start

    def startDocument(self):
        """Forget per-document state so a reused handler starts clean.

        Without this, an unclosed quotation or the last verse reference of
        one file would leak into the next file parsed with the same handler.
        ``freq_table`` is kept on purpose: it holds totals over all files.
        """
        self.balance = Counter()
        self.current_ref = None

    def endDocument(self):
        """Report a quotation that is still open when the document ends.

        Exits with status 1 after printing to stderr, like the other checks.
        """
        still_open = [qtype.name for qtype, depth in self.balance.items()
                      if depth]
        if still_open:
            print(f"Unclosed {', '.join(still_open)} quotation at the end of "
                  f"the document (last reference {self.current_ref})",
                  file=sys.stderr)
            sys.exit(1)

    def __process_character(self, c):
        """Count quotation mark *c* and verify the nesting depth stays 0 or 1.

        Prints to stderr and exits with status 1 when a depth becomes
        negative (closing without opening) or exceeds one (opening twice).
        """
        self.freq_table[c] += 1

        if c in self.OPEN_CH:
            if c in self.SINGLE_CH:
                self.balance[QType.SINGLE] += 1
            elif c in self.DOUBLE_CH:
                self.balance[QType.DOUBLE] += 1
            log.debug("Opening character %s (balance %s)", c, self.balance)
        elif c in self.CLOSE_CH:
            if c in self.SINGLE_CH:
                self.balance[QType.SINGLE] -= 1
            elif c in self.DOUBLE_CH:
                self.balance[QType.DOUBLE] -= 1
            log.debug("Closing character %s (balance %s)", c, self.balance)

        # Checked after both branches: "over one" can only follow an opening
        # mark, "below zero" only a closing one.
        if any(depth < 0 for depth in self.balance.values()):
            print(f"Balance for character {c} is below zero in {self.current_ref}",
                  file=sys.stderr)
            sys.exit(1)
        elif any(depth > 1 for depth in self.balance.values()):
            print(f"Balance for character {c} is over one in {self.current_ref}",
                  file=sys.stderr)
            sys.exit(1)

    def startElement(self, name, attrs):
        """Pick up the document language and track the current verse.

        Raises:
            KeyError: if ``osisText`` has an ``xml:lang`` value (or none at
                all) for which ``configuration`` has no entry.
        """
        if 'osisText' in name:
            lang = attrs.get('xml:lang')
            log.debug('lang = %s', lang)
            try:
                marks = self.configuration[lang]
            except KeyError:
                # Same exception type as a plain lookup, but say what is
                # wrong instead of just "KeyError: None".
                raise KeyError(
                    f"No quotation mark configuration for xml:lang={lang!r}; "
                    f"supported languages: {sorted(self.configuration)}"
                ) from None
            self.OPEN_CH = marks['OPEN']
            self.CLOSE_CH = marks['CLOSE']
            self.SINGLE_CH = marks['SINGLE']
            self.DOUBLE_CH = marks['DOUBLE']
            self.PAIR_CH = self.OPEN_CH + self.CLOSE_CH
            log.debug('self.PAIR_CH = %s', self.PAIR_CH)
        elif 'verse' in name:
            # Only verse *start* milestones carry sID; end milestones are
            # ignored so the reference stays on the last started verse.
            if 'sID' in attrs:
                log.debug('name = %s, sID = %s', name, attrs["sID"])
                self.current_ref = attrs['sID']

    def characters(self, content):
        """Feed every quotation mark in *content* to the balance check.

        Text seen before the first verse start is skipped.
        """
        # Lazy %-formatting: this runs for every text chunk and the message
        # is discarded unless DEBUG logging is enabled.
        log.debug('current_ref = %s, content: %s (%s)',
                  self.current_ref, content, type(content))
        if self.current_ref is None:
            return
        for c in content:
            if c in self.PAIR_CH:
                self.__process_character(c)
# Command-line entry point: check every file named on the command line and
# print how often each quotation mark occurred.
if __name__ == "__main__":
    parser = xml.sax.make_parser()
    handler = PairCheckerHandler()
    parser.setContentHandler(handler)

    # A single handler is shared by all files, so its frequency table ends up
    # holding the totals over every file parsed.
    for bible_file in sys.argv[1:]:
        parser.parse(bible_file)

    # Printed once, after all files were parsed; the handler exits the
    # process earlier if it finds an imbalance.
    pprint.pprint(dict(handler.freq_table))
signature.asc
Description: PGP signature
_______________________________________________ sword-devel mailing list: [email protected] http://crosswire.org/mailman/listinfo/sword-devel Instructions to unsubscribe/change your settings at above page
