#!/usr/bin/python # -*- coding: utf-8 -*- """Encode and decode a representation using a reduced-charset version of ASCII.
My theory is that you can make text more compressible by encoding it in a modeful way, like the old Baudot codes. I’m appropriating four ASCII codes for the role of the old FIGS: - DC1 \x11 ^Q: change state to lowercase letters (the initial state) - DC2 \x12 ^R: change state to uppercase letters - DC3 \x13 ^S: change state to numbers - DC4 \x14 ^T: treat the next character only as an uppercase letter I know SI and SO would be more appropriate here, but I need three states, not two. ESC followed by any of the above four, or itself, simply passes that character through to the output. All three of uppercase, letters, and numbers are encoded as lowercase letters. Effectively, this reduces the 128-character ASCII character set by 36 to 92 characters. The algorithm for using DC4 (capitalize next letter) is potentially problematic, in that it may use a lot of buffer space. This approach *does* make my test file (Project Gutenberg’s King James Bible, minus CR characters but retaining LFs) more compressible: it compresses to 1369889 bytes instead of 1371072 bytes with this preprocessing applied --- indicating that the preprocessing picked up some structure gzip wasn’t able to find itself! But this is a very small difference, like 0.08%. The encoding bloated the original file from 4351847 bytes to 4515061 bytes, a 3.8% increase. A random translation from a 128-character charset to a 92-character charset would be expected to increase the file size by 7.3%, not 3.8%. Doing a simple huffman compression on the same file produced 19939830 bits on the raw file and 20094861 bits on the encoded file, both exclusive of the huffman table itself, an 0.8% increase in the size of the file, rather than gzip’s 0.08% improvement. This was a little disappointing. However, it achieved its purpose of formulating a practical huffman code for manual encoding of English text with spacing, punctuation, and formatting, with only 38 code points and reasonable efficiency of about 4.6 bits per character. (Details in a later post.) """ uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' lowercase = 'abcdefghijklmnopqrstuvwxyz' numbers = '0123456789' (DC1, DC2, DC3, DC4, ESC) = ('\x11', '\x12', '\x13', '\x14', '\x1b') def chars(infile): for line in infile: for char in line: yield char def decode(infile): current_state = lowercase capitalize_next_char = False passthrough_next_char = False for char in chars(infile): if passthrough_next_char: yield char passthrough_next_char = False elif char in lowercase: if capitalize_next_char: yield uppercase[lowercase.index(char)] capitalize_next_char = False else: yield current_state[lowercase.index(char)] elif char == DC1: current_state = lowercase elif char == DC2: current_state = uppercase elif char == DC3: current_state = numbers elif char == DC4: capitalize_next_char = True elif char == ESC: passthrough_next_char = True else: yield char def ok(a, b): assert a == b, (a, b) ok(''.join(decode('this \x12is \x11a \x14test \x13bcd! \x1b\x12')), 'this IS a Test 123! \x12') def chunks(infile): "Break an infile into chunks such that any capital starts a new chunk." buffer = [] for char in chars(infile): if char in uppercase: yield buffer buffer = [char] else: buffer.append(char) yield buffer def encode(infile): current_state = lowercase for chunk in chunks(infile): if (current_state is not uppercase and chunk and chunk[0] in uppercase and any(letter in lowercase for letter in chunk)): yield DC4 chunk[0] = chunk[0].lower() for char in chunk: if char in (DC1, DC2, DC3, DC4, ESC): yield ESC elif char in current_state: pass elif char in lowercase: yield DC1 current_state = lowercase elif char in uppercase: yield DC2 current_state = uppercase elif char in numbers: yield DC3 current_state = numbers if char in current_state: yield lowercase[current_state.index(char)] else: yield char ok(''.join(encode('this IS a Test 123! \x12')), 'this \x12is \x11a \x14test \x13bcd! \x1b\x12') _me = file(__file__).read() ok(''.join(decode(encode(_me))), _me) if __name__ == '__main__': import sys decoding = (len(sys.argv) > 1 and sys.argv[1] == '-d') if decoding: sys.argv.pop(1) function = decode if decoding else encode infile = file(sys.argv[1]) if len(sys.argv) > 1 else sys.stdin sys.stdout.writelines(function(infile)) -- To unsubscribe: http://lists.canonical.org/mailman/listinfo/kragen-hacks