Re: [NTG-context] converters (was: TexPaste alpha)

Henning Hraban Ramm Fri, 29 May 2009 01:15:32 -0700

Am 2009-05-28 um 09:45 schrieb luigi scarso:

I guess I should build a new converter suite (there's also an InDesign Tags to ConTeXt converter somewhere on my hard disk).
But I won't make GUI apps, just scripts.
That sounds good!
If in python, even better !
If only scripts, the best !
Can we have more details ?


Which conversion do you need?

If it's InDesign to ConTeXt, there's always custom programming needed - e.g. you need to know what ID paragraph style should become what ConTeXt section. (sample attached)

I'm not good at building parsers and mostly use regular expression replacements, so my converters are always limited, and manual cleanup is necessary - but they save a lot of manual work anyway!



Greetlings from Lake Constance!
Hraban
---
http://www.fiee.net/texnique/
http://wiki.contextgarden.net
https://www.cacert.org (I'm an assurer)

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Universelle Textcodierung
2009-03-10 by Henning Hraban Ramm, fiëe virtuëlle

quellcodierung_to_zielcodierung.py [Optionen] Quelldatei [Zieldatei]

Es können auch ganze Verzeichnisse bearbeitet werden.

Optionen:
--filter=Dateiendung
--overwrite          (sonst wird die Originaldatei gesichert)
--hidden             (sonst werden versteckte Dateien ignoriert)
"""

import os, os.path, sys, codecs, getopt, shutil
try:
    import latex
except:
    pass

# Names of the command-line switches this script understands.
modes = ('filter', 'overwrite', 'hidden')
# Current value of every switch, keyed by the names above; starts out empty
# and is filled in once the command line has been parsed.
mode = dict()

def help(message=""):
    """Print *message* and the module usage text, then terminate.

    Args:
        message: Optional text shown above the usage text.

    Raises:
        SystemExit: Always, with exit status 1.
    """
    # A parenthesised single argument prints identically whether ``print``
    # is a statement (Python 2) or a function (Python 3).
    print(message)
    print(__doc__)
    sys.exit(1)

def backup(datei):
    """Copy *datei* to the first free numbered sibling and return that path.

    The copy is placed in the same directory and named ``<stem>.<n><ext>``,
    where ``n`` is the smallest integer starting at 0 for which no such path
    exists yet.

    Args:
        datei: Path of the file to save.

    Returns:
        The path of the newly created copy.
    """
    pfad, name = os.path.split(datei)
    stamm, ext = os.path.splitext(name)
    count = 0
    neudatei = os.path.join(pfad, "%s.%d%s" % (stamm, count, ext))
    while os.path.exists(neudatei):
        count += 1
        neudatei = os.path.join(pfad, "%s.%d%s" % (stamm, count, ext))
    # Single-argument print(...) works as statement and as function.
    print("Sichere %s als %s" % (datei, neudatei))
    shutil.copy(datei, neudatei)
    return neudatei

def is_hidden(datei):
    """Return True if any component of the path *datei* is a dot-name.

    A component counts as hidden when it starts with ``'.'``.  The special
    entries for the current and the parent directory (``'.'`` and ``'..'``)
    are not hidden, so relative paths such as ``./texte`` or ``..`` are
    processed normally.

    Args:
        datei: A file or directory name, with or without leading directories.

    Returns:
        bool: True when at least one path component is hidden.
    """
    pfad = datei
    if os.altsep:
        # Accept the alternative separator too (e.g. '/' on Windows).
        pfad = pfad.replace(os.altsep, os.sep)
    for teil in pfad.split(os.sep):
        if teil.startswith('.') and teil not in (os.curdir, os.pardir):
            return True
    return False

def convert(source, target, so_enc, ta_enc):
    """Re-encode *source* into *target*, recursing into directories.

    Behaviour is controlled by the module-level ``mode`` dictionary:
    ``mode['hidden']`` enables processing of hidden files and directories,
    ``mode['filter']`` restricts directory entries to names with that
    suffix, and ``mode['overwrite']`` disables the safety copy that is
    otherwise made with ``backup()`` before a file is replaced.

    Args:
        source: Path of the file or directory to read.
        target: Path to write to.  If *source* is a file and *target* is an
            existing directory, the file is written into that directory
            under its own name.  If *source* is a directory and *target*
            does not exist, it is created.
        so_enc: Codec name used to decode the source.
        ta_enc: Codec name used to encode the result.

    Raises:
        SystemExit: Via ``help()`` when *source* does not exist, or when
            *source* is a directory and *target* exists but is not one.
    """
    # Local import: ``io`` is not among the imports at the top of the file.
    # io.open (Python 2.6+ and 3) gives encoding-aware text files.
    import io

    from_name = os.path.split(source)[1]

    if not os.path.exists(source):
        help("Quelle '%s' nicht gefunden!" % from_name)

    if os.path.isdir(source):
        if is_hidden(source) and not mode['hidden']:
            print("Ignoriere verstecktes Verzeichnis %s" % source)
            return
        if not os.path.exists(target):
            # Create missing target directories instead of aborting a
            # recursive run half-way through the tree.
            os.makedirs(target)
        elif not os.path.isdir(target):
            help("Wenn die Quelle ein Verzeichnis ist, muss auch das Ziel ein Verzeichnis sein!")
        print("Verarbeite Verzeichnis %s" % source)
        dateien = os.listdir(source)
        if mode['filter']:
            dateien = [d for d in dateien if d.endswith(mode['filter'])]
        for datei in dateien:
            convert(os.path.join(source, datei),
                    os.path.join(target, datei),
                    so_enc, ta_enc)
        return

    if is_hidden(from_name) and not mode['hidden']:
        print("Ignoriere versteckte Datei %s" % source)
        return
    if os.path.isdir(target):
        target = os.path.join(target, from_name)
    if not mode['overwrite']:
        if source == target:
            # Converting in place: read from the safety copy.
            source = backup(source)
        elif os.path.exists(target):
            backup(target)
    print("Konvertiere %s (%s)\n\tnach %s (%s)" % (source, so_enc, target, ta_enc))

    # Decode the stream as a whole: splitting the raw bytes into lines first
    # breaks encodings in which a newline byte can be part of another
    # character (UTF-16, UTF-32).  The source is read completely before the
    # target is opened, so source and target may be the same file.
    with io.open(source, "r", encoding=so_enc) as so_file:
        text = so_file.read()
    # One encoder for the whole file, so codecs that emit a byte-order mark
    # write it once instead of once per line.
    with io.open(target, "w", encoding=ta_enc) as ta_file:
        ta_file.write(text)
        

def main(argv=None):
    """Parse the command line and start the conversion.

    The source and target encodings are taken from the name of the script
    itself, which must have the form ``<source>_to_<target>``.

    Args:
        argv: Full argument vector including the program name; defaults to
            ``sys.argv``.

    Raises:
        SystemExit: Via ``help()`` on unknown options, missing arguments,
            a script name without ``_to_`` or an unknown codec name.
    """
    if argv is None:
        argv = sys.argv

    try:
        opts, args = getopt.getopt(argv[1:], "ohf:",
                                   ["overwrite", "hidden", "filter="])
    except getopt.GetoptError as err:
        help(str(err))

    if len(args) < 1:
        help("Zu wenige Parameter angegeben!")

    for m in modes:
        mode[m] = False
        for (o, a) in opts:
            if o not in ('-' + m[0], '--' + m):
                continue
            if a:
                print("Modus %s = %s" % (m, a))
                mode[m] = a
            elif m == 'filter':
                # An empty suffix means "no filter".  Storing True here
                # would later be handed to str.endswith() and fail.
                mode[m] = False
            else:
                print("Modus %s aktiv" % m)
                mode[m] = True

    # Read the requested encodings from the script's own file name.
    scriptname = os.path.splitext(os.path.basename(argv[0]))[0]
    try:
        from_enc, to_enc = scriptname.split("_to_")
    except ValueError:
        help("Skriptname '%s' hat nicht die Form quellcodierung_to_zielcodierung!" % scriptname)

    # Reject unknown codecs before any file is backed up or opened.
    for enc in (from_enc, to_enc):
        try:
            codecs.lookup(enc)
        except LookupError:
            help("Unbekannte Codierung '%s'!" % enc)

    from_name = to_name = args[0]
    if len(args) > 1:
        to_name = args[1]

    convert(from_name, to_name, from_enc, to_enc)


if __name__ == "__main__":
    main()

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Convert InDesign tagged text to ConTeXt
"""
import sys, os
import re

# Script-level import: io.open (Python 2.6+ and 3) provides encoding-aware
# text files; it is not among the imports above.
import io

# Characters that are escaped with a backslash in the output.
quote = u'$&_%'

# NOTE(review): the non-ASCII characters used by the rules were reconstructed
# from a mis-encoded copy of this script (UTF-8 bytes shown as Latin-1).
# They are spelled as explicit code points so they cannot be garbled again;
# please confirm each one against real InDesign exports.
_HEADING_PREFIX = u'\u00dc'          # presumably U-umlaut, as in "U 1.1" styles
_EN_DASH = u'\u2013'
# en dash, hyphen, middle dot, bullet, U+F0A8 (presumably a symbol-font bullet)
_BULLETS = u'\u2013\\-\u00b7\u2022\uf0a8'
_DOUBLE_OPEN = u'\u201e"\u201c'      # low-9 double quote, ASCII ", left double quote
_DOUBLE_CLOSE = u'\u201c\u201d"'     # left/right double quote, ASCII "
_SINGLE_OPEN = u"'\u201a,"           # ASCII ', low-9 single quote, comma
_SINGLE_CLOSE = u"'\u2018\u2019"     # ASCII ', left/right single quote
_DEGREE = u'\u00b0'

# (pattern, replacement) pairs, applied to every line from top to bottom.
# The order matters: style tags must be translated before the catch-all rule
# deletes the remaining tags, and whitespace is normalised last.
rePatterns = [
    # paragraph styles
    (u'^<pstyle:' + _HEADING_PREFIX + r' 1\.>((\d\.)*\s+)?(.+)$', r'\\chapter{\3}\n'),
    (u'^<pstyle:' + _HEADING_PREFIX + r' 1\.1>((\d\.)*\s+)?(.+)$', r'\\section{\3}\n'),
    (u'^<pstyle:' + _HEADING_PREFIX + r' 1\.1\.1>((\d\.)*\s+)?(.+)$', r'\\subsection{\3}\n'),
    (u'^<pstyle:' + _HEADING_PREFIX + r' 1\.1\.1\.1>((\d\.)*\s+)?(.+)$', r'\\subsubsection{\3}\n'),
    # character styles
    (r'<ct:Bold>(.+?)<ct:>', r'{\\bf \1}'),
    # delete all other tags
    (r'<.*?>', r''),
    # lines that start with dotted numbers = section titles
    (r'^\d+\s+(.+)$', r'\\chapter{\1}\n'),
    (r'^\d+\.\d+\.?\s+(.+)$', r'\\section{\1}\n'),
    (r'^\d+\.\d+\.\d+\.?\s+(.+)$', r'\\subsection{\1}\n'),
    # ends in the "$" anchor like its siblings (was a literal "\$")
    (r'^\d+\.\d+\.\d+\.\d+\.?\s+(.+)$', r'\\subsubsection{\1}\n'),
    # itemization (lines starting with bullet etc.)
    (r'^(\s*)[' + _BULLETS + r']\s+', r'\1\\item\t'),
    # itemization (numerical)
    (r'^(\s*)(\d+)\.?\)\s+', r'\1\\item[\2]\t'),
    # abbreviations such as u.a., s.o., o.g., z.B.
    (r'([Zusovz])\.([Baguo])\.', r'\1.\\,\2.'),
    # German quotation
    (u'[' + _DOUBLE_OPEN + u'](.*?)[' + _DOUBLE_CLOSE + u']', r'\\quotation{\1}'),
    # German single quotation
    (u'[' + _SINGLE_OPEN + u'](.*?)[' + _SINGLE_CLOSE + u']', r'\\quote{\1}'),
    # space in front of punctuation (a character class; the former group
    # matched only the literal sequence " .?!:;")
    (r' ([.?!:;])', r'\1'),
    # empty emphasizing
    (r'\{\\em\s+\}', r''),
    # space in front of measure units
    (r' (%|' + _DEGREE + r')', r'\\,\1'),
    # en dash
    (u' - ', u' ' + _EN_DASH + u' '),
    # year ranges
    (r'(\d{4})\s*(\-|' + _EN_DASH + r')\s*(\d{4})', r'\1' + _EN_DASH + r'\3'),
    # multiple spaces
    (r' +', r' '),
    # make empty lines really empty
    (r'^\s+$', r'\n'),
]

# The same rules with compiled patterns, in the same order.
reres = [(re.compile(pattern), replacement)
         for pattern, replacement in rePatterns]


def convert_line(line):
    """Return *line* with all rules applied and special characters escaped.

    Args:
        line: One decoded line of InDesign tagged text.

    Returns:
        The converted line.
    """
    for regex, replacement in reres:
        line = regex.sub(replacement, line)
    for char in quote:
        line = line.replace(char, u'\\' + char)
    return line


def convert_lines(lines):
    """Convert an iterable of decoded lines, yielding the output pieces.

    Consecutive ``\\item`` lines are wrapped in ``\\startitemize[]`` and
    ``\\stopitemize``; a list that is still open after the last line is
    closed as well.

    Args:
        lines: Iterable of decoded text lines.

    Yields:
        Text chunks to be written to the target in order.
    """
    in_list = False
    first = True
    line = u''
    for line in lines:
        if first:
            # Drop a byte-order mark so it does not end up in the output.
            line = line.lstrip(u'\ufeff')
            first = False
        line = convert_line(line)
        # The item rules emit "\item" followed by a TAB or "[", never by a
        # space, so test for the command at the start of the line.
        is_item = line.lstrip().startswith(u'\\item')
        if is_item and not in_list:
            yield u'\\startitemize[]\n'
        elif in_list and not is_item:
            yield u'\\stopitemize\n'
        in_list = is_item
        yield line
    if in_list:
        if not line.endswith(u'\n'):
            yield u'\n'
        yield u'\\stopitemize\n'


def main(argv=None):
    """Convert the file named on the command line.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.  ``argv[1]`` is the source file, the optional
            ``argv[2]`` the target file (default: source with the
            extension replaced by ``.tex``).

    Raises:
        SystemExit: With status 1 when no file name is given or when source
            and target are the same file.
    """
    if argv is None:
        argv = sys.argv

    # collect parameters
    if len(argv) < 2:
        print("file name?")
        sys.exit(1)
    sourcename = argv[1]
    if len(argv) > 2:
        targetname = argv[2]
    else:
        targetname = os.path.splitext(sourcename)[0] + '.tex'
    if os.path.abspath(sourcename) == os.path.abspath(targetname):
        # Opening the target for writing would truncate the source.
        print("source and target are the same file: %s" % sourcename)
        sys.exit(1)

    # "unicode" encoded InDesign tagged text is UTF-16 big-endian encoded!
    # Decode the stream as a whole; splitting raw bytes into lines first
    # breaks characters that contain a newline byte.
    with io.open(sourcename, 'r', encoding='utf-16-be') as source:
        with io.open(targetname, 'w', encoding='utf-8') as target:
            for chunk in convert_lines(source):
                target.write(chunk)

    print("%s completed" % targetname)


if __name__ == "__main__":
    main()

___________________________________________________________________________________
If your question is of interest to others as well, please add an entry to the 
Wiki!

maillist : ntg-context@ntg.nl / http://www.ntg.nl/mailman/listinfo/ntg-context
webpage  : http://www.pragma-ade.nl / http://tex.aanhet.net
archive  : https://foundry.supelec.fr/projects/contextrev/
wiki     : http://contextgarden.net
___________________________________________________________________________________

Re: [NTG-context] converters (was: TexPaste alpha)

Reply via email to