Am 2009-05-28 um 09:45 schrieb luigi scarso:
I guess I should build a new converter suite (there's also an
InDesign Tags to ConTeXt converter somewhere on my hard disk).
But I won't make GUI apps, just scripts.
That sounds good!
If in python, even better !
If only scripts, the best !
Can we have more details ?
Which conversion do you need?
If it's InDesign to ConTeXt, there's always custom programming needed
- e.g. you need to know what ID paragraph style should become what
ConTeXt section. (sample attached)
I'm not good at building parsers and mostly use regular-expression
replacements, so my converters are always limited and manual cleanup
is necessary - but they save a lot of manual work anyway!
Greetlings from Lake Constance!
Hraban
---
http://www.fiee.net/texnique/
http://wiki.contextgarden.net
https://www.cacert.org (I'm an assurer)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Universelle Textcodierung
2009-03-10 by Henning Hraban Ramm, fiëe virtuëlle
quellcodierung_to_zielcodierung.py [Optionen] Quelldatei [Zieldatei]
Es können auch ganze Verzeichnisse bearbeitet werden.
Optionen:
--filter=Dateiendung
--overwrite (sonst wird die Originaldatei gesichert)
--hidden (sonst werden versteckte Dateien ignoriert)
"""
import os, os.path, sys, codecs, getopt, shutil
# Optional dependency: the import is attempted and its absence is ignored.
# NOTE(review): presumably the "latex" module registers a codec of that name
# so it can be used as an encoding -- confirm.
try:
    import latex
except ImportError:
    pass
# Long names of the supported command line switches; the first letter of each
# name doubles as its short option.
modes = ('filter', 'overwrite', 'hidden')
# Current value of every switch, keyed by the names in `modes`; it is filled
# from the command line before convert() is called.
mode = {}
def help(message=""):
    """Print *message* and the module docstring, then exit with status 1.

    The name shadows the built-in ``help``; it is kept because the rest of
    the script calls it under this name.
    """
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(message)
    print(__doc__)
    sys.exit(1)
def backup(datei):
    """Copy *datei* to a numbered sibling file and return the new path.

    The copy is placed in the same directory and named
    ``<stem>.<n><ext>``, where ``n`` is the smallest number starting at 0
    for which no such file exists yet.
    """
    pfad, name = os.path.split(datei)
    stamm, ext = os.path.splitext(name)
    count = 0
    neudatei = os.path.join(pfad, "%s.%d%s" % (stamm, count, ext))
    while os.path.exists(neudatei):
        count += 1
        neudatei = os.path.join(pfad, "%s.%d%s" % (stamm, count, ext))
    print("Sichere %s als %s" % (datei, neudatei))
    shutil.copy(datei, neudatei)
    return neudatei
def is_hidden(datei):
    """Return True if any component of the path *datei* is a dot-file.

    The path is normalised first and the special components for the current
    and the parent directory are skipped, so relative paths such as
    ``./text.txt``, ``../dir`` or ``.`` are not mistaken for hidden entries.
    """
    teile = os.path.normpath(datei).split(os.sep)
    return any(
        teil.startswith('.') and teil not in (os.curdir, os.pardir)
        for teil in teile
    )
def convert(source, target, so_enc, ta_enc):
    """Re-encode *source* from encoding *so_enc* to *ta_enc*, writing *target*.

    source -- existing file or directory; anything else ends in help().
    target -- file name or directory.  If *source* is a directory, *target*
              must be an existing directory as well.
    so_enc -- name of the encoding the source is read with.
    ta_enc -- name of the encoding the target is written with.

    Directories are processed entry by entry through recursive calls, after
    the optional file name filter mode['filter'] was applied.  Hidden files
    and directories are skipped unless mode['hidden'] is set.  Unless
    mode['overwrite'] is set, a file that would be overwritten is saved with
    backup() first.

    NOTE: a sub-directory of *source* is passed to the same checks, so a
    missing directory of that name below *target* ends in help().
    """
    from_name = os.path.basename(source)

    if not os.path.exists(source):
        help("Quelle '%s' nicht gefunden!" % from_name)

    if os.path.isdir(source):
        if is_hidden(source) and not mode['hidden']:
            print("Ignoriere verstecktes Verzeichnis %s" % source)
            return
        if not os.path.isdir(target):
            help("Wenn die Quelle ein Verzeichnis ist, muss auch das Ziel ein Verzeichnis sein!")
        print("Verarbeite Verzeichnis %s" % source)
        dateien = os.listdir(source)
        if mode['filter']:
            dateien = [d for d in dateien if d.endswith(mode['filter'])]
        for datei in dateien:
            convert(os.path.join(source, datei),
                    os.path.join(target, datei),
                    so_enc, ta_enc)
        return

    if is_hidden(from_name) and not mode['hidden']:
        print("Ignoriere versteckte Datei %s" % source)
        return
    if os.path.isdir(target):
        target = os.path.join(target, from_name)
    if not mode['overwrite']:
        if source == target:
            # In-place conversion: read from the backup copy from now on.
            source = backup(source)
        elif os.path.exists(target):
            backup(target)
    print("Konvertiere %s (%s)\n\tnach %s (%s)" % (source, so_enc, target, ta_enc))

    # The whole file is read before the target is opened, because source and
    # target may be the same file when overwriting is allowed.
    with open(source, "rb") as so_file:
        raw = so_file.read()

    # Decode and encode the text as a whole: cutting the raw bytes into lines
    # first would split multi-byte code units (e.g. UTF-16) and would emit
    # one byte order mark per line for encodings that write one.
    text = raw.decode(so_enc)
    # Universal newlines on input, platform line ends on output -- applied to
    # the decoded text instead of the raw bytes.
    text = text.replace(u"\r\n", u"\n").replace(u"\r", u"\n")
    if os.linesep != "\n":
        text = text.replace(u"\n", os.linesep)

    with open(target, "wb") as ta_file:
        ta_file.write(text.encode(ta_enc))
def main(argv=None):
    """Parse the command line, fill `mode` and start the conversion.

    argv -- argument list including the program name; defaults to sys.argv.

    Source and target encoding are taken from the name of the script itself,
    which has to look like ``<source encoding>_to_<target encoding>.py``.
    Every usage error ends in help(), i.e. usage text and exit status 1.
    """
    if argv is None:
        argv = sys.argv
    try:
        opts, args = getopt.getopt(argv[1:], "ohf:",
                                   ["overwrite", "hidden", "filter="])
    except getopt.GetoptError as err:
        help(str(err))
    if len(args) < 1:
        help("Zu wenige Parameter angegeben!")

    for m in modes:
        mode[m] = False
        for (o, a) in opts:
            if o not in ('-' + m[0], '--' + m):
                continue
            if a:
                print("Modus %s = %s" % (m, a))
                mode[m] = a
            elif m == 'filter':
                # An empty filter value means "no filter"; storing True here
                # would later be passed to str.endswith().
                mode[m] = False
            else:
                print("Modus %s aktiv" % m)
                mode[m] = True

    # Read the wanted encodings from the file name of the script.
    scriptname = os.path.splitext(os.path.basename(argv[0]))[0]
    encodings = scriptname.split("_to_")
    if len(encodings) != 2 or not all(encodings):
        help("Der Skriptname muss die Form quellcodierung_to_zielcodierung.py haben!")
    from_enc, to_enc = encodings

    from_name = to_name = args[0]
    if len(args) > 1:
        to_name = args[1]
    convert(from_name, to_name, from_enc, to_enc)


if __name__ == "__main__":
    main()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Convert InDesign tagged text to ConTeXt
"""
import sys, os
import re
# Characters with a special meaning in TeX; each one gets a backslash prefix.
quote = u'$&_%'

# NOTE(review): the non-ASCII characters of the rules arrived garbled
# (multi-byte UTF-8 sequences cut down to a single wrong letter).  They are
# written as \u escapes below and were reconstructed from the rule comments
# and German typography -- please verify them against real input.

# Start of the heading rules.  The paragraph style names are specific to the
# InDesign document; U+00DC (capital U with diaeresis) is a reconstruction.
_PSTYLE = u'^<pstyle:\u00dc '
# Optional dotted number in front of the title; group 3 is the title text.
_TITLE = r'((\d\.)*\s+)?(.+)$'

# (pattern, replacement) pairs.  They are applied in exactly this order, and
# the order matters: e.g. the style tags must be evaluated before the rule
# that deletes all remaining tags.
rePatterns = (
    # paragraph styles
    (_PSTYLE + r'1\.>' + _TITLE, r'\\chapter{\3}\n'),
    (_PSTYLE + r'1\.1>' + _TITLE, r'\\section{\3}\n'),
    (_PSTYLE + r'1\.1\.1>' + _TITLE, r'\\subsection{\3}\n'),
    (_PSTYLE + r'1\.1\.1\.1>' + _TITLE, r'\\subsubsection{\3}\n'),
    # character styles
    (r'<ct:Bold>(.+?)<ct:>', r'{\\bf \1}'),
    # delete all other tags
    (r'<.*?>', r''),
    # lines that start with dotted numbers = section titles
    (r'^\d+\s+(.+)$', r'\\chapter{\1}\n'),
    (r'^\d+\.\d+\.?\s+(.+)$', r'\\section{\1}\n'),
    (r'^\d+\.\d+\.\d+\.?\s+(.+)$', r'\\subsection{\1}\n'),
    (r'^\d+\.\d+\.\d+\.\d+\.?\s+(.+)$', r'\\subsubsection{\1}\n'),
    # itemization (lines starting with a dash, hyphen, middle dot or bullet)
    (u'^(\\s*)[\u2013\\-\u00b7\u2022\uf0a8]\\s+', r'\1\\item\t'),
    # itemization (numerical)
    (r'^(\s*)(\d+)\.?\)\s+', r'\1\\item[\2]\t'),
    # thin space inside abbreviations: u.a., s.o., o.g., z.B.
    (r'([Zusovz])\.([Baguo])\.', r'\1.\\,\2.'),
    # German quotation
    (u'[\u201e"\u201c](.*?)[\u201c\u201d"]', r'\\quotation{\1}'),
    # German single quotation
    (u"['\u201a,](.*?)['\u2018\u2019]", r'\\quote{\1}'),
    # spaces in front of punctuation
    (r' ([.?!:;])', r'\1'),
    # empty emphasizing
    (r'\{\\em\s+\}', r''),
    # spaces in front of measure units
    (u' (%|\u00b0)', r'\\,\1'),
    # en dash
    (u' - ', u' \u2013 '),
    # year numbers
    (u'(\\d{4})\\s*(\\-|\u2013)\\s*(\\d{4})', u'\\1\u2013\\3'),
    # multiple spaces
    (r' +', r' '),
    # make empty lines really empty
    (r'^\s+$', '\n'),
)

# The rules with compiled patterns, in the same order as rePatterns.
reres = [(re.compile(pattern), replacement)
         for pattern, replacement in rePatterns]

# True while the output is inside an itemize environment.
status = {
    'item': False
}

# An item line as produced by the rules: the macro followed by white space
# or by the opening bracket of a numbered item.
_ITEM = re.compile(r'\\item[\s\[]')


def convert_line(line):
    """Apply all rules to one line of text and escape the TeX specials."""
    for pattern, replacement in reres:
        line = pattern.sub(replacement, line)
    for c in quote:
        line = line.replace(c, u'\\' + c)
    return line


def convert_lines(lines):
    """Yield the ConTeXt text for an iterable of tagged text lines.

    Runs of consecutive item lines are enclosed in an itemize environment;
    an environment that is still open after the last line is closed too.
    """
    status['item'] = False
    for line in lines:
        line = convert_line(line)
        is_item = _ITEM.search(line) is not None
        if is_item and not status['item']:
            yield u'\\startitemize[]\n'
            status['item'] = True
        elif status['item'] and not is_item:
            yield u'\\stopitemize\n'
            status['item'] = False
        yield line
    if status['item']:
        yield u'\\stopitemize\n'
        status['item'] = False


def main(argv=None):
    """Convert the file named on the command line.

    argv -- argument list including the program name; defaults to sys.argv.
            argv[1] is the source file, the optional argv[2] the target
            file.  Without a target the extension of the source name is
            replaced by ``.tex``.
    """
    import io  # needed only here; provides text files with an encoding

    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("file name?")
        sys.exit(1)
    sourcename = argv[1]
    if len(argv) > 2:
        targetname = argv[2]
    else:
        targetname = os.path.splitext(sourcename)[0] + '.tex'
    # Opening the target truncates it, so it must never be the source.
    if os.path.abspath(targetname) == os.path.abspath(sourcename):
        print("source and target are the same file: %s" % sourcename)
        sys.exit(1)

    # "unicode" encoded InDesign tagged text is UTF-16 big-endian encoded!
    with io.open(sourcename, 'r', encoding='utf-16-be') as source:
        lines = source.readlines()
    # A byte order mark would keep the "^" rules from matching the first line.
    if lines and lines[0].startswith(u'\ufeff'):
        lines[0] = lines[0][1:]

    with io.open(targetname, 'w', encoding='utf-8') as target:
        target.writelines(convert_lines(lines))
    print("%s completed" % targetname)


if __name__ == "__main__":
    main()
___________________________________________________________________________________
If your question is of interest to others as well, please add an entry to the
Wiki!
maillist : ntg-context@ntg.nl / http://www.ntg.nl/mailman/listinfo/ntg-context
webpage : http://www.pragma-ade.nl / http://tex.aanhet.net
archive : https://foundry.supelec.fr/projects/contextrev/
wiki : http://contextgarden.net
___________________________________________________________________________________