Lars, during your break I worked a bit on your patch that removes InsetLaTeXAccent. The lyx2lyx conversion is much better now, but it still has some problems. I changed the part that touches TransManager because I think that disabling the TransManager completely should be done in a separate patch. I am posting the current version and a test document because I won't have much time to work on it during the next few days, and maybe somebody else wants to have a look. lyx2lyx still has some FIXMEs, and somebody should verify that the LaTeX output is also OK for non-utf8 encodings. In many cases it is: since we can now switch the encoding not only per paragraph but at a finer granularity, we can simply output the accented character in the appropriate encoding. But what about accent/character combinations that are not available in any 8-bit encoding? Since some people are starting to use 1.5svn for serious work, IMHO we should only apply this after these issues have been fixed.
Georg
Index: src/insets/insetlatexaccent.C =================================================================== --- src/insets/insetlatexaccent.C (Revision 16710) +++ src/insets/insetlatexaccent.C (Arbeitskopie) @@ -1,626 +0,0 @@ -/** - * \file insetlatexaccent.C - * This file is part of LyX, the document processor. - * Licence details can be found in the file COPYING. - * - * \author Lars Gullik Bjønnes - * - * Full author contact details are available in file CREDITS. - */ - -#include <config.h> - -#include "insetlatexaccent.h" - -#include "debug.h" -#include "language.h" -#include "LColor.h" -#include "lyxlex.h" -#include "lyxrc.h" -#include "metricsinfo.h" - -#include "frontends/FontMetrics.h" -#include "frontends/Painter.h" - -#include "support/lstrings.h" - - -namespace lyx { - -using support::contains; -using support::trim; - -using std::endl; -using std::string; -using std::auto_ptr; -using std::ostream; - - -/* LatexAccent. Proper handling of accented characters */ -/* This part is done by Ivan Schreter, [EMAIL PROTECTED] */ -/* Later modified by Lars G. 
Bjønnes, [EMAIL PROTECTED] */ - -InsetLatexAccent::InsetLatexAccent() - : candisp(false) -{} - - -InsetLatexAccent::InsetLatexAccent(string const & str) - : contents(str) -{ - checkContents(); -} - - -auto_ptr<InsetBase> InsetLatexAccent::doClone() const -{ - return auto_ptr<InsetBase>(new InsetLatexAccent(contents)); -} - - -void InsetLatexAccent::checkContents() - // check, if we know the modifier and can display it ok on screen -{ - candisp = false; - - if (contents.empty() || contents.length() < 2) { - lyxerr[Debug::KEY] << "Cannot decode: " << contents << endl; - return; - } - - contents = trim(contents); - if (contents[0] != '\\') { // demand that first char is a '\\' - lyxerr[Debug::KEY] << "Cannot decode: " << contents << endl; - return; - } - - lyxerr[Debug::KEY] << "Decode: " << contents << endl; - - remdot = false; - plusasc = false; - plusdesc = false; - - switch (contents[1]) { // second char should be one of these - case '\'': // acute - modtype = ACUTE; // acute - plusasc = true; // at the top of character - break; - case '`': // grave - modtype = GRAVE; // grave - plusasc = true; // at the top - break; - case '=': // macron - modtype = MACRON; // macron - plusasc = true; // at the top - break; - case '~': // tilde - modtype = TILDE; // tilde - plusasc = true; // at the top - break; - case 'b': // underbar - modtype = UNDERBAR; // underbar - plusdesc = true; // at the bottom - break; - case 'c': // cedilla - modtype = CEDILLA; // cedilla - plusdesc = true; // at the bottom - break; - case 'd': // underdot - modtype = UNDERDOT; // underdot - plusdesc = true; // at the bottom - break; - case 'r': // circle - modtype = CIRCLE; // circle - plusasc = true; // at the top - break; - case 't': // tie - modtype = TIE; // tie - plusasc = true; // at the top - break; - case 'u': // breve - modtype = BREVE; // breve - plusasc = true; // at the top - break; - case 'v': // caron - modtype = CARON; // caron - plusasc = true; // at the top - break; - case 'q': // 
special caron - modtype = SPECIAL_CARON; // special caron - plusasc = true; // at the top - break; - case 'H': // hungarian umlaut - modtype = HUNGARIAN_UMLAUT; // hungarian umlaut - plusasc = true; // at the top - break; - case '"': // umlaut - modtype = UMLAUT; // umlaut - plusasc = true; // at the top - break; - case '.': // dot - modtype = DOT; // dot - plusasc = true; // at the top - break; - case '^': // circumflex - modtype = CIRCUMFLEX; // circumflex - plusasc = true; // at the top - break; - case 'k': // ogonek - modtype = OGONEK; // ogonek - plusdesc = true; - break; - case 'i': // dot-less-i - modtype = DOT_LESS_I; // dot-less-i - plusasc = true; // at the top (not really needed) - remdot = true; - break; - case 'j': // dot-less-j - modtype = DOT_LESS_J; // dot-less-j - plusasc = true; // at the top (not really needed) - remdot = true; - break; - case 'l': // lslash - modtype = lSLASH; - plusasc = true; // at the top (not really needed) - break; - case 'L': // lslash - modtype = LSLASH; - plusasc = true; // at the top (not really needed) - break; - default: - lyxerr[Debug::KEY] << "Default" << endl; - // unknown accent (or something else) - return; - } - - // we demand that third char is a '{' (Lgb) - if (contents[2] != '{') return; - - // special clause for \i{}, \j{} \l{} and \L{} - if ((modtype == DOT_LESS_I || modtype == DOT_LESS_J - || modtype == lSLASH || modtype == LSLASH) - && contents[3] == '}') { - switch (modtype) { - case DOT_LESS_I: ic = 'i'; break; - case DOT_LESS_J: ic = 'j'; break; - case lSLASH: ic = 'l'; break; - case LSLASH: ic = 'L'; break; - default: - // if this happens something is really wrong - lyxerr << "InsetLaTexAccent: weird error." << endl; - break; - } - //ic = (modtype == DOT_LESS_J ? 
'j' : 'i'); - lyxerr[Debug::KEY] << "Contents: [" << contents << ']' - << ", ic: " << ic - << ", top: " << plusasc - << ", bot: " << plusdesc - << ", dot: " << remdot - << ", mod: " << modtype << endl; - // Special case for space - } else if (contents[3] == '}') { - ic = ' '; - } else { - int i = 3; - - // now get the char - ic = contents[3]; // i will always be 3 here - - // ic should now be a alfa-char or '\\' - if (ic == '\\') { - ic = contents[++i]; // will only allow \<foo>{\i} and \<foo>{\j} - if (ic == 'i' || ic == 'j') - remdot = true; - else - return; - } else if ((ic == 'i'|| ic == 'j') && contents[4] == '}') { - // Do a rewrite: \<foo>{i} --> \<foo>{\i} - string temp = contents; - temp.erase(3, string::npos); - temp += '\\'; - temp += char(ic); - for (string::size_type j = 4; - j < contents.length(); ++j) - temp+= contents[j]; - contents= temp; - ++i; - remdot = true; - } - - // demand a '}' at the end - if (contents[++i] != '}' && contents[++i]) return; - - // fine, the char is properly decoded now (hopefully) - lyxerr[Debug::KEY] << "Contents: [" << contents << ']' - << ", ic: " << ic - << ", top: " << plusasc - << ", bot: " << plusdesc - << ", dot: " << remdot - << ", mod: " << modtype << endl; - } - candisp = true; -} - - -bool InsetLatexAccent::metrics(MetricsInfo & mi, Dimension & dim) const -{ - LyXFont & font = mi.base.font; - frontend::FontMetrics const & fm = theFontMetrics(font); - - // This function is a bit too simplistic and is just a - // "try to make a fit for all accents" approach, to - // make it better we need to know what kind of accent is - // used and add to max based on that. 
- if (candisp) { - if (ic == ' ') - dim.asc = fm.ascent('a'); - else - dim.asc = fm.ascent(ic); - if (plusasc) - dim.asc += (fm.maxAscent() + 3) / 3; - - if (ic == ' ') - dim.des = fm.descent('a'); - else - dim.des = fm.descent(ic); - if (plusdesc) - dim.des += 3; - - dim.wid = fm.width(ic); - } else { - dim.asc = fm.maxAscent() + 4; - dim.des = fm.maxDescent() + 4; - docstring dcon(contents.begin(), contents.end()); - dim.wid = fm.width(dcon) + 4; - } - bool const changed = dim_ != dim; - dim_ = dim; - return changed; -} - - -bool InsetLatexAccent::displayISO8859_9(PainterInfo & pi, int x, int y) const -{ - unsigned char tmpic = ic; - - switch (modtype) { - - case CEDILLA: { - if (ic == 'c') tmpic = '\xe7'; - if (ic == 'C') tmpic = '\xc7'; - if (ic == 's') tmpic = '\xfe'; - if (ic == 'S') tmpic = '\xde'; - break; - } - - case BREVE: { - if (ic == 'g') tmpic = '\xf0'; - if (ic == 'G') tmpic = '\xd0'; - break; - } - - case UMLAUT: { - if (ic == 'o') tmpic = '\xf6'; - if (ic == 'O') tmpic = '\xd6'; - if (ic == 'u') tmpic = '\xfc'; - if (ic == 'U') tmpic = '\xdc'; - break; - } - - case DOT: - if (ic == 'I') tmpic = '\xdd'; - break; - - case DOT_LESS_I: - tmpic = '\xfd'; - break; - - default: - return false; - } - - if (tmpic == ic) - return false; - - pi.pain.text(x, y, char(tmpic), pi.base.font); - return true; -} - - -void InsetLatexAccent::drawAccent(PainterInfo const & pi, int x, int y, - char_type accent) const -{ - LyXFont const & font = pi.base.font; - frontend::FontMetrics const & fm = theFontMetrics(font); - - x -= fm.center(accent); - y -= fm.ascent(ic); - y -= fm.descent(accent); - y -= fm.height(accent) / 2; - pi.pain.text(x, y, accent, font); -} - - -void InsetLatexAccent::draw(PainterInfo & pi, int x, int baseline) const -{ - if (lyxrc.font_norm_type == LyXRC::ISO_8859_9) - if (displayISO8859_9(pi, x, baseline)) - return; - - // All the manually drawn accents in this function could use an - // overhaul. 
Different ways of drawing (what metrics to use) - // should also be considered. - - LyXFont font = pi.base.font; - if (lyxrc.font_norm_type == LyXRC::ISO_10646_1) - font.setLanguage(english_language); - - frontend::FontMetrics const & fm = theFontMetrics(font); - - if (candisp) { - int x2 = int(x + (fm.rbearing(ic) - fm.lbearing(ic)) / 2); - int hg; - int y; - if (plusasc) { - // mark at the top - hg = fm.maxDescent(); - y = baseline - dim_.asc; - if (font.shape() == LyXFont::ITALIC_SHAPE) - x2 += int(0.8 * hg); // italic - } else { - // at the bottom - hg = dim_.des; - y = baseline; - } - - double hg35 = hg * 0.6; - - // display with proper accent mark - // first the letter - pi.pain.text(x, baseline, ic, font); - - if (remdot) { - int tmpvar = baseline - fm.ascent('i'); - int tmpx = 0; - if (font.shape() == LyXFont::ITALIC_SHAPE) - tmpx += int(0.8 * hg); // italic - lyxerr[Debug::KEY] << "Removing dot." << endl; - // remove the dot first - pi.pain.fillRectangle(x + tmpx, tmpvar, dim_.wid, - fm.ascent('i') - - fm.ascent('x') - 1, - backgroundColor()); - // the five lines below is a simple hack to - // make the display of accent 'i' and 'j' - // better. It makes the accent be written - // closer to the top of the dot-less 'i' or 'j'. - char tmpic = ic; // store the ic when we - ic = 'x'; // calculates the ascent of -#ifdef WITH_WARNINGS -#warning metrics? -#endif - int asc = ascent(); // the dot-less version (here: 'x') - ic = tmpic; // set the orig ic back - y = baseline - asc; // update to new y coord. 
- } - - // now the rest - draw within (x, y, x + wid, y + hg) - switch (modtype) { - case ACUTE: - //drawAccent(pi, x2, baseline, '\xB4'); - drawAccent(pi, x2, baseline, 0xB4); - break; - - case GRAVE: - //drawAccent(pi, x2, baseline, '\x60'); - drawAccent(pi, x2, baseline, 0x60); - break; - - case MACRON: - //drawAccent(pi, x2, baseline, '\xAF'); - drawAccent(pi, x2, baseline, 0xAF); - break; - - case TILDE: - drawAccent(pi, x2, baseline, '~'); - break; - - case UNDERBAR: { - char_type const underbar = 0x5F; //('\x5F'); - pi.pain.text(x2 - fm.center(underbar), - baseline, underbar, font); - break; - } - - case CEDILLA: { - char_type const cedilla = 0xB8; //('\xB8'); - pi.pain.text(x2 - fm.center(cedilla), - baseline, cedilla, font); - break; - } - - case UNDERDOT: - pi.pain.text(x2 - fm.center('.'), - int(baseline + 1.5 * fm.height('.')), - '.', font); - break; - - case DOT: - drawAccent(pi, x2, baseline, '.'); - break; - - case CIRCLE: - //drawAccent(pi, x2, baseline, '\xB0'); - drawAccent(pi, x2, baseline, 0xB0); - break; - - case TIE: - pi.pain.arc(int(x2 + hg35), y + hg / 2, 2 * hg, hg, 0, 360 * 32, - LColor::foreground); - break; - - case BREVE: - pi.pain.arc(int(x2 - hg / 2), y, hg, hg, 0, -360*32, - LColor::foreground); - break; - - case CARON: { - int xp[3], yp[3]; - xp[0] = int(x2 - hg35); yp[0] = int(y + hg35); - xp[1] = int(x2); yp[1] = int(y + hg); - xp[2] = int(x2 + hg35); yp[2] = int(y + hg35); - pi.pain.lines(xp, yp, 3, LColor::foreground); - break; - } - - case SPECIAL_CARON: { - switch (ic) { - case 'L': dim_.wid = int(4.0 * dim_.wid / 5.0); break; - case 't': y -= int(hg35 / 2.0); break; - } - int xp[3], yp[3]; - xp[0] = int(x + dim_.wid); - yp[0] = int(y + hg35 + hg); - - xp[1] = int(x + dim_.wid + (hg35 / 2.0)); - yp[1] = int(y + hg + (hg35 / 2.0)); - - xp[2] = int(x + dim_.wid + (hg35 / 2.0)); - yp[2] = y + int(hg); - - pi.pain.lines(xp, yp, 3, LColor::foreground); - break; - } - - case HUNGARIAN_UMLAUT: - drawAccent(pi, x2, baseline, 0x02DD); 
- break; - - case UMLAUT: - drawAccent(pi, x2, baseline, '"'); - break; - - case CIRCUMFLEX: - drawAccent(pi, x2, baseline, '\x5E'); - break; - - case OGONEK: { - // this does probably not look like an ogonek, so - // it should certainly be refined - int xp[4], yp[4]; - - xp[0] = x2; - yp[0] = y; - - xp[1] = x2; - yp[1] = y + int(hg35); - - xp[2] = int(x2 - hg35); - yp[2] = y + hg / 2; - - xp[3] = x2 + hg / 4; - yp[3] = y + int(hg); - - pi.pain.lines(xp, yp, 4, LColor::foreground); - break; - } - - case lSLASH: - case LSLASH: { - int xp[2], yp[2]; - - xp[0] = x; - yp[0] = y + int(3 * hg); - - xp[1] = int(x + dim_.wid * 0.75); - yp[1] = y + int(hg); - - pi.pain.lines(xp, yp, 2, LColor::foreground); - break; - } - - case DOT_LESS_I: // dotless-i - case DOT_LESS_J: // dotless-j - // nothing to do for these - break; - } - - } else { - pi.pain.fillRectangle(x + 1, - baseline - dim_.asc + 1, dim_.wid - 2, - dim_.asc + dim_.des - 2, - backgroundColor()); - pi.pain.rectangle(x + 1, baseline - dim_.asc + 1, - dim_.wid - 2, dim_.asc + dim_.des - 2, - LColor::foreground); - docstring dcon(contents.begin(), contents.end()); - pi.pain.text(x + 2, baseline, dcon, font); - } -} - - -void InsetLatexAccent::write(Buffer const &, ostream & os) const -{ - os << "\\i " << contents << "\n"; -} - - -void InsetLatexAccent::read(Buffer const &, LyXLex & lex) -{ - lex.eatLine(); - contents = lex.getString(); - checkContents(); -} - - -int InsetLatexAccent::latex(Buffer const &, odocstream & os, - OutputParams const &) const -{ - os << from_ascii(contents); - return 0; -} - - -int InsetLatexAccent::plaintext(Buffer const &, odocstream & os, - OutputParams const &) const -{ - os << from_ascii(contents); - return 0; -} - - -int InsetLatexAccent::docbook(Buffer const &, odocstream & os, - OutputParams const &) const -{ - // FIXME UNICODE - os << from_ascii(contents); - return 0; -} - - -int InsetLatexAccent::textString(Buffer const & buf, odocstream & os, - OutputParams const & op) const -{ - 
return plaintext(buf, os, op); -} - - -bool InsetLatexAccent::directWrite() const -{ - return true; -} - - -InsetBase::Code InsetLatexAccent::lyxCode() const -{ - return InsetBase::ACCENT_CODE; -} - - -ostream & operator<<(ostream & o, InsetLatexAccent::ACCENT_TYPES at) -{ - return o << int(at); -} - - -} // namespace lyx Index: src/insets/insetlatexaccent.h =================================================================== --- src/insets/insetlatexaccent.h (Revision 16710) +++ src/insets/insetlatexaccent.h (Arbeitskopie) @@ -1,152 +0,0 @@ -// -*- C++ -*- -/** - * \file insetlatexaccent.h - * This file is part of LyX, the document processor. - * Licence details can be found in the file COPYING. - * - * \author Lars Gullik Bjønnes - * - * Full author contact details are available in file CREDITS. - */ - -#ifndef INSET_LATEX_ACCENT_H -#define INSET_LATEX_ACCENT_H - -#include "inset.h" -#include "support/types.h" - - -namespace lyx { - -class Dimension; - - -/** Insertion of accents - - Proper handling of accented characters. - This is class is supposed to handle all LaTeX accents, it - is also possible that the class will change a bit so that - it also can handle other special characters (e.g. Hstroke) - Initiated by Ivan Schreter, later modified by Lgb. 
- */ -class InsetLatexAccent : public InsetOld { -public: - /// - InsetLatexAccent(); - /// - explicit InsetLatexAccent(std::string const & str); - /// - bool metrics(MetricsInfo &, Dimension &) const; - /// - void draw(PainterInfo & pi, int x, int y) const; - /// - bool displayISO8859_9(PainterInfo & pi, int x, int y) const; - /// - void write(Buffer const &, std::ostream &) const; - /// - void read(Buffer const &, LyXLex & lex); - /// - int latex(Buffer const &, odocstream &, - OutputParams const &) const; - /// - int plaintext(Buffer const &, odocstream &, - OutputParams const &) const; - /// - int docbook(Buffer const &, odocstream &, - OutputParams const &) const; - /// the string that is passed to the TOC - virtual int textString(Buffer const &, odocstream &, - OutputParams const &) const; - /// - bool directWrite() const; - /// - InsetBase::Code lyxCode()const; - /// - inline bool canDisplay(); - // should this inset be handled like a normal charater - bool isChar() const { return true; } - - /// is this equivalent to a letter? - virtual bool isLetter() const { return candisp; } - - /// all the accent types - enum ACCENT_TYPES{ - /// - ACUTE, // 0 - /// - GRAVE, - /// - MACRON, - /// - TILDE, - /// - UNDERBAR, - /// - CEDILLA, // 5 - /// - UNDERDOT, - /// - CIRCLE, - /// - TIE, - /// - BREVE, - /// - CARON, // 10 - /// - SPECIAL_CARON, - /// - HUNGARIAN_UMLAUT, - /// - UMLAUT, - /// - DOT, - /// - CIRCUMFLEX, // 15 - /// - OGONEK, - /// - DOT_LESS_I, - /// - DOT_LESS_J, // 18 - /// - lSLASH, - /// - LSLASH - }; -private: - friend std::ostream & operator<<(std::ostream &, ACCENT_TYPES); - - virtual std::auto_ptr<InsetBase> doClone() const; - - /// Check if we know the modifier and can display it ok on screen. 
- void checkContents(); - /// - void drawAccent(PainterInfo const & pi, int x, int y, char_type accent) const; - /// - std::string contents; - /// can display as proper char - bool candisp; - /// modifier type - ACCENT_TYPES modtype; - - /// remove dot from 'i' and 'j' or transform l, L into lslash, LSLaSH - bool remdot; - /// add something to ascent - accent at the top - bool plusasc; - /// add something to descent - underlined char - bool plusdesc; - /// international char - mutable char ic; -}; - - -bool InsetLatexAccent::canDisplay() -{ - return candisp; -} - - -} // namespace lyx - -#endif Index: src/insets/Makefile.am =================================================================== --- src/insets/Makefile.am (Revision 16710) +++ src/insets/Makefile.am (Arbeitskopie) @@ -77,8 +77,6 @@ libinsets_la_SOURCES = \ insetindex.h \ insetlabel.C \ insetlabel.h \ - insetlatexaccent.C \ - insetlatexaccent.h \ insetline.C \ insetline.h \ insetmarginal.h \ @@ -112,7 +110,7 @@ libinsets_la_SOURCES = \ insetvspace.C \ insetvspace.h \ insetwrap.h \ - insetwrap.C + insetwrap.C # insetlist.C \ # insetlist.h \ Index: src/buffer.C =================================================================== --- src/buffer.C (Revision 16710) +++ src/buffer.C (Arbeitskopie) @@ -141,7 +141,7 @@ using std::string; namespace { -int const LYX_FORMAT = 256; +int const LYX_FORMAT = 257; } // namespace anon Index: src/trans_mgr.C =================================================================== --- src/trans_mgr.C (Revision 16710) +++ src/trans_mgr.C (Arbeitskopie) @@ -22,8 +22,6 @@ #include "lyxtext.h" #include "trans.h" -#include "insets/insetlatexaccent.h" - #include "support/lstrings.h" @@ -287,14 +285,7 @@ void TransManager::insert(string const & if (chset_.getName() != lyxrc.font_norm || !enc.first) { // Could not find an encoding - InsetLatexAccent ins(str); - if (ins.canDisplay()) { - cap::replaceSelection(cur); - cur.insert(new InsetLatexAccent(ins)); - cur.posRight(); - } else { - 
insertVerbatim(str, text, cur); - } + insertVerbatim(str, text, cur); return; } string const tmp(1, static_cast<char>(enc.second)); Index: src/text.C =================================================================== --- src/text.C (Revision 16710) +++ src/text.C (Arbeitskopie) @@ -58,7 +58,6 @@ #include "insets/insettext.h" #include "insets/insetbibitem.h" #include "insets/insethfill.h" -#include "insets/insetlatexaccent.h" #include "insets/insetline.h" #include "insets/insetnewline.h" #include "insets/insetpagebreak.h" @@ -240,10 +239,6 @@ void readParToken(Buffer const & buf, Pa par.insertInset(par.size(), inset.release(), font, change); } - } else if (token == "\\i") { - auto_ptr<InsetBase> inset(new InsetLatexAccent); - inset->read(buf, lex); - par.insertInset(par.size(), inset.release(), font, change); } else if (token == "\\backslash") { par.insertChar(par.size(), '\\', font, change); } else if (token == "\\newline") { Index: lib/lyx2lyx/LyX.py =================================================================== --- lib/lyx2lyx/LyX.py (Revision 16710) +++ lib/lyx2lyx/LyX.py (Arbeitskopie) @@ -73,7 +73,7 @@ format_relation = [("0_06", [200], ge ("1_2", [220], generate_minor_versions("1.2" , 4)), ("1_3", [221], generate_minor_versions("1.3" , 7)), ("1_4", range(222,246), generate_minor_versions("1.4" , 3)), - ("1_5", range(246,257), generate_minor_versions("1.5" , 0))] + ("1_5", range(246,258), generate_minor_versions("1.5" , 0))] def formats_list(): Index: lib/lyx2lyx/lyx_1_5.py =================================================================== --- lib/lyx2lyx/lyx_1_5.py (Revision 16710) +++ lib/lyx2lyx/lyx_1_5.py (Arbeitskopie) @@ -20,7 +20,9 @@ """ Convert files to the file format generated by lyx 1.5""" import re -from parser_tools import find_token, find_token_exact, find_tokens, find_end_of, get_value +import unicodedata + +from parser_tools import find_re, find_token, find_token_exact, find_tokens, find_end_of, get_value from LyX import get_encoding @@ 
-720,6 +722,205 @@ def revert_encodings(document): document.inputencoding = get_value(document.header, "\\inputencoding", 0) +accent_map = { + "`" : u'\u0309', # grave + "'" : u'\u0301', # acute + "^" : u'\u0302', # circumflex + "~" : u'\u0303', # tilde + "=" : u'\u0304', # macron + "u" : u'\u0306', # breve + "." : u'\u0307', # dot above + "\"": u'\u0308', # diaresis + "r" : u'\u030a', # ring above + "H" : u'\u030b', # double acute + "v" : u'\u030c', # caron + "b" : u'\u0320', # minus sign below + "d" : u'\u0323', # dot below + "c" : u'\u0327', # cedilla + "k" : u'\u0328', # ogonek + "t" : u'\u0361' # tie FIXME: Always spans two characters on screen +} + + +special_accent_map = { + 'i' : u'\u0131', # dotless i + 'j' : u'\u0237', # dotless j + 'l' : u'\u0142', # l with stroke + 'L' : u'\u0141' # L with stroke +} + + +accented_map = { + '\\i' : u'\u0131', # dotless i + '\\j' : u'\u0237' # dotless j +} + + +def _convert_accent(type, accented_char): + char = accented_char + if char == '': + if type in special_accent_map: + return special_accent_map[type] + # a missing char is treated as space by LyX + char = ' ' + elif char in accented_map: + char = accented_map[char] + elif (len(char) > 1): + # We can only convert accents on a single char + return '' + a = accent_map.get(type) + if a: + return unicodedata.normalize("NFKC", "%s%s" % (char, a)) + return '' + + +def convert_ertbackslash(body, i, ert, default_layout): + r""" ------------------------------------------------------------------------------------------- + Convert backslashes and '\n' into valid ERT code, append the converted + text to body[i] and return the (maybe incremented) line index i""" + + for c in ert: + if c == '\\': + body[i] = body[i] + '\\backslash ' + i = i + 1 + body.insert(i, '') + elif c == '\n': + body[i+1:i+1] = ['\\end_layout', '', '\\begin_layout %s' % default_layout, ''] + i = i + 4 + else: + body[i] = body[i] + c + return i + + +def convert_accent(document): + # The following forms are 
supported by LyX: + # '\i \"{a}' (standard form, as written by LyX) + # '\i \"{}' (standard form, as written by LyX if the accented char is a space) + # '\i \"{ }' (also accepted if the accented char is a space) + # '\i \" a' (also accepted) + # '\i \"' (also accepted) + re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$') + re_contents = re.compile(r'^([^\s{]+)(.*)$') + re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$') + i = 0 + while 1: + i = find_re(document.body, re_wholeinset, i) + if i == -1: + return + match = re_wholeinset.match(document.body[i]) + prefix = match.group(1) + contents = match.group(3).strip() + match = re_contents.match(contents) + if match: + # Strip first char (always \) + accent = match.group(1)[1:] + accented_contents = match.group(2).strip() + match = re_accentedcontents.match(accented_contents) + accented_char = match.group(1) + converted = _convert_accent(accent, accented_char) + if converted == '': + contents = '%s{%s}' % (accent, accented_char), + else: + document.body[i] = '%s%s' % (prefix, converted) + i += 1 + continue + document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents) + document.body[i] = prefix + document.body[i+1:i+1] = ['\\begin_inset ERT', + 'status collapsed', + '', + '\\begin_layout %s' % document.default_layout, + '', + '', + ''] + i = convert_ertbackslash(document.body, i + 7, + '\\%s' % contents, + document.default_layout) + document.body[i+1:i+1] = ['\\end_layout', + '', + '\\end_inset'] + i += 3 + + +def revert_accent(document): + numberoflines = len(document.body) + # Since LyX may insert a line break within a word we must combine all + # words before unicode normalization + # FIXME: This does not work, because it creates things like + # '\begin_inset ERTstatus'. + # Parsing all this stuff would be too much work, but maybe we could test + # whether the first character of the new line is one of the accents, and + # only do the merging in that case. That should be safe. 
+# for i in range(numberoflines-1): +# if document.body[i] == '': +# continue +# while (document.body[i][-1] != ' ' and len(document.body[i+1]) > 0 and +# document.body[i+1][0] != ' '): +# document.body[i] += document.body[i+1][0] +# document.body[i+1] = document.body[i+1][1:] + inverse_accent_map = {} + for k in accent_map: + inverse_accent_map[accent_map[k]] = k + inverse_special_accent_map = {} + for k in special_accent_map: + inverse_special_accent_map[special_accent_map[k]] = k + inverse_accented_map = {} + for k in accented_map: + inverse_accented_map[accented_map[k]] = k + # Normalize to "Normal form D" (NFD, also known as canonical decomposition) + for i in range(numberoflines): + # Unfortunately we have a mixture of unciode strings and plain strings, + # because we never use u'xxx' for string literals, but 'xxx'. + # Therefore we may have to try two times to normalize the data. + try: + document.body[i] = unicodedata.normalize("NFKD", document.body[i]) + except TypeError: + document.body[i] = unicodedata.normalize("NFKD", unicode(document.body[i], 'utf-8')) + # Replace accented characters with InsetLaTeXAccent + # FIXME: Do not convert characters that can be represented in the chosen + # encoding. This is not so easy because we have to track the language + # (see convert_multiencoding), using the document encoding is not enough. 
+ for i in range(len(document.body)): + for j in range(len(document.body[i])): + # dotless i and dotless j are both in special_accent_map and can + # occur as an accented character, so we need to test that the + # following character is no accent + if (document.body[i][j] in inverse_special_accent_map and + (j == len(document.body[i]) - 1 or document.body[i][j+1] not in inverse_accent_map)): + accent = document.body[i][j] + # Insert the rest of the line as new line + if j < len(document.body[i]) - 1: + document.body[i+1:i+1] = document.body[i][j+1:] + # Delete the accented characters + if j > 0: + document.body[i] = document.body[i][:j-1] + else: + document.body[i] = u'' + # Finally add the InsetLaTeXAccent + document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent] + elif j > 0 and document.body[i][j] in inverse_accent_map: + accented_char = document.body[i][j-1] + if accented_char == ' ': + # Conform to LyX output + accented_char = '' + elif accented_char in inverse_accented_map: + accented_char = inverse_accented_map[accented_char] + accent = document.body[i][j] + # Insert the rest of the line as new line + if j < len(document.body[i]) - 1: + document.body[i+1:i+1] = document.body[i][j+1:] + # Delete the accented characters + if j > 1: + document.body[i] = document.body[i][:j-2] + else: + document.body[i] = u'' + # Finally add the InsetLaTeXAccent + document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char) + # Normalize to "Normal form C" (NFC, pre-composed characters) again + for i in range(numberoflines): + document.body[i] = unicodedata.normalize("NFKC", document.body[i]) + + ## # Conversion hub # @@ -735,16 +936,18 @@ convert = [[246, []], [253, []], [254, [convert_esint]], [255, []], - [256, []]] + [256, []], + [257, [convert_accent]]] -revert = [[255, [revert_encodings]], +revert = [[256, []], + [255, [revert_encodings]], [254, [revert_clearpage, revert_cleardoublepage]], [253, [revert_esint]], [252, [revert_nomenclature, 
revert_printnomenclature]], [251, [revert_commandparams]], [250, [revert_cs_label]], [249, []], - [248, [revert_utf8]], + [248, [revert_accent, revert_utf8]], [247, [revert_booktabs]], [246, [revert_font_settings]], [245, [revert_framed]]] Index: development/scons/scons_manifest.py =================================================================== --- development/scons/scons_manifest.py (Revision 16710) +++ development/scons/scons_manifest.py (Arbeitskopie) @@ -346,7 +346,6 @@ src_insets_header_files = Split(''' insetinclude.h insetindex.h insetlabel.h - insetlatexaccent.h insetline.h insetmarginal.h insetnewline.h @@ -402,7 +401,6 @@ src_insets_files = Split(''' insetinclude.C insetindex.C insetlabel.C - insetlatexaccent.C insetline.C insetmarginal.C insetnewline.C
latexaccent-all.lyx
Description: application/lyx