jenkins-bot has submitted this change and it was merged. Change subject: Non-English language support via polyglossia. ......................................................................
Non-English language support via polyglossia. Add xunicode and polyglossia packages for better language support. Process HTML 'lang' attributes and emit the proper polyglossia language-change command. Add texlive-lang-all package to get all the appropriate hyphenation patterns, etc. Set default font to GNU freefont's "FreeSerif" face, which has better non-latin support (but no CJK characters). Change-Id: I08454de9c33e5971c76db2313b4bc2a48e74af1d --- M .travis.yml M README.md M bin/mw-latexer M lib/index.js A lib/polyglossia.js 5 files changed, 168 insertions(+), 5 deletions(-) Approvals: Cscott: Looks good to me, approved jenkins-bot: Verified diff --git a/.travis.yml b/.travis.yml index 24388ca..786e386 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,7 @@ - "0.8" before_install: - sudo apt-get update -qq - - sudo apt-get install -qq texlive-xetex texlive-latex-recommended texlive-fonts-recommended latex-xcolor imagemagick librsvg2-bin unzip + - sudo apt-get install -qq texlive-xetex texlive-latex-recommended texlive-fonts-recommended texlive-lang-all latex-xcolor imagemagick librsvg2-bin unzip - mkdir ~/texmf - unzip -d ~/texmf texdeps/fontspec.tds.zip - cp -r ~/texmf/tex/latex ~/texmf/tex/xelatex diff --git a/README.md b/README.md index a75cbc9..4db4fb6 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,9 @@ Install other system dependencies. ``` -apt-get install texlive-xetex texlive-latex-recommended texlive-fonts-recommended latex-xcolor imagemagick librsvg2-bin unzip +apt-get install texlive-xetex texlive-latex-recommended \ + texlive-fonts-recommended texlive-lang-all latex-xcolor \ + imagemagick librsvg2-bin unzip ``` Note that up-to-date LaTeX `hyperref` and `fontspec` packages are diff --git a/bin/mw-latexer b/bin/mw-latexer index 528dd66..7c0d754 100755 --- a/bin/mw-latexer +++ b/bin/mw-latexer @@ -12,6 +12,8 @@ 'Set paper size', 'letter') .option('-t, --toc <yes|no|auto>', 'Force presence/absence of table of contents [auto]', 'auto') + .option('-L, --lang <2-char lang code>', + 'Force collection to use the given default language', null) .option('-1, --one-column', 'Render page in single column layout') .option('-v, --verbose', @@ -47,6 +49,7 @@ latex: !!program.latex, debug: !!program.debug, output: program.output, + lang: program.lang, onecolumn: program.oneColumn, log: log }; diff --git a/lib/index.js b/lib/index.js index 2b6b4e5..1e691f6 100644 --- a/lib/index.js +++ b/lib/index.js @@ -22,6 +22,7 @@ var Db = require('./db'); var DomUtil = require('./domutil'); var P = require('./p'); +var Polyglossia = require('./polyglossia'); var StatusReporter = require('./status'); var STD_HEADER = [ @@ -30,7 +31,7 @@ "", "\\documentclass[10pt,twocolumn,twoside]{article}", "\\pagestyle{headings}", - "\\usepackage{fontspec, graphicx}", + "\\usepackage{fontspec, xunicode, polyglossia, graphicx}", "\\usepackage{amsmath,amsthm,amstext,amssymb}", "\\usepackage[usenames]{xcolor}", "\\definecolor{linkcolor}{rgb}{.27,0,0}", @@ -49,6 +50,8 @@ "\\makeatother%", "}", */ + "\\setmainfont[]{FreeSerif}", +/* // Set up Charis font // XXX add non-latin (CJK, etc) fonts "\\setmainfont[", @@ -59,6 +62,7 @@ "BoldItalicFont = CharisSIL-BI.ttf ,", "]", "{CharisSIL-R.ttf}", +*/ "\\date{}\\author{}" ].join("\n"); @@ -144,6 +148,8 @@ this.output = []; this.templates = Object.create(null); this.base = options.base || ''; + this.currentLanguage = options.lang || 'en'; + this.usedLanguages = new Set(); }; // Helper function -- collect all text from the children of `node` as @@ -172,6 +178,18 @@ case node.ELEMENT_NODE: if (isHidden(node)) { return; + } + // handle LANG attributes (which override everything else) + var lang = node.getAttribute('lang') || this.currentLanguage; + // in addition to eliminating no-ops, this condition allows us + // to recursively invoke visit() inside the LANG handler. + if (lang !== this.currentLanguage) { + var savedLanguage = this.currentLanguage; + this.currentLanguage = lang; + this.usedLanguages.add(lang); + var r = this['visitLANG='].apply(this, arguments); + this.currentLanguage = savedLanguage; + return r; } // use typeof property if possible if (node.hasAttribute('typeof')) { @@ -238,7 +256,7 @@ href = href.substring(1); return this.collect(node, function(contents) { this.output.push('\\hyperlink{' + href + '}' + - '{' + contents + '}'); + '{' + contents + '}%'); }); } else { href = url.resolve(this.base, href); @@ -496,6 +514,23 @@ var delimit = display ? '$$' : '$'; var eol = display ? '' : '%'; this.output.push(delimit + math + delimit + eol); +}; + +Visitor.prototype['visitLANG='] = function(node) { + // is this a block or a span context? + var isBlock = /^(BLOCKQUOTE|BODY|CENTER|DIV|DL|FIGURE|H[1-6]|OL|P|TABLE|UL)$/.test(node.nodeName); // XXX others? + var poly = Polyglossia.lookup(this.currentLanguage); + if (isBlock) { + this.output.push('\\begin{'+poly.env+'}['+poly.options+']%'); + var r = this.visit(node); + this.output.push('\\end{'+poly.env+'}%'); + return r; + } else { + return this.collect(node, function(contents) { + this.output.push('\\text'+poly.lang+'['+poly.options+']' + + '{' + contents + '}%'); + }); + } }; Visitor.prototype['visitTYPEOF=mw:Image'] = @@ -758,6 +793,13 @@ if (!singleItem) { head = head.replace(/\]\{article\}/, ']{report}'); } + // default language (for chapter headings, page numbers, etc) + // CLI --lang option overrides + var lang = options.lang || metabook.lang || 'en'; + var poly = Polyglossia.lookup(lang); + head += '\n\\setdefaultlanguage[' + poly.options + ']{' + poly.lang + '}'; + var usedLanguages = new Set(); // usedLanguages doesn't include default lang + head += '\n\\input{'+path.join(builddir, 'languages.tex')+'}'; // we'll put used languages here // emit title, subtitle, etc. var title = metabook.title; if (!title && metabook.items.length === 1) { @@ -810,13 +852,15 @@ base: base, imagemap: imagemap, singleItem: singleItem, - hasChapters: hasChapters + hasChapters: hasChapters, + lang: lang }); var h1 = document.createElement('h1'); h1.textContent = item.title; visitor.visit(h1); // emit document title! visitor.visit(document.body); var result = visitor.output.join('\n'); + visitor.usedLanguages.forEach(function(l){ usedLanguages.add(l); }); return P.call(fs.writeFile, fs, outfile, result, 'utf8'); }); }; @@ -835,6 +879,17 @@ return write[item.type](item); }).then(function() { return P.call(output.end, output, STD_FOOTER); + }).then(function() { + // write languages file + var s = ''; + usedLanguages.forEach(function(l) { + s += Polyglossia.lookup(l).lang + ','; + }); + if (s) { + s = '\\setotherlanguages{'+s.replace(/,$/,'')+'}\n'; + } + var filename = path.join(builddir, 'languages.tex'); + return P.call(fs.writeFile, fs, filename, s, 'utf8'); }); }; diff --git a/lib/polyglossia.js b/lib/polyglossia.js new file mode 100644 index 0000000..a217aa6 --- /dev/null +++ b/lib/polyglossia.js @@ -0,0 +1,103 @@ +/** Language and option mappings for the XeLaTeX polyglossia package. */ +var table = { + sq: { lang: 'albanian' }, + am: { lang: 'amharic' }, + ar: { lang: 'arabic', env: 'Arabic' }, + 'und-Arab': { lang: 'arabic', env: 'Arabic' }, + hy: { lang: 'armenian' }, + ast: { lang: 'asturian' }, + id: { lang: 'bahasai' }, + ms: { lang: 'bahasam' }, + eu: { lang: 'basque' }, + bn: { lang: 'bengali' }, + 'pt-BR': { lang: 'brazil' }, + br: { lang: 'breton' }, + bg: { lang: 'bulgarian' }, + ca: { lang: 'catalan' }, + cop: { lang: 'coptic' }, + hr: { lang: 'croatian' }, + cs: { lang: 'czech' }, + da: { lang: 'danish' }, + dv: { lang: 'divehi' }, + nl: { lang: 'dutch' }, + en: { lang: 'english' }, + eo: { lang: 'esperanto' }, + et: { lang: 'estonian' }, + fa: { lang: 'farsi' }, + fi: { lang: 'finnish' }, + fr: { lang: 'french' }, + fur: { lang: 'friulan' }, + gl: { lang: 'galician' }, + de: { lang: 'german' }, + el: { lang: 'greek' }, + 'el-latn': { lang: 'greek', options: 'numerals=arabic' }, + grc: { lang: 'greek', options: 'variant=ancient' }, + he: { lang: 'hebrew' }, + hi: { lang: 'hindi' }, + is: { lang: 'icelandic' }, + ie: { lang: 'interlingua' }, + ga: { lang: 'irish' }, + it: { lang: 'italian' }, + kn: { lang: 'kannada' }, + lo: { lang: 'lao' }, + la: { lang: 'latin' }, + Latn: { lang: 'latin' }, // non-standard? used in arwiki sample. + lv: { lang: 'latvian' }, + lt: { lang: 'lithuanian' }, + dsb: { lang: 'lsorbian' }, + hu: { lang: 'magyar' }, + ml: { lang: 'malayalam' }, + mr: { lang: 'marathi' }, + nqo: { lang: 'nko' }, + no: { lang: 'norsk' }, + nn: { lang: 'nynorsk' }, + oc: { lang: 'occitan' }, + pmsq: { lang: 'piedmontese' }, + pl: { lang: 'polish' }, + pt: { lang: 'portuges' }, + ro: { lang: 'romanian' }, + rm: { lang: 'romansh' }, + ru: { lang: 'russian' }, + sme: { lang: 'samin' }, + sa: { lang: 'sanskrit' }, + 'sa-Latn': { lang: 'sanskrit' }, + gd: { lang: 'scottish' }, + sr: { lang: 'serbian' }, + sk: { lang: 'slovak' }, + sl: { lang: 'slovenian' }, + es: { lang: 'spanish' }, + sv: { lang: 'swedish' }, + syc: { lang: 'syriac' }, + ta: { lang: 'tamil' }, + te: { lang: 'telugu' }, + th: { lang: 'thai' }, + bo: { lang: 'tibetan' }, + tr: { lang: 'turkish' }, + tk: { lang: 'turkmen' }, + uk: { lang: 'ukrainian' }, + ur: { lang: 'urdu' }, + hsb: { lang: 'usorbian' }, + vi: { lang: 'vietnamese' }, + cy: { lang: 'welsh' } + // ja = japanese + // ja-Hani = japanese written in Kanji +}; + +var lookup = function(langcode) { + // langcode is an RFC1766 language code. That is, an ISO639 code, + // possibly followed by a dash and a variant specifier. + if (!table.hasOwnProperty(langcode)) { + console.warn('Language support not found for', langcode); + // try stripping the suffix. otherwise, fall back to 'en' + var stripped = langcode.replace(/-[\s\S]*$/, ''); + langcode = table.hasOwnProperty(stripped) ? stripped : 'en'; + } + var r = table[langcode]; + if (!r.env) { r.env = r.lang; } + if (!r.options) { r.options = ''; } + return r; +}; + +module.exports = { + lookup: lookup +}; -- To view, visit https://gerrit.wikimedia.org/r/98560 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I08454de9c33e5971c76db2313b4bc2a48e74af1d Gerrit-PatchSet: 4 Gerrit-Project: mediawiki/extensions/Collection/OfflineContentGenerator/latex_renderer Gerrit-Branch: master Gerrit-Owner: Cscott <canan...@wikimedia.org> Gerrit-Reviewer: Cscott <canan...@wikimedia.org> Gerrit-Reviewer: jenkins-bot _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits