Hi, I have created a patch that moves the generation of the debian.bib and debian.tex files into a separate script file. However, I have a couple of questions before I submit it. Please find my questions inline.
On Wed, Apr 22, 2015 at 7:07 PM, Akshita Jha <[email protected]> wrote: > --- > udd/bibref_gatherer.py | 110 +----------------- > udd/blends_prospective_gatherer.py | 4 + > udd/generate_bib_tex.py | 226 > +++++++++++++++++++++++++++++++++++++ > 3 files changed, 235 insertions(+), 105 deletions(-) > create mode 100644 udd/generate_bib_tex.py > > diff --git a/udd/bibref_gatherer.py b/udd/bibref_gatherer.py > index 654d7e7..7113e04 100644 > --- a/udd/bibref_gatherer.py > +++ b/udd/bibref_gatherer.py > @@ -6,8 +6,7 @@ This script imports bibliographic references from > upstream-metadata.debian.net. > > from gatherer import gatherer > from sys import stderr, exit > -from os import listdir, unlink, rename, access, X_OK > -from os.path import isfile > +from os import listdir > from fnmatch import fnmatch > import yaml > from psycopg2 import IntegrityError, InternalError > @@ -18,48 +17,16 @@ from subprocess import Popen, PIPE > > from types import * > > +from generate_bib_tex import generate_bib_tex > + > 'generate_bib_tex' is the new file that is created. Is the name of the file alright? Is it according to the naming convention followed by Debian ? 
debug=0 > > def get_gatherer(connection, config, source): > return bibref_gatherer(connection, config, source) > > -def rm_f(file): > - try: > - unlink(file) > - except OSError: > - pass > - > -def cleanup_tex_logs(basetexfile): > - rm_f(basetexfile+'.aux') > - rm_f(basetexfile+'.bbl') > - rm_f(basetexfile+'.blg') > - rm_f(basetexfile+'.log') > - > # seek for authors separated by ',' rather than by ' and ' > seek_broken_authors_re = > re.compile('^[^\s^,]+\s+[^\s^,]+\s*,\s*[^\s^,]+\s+[^\s^,]') > > -def open_tex_process(texexe, basetexfile): > - if texexe == 'pdflatex': > - ptex = Popen(['pdflatex', '-interaction=batchmode', basetexfile], > shell=False, stdout=PIPE) > - elif texexe == 'bibtex': > - ptex = Popen(['bibtex', basetexfile], shell=False, stdout=PIPE) > - else: > - return(False, 'Wrong exe: '+texexe) > - errstring="" > - if ptex.wait(): > - if texexe == 'pdflatex': > - for logrow in ptex.communicate()[0].splitlines(): > - if logrow.startswith('!'): > - errstring += logrow > - return(False, errstring) > - else: > - for logrow in ptex.communicate()[0].splitlines(): > - if logrow.startswith('This is BibTeX'): > - continue > - errstring += logrow + '\n' > - return(True, errstring) > - return(True, errstring) > - > other_known_keys = ('Archive', > 'Bug-Database', > 'Cite-As', > @@ -364,75 +331,8 @@ class bibref_gatherer(gatherer): > # commit before check to make sure the table is not locked in case > LaTeX run will fail for whatever reason > self.connection.commit() > > - # if there is a working LaTeX installation try to build a BibTeX > database and test it by creating a debian.pdf file > - if isfile('/usr/bin/pdflatex') and access('/usr/bin/pdflatex', X_OK) > and \ > - isfile('/usr/bin/bibtex') and access('/usr/bin/bibtex', X_OK) > and \ > - ( > isfile('/usr/share/texlive/texmf-dist/fonts/source/jknappen/ec/ecrm.mf') or > \ > - > isfile('/usr/share/texmf-texlive/fonts/source/jknappen/ec/ecrm.mf') ) : > - # create BibTeX file > - bf = open(self.bibtexfile, 
'w') > - cur.execute("SELECT * FROM bibtex()") > - for row in cur.fetchall(): > - print >>bf, row[0] > - bf.close() > - > - # create LaTeX file to test BibTeX functionality > - bf = open(self.bibtex_example_tex, 'w') > - print >>bf, """\documentclass[10]{article} > -\usepackage[T1]{fontenc} > -\usepackage[utf8]{inputenc} > -\usepackage[left=2mm,top=2mm,right=2mm,bottom=2mm,nohead,nofoot]{geometry} > -\usepackage{longtable} > -\usepackage[super]{natbib} > -\setlongtables > -\\begin{document} > -\small > -\\begin{longtable}{llp{70mm}l} > -\\bf package & \\bf source & \\bf description & BibTeX key \\\\ \hline""" > - > - cur.execute("SELECT * FROM bibtex_example_data() AS (package text, > source text, bibkey text, description text)") > - for row in cur.fetchall(): > - print >>bf, row[0], '&', row[1], '&', row[3] , '&', > row[2]+'\cite{'+row[2]+'} \\\\' > - > - print >>bf, """\end{longtable} > - > -% \\bibliographystyle{plain} > -% Try a bit harder by also including URL+DOI > -\\bibliographystyle{plainnat} > -\\bibliography{debian} > - > -\end{document} > -""" > - bf.close() > - > - # try to build debian.pdf file to test aboc LaTeX file > - basetexfile = self.bibtex_example_tex.replace('.tex','') > - cleanup_tex_logs(basetexfile) > - try: > - rename(basetexfile+'.pdf', basetexfile+'.pdf~') > - except OSError: > - pass > - > - (retcode,errstring) = open_tex_process('pdflatex', basetexfile) > - if not retcode: > - self.log.error("Problem in 1. 
PdfLaTeX run of %s.tex: `%s` --> > please inspect %s.log" % (basetexfile, errstring, basetexfile)) > - exit(1) > - (retcode,errstring) = open_tex_process('bibtex', basetexfile) > - if errstring != "": > - if not retcode: > - self.log.error("Problem in BibTeX run of %s.bib: `%s`" % > (basetexfile, errstring)) > - exit(1) > - self.log.error("Ignore the following problems in BibTeX run of > %s.bib: `%s`" % (basetexfile, errstring)) > - (retcode,errstring) = open_tex_process('pdflatex', basetexfile) > - if not retcode: > - self.log.error("Problem in 2. PdfLaTeX run of %s.tex: `%s` --> > please inspect %s.log" % (basetexfile, errstring, basetexfile)) > - exit(1) > - (retcode,errstring) = open_tex_process('pdflatex', basetexfile) > - if not retcode: > - self.log.error("Problem in 3. PdfLaTeX run of %s.tex: `%s` --> > please inspect %s.log" % (basetexfile, errstring, basetexfile)) > - exit(1) > - > - cleanup_tex_logs(basetexfile) > > + g = generate_bib_tex() > + g.run(cur) > > Do I need to generate the debian.{bib.tex} files in bibref_gatherer ? For now I have called generate_bib_tex(), but the files generated will always consist of outdated references. > if __name__ == '__main__': > main() > > diff --git a/udd/blends_prospective_gatherer.py > b/udd/blends_prospective_gatherer.py > index a130bb1..32a6505 100644 > --- a/udd/blends_prospective_gatherer.py > +++ b/udd/blends_prospective_gatherer.py > @@ -19,6 +19,7 @@ from debian import deb822 > import email.Utils > > from bibref_gatherer import upstream_reader > +from generate_bib_tex import generate_bib_tex > > debug=0 > > @@ -434,6 +435,9 @@ class blends_prospective_gatherer(gatherer): > cur.execute("DEALLOCATE bibref_insert") > > cur.execute("ANALYZE %s" % my_config['table']) > + > + g = generate_bib_tex() > + g.run(cur) > > I think calling generate_bib_tex() in blends_prospective gatherer after the references from VCS have been inserted in bibref table, is the expected solution to the issue at hand. Am I right ? 
> if __name__ == '__main__': > main() > Below is generate_bib_tex.py file: diff --git a/udd/generate_bib_tex.py b/udd/generate_bib_tex.py > new file mode 100644 > index 0000000..d93b898 > --- /dev/null > +++ b/udd/generate_bib_tex.py > @@ -0,0 +1,226 @@ > +from os import unlink, rename, access, X_OK > +from os.path import isfile > +from subprocess import Popen, PIPE > +import logging > +import logging.handlers > + > +debug = 0 > + > +def rm_f(file): > + try: > + unlink(file) > + except OSError: > + pass > + > + > +def cleanup_tex_logs(basetexfile): > + rm_f(basetexfile+'.aux') > + rm_f(basetexfile+'.bbl') > + rm_f(basetexfile+'.blg') > + rm_f(basetexfile+'.log') > + > + > +def open_tex_process(texexe, basetexfile): > + if texexe == 'pdflatex': > + ptex = Popen(['pdflatex', '-interaction=batchmode', basetexfile], > shell=False, stdout=PIPE) > + elif texexe == 'bibtex': > + ptex = Popen(['bibtex', basetexfile], shell=False, stdout=PIPE) > + else: > + return(False, 'Wrong exe: '+texexe) > + errstring="" > + if ptex.wait(): > + if texexe == 'pdflatex': > + for logrow in ptex.communicate()[0].splitlines(): > + if logrow.startswith('!'): > + errstring += logrow > + return(False, errstring) > + else: > + for logrow in ptex.communicate()[0].splitlines(): > + if logrow.startswith('This is BibTeX'): > + continue > + errstring += logrow + '\n' > + return(True, errstring) > + return(True, errstring) > + > + > This creates a class generate_bib_tex(). Is it a good idea to create a class or should I define methods only ? 
> +class generate_bib_tex(): > + """ > + Generate a debian.bib and debian.tex files > + """ > + > + def __init__(self): > + self.log = logging.getLogger(self.__class__.__name__) > + if debug==1: > + self.log.setLevel(logging.DEBUG) > + else: > + self.log.setLevel(logging.INFO) > + handler = > logging.handlers.RotatingFileHandler(filename=self.__class__.__name__+'.log',mode='w') > + formatter = logging.Formatter("%(asctime)s - %(levelname)s - > (%(lineno)d): %(message)s") > + handler.setFormatter(formatter) > + self.log.addHandler(handler) > + > + self.bibtexfile = 'debian.bib' > + self.bibtex_example_tex = 'debian.tex' > + self.all_ref = 1 > + > If self.all_ref = 1 references for all the sources from bibref table will be included (irrespective of the fact whether or not they are included Debian). else : only references for sources which are both in VCS and Debian packages will be included. By default self.all_ref = 0 > + def run(self, cur): > + > + # if there is a working LaTeX installation try to build a BibTeX > database and test it by creating a debian.pdf file > + if isfile('/usr/bin/pdflatex') and access('/usr/bin/pdflatex', X_OK) > and \ > + isfile('/usr/bin/bibtex') and access('/usr/bin/bibtex', X_OK) > and \ > + ( > isfile('/usr/share/texlive/texmf-dist/fonts/source/jknappen/ec/ecrm.mf') or > \ > + > isfile('/usr/share/texmf-texlive/fonts/source/jknappen/ec/ecrm.mf') ) : > + > + # create BibTeX file > + bf = open(self.bibtexfile, 'w') > + > + if self.all_ref == 1: > + query = "SELECT * FROM bibtex()" > This includes refrences from all the sources in bibref table, by making use of bibtex() <https://udd.debian.org/schema/udd.html#public.function.bibtex> from UDD. However, '#' is not escaped here. How do I change that ? Do I make the changes in bibtex() of UDD itself ? Below is the default part which is similar to bibtex() <https://udd.debian.org/schema/udd.html#public.function.bibtex> of UDD but performs an inner join and escapes '#'. 
Is it better to include this in UDD itself ? + else: > + query = """ SELECT DISTINCT > + CASE WHEN bibjournal.value IS NULL AND > bibin.value IS NOT NULL AND bibpublisher.value IS NOT NULL THEN '@Book{' || > bibkey.value > + ELSE CASE WHEN bibauthor.value IS NULL OR > bibjournal.value IS NULL THEN '@Misc{'|| bibkey.value || > + CASE WHEN bibauthor.value IS NULL THEN > E',\n Key = "' || bibkey.value || '"' ELSE '' END -- without author we > need a sorting key > + ELSE '@Article{' || bibkey.value END END || > + CASE WHEN bibauthor.value IS NOT NULL THEN > E',\n Author = {' || bibauthor.value || '}' ELSE '' END || > + CASE WHEN bibtitle.value IS NOT NULL THEN > E',\n Title = "{' || > + replace(replace(replace(bibtitle.value, > + '#', E'\\#'), -- > + '_', E'\\_'), -- > + '%', E'\\%'), -- > + E'\xe2\x80\x89', E'\\,') -- TeX > syntax for '_' and UTF-8 "thin space" > + -- see > http://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=128&utf8=string-literal > + || '}"' > + ELSE '' END || > + CASE WHEN bibbooktitle.value IS NOT NULL THEN > E',\n Booktitle = "{' || bibbooktitle.value || '}"' ELSE '' END || > + CASE WHEN bibyear.value IS NOT NULL THEN > E',\n Year = {' || bibyear.value || '}' ELSE '' END || > + CASE WHEN bibmonth.value IS NOT NULL THEN > E',\n Month = {' || bibmonth.value || '}' ELSE '' END || > + CASE WHEN bibjournal.value IS NOT NULL THEN > E',\n Journal = {' || replace(bibjournal.value, '&', E'\\&') || '}' ELSE > '' END || > + CASE WHEN bibaddress.value IS NOT NULL THEN > E',\n Address = {' || bibaddress.value || '}' ELSE '' END || > + CASE WHEN bibpublisher.value IS NOT NULL THEN > E',\n Publisher = {' || bibpublisher.value || '}' ELSE '' END || > + CASE WHEN bibvolume.value IS NOT NULL THEN > E',\n Volume = {' || bibvolume.value || '}' ELSE '' END || > + CASE WHEN bibnumber.value IS NOT NULL THEN > E',\n Number = {' || bibnumber.value || '}' ELSE '' END || > + CASE WHEN bibpages.value IS NOT NULL THEN > E',\n Pages = {' || 
regexp_replace(bibpages.value, E'(\\d)-([\\d])', > E'\\1--\\2') || '}' ELSE '' END || > + CASE WHEN biburl.value IS NOT NULL THEN > E',\n URL = {' || > + replace(replace(replace(replace(biburl.value, > + '#', E'\\#'), -- > + '_', E'\\_'), -- > + '%', E'\\%'), -- > + '&', E'\\&'), -- > + '~', E'\\~{}') -- > + || '}' > + ELSE '' END || > + CASE WHEN bibdoi.value IS NOT NULL THEN > E',\n DOI = {' || > + replace(replace(bibdoi.value, > + '#', E'\\#'), -- > + '_', E'\\_'), -- > + '&', E'\\&') -- > + || '}' > + ELSE '' END || > + CASE WHEN bibpmid.value IS NOT NULL THEN > E',\n PMID = {' || bibpmid.value || '}' ELSE '' END || > + CASE WHEN bibeprint.value IS NOT NULL THEN > E',\n EPrint = {' || > + replace(replace(replace(replace(bibeprint.value, > + '#', E'\\#'), -- > + '_', E'\\_'), -- > + '%', E'\\%'), -- > + '&', E'\\&'), -- > + '~', E'\\~{}') -- > + || '}' > + ELSE '' END || > + CASE WHEN bibin.value IS NOT NULL THEN > E',\n In = {' || bibin.value || '}' ELSE '' END || > + CASE WHEN bibissn.value IS NOT NULL THEN > E',\n ISSN = {' || bibissn.value || '}' ELSE '' END || > + E',\n}\n' > + AS bibentry > + -- p.source AS source, > + -- p.rank AS rank, > + FROM (SELECT DISTINCT source, package, rank FROM bibref) p > > + INNER JOIN sources s ON s.source = p.source > This is the INNER JOIN performed to ensure that references of packages that are both in VCS and in Debian are the only ones that are included in the bibtex file that is created. 
+ LEFT OUTER JOIN bibref bibkey ON p.source = > bibkey.source AND bibkey.rank = p.rank AND bibkey.package = > p.package AND bibkey.key = 'bibtex' > + LEFT OUTER JOIN bibref bibyear ON p.source = > bibyear.source AND bibyear.rank = p.rank AND bibyear.package = > p.package AND bibyear.key = 'year' > + LEFT OUTER JOIN bibref bibmonth ON p.source = > bibmonth.source AND bibmonth.rank = p.rank AND bibmonth.package = > p.package AND bibmonth.key = 'month' > + LEFT OUTER JOIN bibref bibtitle ON p.source = > bibtitle.source AND bibtitle.rank = p.rank AND bibtitle.package = > p.package AND bibtitle.key = 'title' > + LEFT OUTER JOIN bibref bibbooktitle ON p.source = > bibbooktitle.source AND bibbooktitle.rank = p.rank AND bibbooktitle.package > = p.package AND bibbooktitle.key = 'booktitle' > + LEFT OUTER JOIN bibref bibauthor ON p.source = > bibauthor.source AND bibauthor.rank = p.rank AND bibauthor.package = > p.package AND bibauthor.key = 'author' > + LEFT OUTER JOIN bibref bibjournal ON p.source = > bibjournal.source AND bibjournal.rank = p.rank AND bibjournal.package = > p.package AND bibjournal.key = 'journal' > + LEFT OUTER JOIN bibref bibaddress ON p.source = > bibaddress.source AND bibaddress.rank = p.rank AND bibaddress.package = > p.package AND bibaddress.key = 'address' > + LEFT OUTER JOIN bibref bibpublisher ON p.source = > bibpublisher.source AND bibpublisher.rank = p.rank AND bibpublisher.package > = p.package AND bibpublisher.key = 'publisher' > + LEFT OUTER JOIN bibref bibvolume ON p.source = > bibvolume.source AND bibvolume.rank = p.rank AND bibvolume.package = > p.package AND bibvolume.key = 'volume' > + LEFT OUTER JOIN bibref bibdoi ON p.source = > bibdoi.source AND bibdoi.rank = p.rank AND bibdoi.package = > p.package AND bibdoi.key = 'doi' > + LEFT OUTER JOIN bibref bibpmid ON p.source = > bibpmid.source AND bibpmid.rank = p.rank AND bibpmid.package = > p.package AND bibpmid.key = 'pmid'LEFT OUTER JOIN bibref biburl ON > p.source = biburl.source AND 
biburl.rank = p.rank AND > biburl.package = p.package AND biburl.key = 'url' > + LEFT OUTER JOIN bibref bibnumber ON p.source = > bibnumber.source AND bibnumber.rank = p.rank AND bibnumber.package = > p.package AND bibnumber.key = 'number' > + LEFT OUTER JOIN bibref bibpages ON p.source = > bibpages.source AND bibpages.rank = p.rank AND bibpages.package = > p.package AND bibpages.key = 'pages' > + LEFT OUTER JOIN bibref bibeprint ON p.source = > bibeprint.source AND bibeprint.rank = p.rank AND bibeprint.package = > p.package AND bibeprint.key = 'eprint' > + LEFT OUTER JOIN bibref bibin ON p.source = > bibin.source AND bibin.rank = p.rank AND bibin.package = > p.package AND bibin.key = 'in' > + LEFT OUTER JOIN bibref bibissn ON p.source = > bibissn.source AND bibissn.rank = p.rank AND bibissn.package = > p.package AND bibissn.key = 'issn' > + ORDER BY bibentry -- p.source > + ;""" > + > + cur.execute(query) > + for row in cur.fetchall(): > + print >>bf, row[0] > + > + bf.close() > + > + # create LaTeX file to test BibTeX functionality > + bf = open(self.bibtex_example_tex, 'w') > + print >>bf, """\documentclass[10]{article} > +\usepackage[T1]{fontenc} > +\usepackage[utf8]{inputenc} > +\usepackage[left=2mm,top=2mm,right=2mm,bottom=2mm,nohead,nofoot]{geometry} > +\usepackage{longtable} > +\usepackage[super]{natbib} > +\setlongtables > +\\begin{document} > +\small > +\\begin{longtable}{llp{70mm}l} > +\\bf package & \\bf source & \\bf description & BibTeX key \\\\ \hline""" > + > + cur.execute("SELECT * FROM bibtex_example_data() AS (package text, > source text, bibkey text, description text)") > + for row in cur.fetchall(): > + print >>bf, row[0], '&', row[1], '&', row[3] , '&', > row[2]+'\cite{'+row[2]+'} \\\\' > + > + print >>bf, """\end{longtable} > + > +% \\bibliographystyle{plain} > +% Try a bit harder by also including URL+DOI > +\\bibliographystyle{plainnat} > +\\bibliography{debian} > + > +\end{document} > +""" > + bf.close() > + > + # try to build debian.pdf 
file to test aboc LaTeX file > + basetexfile = self.bibtex_example_tex.replace('.tex','') > + cleanup_tex_logs(basetexfile) > + try: > + rename(basetexfile+'.pdf', basetexfile+'.pdf~') > + except OSError: > + pass > + > + (retcode,errstring) = open_tex_process('pdflatex', basetexfile) > + if not retcode: > + self.log.error("Problem in 1. PdfLaTeX run of %s.tex: `%s` --> > please inspect %s.log" % (basetexfile, errstring, basetexfile)) > + exit(1) > + > + (retcode,errstring) = open_tex_process('bibtex', basetexfile) > + if errstring != "": > + if not retcode: > + self.log.error("Problem in BibTeX run of %s.bib: `%s`" % > (basetexfile, errstring)) > + exit(1) > + self.log.error("Ignore the following problems in BibTeX run of > %s.bib: `%s`" % (basetexfile, errstring)) > + > + (retcode,errstring) = open_tex_process('pdflatex', basetexfile) > + if not retcode: > + self.log.error("Problem in 2. PdfLaTeX run of %s.tex: `%s` --> > please inspect %s.log" % (basetexfile, errstring, basetexfile)) > + exit(1) > + > + (retcode,errstring) = open_tex_process('pdflatex', basetexfile) > + if not retcode: > + self.log.error("Problem in 3. PdfLaTeX run of %s.tex: `%s` --> > please inspect %s.log" % (basetexfile, errstring, basetexfile)) > + exit(1) > + > + cleanup_tex_logs(basetexfile) > + > + > + > -- > 1.9.1 > > I have checked the differences in the debian.bib and debian.tex files. They are as expected. -> Without these changes, the references are outdated. -> When self.all_ref = 1, then there are many more references. -> When self.all_ref = 0, then the number of references are greater than the older references but less than when self.all_ref = 1. Also, is there a possibility that the references which have been injected by bibref_gatherer are updated in blends_prospective_gatherer. 
I did not find any updates when I wrote the "Upsert" functionality, but if this is possible, then I think we should also include Upsert in blends_prospective_gatherer and then generate the debian.bib and debian.tex files. -- Regards, Akshita Jha
