Great, I found the bug and submitted a patch[0]. The cause was that every word is wrapped in a <strong> tag in the hOCR output, which hocr2pdf wasn't handling.
I also found that it wasn't sorting the pages in any sane order, so I patched that too[1]. As there are quite a few patches floating around at the moment, I'm attaching the fully patched version to this message. I tested it and it works fine with the .zip you sent me. Note, though, that sorting is done the way computers prefer to sort things, so e.g. 10.jpg comes before 2.jpg. You should therefore make sure your filenames are zero-padded (e.g. 02.jpg).

Nick

0. https://code.google.com/p/hocr-tools/issues/detail?id=9
1. https://code.google.com/p/hocr-tools/issues/detail?id=10

-- 
-- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To post to this group, send email to tesseract-ocr@googlegroups.com To unsubscribe from this group, send email to tesseract-ocr+unsubscr...@googlegroups.com For more options, visit this group at http://groups.google.com/group/tesseract-ocr?hl=en
--- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-ocr+unsubscr...@googlegroups.com. For more options, visit https://groups.google.com/groups/opt_out.
#!/usr/bin/python
#
# Copyright 2013 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Create a searchable PDF from a pile of HOCR + JPEG. Tested with
# Tesseract.

import sys
import glob
import os.path
import cStringIO
import base64
import zlib
import re

from PIL import Image
from reportlab.pdfgen.canvas import Canvas
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from xml.etree.ElementTree import ElementTree, ParseError


def export_pdf(playground, default_dpi):
    """Create a searchable PDF from a pile of HOCR + JPEG.

    Every ``*.jpg`` in the directory ``playground`` becomes one PDF page,
    with the page image drawn first and the invisible OCR text taken from
    the sibling ``.hocr`` file drawn on top. The PDF is written to
    ``sys.stdout``.

    Args:
        playground: directory containing the ``.jpg`` / ``.hocr`` pairs.
        default_dpi: resolution assumed until an image carries its own
            'dpi' info entry.
    """
    load_invisible_font()
    pdf = Canvas(sys.stdout, pageCompression=1)
    pdf.setCreator('hocr-tools')
    pdf.setTitle(os.path.basename(playground))
    # Plain string sort: '10.jpg' sorts before '2.jpg', so page filenames
    # need to be zero-padded to come out in reading order.
    images = sorted(glob.glob(os.path.join(playground, '*.jpg')))
    # NOTE(review): dpi is initialised once, so an image without dpi info
    # inherits the value of the previous image rather than default_dpi --
    # confirm this carry-over is intended.
    dpi = default_dpi
    for image in images:
        im = Image.open(image)
        w, h = im.size
        try:
            dpi = im.info['dpi'][0]
        except KeyError:
            pass
        # 72 points per inch. The float literal avoids integer truncation
        # of the page size under Python 2 when dpi is an int.
        width = w * 72.0 / dpi
        height = h * 72.0 / dpi
        pdf.setPageSize((width, height))
        pdf.drawImage(image, 0, 0, width=width, height=height)
        add_text_layer(pdf, image, height, dpi)
        pdf.showPage()
    pdf.save()


def add_text_layer(pdf, image, height, dpi):
    """Draw an invisible text layer for OCR data.

    Reads the hOCR file that sits next to ``image`` (same name, ``.hocr``
    extension) and, for every ``ocrx_word`` directly inside an ``ocr_line``
    span, draws the word in the 'invisible' font, horizontally scaled to
    the width of the word's bounding box.

    Args:
        pdf: reportlab canvas to draw on.
        image: path of the page image; only used to locate the hOCR file.
        height: page height in points, used to flip the y axis.
        dpi: image resolution used to convert pixels to points.
    """
    p1 = re.compile(r'bbox((\s+\d+){4})')
    p2 = re.compile(r'baseline((\s+[\d\.\-]+){2})')
    hocrfile = os.path.splitext(image)[0] + ".hocr"
    hocr = ElementTree()
    hocr.parse(hocrfile)
    for line in hocr.findall(".//{http://www.w3.org/1999/xhtml}span"):
        # .get() so that spans without a class attribute are skipped
        # instead of raising KeyError.
        if line.attrib.get('class') != 'ocr_line':
            continue
        linebox = p1.search(line.attrib['title']).group(1).split()
        try:
            baseline = p2.search(line.attrib['title']).group(1).split()
        except AttributeError:
            # No baseline in the title attribute: assume a flat baseline
            # along the bottom of the line box.
            baseline = [0, 0]
        linebox = [float(i) for i in linebox]
        baseline = [float(i) for i in baseline]
        for word in line:
            if word.attrib.get('class') != 'ocrx_word':
                continue
            rawtext = word.text
            if rawtext is None and len(word):
                # The text may be wrapped in a child element (e.g.
                # <strong>); fall back to the first child's text.
                rawtext = word[0].text
            if rawtext is None:
                # Nothing to draw. Skipping here also prevents reusing
                # the text of the previous word.
                continue
            rawtext = rawtext.strip()
            font_width = pdf.stringWidth(rawtext, 'invisible', 8)
            if font_width <= 0:
                continue
            box = p1.search(word.attrib['title']).group(1).split()
            box = [float(i) for i in box]
            # Baseline height at the horizontal centre of the word,
            # measured from the bottom edge of the line box.
            b = polyval(baseline,
                        (box[0] + box[2]) / 2 - linebox[0]) + linebox[3]
            text = pdf.beginText()
            text.setTextRenderMode(3)  # double invisible
            text.setFont('invisible', 8)
            text.setTextOrigin(box[0] * 72 / dpi, height - b * 72 / dpi)
            box_width = (box[2] - box[0]) * 72 / dpi
            # Stretch the glyph run so it spans exactly the word box.
            text.setHorizScale(100.0 * box_width / font_width)
            text.textLine(rawtext)
            pdf.drawText(text)


def polyval(poly, x):
    """Evaluate the linear polynomial poly[0] * x + poly[1]."""
    return x * poly[0] + poly[1]


# Glyphless variation of vedaal's invisible font retrieved from
# http://www.angelfire.com/pr/pgpf/if.html, which says:
# 'Invisible font' is unrestricted freeware.
# Enjoy, Improve, Distribute freely
def load_invisible_font():
    """Register the embedded glyphless TrueType font as 'invisible'.

    The font is stored inline as zlib-compressed, base64-encoded data and
    is registered with reportlab's pdfmetrics under the name 'invisible'.
    """
    font = """
eJzdlk1sG0UUx/+zs3btNEmrUKpCPxikSqRS4jpfFURUagmkEQQoiRXgAl07Y3vL2mvt2ml8APXG
hQPiUEGEVDhWVHyIC1REPSAhBOWA+BCgSoULUqsKcWhVBKjhzfPU+VCi3Flrdn7vzZv33ryZ3TUE
gC6chsTx8fHck1ONd98D0jnS7jn26GPjyMIleZhk9fT0wcHFl1/9GRDPkTxTqHg1dMkzJH9CbbTk
xbWlJfKEdB+Np0pBswi+nH/Nvay92VtfJp4nvEztUJkUHXsdksUOkveXK/X5FNuLD838ICx4dv4N
I1e8+ZqbxwCNP2jyqXoV/fmhy+WW/2SqFsb1pX68SfEpZ/TCrI3aHzcP//jitodvYmvL+6Xcr5mV
vb1ScCzRnPRPfz+LsRSWNasuwRrZlh1sx0E8AriddyzEDfE6EkglFhJDJO5u9fJbFJ0etEMB78D5
4Djm/7kjT0wqhSNURyS+u/2MGJKRu+0ExNkrt1pJti9p2x6b3TBJgmUXuzgnDmI8UWMbkVxeinCw
Mo311/l/v3rF7+01D+OkZYE0PrbsYAu+sSyxU0jLLtIiYzmBrFiwnCT9FcsdOOK8ZHbFleSn0znP
nDCnxbnAnGT9JeYtrP+FOcV8nTlNnsoc3bBAD85adtCNRcsSffjBsoseca/lBE7Q09LiJOm/ttyB
0+IqcwfncJt5q4krO5k7jV7uY+5m7mPebuLKUea7iHvk48w72OYF5rvZT8C8k/WvMN/Dc19j3s02
bzPvZZv3me9j/ox5P9t/xdzPzPVJcc7yGnPL/1+GO1lPVTXM+VNWOTRRg0YRHgrUK5yj1kvaEA1E
xAWiCtl4qJL2ADKkG6Q3XxYjzEcR0E9hCj5KtBd1xCxp6jV5mKP7LJBr1nTRK2h1TvU2w0akCmGl
5lWbBzJqMJsdyaijQaCm/FK5HqspHetoTtMsn4LO0T2mlqcwmlTVOT/28wGhCVKiNANKLiJRlxqB
F603axQznIzRhDSq6EWZ4UUs+xud0VHsh1U1kMlmNwu9kTuFaRqpURU0VS3PVmZ0iE7gct0MG/8+
2fmUvKlfRLYmisd1w8pk1LSu1XUlryM1MNTH9epTftWv+16gIh1oL9abJZyjrfF5a4qccp3oFAcz
Wxxx4DpvlaKKxuytRDzeth5rW4W8qBFesvEX8RFRmLBHoB+TpCmRVCCb1gFCruzHqhhW6+qUF6tC
pL26nlWN2K+W1LhRjxlVGKmRTFYVo7CiJug09E+GJb+QocMCPMWBK1wvEOfRFF2U0klK8CppqqvG
pylRc2Zn+XDQWZIL8iO5KC9S+1RekOex1uOyZGR/w/Hf1lhzqVfFsxE39B/ws7Rm3N3nDrhPuMfc
w3R/aE28KsfY2J+RPNp+j+KaOoCey4h+Dd48b9O5G0v2K7j0AM6s+5WQ/E0wVoK+pA6/3bup7bJf
CMGjwvxTsr74/f/F95m3TH9x8o0/TU//N+7/D/ScVcA=
"""
    # b64decode replaces the deprecated decodestring alias; both discard
    # the embedded whitespace, so the decoded bytes are the same.
    ttf = cStringIO.StringIO(zlib.decompress(base64.b64decode(font)))
    pdfmetrics.registerFont(TTFont('invisible', ttf))


if __name__ == "__main__":
    if len(sys.argv) == 1:
        # stdout carries the PDF stream, so the usage text goes to stderr
        # and the script exits with a failure status.
        sys.stderr.write(
            "Usage: %s <imgdir>\n\n" % os.path.basename(sys.argv[0]))
        sys.exit(1)
    else:
        export_pdf(sys.argv[1], 300)