lehmi commented on code in PR #180: URL: https://github.com/apache/pdfbox/pull/180#discussion_r1477059126
########## tools/src/main/java/org/apache/pdfbox/tools/PDFText2Markdown.java: ########## @@ -0,0 +1,375 @@ +//package com.pdfexample; + +package org.apache.pdfbox.tools; + + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.font.PDFontDescriptor; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.pdfbox.text.TextPosition; + +import java.io.IOException; +import java.util.*; + +/** + * Wrap stripped text in simple HTML, trying to form HTML paragraphs. Paragraphs + * broken by pages, columns, or figures are not mended. + * + * @author John J Barton + * + */ +public class PDFText2Markdown extends PDFTextStripper{ + private static final int INITIAL_PDF_TO_HTML_BYTES = 8192; + + private final FontState fontState = new FontState(); + /** + * Constructor. + * @throws IOException If there is an error during initialization. + */ + public PDFText2Markdown() throws IOException + { + setLineSeparator(LINE_SEPARATOR); + setParagraphStart(LINE_SEPARATOR); + setParagraphEnd(LINE_SEPARATOR); + setPageStart(LINE_SEPARATOR); + setPageEnd(LINE_SEPARATOR); + setArticleStart(LINE_SEPARATOR); + setArticleEnd(LINE_SEPARATOR); + } + + @Override + protected void startDocument(PDDocument document) throws IOException + { + StringBuilder buf = new StringBuilder(INITIAL_PDF_TO_HTML_BYTES); + + super.writeString(buf.toString()); + } Review Comment: IMHO, this doesn't make sense. The StringBuilder is never appended to, so `buf.toString()` is an empty string and nothing is written at all. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@pdfbox.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@pdfbox.apache.org For additional commands, e-mail: dev-h...@pdfbox.apache.org