[ https://issues.apache.org/jira/browse/TIKA-2010?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15331829#comment-15331829 ]
Ken Krugler commented on TIKA-2010: ----------------------------------- Would it be possible for you to try this broken HTML with JSoup? Asking because we're discussing switching to JSoup over in [TIKA-1599]. > Unable to get <title> value when header is incorrect > ---------------------------------------------------- > > Key: TIKA-2010 > URL: https://issues.apache.org/jira/browse/TIKA-2010 > Project: Tika > Issue Type: Bug > Components: parser > Affects Versions: 1.12 > Reporter: Florent Valdelievre > > A lot of websites don't have valid data within the <head></head> tag. However, > even if header data are invalid (misplaced tag etc.) we should be able to get > the title tag value if present. > Please find below a straightforward Unit Test to reproduce the problem. You > will notice I have added an anchor in between <head><a></a></head> tags > which is not correct. If you remove it, it finds the title value. > {code:java} > import java.io.ByteArrayInputStream; > import java.io.IOException; > import java.nio.charset.Charset; > import java.nio.file.Files; > import java.nio.file.Paths; > import org.apache.hadoop.conf.Configuration; > import org.apache.html.dom.HTMLDocumentImpl; > import org.apache.nutch.parse.html.DOMBuilder; > import org.apache.nutch.parse.tika.DOMContentUtils; > import org.apache.tika.metadata.Metadata; > import org.apache.tika.parser.ParseContext; > import org.apache.tika.parser.Parser; > import org.junit.Assert; > import org.junit.Before; > import org.junit.Test; > import org.w3c.dom.DocumentFragment; > public class TestTikaGetTitleWithInvalidHeaders { > private Configuration conf; > static byte[] readFile(String path, Charset encoding) throws > IOException { > return Files.readAllBytes(Paths.get(path)); > } > private final static String WEBPAGE = > "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML+RDFa > 1.0//EN\" \"http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd\">" > + "<html>" > + "<head>" > +"<a > href=\"https://plus.google.com/113911985765464238166\" > 
rel=\"publisher\">Google+</a> " > + "<title>Welcome!</title>" > + "</head>" > + "<body>" > + "content" > + "</body>" > + "</html>"; > > @Before > public void setUp() throws Exception { > conf = new Configuration(); > } > @Test > public void testGetTitle() { > HTMLDocumentImpl doc = new HTMLDocumentImpl(); > doc.setErrorChecking(false); > DocumentFragment root = doc.createDocumentFragment(); > Parser parser = new org.apache.tika.parser.html.HtmlParser(); > DOMBuilder domBuilder = new DOMBuilder(doc, root); > try { > parser.parse(new > ByteArrayInputStream(WEBPAGE.getBytes()), domBuilder, new Metadata(), new > ParseContext()); > } catch (Exception e) { > e.printStackTrace(); > } > StringBuffer sb = new StringBuffer(); > new DOMContentUtils(conf).getTitle(sb, root); > Assert.assertEquals("Welcome!", sb.toString()); > } > } > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332)