[ https://issues.apache.org/jira/browse/TIKA-4245?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17840908#comment-17840908 ]
Tilman Hausherr commented on TIKA-4245: --------------------------------------- Happens also with the tika app GUI. > Tika does not get html content properly > ---------------------------------------- > > Key: TIKA-4245 > URL: https://issues.apache.org/jira/browse/TIKA-4245 > Project: Tika > Issue Type: Bug > Reporter: Xiaohong Yang > Priority: Major > Attachments: Sample html file and tika config xml.zip > > > We use org.apache.tika.parser.AutoDetectParser to get the content of html > files. And we found out that it does not get the content of the sample file > properly. > Following is the sample code and attached is the tika-config.xml and the > sample html file. The content extracted with Tika reads > "㱨瑭氠硭汮猺景㴢桴瑰㨯⽷睷㌮潲术ㄹ㤹⽘卌⽆潲浡琢㸍ਉ़桥慤㸼䵅呁瑴瀭敱畩瘽≃潮瑥湴ⵔ祰攢潮瑥湴㴢瑥硴…". That is different > from the native file. > > > The operating system is Ubuntu 20.04. Java version is 21. Tika version is > 2.9.2. > {code:java} > import org.apache.commons.io.FileUtils; > import org.apache.tika.config.TikaConfig; > import org.apache.tika.metadata.Metadata; > import org.apache.tika.parser.AutoDetectParser; > import org.apache.tika.parser.ParseContext; > import org.apache.tika.parser.Parser; > import org.apache.tika.sax.BodyContentHandler; > > import java.io.File; > import java.io.FileInputStream; > import java.io.PrintWriter; > import java.nio.file.Files; > import java.nio.file.Path; > import java.nio.file.Paths; > > public class ExtractTxtFromHtml { > private static final Path inputFile = new > File("/home/ubuntu/testdirs/testdir_html/451434.html").toPath(); > > public static void main(String args[]) { > extactText(false); > extactText(true); > } > > static void extactText(boolean largeFile) { > PrintWriter outputFileWriter = null; > try { > BodyContentHandler handler; > Path outputFilePath = null; > > if (largeFile) { > // write tika output to disk > outputFilePath = > Paths.get("/home/ubuntu/testdirs/testdir_html/tika_parse_output.txt"); > outputFileWriter = new > 
PrintWriter(Files.newOutputStream(outputFilePath)); > handler = new BodyContentHandler(outputFileWriter); > } else { > // stream it in memory > handler = new BodyContentHandler(-1); > } > > Metadata metadata = new Metadata(); > FileInputStream inputData = new > FileInputStream(inputFile.toFile()); > TikaConfig config = new > TikaConfig("/home/ubuntu/testdirs/testdir_html/tika-config.xml"); > Parser autoDetectParser = new AutoDetectParser(config); > ParseContext context = new ParseContext(); > context.set(TikaConfig.class, config); > autoDetectParser.parse(inputData, handler, metadata, context); > > String content; > if (largeFile) { > content = FileUtils.readFileToString(outputFilePath.toFile()); > } > else { > content = handler.toString(); > } > System.out.println("content = " + content); > } > catch(Exception ex) { > ex.printStackTrace(); > } finally { > if (outputFileWriter != null) { > outputFileWriter.close(); > } > } > } > } > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010)