Trần Tiến Đức created JAMES-3044:
------------------------------------
Summary: JsoupTextExtractor fails when parsing HTML
Key: JAMES-3044
URL: https://issues.apache.org/jira/browse/JAMES-3044
Project: James Server
Issue Type: Bug
Reporter: Trần Tiến Đức
We suspect that Jsoup version 1.12.1 is linked to this issue; see:
https://github.com/jhy/jsoup/issues/1250
Here is the stack trace:
{code:java}
java.io.IOException: Input is binary and unsupported
at org.jsoup.UncheckedIOException.<init>(UncheckedIOException.java:11)
at org.jsoup.parser.CharacterReader.<init>(CharacterReader.java:38)
at org.jsoup.parser.CharacterReader.<init>(CharacterReader.java:43)
at org.jsoup.parser.TreeBuilder.initialiseParse(TreeBuilder.java:38)
at
org.jsoup.parser.HtmlTreeBuilder.initialiseParse(HtmlTreeBuilder.java:65)
at org.jsoup.parser.TreeBuilder.parse(TreeBuilder.java:46)
at org.jsoup.parser.Parser.parseInput(Parser.java:35)
at org.jsoup.helper.DataUtil.parseInputStream(DataUtil.java:169)
at org.jsoup.helper.DataUtil.load(DataUtil.java:66)
at org.jsoup.Jsoup.parse(Jsoup.java:118)
at
org.apache.james.mailbox.store.extractor.JsoupTextExtractor.parseHtmlContent(JsoupTextExtractor.java:61)
at
org.apache.james.mailbox.store.extractor.JsoupTextExtractor.extractContent(JsoupTextExtractor.java:48)
at
org.apache.james.mailbox.elasticsearch.json.MimePart$Builder.extractText(MimePart.java:155)
at
org.apache.james.mailbox.elasticsearch.json.MimePart$Builder.parseContent(MimePart.java:145)
at
org.apache.james.mailbox.elasticsearch.json.MimePart$Builder.build(MimePart.java:130)
at
org.apache.james.mailbox.elasticsearch.json.MimePartParser.closeMimePart(MimePartParser.java:102)
at
org.apache.james.mailbox.elasticsearch.json.MimePartParser.processMimePart(MimePartParser.java:80)
at
org.apache.james.mailbox.elasticsearch.json.MimePartParser.parse(MimePartParser.java:61)
at
org.apache.james.mailbox.elasticsearch.json.IndexableMessage$Builder.instantiateIndexedMessage(IndexableMessage.java:109)
at
org.apache.james.mailbox.elasticsearch.json.IndexableMessage$Builder.build(IndexableMessage.java:75)
at
org.apache.james.mailbox.elasticsearch.json.MessageToElasticSearchJson.convertToJson(MessageToElasticSearchJson.java:69)
at
org.apache.james.mailbox.elasticsearch.events.ElasticSearchListeningMessageSearchIndex.generateIndexedJson(ElasticSearchListeningMessageSearchIndex.java:152)
at
org.apache.james.mailbox.elasticsearch.events.ElasticSearchListeningMessageSearchIndex.add(ElasticSearchListeningMessageSearchIndex.java:145)
at
org.apache.james.mailbox.store.search.ListeningMessageSearchIndex.lambda$handleAdded$1(ListeningMessageSearchIndex.java:100)
at
com.github.fge.lambdas.consumers.ConsumerChainer.lambda$sneakyThrow$9(ConsumerChainer.java:73)
at
java.util.stream.ForEachOps$ForEachOp$OfRef.accept(ForEachOps.java:183)
at java.util.Iterator.forEachRemaining(Iterator.java:116)
at
java.util.Spliterators$IteratorSpliterator.forEachRemaining(Spliterators.java:1801)
at
java.util.stream.ReferencePipeline$Head.forEach(ReferencePipeline.java:647)
at
java.util.stream.ReferencePipeline$7$1.accept(ReferencePipeline.java:272)
at
java.util.ArrayList$ArrayListSpliterator.forEachRemaining(ArrayList.java:1382)
at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482)
at
java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472)
at
java.util.stream.ForEachOps$ForEachOp.evaluateSequential(ForEachOps.java:150)
at
java.util.stream.ForEachOps$ForEachOp$OfRef.evaluateSequential(ForEachOps.java:173)
at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)
at
java.util.stream.ReferencePipeline.forEach(ReferencePipeline.java:485)
at
org.apache.james.mailbox.store.search.ListeningMessageSearchIndex.handleAdded(ListeningMessageSearchIndex.java:100)
at
org.apache.james.mailbox.store.search.ListeningMessageSearchIndex.handleMailboxEvent(ListeningMessageSearchIndex.java:82)
at
org.apache.james.mailbox.store.search.ListeningMessageSearchIndex.event(ListeningMessageSearchIndex.java:72)
at
org.apache.james.mailbox.events.MailboxListenerExecutor.execute(MailboxListenerExecutor.java:41)
at
org.apache.james.mailbox.events.GroupRegistration.runListener(GroupRegistration.java:152)
at
org.apache.james.mailbox.events.GroupRegistration.lambda$deliver$2(GroupRegistration.java:142)
at
com.github.fge.lambdas.runnable.RunnableChainer.doRun(RunnableChainer.java:18)
at
com.github.fge.lambdas.runnable.ThrowingRunnable.run(ThrowingRunnable.java:16)
at reactor.core.publisher.MonoRunnable.call(MonoRunnable.java:73)
at reactor.core.publisher.MonoRunnable.call(MonoRunnable.java:32)
at
reactor.core.publisher.MonoFlatMap$FlatMapMain.onNext(MonoFlatMap.java:132)
at
reactor.core.publisher.FluxSubscribeOnValue$ScheduledScalar.run(FluxSubscribeOnValue.java:178)
at reactor.core.scheduler.SchedulerTask.call(SchedulerTask.java:68)
at reactor.core.scheduler.SchedulerTask.call(SchedulerTask.java:28)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
at
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
{code}
--
This message was sent by Atlassian Jira
(v8.3.4#803005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]