Trần Tiến Đức created JAMES-3044:
------------------------------------

             Summary: JsoupTextExtractor fails to parse HTML
                 Key: JAMES-3044
                 URL: https://issues.apache.org/jira/browse/JAMES-3044
             Project: James Server
          Issue Type: Bug
            Reporter: Trần Tiến Đức


We suspect that Jsoup version 1.12.1 is related to this issue; see the upstream report:
https://github.com/jhy/jsoup/issues/1250

 

Here is the stack trace:
{code:java}
        java.io.IOException: Input is binary and unsupported
        at org.jsoup.UncheckedIOException.<init>(UncheckedIOException.java:11)
        at org.jsoup.parser.CharacterReader.<init>(CharacterReader.java:38)
        at org.jsoup.parser.CharacterReader.<init>(CharacterReader.java:43)
        at org.jsoup.parser.TreeBuilder.initialiseParse(TreeBuilder.java:38)
        at 
org.jsoup.parser.HtmlTreeBuilder.initialiseParse(HtmlTreeBuilder.java:65)
        at org.jsoup.parser.TreeBuilder.parse(TreeBuilder.java:46)
        at org.jsoup.parser.Parser.parseInput(Parser.java:35)
        at org.jsoup.helper.DataUtil.parseInputStream(DataUtil.java:169)
        at org.jsoup.helper.DataUtil.load(DataUtil.java:66)
        at org.jsoup.Jsoup.parse(Jsoup.java:118)
        at 
org.apache.james.mailbox.store.extractor.JsoupTextExtractor.parseHtmlContent(JsoupTextExtractor.java:61)
        at 
org.apache.james.mailbox.store.extractor.JsoupTextExtractor.extractContent(JsoupTextExtractor.java:48)
        at 
org.apache.james.mailbox.elasticsearch.json.MimePart$Builder.extractText(MimePart.java:155)
        at 
org.apache.james.mailbox.elasticsearch.json.MimePart$Builder.parseContent(MimePart.java:145)
        at 
org.apache.james.mailbox.elasticsearch.json.MimePart$Builder.build(MimePart.java:130)
        at 
org.apache.james.mailbox.elasticsearch.json.MimePartParser.closeMimePart(MimePartParser.java:102)
        at 
org.apache.james.mailbox.elasticsearch.json.MimePartParser.processMimePart(MimePartParser.java:80)
        at 
org.apache.james.mailbox.elasticsearch.json.MimePartParser.parse(MimePartParser.java:61)
        at 
org.apache.james.mailbox.elasticsearch.json.IndexableMessage$Builder.instantiateIndexedMessage(IndexableMessage.java:109)
        at 
org.apache.james.mailbox.elasticsearch.json.IndexableMessage$Builder.build(IndexableMessage.java:75)
        at 
org.apache.james.mailbox.elasticsearch.json.MessageToElasticSearchJson.convertToJson(MessageToElasticSearchJson.java:69)
        at 
org.apache.james.mailbox.elasticsearch.events.ElasticSearchListeningMessageSearchIndex.generateIndexedJson(ElasticSearchListeningMessageSearchIndex.java:152)
        at 
org.apache.james.mailbox.elasticsearch.events.ElasticSearchListeningMessageSearchIndex.add(ElasticSearchListeningMessageSearchIndex.java:145)
        at 
org.apache.james.mailbox.store.search.ListeningMessageSearchIndex.lambda$handleAdded$1(ListeningMessageSearchIndex.java:100)
        at 
com.github.fge.lambdas.consumers.ConsumerChainer.lambda$sneakyThrow$9(ConsumerChainer.java:73)
        at 
java.util.stream.ForEachOps$ForEachOp$OfRef.accept(ForEachOps.java:183)
        at java.util.Iterator.forEachRemaining(Iterator.java:116)
        at 
java.util.Spliterators$IteratorSpliterator.forEachRemaining(Spliterators.java:1801)
        at 
java.util.stream.ReferencePipeline$Head.forEach(ReferencePipeline.java:647)
        at 
java.util.stream.ReferencePipeline$7$1.accept(ReferencePipeline.java:272)
        at 
java.util.ArrayList$ArrayListSpliterator.forEachRemaining(ArrayList.java:1382)
        at java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:482)
        at 
java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:472)
        at 
java.util.stream.ForEachOps$ForEachOp.evaluateSequential(ForEachOps.java:150)
        at 
java.util.stream.ForEachOps$ForEachOp$OfRef.evaluateSequential(ForEachOps.java:173)
        at java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)
        at 
java.util.stream.ReferencePipeline.forEach(ReferencePipeline.java:485)
        at 
org.apache.james.mailbox.store.search.ListeningMessageSearchIndex.handleAdded(ListeningMessageSearchIndex.java:100)
        at 
org.apache.james.mailbox.store.search.ListeningMessageSearchIndex.handleMailboxEvent(ListeningMessageSearchIndex.java:82)
        at 
org.apache.james.mailbox.store.search.ListeningMessageSearchIndex.event(ListeningMessageSearchIndex.java:72)
        at 
org.apache.james.mailbox.events.MailboxListenerExecutor.execute(MailboxListenerExecutor.java:41)
        at 
org.apache.james.mailbox.events.GroupRegistration.runListener(GroupRegistration.java:152)
        at 
org.apache.james.mailbox.events.GroupRegistration.lambda$deliver$2(GroupRegistration.java:142)
        at 
com.github.fge.lambdas.runnable.RunnableChainer.doRun(RunnableChainer.java:18)
        at 
com.github.fge.lambdas.runnable.ThrowingRunnable.run(ThrowingRunnable.java:16)
        at reactor.core.publisher.MonoRunnable.call(MonoRunnable.java:73)
        at reactor.core.publisher.MonoRunnable.call(MonoRunnable.java:32)
        at 
reactor.core.publisher.MonoFlatMap$FlatMapMain.onNext(MonoFlatMap.java:132)
        at 
reactor.core.publisher.FluxSubscribeOnValue$ScheduledScalar.run(FluxSubscribeOnValue.java:178)
        at reactor.core.scheduler.SchedulerTask.call(SchedulerTask.java:68)
        at reactor.core.scheduler.SchedulerTask.call(SchedulerTask.java:28)
        at java.util.concurrent.FutureTask.run(FutureTask.java:266)
        at 
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
        at 
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

{code}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to