[
https://issues.apache.org/jira/browse/TIKA-2041?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15393809#comment-15393809
]
Tim Allison commented on TIKA-2041:
-----------------------------------
Sorry. I agree. Had to run to a meeting before including the important part b
above. I'm able to reproduce this issue with multiple threads processing each
file only once.
Has anyone done enough googling, or used pure ICU4J, to figure out whether this
is a known ICU4J issue, or one they have already fixed in a more recent version?
> Charset detection doesn't appear to be thread-safe
> --------------------------------------------------
>
> Key: TIKA-2041
> URL: https://issues.apache.org/jira/browse/TIKA-2041
> Project: Tika
> Issue Type: Bug
> Reporter: Tim Allison
>
> On the user list, Christian Leitinger noted that his team found a potential
> issue with the thread safety of the encoding detector. I was able to
> reproduce this on the corpus of html files in [~faghani]'s encoding
> detector.
> {noformat}
> @Test
> public void testMultiThreadingEncodingDetection() throws Exception {
> Path testDocs = Paths.get("C:/data/encodings/corpus");
> List<Path> paths = new ArrayList<>();
> Map<Path, String> encodings = new ConcurrentHashMap<>();
> for (File encodingDirs : testDocs.toFile().listFiles()) {
> for (File file : encodingDirs.listFiles()) {
> String encoding = getEncoding(file.toPath());
> paths.add(file.toPath());
> encodings.put(file.toPath(), encoding);
> }
> }
> int numThreads = 1000;
> ExecutorService ex = Executors.newFixedThreadPool(numThreads);
> CompletionService<String> completionService =
> new ExecutorCompletionService<>(ex);
> for (int i = 0; i < numThreads; i++) {
> completionService.submit(new EncodingDetectorRunner(paths,
> encodings), "done");
> }
> int completed = 0;
> while (completed < numThreads) {
> Future<String> future = completionService.take();
> if (future.isDone() && "done".equals(future.get())) {
> completed++;
> }
> }
> assertTrue("success!", true);
> }
> private class EncodingDetectorRunner implements Runnable {
> private final List<Path> paths;
> private final Map<Path, String> encodings;
> private final Random r = new Random();
> private EncodingDetectorRunner(List<Path> paths, Map<Path, String>
> encodings) {
> this.paths = paths;
> this.encodings = encodings;
> }
> @Override
> public void run() {
> for (int i = 0; i < 100; i++) {
> int pInd = r.nextInt(paths.size());
> String detectedEncoding = null;
> try {
> detectedEncoding = getEncoding(paths.get(pInd));
> } catch (Exception e) {
> throw new RuntimeException(e);
> }
> String trueEncoding = encodings.get(paths.get(pInd));
> if (! detectedEncoding.equals(trueEncoding)) {
> throw new RuntimeException("detected: " +
> detectedEncoding +
> " but should have been: "+trueEncoding + " for "
> + paths.get(pInd));
> }
> }
> }
> }
> public String getEncoding(Path p) throws Exception {
> try (InputStream is = TikaInputStream.get(p)) {
> AutoDetectReader reader = new AutoDetectReader(is);
> String val = reader.getCharset().toString();
> if (val == null) {
> return "NULL";
> } else {
> return val;
> }
> }
> }
> {noformat}
> yields:
> {noformat}
> java.util.concurrent.ExecutionException: java.lang.RuntimeException: detected:
> ISO-8859-1 but should have been: windows-1252 for
> C:\data\encodings\corpus\Shift_JIS\1
> at java.util.concurrent.FutureTask.report(FutureTask.java:122)
> at java.util.concurrent.FutureTask.get(FutureTask.java:192)
> at org.apache.tika.parser.html.HtmlParserTest.testMultiThreadingEncodingDetection(HtmlParserTest.java:1213)
> {noformat}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)