[ https://issues.apache.org/jira/browse/PDFBOX-3581?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Dmitri Russu updated PDFBOX-3581: --------------------------------- Description: Hi, I try to use pdfbox to extract text from a list of files, the problem is PDFTextStripper does not work on thread mode, when I try to use it in multythread nothing happens. it is a bug or limitation? could you help me ? thanks I have added the full class {code} private void scanFolderFiles(File scanDirectory) { File[] filesScan = scanDirectory.listFiles(); if ( filesScan.length > 0 ) { int iterator=0; for (final File fileEntry : filesScan) { if (fileEntry.isDirectory()) { scanFolderFiles(fileEntry); } else { try { new PDFExtractThread(fileEntry).start(); } catch (Exception e) { e.printStackTrace(); } } iterator++; } } } //////////////////////////////////////////////////////////////////////////////////////////// package org.pdfextractor; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import java.io.*; /** * Created by dru on 21.11.2016. */ class PDFExtractThread extends Thread { private String fileName; private File readFile; private PDDocument document; public PDFExtractThread(File readFile) { try { this.readFile = readFile; } catch (Exception e) { e.printStackTrace(); System.exit(1); } } public void run() { try { //get FileName this.readFile.setWritable(true); this.document = PDDocument.load(this.readFile); this.fileName = (new String(this.readFile.getName()).toLowerCase().replace(".pdf", "")); pdfBoxExtractText(); //Closing the document this.document.close(); } catch (IOException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } } public void pdfBoxExtractText() throws Exception { //Retrieving text from PDF document PDFTextStripper pdfStripper = new PDFTextStripper(); System.out.println(this.fileName); FileWriter fileWriter = new FileWriter(this.fileName+".txt"); BufferedWriter writer = new BufferedWriter(fileWriter); String text = pdfStripper.getText(this.document); writer.write(text); writer.close(); } } {code} was: Hi, I try to use pdfbox to extract text from a list of files, the problem is PDFTextStripper does not work on thread mode, when I try to use it in multythread nothing happens. it is a bug or limitation? could you help me ? thanks I have added the full class {code} package org.pdfextractor; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import java.io.*; /** * Created by dru on 21.11.2016. */ class PDFExtractThread extends Thread { private String fileName; private File readFile; private PDDocument document; public PDFExtractThread(File readFile) { try { this.readFile = readFile; } catch (Exception e) { e.printStackTrace(); System.exit(1); } } public void run() { try { //get FileName this.readFile.setWritable(true); this.document = PDDocument.load(this.readFile); this.fileName = (new String(this.readFile.getName()).toLowerCase().replace(".pdf", "")); pdfBoxExtractText(); //Closing the document this.document.close(); } catch (IOException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } } public void pdfBoxExtractText() throws Exception { //Retrieving text from PDF document PDFTextStripper pdfStripper = new PDFTextStripper(); System.out.println(this.fileName); FileWriter fileWriter = new FileWriter(this.fileName+".txt"); BufferedWriter writer = new BufferedWriter(fileWriter); String text = pdfStripper.getText(this.document); writer.write(text); writer.close(); } } {code} > PDFTextStripper not working with multiple threads > ------------------------------------------------- > > Key: PDFBOX-3581 > URL: https://issues.apache.org/jira/browse/PDFBOX-3581 > Project: PDFBox > Issue Type: Bug > Components: Text extraction > Affects Versions: 2.0.3 > Environment: Ubuntu 15.1 > Reporter: Dmitri Russu > Labels: multithreading > > Hi, I try to use pdfbox to extract text from a list of files, the problem is > PDFTextStripper does not work on thread mode, when I try to use it in > multythread nothing happens. it is a bug or limitation? > could you help me ? > thanks > I have added the full class > {code} > private void scanFolderFiles(File scanDirectory) { > File[] filesScan = scanDirectory.listFiles(); > if ( filesScan.length > 0 ) { > int iterator=0; > for (final File fileEntry : filesScan) { > > if (fileEntry.isDirectory()) { > scanFolderFiles(fileEntry); > } else { > try { > new PDFExtractThread(fileEntry).start(); > } > catch (Exception e) { > e.printStackTrace(); > } > } > iterator++; > } > } > } > //////////////////////////////////////////////////////////////////////////////////////////// > package org.pdfextractor; > import org.apache.pdfbox.pdmodel.PDDocument; > import org.apache.pdfbox.text.PDFTextStripper; > import java.io.*; > /** > * Created by dru on 21.11.2016. > */ > class PDFExtractThread extends Thread { > private String fileName; > private File readFile; > private PDDocument document; > public PDFExtractThread(File readFile) { > try { > this.readFile = readFile; > } catch (Exception e) { > e.printStackTrace(); > System.exit(1); > } > } > public void run() { > try { > //get FileName > this.readFile.setWritable(true); > this.document = PDDocument.load(this.readFile); > this.fileName = (new > String(this.readFile.getName()).toLowerCase().replace(".pdf", "")); > pdfBoxExtractText(); > //Closing the document > this.document.close(); > } catch (IOException e) { > e.printStackTrace(); > } catch (Exception e) { > e.printStackTrace(); > } > } > public void pdfBoxExtractText() throws Exception { > //Retrieving text from PDF document > PDFTextStripper pdfStripper = new PDFTextStripper(); > System.out.println(this.fileName); > FileWriter fileWriter = new FileWriter(this.fileName+".txt"); > BufferedWriter writer = new BufferedWriter(fileWriter); > String text = pdfStripper.getText(this.document); > writer.write(text); > writer.close(); > } > } > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@pdfbox.apache.org For additional commands, e-mail: dev-h...@pdfbox.apache.org