[ 
https://issues.apache.org/jira/browse/PDFBOX-3581?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Dmitri Russu updated PDFBOX-3581:
---------------------------------
    Description: 
Hi, I try to use pdfbox to extract text from a list of files, the problem is 
PDFTextStripper does not work on thread mode, when I try to use it in 
multythread nothing happens. it is a bug or limitation? 

could you help me ?
 thanks


I have added the full class

{code}
 private void scanFolderFiles(File scanDirectory) {
        File[] filesScan = scanDirectory.listFiles();

        if ( filesScan.length > 0 ) {
            int iterator=0;
            for (final File fileEntry : filesScan) {

           
                if (fileEntry.isDirectory()) {
                    scanFolderFiles(fileEntry);
                } else {
                    try {
                       new PDFExtractThread(fileEntry).start();
                    }
                    catch (Exception e) {
                        e.printStackTrace();
                    }
                }

                iterator++;
            }
        }
    }


////////////////////////////////////////////////////////////////////////////////////////////
package org.pdfextractor;


import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import java.io.*;

/**
 * Created by dru on 21.11.2016.
 */
class PDFExtractThread extends Thread {

    private String fileName;
    private File readFile;
    private PDDocument document;

    public PDFExtractThread(File readFile) {

        try {

            this.readFile = readFile;

        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }


    public void run() {

        try {

            //get FileName
            this.readFile.setWritable(true);
            this.document = PDDocument.load(this.readFile);
            this.fileName = (new 
String(this.readFile.getName()).toLowerCase().replace(".pdf", ""));

            pdfBoxExtractText();
            //Closing the document
            this.document.close();

        } catch (IOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }


    public void pdfBoxExtractText() throws Exception {
        //Retrieving text from PDF document
        PDFTextStripper pdfStripper = new PDFTextStripper();
        System.out.println(this.fileName);
        FileWriter fileWriter = new FileWriter(this.fileName+".txt");
        BufferedWriter writer = new BufferedWriter(fileWriter);
        String text = pdfStripper.getText(this.document);

        writer.write(text);
        writer.close();
    }
}

{code}



  was:
Hi, I try to use pdfbox to extract text from a list of files, the problem is 
PDFTextStripper does not work on thread mode, when I try to use it in 
multythread nothing happens. it is a bug or limitation? 

could you help me ?
 thanks


I have added the full class

{code}
package org.pdfextractor;


import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import java.io.*;

/**
 * Created by dru on 21.11.2016.
 */
class PDFExtractThread extends Thread {

    private String fileName;
    private File readFile;
    private PDDocument document;

    public PDFExtractThread(File readFile) {

        try {

            this.readFile = readFile;

        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }


    public void run() {

        try {

            //get FileName
            this.readFile.setWritable(true);
            this.document = PDDocument.load(this.readFile);
            this.fileName = (new 
String(this.readFile.getName()).toLowerCase().replace(".pdf", ""));

            pdfBoxExtractText();
            //Closing the document
            this.document.close();

        } catch (IOException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }


    public void pdfBoxExtractText() throws Exception {
        //Retrieving text from PDF document
        PDFTextStripper pdfStripper = new PDFTextStripper();
        System.out.println(this.fileName);
        FileWriter fileWriter = new FileWriter(this.fileName+".txt");
        BufferedWriter writer = new BufferedWriter(fileWriter);
        String text = pdfStripper.getText(this.document);

        writer.write(text);
        writer.close();
    }
}

{code}




> PDFTextStripper not working with multiple threads
> -------------------------------------------------
>
>                 Key: PDFBOX-3581
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-3581
>             Project: PDFBox
>          Issue Type: Bug
>          Components: Text extraction
>    Affects Versions: 2.0.3
>         Environment: Ubuntu 15.1
>            Reporter: Dmitri Russu
>              Labels: multithreading
>
> Hi, I try to use pdfbox to extract text from a list of files, the problem is 
> PDFTextStripper does not work on thread mode, when I try to use it in 
> multythread nothing happens. it is a bug or limitation? 
> could you help me ?
>  thanks
> I have added the full class
> {code}
>  private void scanFolderFiles(File scanDirectory) {
>         File[] filesScan = scanDirectory.listFiles();
>         if ( filesScan.length > 0 ) {
>             int iterator=0;
>             for (final File fileEntry : filesScan) {
>            
>                 if (fileEntry.isDirectory()) {
>                     scanFolderFiles(fileEntry);
>                 } else {
>                     try {
>                        new PDFExtractThread(fileEntry).start();
>                     }
>                     catch (Exception e) {
>                         e.printStackTrace();
>                     }
>                 }
>                 iterator++;
>             }
>         }
>     }
> ////////////////////////////////////////////////////////////////////////////////////////////
> package org.pdfextractor;
> import org.apache.pdfbox.pdmodel.PDDocument;
> import org.apache.pdfbox.text.PDFTextStripper;
> import java.io.*;
> /**
>  * Created by dru on 21.11.2016.
>  */
> class PDFExtractThread extends Thread {
>     private String fileName;
>     private File readFile;
>     private PDDocument document;
>     public PDFExtractThread(File readFile) {
>         try {
>             this.readFile = readFile;
>         } catch (Exception e) {
>             e.printStackTrace();
>             System.exit(1);
>         }
>     }
>     public void run() {
>         try {
>             //get FileName
>             this.readFile.setWritable(true);
>             this.document = PDDocument.load(this.readFile);
>             this.fileName = (new 
> String(this.readFile.getName()).toLowerCase().replace(".pdf", ""));
>             pdfBoxExtractText();
>             //Closing the document
>             this.document.close();
>         } catch (IOException e) {
>             e.printStackTrace();
>         } catch (Exception e) {
>             e.printStackTrace();
>         }
>     }
>     public void pdfBoxExtractText() throws Exception {
>         //Retrieving text from PDF document
>         PDFTextStripper pdfStripper = new PDFTextStripper();
>         System.out.println(this.fileName);
>         FileWriter fileWriter = new FileWriter(this.fileName+".txt");
>         BufferedWriter writer = new BufferedWriter(fileWriter);
>         String text = pdfStripper.getText(this.document);
>         writer.write(text);
>         writer.close();
>     }
> }
> {code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscr...@pdfbox.apache.org
For additional commands, e-mail: dev-h...@pdfbox.apache.org

Reply via email to