PDFBox - Read pdf file line by line using C#.Net
Hi guys,
I use the code below to read a pdf file.
The code works fine. The problem is that I have to read the PDF
line by line, not as "one big string".
I need this because the text is complex, and I have to
apply some filters to each line as it is read from the original.
How can I work around this?
Thanks in advance,
Aldo.
Note: The code below is C#.Net - Visual Studio 2008.
using System;
using System.IO;
using System.Windows.Forms;
using System.Collections;
using java.io;
using org.pdfbox.pdmodel;
using org.pdfbox.util;
using System.Text;
namespace Pdf2Text
{
    /// <summary>
    /// Small utility that lets the user pick a PDF file, extracts its text
    /// with PDFBox and writes the text to a file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point: asks for a PDF file, then writes its extracted text
        /// to "x.txt" inside the initial directory.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        [STAThread]
        static void Main(string[] args)
        {
            string initialDir = @"C:\...\Pdf_Files\";
            string fileOut = initialDir + "x.txt";
            string fileIn;

            // The dialog is IDisposable, so release it as soon as the
            // selection has been read.
            using (OpenFileDialog ofd = new OpenFileDialog())
            {
                ofd.InitialDirectory = initialDir;
                if (ofd.ShowDialog() != DialogResult.OK)
                {
                    // The user cancelled: there is no input file to process.
                    return;
                }
                fileIn = ofd.FileName;
            }

            // Get file encoding
            System.Text.Encoding encIn = MyFileStream.GetFileEncoding(fileIn);
            System.Text.Encoding encOut = System.Text.Encoding.Unicode;

            // Read from PDF.
            WriteToFile(fileIn, fileOut, encIn, encOut);
        }

        /// <summary>
        /// Extracts the text of a PDF file and writes it, normalized, to an
        /// output file.
        /// </summary>
        /// <param name="fileIn">Path of the PDF file to read.</param>
        /// <param name="fileOut">Path of the text file to create or overwrite.</param>
        /// <param name="encIn">
        /// Encoding detected for the input file. Only its name is forwarded to
        /// <see cref="ParseUsingPDFBox"/>, which does not use it.
        /// </param>
        /// <param name="encOut">Encoding used for the output file.</param>
        public static void WriteToFile(string fileIn, string fileOut,
            Encoding encIn, Encoding encOut)
        {
            // Extract first, so that a failed extraction does not leave an
            // empty or truncated output file behind.
            string encName = (encIn == null) ? null : encIn.WebName;
            string text = ParseUsingPDFBox(fileIn, encName);

            // Normalize text.
            text = text.Normalize();

            // To filter the text line by line, iterate over
            // SplitIntoLines(text) here instead of writing it in one piece.
            using (FileStream fs = new FileStream(fileOut,
                FileMode.Create, FileAccess.Write))
            {
                using (StreamWriter sw = new StreamWriter(fs, encOut))
                {
                    sw.Write(text);
                }
            }
        }

        /// <summary>
        /// Extracts the whole text of a PDF document with PDFBox.
        /// </summary>
        /// <param name="input">Path of the PDF file.</param>
        /// <param name="encName">
        /// Kept for backward compatibility and ignored. A PDF is a binary
        /// format, so the raw stream is handed to PDFBox without going through
        /// a character reader.
        /// </param>
        /// <returns>The text of the document as a single string.</returns>
        public static string ParseUsingPDFBox(string input, string encName)
        {
            java.io.InputStream iStream = new java.io.FileInputStream(input);
            try
            {
                // Load from the untouched stream. Reading from it beforehand
                // (even through a wrapping reader) moves it past the start of
                // the PDF and breaks parsing.
                PDDocument doc = PDDocument.load(iStream);
                try
                {
                    PDFTextStripper stripper = new PDFTextStripper();
                    return stripper.getText(doc);
                }
                finally
                {
                    // Release the resources held by the document.
                    doc.close();
                }
            }
            finally
            {
                iStream.close();
            }
        }

        /// <summary>
        /// Splits extracted text into its individual lines so that each line
        /// can be inspected or filtered on its own.
        /// </summary>
        /// <param name="text">Text as returned by <see cref="ParseUsingPDFBox"/>.</param>
        /// <returns>
        /// The lines of <paramref name="text"/> without their line terminators;
        /// an empty array when <paramref name="text"/> is null or empty.
        /// </returns>
        public static string[] SplitIntoLines(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return new string[0];
            }

            // "\r\n" comes first so a Windows line break is not split twice.
            return text.Split(new string[] { "\r\n", "\n", "\r" },
                StringSplitOptions.None);
        }
    }
}