Hi,
I am trying to extract text that has been highlighted in different colors
inside a PDF file. I can call "getColour()" on a PDAnnotation object, but
the problem is that it returns a PDGamma object. How can I convert that
PDGamma value into a human-readable English color name, such as "Yellow",
"Red", "Blue", etc.?
Below is the sample code I wrote to extract the text and print the returned color.
Any hints or simple code examples would be appreciated.
Cheers,
Nick
====================================================================
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.util.PDFTextStripperByArea;
/**
 * Sample program that opens "sample.pdf" and, for every annotation on every
 * page, prints the annotation colour, the text found inside the annotation's
 * rectangle, and the annotation's contents.
 */
public class ExtractHighlights {

    /** Name under which the single extraction region is registered on the stripper. */
    private static final String REGION_NAME = "0";

    /**
     * Entry point. Loads "sample.pdf" from the working directory and reports on
     * every annotation of every page. Any exception is printed to stderr.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String args[]) {
        try {
            PDDocument pddDocument = PDDocument.load(new File("sample.pdf"));
            try {
                List allPages = pddDocument.getDocumentCatalog().getAllPages();
                for (int i = 0; i < allPages.size(); i++) {
                    PDPage page = (PDPage) allPages.get(i);
                    List<PDAnnotation> annotations = page.getAnnotations();
                    // Visit every annotation on the page, not just the first one,
                    // so that pages with several highlights are fully reported.
                    for (PDAnnotation annotation : annotations) {
                        printAnnotation(page, annotation);
                    }
                }
            } finally {
                // Close the document even when extraction fails part-way through.
                pddDocument.close();
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Prints the colour, the text under the rectangle, and the contents of one
     * annotation.
     *
     * @param page the page that owns the annotation
     * @param pdfAnnot the annotation to report on
     * @throws Exception if PDFBox fails while reading the page or extracting text
     */
    private static void printAnnotation(PDPage page, PDAnnotation pdfAnnot) throws Exception {
        System.out.println("Color = " + pdfAnnot.getColour());

        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        stripper.setSortByPosition(true);

        // Pad the annotation rectangle slightly so text at its edges is included.
        PDRectangle rect = pdfAnnot.getRectangle();
        float x = rect.getLowerLeftX() - 1;
        float y = rect.getUpperRightY() - 1;
        float width = rect.getWidth() + 2;
        float height = rect.getHeight() + rect.getHeight() / 4;

        // For unrotated pages, flip y against the page height.
        // NOTE(review): presumably this converts from the PDF bottom-left origin
        // to a top-left origin expected by the stripper; rotated pages are left
        // untouched here -- confirm whether they need their own transform.
        int rotation = page.findRotation();
        if (rotation == 0) {
            PDRectangle pageSize = page.findMediaBox();
            y = pageSize.getHeight() - y;
        }

        Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height);
        stripper.addRegion(REGION_NAME, awtRect);
        stripper.extractRegions(page);

        System.out.println("Getting text from region = " + awtRect + "\n");
        System.out.println(stripper.getTextForRegion(REGION_NAME));
        System.out.println("Getting text from comment = " + pdfAnnot.getContents());
    }
}