Author: lehmi Date: Thu Sep 30 18:16:49 2010 New Revision: 1003195 URL: http://svn.apache.org/viewvc?rev=1003195&view=rev Log: PDFBOX-828: fixed some issues with positioning when extracting or rendering text
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/TextPosition.java pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/GSave.java pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf-sorted.txt pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf.txt Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java?rev=1003195&r1=1003194&r2=1003195&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFStreamEngine.java Thu Sep 30 18:16:49 2010 @@ -19,7 +19,6 @@ package org.apache.pdfbox.util; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.Enumeration; import java.util.HashMap; @@ -33,6 +32,7 @@ import java.util.Stack; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSObject; import org.apache.pdfbox.cos.COSStream; import org.apache.pdfbox.exceptions.WrappedIOException; @@ -43,7 +43,10 @@ import org.apache.pdfbox.pdmodel.PDResou import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.graphics.PDExtendedGraphicsState; import org.apache.pdfbox.pdmodel.graphics.PDGraphicsState; +import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace; +import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject; import org.apache.pdfbox.util.operator.OperatorProcessor; @@ -74,15 +77,15 @@ public class PDFStreamEngine private Matrix textMatrix = null; private 
Matrix textLineMatrix = null; - private Stack graphicsStack = new Stack(); + private Stack<PDGraphicsState> graphicsStack = new Stack<PDGraphicsState>(); - private Map operators = new HashMap(); + private Map<String,OperatorProcessor> operators = new HashMap<String,OperatorProcessor>(); - private Stack streamResourcesStack = new Stack(); + private Stack<StreamResources> streamResourcesStack = new Stack<StreamResources>(); private PDPage page; - private Map documentFontCache = new HashMap(); + private Map<String,PDFont> documentFontCache = new HashMap<String,PDFont>(); private int validCharCnt; private int totalCharCnt; @@ -93,10 +96,10 @@ public class PDFStreamEngine */ private static class StreamResources { - private Map fonts; - private Map colorSpaces; - private Map xobjects; - private Map graphicsStates; + private Map<String,PDFont> fonts; + private Map<String,PDColorSpace> colorSpaces; + private Map<String,PDXObject> xobjects; + private Map<String,PDExtendedGraphicsState> graphicsStates; private PDResources resources; private StreamResources() @@ -233,7 +236,7 @@ public class PDFStreamEngine } try { - List arguments = new ArrayList(); + List<COSBase> arguments = new ArrayList<COSBase>(); parser = new PDFStreamParser( cosStream ); Iterator<Object> iter = parser.getTokenIterator(); @@ -252,7 +255,7 @@ public class PDFStreamEngine } else { - arguments.add( next ); + arguments.add( (COSBase)next ); } if(log.isDebugEnabled()) { @@ -331,11 +334,13 @@ public class PDFStreamEngine //this might be a different number final float glyphSpaceToTextSpaceFactor = 1f/font.getFontMatrix().getValue( 0, 0 ); float spaceWidthText=0; - - try{ // to avoid crash as described in PDFBOX-614 + try + { + // to avoid crash as described in PDFBOX-614 // lets see what the space displacement should be spaceWidthText = (font.getFontWidth( SPACE_BYTES, 0, 1 )/glyphSpaceToTextSpaceFactor); - }catch (Throwable exception) + } + catch (Throwable exception) { log.warn( exception, exception); } @@ 
-348,38 +353,21 @@ public class PDFStreamEngine spaceWidthText *= .80f; } - - /* Convert textMatrix to display units */ - final Matrix initialMatrix = new Matrix(); - initialMatrix.setValue(0,0,1); - initialMatrix.setValue(0,1,0); - initialMatrix.setValue(0,2,0); - initialMatrix.setValue(1,0,0); - initialMatrix.setValue(1,1,1); - initialMatrix.setValue(1,2,0); - initialMatrix.setValue(2,0,0); - initialMatrix.setValue(2,1,riseText); - initialMatrix.setValue(2,2,1); - - final Matrix ctm = graphicsState.getCurrentTransformationMatrix(); - final Matrix dispMatrix = initialMatrix.multiply( ctm ); - - Matrix textMatrixStDisp = textMatrix.multiply( dispMatrix ); - Matrix textMatrixEndDisp = null; - - final float xScaleDisp = textMatrixStDisp.getXScale(); - final float yScaleDisp = textMatrixStDisp.getYScale(); - - final float spaceWidthDisp = spaceWidthText * xScaleDisp * fontSizeText; - final float wordSpacingDisp = wordSpacingText * xScaleDisp * fontSizeText; + final float spaceWidthDisp = spaceWidthText * fontSizeText * horizontalScalingText; float maxVerticalDisplacementText = 0; - float[] individualWidthsBuffer = new float[string.length]; - StringBuilder characterBuffer = new StringBuilder(string.length); + Matrix textStateParameters = new Matrix(); + textStateParameters.setValue(0,0, fontSizeText*horizontalScalingText); + textStateParameters.setValue(1,1, fontSizeText); + textStateParameters.setValue(2,1, riseText); + + int pageRotation = page.findRotation(); + float pageHeight = page.findMediaBox().getHeight(); + float pageWidth = page.findMediaBox().getWidth(); int codeLength = 1; - for( int i=0; i<string.length; i+=codeLength ) + for( int i=0; i<string.length; i+=codeLength) { // Decode the value to a Unicode character codeLength = 1; @@ -390,16 +378,14 @@ public class PDFStreamEngine codeLength++; c = font.encode( string, i, codeLength ); } - c = inspectFontEncoding(c); //todo, handle horizontal displacement // get the width and height of this character in text 
units - float characterHorizontalDisplacementText = - (font.getFontWidth( string, i, codeLength )/glyphSpaceToTextSpaceFactor); + float characterHorizontalDisplacementText = (font.getFontWidth( string, i, codeLength )/1000f); maxVerticalDisplacementText = Math.max( maxVerticalDisplacementText, - font.getFontHeight( string, i, codeLength)/glyphSpaceToTextSpaceFactor); + characterHorizontalDisplacementText); // PDF Spec - 5.5.2 Word Spacing // @@ -419,24 +405,17 @@ public class PDFStreamEngine // applying word spacing to either the non-32 space or to the character // code 32 non-space resulted in errors consistent with this interpretation. // - float spacingText = characterSpacingText; + float spacingText = 0; if( (string[i] == 0x20) && codeLength == 1 ) { spacingText += wordSpacingText; } - - /* The text matrix gets updated after each glyph is placed. The updated - * version will have the X and Y coordinates for the next glyph. - */ - Matrix glyphMatrixStDisp = textMatrix.multiply( dispMatrix ); - - //The adjustment will always be zero. The adjustment as shown in the - //TJ operator will be handled separately. - float adjustment=0; + // Convert textMatrix to display units + Matrix textMatrixStart = textStateParameters.copy().multiply(textMatrix).multiply(getGraphicsState().getCurrentTransformationMatrix()); + // TODO : tx should be set for horizontal text and ty for vertical text // which seems to be specified in the font (not the direction in the matrix). 
- float tx = ((characterHorizontalDisplacementText-adjustment/glyphSpaceToTextSpaceFactor)*fontSizeText) - * horizontalScalingText; + float tx = ((characterHorizontalDisplacementText)*fontSizeText+characterSpacingText+spacingText)*horizontalScalingText; float ty = 0; Matrix td = new Matrix(); @@ -445,84 +424,48 @@ public class PDFStreamEngine textMatrix = td.multiply( textMatrix ); - Matrix glyphMatrixEndDisp = textMatrix.multiply( dispMatrix ); - - float sx = spacingText * horizontalScalingText; - float sy = 0; - - Matrix sd = new Matrix(); - sd.setValue( 2, 0, sx ); - sd.setValue( 2, 1, sy ); - - textMatrix = sd.multiply( textMatrix ); + // The text matrix gets updated after each glyph is placed. The updated + // version will have the X and Y coordinates for the next glyph. + Matrix textMatrixEnd = textStateParameters.copy().multiply(textMatrix.copy()).multiply(getGraphicsState().getCurrentTransformationMatrix()); // determine the width of this character // XXX: Note that if we handled vertical text, we should be using Y here - - float widthText = glyphMatrixEndDisp.getXPosition() - glyphMatrixStDisp.getXPosition(); - - while( characterBuffer.length() + ( c != null ? c.length() : 1 ) > individualWidthsBuffer.length ) - { - float[] tmp = new float[individualWidthsBuffer.length * 2]; - System.arraycopy( individualWidthsBuffer, 0, tmp, 0, individualWidthsBuffer.length ); - individualWidthsBuffer = tmp; - } + float widthText = textMatrixEnd.getXPosition() - textMatrixStart.getXPosition(); //there are several cases where one character code will //output multiple characters. For example "fi" or a //glyphname that has no mapping like "visiblespace" if( c != null ) { - Arrays.fill( - individualWidthsBuffer, - characterBuffer.length(), - characterBuffer.length() + c.length(), - widthText / c.length()); - - validCharCnt += c.length(); + validCharCnt++; } else { // PDFBOX-373: Replace a null entry with "?" 
so it is // not printed as "(null)" c = "?"; - - individualWidthsBuffer[characterBuffer.length()] = widthText; } - characterBuffer.append(c); + totalCharCnt++; - totalCharCnt += c.length(); - - if( spacingText == 0 && (i + codeLength) < (string.length - 1) ) - { - continue; - } - - textMatrixEndDisp = glyphMatrixEndDisp; - - float totalVerticalDisplacementDisp = maxVerticalDisplacementText * fontSizeText * yScaleDisp; - - float[] individualWidths = new float[characterBuffer.length()]; - System.arraycopy( individualWidthsBuffer, 0, individualWidths, 0, individualWidths.length ); + float totalVerticalDisplacementDisp = maxVerticalDisplacementText * fontSizeText; // process the decoded text processTextPosition( new TextPosition( - page, - textMatrixStDisp, - textMatrixEndDisp, + pageRotation, + pageWidth, + pageHeight, + textMatrixStart, + textMatrixEnd, totalVerticalDisplacementDisp, - individualWidths, + widthText, spaceWidthDisp, - characterBuffer.toString(), + c, font, fontSizeText, - (int)(fontSizeText * textMatrix.getXScale()), - wordSpacingDisp )); - - textMatrixStDisp = textMatrix.multiply( dispMatrix ); + (int)(fontSizeText * textMatrix.getXScale()) + )); - characterBuffer.setLength(0); } } @@ -534,7 +477,7 @@ public class PDFStreamEngine * * @throws IOException If there is an error processing the operation. */ - public void processOperator( String operation, List arguments ) throws IOException + public void processOperator( String operation, List<COSBase> arguments ) throws IOException { try { @@ -555,7 +498,7 @@ public class PDFStreamEngine * * @throws IOException If there is an error processing the operation. */ - protected void processOperator( PDFOperator operator, List arguments ) throws IOException + protected void processOperator( PDFOperator operator, List<COSBase> arguments ) throws IOException { try { @@ -584,51 +527,51 @@ public class PDFStreamEngine /** * @return Returns the colorSpaces. 
*/ - public Map getColorSpaces() + public Map<String,PDColorSpace> getColorSpaces() { - return ((StreamResources) streamResourcesStack.peek()).colorSpaces; + return streamResourcesStack.peek().colorSpaces; } /** * @return Returns the colorSpaces. */ - public Map getXObjects() + public Map<String,PDXObject> getXObjects() { - return ((StreamResources) streamResourcesStack.peek()).xobjects; + return streamResourcesStack.peek().xobjects; } /** * @param value The colorSpaces to set. */ - public void setColorSpaces(Map value) + public void setColorSpaces(Map<String,PDColorSpace> value) { - ((StreamResources) streamResourcesStack.peek()).colorSpaces = value; + streamResourcesStack.peek().colorSpaces = value; } /** * @return Returns the fonts. */ - public Map getFonts() + public Map<String,PDFont> getFonts() { - return ((StreamResources) streamResourcesStack.peek()).fonts; + return streamResourcesStack.peek().fonts; } /** * @param value The fonts to set. */ - public void setFonts(Map value) + public void setFonts(Map<String,PDFont> value) { - ((StreamResources) streamResourcesStack.peek()).fonts = value; + streamResourcesStack.peek().fonts = value; } /** * @return Returns the graphicsStack. */ - public Stack getGraphicsStack() + public Stack<PDGraphicsState> getGraphicsStack() { return graphicsStack; } /** * @param value The graphicsStack to set. */ - public void setGraphicsStack(Stack value) + public void setGraphicsStack(Stack<PDGraphicsState> value) { graphicsStack = value; } @@ -649,14 +592,14 @@ public class PDFStreamEngine /** * @return Returns the graphicsStates. */ - public Map getGraphicsStates() + public Map<String,PDExtendedGraphicsState> getGraphicsStates() { - return ((StreamResources) streamResourcesStack.peek()).graphicsStates; + return streamResourcesStack.peek().graphicsStates; } /** * @param value The graphicsStates to set. 
*/ - public void setGraphicsStates(Map value) + public void setGraphicsStates(Map<String,PDExtendedGraphicsState> value) { ((StreamResources) streamResourcesStack.peek()).graphicsStates = value; } @@ -693,7 +636,7 @@ public class PDFStreamEngine */ public PDResources getResources() { - return ((StreamResources) streamResourcesStack.peek()).resources; + return streamResourcesStack.peek().resources; } /** Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/TextPosition.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/TextPosition.java?rev=1003195&r1=1003194&r2=1003195&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/TextPosition.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/TextPosition.java Thu Sep 30 18:16:49 2010 @@ -47,6 +47,7 @@ public class TextPosition private PDFont font; private float fontSize; private int fontSizePt; + // TODO remove unused value private float wordSpacing; // word spacing value, in display units /** @@ -112,6 +113,60 @@ public class TextPosition } /** + * Constructor. + * + * @param pageRotation rotation of the page that the text is located in + * @param pageWidth width of the page that the text is located in + * @param pageHeight height of the page that the text is located in + * @param textPositionSt TextMatrix for start of text (in display units) + * @param textPositionEnd TextMatrix for end of text (in display units) + * @param maxFontH Maximum height of text (in display units) + * @param individualWidth The width of the given character/string. (in ? units) + * @param spaceWidth The width of the space character. (in display units) + * @param string The character to be displayed. + * @param currentFont The current font for this text position. + * @param fontSizeValue The new font size. + * @param fontSizeInPt The font size in pt units. 
+ */ + public TextPosition( + int pageRotation, + float pageWidth, + float pageHeight, + Matrix textPositionSt, + Matrix textPositionEnd, + float maxFontH, + float individualWidth, + float spaceWidth, + String string, + PDFont currentFont, + float fontSizeValue, + int fontSizeInPt + ) + { + this.textPos = textPositionSt; + + this.endX = textPositionEnd.getXPosition(); + this.endY = textPositionEnd.getYPosition(); + + this.rot = pageRotation; + // make sure it is 0 to 270 and no negative numbers + if(this.rot < 0) + { + rot += 360; + } + + this.maxTextHeight = maxFontH; + this.pageHeight = pageHeight; + this.pageWidth = pageWidth; + + this.widths = new float[]{individualWidth}; + this.widthOfSpace = spaceWidth; + this.str = string; + this.font = currentFont; + this.fontSize = fontSizeValue; + this.fontSizePt = fontSizeInPt; + } + /** * Return the string of characters stored in this object. * * @return The string on the screen. @@ -393,6 +448,7 @@ public class TextPosition * * @return The current word spacing. 
*/ + @Deprecated public float getWordSpacing() { return wordSpacing; Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/GSave.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/GSave.java?rev=1003195&r1=1003194&r2=1003195&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/GSave.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/GSave.java Thu Sep 30 18:16:49 2010 @@ -19,6 +19,7 @@ package org.apache.pdfbox.util.operator; import java.util.List; import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.pdmodel.graphics.PDGraphicsState; import org.apache.pdfbox.util.PDFOperator; /** @@ -36,7 +37,7 @@ public class GSave extends OperatorProce */ public void process(PDFOperator operator, List<COSBase> arguments) { - context.getGraphicsStack().push( context.getGraphicsState().clone() ); + context.getGraphicsStack().push( (PDGraphicsState)context.getGraphicsState().clone() ); } } Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java?rev=1003195&r1=1003194&r2=1003195&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/operator/ShowTextGlyph.java Thu Sep 30 18:16:49 2010 @@ -42,19 +42,20 @@ public class ShowTextGlyph extends Opera public void process(PDFOperator operator, List<COSBase> arguments) throws IOException { COSArray array = (COSArray)arguments.get( 0 ); - float adjustment=0; - for( int i=0; i<array.size(); i++ ) + int arraySize = array.size(); + float fontsize = 
context.getGraphicsState().getTextState().getFontSize(); + float horizontalScaling = context.getGraphicsState().getTextState().getHorizontalScalingPercent()/100; + for( int i=0; i<arraySize; i++ ) { COSBase next = array.get( i ); if( next instanceof COSNumber ) { - adjustment = ((COSNumber)next).floatValue(); - + float adjustment = ((COSNumber)next).floatValue(); Matrix adjMatrix = new Matrix(); - adjustment=(-adjustment/1000)*context.getGraphicsState().getTextState().getFontSize() * - (context.getGraphicsState().getTextState().getHorizontalScalingPercent()/100); + adjustment=-(adjustment/1000)*horizontalScaling*fontsize; + // TODO vertical writing mode adjMatrix.setValue( 2, 0, adjustment ); - context.setTextMatrix( adjMatrix.multiply( context.getTextMatrix() ) ); + context.setTextMatrix( adjMatrix.multiply(context.getTextMatrix()) ); } else if( next instanceof COSString ) { Modified: pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf-sorted.txt URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf-sorted.txt?rev=1003195&r1=1003194&r2=1003195&view=diff ============================================================================== Binary files - no diff available. Modified: pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf.txt URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/resources/input/cweb.pdf.txt?rev=1003195&r1=1003194&r2=1003195&view=diff ============================================================================== Binary files - no diff available.