Author: tilman
Date: Sat Apr 5 09:55:07 2025
New Revision: 1924801
URL: http://svn.apache.org/viewvc?rev=1924801&view=rev
Log:
PDFBOX-5982: support DP and MP operators
Added:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/markedcontent/MarkedContentPoint.java
(with props)
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/markedcontent/MarkedContentPointWithProperties.java
(with props)
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDFStreamEngine.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDAbstractContentStream.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDFStreamEngine.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDFStreamEngine.java?rev=1924801&r1=1924800&r2=1924801&view=diff
==============================================================================
---
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDFStreamEngine.java
(original)
+++
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/PDFStreamEngine.java
Sat Apr 5 09:55:07 2025
@@ -1182,4 +1182,15 @@ public abstract class PDFStreamEngine
{
return shouldProcessColorOperators;
}
+
+ /**
+ * Handles MP and DP operators; this default implementation does nothing.
+ *
+ * @param tag indicates the role or significance of the marked-content point
+ * @param properties optional properties, {@code null} when the operator processor supplies none
+ */
+ public void markedContentPoint(COSName tag, COSDictionary properties)
+ {
+ // intentionally empty: overridden in subclasses
+ }
}
Added:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/markedcontent/MarkedContentPoint.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/markedcontent/MarkedContentPoint.java?rev=1924801&view=auto
==============================================================================
---
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/markedcontent/MarkedContentPoint.java
(added)
+++
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/markedcontent/MarkedContentPoint.java
Sat Apr 5 09:55:07 2025
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.contentstream.operator.markedcontent;
+
+import java.io.IOException;
+import java.util.List;
+import org.apache.pdfbox.contentstream.PDFStreamEngine;
+import org.apache.pdfbox.contentstream.operator.MissingOperandException;
+import org.apache.pdfbox.contentstream.operator.Operator;
+import org.apache.pdfbox.contentstream.operator.OperatorName;
+import org.apache.pdfbox.contentstream.operator.OperatorProcessor;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSName;
+
+/**
+ * Operator processor for {@link OperatorName#MARKED_CONTENT_POINT}; hands the tag operand to {@link PDFStreamEngine#markedContentPoint} with {@code null} properties.
+ * @author Tilman Hausherr
+ */
+public class MarkedContentPoint extends OperatorProcessor
+{
+ public MarkedContentPoint(PDFStreamEngine context)
+ {
+ super(context);
+ }
+
+ @Override
+ public void process(Operator operator, List<COSBase> operands) throws
IOException
+ {
+ if (operands.isEmpty())
+ {
+ throw new MissingOperandException(operator, operands);
+ }
+ COSBase base0 = operands.get(0);
+ if (!(base0 instanceof COSName))
+ {
+ return; // first operand is not a name: the operator is silently ignored
+ }
+ getContext().markedContentPoint((COSName) base0, null); // no properties are passed for this operator
+ }
+
+ @Override
+ public String getName()
+ {
+ return OperatorName.MARKED_CONTENT_POINT;
+ }
+
+}
Propchange:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/markedcontent/MarkedContentPoint.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/markedcontent/MarkedContentPointWithProperties.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/markedcontent/MarkedContentPointWithProperties.java?rev=1924801&view=auto
==============================================================================
---
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/markedcontent/MarkedContentPointWithProperties.java
(added)
+++
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/markedcontent/MarkedContentPointWithProperties.java
Sat Apr 5 09:55:07 2025
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.contentstream.operator.markedcontent;
+
+import java.io.IOException;
+import java.util.List;
+import org.apache.pdfbox.contentstream.PDFStreamEngine;
+import org.apache.pdfbox.contentstream.operator.MissingOperandException;
+import org.apache.pdfbox.contentstream.operator.Operator;
+import org.apache.pdfbox.contentstream.operator.OperatorName;
+import org.apache.pdfbox.contentstream.operator.OperatorProcessor;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSName;
+import
org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDPropertyList;
+
+/**
+ * Operator processor for {@link OperatorName#MARKED_CONTENT_POINT_WITH_PROPS}; resolves the properties operand to a dictionary and hands it, with the tag, to {@link PDFStreamEngine#markedContentPoint}.
+ * @author Tilman Hausherr
+ */
+public class MarkedContentPointWithProperties extends OperatorProcessor
+{
+ public MarkedContentPointWithProperties(PDFStreamEngine context)
+ {
+ super(context);
+ }
+
+ @Override
+ public void process(Operator operator, List<COSBase> operands) throws
IOException
+ {
+ if (operands.size() < 2)
+ {
+ throw new MissingOperandException(operator, operands);
+ }
+ if (!(operands.get(0) instanceof COSName))
+ {
+ return; // first operand is not a name: the operator is silently ignored
+ }
+ PDFStreamEngine context = getContext();
+ COSName tag = (COSName) operands.get(0);
+ COSBase op1 = operands.get(1);
+ COSDictionary propDict = null;
+ if (op1 instanceof COSName)
+ {
+ PDPropertyList prop =
context.getResources().getProperties((COSName) op1);
+ if (prop != null)
+ {
+ propDict = prop.getCOSObject();
+ }
+ }
+ else if (op1 instanceof COSDictionary)
+ {
+ propDict = (COSDictionary) op1;
+ }
+ if (propDict == null)
+ {
+ // second operand is neither a name nor a dictionary, or the named property list was not found in the resources
+ return;
+ }
+ context.markedContentPoint(tag, propDict);
+ }
+
+ @Override
+ public String getName()
+ {
+ return OperatorName.MARKED_CONTENT_POINT_WITH_PROPS;
+ }
+
+}
Propchange:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/contentstream/operator/markedcontent/MarkedContentPointWithProperties.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDAbstractContentStream.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDAbstractContentStream.java?rev=1924801&r1=1924800&r2=1924801&view=diff
==============================================================================
---
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDAbstractContentStream.java
(original)
+++
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/PDAbstractContentStream.java
Sat Apr 5 09:55:07 2025
@@ -1373,6 +1373,32 @@ abstract class PDAbstractContentStream i
}
/**
+ * Set a marked content point: writes the tag followed by the {@link OperatorName#MARKED_CONTENT_POINT} operator.
+ *
+ * @param tag the tag to be added to the content stream
+ * @throws IOException If the content stream could not be written
+ */
+ public void setMarkedContentPoint(COSName tag) throws IOException
+ {
+ writeOperand(tag);
+ writeOperator(OperatorName.MARKED_CONTENT_POINT);
+ }
+
+ /**
+ * Set a marked content point with a reference to an entry in the page
resources' Properties dictionary.
+ *
+ * @param tag the tag to be added to the content stream
+ * @param propertyList property list to be added to the resources; the value returned by {@code resources.add} is written as the second operand
+ * @throws IOException If the content stream could not be written
+ */
+ public void setMarkedContentPointWithProperties(COSName tag,
PDPropertyList propertyList) throws IOException
+ {
+ writeOperand(tag);
+ writeOperand(resources.add(propertyList));
+ writeOperator(OperatorName.MARKED_CONTENT_POINT_WITH_PROPS);
+ }
+
+ /**
* Set an extended graphics state.
*
* @param state The extended graphics state to be added to the content
stream
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java?rev=1924801&r1=1924800&r2=1924801&view=diff
==============================================================================
---
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java
(original)
+++
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java
Sat Apr 5 09:55:07 2025
@@ -31,6 +31,8 @@ import org.apache.pdfbox.contentstream.o
import
org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequenceWithProperties;
import org.apache.pdfbox.contentstream.operator.markedcontent.DrawObject;
import
org.apache.pdfbox.contentstream.operator.markedcontent.EndMarkedContentSequence;
+import
org.apache.pdfbox.contentstream.operator.markedcontent.MarkedContentPoint;
+import
org.apache.pdfbox.contentstream.operator.markedcontent.MarkedContentPointWithProperties;
/**
* This is an stream engine to extract the marked content of a pdf.
@@ -45,7 +47,7 @@ public class PDFMarkedContentExtractor e
private final Map<String, List<TextPosition>> characterListMapping = new
HashMap<>();
/**
- * Instantiate a new PDFTextStripper object.
+ * Instantiate a new PDFMarkedContentExtractor object.
*/
public PDFMarkedContentExtractor()
{
@@ -63,8 +65,8 @@ public class PDFMarkedContentExtractor e
addOperator(new BeginMarkedContentSequence(this));
addOperator(new EndMarkedContentSequence(this));
addOperator(new DrawObject(this));
- // todo: DP - Marked Content Point
- // todo: MP - Marked Content Point with Properties
+ addOperator(new MarkedContentPoint(this));
+ addOperator(new MarkedContentPointWithProperties(this));
}
/**
@@ -129,6 +131,13 @@ public class PDFMarkedContentExtractor e
}
}
+ @Override
+ public void markedContentPoint(COSName tag, COSDictionary properties)
+ {
+ // Nothing happens here yet. If you know anything useful that should happen, please tell us.
+ super.markedContentPoint(tag, properties);
+ }
+
public void xobject(PDXObject xobject)
{
if (!this.currentMarkedContents.isEmpty())