[ 
https://issues.apache.org/jira/browse/PDFBOX-1507?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13575655#comment-13575655
 ] 

Tanmay Mandal commented on PDFBOX-1507:
---------------------------------------

Hello Andreas,
It it's happening for all PDF no matter what kind of PDF it is,
I think i should tell you some thing , i am coding on .NET so i used 
ikvm-7.2.4630.5 conversion  to get the jar file to exe and using it in my 
console application, i can attached project but did not find any option to 
browse and upload, here is my console app code
---------------------------------------------------------------------------------------------------------------------------------------------------------------------

using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;

using org.apache.pdfbox.pdmodel;
using org.apache.pdfbox.exceptions;
using org.apache.pdfbox.util;

using java.lang;

namespace org.apache.pdfbox.examples.util
{

    using InvalidPasswordException = 
org.apache.pdfbox.exceptions.InvalidPasswordException;


    using PDDocument = org.apache.pdfbox.pdmodel.PDDocument;
    using PDPage = org.apache.pdfbox.pdmodel.PDPage;
    using PDStream = org.apache.pdfbox.pdmodel.common.PDStream;
    using PDFTextStripper = org.apache.pdfbox.util.PDFTextStripper;
    using TextPosition = org.apache.pdfbox.util.TextPosition;
    using System.IO;



    /// <summary>
    /// This is an example on how to get some x/y coordinates of text.
    /// 
    /// Usage: java org.apache.pdfbox.examples.util.PrintWordLocations 
&lt;input-pdf&gt;
    /// 
    /// @author <a href="mailto:[email protected]";>Ben Litchfield</a>
    /// @version $Revision: 1.7 $
    /// </summary>
    //public class PrintWordLocations : org.apache.pdfbox.util.PDFTextStripper
        public class PrintWordLocations : org.apache.pdfbox.util.PDFTextStripper
    {
        public class WordBox
        {
            private readonly PrintWordLocations outerInstance;

            public float _xmin;
            public float _ymin;
            public float _fontsize;
            public float _xscale;
            public float _yscale;
            public float _height;
            public float _width;

            public WordBox(PrintWordLocations outerInstance, TextPosition text)
            {
                this.outerInstance = outerInstance;
                _xmin = text.getXDirAdj();
                _ymin = text.getYDirAdj();
                _fontsize = text.getFontSize();
                _xscale = text.getXScale();
                _yscale = text.getYScale();
                _height = text.getHeightDir();
                _width = text.getWidthDirAdj();
            }

            public virtual bool rejects(TextPosition text)
            {
                return (text.getXDirAdj() < _xmin) || (text.getYDirAdj() + 
text.getWidthOfSpace() < _ymin);
            }

            public virtual bool accepts(TextPosition text)
            {
                return !rejects(text);
            }

            public virtual void extendBy(TextPosition text)
            {
                float current_xmin = _xmin;
                float current_xmax = _xmin + _width;
                float current_ymin = _ymin;
                float current_ymax = _ymin + _height;

                float text_xmin = text.getXDirAdj();
                float text_xmax = text_xmin + text.getWidthDirAdj();
                float text_ymin = text.getYDirAdj();
                float text_ymax = text_ymin + text.getHeightDir();

                float new_xmin = java.lang.Math.min(current_xmin, text_xmin);
                float new_xmax = java.lang.Math.max(current_xmax, text_xmax);
                float new_ymin = java.lang.Math.min(current_ymin, text_ymin);
                float new_ymax = java.lang.Math.max(current_ymax, text_ymax);

                _xmin = new_xmin;
                _width = new_xmax - new_xmin;
                _ymin = new_ymin;
                _height = new_ymax - new_ymin;
            }
        }

        protected internal java.lang.StringBuilder word = new 
java.lang.StringBuilder("");
        protected internal char? last_character = new char?('\0');
        protected internal LinkedList<WordBox> box_list = new 
LinkedList<WordBox>();

        /// <summary>
        /// Default constructor.
        /// </summary>
        /// <exception cref="IOException"> If there is an error loading text 
stripper properties. </exception>
        //JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not 
available in .NET:
        //ORIGINAL LINE: public PrintWordLocations() throws java.io.IOException
        public PrintWordLocations()
        {
            try
            {
                base.setSortByPosition(true);
            }
            catch (System.Exception ex)
            {
                Console.Error.WriteLine(ex.ToString());

            }
            //base.SortByPosition = true;

        }

        //JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not 
available in .NET:
        //ORIGINAL LINE: public void processDocuments(String[] args) throws 
Exception
        public virtual void processDocuments(string[] args)
        {
            if (args.Length != 0)
            {
                usage();
            }
            else
            {
                PDDocument document = null;
                string InputFilePath = @"D:\Projects\PDF2Alto\Sample 
Project\Pdf2Text\ConsoleApplication1\temp\fmp000000426_0541.pdf";
                string OutputFilePath = @"D:\Projects\PDF2Alto\Sample 
Project\Pdf2Text\ConsoleApplication1\temp\test.txt";
                TextWriter tsw = new StreamWriter(OutputFilePath,true);

                try
                {

                    //document = PDDocument.load(args[0]);
                    document = PDDocument.load(InputFilePath);
                                        

                    if (document.isEncrypted())
                    {
                        try
                        {
                            document.decrypt("");
                        }
                        catch (InvalidPasswordException e)
                        {
                            Console.Error.WriteLine("Error: Document is 
encrypted with a password.");
                            Environment.Exit(1);
                        }
                    }
                    PrintWordLocations printer = new PrintWordLocations();
                    //PrintTextLocations printer = new PrintTextLocations();
                    //IList allPages = document.getDocumentCatalog.AllPages;

                    IList allPages = 
document.getDocumentCatalog().getAllPages().toArray();

                    //Console.WriteLine("<?xml version=\"1.0\" 
encoding=\"UTF-8\"?><alto 
xmlns=\"http://www.loc.gov/standards/alto/alto-v2.0.xsd\";><Description><MeasurementUnit>inch1200</MeasurementUnit></Description><Layout>");
                    tsw.WriteLine("<?xml version=\"1.0\" 
encoding=\"UTF-8\"?><alto 
xmlns=\"http://www.loc.gov/standards/alto/alto-v2.0.xsd\";><Description><MeasurementUnit>inch1200</MeasurementUnit></Description><Layout>");
                    

                    for (int i = 0; i < allPages.Count; i++)
                    {
                        PDPage page = (PDPage)allPages[i];
                        
                        //Console.WriteLine("<Page>");
                        //Console.WriteLine("<PrintSpace>");
                        //Console.WriteLine("<TextBlock>");
                        //Console.WriteLine("<TextLine>");

                        tsw.WriteLine("<Page>");
                        tsw.WriteLine("<PrintSpace>");
                        tsw.WriteLine("<TextBlock>");
                        tsw.WriteLine("<TextLine>");

                        PDFStreamEngine engine1 = new PDFStreamEngine();

                        PDStream contents = page.getContents();
                        if (contents != null)
                        {
                            //printer.processStream(page, page.findResources(), 
page.Contents.Stream);
                           // engine1.processStream(page, page.findResources(), 
page.getContents().getStream());
                            printer.processStream(page, page.findResources(), 
page.getContents().getStream());
                           // printer.processTextPosition(page.gettext);

                            
                        }
                        endOfPage(tsw);

                        //Console.WriteLine("</TextLine>");
                        //Console.WriteLine("</TextBlock>");
                        //Console.WriteLine("</PrintSpace>");
                        //Console.WriteLine("</Page>");

                        tsw.WriteLine("</TextLine>");
                        tsw.WriteLine("</TextBlock>");
                        tsw.WriteLine("</PrintSpace>");
                        tsw.WriteLine("</Page>");

                    }
                    tsw.WriteLine("</Layout></alto>");
                }
                finally
                {
                    if (document != null)
                    {
                        document.close();
                    }
                }
            }
        }

        /// <summary>
        /// This will print the documents data.
        /// </summary>
        /// <param name="args"> The command line arguments.
        /// </param>
        /// <exception cref="Exception"> If there is an error parsing the 
document. </exception>
        //JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not 
available in .NET:
        //ORIGINAL LINE: public static void main(String[] args) throws Exception
        [STAThread]
        public static void Main(string[] args)
        {
            PrintWordLocations handler = new PrintWordLocations();
            handler.processDocuments(args);
        }

        /// <summary>
        /// A method provided as an event interface to allow a subclass to 
perform
        /// some specific functionality when text needs to be processed.
        /// </summary>
        /// <param name="text"> The text to be processed </param>
        protected internal virtual void processTextPosition(TextPosition text)
        {
            char current_character = text.getCharacter().ToLower()[0];

            if (endsWord(current_character))
            {
                emitWordBoxes();
            }
            else
            {
                if (box_list.Count == 0)
                {
                    if (isAlnumOrApostrophe(current_character))
                    {
                        word = word.append(current_character);
                    }

                    box_list.AddFirst(new WordBox(this, text));
                }
                else if (box_list.Last.Value.accepts(text))
                {
                    if (isAlnumOrApostrophe(current_character))
                    {
                        word = word.append(current_character);
                    }

                    box_list.Last.Value.extendBy(text);
                }
                else
                {
                    if (!isHyphen(last_character))
                    {
                        emitWordBoxes();
                    }

                    if (isAlnumOrApostrophe(current_character))
                    {
                        word = word.append(current_character);
                    }

                    box_list.AddFirst(new WordBox(this, text));
                }
            }

            last_character = current_character;
        }

        protected internal virtual void endOfPage(TextWriter tsw)
        {
            //if (box_list.Count > 0)
            //{
            //    emitWordBoxes(tsw);
            //}

            emitWordBoxes(tsw);
        }


        protected internal virtual void emitWordBoxes()
        {
            float pointsToInch1200 = (float)16.6666;
            float mysteryHeightScale = (float)1.5;
            float height;
            float width;
            float hpos;
            float vpos;

            if (word.ToString().Trim().Length > 0)
            {
                foreach (WordBox wordbox in box_list)
                {
                    width = wordbox._width * pointsToInch1200;
                    height = wordbox._height * pointsToInch1200 * 
mysteryHeightScale;
                    hpos = wordbox._xmin * pointsToInch1200;
                    vpos = wordbox._ymin * pointsToInch1200 - height;

                    Console.WriteLine("<String HEIGHT=\"" + height + "\" 
WIDTH=\"" + width + "\" HPOS=\"" + hpos + "\" VPOS=\"" + vpos + "\" CONTENT=\"" 
+ word.ToString().Trim() + "\"/>");
                    //tsw.Write("<String HEIGHT=\"" + height + "\" WIDTH=\"" + 
width + "\" HPOS=\"" + hpos + "\" VPOS=\"" + vpos + "\" CONTENT=\"" + 
word.ToString().Trim() + "\"/>");
                }
            }

            //word = new StringBuilder("");
            word = new java.lang.StringBuilder("");

            last_character = new char?('\0');
            box_list.Clear();
        }

        protected internal virtual void emitWordBoxes(TextWriter tsw)
        {
            float pointsToInch1200 = (float)16.6666;
            float mysteryHeightScale = (float)1.5;
            float height;
            float width;
            float hpos;
            float vpos;

            if (word.ToString().Trim().Length > 0)
            {
                foreach (WordBox wordbox in box_list)
                {
                    width = wordbox._width * pointsToInch1200;
                    height = wordbox._height * pointsToInch1200 * 
mysteryHeightScale;
                    hpos = wordbox._xmin * pointsToInch1200;
                    vpos = wordbox._ymin * pointsToInch1200 - height;

                    //Console.WriteLine("<String HEIGHT=\"" + height + "\" 
WIDTH=\"" + width + "\" HPOS=\"" + hpos + "\" VPOS=\"" + vpos + "\" CONTENT=\"" 
+ word.ToString().Trim() + "\"/>");
                    tsw.Write("<String HEIGHT=\"" + height + "\" WIDTH=\"" + 
width + "\" HPOS=\"" + hpos + "\" VPOS=\"" + vpos + "\" CONTENT=\"" + 
word.ToString().Trim() + "\"/>");
                }
            }

            //word = new StringBuilder("");
            word = new java.lang.StringBuilder("");

            last_character = new char?('\0');
            box_list.Clear();
        }

        protected internal virtual bool endsWord(char ch)
        {
            return !(isAlnumOrApostrophe(ch) || isHyphen(ch));
        }

        protected internal virtual bool isAlnumOrApostrophe(char ch)
        {
            return char.IsLetterOrDigit(ch) || (ch == '\'');
        }

        protected internal virtual bool isHyphen(char? ch)
        {
            return ch == '-';
        }

        /// <summary>
        /// This will print the usage for this document.
        /// </summary>
        private static void usage()
        {
            Console.Error.WriteLine("Usage: java 
org.apache.pdfbox.examples.pdmodel.PrintWordLocations <input-pdf>");
        }
    }

}

------------------------------------------------------------------------------------------------------------------------------------------------------
problem is it is not calling "processTextPosition" and C# said it's hiding 
inherited member , that's why i tried with "new" also , but as i found it is 
not calling this function/delegate [C#] at all and before that getting issue of 
errors .


I have created a pdf with only "This is a Test" from MS-Word and getting same 
issue same kind of error , if pdf has lot's of text it's get lot's error , 
that's indicate it's getting null reference exception on word/char, might be 
one when it is fetching word or char , it is getting null, in other word it can 
not ready though recognize [as error are coming], i found due to font some one 
get this kind of issue , not sue what is my case.

thanks and regards
Tanmay Mandal
                
> Getting Issue at text reading 
> ------------------------------
>
>                 Key: PDFBOX-1507
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-1507
>             Project: PDFBox
>          Issue Type: Bug
>          Components: Parsing
>    Affects Versions: 1.7.1
>         Environment: windows, runing pdfbox in .Net using ikvm-7.2.4630.5 
> conversion , we are actually converting pdf into ALTO file
>            Reporter: Tanmay Mandal
>   Original Estimate: 1h
>  Remaining Estimate: 1h
>
> <?xml version="1.0" encoding="UTF-8"?><alto 
> xmlns="http://www.loc.gov/standards/
> alto/alto-v2.0.xsd"><Description><MeasurementUnit>inch1200</MeasurementUnit></De
> scription><Layout>
> <Page>
> <PrintSpace>
> <TextBlock>
> <TextLine>
> Feb 04, 2013 8:40:03 PM org.apache.pdfbox.util.PDFStreamEngine processOperator
> WARNING: java.lang.NullPointerException
> java.lang.NullPointerException
>         at 
> org.apache.pdfbox.util.PDFTextStripper.processTextPosition(PDFTextStr
> ipper.java:954)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processEncodedText(PDFStreamEn
> gine.java:498)
>         at 
> org.apache.pdfbox.util.operator.ShowTextGlyph.process(ShowTextGlyph.j
> ava:62)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngin
> e.java:556)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngi
> ne.java:271)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngi
> ne.java:237)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.
> java:218)
>         at 
> cli.org.apache.pdfbox.examples.util.PrintWordLocations.processDocumen
> ts(PrintWordLocation.cs:185)
>         at 
> cli.org.apache.pdfbox.examples.util.PrintWordLocations.Main(PrintWord
> Location.cs:228)
>         at cli.System.AppDomain._nExecuteAssembly(Unknown Source)
>         at cli.System.AppDomain.ExecuteAssembly(Unknown Source)
>         at 
> cli.Microsoft.VisualStudio.HostingProcess.HostProc.RunUsersAssembly(U
> nknown Source)
> Feb 04, 2013 8:40:03 PM org.apache.pdfbox.util.PDFStreamEngine processOperator
> WARNING: java.lang.NullPointerException
> java.lang.NullPointerException
>         at 
> org.apache.pdfbox.util.PDFTextStripper.processTextPosition(PDFTextStr
> ipper.java:954)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processEncodedText(PDFStreamEn
> gine.java:498)
>         at 
> org.apache.pdfbox.util.operator.ShowTextGlyph.process(ShowTextGlyph.j
> ava:62)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngin
> e.java:556)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngi
> ne.java:271)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngi
> ne.java:237)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.
> java:218)
>         at 
> cli.org.apache.pdfbox.examples.util.PrintWordLocations.processDocumen
> ts(PrintWordLocation.cs:185)
>         at 
> cli.org.apache.pdfbox.examples.util.PrintWordLocations.Main(PrintWord
> Location.cs:228)
>         at cli.System.AppDomain._nExecuteAssembly(Unknown Source)
>         at cli.System.AppDomain.ExecuteAssembly(Unknown Source)
>         at 
> cli.Microsoft.VisualStudio.HostingProcess.HostProc.RunUsersAssembly(U
> nknown Source)
> Feb 04, 2013 8:40:03 PM org.apache.pdfbox.util.PDFStreamEngine processOperator
> WARNING: java.lang.NullPointerException
> java.lang.NullPointerException
>         at 
> org.apache.pdfbox.util.PDFTextStripper.processTextPosition(PDFTextStr
> ipper.java:954)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processEncodedText(PDFStreamEn
> gine.java:498)
>         at 
> org.apache.pdfbox.util.operator.ShowTextGlyph.process(ShowTextGlyph.j
> ava:62)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngin
> e.java:556)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngi
> ne.java:271)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngi
> ne.java:237)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.
> java:218)
>         at 
> cli.org.apache.pdfbox.examples.util.PrintWordLocations.processDocumen
> ts(PrintWordLocation.cs:185)
>         at 
> cli.org.apache.pdfbox.examples.util.PrintWordLocations.Main(PrintWord
> Location.cs:228)
>         at cli.System.AppDomain._nExecuteAssembly(Unknown Source)
>         at cli.System.AppDomain.ExecuteAssembly(Unknown Source)
>         at 
> cli.Microsoft.VisualStudio.HostingProcess.HostProc.RunUsersAssembly(U
> nknown Source)
> Feb 04, 2013 8:40:03 PM org.apache.pdfbox.util.PDFStreamEngine processOperator
> WARNING: java.lang.NullPointerException
> java.lang.NullPointerException
>         at 
> org.apache.pdfbox.util.PDFTextStripper.processTextPosition(PDFTextStr
> ipper.java:954)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processEncodedText(PDFStreamEn
> gine.java:498)
>         at 
> org.apache.pdfbox.util.operator.ShowTextGlyph.process(ShowTextGlyph.j
> ava:62)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngin
> e.java:556)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngi
> ne.java:271)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngi
> ne.java:237)
>         at 
> org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.
> java:218)
>         at 
> cli.org.apache.pdfbox.examples.util.PrintWordLocations.processDocumen
> ts(PrintWordLocation.cs:185)
>         at 
> cli.org.apache.pdfbox.examples.util.PrintWordLocations.Main(PrintWord
> Location.cs:228)
>         at cli.System.AppDomain._nExecuteAssembly(Unknown Source)
>         at cli.System.AppDomain.ExecuteAssembly(Unknown Source)
>         at 
> cli.Microsoft.VisualStudio.HostingProcess.HostProc.RunUsersAssembly(U
> nknown Source)
> </TextLine>
> </TextBlock>
> </PrintSpace>
> </Page>
> We have converted Java code in C# from https://github.com/cokernel/pdf2alto

--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators
For more information on JIRA, see: http://www.atlassian.com/software/jira

Reply via email to