Hello everyone, lately I have been building an open source project with
some collaborators, but one of them cannot contribute any more. He is a
solution architect, so he is very skilled (much more than me!). I am now
analysing his code to finish the job, but I don't understand this use of
the lambda-style arrow (->): it's like he is declaring the returned type
in the function signature (as you would do in Java). I have never seen
something like this in Python.

Can someone please explain this usage to me (the part my question is
about is highlighted in yellow):

    @classmethod
    def extract_document_data(cls, file_path : str) -> DocumentData:
        """
        Module entry point: extracts the data of the document stored at
        the given path.
        The MIME type detected for the file decides which extraction
        strategy is used.

        @type file_path: str
        @param file_path: The path of the document to be parsed.
        @rtype: DocumentData
        @returns: An object containing the data of the parsed document.
        """

        # Detecting the MIME type from the file itself.
        detected_mime = magic.Magic(mime=True).from_file(file_path)
        # Looking up the strategy registered for the matching document type.
        document_type = DocumentType.get_instance(detected_mime)
        return cls.strategies[document_type].extract_document_data(file_path)


For completeness, this is the whole script:

from enum import Enum
import json
import magic

import docx
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer, LTTextContainer
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser


class DocumentType(Enum):
    """
    Defines the handled document types.
    Each value is associated to a MIME type, which is also exposed through
    the mime_type attribute of the member.
    """

    PDF = 'application/pdf'
    # Adjacent string literals are concatenated into a single str value;
    # the parentheses only allow the long MIME type to span two lines.
    DOCX = ('application/vnd.openxmlformats-officedocument'
            '.wordprocessingml.document')

    def __init__(self, mime_type):
        # Enum passes the member value to __init__, so mime_type is
        # the MIME string assigned to the member.
        self.mime_type = mime_type

    @classmethod
    def get_instance(cls, mime_type : str):
        """
        Returns the document type associated to the input MIME type.

        @type mime_type: str
        @param mime_type: The MIME type to look for.
        @returns: The enum member whose MIME type equals the input one.
        @raise MimeNotValidError: If no member has the input MIME type.
        """

        # An Enum class is directly iterable over its members.
        for member in cls:
            if member.mime_type == mime_type:
                return member
        raise MimeNotValidError(mime_type)


class MimeNotValidError(Exception):
    """
    Error signalling that a MIME type which is not valid has been processed.
    It adds nothing to the behaviour inherited from Exception.
    """


class DocumentData:
    """
    Wrapper for the extracted document data (TOC and contents).
    """

    def __init__(self, toc : list = None, pages : list = None,
                 document_text : str = None):
        """
        @type toc: list
        @param toc: The table of contents. Defaults to a new empty list.
        @type pages: list
        @param pages: The text of each page. Defaults to a new empty list.
        @type document_text: str
        @param document_text: The whole text of the document. When omitted,
        it is built by joining the pages with a space, after replacing the
        new line characters of each page with spaces.
        """

        # None is used as default instead of a list literal: a mutable
        # default would be shared by every instance created without
        # the argument.
        self.toc = [] if toc is None else toc
        self.pages = [] if pages is None else pages
        if document_text is not None:
            self.document_text = document_text
        else:
            self.document_text = ' '.join(
                page.replace('\n', ' ') for page in self.pages)

    def toc_as_json(self) -> str:
        """
        @rtype: str
        @returns: The TOC serialized as a JSON string.
        """

        return json.dumps(self.toc)


class ExtractionStrategy:
    """
    Common parent of the extraction strategies.
    """

    @staticmethod
    def extract_document_data(file_path : str) -> DocumentData:
        """
        Placeholder implementation: it does nothing and returns None.

        @type file_path: str
        @param file_path: The path of the document to be parsed (unused here).
        """

        return None


class DOCXExtractionStrategy(ExtractionStrategy):
    """
    Extraction strategy for DOCX documents.
    Only the document text is extracted: the returned DocumentData is built
    from the text alone, without passing a TOC or pages.
    """

    @staticmethod
    def extract_document_data(file_path : str) -> DocumentData:
        """
        @type file_path: str
        @param file_path: The path of the DOCX document to be parsed.
        @rtype: DocumentData
        @returns: An object whose document text is the space-joined text
        of all the <w:t> elements of the document body.
        """

        # Only the <w:t> elements of the DOCX XML are selected,
        # as they're the only ones to contain some text.
        text_nodes = docx.Document(file_path)._body._body.xpath('.//w:t')
        joined_text = ' '.join(node.text for node in text_nodes)
        return DocumentData(document_text = joined_text)


class PDFExtractionStrategy(ExtractionStrategy):
    """
    It implements the TOC and contents extraction from a PDF document.
    """

    @staticmethod
    def parse_toc(doc : PDFDocument) -> list:
        """
        Builds the TOC tree from the outlines of the input document.

        @type doc: PDFDocument
        @param doc: The PDF document from which to read the outlines.
        @rtype: list
        @returns: The TOC tree, as built by build_toc_tree. It is empty
        when the document has no outlines.
        """

        raw_toc = []
        try:
            # Only the level and the title of each outline are needed.
            for (level, title, _dest, _action, _se) in doc.get_outlines():
                raw_toc.append((level, title))
        except PDFNoOutlines:
            # A document without outlines just has an empty TOC.
            pass
        return PDFExtractionStrategy.build_toc_tree(raw_toc)

    @staticmethod
    def build_toc_tree(items : list) -> list:
        """
        Builds the TOC tree from a list of TOC items.
        The input list is not modified.

        @type items: list
        @param items: The TOC items.
        Each item must have the following format:
        (<item depth>, <item description>).
        E.g: [(1, 'Contents'), (2, 'Chapter 1'), (2, 'Chapter 2')]
        @rtype: list
        @returns: The TOC tree. The tree hasn't a root element, therefore it
        actually is a list. An item without children is inserted as string
        (just the item description); an item with children is inserted as
        dict, where the key is the item description and the value is
        the list containing the children TOC items.
        E.g: [{'Contents': ['Chapter 1', 'Chapter 2']}]
        """

        toc = []
        if not items:
            return toc

        # Using an explicit stack containing the lists corresponding to
        # the currently open levels of the TOC, to simulate the recursive
        # building of the TOC tree in a more efficient way.
        # Each entry is (<depth of the parent item>, <children list>);
        # the first entry is the top level, which has no parent item.
        open_levels = [(None, toc)]

        for index, (depth, desc) in enumerate(items):
            # Closing all the levels whose parent item isn't an ancestor
            # of the current item. The top level is never removed, so an
            # item shallower than the first one ends up in the top level
            # instead of emptying the stack.
            while len(open_levels) > 1 and open_levels[-1][0] >= depth:
                open_levels.pop()
            current_toc_level = open_levels[-1][1]

            # A kind of look-ahead is needed to determine how to insert
            # the current item: it has children only if the next item is
            # deeper. A depth increase greater than 1 is handled as well,
            # so that no item is lost.
            has_children = (index + 1 < len(items)
                            and items[index + 1][0] > depth)
            if has_children:
                children = []
                current_toc_level.append({desc: children})
                open_levels.append((depth, children))
            else:
                current_toc_level.append(desc)

        return toc

    @staticmethod
    def from_bytestring(s) -> str:
        """
        If the input string is a byte-string, converts it to a string using
        UTF-8 as encoding.

        @param s: A string or a byte-string.
        @rtype: str
        @returns: The potentially converted string. An empty string is
        returned for an empty or None input, so that the result can always
        be joined with other strings.
        """

        if not s:
            return ''
        if isinstance(s, str):
            return s
        # Byte-strings have to be decoded (not encoded) to get a str.
        return s.decode('utf-8')

    @staticmethod
    def parse_layout_nodes(container : LTContainer) -> str:
        """
        Recursively extracts the text from all the nodes contained in the
        input PDF layout tree/sub-tree.

        @type container: LTContainer
        @param container: The PDF layout tree/sub-tree from which to
        extract the text.
        @rtype: str
        @returns: A string containing the extracted text.
        """

        text_content = []

        # The iterator returns the children nodes.
        for node in container:
            if isinstance(node, LTTextContainer):
                # Only nodes of type LTTextContainer contain text.
                text_content.append(
                    PDFExtractionStrategy.from_bytestring(node.get_text()))
            elif isinstance(node, LTContainer):
                # Recursively calling the method on the current node,
                # which is a container itself.
                text_content.append(
                    PDFExtractionStrategy.parse_layout_nodes(node))
            # All the other node types are ignored.

        # Joining all the extracted text chunks with a new line character.
        return "\n".join(text_content)

    @staticmethod
    def parse_pages(doc : PDFDocument) -> list:
        """
        Extracts the text of each page of the input document.

        @type doc: PDFDocument
        @param doc: The PDF document from which to extract the text.
        @rtype: list
        @returns: A list containing one string of text for each page.
        """

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        text_content = []
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout = device.get_result()
            # Extracts the text from all the nodes of the PDF layout tree
            # of each page.
            text_content.append(
                PDFExtractionStrategy.parse_layout_nodes(layout))

        return text_content

    @staticmethod
    def parse_pdf(file_path : str) -> (list, list):
        """
        Parses the PDF file and extracts its TOC and its pages.

        @type file_path: str
        @param file_path: The path of the PDF document to be parsed.
        @rtype: tuple
        @returns: The pair (<TOC tree>, <list of the pages text>). Both are
        left empty when the document is not extractable or when
        an IOError occurs.
        """

        toc = []
        pages = []
        try:
            # The context manager closes the file even if the parsing
            # raises an exception.
            with open(file_path, 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                parser.set_document(doc)

                if doc.is_extractable:
                    toc = PDFExtractionStrategy.parse_toc(doc)
                    pages = PDFExtractionStrategy.parse_pages(doc)
        except IOError:
            # Best effort: on I/O errors what has been extracted so far
            # (possibly nothing) is returned.
            pass
        return (toc, pages)

    @staticmethod
    def extract_document_data(file_path : str) -> DocumentData:
        """
        @type file_path: str
        @param file_path: The path of the PDF document to be parsed.
        @rtype: DocumentData
        @returns: An object containing the TOC and the pages of the document.
        """

        toc, pages = PDFExtractionStrategy.parse_pdf(file_path)
        return DocumentData(toc, pages = pages)


class DocumentDataExtractor:
    """
    Main class of the module.
    It's the one that actually runs the text extraction, whose output is
    made of:
    -table of contents (TOC);
    -pages contents.
    """

    # Extraction strategy to be used for each document type: the keys are
    # the DocumentType enum values, the values are the strategy instances.
    strategies = {
        DocumentType.DOCX : DOCXExtractionStrategy(),
        DocumentType.PDF : PDFExtractionStrategy(),
    }

    @classmethod
    def extract_document_data(cls, file_path : str) -> DocumentData:
        """
        Module entry point: extracts the data of the document stored at
        the given path.
        The MIME type detected for the file decides which extraction
        strategy is used.

        @type file_path: str
        @param file_path: The path of the document to be parsed.
        @rtype: DocumentData
        @returns: An object containing the data of the parsed document.
        """

        # Detecting the MIME type from the file itself.
        detected_mime = magic.Magic(mime=True).from_file(file_path)
        # Looking up the strategy registered for the matching document type.
        document_type = DocumentType.get_instance(detected_mime)
        return cls.strategies[document_type].extract_document_data(file_path)
-- 
https://mail.python.org/mailman/listinfo/python-list

Reply via email to