Hello everyone, lately I have been building an open source project with some collaborators, but one of them cannot contribute any more. He is a solution architect, so he is very skilled (much more than me!). I am now analysing his code to finish the job, but I don't understand this use of the lambda arrow: it's as if he is declaring the return type in the function signature (as you would do in Java). I have never seen anything like this in Python.
# Can someone please explain to me this usage (the part regarding the question
# is highlighted in yellow):
#
#     @classmethod
#     def extract_document_data(cls, file_path : str) -> DocumentData:
#         """
#         Entry point of the module, it extracts the data from the document
#         whose path is passed as input.
#         The extraction strategy is automatically chosen based on the MIME
#         type of the file.
#
#         @type file_path: str
#         @param file_path: The path of the document to be parsed.
#         @rtype: DocumentData
#         @returns: An object containing the data of the parsed document.
#         """
#         mime = magic.Magic(mime=True)
#         mime_type = mime.from_file(file_path)
#         document_type = DocumentType.get_instance(mime_type)
#         strategy = cls.strategies[document_type]
#         return strategy.extract_document_data(file_path)
#
# To be more verbose, this is the whole script:

from enum import Enum
import json
import logging

import magic
import docx
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer, LTTextContainer
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

logger = logging.getLogger(__name__)


class DocumentType(Enum):
    """
    Defines the handled document types.
    Each value is associated to a MIME type.
    """

    PDF = 'application/pdf'
    DOCX = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

    def __init__(self, mime_type):
        # Enum passes each member's value to __init__, so every member
        # exposes its MIME type through the `mime_type` attribute.
        self.mime_type = mime_type

    @classmethod
    def get_instance(cls, mime_type: str) -> 'DocumentType':
        """
        Returns the enum member associated to the given MIME type.

        @type mime_type: str
        @param mime_type: The MIME type to look up.
        @rtype: DocumentType
        @returns: The member whose MIME type equals the input one.
        @raise MimeNotValidError: If no member handles the MIME type.
        """
        for value in cls:
            if value.mime_type == mime_type:
                return value
        raise MimeNotValidError(mime_type)


class MimeNotValidError(Exception):
    """
    Exception to be raised when a not valid MIME type is processed.
    """


class DocumentData:
    """
    Wrapper for the extracted document data (TOC and contents).
    """

    def __init__(self, toc: list = None, pages: list = None, document_text: str = None):
        """
        @type toc: list
        @param toc: The TOC tree; defaults to a new empty list.
        @type pages: list
        @param pages: The text of each page; defaults to a new empty list.
        @type document_text: str
        @param document_text: The whole document text. When omitted it is
            built by joining the pages, with new lines replaced by spaces.
        """
        # None is used as default instead of [] so that instances never share
        # the same (mutable) default list.
        self.toc = [] if toc is None else toc
        self.pages = [] if pages is None else pages
        if document_text is not None:
            self.document_text = document_text
        else:
            self.document_text = ' '.join(
                page.replace('\n', ' ') for page in self.pages)

    def toc_as_json(self) -> str:
        """
        @rtype: str
        @returns: The TOC serialized as a JSON string.
        """
        return json.dumps(self.toc)


class ExtractionStrategy:
    """
    Base class for the extraction strategies.
    """

    @staticmethod
    def extract_document_data(file_path: str) -> DocumentData:
        """
        Extracts the data from the document at the given path.
        To be overridden by the concrete strategies.

        @type file_path: str
        @param file_path: The path of the document to be parsed.
        @rtype: DocumentData
        @raise NotImplementedError: Always, as the base class has no
            extraction logic.
        """
        raise NotImplementedError(
            'extract_document_data must be implemented by the subclasses')


class DOCXExtractionStrategy(ExtractionStrategy):
    """
    It implements the TOC and contents extraction from a DOCX document.
    """

    @staticmethod
    def extract_document_data(file_path: str) -> DocumentData:
        """
        Extracts the text of a DOCX document (no TOC is extracted).

        @type file_path: str
        @param file_path: The path of the DOCX document.
        @rtype: DocumentData
        @returns: An object whose document text is the space-joined content
            of the text elements of the document.
        """
        document = docx.Document(file_path)
        body_elements = document._body._body
        # Selecting only the <w:t> elements from DOCX XML,
        # as they're the only to contain some text.
        text_elems = body_elements.xpath('.//w:t')
        # Empty <w:t/> elements have text set to None, which str.join
        # rejects: skipping them.
        chunks = [elem.text for elem in text_elems if elem.text]
        return DocumentData(document_text=' '.join(chunks))


class PDFExtractionStrategy(ExtractionStrategy):
    """
    It implements the TOC and contents extraction from a PDF document.
    """

    @staticmethod
    def parse_toc(doc: PDFDocument) -> list:
        """
        Extracts the TOC tree from the outlines of a PDF document.

        @type doc: PDFDocument
        @param doc: The parsed PDF document.
        @rtype: list
        @returns: The TOC tree, empty if the document has no outlines.
        """
        raw_toc = []
        try:
            for level, title, *_ in doc.get_outlines():
                raw_toc.append((level, title))
        except PDFNoOutlines:
            # A document without outlines simply has an empty TOC.
            pass
        return PDFExtractionStrategy.build_toc_tree(raw_toc)

    @staticmethod
    def build_toc_tree(items: list) -> list:
        """
        Builds the TOC tree from a list of TOC items.

        @type items: list
        @param items: The TOC items. Each item must have the following
            format: (<item depth>, <item description>).
            E.g: [(1, 'Contents'), (2, 'Chapter 1'), (2, 'Chapter 2')]
            The list is not modified.
        @rtype: list
        @returns: The TOC tree. The tree hasn't a root element, therefore it
            actually is a list. Each element is either a string (an item
            without children) or a dict with a single key (the item
            description) whose value is the list of the children.
            E.g: [{'Contents': ['Chapter 1', 'Chapter 2']}]
        """
        toc = []
        if not items:
            return toc

        # Using an explicit stack of the currently open TOC levels, to
        # simulate the recursive building of the tree. Each entry is
        # (<depth of the item owning the level>, <children list>); the
        # first entry is the root level, which is never removed.
        levels = [(None, toc)]

        for index, (depth, desc) in enumerate(items):
            # To determine how to insert an item, a look-ahead is needed:
            # the item has children only if the next item is deeper.
            is_last = index + 1 >= len(items)
            next_depth = None if is_last else items[index + 1][0]
            current_level = levels[-1][1]

            if next_depth is not None and next_depth > depth:
                # The next item is a child of this one (even if its depth
                # jumps by more than 1): inserting the item as dict and
                # opening a new level for its children.
                children = []
                current_level.append({desc: children})
                levels.append((depth, children))
            else:
                # No children: inserting the item as string.
                current_level.append(desc)
                if next_depth is not None:
                    # Closing all the levels owned by items which are not
                    # ancestors of the next item, always keeping the root.
                    while len(levels) > 1 and levels[-1][0] >= next_depth:
                        levels.pop()
        return toc

    @staticmethod
    def from_bytestring(s) -> str:
        """
        If the input string is a byte-string, converts it to a string using
        UTF-8 as encoding.

        @param s: A string or a byte-string.
        @rtype: str
        @returns: The potentially converted string, or an empty string if
            the input is empty or None.
        """
        if not s:
            # Returning '' rather than None keeps the result joinable.
            return ''
        if isinstance(s, (bytes, bytearray)):
            # Byte-strings have to be *decoded* to obtain a str.
            return s.decode('utf-8')
        return s

    @staticmethod
    def parse_layout_nodes(container: LTContainer) -> str:
        """
        Recursively extracts the text from all the nodes contained in the
        input PDF layout tree/sub-tree.

        @type container: LTContainer
        @param container: The PDF layout tree/sub-tree from which to
            extract the text.
        @rtype: str
        @returns: A string containing the extracted text.
        """
        text_content = []
        # The iterator returns the children nodes.
        for node in container:
            if isinstance(node, LTTextContainer):
                # Only nodes of type LTTextContainer contain text.
                text_content.append(
                    PDFExtractionStrategy.from_bytestring(node.get_text()))
            elif isinstance(node, LTContainer):
                # Recursively calling the method on the current node,
                # which is a container itself.
                text_content.append(
                    PDFExtractionStrategy.parse_layout_nodes(node))
            # All the other node types are ignored.
        # Joining all the extracted text chunks with a new line character.
        return "\n".join(text_content)

    @staticmethod
    def parse_pages(doc: PDFDocument) -> list:
        """
        Extracts the text of each page of a PDF document.

        @type doc: PDFDocument
        @param doc: The parsed PDF document.
        @rtype: list
        @returns: A list containing the text of each page, in page order.
        """
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        text_content = []
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout = device.get_result()
            # Extracts the text from all the nodes of the PDF layout tree
            # of each page.
            text_content.append(
                PDFExtractionStrategy.parse_layout_nodes(layout))
        return text_content

    @staticmethod
    def parse_pdf(file_path: str) -> tuple:
        """
        Parses a PDF file, extracting its TOC and the text of its pages.

        @type file_path: str
        @param file_path: The path of the PDF document.
        @rtype: tuple
        @returns: The pair (<TOC tree>, <pages text>). Both lists are empty
            if the document is not extractable; if an I/O error occurs the
            error is logged and whatever was extracted so far is returned.
        """
        toc = []
        pages = []
        try:
            # The context manager closes the file even if the parsing fails.
            with open(file_path, 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                parser.set_document(doc)

                if doc.is_extractable:
                    toc = PDFExtractionStrategy.parse_toc(doc)
                    pages = PDFExtractionStrategy.parse_pages(doc)
        except IOError as err:
            # Best-effort extraction: the error is reported, not raised.
            logger.warning('Unable to read PDF file %r: %s', file_path, err)
        return (toc, pages)

    @staticmethod
    def extract_document_data(file_path: str) -> DocumentData:
        """
        Extracts the TOC and the contents of a PDF document.

        @type file_path: str
        @param file_path: The path of the PDF document.
        @rtype: DocumentData
        @returns: An object containing the TOC and the pages of the document.
        """
        toc, pages = PDFExtractionStrategy.parse_pdf(file_path)
        return DocumentData(toc, pages=pages)


class DocumentDataExtractor:
    """
    Main class of the module.
    It's responsible for actually executing the text extraction.
    The output is constituted by the following items:
    -table of contents (TOC);
    -pages contents.
    """

    # Dictionary containing the extraction strategies for the different
    # document types, indexed by the corresponding DocumentType enum values.
    strategies = {
        DocumentType.DOCX: DOCXExtractionStrategy(),
        DocumentType.PDF: PDFExtractionStrategy(),
    }

    @classmethod
    def extract_document_data(cls, file_path: str) -> DocumentData:
        """
        Entry point of the module, it extracts the data from the document
        whose path is passed as input.
        The extraction strategy is automatically chosen based on the MIME
        type of the file.

        @type file_path: str
        @param file_path: The path of the document to be parsed.
        @rtype: DocumentData
        @returns: An object containing the data of the parsed document.
        @raise MimeNotValidError: If the MIME type of the file is not handled.
        """
        mime = magic.Magic(mime=True)
        mime_type = mime.from_file(file_path)
        document_type = DocumentType.get_instance(mime_type)
        strategy = cls.strategies[document_type]
        return strategy.extract_document_data(file_path)

# --
# https://mail.python.org/mailman/listinfo/python-list