Github user omalley commented on a diff in the pull request: https://github.com/apache/orc/pull/41#discussion_r71379015 --- Diff: c++/include/orc/Reader.hh --- @@ -782,85 +847,42 @@ namespace orc { getColumnStatistics(uint32_t columnId) const = 0; /** - * Get the type of the rows in the file. The top level is typically a - * struct. - * @return the root type - */ - virtual const Type& getType() const = 0; - - /** - * Get the selected type of the rows in the file. The file's row type - * is projected down to just the selected columns. Thus, if the file's - * type is struct<col0:int,col1:double,col2:string> and the selected - * columns are "col0,col2" the selected type would be - * struct<col0:int,col2:string>. - * @return the root type - */ - virtual const Type& getSelectedType() const = 0; - - /** - * Get the selected columns of the file. + * check file has correct column statistics */ - virtual const std::vector<bool> getSelectedColumns() const = 0; + virtual bool hasCorrectStatistics() const = 0; /** - * Create a row batch for reading the selected columns of this file. - * @param size the number of rows to read - * @return a new ColumnVectorBatch to read into + * Get the serialized file tail. + * Usefull if another reader of the same file wants to avoid re-reading + * the file tail. See ReaderOptions.setSerializedFileTail(). + * @return a string of bytes with the file tail */ - virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size - ) const = 0; + virtual std::string getSerializedFileTail() const = 0; /** - * Read the next row batch from the current position. - * Caller must look at numElements in the row batch to determine how - * many rows were read. - * @param data the row batch to read into. - * @return true if a non-zero number of rows were read or false if the - * end of the file was reached. + * Get the type of the rows in the file. The top level is typically a + * struct. 
+ * @return the root type */ - virtual bool next(ColumnVectorBatch& data) = 0; + virtual const Type& getType() const = 0; /** - * Get the row number of the first row in the previously read batch. - * @return the row number of the previous batch. + * @return a RowReader to read the rows */ - virtual uint64_t getRowNumber() const = 0; + virtual ORC_UNIQUE_PTR<RowReader> getRowReader() const = 0; /** - * Seek to a given row. - * @param rowNumber the next row the reader should return + * @param include update with new columns + * @return a RowReader to read the rows */ - virtual void seekToRow(uint64_t rowNumber) = 0; + virtual ORC_UNIQUE_PTR<RowReader> + getRowReader(const std::list<uint64_t>& include) const = 0; --- End diff -- Let's go ahead and make a RowReaderOptions class and pass that in here. It is almost guaranteed that the include vector will not be the only option that we want to pass in. One of the options will be to specify the include vector.
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and would like it, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. ---