pitrou commented on code in PR #45234: URL: https://github.com/apache/arrow/pull/45234#discussion_r1944462112
########## cpp/src/parquet/row_range.h: ########## @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <variant> +#include <vector> + +#include "parquet/platform.h" + +namespace parquet { + +/// RowRanges is a collection of non-overlapping and ascendingly ordered row ranges. +class PARQUET_EXPORT RowRanges { + public: + /// \brief A range of contiguous rows represented by an interval. + struct IntervalRange { + /// Start row of the range (inclusive). + int64_t start; + /// End row of the range (inclusive). Review Comment: I would suggest either a length or a exclusive end. Inclusive is more confusing and error-prone IMHO. ########## cpp/src/parquet/row_range.h: ########## @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <variant> Review Comment: Also need to include `<cstdint>` ########## cpp/src/parquet/row_range.h: ########## @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <variant> +#include <vector> + +#include "parquet/platform.h" + +namespace parquet { + +/// RowRanges is a collection of non-overlapping and ascendingly ordered row ranges. +class PARQUET_EXPORT RowRanges { + public: + /// \brief A range of contiguous rows represented by an interval. + struct IntervalRange { + /// Start row of the range (inclusive). + int64_t start; + /// End row of the range (inclusive). + int64_t end; + }; + + /// \brief A range of contiguous rows represented by a bitmap. + struct BitmapRange { + /// Start row of the range (inclusive). + int64_t offset; + /// Zero appended if there are less than 64 elements. + uint64_t bitmap; + }; + + /// \brief An end marker for the row range iterator. + struct End {}; + + /// \brief An iterator for accessing row ranges in order. + class Iterator { + public: + virtual ~Iterator() = default; + virtual std::variant<IntervalRange, BitmapRange, End> NextRange() = 0; Review Comment: I agree that doesn't look terrific. Perhaps something like: ```c++ /// Return a batch of ranges. The returned batch has size 0 iff the iterator is exhausted. virtual util::span<std::variant<IntervalRange, BitmapRange>> NextRanges() = 0; ``` ########## cpp/src/parquet/row_range.h: ########## @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <variant> +#include <vector> + +#include "parquet/platform.h" + +namespace parquet { + +/// RowRanges is a collection of non-overlapping and ascendingly ordered row ranges. +class PARQUET_EXPORT RowRanges { + public: + /// \brief A range of contiguous rows represented by an interval. + struct IntervalRange { + /// Start row of the range (inclusive). + int64_t start; + /// End row of the range (inclusive). Review Comment: We commonly use the (start, length) convention in other places. ########## cpp/src/parquet/row_range.h: ########## @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <variant> +#include <vector> + +#include "parquet/platform.h" + +namespace parquet { + +/// RowRanges is a collection of non-overlapping and ascendingly ordered row ranges. +class PARQUET_EXPORT RowRanges { + public: + /// \brief A range of contiguous rows represented by an interval. + struct IntervalRange { + /// Start row of the range (inclusive). + int64_t start; + /// End row of the range (inclusive). + int64_t end; + }; + + /// \brief A range of contiguous rows represented by a bitmap. + struct BitmapRange { + /// Start row of the range (inclusive). + int64_t offset; + /// Zero appended if there are less than 64 elements. + uint64_t bitmap; + }; + + /// \brief An end marker for the row range iterator. + struct End {}; + + /// \brief An iterator for accessing row ranges in order. + class Iterator { + public: + virtual ~Iterator() = default; + virtual std::variant<IntervalRange, BitmapRange, End> NextRange() = 0; + }; + + /// \brief Create a new iterator for accessing row ranges in order. + std::unique_ptr<Iterator> NewIterator() const; + + /// \brief Validate the row ranges. + /// \throws ParquetException if the row ranges are not in ascending order or + /// overlapped. + void Validate() const; + + /// \brief Get the total number of rows in the row ranges. + int64_t row_count() const; + + /// \brief Compute the intersection of two row ranges. + static RowRanges Intersect(const RowRanges& lhs, const RowRanges& rhs); + + /// \brief Compute the union of two row ranges. + static RowRanges Union(const RowRanges& lhs, const RowRanges& rhs); + + /// \brief Make a single row range of [0, row_count - 1]. + static RowRanges MakeSingle(int64_t row_count); + + /// \brief Make a single row range of [start, end]. + static RowRanges MakeSingle(int64_t start, int64_t end); + + /// \brief Make a row range from a list of intervals. + static RowRanges MakeIntervals(const std::vector<IntervalRange>& intervals); Review Comment: Could take a `span<const IntervalRange>` instead. ########## cpp/src/parquet/row_range.h: ########## @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <variant> +#include <vector> + +#include "parquet/platform.h" + +namespace parquet { + +/// RowRanges is a collection of non-overlapping and ascendingly ordered row ranges. +class PARQUET_EXPORT RowRanges { + public: + /// \brief A range of contiguous rows represented by an interval. + struct IntervalRange { + /// Start row of the range (inclusive). + int64_t start; + /// End row of the range (inclusive). + int64_t end; + }; + + /// \brief A range of contiguous rows represented by a bitmap. + struct BitmapRange { + /// Start row of the range (inclusive). + int64_t offset; + /// Zero appended if there are less than 64 elements. + uint64_t bitmap; + }; + + /// \brief An end marker for the row range iterator. + struct End {}; + + /// \brief An iterator for accessing row ranges in order. + class Iterator { + public: + virtual ~Iterator() = default; + virtual std::variant<IntervalRange, BitmapRange, End> NextRange() = 0; + }; + + /// \brief Create a new iterator for accessing row ranges in order. + std::unique_ptr<Iterator> NewIterator() const; + + /// \brief Validate the row ranges. + /// \throws ParquetException if the row ranges are not in ascending order or + /// overlapped. + void Validate() const; + + /// \brief Get the total number of rows in the row ranges. + int64_t row_count() const; + + /// \brief Compute the intersection of two row ranges. + static RowRanges Intersect(const RowRanges& lhs, const RowRanges& rhs); + + /// \brief Compute the union of two row ranges. + static RowRanges Union(const RowRanges& lhs, const RowRanges& rhs); + + /// \brief Make a single row range of [0, row_count - 1]. + static RowRanges MakeSingle(int64_t row_count); + + /// \brief Make a single row range of [start, end]. + static RowRanges MakeSingle(int64_t start, int64_t end); + + /// \brief Make a row range from a list of intervals. + static RowRanges MakeIntervals(const std::vector<IntervalRange>& intervals); Review Comment: Also, perhaps name this `FromIntervals`. ########## cpp/src/parquet/row_range.h: ########## @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <variant> +#include <vector> + +#include "parquet/platform.h" + +namespace parquet { + +/// RowRanges is a collection of non-overlapping and ascendingly ordered row ranges. +class PARQUET_EXPORT RowRanges { + public: + /// \brief A range of contiguous rows represented by an interval. + struct IntervalRange { + /// Start row of the range (inclusive). + int64_t start; + /// End row of the range (inclusive). + int64_t end; + }; + + /// \brief A range of contiguous rows represented by a bitmap. + struct BitmapRange { Review Comment: There are other possible representations, for example: ```c++ /// \brief A range of 8 rows represented by a start offset and small indices struct IndexedRange { /// Start row of the range int64_t offset; /// Indices from `offset` that point to selected rows, in ascending order. std::array<uint8_t, 8> indices; }; ``` ########## cpp/src/parquet/row_range.h: ########## @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <variant> +#include <vector> + +#include "parquet/platform.h" + +namespace parquet { + +/// RowRanges is a collection of non-overlapping and ascendingly ordered row ranges. +class PARQUET_EXPORT RowRanges { Review Comment: Plural class names are always a bit weird, should we perhaps name it `RowSelection`? ########## cpp/src/parquet/row_range.h: ########## @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <variant> +#include <vector> + +#include "parquet/platform.h" + +namespace parquet { + +/// RowRanges is a collection of non-overlapping and ascendingly ordered row ranges. +class PARQUET_EXPORT RowRanges { + public: + /// \brief A range of contiguous rows represented by an interval. + struct IntervalRange { + /// Start row of the range (inclusive). + int64_t start; + /// End row of the range (inclusive). + int64_t end; + }; + + /// \brief A range of contiguous rows represented by a bitmap. + struct BitmapRange { Review Comment: Is this actually convenient to use? I see it's not supported yet in the implementation... ########## cpp/src/parquet/row_range.h: ########## @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <variant> +#include <vector> + +#include "parquet/platform.h" + +namespace parquet { + +/// RowRanges is a collection of non-overlapping and ascendingly ordered row ranges. +class PARQUET_EXPORT RowRanges { + public: + /// \brief A range of contiguous rows represented by an interval. + struct IntervalRange { + /// Start row of the range (inclusive). + int64_t start; + /// End row of the range (inclusive). + int64_t end; + }; + + /// \brief A range of contiguous rows represented by a bitmap. + struct BitmapRange { + /// Start row of the range (inclusive). + int64_t offset; + /// Zero appended if there are less than 64 elements. + uint64_t bitmap; + }; + + /// \brief An end marker for the row range iterator. + struct End {}; + + /// \brief An iterator for accessing row ranges in order. + class Iterator { Review Comment: We have iteration facilities in `arrow/util/iterator.h`, should they be reused instead of defining our own here? ########## cpp/src/parquet/row_range.h: ########## @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <variant> +#include <vector> + +#include "parquet/platform.h" + +namespace parquet { + +/// RowRanges is a collection of non-overlapping and ascendingly ordered row ranges. +class PARQUET_EXPORT RowRanges { + public: + /// \brief A range of contiguous rows represented by an interval. + struct IntervalRange { + /// Start row of the range (inclusive). + int64_t start; + /// End row of the range (inclusive). + int64_t end; + }; + + /// \brief A range of contiguous rows represented by a bitmap. + struct BitmapRange { + /// Start row of the range (inclusive). + int64_t offset; + /// Zero appended if there are less than 64 elements. + uint64_t bitmap; + }; + + /// \brief An end marker for the row range iterator. + struct End {}; + + /// \brief An iterator for accessing row ranges in order. + class Iterator { + public: + virtual ~Iterator() = default; + virtual std::variant<IntervalRange, BitmapRange, End> NextRange() = 0; + }; + + /// \brief Create a new iterator for accessing row ranges in order. + std::unique_ptr<Iterator> NewIterator() const; + + /// \brief Validate the row ranges. + /// \throws ParquetException if the row ranges are not in ascending order or + /// overlapped. + void Validate() const; + + /// \brief Get the total number of rows in the row ranges. + int64_t row_count() const; + + /// \brief Compute the intersection of two row ranges. + static RowRanges Intersect(const RowRanges& lhs, const RowRanges& rhs); + + /// \brief Compute the union of two row ranges. + static RowRanges Union(const RowRanges& lhs, const RowRanges& rhs); + + /// \brief Make a single row range of [0, row_count - 1]. + static RowRanges MakeSingle(int64_t row_count); + + /// \brief Make a single row range of [start, end]. + static RowRanges MakeSingle(int64_t start, int64_t end); Review Comment: This is basically a simpler version of the IntervalRange-taking constructor? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
