wgtmac commented on code in PR #43977:
URL: https://github.com/apache/arrow/pull/43977#discussion_r1759070389
##########
cpp/src/parquet/thrift_internal.h:
##########
@@ -231,6 +231,36 @@ static inline AadMetadata FromThrift(format::AesGcmCtrV1
aesGcmCtrV1) {
aesGcmCtrV1.supply_aad_prefix};
}
+static inline EncodedGeometryStatistics FromThrift(
+ const format::GeometryStatistics& geometry_stats, bool has_geometry_stats)
{
Review Comment:
Please remove `bool has_geometry_stats` to be consistent with other similar
functions.
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -243,6 +243,10 @@ EncodedStatistics ExtractStatsFromHeader(const H& header) {
if (stats.__isset.distinct_count) {
page_statistics.set_distinct_count(stats.distinct_count);
}
+ if (stats.__isset.geometry_stats) {
+ page_statistics.set_geometry(
+ FromThrift(stats.geometry_stats, stats.__isset.geometry_stats));
Review Comment:
```suggestion
page_statistics.set_geometry(FromThrift(stats.geometry_stats));
```
##########
cpp/src/parquet/types.cc:
##########
@@ -1603,6 +1646,139 @@ class LogicalType::Impl::Float16 final : public
LogicalType::Impl::Incompatible,
GENERATE_MAKE(Float16)
+#define geometry_edges_string(u___) \
+ ((u___) == LogicalType::GeometryEdges::PLANAR \
+ ? "planar" \
+ : ((u___) == LogicalType::GeometryEdges::SPHERICAL ? "spherical" :
"unknown"))
+
+#define geometry_encoding_string(u___) \
+ ((u___) == LogicalType::GeometryEncoding::WKB ? "wkb" : "unknown")
+
+class LogicalType::Impl::Geometry final : public
LogicalType::Impl::Incompatible,
+ public
LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class GeometryLogicalType;
+
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ const std::string& crs() const { return crs_; }
+ LogicalType::GeometryEdges::edges edges() const { return edges_; }
+ LogicalType::GeometryEncoding::geometry_encoding encoding() const { return
encoding_; }
+ const std::string& metadata() const { return metadata_; }
+
+ private:
+ Geometry(std::string crs, LogicalType::GeometryEdges::edges edges,
+ LogicalType::GeometryEncoding::geometry_encoding encoding,
+ std::string metadata)
+ : LogicalType::Impl(LogicalType::Type::GEOMETRY, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY),
+ crs_(std::move(crs)),
+ edges_(edges),
+ encoding_(encoding),
+ metadata_(std::move(metadata)) {}
+
+ std::string crs_;
+ LogicalType::GeometryEdges::edges edges_;
+ LogicalType::GeometryEncoding::geometry_encoding encoding_;
+ std::string metadata_;
+};
+
+std::string LogicalType::Impl::Geometry::ToString() const {
+ std::stringstream type;
+ type << "Geometry(crs=" << crs_ << ", edges=" <<
geometry_edges_string(edges_)
+ << ", encoding=" << geometry_encoding_string(encoding_)
+ << ", metadata=" << metadata_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Geometry::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Geometry")";
+
+ if (crs_.size() > 0) {
Review Comment:
```suggestion
if (!crs_.empty()) {
```
##########
cpp/src/parquet/types.cc:
##########
@@ -1603,6 +1646,139 @@ class LogicalType::Impl::Float16 final : public
LogicalType::Impl::Incompatible,
GENERATE_MAKE(Float16)
+#define geometry_edges_string(u___) \
+ ((u___) == LogicalType::GeometryEdges::PLANAR \
+ ? "planar" \
+ : ((u___) == LogicalType::GeometryEdges::SPHERICAL ? "spherical" :
"unknown"))
+
+#define geometry_encoding_string(u___) \
+ ((u___) == LogicalType::GeometryEncoding::WKB ? "wkb" : "unknown")
+
+class LogicalType::Impl::Geometry final : public
LogicalType::Impl::Incompatible,
+ public
LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class GeometryLogicalType;
+
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ const std::string& crs() const { return crs_; }
+ LogicalType::GeometryEdges::edges edges() const { return edges_; }
+ LogicalType::GeometryEncoding::geometry_encoding encoding() const { return
encoding_; }
+ const std::string& metadata() const { return metadata_; }
+
+ private:
+ Geometry(std::string crs, LogicalType::GeometryEdges::edges edges,
+ LogicalType::GeometryEncoding::geometry_encoding encoding,
+ std::string metadata)
+ : LogicalType::Impl(LogicalType::Type::GEOMETRY, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY),
+ crs_(std::move(crs)),
+ edges_(edges),
+ encoding_(encoding),
+ metadata_(std::move(metadata)) {}
+
+ std::string crs_;
+ LogicalType::GeometryEdges::edges edges_;
+ LogicalType::GeometryEncoding::geometry_encoding encoding_;
+ std::string metadata_;
+};
+
+std::string LogicalType::Impl::Geometry::ToString() const {
+ std::stringstream type;
+ type << "Geometry(crs=" << crs_ << ", edges=" <<
geometry_edges_string(edges_)
+ << ", encoding=" << geometry_encoding_string(encoding_)
+ << ", metadata=" << metadata_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Geometry::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Geometry")";
+
+ if (crs_.size() > 0) {
+ // TODO(paleolimbot): we'll need to escape the crs or assume that it's
valid JSON
+ json << R"(, "crs": )" << crs_;
+ }
+
+ json << R"(, "edges": ")" << geometry_edges_string(edges_) << R"(")";
+ json << R"(, "encoding": ")" << geometry_encoding_string(encoding_) <<
R"(")";
+
+ if (metadata_.size() > 0) {
+ // TODO(paleolimbot): we'll need to escape the metadata or assume that
it's valid JSON
+ json << R"(, "metadata": )" << crs_;
+ }
+
+ json << "}";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Geometry::ToThrift() const {
+ format::LogicalType type;
+ format::GeometryType geometry_type;
+
+ // Canonially export crs of "" as an unset CRS
+ if (crs_.size() > 0) {
Review Comment:
```suggestion
if (!crs_.empty()) {
```
##########
cpp/src/parquet/column_writer_test.cc:
##########
@@ -1774,5 +1782,191 @@ TEST_F(TestInt32Writer, WriteKeyValueMetadataEndToEnd) {
ASSERT_EQ("bar", value);
}
+// Test writing and reading geometry columns
+class TestGeometryValuesWriter : public TestPrimitiveWriter<ByteArrayType> {
+ public:
+ static const char* CRS;
+ static const char* METADATA;
Review Comment:
```suggestion
inline static constexpr std::string_view kCrs = R"({"id": {"authority":
"OGC", "code": "CRS84"}})";
inline static constexpr std::string_view kMetadata = "test_metadata";
```
##########
cpp/src/parquet/types.cc:
##########
@@ -1603,6 +1646,139 @@ class LogicalType::Impl::Float16 final : public
LogicalType::Impl::Incompatible,
GENERATE_MAKE(Float16)
+#define geometry_edges_string(u___) \
+ ((u___) == LogicalType::GeometryEdges::PLANAR \
+ ? "planar" \
+ : ((u___) == LogicalType::GeometryEdges::SPHERICAL ? "spherical" :
"unknown"))
+
+#define geometry_encoding_string(u___) \
+ ((u___) == LogicalType::GeometryEncoding::WKB ? "wkb" : "unknown")
+
+class LogicalType::Impl::Geometry final : public
LogicalType::Impl::Incompatible,
+ public
LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class GeometryLogicalType;
+
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ const std::string& crs() const { return crs_; }
+ LogicalType::GeometryEdges::edges edges() const { return edges_; }
+ LogicalType::GeometryEncoding::geometry_encoding encoding() const { return
encoding_; }
+ const std::string& metadata() const { return metadata_; }
+
+ private:
+ Geometry(std::string crs, LogicalType::GeometryEdges::edges edges,
+ LogicalType::GeometryEncoding::geometry_encoding encoding,
+ std::string metadata)
+ : LogicalType::Impl(LogicalType::Type::GEOMETRY, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY),
+ crs_(std::move(crs)),
+ edges_(edges),
+ encoding_(encoding),
+ metadata_(std::move(metadata)) {}
+
+ std::string crs_;
+ LogicalType::GeometryEdges::edges edges_;
+ LogicalType::GeometryEncoding::geometry_encoding encoding_;
+ std::string metadata_;
+};
+
+std::string LogicalType::Impl::Geometry::ToString() const {
+ std::stringstream type;
+ type << "Geometry(crs=" << crs_ << ", edges=" <<
geometry_edges_string(edges_)
+ << ", encoding=" << geometry_encoding_string(encoding_)
+ << ", metadata=" << metadata_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Geometry::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Geometry")";
+
+ if (crs_.size() > 0) {
+ // TODO(paleolimbot): we'll need to escape the crs or assume that it's
valid JSON
+ json << R"(, "crs": )" << crs_;
+ }
+
+ json << R"(, "edges": ")" << geometry_edges_string(edges_) << R"(")";
+ json << R"(, "encoding": ")" << geometry_encoding_string(encoding_) <<
R"(")";
+
+ if (metadata_.size() > 0) {
+ // TODO(paleolimbot): we'll need to escape the metadata or assume that
it's valid JSON
+ json << R"(, "metadata": )" << crs_;
+ }
+
+ json << "}";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Geometry::ToThrift() const {
+ format::LogicalType type;
+ format::GeometryType geometry_type;
+
+ // Canonially export crs of "" as an unset CRS
+ if (crs_.size() > 0) {
+ geometry_type.__set_crs(crs_);
+ }
+
+ DCHECK(edges_ != LogicalType::GeometryEdges::UNKNOWN);
+ if (edges_ == LogicalType::GeometryEdges::SPHERICAL) {
+ geometry_type.__set_edges(format::Edges::SPHERICAL);
+ } else {
+ geometry_type.__set_edges(format::Edges::PLANAR);
+ }
+
+ DCHECK_EQ(encoding_, LogicalType::GeometryEncoding::WKB);
Review Comment:
Ditto: what about throwing a `ParquetException` here instead of using `DCHECK_EQ`?
##########
cpp/src/parquet/types.cc:
##########
@@ -1603,6 +1646,139 @@ class LogicalType::Impl::Float16 final : public
LogicalType::Impl::Incompatible,
GENERATE_MAKE(Float16)
+#define geometry_edges_string(u___) \
+ ((u___) == LogicalType::GeometryEdges::PLANAR \
+ ? "planar" \
+ : ((u___) == LogicalType::GeometryEdges::SPHERICAL ? "spherical" :
"unknown"))
+
+#define geometry_encoding_string(u___) \
+ ((u___) == LogicalType::GeometryEncoding::WKB ? "wkb" : "unknown")
+
+class LogicalType::Impl::Geometry final : public
LogicalType::Impl::Incompatible,
+ public
LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class GeometryLogicalType;
+
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ const std::string& crs() const { return crs_; }
+ LogicalType::GeometryEdges::edges edges() const { return edges_; }
+ LogicalType::GeometryEncoding::geometry_encoding encoding() const { return
encoding_; }
+ const std::string& metadata() const { return metadata_; }
+
+ private:
+ Geometry(std::string crs, LogicalType::GeometryEdges::edges edges,
+ LogicalType::GeometryEncoding::geometry_encoding encoding,
+ std::string metadata)
+ : LogicalType::Impl(LogicalType::Type::GEOMETRY, SortOrder::UNKNOWN),
Review Comment:
```suggestion
: LogicalType::Impl(LogicalType::Type::GEOMETRY, SortOrder::UNSIGNED),
```
##########
cpp/src/parquet/page_index.cc:
##########
@@ -167,6 +206,10 @@ class TypedColumnIndexImpl : public
TypedColumnIndex<DType> {
std::vector<T> max_values_;
/// A list of page indices for non-null pages.
std::vector<int32_t> non_null_page_indices_;
+ /// A list of encoded geometry statistics
+ std::vector<EncodedGeometryStatistics> encoded_geometry_statistics_;
Review Comment:
Why do we need to keep the encoded ones? We would always prefer
`GeometryStatistics`.
##########
cpp/src/parquet/statistics.cc:
##########
@@ -47,6 +48,305 @@ using arrow::util::SafeCopy;
using arrow::util::SafeLoad;
namespace parquet {
+
+class GeometryStatisticsImpl {
+ public:
+ GeometryStatisticsImpl() = default;
+ GeometryStatisticsImpl(const GeometryStatisticsImpl&) = default;
+
+ bool Equals(const GeometryStatisticsImpl& other) const {
+ if (is_valid_ != other.is_valid_) {
+ return false;
+ }
+
+ if (!is_valid_ && !other.is_valid_) {
+ return true;
+ }
+
+ auto geometry_types = bounder_.GeometryTypes();
+ auto other_geometry_types = other.bounder_.GeometryTypes();
+ if (geometry_types.size() != other_geometry_types.size()) {
+ return false;
+ }
+
+ for (size_t i = 0; i < geometry_types.size(); i++) {
+ if (geometry_types[i] != other_geometry_types[i]) {
+ return false;
+ }
+ }
+
+ return bounder_.Bounds() == other.bounder_.Bounds();
+ }
+
+ void Merge(const GeometryStatisticsImpl& other) {
+ if (!is_valid_ || !other.is_valid_) {
+ is_valid_ = false;
+ return;
+ }
+
+ bounder_.ReadBox(other.bounder_.Bounds());
+ bounder_.ReadGeometryTypes(other.bounder_.GeometryTypes());
+ }
+
+ void Update(const ByteArray* values, int64_t num_values, int64_t null_count)
{
+ if (!is_valid_) {
+ return;
+ }
+
+ geometry::WKBBuffer buf;
+ try {
+ for (int64_t i = 0; i < num_values; i++) {
+ const ByteArray& item = values[i];
+ buf.Init(item.ptr, item.len);
+ bounder_.ReadGeometry(&buf);
+ }
+
+ bounder_.Flush();
+ } catch (ParquetException&) {
+ is_valid_ = false;
+ }
+ }
+
+ void UpdateSpaced(const ByteArray* values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, int64_t num_spaced_values,
+ int64_t num_values, int64_t null_count) {
+ DCHECK_GT(num_spaced_values, 0);
+
+ geometry::WKBBuffer buf;
+ try {
+ ::arrow::internal::VisitSetBitRunsVoid(
+ valid_bits, valid_bits_offset, num_spaced_values,
+ [&](int64_t position, int64_t length) {
+ for (int64_t i = 0; i < length; i++) {
+ ByteArray item = SafeLoad(values + i + position);
+ buf.Init(item.ptr, item.len);
+ bounder_.ReadGeometry(&buf);
+ }
+ });
+ bounder_.Flush();
+ } catch (ParquetException&) {
+ is_valid_ = false;
+ }
+ }
+
+ void Update(const ::arrow::Array& values, bool update_counts) {
+ ARROW_UNUSED(update_counts);
Review Comment:
Why do we need `update_counts`?
##########
cpp/src/parquet/statistics.h:
##########
@@ -372,11 +445,14 @@ template <typename DType>
std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
const ColumnDescriptor* descr, const std::string& encoded_min,
const std::string& encoded_max, int64_t num_values, int64_t null_count,
- int64_t distinct_count, bool has_min_max, bool has_null_count,
- bool has_distinct_count, ::arrow::MemoryPool* pool =
::arrow::default_memory_pool()) {
- return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
- descr, encoded_min, encoded_max, num_values, null_count, distinct_count,
- has_min_max, has_null_count, has_distinct_count, pool));
+ int64_t distinct_count, const EncodedGeometryStatistics&
geometry_statistics,
Review Comment:
Why not directly add `const EncodedGeometryStatistics* geometry_statistics =
NULLPTR` to the end of the existing function signature?
##########
cpp/src/parquet/types.cc:
##########
@@ -1603,6 +1646,139 @@ class LogicalType::Impl::Float16 final : public
LogicalType::Impl::Incompatible,
GENERATE_MAKE(Float16)
+#define geometry_edges_string(u___) \
+ ((u___) == LogicalType::GeometryEdges::PLANAR \
+ ? "planar" \
+ : ((u___) == LogicalType::GeometryEdges::SPHERICAL ? "spherical" :
"unknown"))
+
+#define geometry_encoding_string(u___) \
+ ((u___) == LogicalType::GeometryEncoding::WKB ? "wkb" : "unknown")
+
+class LogicalType::Impl::Geometry final : public
LogicalType::Impl::Incompatible,
+ public
LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class GeometryLogicalType;
+
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ const std::string& crs() const { return crs_; }
+ LogicalType::GeometryEdges::edges edges() const { return edges_; }
+ LogicalType::GeometryEncoding::geometry_encoding encoding() const { return
encoding_; }
+ const std::string& metadata() const { return metadata_; }
+
+ private:
+ Geometry(std::string crs, LogicalType::GeometryEdges::edges edges,
+ LogicalType::GeometryEncoding::geometry_encoding encoding,
+ std::string metadata)
+ : LogicalType::Impl(LogicalType::Type::GEOMETRY, SortOrder::UNKNOWN),
Review Comment:
`SortOrder::UNSIGNED` is the default sort order of the `BYTE_ARRAY` type. Could
we just use it so that you don't have to change a line in column_writer.cc? A
nice side effect is that the ColumnIndex of the geometry type can also be
generated automatically, though its min/max values are derived from the raw
binary values and are therefore useless. This is the same practice used in the
Java PoC impl.
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -243,6 +243,10 @@ EncodedStatistics ExtractStatsFromHeader(const H& header) {
if (stats.__isset.distinct_count) {
page_statistics.set_distinct_count(stats.distinct_count);
}
+ if (stats.__isset.geometry_stats) {
+ page_statistics.set_geometry(
+ FromThrift(stats.geometry_stats, stats.__isset.geometry_stats));
Review Comment:
I have also left a comment in thrift_internal.h about its signature.
##########
cpp/src/parquet/types.cc:
##########
@@ -1603,6 +1646,139 @@ class LogicalType::Impl::Float16 final : public
LogicalType::Impl::Incompatible,
GENERATE_MAKE(Float16)
+#define geometry_edges_string(u___) \
+ ((u___) == LogicalType::GeometryEdges::PLANAR \
+ ? "planar" \
+ : ((u___) == LogicalType::GeometryEdges::SPHERICAL ? "spherical" :
"unknown"))
+
+#define geometry_encoding_string(u___) \
+ ((u___) == LogicalType::GeometryEncoding::WKB ? "wkb" : "unknown")
+
+class LogicalType::Impl::Geometry final : public
LogicalType::Impl::Incompatible,
+ public
LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class GeometryLogicalType;
+
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ const std::string& crs() const { return crs_; }
+ LogicalType::GeometryEdges::edges edges() const { return edges_; }
+ LogicalType::GeometryEncoding::geometry_encoding encoding() const { return
encoding_; }
+ const std::string& metadata() const { return metadata_; }
+
+ private:
+ Geometry(std::string crs, LogicalType::GeometryEdges::edges edges,
+ LogicalType::GeometryEncoding::geometry_encoding encoding,
+ std::string metadata)
+ : LogicalType::Impl(LogicalType::Type::GEOMETRY, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY),
+ crs_(std::move(crs)),
+ edges_(edges),
+ encoding_(encoding),
+ metadata_(std::move(metadata)) {}
+
+ std::string crs_;
+ LogicalType::GeometryEdges::edges edges_;
+ LogicalType::GeometryEncoding::geometry_encoding encoding_;
+ std::string metadata_;
+};
+
+std::string LogicalType::Impl::Geometry::ToString() const {
+ std::stringstream type;
+ type << "Geometry(crs=" << crs_ << ", edges=" <<
geometry_edges_string(edges_)
+ << ", encoding=" << geometry_encoding_string(encoding_)
+ << ", metadata=" << metadata_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Geometry::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Geometry")";
+
+ if (crs_.size() > 0) {
+ // TODO(paleolimbot): we'll need to escape the crs or assume that it's
valid JSON
+ json << R"(, "crs": )" << crs_;
+ }
+
+ json << R"(, "edges": ")" << geometry_edges_string(edges_) << R"(")";
+ json << R"(, "encoding": ")" << geometry_encoding_string(encoding_) <<
R"(")";
+
+ if (metadata_.size() > 0) {
Review Comment:
```suggestion
if (!metadata_.empty()) {
```
##########
cpp/src/parquet/types.cc:
##########
@@ -1603,6 +1646,139 @@ class LogicalType::Impl::Float16 final : public
LogicalType::Impl::Incompatible,
GENERATE_MAKE(Float16)
+#define geometry_edges_string(u___) \
Review Comment:
Could we use functions in the anonymous namespace instead of macros for
`geometry_edges_string` and `geometry_encoding_string`? We don't have to follow
the existing style here.
##########
cpp/src/parquet/CMakeLists.txt:
##########
@@ -375,6 +375,7 @@ add_parquet_test(internals-test
statistics_test.cc
encoding_test.cc
metadata_test.cc
+ geometry_util_internal_test.cc
Review Comment:
It's a pity that these files are not sorted alphabetically.
##########
cpp/src/parquet/types.cc:
##########
@@ -1603,6 +1646,139 @@ class LogicalType::Impl::Float16 final : public
LogicalType::Impl::Incompatible,
GENERATE_MAKE(Float16)
+#define geometry_edges_string(u___) \
+ ((u___) == LogicalType::GeometryEdges::PLANAR \
+ ? "planar" \
+ : ((u___) == LogicalType::GeometryEdges::SPHERICAL ? "spherical" :
"unknown"))
+
+#define geometry_encoding_string(u___) \
+ ((u___) == LogicalType::GeometryEncoding::WKB ? "wkb" : "unknown")
+
+class LogicalType::Impl::Geometry final : public
LogicalType::Impl::Incompatible,
+ public
LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class GeometryLogicalType;
+
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ const std::string& crs() const { return crs_; }
+ LogicalType::GeometryEdges::edges edges() const { return edges_; }
+ LogicalType::GeometryEncoding::geometry_encoding encoding() const { return
encoding_; }
+ const std::string& metadata() const { return metadata_; }
+
+ private:
+ Geometry(std::string crs, LogicalType::GeometryEdges::edges edges,
+ LogicalType::GeometryEncoding::geometry_encoding encoding,
+ std::string metadata)
+ : LogicalType::Impl(LogicalType::Type::GEOMETRY, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY),
+ crs_(std::move(crs)),
+ edges_(edges),
+ encoding_(encoding),
+ metadata_(std::move(metadata)) {}
+
+ std::string crs_;
+ LogicalType::GeometryEdges::edges edges_;
+ LogicalType::GeometryEncoding::geometry_encoding encoding_;
+ std::string metadata_;
+};
+
+std::string LogicalType::Impl::Geometry::ToString() const {
+ std::stringstream type;
+ type << "Geometry(crs=" << crs_ << ", edges=" <<
geometry_edges_string(edges_)
+ << ", encoding=" << geometry_encoding_string(encoding_)
+ << ", metadata=" << metadata_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Geometry::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Geometry")";
+
+ if (crs_.size() > 0) {
+ // TODO(paleolimbot): we'll need to escape the crs or assume that it's
valid JSON
+ json << R"(, "crs": )" << crs_;
+ }
+
+ json << R"(, "edges": ")" << geometry_edges_string(edges_) << R"(")";
+ json << R"(, "encoding": ")" << geometry_encoding_string(encoding_) <<
R"(")";
+
+ if (metadata_.size() > 0) {
+ // TODO(paleolimbot): we'll need to escape the metadata or assume that
it's valid JSON
+ json << R"(, "metadata": )" << crs_;
+ }
+
+ json << "}";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Geometry::ToThrift() const {
+ format::LogicalType type;
+ format::GeometryType geometry_type;
+
+ // Canonially export crs of "" as an unset CRS
+ if (crs_.size() > 0) {
+ geometry_type.__set_crs(crs_);
+ }
+
+ DCHECK(edges_ != LogicalType::GeometryEdges::UNKNOWN);
+ if (edges_ == LogicalType::GeometryEdges::SPHERICAL) {
+ geometry_type.__set_edges(format::Edges::SPHERICAL);
+ } else {
+ geometry_type.__set_edges(format::Edges::PLANAR);
+ }
+
+ DCHECK_EQ(encoding_, LogicalType::GeometryEncoding::WKB);
+ geometry_type.__set_encoding(format::GeometryEncoding::WKB);
+
+ // Canonically export empty metadata as unset
+ if (metadata_.size() > 0) {
Review Comment:
```suggestion
if (!metadata_.empty()) {
```
##########
cpp/src/parquet/types.cc:
##########
@@ -1603,6 +1646,139 @@ class LogicalType::Impl::Float16 final : public
LogicalType::Impl::Incompatible,
GENERATE_MAKE(Float16)
+#define geometry_edges_string(u___) \
+ ((u___) == LogicalType::GeometryEdges::PLANAR \
+ ? "planar" \
+ : ((u___) == LogicalType::GeometryEdges::SPHERICAL ? "spherical" :
"unknown"))
+
+#define geometry_encoding_string(u___) \
+ ((u___) == LogicalType::GeometryEncoding::WKB ? "wkb" : "unknown")
+
+class LogicalType::Impl::Geometry final : public
LogicalType::Impl::Incompatible,
+ public
LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class GeometryLogicalType;
+
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ const std::string& crs() const { return crs_; }
+ LogicalType::GeometryEdges::edges edges() const { return edges_; }
+ LogicalType::GeometryEncoding::geometry_encoding encoding() const { return
encoding_; }
+ const std::string& metadata() const { return metadata_; }
+
+ private:
+ Geometry(std::string crs, LogicalType::GeometryEdges::edges edges,
+ LogicalType::GeometryEncoding::geometry_encoding encoding,
+ std::string metadata)
+ : LogicalType::Impl(LogicalType::Type::GEOMETRY, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY),
+ crs_(std::move(crs)),
+ edges_(edges),
+ encoding_(encoding),
+ metadata_(std::move(metadata)) {}
+
+ std::string crs_;
+ LogicalType::GeometryEdges::edges edges_;
+ LogicalType::GeometryEncoding::geometry_encoding encoding_;
+ std::string metadata_;
+};
+
+std::string LogicalType::Impl::Geometry::ToString() const {
+ std::stringstream type;
+ type << "Geometry(crs=" << crs_ << ", edges=" <<
geometry_edges_string(edges_)
+ << ", encoding=" << geometry_encoding_string(encoding_)
+ << ", metadata=" << metadata_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Geometry::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Geometry")";
+
+ if (crs_.size() > 0) {
+ // TODO(paleolimbot): we'll need to escape the crs or assume that it's
valid JSON
+ json << R"(, "crs": )" << crs_;
+ }
+
+ json << R"(, "edges": ")" << geometry_edges_string(edges_) << R"(")";
+ json << R"(, "encoding": ")" << geometry_encoding_string(encoding_) <<
R"(")";
+
+ if (metadata_.size() > 0) {
+ // TODO(paleolimbot): we'll need to escape the metadata or assume that
it's valid JSON
+ json << R"(, "metadata": )" << crs_;
+ }
+
+ json << "}";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Geometry::ToThrift() const {
+ format::LogicalType type;
+ format::GeometryType geometry_type;
+
+ // Canonially export crs of "" as an unset CRS
+ if (crs_.size() > 0) {
+ geometry_type.__set_crs(crs_);
+ }
+
+ DCHECK(edges_ != LogicalType::GeometryEdges::UNKNOWN);
Review Comment:
What about throwing a `ParquetException` instead?
##########
cpp/src/parquet/types.cc:
##########
@@ -1603,6 +1646,139 @@ class LogicalType::Impl::Float16 final : public
LogicalType::Impl::Incompatible,
GENERATE_MAKE(Float16)
+#define geometry_edges_string(u___) \
+ ((u___) == LogicalType::GeometryEdges::PLANAR \
+ ? "planar" \
+ : ((u___) == LogicalType::GeometryEdges::SPHERICAL ? "spherical" :
"unknown"))
+
+#define geometry_encoding_string(u___) \
+ ((u___) == LogicalType::GeometryEncoding::WKB ? "wkb" : "unknown")
+
+class LogicalType::Impl::Geometry final : public
LogicalType::Impl::Incompatible,
+ public
LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class GeometryLogicalType;
+
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ const std::string& crs() const { return crs_; }
+ LogicalType::GeometryEdges::edges edges() const { return edges_; }
+ LogicalType::GeometryEncoding::geometry_encoding encoding() const { return
encoding_; }
+ const std::string& metadata() const { return metadata_; }
+
+ private:
+ Geometry(std::string crs, LogicalType::GeometryEdges::edges edges,
+ LogicalType::GeometryEncoding::geometry_encoding encoding,
+ std::string metadata)
+ : LogicalType::Impl(LogicalType::Type::GEOMETRY, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY),
+ crs_(std::move(crs)),
+ edges_(edges),
+ encoding_(encoding),
+ metadata_(std::move(metadata)) {}
+
+ std::string crs_;
+ LogicalType::GeometryEdges::edges edges_;
+ LogicalType::GeometryEncoding::geometry_encoding encoding_;
+ std::string metadata_;
+};
+
+std::string LogicalType::Impl::Geometry::ToString() const {
+ std::stringstream type;
+ type << "Geometry(crs=" << crs_ << ", edges=" <<
geometry_edges_string(edges_)
+ << ", encoding=" << geometry_encoding_string(encoding_)
+ << ", metadata=" << metadata_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Geometry::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Geometry")";
+
+ if (crs_.size() > 0) {
+ // TODO(paleolimbot): we'll need to escape the crs or assume that it's
valid JSON
+ json << R"(, "crs": )" << crs_;
+ }
+
+ json << R"(, "edges": ")" << geometry_edges_string(edges_) << R"(")";
+ json << R"(, "encoding": ")" << geometry_encoding_string(encoding_) <<
R"(")";
+
+ if (metadata_.size() > 0) {
+ // TODO(paleolimbot): we'll need to escape the metadata or assume that
it's valid JSON
+ json << R"(, "metadata": )" << crs_;
+ }
+
+ json << "}";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Geometry::ToThrift() const {
+ format::LogicalType type;
+ format::GeometryType geometry_type;
+
+ // Canonially export crs of "" as an unset CRS
+ if (crs_.size() > 0) {
+ geometry_type.__set_crs(crs_);
+ }
+
+ DCHECK(edges_ != LogicalType::GeometryEdges::UNKNOWN);
+ if (edges_ == LogicalType::GeometryEdges::SPHERICAL) {
+ geometry_type.__set_edges(format::Edges::SPHERICAL);
+ } else {
+ geometry_type.__set_edges(format::Edges::PLANAR);
+ }
+
+ DCHECK_EQ(encoding_, LogicalType::GeometryEncoding::WKB);
+ geometry_type.__set_encoding(format::GeometryEncoding::WKB);
+
+ // Canonically export empty metadata as unset
+ if (metadata_.size() > 0) {
+ geometry_type.__set_metadata(metadata_);
+ }
+
+ type.__set_GEOMETRY(geometry_type);
+ return type;
+}
+
+bool LogicalType::Impl::Geometry::Equals(const LogicalType& other) const {
+ if (other.is_geometry()) {
+ const auto& other_geometry = checked_cast<const
GeometryLogicalType&>(other);
+ return crs() == other_geometry.crs() && edges() == other_geometry.edges()
&&
+ encoding() == other_geometry.encoding() &&
+ metadata() == other_geometry.metadata();
+ } else {
+ return false;
+ }
+}
+
+const std::string& GeometryLogicalType::crs() const {
+ return (dynamic_cast<const LogicalType::Impl::Geometry&>(*impl_)).crs();
Review Comment:
We can replace `dynamic_cast` with `checked_cast` here and below.
##########
cpp/src/parquet/test_util.h:
##########
@@ -830,5 +833,45 @@ inline void GenerateData<FLBA>(int num_values, FLBA* out,
std::vector<uint8_t>*
random_fixed_byte_array(num_values, 0, heap->data(),
kGenerateDataFLBALength, out);
}
+// ----------------------------------------------------------------------
+// Test utility functions for geometry
+
+#if defined(ARROW_LITTLE_ENDIAN)
+static constexpr int WKB_NATIVE_ENDIANNESS =
geometry::WKBBuffer::WKB_LITTLE_ENDIAN;
+#else
+static constexpr int WKB_NATIVE_ENDIANNESS =
geometry::WKBBuffer::WKB_BIG_ENDIAN;
+#endif
+
+static constexpr int WKB_POINT_SIZE = 21; // 1:endianness + 4:type + 8:x + 8:y
Review Comment:
```suggestion
#if defined(ARROW_LITTLE_ENDIAN)
static constexpr int kWkbNativeEndianness =
geometry::WKBBuffer::kWkbLittleEndian;
#else
static constexpr int kWkbNativeEndianness =
geometry::WKBBuffer::kWkbBigEndian;
#endif
static constexpr int kWkbPointSize = 21; // 1:endianness + 4:type + 8:x +
8:y
```
The naming convention for constants is camel case with a `k` prefix.
##########
cpp/src/parquet/geometry_util_internal.h:
##########
@@ -0,0 +1,691 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <string>
+#include <unordered_set>
+
+#include "arrow/util/endian.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+#include "parquet/exception.h"
+
+namespace parquet::geometry {
+
+constexpr double kInf = std::numeric_limits<double>::infinity();
+
+struct Dimensions {
+ enum dimensions { XY = 0, XYZ = 1, XYM = 2, XYZM = 3 };
Review Comment:
Should we use `enum class` here and below?
##########
cpp/src/parquet/geometry_util_internal.h:
##########
@@ -0,0 +1,691 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <string>
+#include <unordered_set>
+
+#include "arrow/util/endian.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+#include "parquet/exception.h"
+
+namespace parquet::geometry {
+
+constexpr double kInf = std::numeric_limits<double>::infinity();
+
+struct Dimensions {
+ enum dimensions { XY = 0, XYZ = 1, XYM = 2, XYZM = 3 };
+
+ static dimensions FromWKB(uint32_t wkb_geometry_type) {
+ switch (wkb_geometry_type / 1000) {
+ case 0:
+ return XY;
+ case 1:
+ return XYZ;
+ case 2:
+ return XYM;
+ case 3:
+ return XYZM;
+ default:
+ throw ParquetException("Invalid wkb_geometry_type: ",
wkb_geometry_type);
+ }
+ }
+
+ template <dimensions dims>
+ constexpr static uint32_t size();
+
+ static uint32_t size(dimensions dims);
+
+ // Where to look in a coordinate with this dimension
+ // for the X, Y, Z, and M dimensions, respectively.
+ static std::array<int, 4> ToXYZM(dimensions dims) {
+ switch (dims) {
+ case XY:
+ return {0, 1, -1, -1};
+ case XYZ:
+ return {0, 1, 2, -1};
+ case XYM:
+ return {0, 1, -1, 2};
+ case XYZM:
+ return {0, 1, 2, 3};
+ default:
+ return {-1, -1, -1, -1};
+ }
+ }
+
+ static std::string ToString(dimensions dims) {
+ switch (dims) {
+ case XY:
+ return "XY";
+ case XYZ:
+ return "XYZ";
+ case XYM:
+ return "XYM";
+ case XYZM:
+ return "XYZM";
+ default:
+ return "";
Review Comment:
Ditto: throw `ParquetException` here instead of returning an empty string?
##########
cpp/src/parquet/geometry_util_internal.h:
##########
@@ -0,0 +1,691 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <string>
+#include <unordered_set>
+
+#include "arrow/util/endian.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+#include "parquet/exception.h"
+
+namespace parquet::geometry {
+
+constexpr double kInf = std::numeric_limits<double>::infinity();
+
+struct Dimensions {
+ enum dimensions { XY = 0, XYZ = 1, XYM = 2, XYZM = 3 };
+
+ static dimensions FromWKB(uint32_t wkb_geometry_type) {
+ switch (wkb_geometry_type / 1000) {
+ case 0:
+ return XY;
+ case 1:
+ return XYZ;
+ case 2:
+ return XYM;
+ case 3:
+ return XYZM;
+ default:
+ throw ParquetException("Invalid wkb_geometry_type: ",
wkb_geometry_type);
+ }
+ }
+
+ template <dimensions dims>
+ constexpr static uint32_t size();
+
+ static uint32_t size(dimensions dims);
+
+ // Where to look in a coordinate with this dimension
+ // for the X, Y, Z, and M dimensions, respectively.
+ static std::array<int, 4> ToXYZM(dimensions dims) {
+ switch (dims) {
+ case XY:
+ return {0, 1, -1, -1};
+ case XYZ:
+ return {0, 1, 2, -1};
+ case XYM:
+ return {0, 1, -1, 2};
+ case XYZM:
+ return {0, 1, 2, 3};
+ default:
+ return {-1, -1, -1, -1};
Review Comment:
Throw ParquetException instead?
##########
cpp/src/parquet/geometry_util_internal.h:
##########
@@ -0,0 +1,691 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <string>
+#include <unordered_set>
+
+#include "arrow/util/endian.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+#include "parquet/exception.h"
+
+namespace parquet::geometry {
+
+constexpr double kInf = std::numeric_limits<double>::infinity();
+
+struct Dimensions {
+ enum dimensions { XY = 0, XYZ = 1, XYM = 2, XYZM = 3 };
+
+ static dimensions FromWKB(uint32_t wkb_geometry_type) {
+ switch (wkb_geometry_type / 1000) {
+ case 0:
+ return XY;
+ case 1:
+ return XYZ;
+ case 2:
+ return XYM;
+ case 3:
+ return XYZM;
+ default:
+ throw ParquetException("Invalid wkb_geometry_type: ",
wkb_geometry_type);
+ }
+ }
+
+ template <dimensions dims>
+ constexpr static uint32_t size();
+
+ static uint32_t size(dimensions dims);
+
+ // Where to look in a coordinate with this dimension
+ // for the X, Y, Z, and M dimensions, respectively.
+ static std::array<int, 4> ToXYZM(dimensions dims) {
+ switch (dims) {
+ case XY:
+ return {0, 1, -1, -1};
+ case XYZ:
+ return {0, 1, 2, -1};
+ case XYM:
+ return {0, 1, -1, 2};
+ case XYZM:
+ return {0, 1, 2, 3};
+ default:
+ return {-1, -1, -1, -1};
+ }
+ }
+
+ static std::string ToString(dimensions dims) {
+ switch (dims) {
+ case XY:
+ return "XY";
+ case XYZ:
+ return "XYZ";
+ case XYM:
+ return "XYM";
+ case XYZM:
+ return "XYZM";
+ default:
+ return "";
+ }
+ }
+};
+
+template <>
+constexpr uint32_t Dimensions::size<Dimensions::XY>() {
+ return 2;
+}
+
+template <>
+constexpr uint32_t Dimensions::size<Dimensions::XYZ>() {
+ return 3;
+}
+
+template <>
+constexpr uint32_t Dimensions::size<Dimensions::XYM>() {
+ return 3;
+}
+
+template <>
+constexpr uint32_t Dimensions::size<Dimensions::XYZM>() {
+ return 4;
+}
+
+inline uint32_t Dimensions::size(dimensions dims) {
+ switch (dims) {
+ case XY:
+ return size<XY>();
+ case XYZ:
+ return size<XYZ>();
+ case XYM:
+ return size<XYM>();
+ case XYZM:
+ return size<XYZM>();
+ default:
+ return 0;
Review Comment:
Throw `ParquetException` here instead of returning 0?
##########
cpp/src/parquet/page_index.h:
##########
@@ -76,6 +78,13 @@ class PARQUET_EXPORT ColumnIndex {
/// \brief A vector of page indices for non-null pages.
virtual const std::vector<int32_t>& non_null_page_indices() const = 0;
+
+ /// \brief A vector of encoded geometry statistics for each data page in
this column.
+ ///
+ /// `null_pages` should be inspected first, as only pages with non-null
values
+ /// may have their upper bounds populated.
+ virtual const std::vector<EncodedGeometryStatistics>&
encoded_geometry_statistics()
Review Comment:
Same as comment above, I think we can remove this function.
##########
cpp/src/parquet/reader_test.cc:
##########
@@ -1812,4 +1817,235 @@ TEST(PageIndexReaderTest, ReadFileWithoutPageIndex) {
ASSERT_EQ(nullptr, row_group_index_reader);
}
+class TestGeometryLogicalType : public ::testing::Test {
+ public:
+ const int NUM_ROWS = 1000;
Review Comment:
```suggestion
constexpr int kNumRows = 1000;
```
##########
cpp/src/parquet/statistics.cc:
##########
@@ -47,6 +48,305 @@ using arrow::util::SafeCopy;
using arrow::util::SafeLoad;
namespace parquet {
+
+class GeometryStatisticsImpl {
+ public:
+ GeometryStatisticsImpl() = default;
+ GeometryStatisticsImpl(const GeometryStatisticsImpl&) = default;
+
+ bool Equals(const GeometryStatisticsImpl& other) const {
+ if (is_valid_ != other.is_valid_) {
+ return false;
+ }
+
+ if (!is_valid_ && !other.is_valid_) {
+ return true;
+ }
+
+ auto geometry_types = bounder_.GeometryTypes();
+ auto other_geometry_types = other.bounder_.GeometryTypes();
+ if (geometry_types.size() != other_geometry_types.size()) {
+ return false;
+ }
+
+ for (size_t i = 0; i < geometry_types.size(); i++) {
+ if (geometry_types[i] != other_geometry_types[i]) {
+ return false;
+ }
+ }
+
+ return bounder_.Bounds() == other.bounder_.Bounds();
+ }
+
+ void Merge(const GeometryStatisticsImpl& other) {
+ if (!is_valid_ || !other.is_valid_) {
+ is_valid_ = false;
+ return;
+ }
+
+ bounder_.ReadBox(other.bounder_.Bounds());
+ bounder_.ReadGeometryTypes(other.bounder_.GeometryTypes());
+ }
+
+ void Update(const ByteArray* values, int64_t num_values, int64_t null_count)
{
+ if (!is_valid_) {
+ return;
+ }
+
+ geometry::WKBBuffer buf;
+ try {
+ for (int64_t i = 0; i < num_values; i++) {
+ const ByteArray& item = values[i];
+ buf.Init(item.ptr, item.len);
+ bounder_.ReadGeometry(&buf);
+ }
+
+ bounder_.Flush();
+ } catch (ParquetException&) {
+ is_valid_ = false;
+ }
+ }
+
+ void UpdateSpaced(const ByteArray* values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, int64_t num_spaced_values,
+ int64_t num_values, int64_t null_count) {
+ DCHECK_GT(num_spaced_values, 0);
+
+ geometry::WKBBuffer buf;
+ try {
+ ::arrow::internal::VisitSetBitRunsVoid(
+ valid_bits, valid_bits_offset, num_spaced_values,
+ [&](int64_t position, int64_t length) {
+ for (int64_t i = 0; i < length; i++) {
+ ByteArray item = SafeLoad(values + i + position);
+ buf.Init(item.ptr, item.len);
+ bounder_.ReadGeometry(&buf);
+ }
+ });
+ bounder_.Flush();
+ } catch (ParquetException&) {
+ is_valid_ = false;
+ }
+ }
+
+ void Update(const ::arrow::Array& values, bool update_counts) {
+ ARROW_UNUSED(update_counts);
+
+ const auto& binary_array = static_cast<const
::arrow::BinaryArray&>(values);
+ geometry::WKBBuffer buf;
+ try {
+ for (int64_t i = 0; i < binary_array.length(); ++i) {
+ if (!binary_array.IsNull(i)) {
+ std::string_view byte_array = binary_array.GetView(i);
+ buf.Init(reinterpret_cast<const uint8_t*>(byte_array.data()),
+ byte_array.length());
+ bounder_.ReadGeometry(&buf);
+ bounder_.Flush();
+ }
+ }
+ } catch (ParquetException&) {
+ is_valid_ = false;
+ }
+ }
+
+ void Reset() {
+ bounder_.Reset();
+ coverings_.clear();
+ is_valid_ = true;
+ }
+
+ EncodedGeometryStatistics Encode() const {
+ const double* mins = bounder_.Bounds().min;
+ const double* maxes = bounder_.Bounds().max;
+
+ EncodedGeometryStatistics out;
+ out.geometry_types = bounder_.GeometryTypes();
+
+ out.xmin = mins[0];
+ out.xmax = maxes[0];
+ out.ymin = mins[1];
+ out.ymax = maxes[1];
+ out.zmin = mins[2];
+ out.zmax = maxes[2];
+ out.mmin = mins[3];
+ out.mmax = maxes[3];
+
+ if (coverings_.empty()) {
+ // Generate coverings from bounding box if coverings is not present
Review Comment:
When will `coverings_` be empty? Is that the default behavior? I'm not sure
whether we need to check that the edges are planar, since the bbox is not
accurate for spherical edges. BTW, if we don't have a good implementation for
coverings, I think we can just ignore them for now.
##########
cpp/src/parquet/statistics.h:
##########
@@ -114,19 +115,79 @@ std::shared_ptr<TypedComparator<DType>>
MakeComparator(const ColumnDescriptor* d
return
std::static_pointer_cast<TypedComparator<DType>>(Comparator::Make(descr));
}
+class PARQUET_EXPORT EncodedGeometryStatistics {
+ public:
+ static constexpr double kInf = std::numeric_limits<double>::infinity();
+
+ EncodedGeometryStatistics() = default;
+ EncodedGeometryStatistics(const EncodedGeometryStatistics&) = default;
+ EncodedGeometryStatistics(EncodedGeometryStatistics&&) = default;
+ EncodedGeometryStatistics& operator=(const EncodedGeometryStatistics&) =
default;
+
+ double xmin{kInf};
+ double xmax{-kInf};
+ double ymin{kInf};
+ double ymax{-kInf};
+ double zmin{kInf};
+ double zmax{-kInf};
+ double mmin{kInf};
+ double mmax{-kInf};
+ std::vector<std::pair<std::string, std::string>> coverings;
+ std::vector<uint32_t> geometry_types;
+
+ bool has_z() const { return (zmax - zmin) > 0; }
+
+ bool has_m() const { return (mmax - mmin) > 0; }
+};
+
+class GeometryStatisticsImpl;
+
+class PARQUET_EXPORT GeometryStatistics {
Review Comment:
Thanks for adding these!
##########
cpp/src/parquet/geometry_util_internal.h:
##########
@@ -0,0 +1,691 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <string>
+#include <unordered_set>
+
+#include "arrow/util/endian.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+#include "parquet/exception.h"
+
+namespace parquet::geometry {
+
+constexpr double kInf = std::numeric_limits<double>::infinity();
+
+struct Dimensions {
+ enum dimensions { XY = 0, XYZ = 1, XYM = 2, XYZM = 3 };
+
+ static dimensions FromWKB(uint32_t wkb_geometry_type) {
+ switch (wkb_geometry_type / 1000) {
+ case 0:
+ return XY;
+ case 1:
+ return XYZ;
+ case 2:
+ return XYM;
+ case 3:
+ return XYZM;
+ default:
+ throw ParquetException("Invalid wkb_geometry_type: ",
wkb_geometry_type);
+ }
+ }
+
+ template <dimensions dims>
+ constexpr static uint32_t size();
+
+ static uint32_t size(dimensions dims);
+
+ // Where to look in a coordinate with this dimension
+ // for the X, Y, Z, and M dimensions, respectively.
+ static std::array<int, 4> ToXYZM(dimensions dims) {
+ switch (dims) {
+ case XY:
+ return {0, 1, -1, -1};
+ case XYZ:
+ return {0, 1, 2, -1};
+ case XYM:
+ return {0, 1, -1, 2};
+ case XYZM:
+ return {0, 1, 2, 3};
+ default:
+ return {-1, -1, -1, -1};
+ }
+ }
+
+ static std::string ToString(dimensions dims) {
+ switch (dims) {
+ case XY:
+ return "XY";
+ case XYZ:
+ return "XYZ";
+ case XYM:
+ return "XYM";
+ case XYZM:
+ return "XYZM";
+ default:
+ return "";
+ }
+ }
+};
+
+template <>
+constexpr uint32_t Dimensions::size<Dimensions::XY>() {
+ return 2;
+}
+
+template <>
+constexpr uint32_t Dimensions::size<Dimensions::XYZ>() {
+ return 3;
+}
+
+template <>
+constexpr uint32_t Dimensions::size<Dimensions::XYM>() {
+ return 3;
+}
+
+template <>
+constexpr uint32_t Dimensions::size<Dimensions::XYZM>() {
+ return 4;
+}
+
+inline uint32_t Dimensions::size(dimensions dims) {
+ switch (dims) {
+ case XY:
+ return size<XY>();
+ case XYZ:
+ return size<XYZ>();
+ case XYM:
+ return size<XYM>();
+ case XYZM:
+ return size<XYZM>();
+ default:
+ return 0;
+ }
+}
+
+struct GeometryType {
+ enum geometry_type {
+ POINT = 1,
+ LINESTRING = 2,
+ POLYGON = 3,
+ MULTIPOINT = 4,
+ MULTILINESTRING = 5,
+ MULTIPOLYGON = 6,
+ GEOMETRYCOLLECTION = 7
+ };
+
+ static geometry_type FromWKB(uint32_t wkb_geometry_type) {
+ switch (wkb_geometry_type % 1000) {
+ case 1:
+ return POINT;
+ case 2:
+ return LINESTRING;
+ case 3:
+ return POLYGON;
+ case 4:
+ return MULTIPOINT;
+ case 5:
+ return MULTILINESTRING;
+ case 6:
+ return MULTIPOLYGON;
+ case 7:
+ return GEOMETRYCOLLECTION;
+ default:
+ throw ParquetException("Invalid wkb_geometry_type: ",
wkb_geometry_type);
+ }
+ }
+
+ static uint32_t ToWKB(geometry_type geometry_type, bool has_z, bool has_m) {
+ uint32_t wkb_geom_type = 0;
+ switch (geometry_type) {
+ case POINT:
+ wkb_geom_type = 1;
+ break;
+ case LINESTRING:
+ wkb_geom_type = 2;
+ break;
+ case POLYGON:
+ wkb_geom_type = 3;
+ break;
+ case MULTIPOINT:
+ wkb_geom_type = 4;
+ break;
+ case MULTILINESTRING:
+ wkb_geom_type = 5;
+ break;
+ case MULTIPOLYGON:
+ wkb_geom_type = 6;
+ break;
+ case GEOMETRYCOLLECTION:
+ wkb_geom_type = 7;
+ break;
+ default:
+ throw ParquetException("Invalid geometry_type: ", geometry_type);
+ }
+ if (has_z) {
+ wkb_geom_type += 1000;
+ }
+ if (has_m) {
+ wkb_geom_type += 2000;
+ }
+ return wkb_geom_type;
+ }
+
+ static std::string ToString(geometry_type geometry_type) {
+ switch (geometry_type) {
+ case POINT:
+ return "POINT";
+ case LINESTRING:
+ return "LINESTRING";
+ case POLYGON:
+ return "POLYGON";
+ case MULTIPOINT:
+ return "MULTIPOINT";
+ case MULTILINESTRING:
+ return "MULTILINESTRING";
+ case MULTIPOLYGON:
+ return "MULTIPOLYGON";
+ case GEOMETRYCOLLECTION:
+ return "GEOMETRYCOLLECTION";
+ default:
+ return "";
+ }
+ }
+};
+
+class WKBBuffer {
+ public:
+ enum Endianness { WKB_BIG_ENDIAN = 0, WKB_LITTLE_ENDIAN = 1 };
+
+ WKBBuffer() : data_(NULLPTR), size_(0) {}
+ WKBBuffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {}
+
+ void Init(const uint8_t* data, int64_t size) {
+ data_ = data;
+ size_ = size;
+ }
+
+ uint8_t ReadUInt8() {
+ if (size_ < 1) {
+ throw ParquetException("Can't read 1 byte from empty WKBBuffer");
+ }
+
+ size_ -= 1;
+ return *data_++;
+ }
+
+ uint32_t ReadUInt32(bool swap) {
+ if (ARROW_PREDICT_FALSE(swap)) {
+ return ReadUInt32<true>();
+ } else {
+ return ReadUInt32<false>();
+ }
+ }
+
+ template <bool swap>
+ uint32_t ReadUInt32() {
+ if (size_ < sizeof(uint32_t)) {
+ throw ParquetException("Can't read 4 bytes from WKBBuffer with ", size_,
+ "remaining");
+ }
+
+ uint32_t value;
+ memcpy(&value, data_, sizeof(uint32_t));
+ data_ += sizeof(uint32_t);
+ size_ -= sizeof(uint32_t);
+
+ if constexpr (swap) {
+ value = ::arrow::bit_util::ByteSwap(value);
+ }
+
+ return value;
+ }
+
+ template <bool swap>
+ void ReadDoubles(uint32_t n, double* out) {
+ if (n == 0) {
+ return;
+ }
+
+ size_t total_bytes = n * sizeof(double);
+ if (size_ < total_bytes) {
+ throw ParquetException("Can't read ", total_bytes, " bytes from
WKBBuffer with ",
+ size_, "remaining");
+ }
+
+ memcpy(out, data_, total_bytes);
+ data_ += total_bytes;
+ size_ -= total_bytes;
+
+ if constexpr (swap) {
+ for (uint32_t i = 0; i < n; i++) {
+ out[i] = ::arrow::bit_util::ByteSwap(out[i]);
+ }
+ }
+ }
+
+ size_t size() { return size_; }
+
+ private:
+ const uint8_t* data_;
+ size_t size_;
+};
+
+struct BoundingBox {
+ BoundingBox(Dimensions::dimensions dimensions, const std::array<double, 4>&
mins,
+ const std::array<double, 4>& maxes)
+ : dimensions(dimensions) {
+ std::memcpy(min, mins.data(), sizeof(min));
+ std::memcpy(max, maxes.data(), sizeof(max));
+ }
+ explicit BoundingBox(Dimensions::dimensions dimensions = Dimensions::XYZM)
+ : dimensions(dimensions),
+ min{kInf, kInf, kInf, kInf},
+ max{-kInf, -kInf, -kInf, -kInf} {}
+
+ BoundingBox(const BoundingBox& other) = default;
+ BoundingBox& operator=(const BoundingBox&) = default;
+
+ void Reset() {
+ for (int i = 0; i < 4; i++) {
+ min[i] = kInf;
+ max[i] = -kInf;
+ }
+ }
+
+ void Merge(const BoundingBox& other) {
+ if (ARROW_PREDICT_TRUE(dimensions == other.dimensions)) {
+ for (int i = 0; i < 4; i++) {
+ min[i] = std::min(min[i], other.min[i]);
+ max[i] = std::max(max[i], other.max[i]);
+ }
+
+ return;
+ } else if (dimensions == Dimensions::XYZM) {
+ Merge(other.ToXYZM());
Review Comment:
I do not quite understand when we will enter this branch. At the initial
state?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]