This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new ca9bb59b4 ORC-1920: [C++] Support `Geometry` and `Geography` types
ca9bb59b4 is described below
commit ca9bb59b4bfd15da89e6270d077d2464495db0d9
Author: ffacs <[email protected]>
AuthorDate: Sun Jul 13 17:31:32 2025 -0700
ORC-1920: [C++] Support `Geometry` and `Geography` types
What changes were proposed in this pull request?
Support Geometry and Geography types for c++ side
Why are the changes needed?
Add support for Geometry and Geography types
How was this patch tested?
UT passed
Was this patch authored or co-authored using generative AI tooling?
No
Closes #2269 from ffacs/geo.
Authored-by: ffacs <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
c++/include/orc/Geospatial.hh | 196 ++++++++++++++++++++++
c++/include/orc/Statistics.hh | 30 +++-
c++/include/orc/Type.hh | 24 ++-
c++/include/orc/meson.build | 1 +
c++/src/CMakeLists.txt | 1 +
c++/src/ColumnPrinter.cc | 2 +
c++/src/ColumnReader.cc | 2 +
c++/src/ColumnWriter.cc | 65 ++++++++
c++/src/Geospatial.cc | 307 +++++++++++++++++++++++++++++++++++
c++/src/Geospatial.hh | 86 ++++++++++
c++/src/Reader.cc | 2 +
c++/src/SchemaEvolution.cc | 7 +-
c++/src/Statistics.cc | 47 ++++++
c++/src/Statistics.hh | 122 ++++++++++++++
c++/src/TypeImpl.cc | 139 ++++++++++++++++
c++/src/TypeImpl.hh | 26 +++
c++/src/Writer.cc | 35 ++++
c++/src/meson.build | 1 +
c++/test/CMakeLists.txt | 2 +
c++/test/TestColumnStatistics.cc | 342 +++++++++++++++++++++++++++++++++++++++
c++/test/TestStatistics.cc | 230 ++++++++++++++++++++++++++
c++/test/TestUtil.cc | 65 ++++++++
c++/test/TestUtil.hh | 44 +++++
c++/test/TestWriter.cc | 137 +++++++++++++++-
c++/test/meson.build | 2 +
tools/src/CSVFileImport.cc | 2 +
26 files changed, 1912 insertions(+), 5 deletions(-)
diff --git a/c++/include/orc/Geospatial.hh b/c++/include/orc/Geospatial.hh
new file mode 100644
index 000000000..d3b9e2828
--- /dev/null
+++ b/c++/include/orc/Geospatial.hh
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This file contains code adapted from the Apache Arrow project.
+ *
+ * Original source:
+ *
+ * https://github.com/apache/arrow/blob/main/cpp/src/parquet/geospatial/statistics.h
+ *
+ * The original code is licensed under the Apache License, Version 2.0.
+ *
+ * Modifications may have been made from the original source.
+ */
+
+#ifndef ORC_GEOSPATIAL_HH
+#define ORC_GEOSPATIAL_HH
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <limits>
+#include <ostream>
+#include <string>
+
+namespace orc::geospatial {
+
+ // Positive infinity. An empty bound is represented as min = INF, max = -INF.
+ constexpr double INF = std::numeric_limits<double>::infinity();
+ // The maximum number of dimensions supported (X, Y, Z, M)
+ inline constexpr int MAX_DIMENSIONS = 4;
+
+ // Supported combinations of geometry dimensions.
+ // Each value equals the WKB geometry type code divided by 1000
+ // (e.g. LINESTRING ZM is 3002, giving XYZM = 3).
+ enum class Dimensions {
+ XY = 0, // X and Y only
+ XYZ = 1, // X, Y, and Z
+ XYM = 2, // X, Y, and M
+ XYZM = 3, // X, Y, Z, and M
+ VALUE_MIN = 0, // smallest valid value, used for range checks
+ VALUE_MAX = 3 // largest valid value, used for range checks
+ };
+
+ // Supported geometry types according to ISO WKB.
+ // Each value equals the WKB geometry type code modulo 1000.
+ enum class GeometryType {
+ POINT = 1,
+ LINESTRING = 2,
+ POLYGON = 3,
+ MULTIPOINT = 4,
+ MULTILINESTRING = 5,
+ MULTIPOLYGON = 6,
+ GEOMETRYCOLLECTION = 7,
+ VALUE_MIN = 1, // smallest valid value, used for range checks
+ VALUE_MAX = 7 // largest valid value, used for range checks
+ };
+
+ // Axis-aligned bounding box of a set of coordinates, tracked separately for
+ // up to four dimensions stored in the order X, Y, Z, M.
+ //
+ // A dimension starts out as min = +INF / max = -INF (reported as empty by
+ // boundEmpty). A dimension whose min or max is NaN is invalid; updates leave
+ // an invalid dimension untouched and merging keeps it invalid.
+ struct BoundingBox {
+   using XY = std::array<double, 2>;
+   using XYZ = std::array<double, 3>;
+   using XYM = std::array<double, 3>;
+   using XYZM = std::array<double, 4>;
+
+   // Builds an empty box.
+   BoundingBox() : min{INF, INF, INF, INF}, max{-INF, -INF, -INF, -INF} {}
+   // Builds a box from explicit lower and upper bounds.
+   BoundingBox(const XYZM& mins, const XYZM& maxes) : min(mins), max(maxes) {}
+   BoundingBox(const BoundingBox& other) = default;
+   BoundingBox& operator=(const BoundingBox&) = default;
+
+   // Grow the box so that it contains an (x, y) coordinate.
+   void updateXY(const XY& coord) {
+     updateInternal(coord);
+   }
+   // Grow the box so that it contains an (x, y, z) coordinate.
+   void updateXYZ(const XYZ& coord) {
+     updateInternal(coord);
+   }
+   // Grow the box so that it contains an (x, y, m) coordinate.
+   void updateXYM(const XYM& coord) {
+     // M is the third input value but is stored in slot 3 of the box.
+     constexpr std::array<int, 3> kTargetDims = {0, 1, 3};
+     for (size_t src = 0; src < kTargetDims.size(); ++src) {
+       const auto dst = kTargetDims[src];
+       if (boundValid(dst)) {
+         min[dst] = std::min(min[dst], coord[src]);
+         max[dst] = std::max(max[dst], coord[src]);
+       }
+     }
+   }
+   // Grow the box so that it contains an (x, y, z, m) coordinate.
+   void updateXYZM(const XYZM& coord) {
+     updateInternal(coord);
+   }
+
+   // Put every dimension back into the initial empty state.
+   void reset() {
+     min.fill(INF);
+     max.fill(-INF);
+   }
+
+   // Mark every dimension invalid by storing NaN in both bounds.
+   void invalidate() {
+     min.fill(std::numeric_limits<double>::quiet_NaN());
+     max.fill(std::numeric_limits<double>::quiet_NaN());
+   }
+
+   // True when min - max is infinite for the dimension, as it is initially.
+   bool boundEmpty(int dim) const {
+     const double span = min[dim] - max[dim];
+     return std::isinf(span);
+   }
+
+   // True when neither bound of the dimension is NaN.
+   bool boundValid(int dim) const {
+     return !(std::isnan(min[dim]) || std::isnan(max[dim]));
+   }
+
+   // Lower bounds, one per dimension.
+   const XYZM& lowerBound() const {
+     return min;
+   }
+   // Upper bounds, one per dimension.
+   const XYZM& upperBound() const {
+     return max;
+   }
+
+   // boundValid() evaluated for every dimension.
+   std::array<bool, MAX_DIMENSIONS> dimensionValid() const {
+     std::array<bool, MAX_DIMENSIONS> flags{};
+     for (int dim = 0; dim < MAX_DIMENSIONS; ++dim) {
+       flags[dim] = boundValid(dim);
+     }
+     return flags;
+   }
+   // boundEmpty() evaluated for every dimension.
+   std::array<bool, MAX_DIMENSIONS> dimensionEmpty() const {
+     std::array<bool, MAX_DIMENSIONS> flags{};
+     for (int dim = 0; dim < MAX_DIMENSIONS; ++dim) {
+       flags[dim] = boundEmpty(dim);
+     }
+     return flags;
+   }
+
+   // Widen this box to also cover `other`. A dimension that is invalid in
+   // either box becomes invalid in the result.
+   void merge(const BoundingBox& other) {
+     for (int dim = 0; dim < MAX_DIMENSIONS; ++dim) {
+       if (boundValid(dim) && other.boundValid(dim)) {
+         min[dim] = std::min(min[dim], other.min[dim]);
+         max[dim] = std::max(max[dim], other.max[dim]);
+       } else {
+         min[dim] = std::numeric_limits<double>::quiet_NaN();
+         max[dim] = std::numeric_limits<double>::quiet_NaN();
+       }
+     }
+   }
+
+   // Human-readable rendering of all eight bounds.
+   std::string toString() const;
+
+   XYZM min;  // Minimum values for each dimension
+   XYZM max;  // Maximum values for each dimension
+
+  private:
+   // Shared update for XY, XYZ and XYZM coordinates: input slot i maps to
+   // box slot i.
+   template <typename Coord>
+   void updateInternal(const Coord& coord) {
+     for (size_t i = 0; i < coord.size(); ++i) {
+       if (std::isnan(min[i]) || std::isnan(max[i])) {
+         continue;  // invalid dimensions are never updated
+       }
+       min[i] = std::min(min[i], coord[i]);
+       max[i] = std::max(max[i], coord[i]);
+     }
+   }
+ };
+
+ // Two boxes are equal when all lower and all upper bounds compare equal.
+ inline bool operator==(const BoundingBox& lhs, const BoundingBox& rhs) {
+   const bool sameLower = lhs.min == rhs.min;
+   return sameLower && lhs.max == rhs.max;
+ }
+ // Negation of operator==.
+ inline bool operator!=(const BoundingBox& lhs, const BoundingBox& rhs) {
+   return lhs.min != rhs.min || lhs.max != rhs.max;
+ }
+ // Streams the toString() rendering of the box.
+ inline std::ostream& operator<<(std::ostream& os, const BoundingBox& obj) {
+   return os << obj.toString();
+ }
+
+} // namespace orc::geospatial
+
+#endif // ORC_GEOSPATIAL_HH
diff --git a/c++/include/orc/Statistics.hh b/c++/include/orc/Statistics.hh
index 4ba8c35f7..58169abe5 100644
--- a/c++/include/orc/Statistics.hh
+++ b/c++/include/orc/Statistics.hh
@@ -19,12 +19,11 @@
#ifndef ORC_STATISTICS_HH
#define ORC_STATISTICS_HH
+#include "orc/Geospatial.hh"
#include "orc/Type.hh"
#include "orc/Vector.hh"
#include "orc/orc-config.hh"
-#include <sstream>
-
namespace orc {
/**
@@ -367,6 +366,33 @@ namespace orc {
virtual int32_t getMaximumNanos() const = 0;
};
+ /**
+ * Statistics for Geometry and Geography columns: a bounding box plus the
+ * set of geometry types seen.
+ */
+ class GeospatialColumnStatistics : public ColumnStatistics {
+ public:
+ virtual ~GeospatialColumnStatistics();
+
+ /**
+ * Get the bounding box of the values in the column.
+ * @return bounding box
+ */
+ virtual const geospatial::BoundingBox& getBoundingBox() const = 0;
+
+ /**
+ * Get the geometry types present in the column.
+ * @return a sorted vector of geometry type IDs whose elements are unique
+ */
+ virtual std::vector<int32_t> getGeospatialTypes() const = 0;
+
+ /**
+ * Update the statistics with a new value.
+ * @param value pointer to the bytes of the new value
+ * @param length length of the value in bytes
+ */
+ virtual void update(const char* value, size_t length) = 0;
+ };
+
class Statistics {
public:
virtual ~Statistics();
diff --git a/c++/include/orc/Type.hh b/c++/include/orc/Type.hh
index 82e0e3cc8..4bb794ff3 100644
--- a/c++/include/orc/Type.hh
+++ b/c++/include/orc/Type.hh
@@ -25,6 +25,18 @@
namespace orc {
+ namespace geospatial {
+ // Edge interpolation algorithm attached to a GEOGRAPHY type.
+ // SPHERICAL is the default used by createGeographyType.
+ enum EdgeInterpolationAlgorithm {
+ SPHERICAL = 0,
+ VINCENTY = 1,
+ THOMAS = 2,
+ ANDOYER = 3,
+ KARNEY = 4
+ };
+ // Conversions between an algorithm and its string form.
+ // NOTE(review): definitions are not visible here; the exact spelling of the
+ // strings and the handling of unknown names should be confirmed there.
+ std::string AlgoToString(EdgeInterpolationAlgorithm algo);
+ EdgeInterpolationAlgorithm AlgoFromString(const std::string& algo);
+ } // namespace geospatial
+
enum TypeKind {
BOOLEAN = 0,
BYTE = 1,
@@ -44,7 +56,9 @@ namespace orc {
DATE = 15,
VARCHAR = 16,
CHAR = 17,
- TIMESTAMP_INSTANT = 18
+ TIMESTAMP_INSTANT = 18,
+ GEOMETRY = 19,
+ GEOGRAPHY = 20
};
class Type {
@@ -59,6 +73,10 @@ namespace orc {
virtual uint64_t getMaximumLength() const = 0;
virtual uint64_t getPrecision() const = 0;
virtual uint64_t getScale() const = 0;
+ // for geospatial types only
+ virtual const std::string& getCrs() const = 0;
+ // for geography type only
+ virtual geospatial::EdgeInterpolationAlgorithm getAlgorithm() const = 0;
virtual Type& setAttribute(const std::string& key, const std::string&
value) = 0;
virtual bool hasAttributeKey(const std::string& key) const = 0;
virtual Type& removeAttribute(const std::string& key) = 0;
@@ -115,6 +133,10 @@ namespace orc {
std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements);
std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key,
std::unique_ptr<Type> value);
std::unique_ptr<Type> createUnionType();
+ std::unique_ptr<Type> createGeometryType(const std::string& crs =
"OGC:CRS84");
+ std::unique_ptr<Type> createGeographyType(
+ const std::string& crs = "OGC:CRS84",
+ geospatial::EdgeInterpolationAlgorithm algo = geospatial::SPHERICAL);
} // namespace orc
#endif
diff --git a/c++/include/orc/meson.build b/c++/include/orc/meson.build
index 2e9e18199..e2524051f 100644
--- a/c++/include/orc/meson.build
+++ b/c++/include/orc/meson.build
@@ -34,6 +34,7 @@ install_headers(
'ColumnPrinter.hh',
'Common.hh',
'Exceptions.hh',
+ 'Geospatial.hh',
'Int128.hh',
'MemoryPool.hh',
'OrcFile.hh',
diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt
index e378429f1..09a0b148e 100644
--- a/c++/src/CMakeLists.txt
+++ b/c++/src/CMakeLists.txt
@@ -171,6 +171,7 @@ set(SOURCE_FILES
ConvertColumnReader.cc
CpuInfoUtil.cc
Exceptions.cc
+ Geospatial.cc
Int128.cc
LzoDecompressor.cc
MemoryPool.cc
diff --git a/c++/src/ColumnPrinter.cc b/c++/src/ColumnPrinter.cc
index 8b16ecbd0..6535c612c 100644
--- a/c++/src/ColumnPrinter.cc
+++ b/c++/src/ColumnPrinter.cc
@@ -254,6 +254,8 @@ namespace orc {
break;
case BINARY:
+ case GEOMETRY:
+ case GEOGRAPHY:
result = std::make_unique<BinaryColumnPrinter>(buffer, param);
break;
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index 0fd17de1b..89ff0e024 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -1747,6 +1747,8 @@ namespace orc {
case CHAR:
case STRING:
case VARCHAR:
+ case GEOMETRY:
+ case GEOGRAPHY:
switch
(static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())) {
case proto::ColumnEncoding_Kind_DICTIONARY:
case proto::ColumnEncoding_Kind_DICTIONARY_V2:
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index d31b1c65d..c99890b88 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -17,8 +17,11 @@
*/
#include "orc/Int128.hh"
+#include "orc/Statistics.hh"
+#include "orc/Type.hh"
#include "orc/Writer.hh"
+#include <memory>
#include "ByteRLE.hh"
#include "ColumnWriter.hh"
#include "RLE.hh"
@@ -2871,6 +2874,65 @@ namespace orc {
}
}
+ // Column writer for GEOMETRY and GEOGRAPHY columns. Values are written like
+ // binary values (raw bytes plus a length stream); geospatial statistics are
+ // updated only when the column kind is GEOMETRY.
+ class GeospatialColumnWriter : public BinaryColumnWriter {
+ public:
+ GeospatialColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : BinaryColumnWriter(type, factory, options),
+ isGeometry_(type.getKind() == TypeKind::GEOMETRY) {}
+
+ // Appends numValues entries of rowBatch, starting at offset.
+ // Throws InvalidArgument when rowBatch is not a StringVectorBatch.
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t
numValues,
+ const char* incomingMask) override {
+ // Calls ColumnWriter::add directly, bypassing BinaryColumnWriter::add.
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ const StringVectorBatch* strBatch = dynamic_cast<const
StringVectorBatch*>(&rowBatch);
+ if (strBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to StringVectorBatch");
+ }
+ auto data = &strBatch->data[offset];
+ auto length = &strBatch->length[offset];
+ const char* notNull = strBatch->hasNulls ? strBatch->notNull.data() +
offset : nullptr;
+
+ bool hasNull = false;
+ // NOTE(review): geoStats stays null for GEOGRAPHY columns, so this method
+ // updates neither the value count nor the null flag for them - confirm
+ // that this is intended.
+ GeospatialColumnStatisticsImpl* geoStats = nullptr;
+ if (isGeometry_) {
+ geoStats =
dynamic_cast<GeospatialColumnStatisticsImpl*>(colIndexStatistics.get());
+ }
+
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (notNull == nullptr || notNull[i]) {
+ uint64_t len = static_cast<uint64_t>(length[i]);
+ directDataStream->write(data[i], len);
+
+ // update stats
+ if (geoStats) {
+ ++count;
+ geoStats->update(data[i], len);
+ }
+
+ if (enableBloomFilter) {
+ bloomFilter->addBytes(data[i], length[i]);
+ }
+ } else if (!hasNull) {
+ // Record the null flag once, on the first null value of this call.
+ hasNull = true;
+ if (geoStats) {
+ geoStats->setHasNull(hasNull);
+ }
+ }
+ }
+
+ directLengthEncoder->add(length, numValues, notNull);
+
+ if (geoStats) {
+ geoStats->increase(count);
+ }
+ }
+
+ private:
+ // True when the column kind is GEOMETRY (as opposed to GEOGRAPHY).
+ bool isGeometry_;
+ };
+
std::unique_ptr<ColumnWriter> buildWriter(const Type& type, const
StreamsFactory& factory,
const WriterOptions& options) {
switch (static_cast<int64_t>(type.getKind())) {
@@ -2941,6 +3003,9 @@ namespace orc {
return std::make_unique<MapColumnWriter>(type, factory, options);
case UNION:
return std::make_unique<UnionColumnWriter>(type, factory, options);
+ case GEOMETRY:
+ case GEOGRAPHY:
+ return std::make_unique<GeospatialColumnWriter>(type, factory,
options);
default:
throw NotImplementedYet(
"Type is not supported yet for creating "
diff --git a/c++/src/Geospatial.cc b/c++/src/Geospatial.cc
new file mode 100644
index 000000000..6d7d26870
--- /dev/null
+++ b/c++/src/Geospatial.cc
@@ -0,0 +1,307 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This file contains code adapted from the Apache Arrow project.
+ *
+ * Original source:
+ *
+ * https://github.com/apache/arrow/blob/main/cpp/src/parquet/geospatial/statistics.cc
+ *
+ * The original code is licensed under the Apache License, Version 2.0.
+ *
+ * Modifications may have been made from the original source.
+ */
+
+#include "orc/Geospatial.hh"
+#include "orc/Exceptions.hh"
+
+#include "Geospatial.hh"
+
+#include <algorithm>
+#include <cstring>
+#include <optional>
+#include <sstream>
+
+namespace orc::geospatial {
+
+ // Reads a T from a possibly unaligned byte pointer by copying sizeof(T)
+ // bytes with memcpy. Only available for trivially copyable T.
+ // The caller must guarantee that sizeof(T) bytes are readable.
+ template <typename T>
+ inline std::enable_if_t<std::is_trivially_copyable_v<T>, T> safeLoadAs(const
uint8_t* unaligned) {
+ std::remove_const_t<T> ret;
+ std::memcpy(&ret, unaligned, sizeof(T));
+ return ret;
+ }
+
+ // Reinterprets the bytes of `value` as a U by copying them with memcpy.
+ // Only available when T and U are trivially copyable and of equal size.
+ template <typename U, typename T>
+ inline std::enable_if_t<std::is_trivially_copyable_v<T> &&
std::is_trivially_copyable_v<U> &&
+ sizeof(T) == sizeof(U),
+ U>
+ safeCopy(T value) {
+ std::remove_const_t<U> ret;
+ std::memcpy(&ret, static_cast<const void*>(&value), sizeof(T));
+ return ret;
+ }
+
+ // Returns true when the first byte in memory of the 32-bit value 0x01020304
+ // is 0x04, i.e. the least significant byte is stored first.
+ // NOTE(review): this reads a union member other than the one initialized,
+ // which the C++ standard does not define; consider memcpy instead.
+ static bool isLittleEndian() {
+ static union {
+ uint32_t i;
+ char c[4];
+ } num = {0x01020304};
+ return num.c[0] == 4;
+ }
+
+#if defined(_MSC_VER)
+#include <intrin.h> // IWYU pragma: keep
+// The MSVC byte-swap intrinsics are spelled entirely in lowercase.
+#define ORC_BYTE_SWAP64 _byteswap_uint64
+#define ORC_BYTE_SWAP32 _byteswap_ulong
+#else
+#define ORC_BYTE_SWAP64 __builtin_bswap64
+#define ORC_BYTE_SWAP32 __builtin_bswap32
+#endif
+
+ // Swap the byte order (i.e. endianness) of a 32-bit unsigned integer.
+ static inline uint32_t byteSwap(uint32_t value) {
+   return static_cast<uint32_t>(ORC_BYTE_SWAP32(value));
+ }
+ // Swap the byte order of a double. The value is moved through a uint64_t
+ // with safeCopy so that only its bit pattern is reversed.
+ static inline double byteSwap(double value) {
+   const uint64_t swapped = ORC_BYTE_SWAP64(safeCopy<uint64_t>(value));
+   return safeCopy<double>(swapped);
+ }
+
+ // Renders the box as "BoundingBox{xMin=..., xMax=..., ..., mMax=...}".
+ std::string BoundingBox::toString() const {
+   std::ostringstream out;
+   out << "BoundingBox{xMin=" << min[0];
+   out << ", xMax=" << max[0];
+   out << ", yMin=" << min[1];
+   out << ", yMax=" << max[1];
+   out << ", zMin=" << min[2];
+   out << ", zMax=" << max[2];
+   out << ", mMin=" << min[3];
+   out << ", mMax=" << max[3];
+   out << "}";
+   return out.str();
+ }
+
+ /// \brief Object to keep track of the low-level consumption of a well-known binary
+ /// geometry
+ ///
+ /// Briefly, ISO well-known binary supported by the Parquet spec is an endian byte
+ /// (0x01 or 0x00), followed by geometry type + dimensions encoded as a (uint32_t),
+ /// followed by geometry-specific data. Coordinate sequences are represented by a
+ /// uint32_t (the number of coordinates) plus a sequence of doubles (number of
+ /// coordinates multiplied by the number of dimensions).
+ ///
+ /// Every read advances the cursor. A read that would run past the end of the
+ /// buffer throws ParseError and consumes nothing.
+ class WKBBuffer {
+  public:
+   WKBBuffer() : data_(nullptr), size_(0) {}
+   WKBBuffer(const uint8_t* data, int64_t size)
+       : data_(data), size_(static_cast<size_t>(size)) {}
+
+   /// Read one byte. Throws ParseError if the buffer is exhausted.
+   uint8_t readUInt8() {
+     return readChecked<uint8_t>();
+   }
+
+   /// Read a uint32_t, byte-swapping it when `swap` is true.
+   /// Throws ParseError if fewer than four bytes remain.
+   uint32_t readUInt32(bool swap) {
+     auto value = readChecked<uint32_t>();
+     return swap ? byteSwap(value) : value;
+   }
+
+   /// Read `nCoords` coordinates of type Coord and pass each one to `visit`.
+   /// Each ordinate is byte-swapped first when `swap` is true.
+   /// Throws ParseError, before reading anything, if the buffer does not hold
+   /// the whole sequence.
+   template <typename Coord, typename Visit>
+   void readCoords(uint32_t nCoords, bool swap, Visit&& visit) {
+     // Widen before multiplying so the product cannot wrap where size_t is
+     // only 32 bits wide.
+     const uint64_t totalBytes = static_cast<uint64_t>(nCoords) * sizeof(Coord);
+     if (size_ < totalBytes) {
+       std::stringstream ss;
+       ss << "Can't read coordinate sequence of " << totalBytes
+          << " bytes from WKBBuffer with " << size_ << " remaining";
+       throw ParseError(ss.str());
+     }
+
+     // The visitor is invoked as an lvalue: it is called once per coordinate,
+     // so it must not be forwarded (and potentially moved from) repeatedly.
+     if (swap) {
+       Coord coord;
+       for (uint32_t i = 0; i < nCoords; i++) {
+         coord = readUnchecked<Coord>();
+         for (auto& c : coord) {
+           c = byteSwap(c);
+         }
+
+         visit(coord);
+       }
+     } else {
+       for (uint32_t i = 0; i < nCoords; i++) {
+         visit(readUnchecked<Coord>());
+       }
+     }
+   }
+
+   /// Number of bytes not yet consumed.
+   size_t size() const {
+     return size_;
+   }
+
+  private:
+   const uint8_t* data_;
+   size_t size_;
+
+   // Read a T after verifying that sizeof(T) bytes remain.
+   template <typename T>
+   T readChecked() {
+     if (size_ < sizeof(T)) {
+       std::stringstream ss;
+       ss << "Can't read " << sizeof(T) << " bytes from WKBBuffer with " << size_
+          << " remaining";
+       throw ParseError(ss.str());
+     }
+
+     return readUnchecked<T>();
+   }
+
+   // Read a T without a bounds check; the caller has verified the size.
+   template <typename T>
+   T readUnchecked() {
+     T out = safeLoadAs<T>(data_);
+     data_ += sizeof(T);
+     size_ -= sizeof(T);
+     return out;
+   }
+ };
+
+ using GeometryTypeAndDimensions = std::pair<GeometryType, Dimensions>;
+
+ namespace {
+
+ // Splits an ISO WKB geometry type code into its geometry type and its
+ // dimensions. WKB codes are built so that code % 1000 is the geometry type
+ // and code / 1000 is the dimension kind (e.g. LINESTRING ZM is 3002).
+ // Returns std::nullopt when either part is outside its enum's
+ // [VALUE_MIN, VALUE_MAX] range.
+ std::optional<GeometryTypeAndDimensions> parseGeometryType(uint32_t wkbGeometryType) {
+   const uint32_t typeCode = wkbGeometryType % 1000;
+   const uint32_t dimCode = wkbGeometryType / 1000;
+
+   const auto within = [](uint32_t v, uint32_t lo, uint32_t hi) { return lo <= v && v <= hi; };
+   const bool typeOk = within(typeCode, static_cast<uint32_t>(GeometryType::VALUE_MIN),
+                              static_cast<uint32_t>(GeometryType::VALUE_MAX));
+   const bool dimOk = within(dimCode, static_cast<uint32_t>(Dimensions::VALUE_MIN),
+                             static_cast<uint32_t>(Dimensions::VALUE_MAX));
+   if (!typeOk || !dimOk) {
+     return std::nullopt;
+   }
+
+   return GeometryTypeAndDimensions{static_cast<GeometryType>(typeCode),
+                                    static_cast<Dimensions>(dimCode)};
+ }
+
+ } // namespace
+
+ // Returns the recorded geometry type codes in ascending order.
+ std::vector<int32_t> WKBGeometryBounder::geometryTypes() const {
+   std::vector<int32_t> sortedTypes;
+   sortedTypes.assign(geospatialTypes_.begin(), geospatialTypes_.end());
+   std::sort(sortedTypes.begin(), sortedTypes.end());
+   return sortedTypes;
+ }
+
+ // Convenience overload taking the WKB bytes as a string_view.
+ // Does nothing once the bounder has been invalidated.
+ void WKBGeometryBounder::mergeGeometry(std::string_view bytesWkb) {
+   if (isValid_) {
+     const auto* raw = reinterpret_cast<const uint8_t*>(bytesWkb.data());
+     mergeGeometry(raw, bytesWkb.size());
+   }
+ }
+
+ // Parses one WKB value and merges its coordinates and top-level geometry
+ // type into this bounder. Does nothing once the bounder has been
+ // invalidated. Malformed input (a ParseError while parsing, or bytes left
+ // over afterwards) invalidates the bounder instead of throwing.
+ void WKBGeometryBounder::mergeGeometry(const uint8_t* bytesWkb, size_t
bytesSize) {
+ if (!isValid_) {
+ return;
+ }
+ WKBBuffer src{bytesWkb, static_cast<int64_t>(bytesSize)};
+ try {
+ mergeGeometryInternal(&src, /*record_wkb_type=*/true);
+ } catch (const ParseError&) {
+ invalidate();
+ return;
+ }
+ if (src.size() != 0) {
+ // Expected zero bytes left after consuming the WKB value.
+ invalidate();
+ }
+ }
+
+ // Consumes one geometry from src and merges its coordinates into box_.
+ // recordWkbType selects whether the geometry's WKB type code is added to
+ // geospatialTypes_ (true only for the top-level geometry).
+ // An unknown type code invalidates the bounder and returns; reads past the
+ // end of src propagate as ParseError from WKBBuffer.
+ // NOTE(review): multi-part geometries recurse once per nesting level with no
+ // depth limit visible here - confirm that deeply nested input is acceptable.
+ void WKBGeometryBounder::mergeGeometryInternal(WKBBuffer* src, bool
recordWkbType) {
+ uint8_t endian = src->readUInt8();
+ // Swap bytes when the encoded byte order differs from the host's:
+ // 0x01 marks little-endian data, 0x00 big-endian data.
+ bool swap = endian != 0x00;
+ if (isLittleEndian()) {
+ swap = endian != 0x01;
+ }
+
+ uint32_t wkbGeometryType = src->readUInt32(swap);
+ auto geometryTypeAndDimensions = parseGeometryType(wkbGeometryType);
+ if (!geometryTypeAndDimensions.has_value()) {
+ invalidate();
+ return;
+ }
+ auto& [geometry_type, dimensions] = geometryTypeAndDimensions.value();
+
+ // Keep track of geometry types encountered if at the top level
+ if (recordWkbType) {
+ geospatialTypes_.insert(static_cast<int32_t>(wkbGeometryType));
+ }
+
+ switch (geometry_type) {
+ case GeometryType::POINT:
+ mergeSequence(src, dimensions, 1, swap);
+ break;
+
+ case GeometryType::LINESTRING: {
+ uint32_t nCoords = src->readUInt32(swap);
+ mergeSequence(src, dimensions, nCoords, swap);
+ break;
+ }
+ case GeometryType::POLYGON: {
+ uint32_t n_parts = src->readUInt32(swap);
+ for (uint32_t i = 0; i < n_parts; i++) {
+ uint32_t nCoords = src->readUInt32(swap);
+ mergeSequence(src, dimensions, nCoords, swap);
+ }
+ break;
+ }
+
+ // These are all encoded the same in WKB, even though this encoding would
+ // allow for parts to be of a different geometry type or different
dimensions.
+ // For the purposes of bounding, this does not cause us problems. We pass
+ // record_wkb_type = false because we do not want the child geometry to
be
+ // added to the geometry_types list (e.g., for a MultiPoint, we only want
+ // the code for MultiPoint to be added, not the code for Point).
+ case GeometryType::MULTIPOINT:
+ case GeometryType::MULTILINESTRING:
+ case GeometryType::MULTIPOLYGON:
+ case GeometryType::GEOMETRYCOLLECTION: {
+ uint32_t n_parts = src->readUInt32(swap);
+ for (uint32_t i = 0; i < n_parts; i++) {
+ mergeGeometryInternal(src, /*record_wkb_type*/ false);
+ }
+ break;
+ }
+ }
+ }
+
+ // Reads nCoords coordinates with the given dimensions from src and grows
+ // box_ to contain each of them. An unrecognized dimension value invalidates
+ // the bounder.
+ void WKBGeometryBounder::mergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords,
+                                        bool swap) {
+   if (dimensions == Dimensions::XY) {
+     src->readCoords<BoundingBox::XY>(
+         nCoords, swap, [this](const BoundingBox::XY& coord) { box_.updateXY(coord); });
+   } else if (dimensions == Dimensions::XYZ) {
+     src->readCoords<BoundingBox::XYZ>(
+         nCoords, swap, [this](const BoundingBox::XYZ& coord) { box_.updateXYZ(coord); });
+   } else if (dimensions == Dimensions::XYM) {
+     src->readCoords<BoundingBox::XYM>(
+         nCoords, swap, [this](const BoundingBox::XYM& coord) { box_.updateXYM(coord); });
+   } else if (dimensions == Dimensions::XYZM) {
+     src->readCoords<BoundingBox::XYZM>(
+         nCoords, swap, [this](const BoundingBox::XYZM& coord) { box_.updateXYZM(coord); });
+   } else {
+     invalidate();
+   }
+ }
+
+} // namespace orc::geospatial
diff --git a/c++/src/Geospatial.hh b/c++/src/Geospatial.hh
new file mode 100644
index 000000000..aebb72747
--- /dev/null
+++ b/c++/src/Geospatial.hh
@@ -0,0 +1,86 @@
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_GEOSPATIAL_IMPL_HH
+#define ORC_GEOSPATIAL_IMPL_HH
+
+#include "orc/Geospatial.hh"
+
+#include <unordered_set>
+#include <vector>
+
+namespace orc {
+ namespace geospatial {
+ class WKBBuffer;
+
+ // Accumulates a bounding box and the set of geometry type codes over a
+ // sequence of WKB-encoded geometries. Once invalidated, the bounder ignores
+ // further geometries until reset() is called.
+ class WKBGeometryBounder {
+  public:
+   // Parse one WKB value and merge it into the accumulated state.
+   void mergeGeometry(std::string_view bytesWkb);
+   void mergeGeometry(const uint8_t* bytesWkb, size_t bytesSize);
+
+   // Merge an already computed bounding box.
+   void mergeBox(const BoundingBox& box) {
+     box_.merge(box);
+   }
+   // Merge an already computed list of geometry type codes.
+   void mergeGeometryTypes(const std::vector<int>& geospatialTypes) {
+     for (const int typeId : geospatialTypes) {
+       geospatialTypes_.insert(typeId);
+     }
+   }
+   // Merge the state of another bounder. If either side is invalid, the
+   // result is invalid.
+   void merge(const WKBGeometryBounder& other) {
+     if (isValid() && other.isValid()) {
+       box_.merge(other.box_);
+       geospatialTypes_.insert(other.geospatialTypes_.begin(), other.geospatialTypes_.end());
+     } else {
+       invalidate();
+     }
+   }
+
+   // Bounding box of everything merged so far.
+   const BoundingBox& bounds() const {
+     return box_;
+   }
+
+   // Geometry type codes encountered so far, as a sorted vector.
+   std::vector<int32_t> geometryTypes() const;
+
+   // Return to the initial state: valid, empty box, no types.
+   void reset() {
+     geospatialTypes_.clear();
+     box_.reset();
+     isValid_ = true;
+   }
+   bool isValid() const {
+     return isValid_;
+   }
+   // Mark the state unusable: invalid box, no types.
+   void invalidate() {
+     geospatialTypes_.clear();
+     box_.invalidate();
+     isValid_ = false;
+   }
+
+  private:
+   BoundingBox box_;
+   std::unordered_set<int32_t> geospatialTypes_;
+   bool isValid_ = true;
+
+   void mergeGeometryInternal(WKBBuffer* src, bool recordWkbType);
+   void mergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords, bool swap);
+ };
+ } // namespace geospatial
+} // namespace orc
+
+#endif
diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc
index 17bf83520..349ae1b40 100644
--- a/c++/src/Reader.cc
+++ b/c++/src/Reader.cc
@@ -873,6 +873,8 @@ namespace orc {
case proto::Type_Kind_CHAR:
case proto::Type_Kind_STRING:
case proto::Type_Kind_VARCHAR:
+ case proto::Type_Kind_GEOMETRY:
+ case proto::Type_Kind_GEOGRAPHY:
return 4;
default:
return 0;
diff --git a/c++/src/SchemaEvolution.cc b/c++/src/SchemaEvolution.cc
index 7cf3b5c51..442c43c22 100644
--- a/c++/src/SchemaEvolution.cc
+++ b/c++/src/SchemaEvolution.cc
@@ -18,6 +18,7 @@
#include "SchemaEvolution.hh"
#include "orc/Exceptions.hh"
+#include "orc/Type.hh"
namespace orc {
@@ -113,7 +114,9 @@ namespace orc {
case TIMESTAMP:
case TIMESTAMP_INSTANT:
case DATE:
- case BINARY: {
+ case BINARY:
+ case GEOMETRY:
+ case GEOGRAPHY: {
// Not support
break;
}
@@ -235,6 +238,8 @@ namespace orc {
case FLOAT:
case DOUBLE:
case BINARY:
+ case GEOMETRY:
+ case GEOGRAPHY:
case TIMESTAMP:
case LIST:
case MAP:
diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc
index c1a23cad1..a86247f10 100644
--- a/c++/src/Statistics.cc
+++ b/c++/src/Statistics.cc
@@ -44,6 +44,8 @@ namespace orc {
return new DateColumnStatisticsImpl(s, statContext);
} else if (s.has_binary_statistics()) {
return new BinaryColumnStatisticsImpl(s, statContext);
+ } else if (s.has_geospatial_statistics()) {
+ return new GeospatialColumnStatisticsImpl(s);
} else {
return new ColumnStatisticsImpl(s);
}
@@ -148,6 +150,10 @@ namespace orc {
// PASS
}
+ // Out-of-line definition of the virtual destructor; nothing to release.
+ GeospatialColumnStatistics::~GeospatialColumnStatistics() {
+ // PASS
+ }
+
ColumnStatisticsImpl::~ColumnStatisticsImpl() {
// PASS
}
@@ -188,6 +194,10 @@ namespace orc {
// PASS
}
+ // Out-of-line definition of the virtual destructor; nothing to release.
+ GeospatialColumnStatisticsImpl::~GeospatialColumnStatisticsImpl() {
+ // PASS
+ }
+
ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics&
pb) {
stats_.setNumberOfValues(pb.number_of_values());
stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
@@ -391,6 +401,40 @@ namespace orc {
}
}
+ // Rebuilds geospatial statistics from their protobuf form.
+ //
+ // The value count and null flag are restored the same way
+ // ColumnStatisticsImpl does it (a missing has_null field is treated as
+ // "may contain nulls"). When the message carries no geospatial statistics
+ // the bounder is invalidated. Otherwise the stored bounding box and geometry
+ // type codes are merged in; any bound that is absent from the message is
+ // NaN, which marks that dimension as invalid.
+ GeospatialColumnStatisticsImpl::GeospatialColumnStatisticsImpl(
+     const proto::ColumnStatistics& pb) {
+   reset();
+   // toProtoBuf() writes both fields, so they must be read back here.
+   stats_.setNumberOfValues(pb.number_of_values());
+   stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
+
+   if (!pb.has_geospatial_statistics()) {
+     bounder_.invalidate();
+     return;
+   }
+
+   const proto::GeospatialStatistics& stats = pb.geospatial_statistics();
+   geospatial::BoundingBox::XYZM min;
+   geospatial::BoundingBox::XYZM max;
+   min.fill(std::numeric_limits<double>::quiet_NaN());
+   max.fill(std::numeric_limits<double>::quiet_NaN());
+   if (stats.has_bbox()) {
+     const auto& protoBBox = stats.bbox();
+     min[0] = protoBBox.xmin();
+     min[1] = protoBBox.ymin();
+     max[0] = protoBBox.xmax();
+     max[1] = protoBBox.ymax();
+     // Z and M are optional; each is used only when both of its bounds exist.
+     if (protoBBox.has_zmin() && protoBBox.has_zmax()) {
+       min[2] = protoBBox.zmin();
+       max[2] = protoBBox.zmax();
+     }
+     if (protoBBox.has_mmin() && protoBBox.has_mmax()) {
+       min[3] = protoBBox.mmin();
+       max[3] = protoBBox.mmax();
+     }
+   }
+   bounder_.mergeBox(geospatial::BoundingBox(min, max));
+   const std::vector<int32_t> types(stats.geospatial_types().begin(),
+                                    stats.geospatial_types().end());
+   bounder_.mergeGeometryTypes(types);
+ }
+
std::unique_ptr<MutableColumnStatistics> createColumnStatistics(const Type&
type) {
switch (static_cast<int64_t>(type.getKind())) {
case BOOLEAN:
@@ -422,6 +466,9 @@ namespace orc {
return std::make_unique<TimestampColumnStatisticsImpl>();
case DECIMAL:
return std::make_unique<DecimalColumnStatisticsImpl>();
+ case GEOGRAPHY:
+ case GEOMETRY:
+ return std::make_unique<GeospatialColumnStatisticsImpl>();
default:
throw NotImplementedYet("Not supported type: " + type.toString());
}
diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh
index b7ed5d1e5..94b1e5d2b 100644
--- a/c++/src/Statistics.hh
+++ b/c++/src/Statistics.hh
@@ -24,6 +24,7 @@
#include "orc/OrcFile.hh"
#include "orc/Reader.hh"
+#include "Geospatial.hh"
#include "Timezone.hh"
#include "TypeImpl.hh"
@@ -1683,6 +1684,127 @@ namespace orc {
}
};
+  /**
+   * Mutable column statistics for geospatial columns.
+   *
+   * Value count and has-null tracking are delegated to stats_; the bounding
+   * box and the list of geometry type codes are delegated to bounder_.
+   */
+  class GeospatialColumnStatisticsImpl : public GeospatialColumnStatistics,
+                                         public MutableColumnStatistics {
+   private:
+    geospatial::WKBGeometryBounder bounder_;
+    InternalCharStatistics stats_;
+
+   public:
+    GeospatialColumnStatisticsImpl() {
+      reset();
+    }
+    explicit GeospatialColumnStatisticsImpl(const proto::ColumnStatistics& stats);
+    virtual ~GeospatialColumnStatisticsImpl();
+
+    uint64_t getNumberOfValues() const override {
+      return stats_.getNumberOfValues();
+    }
+
+    void setNumberOfValues(uint64_t value) override {
+      stats_.setNumberOfValues(value);
+    }
+
+    /** Add count to the current number of values. */
+    void increase(uint64_t count) override {
+      const uint64_t current = stats_.getNumberOfValues();
+      stats_.setNumberOfValues(current + count);
+    }
+
+    bool hasNull() const override {
+      return stats_.hasNull();
+    }
+
+    void setHasNull(bool hasNull) override {
+      stats_.setHasNull(hasNull);
+    }
+
+    /**
+     * Merge another geospatial statistics object into this one.
+     * The reference dynamic_cast throws if other is of a different type.
+     */
+    void merge(const MutableColumnStatistics& other) override {
+      const auto& rhs = dynamic_cast<const GeospatialColumnStatisticsImpl&>(other);
+      stats_.merge(rhs.stats_);
+      bounder_.merge(rhs.bounder_);
+    }
+
+    void reset() override {
+      stats_.reset();
+      bounder_.reset();
+    }
+
+    /** Feed one serialized geometry (length bytes at value) to the bounder. */
+    void update(const char* value, size_t length) override {
+      const std::string_view wkb(value, length);
+      bounder_.mergeGeometry(wkb);
+    }
+
+    /**
+     * Serialize into pbStats. The bounding box is written only when both x and
+     * y are valid and non-empty; z and m are added individually under the same
+     * rule. Geometry type codes are always written.
+     */
+    void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
+      pbStats.set_has_null(stats_.hasNull());
+      pbStats.set_number_of_values(stats_.getNumberOfValues());
+
+      proto::GeospatialStatistics* geoStats = pbStats.mutable_geospatial_statistics();
+      const auto& box = bounder_.bounds();
+      const auto usable = [&box](int dim) { return box.boundValid(dim) && !box.boundEmpty(dim); };
+
+      if (usable(0) && usable(1)) {
+        // mutable_bbox() is only touched inside this branch so that an unusable
+        // box leaves the bbox field unset.
+        auto* pbBox = geoStats->mutable_bbox();
+        pbBox->set_xmin(box.min[0]);
+        pbBox->set_xmax(box.max[0]);
+        pbBox->set_ymin(box.min[1]);
+        pbBox->set_ymax(box.max[1]);
+        if (usable(2)) {
+          pbBox->set_zmin(box.min[2]);
+          pbBox->set_zmax(box.max[2]);
+        }
+        if (usable(3)) {
+          pbBox->set_mmin(box.min[3]);
+          pbBox->set_mmax(box.max[3]);
+        }
+      }
+
+      const auto typeCodes = bounder_.geometryTypes();
+      for (auto code : typeCodes) {
+        geoStats->add_geospatial_types(code);
+      }
+    }
+
+    /**
+     * Human-readable summary: one entry per dimension (x, y, z, m) that is
+     * either "invalid", "empty" or a [lower, upper] range, followed by the
+     * list of geometry type codes.
+     */
+    std::string toString() const override {
+      if (!bounder_.isValid()) {
+        return "<GeoStatistics> invalid";
+      }
+
+      const auto& box = bounder_.bounds();
+      const auto validDims = box.dimensionValid();
+      const auto emptyDims = box.dimensionEmpty();
+      const auto lows = box.lowerBound();
+      const auto highs = box.upperBound();
+      const std::string labels("xyzm");
+
+      std::stringstream out;
+      out << "<GeoStatistics>";
+      for (int dim = 0; dim < 4; ++dim) {
+        out << " " << labels[dim] << ": ";
+        if (!validDims[dim]) {
+          out << "invalid";
+          continue;
+        }
+        if (emptyDims[dim]) {
+          out << "empty";
+          continue;
+        }
+        out << "[" << lows[dim] << ", " << highs[dim] << "]";
+      }
+
+      out << " geometry_types: [";
+      const std::vector<int32_t> typeCodes = bounder_.geometryTypes();
+      bool first = true;
+      for (int32_t code : typeCodes) {
+        if (!first) {
+          out << ", ";
+        }
+        out << code;
+        first = false;
+      }
+      out << "]";
+
+      return out.str();
+    }
+
+    const geospatial::BoundingBox& getBoundingBox() const override {
+      return bounder_.bounds();
+    }
+
+    std::vector<int32_t> getGeospatialTypes() const override {
+      return bounder_.geometryTypes();
+    }
+  };
+
ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
const StatContext& statContext);
diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc
index cbc7b8279..18c4985ab 100644
--- a/c++/src/TypeImpl.cc
+++ b/c++/src/TypeImpl.cc
@@ -19,8 +19,10 @@
#include "TypeImpl.hh"
#include "Adaptor.hh"
#include "orc/Exceptions.hh"
+#include "orc/Type.hh"
#include <iostream>
+#include <memory>
#include <sstream>
namespace orc {
@@ -62,6 +64,33 @@ namespace orc {
subtypeCount_ = 0;
}
+  // Type carrying only a CRS: identical to the three-argument constructor with
+  // the edge interpolation algorithm fixed to SPHERICAL.
+  TypeImpl::TypeImpl(TypeKind kind, const std::string& crs)
+      : TypeImpl(kind, crs, geospatial::EdgeInterpolationAlgorithm::SPHERICAL) {}
+
+  // Type carrying a CRS and an explicit edge interpolation algorithm.
+  TypeImpl::TypeImpl(TypeKind kind, const std::string& crs,
+                     geospatial::EdgeInterpolationAlgorithm algo) {
+    // Geospatial attributes supplied by the caller.
+    kind_ = kind;
+    crs_ = crs;
+    edgeInterpolationAlgorithm_ = algo;
+
+    // Not attached to a schema tree yet: no parent, no ids, no children.
+    parent_ = nullptr;
+    columnId_ = -1;
+    maximumColumnId_ = -1;
+    subtypeCount_ = 0;
+
+    // Length / precision / scale are zeroed.
+    maxLength_ = 0;
+    precision_ = 0;
+    scale_ = 0;
+  }
+
uint64_t TypeImpl::assignIds(uint64_t root) const {
columnId_ = static_cast<int64_t>(root);
uint64_t current = root + 1;
@@ -120,6 +149,14 @@ namespace orc {
return scale_;
}
+ const std::string& TypeImpl::getCrs() const {  // Returns the CRS string stored on this type.
+ return crs_;
+ }
+
+ geospatial::EdgeInterpolationAlgorithm TypeImpl::getAlgorithm() const {  // Returns the stored edge interpolation algorithm.
+ return edgeInterpolationAlgorithm_;
+ }
+
Type& TypeImpl::setAttribute(const std::string& key, const std::string&
value) {
attributes_[key] = value;
return *this;
@@ -189,6 +226,45 @@ namespace orc {
return true;
}
+  namespace geospatial {
+    /**
+     * Convert an edge interpolation algorithm to its lower-case name as used
+     * in type strings.
+     *
+     * @param algo the algorithm to name
+     * @return "spherical", "vincenty", "thomas", "andoyer" or "karney"
+     * @throws InvalidArgument if algo is not one of the known enumerators
+     */
+    std::string AlgoToString(EdgeInterpolationAlgorithm algo) {
+      switch (algo) {
+        case EdgeInterpolationAlgorithm::SPHERICAL:
+          // Previously misspelled "speherial"; AlgoFromString still accepts
+          // the old spelling.
+          return "spherical";
+        case EdgeInterpolationAlgorithm::VINCENTY:
+          return "vincenty";
+        case EdgeInterpolationAlgorithm::THOMAS:
+          return "thomas";
+        case EdgeInterpolationAlgorithm::ANDOYER:
+          return "andoyer";
+        case EdgeInterpolationAlgorithm::KARNEY:
+          return "karney";
+        default:
+          throw InvalidArgument("Unknown algo");
+      }
+    }
+
+    /**
+     * Parse an edge interpolation algorithm from its lower-case name.
+     *
+     * @param algo the name to parse; matching is exact and case-sensitive
+     * @return the corresponding enumerator
+     * @throws InvalidArgument if the name is not recognised
+     */
+    EdgeInterpolationAlgorithm AlgoFromString(const std::string& algo) {
+      // "speherial" is the misspelling earlier versions of AlgoToString
+      // emitted; keep accepting it so existing type strings still parse.
+      if (algo == "spherical" || algo == "speherial") {
+        return EdgeInterpolationAlgorithm::SPHERICAL;
+      }
+      if (algo == "vincenty") {
+        return EdgeInterpolationAlgorithm::VINCENTY;
+      }
+      if (algo == "thomas") {
+        return EdgeInterpolationAlgorithm::THOMAS;
+      }
+      if (algo == "andoyer") {
+        return EdgeInterpolationAlgorithm::ANDOYER;
+      }
+      if (algo == "karney") {
+        return EdgeInterpolationAlgorithm::KARNEY;
+      }
+      throw InvalidArgument("Unknown algo: " + algo);
+    }
+
+  }  // namespace geospatial
+
std::string TypeImpl::toString() const {
switch (static_cast<int64_t>(kind_)) {
case BOOLEAN:
@@ -271,6 +347,17 @@ namespace orc {
result << "char(" << maxLength_ << ")";
return result.str();
}
+ case GEOMETRY: {
+ std::stringstream result;
+ result << "geometry(" << crs_ << ")";
+ return result.str();
+ }
+ case GEOGRAPHY: {
+ std::stringstream result;
+ result << "geography(" << crs_ << ","
+ << geospatial::AlgoToString(edgeInterpolationAlgorithm_) << ")";
+ return result.str();
+ }
default:
throw NotImplementedYet("Unknown type");
}
@@ -322,6 +409,8 @@ namespace orc {
case BINARY:
case CHAR:
case VARCHAR:
+ case GEOMETRY:
+ case GEOGRAPHY:
return encoded ? std::make_unique<EncodedStringVectorBatch>(capacity,
memoryPool)
: std::make_unique<StringVectorBatch>(capacity,
memoryPool);
@@ -419,6 +508,15 @@ namespace orc {
return std::make_unique<TypeImpl>(UNION);
}
+ std::unique_ptr<Type> createGeometryType(const std::string& crs) {  // Builds a GEOMETRY TypeImpl carrying the given CRS.
+ return std::make_unique<TypeImpl>(GEOMETRY, crs);
+ }
+
+ std::unique_ptr<Type> createGeographyType(const std::string& crs,
+
geospatial::EdgeInterpolationAlgorithm algo) {
+ return std::make_unique<TypeImpl>(GEOGRAPHY, crs, algo);  // Builds a GEOGRAPHY TypeImpl carrying the given CRS and algorithm.
+ }
+
std::string printProtobufMessage(const google::protobuf::Message& message);
std::unique_ptr<Type> convertType(const proto::Type& type, const
proto::Footer& footer) {
std::unique_ptr<Type> ret;
@@ -443,6 +541,16 @@ namespace orc {
ret = std::make_unique<TypeImpl>(static_cast<TypeKind>(type.kind()),
type.maximum_length());
break;
+ case proto::Type_Kind_GEOMETRY:
+ ret = std::make_unique<TypeImpl>(static_cast<TypeKind>(type.kind()),
type.crs());
+ break;
+
+ case proto::Type_Kind_GEOGRAPHY:
+ ret = std::make_unique<TypeImpl>(
+ static_cast<TypeKind>(type.kind()), type.crs(),
+
static_cast<geospatial::EdgeInterpolationAlgorithm>(type.algorithm()));
+ break;
+
case proto::Type_Kind_DECIMAL:
ret = std::make_unique<TypeImpl>(DECIMAL, type.precision(),
type.scale());
break;
@@ -523,6 +631,13 @@ namespace orc {
case CHAR:
result = std::make_unique<TypeImpl>(fileType->getKind(),
fileType->getMaximumLength());
break;
+ case GEOMETRY:
+ result = std::make_unique<TypeImpl>(fileType->getKind(),
fileType->getCrs());
+ break;
+ case GEOGRAPHY:
+ result = std::make_unique<TypeImpl>(fileType->getKind(),
fileType->getCrs(),
+ fileType->getAlgorithm());
+ break;
case LIST:
result = std::make_unique<TypeImpl>(fileType->getKind());
@@ -710,6 +825,22 @@ namespace orc {
return std::make_unique<TypeImpl>(DECIMAL, precision, scale);
}
+  /**
+   * Parse the "(crs,algorithm)" suffix of a geography type string.
+   *
+   * @param input the full type string
+   * @param start index of the opening '(' in input
+   * @param end   index one past the last character of the algorithm name
+   * @return a GEOGRAPHY type carrying the parsed CRS and algorithm
+   * @throws std::logic_error if '(' is missing, or if no ',' followed by at
+   *         least one character is found before end
+   * @throws InvalidArgument (from AlgoFromString) for an unknown algorithm name
+   */
+  std::unique_ptr<Type> TypeImpl::parseGeographyType(const std::string& input, size_t start,
+                                                     size_t end) {
+    if (input[start] != '(') {
+      throw std::logic_error("Missing ( after geography.");
+    }
+    const size_t crsBegin = start + 1;
+    const size_t sep = input.find(',', crsBegin);
+    // Check npos before doing arithmetic on sep so the test does not rely on
+    // unsigned wrap-around. A separator at or beyond end - 1 means there is no
+    // algorithm name inside the parentheses.
+    if (sep == std::string::npos || sep + 1 >= end) {
+      throw std::logic_error("Geography type must specify CRS and edge interpolation algorithm.");
+    }
+    const std::string crs = input.substr(crsBegin, sep - crsBegin);
+    const std::string algoStr = input.substr(sep + 1, end - sep - 1);
+    const geospatial::EdgeInterpolationAlgorithm algo = geospatial::AlgoFromString(algoStr);
+    return std::make_unique<TypeImpl>(GEOGRAPHY, crs, algo);
+  }
+
void validatePrimitiveType(std::string category, const std::string& input,
const size_t pos) {
if (input[pos] == '<' || input[pos] == '(') {
std::ostringstream oss;
@@ -780,6 +911,14 @@ namespace orc {
uint64_t maxLength =
static_cast<uint64_t>(atoi(input.substr(start + 1, end - start +
1).c_str()));
return std::make_unique<TypeImpl>(CHAR, maxLength);
+    } else if (category == "geometry") {
+      if (input[start] != '(') {
+        throw std::logic_error("Missing ( after geometry.");
+      }
+      // The CRS is the text strictly between the '(' at start and the closing
+      // delimiter at end, i.e. [start + 1, end) -- the same convention
+      // parseGeographyType uses. The previous length (end - start + 1) copied
+      // the closing delimiter and any following character into the CRS.
+      const size_t crsLength = end > start ? end - start - 1 : 0;
+      std::string crs = input.substr(start + 1, crsLength);
+      return std::make_unique<TypeImpl>(GEOMETRY, crs);
+ } else if (category == "geography") {
+ return parseGeographyType(input, start, end);
} else {
throw std::logic_error("Unknown type " + category);
}
diff --git a/c++/src/TypeImpl.hh b/c++/src/TypeImpl.hh
index 647d5a5d2..2db175aba 100644
--- a/c++/src/TypeImpl.hh
+++ b/c++/src/TypeImpl.hh
@@ -24,6 +24,7 @@
#include "Adaptor.hh"
#include "wrap/orc-proto-wrapper.hh"
+#include <memory>
#include <vector>
namespace orc {
@@ -41,6 +42,9 @@ namespace orc {
uint64_t precision_;
uint64_t scale_;
std::map<std::string, std::string> attributes_;
+ std::string crs_;
+ geospatial::EdgeInterpolationAlgorithm edgeInterpolationAlgorithm_ =
+ geospatial::EdgeInterpolationAlgorithm::SPHERICAL;
public:
/**
@@ -58,6 +62,16 @@ namespace orc {
*/
TypeImpl(TypeKind kind, uint64_t precision, uint64_t scale);
+ /**
+ * Create geometry type.
+ */
+ TypeImpl(TypeKind kind, const std::string& crs);
+
+ /**
+ * Create geography type.
+ */
+ TypeImpl(TypeKind kind, const std::string& crs,
geospatial::EdgeInterpolationAlgorithm algo);
+
uint64_t getColumnId() const override;
uint64_t getMaximumColumnId() const override;
@@ -76,6 +90,10 @@ namespace orc {
uint64_t getScale() const override;
+ const std::string& getCrs() const override;
+
+ geospatial::EdgeInterpolationAlgorithm getAlgorithm() const override;
+
Type& setAttribute(const std::string& key, const std::string& value)
override;
bool hasAttributeKey(const std::string& key) const override;
@@ -176,6 +194,14 @@ namespace orc {
static std::unique_ptr<Type> parseDecimalType(const std::string& input,
size_t start,
size_t end);
+ /**
+ * Parse geography type from string
+ * @param input the input string of a geography type
+ * @param start start position of the input string
+ * @param end end position of the input string
+ */
+ static std::unique_ptr<Type> parseGeographyType(const std::string& input,
size_t start,
+ size_t end);
/**
* Parse type for a category
* @param category type name
diff --git a/c++/src/Writer.cc b/c++/src/Writer.cc
index 775e6d245..c235169cc 100644
--- a/c++/src/Writer.cc
+++ b/c++/src/Writer.cc
@@ -24,6 +24,7 @@
#include "Utils.hh"
#include <memory>
+#include <stdexcept>
namespace orc {
@@ -702,6 +703,40 @@ namespace orc {
protoType.set_kind(proto::Type_Kind_CHAR);
break;
}
+ case GEOMETRY: {
+ protoType.set_kind(proto::Type_Kind_GEOMETRY);
+ protoType.set_crs(t.getCrs());
+ break;
+ }
+      case GEOGRAPHY: {
+        protoType.set_kind(proto::Type_Kind_GEOGRAPHY);
+        protoType.set_crs(t.getCrs());
+        // Map the in-memory edge interpolation algorithm one-to-one onto its
+        // protobuf counterpart.
+        switch (t.getAlgorithm()) {
+          case geospatial::EdgeInterpolationAlgorithm::SPHERICAL: {
+            protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_SPHERICAL);
+            break;
+          }
+          case geospatial::EdgeInterpolationAlgorithm::VINCENTY: {
+            protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_VINCENTY);
+            break;
+          }
+          case geospatial::EdgeInterpolationAlgorithm::THOMAS: {
+            // Was written as VINCENTY, so THOMAS did not survive a
+            // write/read round trip.
+            protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_THOMAS);
+            break;
+          }
+          case geospatial::EdgeInterpolationAlgorithm::ANDOYER: {
+            protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_ANDOYER);
+            break;
+          }
+          case geospatial::EdgeInterpolationAlgorithm::KARNEY: {
+            protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_KARNEY);
+            break;
+          }
+          default:
+            throw std::invalid_argument("Unknown Algorithm.");
+        }
+        break;
+      }
default:
throw std::logic_error("Unknown type.");
}
diff --git a/c++/src/meson.build b/c++/src/meson.build
index 3d77d3242..0794dec84 100644
--- a/c++/src/meson.build
+++ b/c++/src/meson.build
@@ -151,6 +151,7 @@ source_files += files(
'ConvertColumnReader.cc',
'CpuInfoUtil.cc',
'Exceptions.cc',
+ 'Geospatial.cc',
'Int128.cc',
'LzoDecompressor.cc',
'MemoryPool.cc',
diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt
index f7328abb3..3261fedde 100644
--- a/c++/test/CMakeLists.txt
+++ b/c++/test/CMakeLists.txt
@@ -56,12 +56,14 @@ add_executable (orc-test
TestRleEncoder.cc
TestRLEV2Util.cc
TestSargsApplier.cc
+ TestStatistics.cc
TestSearchArgument.cc
TestSchemaEvolution.cc
TestStripeIndexStatistics.cc
TestTimestampStatistics.cc
TestTimezone.cc
TestType.cc
+ TestUtil.cc
TestWriter.cc
TestCache.cc
${SIMD_TEST_SRCS}
diff --git a/c++/test/TestColumnStatistics.cc b/c++/test/TestColumnStatistics.cc
index 5cf2d9e41..642a8019d 100644
--- a/c++/test/TestColumnStatistics.cc
+++ b/c++/test/TestColumnStatistics.cc
@@ -17,6 +17,7 @@
*/
#include "Statistics.hh"
+#include "TestUtil.hh"
#include "orc/OrcFile.hh"
#include "wrap/gmock.h"
#include "wrap/gtest-wrapper.h"
@@ -531,4 +532,345 @@ namespace orc {
collectionStats->merge(*other);
EXPECT_FALSE(collectionStats->hasTotalChildren());
}
+
+ TEST(ColumnStatistics, TestGeospatialDefaults) {  // A default-constructed object has no types and every dimension is valid but empty.
+ std::unique_ptr<GeospatialColumnStatisticsImpl> geoStats(new
GeospatialColumnStatisticsImpl());
+ EXPECT_TRUE(geoStats->getGeospatialTypes().empty());
+ auto bbox = geoStats->getBoundingBox();
+ for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) {
+ EXPECT_TRUE(bbox.boundEmpty(i));
+ EXPECT_TRUE(bbox.boundValid(i));
+ }
+ EXPECT_EQ("<GeoStatistics> x: empty y: empty z: empty m: empty
geometry_types: []",
+ geoStats->toString());
+ }
+
+ TEST(ColumnStatistics, TestGeospatialUpdate) {  // Feeds points one at a time and re-checks bounds, validity and types after each.
+ std::unique_ptr<GeospatialColumnStatisticsImpl> geoStats(new
GeospatialColumnStatisticsImpl());
+ EXPECT_TRUE(geoStats->getGeospatialTypes().empty());
+ const auto& bbox = geoStats->getBoundingBox();
+ for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) {
+ EXPECT_TRUE(bbox.boundEmpty(i));
+ EXPECT_TRUE(bbox.boundValid(i));
+ }
+ EXPECT_EQ(geoStats->getGeospatialTypes().size(), 0);
+
+ geospatial::BoundingBox::XYZM expectedMin;
+ geospatial::BoundingBox::XYZM expectedMax;
+ std::array<bool, geospatial::MAX_DIMENSIONS> expectedEmpty;
+ std::array<bool, geospatial::MAX_DIMENSIONS> expectedValid;
+ std::vector<int32_t> expectedTypes;
+ for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) {
+ expectedMin[i] = geospatial::INF;
+ expectedMax[i] = -geospatial::INF;
+ expectedEmpty[i] = true;
+ expectedValid[i] = true;
+ }
+
+ auto Verify = [&]() {  // Compares the statistics against the expected* variables above.
+ EXPECT_EQ(expectedEmpty, geoStats->getBoundingBox().dimensionEmpty());
+ EXPECT_EQ(expectedValid, geoStats->getBoundingBox().dimensionValid());
+ EXPECT_EQ(expectedTypes, geoStats->getGeospatialTypes());
+ for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) {
+ if (geoStats->getBoundingBox().boundValid(i)) {
+ EXPECT_EQ(expectedMin[i],
geoStats->getBoundingBox().lowerBound()[i]);
+ EXPECT_EQ(expectedMax[i],
geoStats->getBoundingBox().upperBound()[i]);
+ } else {
+ EXPECT_TRUE(std::isnan(geoStats->getBoundingBox().lowerBound()[i]));
+ EXPECT_TRUE(std::isnan(geoStats->getBoundingBox().upperBound()[i]));
+ }
+ }
+ };
+
+ // Update an xy point: x and y become non-empty and type code 1 is expected.
+ std::string xy0 = MakeWKBPoint({10, 11}, false, false);
+ geoStats->update(xy0.c_str(), xy0.size());
+ expectedMin[0] = expectedMax[0] = 10;
+ expectedMin[1] = expectedMax[1] = 11;
+ expectedEmpty[0] = expectedEmpty[1] = false;
+ expectedTypes.push_back(1);
+ Verify();
+
+ // Update an xyz point: z becomes non-empty and type code 1001 is expected.
+ std::string xyz0 = MakeWKBPoint({11, 12, 13}, true, false);
+ geoStats->update(xyz0.c_str(), xyz0.size());
+ expectedMax[0] = 11;
+ expectedMax[1] = 12;
+ expectedMin[2] = expectedMax[2] = 13;
+ expectedEmpty[2] = false;
+ expectedTypes.push_back(1001);
+ Verify();
+
+ // Update an xym point: m becomes non-empty and type code 2001 is expected.
+ std::string xym0 = MakeWKBPoint({9, 10, 0, 11}, false, true);
+ geoStats->update(xym0.c_str(), xym0.size());
+ expectedMin[0] = 9;
+ expectedMin[1] = 10;
+ expectedMin[3] = expectedMax[3] = 11;
+ expectedEmpty[3] = false;
+ expectedTypes.push_back(2001);
+ Verify();
+
+ // Update an xyzm point: type code 3001 is expected.
+ std::string xymz0 = MakeWKBPoint({8, 9, 10, 12}, true, true);
+ geoStats->update(xymz0.c_str(), xymz0.size());
+ expectedMin[0] = 8;
+ expectedMin[1] = 9;
+ expectedMin[2] = 10;
+ expectedMax[3] = 12;
+ expectedTypes.push_back(3001);
+ Verify();
+
+ // Update a point whose coordinates are all NaN; the expectations are left unchanged.
+ std::string xyzm1 = MakeWKBPoint(
+ {std::numeric_limits<double>::quiet_NaN(),
std::numeric_limits<double>::quiet_NaN(),
+ std::numeric_limits<double>::quiet_NaN(),
std::numeric_limits<double>::quiet_NaN()},
+ true, false);
+ geoStats->update(xyzm1.c_str(), xyzm1.size());
+ Verify();
+
+ // Update an invalid (empty) WKB: every dimension is expected to turn invalid and the types to be cleared.
+ std::string invalidWKB;
+ geoStats->update(invalidWKB.c_str(), invalidWKB.size());
+ expectedValid[0] = expectedValid[1] = expectedValid[2] = expectedValid[3]
= false;
+ expectedTypes.clear();
+ Verify();
+
+ // Update an xy point again: the expectations stay as they were after the invalid update.
+ std::string xy1 = MakeWKBPoint({10, 11}, false, false);
+ geoStats->update(xy1.c_str(), xy1.size());
+ Verify();
+ }
+
+ TEST(ColumnStatistics, TestGeospatialToProto) {  // Checks the protobuf produced for empty, xy, xyzm and invalidated statistics.
+ // Empty statistics: the geospatial message is present but has no bbox and no types.
+ std::unique_ptr<GeospatialColumnStatisticsImpl> geoStats(new
GeospatialColumnStatisticsImpl());
+ proto::ColumnStatistics pbStats;
+ geoStats->toProtoBuf(pbStats);
+ EXPECT_TRUE(pbStats.has_geospatial_statistics());
+ EXPECT_EQ(0, pbStats.geospatial_statistics().geospatial_types().size());
+ EXPECT_FALSE(pbStats.geospatial_statistics().has_bbox());
+
+ // Update an xy point: only the x/y bounds are expected in the bbox.
+ std::string xy = MakeWKBPoint({10, 11}, false, false);
+ geoStats->update(xy.c_str(), xy.size());
+ pbStats.Clear();
+ geoStats->toProtoBuf(pbStats);
+ EXPECT_TRUE(pbStats.has_geospatial_statistics());
+ EXPECT_EQ(1, pbStats.geospatial_statistics().geospatial_types().size());
+ EXPECT_EQ(1, pbStats.geospatial_statistics().geospatial_types(0));
+ EXPECT_TRUE(pbStats.geospatial_statistics().has_bbox());
+ const auto& bbox0 = pbStats.geospatial_statistics().bbox();
+ EXPECT_TRUE(bbox0.has_xmin());
+ EXPECT_TRUE(bbox0.has_xmax());
+ EXPECT_TRUE(bbox0.has_ymin());
+ EXPECT_TRUE(bbox0.has_ymax());
+ EXPECT_FALSE(bbox0.has_zmin());
+ EXPECT_FALSE(bbox0.has_zmax());
+ EXPECT_FALSE(bbox0.has_mmin());
+ EXPECT_FALSE(bbox0.has_mmax());
+ EXPECT_EQ(10, bbox0.xmin());
+ EXPECT_EQ(10, bbox0.xmax());
+ EXPECT_EQ(11, bbox0.ymin());
+ EXPECT_EQ(11, bbox0.ymax());
+
+ // Update an xyzm point: all four dimensions are expected in the bbox.
+ std::string xyzm = MakeWKBPoint({-10, -11, -12, -13}, true, true);
+ geoStats->update(xyzm.c_str(), xyzm.size());
+ pbStats.Clear();
+ geoStats->toProtoBuf(pbStats);
+ EXPECT_TRUE(pbStats.has_geospatial_statistics());
+ EXPECT_EQ(2, pbStats.geospatial_statistics().geospatial_types().size());
+ EXPECT_EQ(1, pbStats.geospatial_statistics().geospatial_types(0));
+ EXPECT_EQ(3001, pbStats.geospatial_statistics().geospatial_types(1));
+ EXPECT_TRUE(pbStats.geospatial_statistics().has_bbox());
+ const auto& bbox1 = pbStats.geospatial_statistics().bbox();
+ EXPECT_TRUE(bbox1.has_xmin());
+ EXPECT_TRUE(bbox1.has_xmax());
+ EXPECT_TRUE(bbox1.has_ymin());
+ EXPECT_TRUE(bbox1.has_ymax());
+ EXPECT_TRUE(bbox1.has_zmin());
+ EXPECT_TRUE(bbox1.has_zmax());
+ EXPECT_TRUE(bbox1.has_mmin());
+ EXPECT_TRUE(bbox1.has_mmax());
+ EXPECT_EQ(-10, bbox1.xmin());
+ EXPECT_EQ(10, bbox1.xmax());
+ EXPECT_EQ(-11, bbox1.ymin());
+ EXPECT_EQ(11, bbox1.ymax());
+ EXPECT_EQ(-12, bbox1.zmin());
+ EXPECT_EQ(-12, bbox1.zmax());
+ EXPECT_EQ(-13, bbox1.mmin());
+ EXPECT_EQ(-13, bbox1.mmax());
+
+ // Update an invalid (empty) WKB: the bbox and the types are expected to disappear.
+ std::string invalidWKB;
+ geoStats->update(invalidWKB.c_str(), invalidWKB.size());
+ pbStats.Clear();
+ geoStats->toProtoBuf(pbStats);
+ EXPECT_TRUE(pbStats.has_geospatial_statistics());
+ EXPECT_EQ(0, pbStats.geospatial_statistics().geospatial_types().size());
+ EXPECT_FALSE(pbStats.geospatial_statistics().has_bbox());
+ }
+
+ TEST(ColumnStatistics, TestGeospatialMerge) {  // Merges invalid, empty, xy, xyz and xyzm statistics and checks the combined result.
+ std::unique_ptr<GeospatialColumnStatisticsImpl> invalidStats(
+ new GeospatialColumnStatisticsImpl());
+ invalidStats->update("0", 0);  // zero-length input, used here to produce invalid statistics
+
+ std::unique_ptr<GeospatialColumnStatisticsImpl> emptyStats(
+ new GeospatialColumnStatisticsImpl());
+
+ std::unique_ptr<GeospatialColumnStatisticsImpl> xyStats(new
GeospatialColumnStatisticsImpl());
+ std::string xy = MakeWKBPoint({10, 11}, false, false);
+ xyStats->update(xy.c_str(), xy.size());
+
+ std::unique_ptr<GeospatialColumnStatisticsImpl> xyzStats(new
GeospatialColumnStatisticsImpl());
+ std::string xyz = MakeWKBPoint({12, 13, 14}, true, false);
+ xyzStats->update(xyz.c_str(), xyz.size());
+
+ std::unique_ptr<GeospatialColumnStatisticsImpl> xyzmStats(new
GeospatialColumnStatisticsImpl());
+ std::string xyzm = MakeWKBPoint({-10, -11, -12, -13}, true, true);
+ xyzmStats->update(xyzm.c_str(), xyzm.size());
+
+ // Invalid merged with itself stays invalid.
+ invalidStats->merge(*invalidStats);
+ std::array<bool, 4> expectedValid = {false, false, false, false};
+ EXPECT_EQ(invalidStats->getBoundingBox().dimensionValid(), expectedValid);
+ EXPECT_EQ(invalidStats->getGeospatialTypes().size(), 0);
+
+ // Empty merged with itself stays valid and empty.
+ emptyStats->merge(*emptyStats);
+ expectedValid = {true, true, true, true};
+ std::array<bool, 4> expectedEmpty = {true, true, true, true};
+ EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid);
+ EXPECT_EQ(emptyStats->getBoundingBox().dimensionEmpty(), expectedEmpty);
+ EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 0);
+
+ // Empty merge xy
+ emptyStats->merge(*xyStats);
+ expectedEmpty = {false, false, true, true};
+ EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid);
+ EXPECT_EQ(emptyStats->getBoundingBox().dimensionEmpty(), expectedEmpty);
+ EXPECT_EQ(10, emptyStats->getBoundingBox().lowerBound()[0]);
+ EXPECT_EQ(10, emptyStats->getBoundingBox().upperBound()[0]);
+ EXPECT_EQ(11, emptyStats->getBoundingBox().lowerBound()[1]);
+ EXPECT_EQ(11, emptyStats->getBoundingBox().upperBound()[1]);
+ EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 1);
+ EXPECT_EQ(emptyStats->getGeospatialTypes()[0], 1);
+
+ // Merge xyz into the accumulated (formerly empty) statistics.
+ emptyStats->merge(*xyzStats);
+ expectedEmpty = {false, false, false, true};
+ EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid);
+ EXPECT_EQ(emptyStats->getBoundingBox().dimensionEmpty(), expectedEmpty);
+ EXPECT_EQ(10, emptyStats->getBoundingBox().lowerBound()[0]);
+ EXPECT_EQ(12, emptyStats->getBoundingBox().upperBound()[0]);
+ EXPECT_EQ(11, emptyStats->getBoundingBox().lowerBound()[1]);
+ EXPECT_EQ(13, emptyStats->getBoundingBox().upperBound()[1]);
+ EXPECT_EQ(14, emptyStats->getBoundingBox().lowerBound()[2]);
+ EXPECT_EQ(14, emptyStats->getBoundingBox().upperBound()[2]);
+ EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 2);
+ EXPECT_EQ(emptyStats->getGeospatialTypes()[0], 1);
+ EXPECT_EQ(emptyStats->getGeospatialTypes()[1], 1001);
+
+ // Merge xyzm into the accumulated statistics.
+ emptyStats->merge(*xyzmStats);
+ expectedEmpty = {false, false, false, false};
+ EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid);
+ EXPECT_EQ(emptyStats->getBoundingBox().dimensionEmpty(), expectedEmpty);
+ EXPECT_EQ(-10, emptyStats->getBoundingBox().lowerBound()[0]);
+ EXPECT_EQ(12, emptyStats->getBoundingBox().upperBound()[0]);
+ EXPECT_EQ(-11, emptyStats->getBoundingBox().lowerBound()[1]);
+ EXPECT_EQ(13, emptyStats->getBoundingBox().upperBound()[1]);
+ EXPECT_EQ(-12, emptyStats->getBoundingBox().lowerBound()[2]);
+ EXPECT_EQ(14, emptyStats->getBoundingBox().upperBound()[2]);
+ EXPECT_EQ(-13, emptyStats->getBoundingBox().lowerBound()[3]);
+ EXPECT_EQ(-13, emptyStats->getBoundingBox().upperBound()[3]);
+ EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 3);
+ EXPECT_EQ(emptyStats->getGeospatialTypes()[0], 1);
+ EXPECT_EQ(emptyStats->getGeospatialTypes()[1], 1001);
+ EXPECT_EQ(emptyStats->getGeospatialTypes()[2], 3001);
+
+ // Merging invalid statistics is expected to invalidate the accumulated result.
+ emptyStats->merge(*invalidStats);
+ expectedValid = {false, false, false, false};
+ EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid);
+ EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 0);
+ }
+
+ TEST(ColumnStatistics, TestGeospatialFromProto) {  // Builds statistics from progressively richer protobuf messages.
+ proto::ColumnStatistics pbStats;
+ // No geospatial statistics in the message: every dimension is expected to be invalid.
+
+ std::unique_ptr<GeospatialColumnStatisticsImpl> emptyStats0(
+ new GeospatialColumnStatisticsImpl(pbStats));
+ std::array<bool, 4> expectedValid = {false, false, false, false};
+ EXPECT_TRUE(emptyStats0->getGeospatialTypes().empty());
+ EXPECT_EQ(emptyStats0->getBoundingBox().dimensionValid(), expectedValid);
+
+ // Geospatial statistics present but without a bbox: still expected to be invalid.
+ pbStats.mutable_geospatial_statistics();
+ std::unique_ptr<GeospatialColumnStatisticsImpl> emptyStats1(
+ new GeospatialColumnStatisticsImpl(pbStats));
+ EXPECT_TRUE(emptyStats1->getGeospatialTypes().empty());
+ EXPECT_EQ(emptyStats1->getBoundingBox().dimensionValid(), expectedValid);
+
+ // Set xy bounds: only x and y are expected to be valid.
+ auto* geoProtoStas = pbStats.mutable_geospatial_statistics();
+ geoProtoStas->mutable_bbox()->set_xmin(0);
+ geoProtoStas->mutable_bbox()->set_xmax(1);
+ geoProtoStas->mutable_bbox()->set_ymin(0);
+ geoProtoStas->mutable_bbox()->set_ymax(1);
+ geoProtoStas->mutable_geospatial_types()->Add(2);
+ std::unique_ptr<GeospatialColumnStatisticsImpl> xyStats(
+ new GeospatialColumnStatisticsImpl(pbStats));
+ expectedValid = {true, true, false, false};
+ EXPECT_EQ(xyStats->getGeospatialTypes().size(), 1);
+ EXPECT_EQ(xyStats->getGeospatialTypes()[0], 2);
+ EXPECT_EQ(xyStats->getBoundingBox().dimensionValid(), expectedValid);
+ EXPECT_EQ(0, xyStats->getBoundingBox().lowerBound()[0]);
+ EXPECT_EQ(1, xyStats->getBoundingBox().upperBound()[0]);
+ EXPECT_EQ(0, xyStats->getBoundingBox().lowerBound()[1]);
+ EXPECT_EQ(1, xyStats->getBoundingBox().upperBound()[1]);
+
+ // Set xyz bounds: z is expected to become valid as well.
+ geoProtoStas->mutable_bbox()->set_zmin(0);
+ geoProtoStas->mutable_bbox()->set_zmax(1);
+ geoProtoStas->mutable_geospatial_types()->Add(1003);
+ std::unique_ptr<GeospatialColumnStatisticsImpl> xyzStats(
+ new GeospatialColumnStatisticsImpl(pbStats));
+ expectedValid = {true, true, true, false};
+ EXPECT_EQ(xyzStats->getGeospatialTypes().size(), 2);
+ EXPECT_EQ(xyzStats->getGeospatialTypes()[0], 2);
+ EXPECT_EQ(xyzStats->getGeospatialTypes()[1], 1003);
+ EXPECT_EQ(xyzStats->getBoundingBox().dimensionValid(), expectedValid);
+ EXPECT_EQ(0, xyzStats->getBoundingBox().lowerBound()[0]);
+ EXPECT_EQ(1, xyzStats->getBoundingBox().upperBound()[0]);
+ EXPECT_EQ(0, xyzStats->getBoundingBox().lowerBound()[1]);
+ EXPECT_EQ(1, xyzStats->getBoundingBox().upperBound()[1]);
+ EXPECT_EQ(0, xyzStats->getBoundingBox().lowerBound()[2]);
+ EXPECT_EQ(1, xyzStats->getBoundingBox().upperBound()[2]);
+
+ // Set xyzm bounds: all four dimensions are expected to be valid.
+ geoProtoStas->mutable_bbox()->set_mmin(0);
+ geoProtoStas->mutable_bbox()->set_mmax(1);
+ geoProtoStas->mutable_geospatial_types()->Add(3003);
+ std::unique_ptr<GeospatialColumnStatisticsImpl> xyzmStats(
+ new GeospatialColumnStatisticsImpl(pbStats));
+ expectedValid = {true, true, true, true};
+ EXPECT_EQ(xyzmStats->getGeospatialTypes().size(), 3);
+ EXPECT_EQ(xyzmStats->getGeospatialTypes()[0], 2);
+ EXPECT_EQ(xyzmStats->getGeospatialTypes()[1], 1003);
+ EXPECT_EQ(xyzmStats->getGeospatialTypes()[2], 3003);
+ EXPECT_EQ(xyzmStats->getBoundingBox().dimensionValid(), expectedValid);
+ EXPECT_EQ(0, xyzmStats->getBoundingBox().lowerBound()[0]);
+ EXPECT_EQ(1, xyzmStats->getBoundingBox().upperBound()[0]);
+ EXPECT_EQ(0, xyzmStats->getBoundingBox().lowerBound()[1]);
+ EXPECT_EQ(1, xyzmStats->getBoundingBox().upperBound()[1]);
+ EXPECT_EQ(0, xyzmStats->getBoundingBox().lowerBound()[2]);
+ EXPECT_EQ(1, xyzmStats->getBoundingBox().upperBound()[2]);
+ EXPECT_EQ(0, xyzmStats->getBoundingBox().lowerBound()[3]);
+ EXPECT_EQ(1, xyzmStats->getBoundingBox().upperBound()[3]);
+ }
+
} // namespace orc
diff --git a/c++/test/TestStatistics.cc b/c++/test/TestStatistics.cc
new file mode 100644
index 000000000..61c5e08cb
--- /dev/null
+++ b/c++/test/TestStatistics.cc
@@ -0,0 +1,230 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "orc/OrcFile.hh"
+
+#include "MemoryInputStream.hh"
+#include "MemoryOutputStream.hh"
+#include "TestUtil.hh"
+
+#include "wrap/gtest-wrapper.h"
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+namespace orc {
+
+// Throws std::logic_error when PTR (in this file: the result of a dynamic_cast) is null.
+// The do/while(0) wrapper makes the macro expand to exactly one statement, so it is
+// safe inside an unbraced if/else; PTR is parenthesized so any expression can be passed.
+// Callers supply the trailing semicolon.
+#define ENSURE_DYNAMIC_CAST_NOT_NULL(PTR)                    \
+ do {                                                       \
+   if ((PTR) == nullptr) {                                  \
+     throw std::logic_error("dynamic_cast returns null");   \
+   }                                                        \
+ } while (0)
+
+ // Capacity handed to MemoryOutputStream in the tests below: 1 MiB.
+ constexpr int DEFAULT_MEM_STREAM_SIZE = 1 << 20;
+
+ // Creates a Writer for `type` that emits into `stream`, using ZLIB compression,
+ // 256-byte compression and memory blocks, and a row index stride of 10.
+ static std::unique_ptr<Writer> createWriter(uint64_t stripeSize, const Type& type,
+                                             MemoryPool* memoryPool, OutputStream* stream) {
+   WriterOptions writerOptions;
+   writerOptions.setMemoryPool(memoryPool);
+   writerOptions.setStripeSize(stripeSize);
+   writerOptions.setCompressionBlockSize(256);
+   writerOptions.setMemoryBlockSize(256);
+   writerOptions.setCompression(CompressionKind_ZLIB);
+   writerOptions.setRowIndexStride(10);
+   return createWriter(type, stream, writerOptions);
+ }
+
+ // Opens a Reader over the bytes currently held by `memStream`, allocating from
+ // `memoryPool`.
+ static std::unique_ptr<Reader> createReader(MemoryPool* memoryPool,
+                                             MemoryOutputStream& memStream) {
+   ReaderOptions readerOptions;
+   readerOptions.setMemoryPool(*memoryPool);
+   std::unique_ptr<InputStream> input =
+       std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
+   return createReader(std::move(input), readerOptions);
+ }
+
+ // Writes a geometry column whose first and last row of the batch are null, repeats
+ // the batch several times with a tiny stripe size, and checks the file-level
+ // geospatial statistics (geometry type codes and bounding box).
+ TEST(Statistics, geometryStatsWithNull) {
+   std::unique_ptr<Type> const schema(
+       Type::buildTypeFromString("struct<col1:geometry(OGC:CRS84)>"));
+
+   MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+   MemoryPool* const pool = getDefaultPool();
+   uint64_t const stripeSize = 32;  // small stripe size to guarantee multiple stripes
+   std::unique_ptr<Writer> writer = createWriter(stripeSize, *schema, pool, &memStream);
+
+   uint64_t const rowsPerBatch = 1000;
+   uint64_t const batchRepeats = 10;
+   std::unique_ptr<ColumnVectorBatch> const batch = writer->createRowBatch(rowsPerBatch);
+   auto* structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+   ENSURE_DYNAMIC_CAST_NOT_NULL(structBatch);
+
+   auto* strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+   ENSURE_DYNAMIC_CAST_NOT_NULL(strBatch);
+
+   // Rows 1 .. rowsPerBatch-2 carry a point whose coordinates all equal the row index.
+   // The dimensionality cycles with row % 3: 0 -> XY, 1 -> XYZ, 2 -> XYZM. The expected
+   // bounds are tracked only for the dimensions each point actually has.
+   std::vector<std::string> wkbs;
+   std::array<double, 4> mins = {geospatial::INF, geospatial::INF, geospatial::INF,
+                                 geospatial::INF};
+   std::array<double, 4> maxs = {-geospatial::INF, -geospatial::INF, -geospatial::INF,
+                                 -geospatial::INF};
+   for (uint64_t row = 1; row < rowsPerBatch - 1; ++row) {
+     double const coord = row * 1.0;
+     uint64_t const extraDims = row % 3;
+     std::vector<double> const point(2 + extraDims, coord);
+     wkbs.push_back(MakeWKBPoint(point, extraDims >= 1, extraDims == 2));
+     for (size_t dim = 0; dim < point.size(); ++dim) {
+       mins[dim] = std::min(mins[dim], coord);
+       maxs[dim] = std::max(maxs[dim], coord);
+     }
+   }
+
+   // wkbs[k] belongs to row k + 1; the batch only borrows the string storage.
+   for (size_t k = 0; k < wkbs.size(); ++k) {
+     strBatch->data[k + 1] = const_cast<char*>(wkbs[k].c_str());
+     strBatch->length[k + 1] = static_cast<int32_t>(wkbs[k].length());
+   }
+
+   structBatch->numElements = rowsPerBatch;
+   strBatch->numElements = rowsPerBatch;
+
+   // First and last row are null at both the struct and the column level.
+   uint64_t const lastRow = rowsPerBatch - 1;
+   structBatch->hasNulls = true;
+   structBatch->notNull[0] = '\0';
+   structBatch->notNull[lastRow] = '\0';
+   strBatch->hasNulls = true;
+   strBatch->notNull[0] = '\0';
+   strBatch->notNull[lastRow] = '\0';
+
+   for (uint64_t n = 0; n < batchRepeats; ++n) {
+     writer->add(*batch);
+   }
+   writer->close();
+
+   std::unique_ptr<Reader> reader = createReader(pool, memStream);
+
+   // check column 1 (geometry) file stats
+   auto stats1 = reader->getColumnStatistics(1);
+   auto const* geoFileStats = dynamic_cast<const GeospatialColumnStatistics*>(stats1.get());
+   ENSURE_DYNAMIC_CAST_NOT_NULL(geoFileStats);
+
+   // Expected type codes: 1, 1001 and 3001 for the XY, XYZ and XYZM points.
+   EXPECT_EQ(geoFileStats->getGeospatialTypes().size(), 3);
+   EXPECT_EQ(geoFileStats->getGeospatialTypes()[0], 1);
+   EXPECT_EQ(geoFileStats->getGeospatialTypes()[1], 1001);
+   EXPECT_EQ(geoFileStats->getGeospatialTypes()[2], 3001);
+
+   std::array<bool, 4> expectValid = {true, true, true, true};
+   std::array<bool, 4> expectEmpty = {false, false, false, false};
+   EXPECT_EQ(geoFileStats->getBoundingBox().dimensionValid(), expectValid);
+   EXPECT_EQ(geoFileStats->getBoundingBox().dimensionEmpty(), expectEmpty);
+   EXPECT_EQ(geoFileStats->getBoundingBox().lowerBound(), mins);
+   EXPECT_EQ(geoFileStats->getBoundingBox().upperBound(), maxs);
+ }
+
+ // Writes a geography column with the same point data and null pattern as the
+ // geometry test and checks that the file-level statistics carry no geospatial types
+ // and no valid bounding-box dimension.
+ TEST(Statistics, geographyStatsWithNull) {
+   std::unique_ptr<Type> const schema(
+       Type::buildTypeFromString("struct<col1:geography(OGC:CRS84,speherial)>"));
+
+   MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+   MemoryPool* const pool = getDefaultPool();
+   uint64_t const stripeSize = 32;  // small stripe size to guarantee multiple stripes
+   std::unique_ptr<Writer> writer = createWriter(stripeSize, *schema, pool, &memStream);
+
+   uint64_t const rowsPerBatch = 1000;
+   uint64_t const batchRepeats = 10;
+   std::unique_ptr<ColumnVectorBatch> const batch = writer->createRowBatch(rowsPerBatch);
+   auto* structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+   ENSURE_DYNAMIC_CAST_NOT_NULL(structBatch);
+
+   auto* strBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+   ENSURE_DYNAMIC_CAST_NOT_NULL(strBatch);
+
+   // Rows 1 .. rowsPerBatch-2 carry a point whose coordinates all equal the row index.
+   // The dimensionality cycles with row % 3: 0 -> XY, 1 -> XYZ, 2 -> XYZM. No expected
+   // bounds are tracked here because this test asserts that none are reported.
+   std::vector<std::string> wkbs;
+   for (uint64_t row = 1; row < rowsPerBatch - 1; ++row) {
+     uint64_t const extraDims = row % 3;
+     std::vector<double> const point(2 + extraDims, row * 1.0);
+     wkbs.push_back(MakeWKBPoint(point, extraDims >= 1, extraDims == 2));
+   }
+
+   // wkbs[k] belongs to row k + 1; the batch only borrows the string storage.
+   for (size_t k = 0; k < wkbs.size(); ++k) {
+     strBatch->data[k + 1] = const_cast<char*>(wkbs[k].c_str());
+     strBatch->length[k + 1] = static_cast<int64_t>(wkbs[k].length());
+   }
+
+   structBatch->numElements = rowsPerBatch;
+   strBatch->numElements = rowsPerBatch;
+
+   // First and last row are null at both the struct and the column level.
+   uint64_t const lastRow = rowsPerBatch - 1;
+   structBatch->hasNulls = true;
+   structBatch->notNull[0] = '\0';
+   structBatch->notNull[lastRow] = '\0';
+   strBatch->hasNulls = true;
+   strBatch->notNull[0] = '\0';
+   strBatch->notNull[lastRow] = '\0';
+
+   for (uint64_t n = 0; n < batchRepeats; ++n) {
+     writer->add(*batch);
+   }
+   writer->close();
+
+   std::unique_ptr<Reader> reader = createReader(pool, memStream);
+
+   // check column 1 (geography) file stats
+   auto stats1 = reader->getColumnStatistics(1);
+   auto const* geoFileStats = dynamic_cast<const GeospatialColumnStatistics*>(stats1.get());
+   ENSURE_DYNAMIC_CAST_NOT_NULL(geoFileStats);
+   EXPECT_EQ(geoFileStats->getGeospatialTypes().size(), 0);
+   std::array<bool, 4> expectValid = {false, false, false, false};
+   EXPECT_EQ(geoFileStats->getBoundingBox().dimensionValid(), expectValid);
+ }
+} // namespace orc
\ No newline at end of file
diff --git a/c++/test/TestUtil.cc b/c++/test/TestUtil.cc
new file mode 100644
index 000000000..a76880340
--- /dev/null
+++ b/c++/test/TestUtil.cc
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "TestUtil.hh"
+
+#include <cassert>
+#include <cstring>
+#include <stdexcept>
+
+namespace orc {
+ // Returns the WKB type code for `geometryType`: the enum's integer value, plus 1000
+ // when a Z coordinate is present and plus 2000 when an M coordinate is present.
+ uint32_t GeometryTypeToWKB(geospatial::GeometryType geometryType, bool hasZ, bool hasM) {
+   uint32_t const zOffset = hasZ ? 1000 : 0;
+   uint32_t const mOffset = hasM ? 2000 : 0;
+   return static_cast<uint32_t>(geometryType) + zOffset + mOffset;
+ }
+
+ // Serializes one point as WKB using the host byte order.
+ //
+ // Layout: 1 byte-order byte, a 4-byte geometry type code, X and Y as doubles, then
+ // Z when hasZ is set and M when hasM is set.
+ //
+ // `xyzm` is read positionally: [0]=X, [1]=Y, [2]=Z, [3]=M. M always comes from
+ // index 3, so a point with M but without Z still needs four elements (index 2 is
+ // then ignored).
+ //
+ // Throws std::invalid_argument when `xyzm` is too short for the requested
+ // dimensions, instead of reading past the end of the vector.
+ std::string MakeWKBPoint(const std::vector<double>& xyzm, bool hasZ, bool hasM) {
+   const size_t requiredCoords = hasM ? 4 : (hasZ ? 3 : 2);
+   if (xyzm.size() < requiredCoords) {
+     throw std::invalid_argument("MakeWKBPoint: too few coordinates for requested dimensions");
+   }
+
+   // 1:endianness + 4:type + 8:x + 8:y, plus 8 bytes per optional dimension
+   const size_t numBytes = static_cast<size_t>(kWkbPointXYSize) +
+                           (hasZ ? sizeof(double) : 0) + (hasM ? sizeof(double) : 0);
+   std::string wkb(numBytes, '\0');
+   char* ptr = wkb.data();
+
+   ptr[0] = static_cast<char>(kWkbNativeEndianness);
+   const uint32_t geomType = GeometryTypeToWKB(geospatial::GeometryType::POINT, hasZ, hasM);
+   std::memcpy(ptr + 1, &geomType, sizeof(geomType));
+   std::memcpy(ptr + 5, &xyzm[0], sizeof(double));
+   std::memcpy(ptr + 13, &xyzm[1], sizeof(double));
+   ptr += kWkbPointXYSize;
+
+   if (hasZ) {
+     std::memcpy(ptr, &xyzm[2], sizeof(double));
+     ptr += sizeof(double);
+   }
+
+   if (hasM) {
+     std::memcpy(ptr, &xyzm[3], sizeof(double));
+     ptr += sizeof(double);
+   }
+
+   // Every byte of the buffer must have been accounted for.
+   assert(static_cast<size_t>(ptr - wkb.data()) == wkb.length());
+   return wkb;
+ }
+
+} // namespace orc
\ No newline at end of file
diff --git a/c++/test/TestUtil.hh b/c++/test/TestUtil.hh
new file mode 100644
index 000000000..104fbc039
--- /dev/null
+++ b/c++/test/TestUtil.hh
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include "orc/Geospatial.hh"
+
+#include <cstdint>
+#include <vector>
+
+namespace orc {
+
+ /// \brief Size in bytes of a WKB point that carries only X and Y: one byte-order
+ /// byte, a uint32_t geometry type and two double coordinates.
+ static constexpr int kWkbPointXYSize = 1 + 4 + 2 * 8;
+
+ /// \brief Reports whether the host stores the least significant byte first.
+ ///
+ /// Looks at the first byte of a known 32-bit pattern through an unsigned char
+ /// pointer, which is well-defined, rather than reading an inactive union member.
+ static bool isLittleEndian() {
+   const uint32_t probe = 0x01020304;
+   return *reinterpret_cast<const unsigned char*>(&probe) == 4;
+ }
+
+ /// \brief Byte-order marker written as the first WKB byte: 0x01 when the host is
+ /// little-endian, 0x00 otherwise. Declared const so no translation unit can
+ /// accidentally modify its copy.
+ static const uint8_t kWkbNativeEndianness = isLittleEndian() ? 0x01 : 0x00;
+
+ /// \brief WKB type code for `geometryType`; hasZ adds 1000 and hasM adds 2000.
+ uint32_t GeometryTypeToWKB(geospatial::GeometryType geometryType, bool hasZ, bool hasM);
+
+ /// \brief WKB encoding of a single point in host byte order. `xyzm` is read
+ /// positionally (X, Y, Z, M); Z is taken from index 2 and M from index 3.
+ std::string MakeWKBPoint(const std::vector<double>& xyzm, bool hasZ, bool hasM);
+
+} // namespace orc
\ No newline at end of file
diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc
index 975462e30..11ba0c9de 100644
--- a/c++/test/TestWriter.cc
+++ b/c++/test/TestWriter.cc
@@ -16,18 +16,20 @@
* limitations under the License.
*/
-#include "orc/ColumnPrinter.hh"
+#include <gtest/gtest.h>
#include "orc/OrcFile.hh"
#include "MemoryInputStream.hh"
#include "MemoryOutputStream.hh"
#include "Reader.hh"
+#include "TestUtil.hh"
#include "wrap/gmock.h"
#include "wrap/gtest-wrapper.h"
#include <cmath>
#include <ctime>
+#include <memory>
#include <sstream>
#ifdef __clang__
@@ -2400,6 +2402,139 @@ namespace orc {
EXPECT_FALSE(rowReader->next(*batch));
}
+ // Round-trips a struct with one geometry and one geography column.
+ //
+ // Writes batchCount batches of batchSize rows in which every even-numbered row is
+ // null and every odd-numbered row holds the XY point (j, j), j being the row's index
+ // inside its batch. The data is then verified through a sequential reader and through
+ // seekToRow().
+ //
+ // WKB is binary and contains NUL bytes, so payloads are copied and compared with
+ // memcpy/memcmp; the str* functions would stop at the first NUL.
+ TEST_P(WriterTest, writeGeometryAndGeographyColumn) {
+   MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
+   MemoryPool* pool = getDefaultPool();
+   const std::string schema =
+       "struct<col1:geometry(OGC:CRS84),col2:geography(OGC:CRS84,speherial)>";
+   std::unique_ptr<Type> type(Type::buildTypeFromString(schema));
+   uint64_t stripeSize = 1024;            // 1K
+   uint64_t compressionBlockSize = 1024;  // 1k
+   uint64_t memoryBlockSize = 64;
+   std::unique_ptr<Writer> writer =
+       createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB,
+                    *type, pool, &memStream, fileVersion,
+                    enableAlignBlockBoundToRowGroup ? 1024 : 0);
+
+   // The schema must print back exactly as it was parsed.
+   EXPECT_EQ(schema, type->toString());
+
+   const uint64_t batchCount = 100;
+   const uint64_t batchSize = 1000;
+   std::unique_ptr<ColumnVectorBatch> batch = writer->createRowBatch(batchSize);
+   StructVectorBatch* structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+   ASSERT_TRUE(structBatch != nullptr);
+   StringVectorBatch* geometryBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+   StringVectorBatch* geographyBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[1]);
+   ASSERT_TRUE(geometryBatch != nullptr);
+   ASSERT_TRUE(geographyBatch != nullptr);
+
+   // Backing storage for every WKB payload written below; both columns point into it.
+   std::unique_ptr<char[]> buffer(new char[8000000]);
+   char* buf = buffer.get();
+
+   // write 100 * 1000 rows; every 2 consecutive rows contain one null value.
+   uint64_t rowCount = 0;
+   for (uint64_t i = 0; i != batchCount; ++i) {
+     structBatch->hasNulls = false;
+     structBatch->numElements = batchSize;
+
+     geometryBatch->hasNulls = true;
+     geometryBatch->numElements = batchSize;
+     geographyBatch->hasNulls = true;
+     geographyBatch->numElements = batchSize;
+
+     for (uint64_t j = 0; j != batchSize; ++j) {
+       if (rowCount % 2 == 0) {
+         geometryBatch->notNull[j] = 0;
+         geographyBatch->notNull[j] = 0;
+       } else {
+         geometryBatch->notNull[j] = 1;
+         geographyBatch->notNull[j] = 1;
+
+         std::string wkb = MakeWKBPoint({j * 1.0, j * 1.0}, false, false);
+         memcpy(buf, wkb.data(), wkb.size());
+
+         geometryBatch->data[j] = buf;
+         geometryBatch->length[j] = static_cast<int64_t>(wkb.size());
+         geographyBatch->data[j] = buf;
+         geographyBatch->length[j] = static_cast<int64_t>(wkb.size());
+
+         buf += wkb.size();
+       }
+       ++rowCount;
+     }
+
+     writer->add(*batch);
+   }
+   writer->close();
+
+   std::unique_ptr<InputStream> inStream(
+       new MemoryInputStream(memStream.getData(), memStream.getLength()));
+   std::unique_ptr<Reader> reader = createReader(pool, std::move(inStream));
+   EXPECT_EQ(batchCount * batchSize, reader->getNumberOfRows());
+   EXPECT_TRUE(reader->getNumberOfStripes() > 1);
+
+   EXPECT_EQ(schema, reader->getType().toString());
+
+   // test sequential reader
+   std::unique_ptr<RowReader> seqReader = createRowReader(reader.get());
+   rowCount = 0;
+   for (uint64_t i = 0; i != batchCount; ++i) {
+     EXPECT_TRUE(seqReader->next(*batch));
+
+     EXPECT_FALSE(structBatch->hasNulls);
+     EXPECT_EQ(batchSize, structBatch->numElements);
+
+     EXPECT_TRUE(geometryBatch->hasNulls);
+     EXPECT_EQ(batchSize, geometryBatch->numElements);
+     EXPECT_TRUE(geographyBatch->hasNulls);
+     EXPECT_EQ(batchSize, geographyBatch->numElements);
+
+     for (uint64_t j = 0; j != batchSize; ++j) {
+       if (rowCount % 2 == 0) {
+         EXPECT_TRUE(geometryBatch->notNull[j] == 0);
+         EXPECT_TRUE(geographyBatch->notNull[j] == 0);
+       } else {
+         EXPECT_TRUE(geometryBatch->notNull[j] != 0);
+         EXPECT_TRUE(geographyBatch->notNull[j] != 0);
+         std::string wkb = MakeWKBPoint({j * 1.0, j * 1.0}, false, false);
+         EXPECT_EQ(static_cast<int64_t>(wkb.size()), geometryBatch->length[j]);
+         EXPECT_TRUE(memcmp(geometryBatch->data[j], wkb.data(), wkb.size()) == 0);
+         EXPECT_EQ(static_cast<int64_t>(wkb.size()), geographyBatch->length[j]);
+         EXPECT_TRUE(memcmp(geographyBatch->data[j], wkb.data(), wkb.size()) == 0);
+       }
+       ++rowCount;
+     }
+   }
+   EXPECT_FALSE(seqReader->next(*batch));
+
+   // test seek reader
+   std::unique_ptr<RowReader> seekReader = createRowReader(reader.get());
+   batch = seekReader->createRowBatch(2);
+   structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
+   ASSERT_TRUE(structBatch != nullptr);
+   geometryBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[0]);
+   geographyBatch = dynamic_cast<StringVectorBatch*>(structBatch->fields[1]);
+   ASSERT_TRUE(geometryBatch != nullptr);
+   ASSERT_TRUE(geographyBatch != nullptr);
+
+   // Each probe reads an (even, odd) row pair: a null followed by a point.
+   for (uint64_t row = rowCount - 2; row >= 100; row -= 100) {
+     seekReader->seekToRow(row);
+     EXPECT_TRUE(seekReader->next(*batch));
+
+     EXPECT_FALSE(structBatch->hasNulls);
+     EXPECT_EQ(2, structBatch->numElements);
+     EXPECT_TRUE(geometryBatch->hasNulls);
+     EXPECT_EQ(2, geometryBatch->numElements);
+     EXPECT_TRUE(geographyBatch->hasNulls);
+     EXPECT_EQ(2, geographyBatch->numElements);
+
+     EXPECT_TRUE(geometryBatch->notNull[0] == 0);
+     EXPECT_TRUE(geometryBatch->notNull[1] != 0);
+     EXPECT_TRUE(geographyBatch->notNull[0] == 0);
+     EXPECT_TRUE(geographyBatch->notNull[1] != 0);
+
+     // The writer stored the index inside the batch, not the global row number.
+     const double expectedCoord = ((row + 1) % batchSize) * 1.0;
+     std::string wkb = MakeWKBPoint({expectedCoord, expectedCoord}, false, false);
+
+     EXPECT_EQ(static_cast<int64_t>(wkb.size()), geometryBatch->length[1]);
+     EXPECT_TRUE(memcmp(geometryBatch->data[1], wkb.data(), wkb.size()) == 0);
+     EXPECT_EQ(static_cast<int64_t>(wkb.size()), geographyBatch->length[1]);
+     EXPECT_TRUE(memcmp(geographyBatch->data[1], wkb.data(), wkb.size()) == 0);
+   }
+ }
+
std::vector<TestParams> testParams = {{FileVersion::v_0_11(), true},
{FileVersion::v_0_11(), false},
{FileVersion::v_0_12(), false},
diff --git a/c++/test/meson.build b/c++/test/meson.build
index ba84bf7fa..a8d30a6b9 100644
--- a/c++/test/meson.build
+++ b/c++/test/meson.build
@@ -50,10 +50,12 @@ test_sources = [
'TestSargsApplier.cc',
'TestSearchArgument.cc',
'TestSchemaEvolution.cc',
+ 'TestStatistics.cc',
'TestStripeIndexStatistics.cc',
'TestTimestampStatistics.cc',
'TestTimezone.cc',
'TestType.cc',
+ 'TestUtil.cc',
'TestWriter.cc',
'TestCache.cc',
]
diff --git a/tools/src/CSVFileImport.cc b/tools/src/CSVFileImport.cc
index ae17b3348..31a6f52a2 100644
--- a/tools/src/CSVFileImport.cc
+++ b/tools/src/CSVFileImport.cc
@@ -420,6 +420,8 @@ int main(int argc, char* argv[]) {
case orc::LIST:
case orc::MAP:
case orc::UNION:
+ case orc::GEOMETRY:
+ case orc::GEOGRAPHY:
throw std::runtime_error(subType->toString() + " is not supported
yet.");
}
}