wgtmac commented on code in PR #2269: URL: https://github.com/apache/orc/pull/2269#discussion_r2164193752
########## c++/include/orc/Geospatial.hh: ########## @@ -0,0 +1,256 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include <array> Review Comment: ```suggestion #pragma once #include <array> ``` ########## c++/src/TypeImpl.cc: ########## @@ -710,6 +823,22 @@ namespace orc { return std::make_unique<TypeImpl>(DECIMAL, precision, scale); } + std::unique_ptr<Type> TypeImpl::parseGeographyType(const std::string& input, size_t start, + size_t end) { + if (input[start] != '(') { + throw std::logic_error("Missing ( after geometry."); + } + size_t pos = start + 1; + size_t sep = input.find(',', pos); + if (sep + 1 >= end || sep == std::string::npos) { + throw std::logic_error("Decimal type must specify CRS."); Review Comment: ```suggestion throw std::logic_error("Geography type must specify CRS."); ``` ########## c++/include/orc/Geospatial.hh: ########## @@ -0,0 +1,256 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include <array> Review Comment: BTW, I just have a general question with regard to coding style. This file seems to follow the Arrow style. Do we need to keep consistency with this repo? ########## c++/include/orc/Statistics.hh: ########## @@ -22,6 +22,7 @@ #include "orc/Type.hh" #include "orc/Vector.hh" #include "orc/orc-config.hh" +#include "orc/Geospatial.hh" Review Comment: Sort alphabetically? ########## c++/src/TypeImpl.cc: ########## @@ -780,6 +909,14 @@ namespace orc { uint64_t maxLength = static_cast<uint64_t>(atoi(input.substr(start + 1, end - start + 1).c_str())); return std::make_unique<TypeImpl>(CHAR, maxLength); + } else if (category == "geometry") { + if (input[start] != '(') { + throw std::logic_error("Missing ( after varchar."); Review Comment: The error msg looks wrong. ########## tools/src/CSVFileImport.cc: ########## @@ -395,6 +395,8 @@ int main(int argc, char* argv[]) { case orc::CHAR: case orc::VARCHAR: case orc::BINARY: + case orc::GEOMETRY: Review Comment: I suppose geospatial types cannot be supported, right? ########## c++/src/TypeImpl.cc: ########## @@ -710,6 +823,22 @@ namespace orc { return std::make_unique<TypeImpl>(DECIMAL, precision, scale); } + std::unique_ptr<Type> TypeImpl::parseGeographyType(const std::string& input, size_t start, + size_t end) { + if (input[start] != '(') { + throw std::logic_error("Missing ( after geometry."); Review Comment: ```suggestion throw std::logic_error("Missing ( after geography."); ``` ########## c++/include/orc/Geospatial.hh: ########## @@ -0,0 +1,256 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include <array> Review Comment: It worths adding a comment to say that this file comes mostly from Arrow C++. ########## c++/src/Statistics.hh: ########## @@ -1683,6 +1683,127 @@ namespace orc { } }; + class GeospatialColumnStatisticsImpl : public GeospatialColumnStatistics, + public MutableColumnStatistics { + private: + geospatial::WKBGeometryBounder bounder_; + InternalCharStatistics stats_; + + public: + GeospatialColumnStatisticsImpl() { + reset(); + } + explicit GeospatialColumnStatisticsImpl(const proto::ColumnStatistics& stats); + virtual ~GeospatialColumnStatisticsImpl(); + + uint64_t getNumberOfValues() const override { + return stats_.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + stats_.setNumberOfValues(value); + } + + void increase(uint64_t count) override { + stats_.setNumberOfValues(stats_.getNumberOfValues() + count); + } + + bool hasNull() const override { + return stats_.hasNull(); + } + + void setHasNull(bool hasNull) override { + stats_.setHasNull(hasNull); + } + + void merge(const MutableColumnStatistics& other) override { + const GeospatialColumnStatisticsImpl& geoStats = + dynamic_cast<const GeospatialColumnStatisticsImpl&>(other); + stats_.merge(geoStats.stats_); + bounder_.Merge(geoStats.bounder_); + } + + void reset() override { + stats_.reset(); + bounder_.Reset(); + } + + void update(const char* value, size_t length) override { + bounder_.MergeGeometry(std::string_view(value, length)); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_has_null(stats_.hasNull()); + pbStats.set_number_of_values(stats_.getNumberOfValues()); + + proto::GeospatialStatistics* geoStats = pbStats.mutable_geospatial_statistics(); + const auto& bbox = bounder_.Bounds(); + if (bbox.BoundValid(0) && bbox.BoundValid(1) && !bbox.BoundEmpty(0) && !bbox.BoundEmpty(1)) { + geoStats->mutable_bbox()->set_xmin(bbox.min[0]); + geoStats->mutable_bbox()->set_xmax(bbox.max[0]); + geoStats->mutable_bbox()->set_ymin(bbox.min[1]); + geoStats->mutable_bbox()->set_ymax(bbox.max[1]); + if (bbox.BoundValid(2) && !bbox.BoundEmpty(2)) { + geoStats->mutable_bbox()->set_zmin(bbox.min[2]); + geoStats->mutable_bbox()->set_zmax(bbox.max[2]); + } + if (bbox.BoundValid(3) && !bbox.BoundEmpty(3)) { + geoStats->mutable_bbox()->set_mmin(bbox.min[3]); + geoStats->mutable_bbox()->set_mmax(bbox.max[3]); + } + } + for (auto type : bounder_.GeometryTypes()) { + geoStats->add_geospatial_types(type); + } + } + + std::string toString() const override { + if (!bounder_.IsValid()) { + return "<GeoStatistics> invalid"; + } + + std::stringstream ss; + ss << "<GeoStatistics>"; Review Comment: I think it is due to gap between Parquet Java and C++ implementations... ########## c++/include/orc/Geospatial.hh: ########## @@ -0,0 +1,256 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include <array> Review Comment: IIRC, we use `#ifdef XXX_H` instead of `#pragma once` to prevent double include. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
