wgtmac commented on code in PR #2971: URL: https://github.com/apache/parquet-java/pull/2971#discussion_r1964907972
########## parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/BoundingBox.java: ########## @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.statistics.geometry; + +import org.apache.parquet.Preconditions; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Envelope; +import org.locationtech.jts.geom.Geometry; + +public class BoundingBox { + + private double xMin = Double.POSITIVE_INFINITY; + private double xMax = Double.NEGATIVE_INFINITY; + private double yMin = Double.POSITIVE_INFINITY; + private double yMax = Double.NEGATIVE_INFINITY; + private double zMin = Double.POSITIVE_INFINITY; + private double zMax = Double.NEGATIVE_INFINITY; + private double mMin = Double.POSITIVE_INFINITY; + private double mMax = Double.NEGATIVE_INFINITY; + + public BoundingBox( + double xMin, double xMax, double yMin, double yMax, double zMin, double zMax, double mMin, double mMax) { + this.xMin = xMin; + this.xMax = xMax; + this.yMin = yMin; + this.yMax = yMax; + this.zMin = zMin; + this.zMax = zMax; + this.mMin = mMin; + this.mMax = mMax; + } + + public BoundingBox() {} + + public double getXMin() { + return xMin; + } + + public double getXMax() { + return xMax; + } + + public double getYMin() { + return yMin; + } + + public double getYMax() { + return yMax; + } + + public double getZMin() { + return zMin; + } + + public double getZMax() { + return zMax; + } + + public double getMMin() { + return mMin; + } + + public double getMMax() { + return mMax; + } + + // Method to update the bounding box with the coordinates of a Geometry object + // geometry can be changed by this method + void update(Geometry geometry, String crs) { + GeometryUtils.normalizeLongitude(geometry); + Envelope envelope = geometry.getEnvelopeInternal(); + double minX = envelope.getMinX(); + double minY = envelope.getMinY(); + double maxX = envelope.getMaxX(); + double maxY = envelope.getMaxY(); + + // Initialize Z and M values + double minZ = Double.POSITIVE_INFINITY; + double maxZ = Double.NEGATIVE_INFINITY; + double minM = Double.POSITIVE_INFINITY; + double maxM = Double.NEGATIVE_INFINITY; + + Coordinate[] coordinates = geometry.getCoordinates(); + for (Coordinate coord : coordinates) { + if (!Double.isNaN(coord.getZ())) { + minZ = Math.min(minZ, coord.getZ()); + maxZ = Math.max(maxZ, coord.getZ()); + } + if (!Double.isNaN(coord.getM())) { + minM = Math.min(minM, coord.getM()); + maxM = Math.max(maxM, coord.getM()); + } + } + + if (xMin == Double.POSITIVE_INFINITY || xMax == Double.NEGATIVE_INFINITY) { + xMin = minX; + xMax = maxX; + } else { + // Handle the wraparound case for X values Review Comment: Do we need to check if it is `geography` type first or use a subclass for bbox of geography type? IIUC, `geometry` type does not need to check wraparound. ########## parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/BoundingBox.java: ########## @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.statistics.geometry; + +import org.apache.parquet.Preconditions; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Envelope; +import org.locationtech.jts.geom.Geometry; + +public class BoundingBox { + + private double xMin = Double.POSITIVE_INFINITY; + private double xMax = Double.NEGATIVE_INFINITY; + private double yMin = Double.POSITIVE_INFINITY; + private double yMax = Double.NEGATIVE_INFINITY; + private double zMin = Double.POSITIVE_INFINITY; + private double zMax = Double.NEGATIVE_INFINITY; + private double mMin = Double.POSITIVE_INFINITY; + private double mMax = Double.NEGATIVE_INFINITY; + + public BoundingBox( + double xMin, double xMax, double yMin, double yMax, double zMin, double zMax, double mMin, double mMax) { + this.xMin = xMin; + this.xMax = xMax; + this.yMin = yMin; + this.yMax = yMax; + this.zMin = zMin; + this.zMax = zMax; + this.mMin = mMin; + this.mMax = mMax; + } + + public BoundingBox() {} + + public double getXMin() { + return xMin; + } + + public double getXMax() { + return xMax; + } + + public double getYMin() { + return yMin; + } + + public double getYMax() { + return yMax; + } + + public double getZMin() { + return zMin; + } + + public double getZMax() { + return zMax; + } + + public double getMMin() { + return mMin; + } + + public double getMMax() { + return mMax; + } + + // Method to update the bounding box with the coordinates of a Geometry object + // geometry can be changed by this method + void update(Geometry geometry, String crs) { + GeometryUtils.normalizeLongitude(geometry); + Envelope envelope = geometry.getEnvelopeInternal(); + double minX = envelope.getMinX(); + double minY = envelope.getMinY(); + double maxX = envelope.getMaxX(); + double maxY = envelope.getMaxY(); + + // Initialize Z and M values + double minZ = Double.POSITIVE_INFINITY; + double maxZ = Double.NEGATIVE_INFINITY; + double minM = Double.POSITIVE_INFINITY; + double maxM = Double.NEGATIVE_INFINITY; + + Coordinate[] coordinates = geometry.getCoordinates(); + for (Coordinate coord : coordinates) { + if (!Double.isNaN(coord.getZ())) { + minZ = Math.min(minZ, coord.getZ()); + maxZ = Math.max(maxZ, coord.getZ()); + } + if (!Double.isNaN(coord.getM())) { + minM = Math.min(minM, coord.getM()); + maxM = Math.max(maxM, coord.getM()); + } + } + + if (xMin == Double.POSITIVE_INFINITY || xMax == Double.NEGATIVE_INFINITY) { + xMin = minX; + xMax = maxX; + } else { + // Handle the wraparound case for X values + if (!isCrossingAntiMeridian(xMax, xMin)) { // current bounding box is NOT wrapped around + if (!isCrossingAntiMeridian(maxX, minX)) { // new bounding box is NOT wrapped around + xMin = Math.min(xMin, minX); + xMax = Math.max(xMax, maxX); + } else { // new bounding box is wrapped around + xMin = Math.max(xMin, maxX); + xMax = Math.min(xMax, minX); + } + } else { // current bounding box is wrapped around + if (!isCrossingAntiMeridian(maxX, minX)) { // new bounding box is NOT wrapped around + xMin = Math.max(xMin, minX); + xMax = Math.min(xMax, maxX); + } else { // new bounding box is wrapped around + xMin = Math.max(xMin, maxX); + xMax = Math.min(xMax, minX); + } + } + } + + yMin = Math.min(yMin, minY); + yMax = Math.max(yMax, maxY); + zMin = Math.min(zMin, minZ); + zMax = Math.max(zMax, maxZ); + mMin = Math.min(mMin, minM); + mMax = Math.max(mMax, maxM); + } + + void merge(BoundingBox other) { + Preconditions.checkArgument(other != null, "Cannot merge with null bounding box"); + double minX = other.xMin; + double maxX = other.xMax; + + if (xMin == Double.POSITIVE_INFINITY || xMax == Double.NEGATIVE_INFINITY) { + xMin = minX; + xMax = maxX; + } else { + // Handle the wraparound case for X values + if (!isCrossingAntiMeridian(xMax, xMin)) { // current bounding box is NOT wrapped around Review Comment: These wraparound check duplicates that in `update` method, please use a method to wrap the logic for reuse. ########## parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/GeometryTypes.java: ########## @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.statistics.geometry; + +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; +import org.apache.parquet.Preconditions; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Geometry; + +public class GeometryTypes { + + private static final int UNKNOWN_TYPE_ID = -1; + private Set<Integer> types = new HashSet<>(); + private boolean valid = true; + + public GeometryTypes(Set<Integer> types) { + this.types = types; + } + + public GeometryTypes() {} + + public Set<Integer> getTypes() { + return types; + } + + void update(Geometry geometry) { + if (!valid) { + return; + } + int code = getGeometryTypeCode(geometry); + if (code != UNKNOWN_TYPE_ID) { + types.add(code); + } else { + valid = false; + types.clear(); + } + } + + public void merge(GeometryTypes other) { + Preconditions.checkArgument(other != null, "Cannot merge with null GeometryTypes"); + if (!valid) { + return; + } + if (!other.valid) { + valid = false; + types.clear(); + return; + } + types.addAll(other.types); + } + + public void reset() { + types.clear(); + valid = true; + } + + public void abort() { + valid = false; + types.clear(); + } + + public GeometryTypes copy() { + return new GeometryTypes(new HashSet<>(types)); + } + + @Override + public String toString() { + return "GeometryTypes{" + "types=" + + types.stream().map(this::typeIdToString).collect(Collectors.toSet()) + '}'; + } + + private int getGeometryTypeId(Geometry geometry) { + switch (geometry.getGeometryType()) { + case Geometry.TYPENAME_POINT: + return 1; + case Geometry.TYPENAME_LINESTRING: + return 2; + case Geometry.TYPENAME_POLYGON: + return 3; + case Geometry.TYPENAME_MULTIPOINT: + return 4; + case Geometry.TYPENAME_MULTILINESTRING: + return 5; + case Geometry.TYPENAME_MULTIPOLYGON: + return 6; + case Geometry.TYPENAME_GEOMETRYCOLLECTION: + return 7; + default: + return UNKNOWN_TYPE_ID; + } + } + + /** + * This is from the following spec proposed: + * <p> + * The geometry types of all geometries, or an empty array if they are not + * known. This is borrowed from `geometry_types` column metadata of GeoParquet [1] + * except that values in the list are WKB (ISO variant) integer codes [2]. Table + * below shows the most common geometry types and their codes: + * <p> + * | Type | XY | XYZ | XYM | XYZM | + * | :----------------- | :--- | :--- | :--- | :--: | + * | Point | 0001 | 1001 | 2001 | 3001 | + * | LineString | 0002 | 1002 | 2002 | 3002 | + * | Polygon | 0003 | 1003 | 2003 | 3003 | + * | MultiPoint | 0004 | 1004 | 2004 | 3004 | + * | MultiLineString | 0005 | 1005 | 2005 | 3005 | + * | MultiPolygon | 0006 | 1006 | 2006 | 3006 | + * | GeometryCollection | 0007 | 1007 | 2007 | 3007 | + * <p> + * In addition, the following rules are used: + * - A list of multiple values indicates that multiple geometry types are + * present (e.g. `[0003, 0006]`). + * - An empty array explicitly signals that the geometry types are not known. + * - The geometry types in the list must be unique (e.g. `[0001, 0001]` + * is not valid). + * <p> + * Please refer to links below for more detail: + * [1] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary + * [2] https://github.com/opengeospatial/geoparquet/blob/v1.0.0/format-specs/geoparquet.md?plain=1#L91 + */ Review Comment: ```suggestion /** * Geospatial type codes: * * | Type | XY | XYZ | XYM | XYZM | * | :----------------- | :--- | :--- | :--- | :--: | * | Point | 0001 | 1001 | 2001 | 3001 | * | LineString | 0002 | 1002 | 2002 | 3002 | * | Polygon | 0003 | 1003 | 2003 | 3003 | * | MultiPoint | 0004 | 1004 | 2004 | 3004 | * | MultiLineString | 0005 | 1005 | 2005 | 3005 | * | MultiPolygon | 0006 | 1006 | 2006 | 3006 | * | GeometryCollection | 0007 | 1007 | 2007 | 3007 | * * See https://github.com/apache/parquet-format/blob/master/Geospatial.md#geospatial-types */ ``` ########## parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java: ########## @@ -521,6 +537,7 @@ public long getMinMaxSize() { private int nextPageIndex; private LongList repLevelHistogram = new LongArrayList(); private LongList defLevelHistogram = new LongArrayList(); + private List<GeospatialStatistics> geospatialStatistics = new ArrayList<>(); Review Comment: Please remove all changes in this file too. ########## parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java: ########## @@ -1382,6 +1384,11 @@ public void endColumn() throws IOException { currentColumnIndexes.add(columnIndexBuilder.build()); } currentOffsetIndexes.add(offsetIndexBuilder.build(currentChunkFirstDataPage)); + // calculate the geometryStatistics from the BinaryStatistics + GeospatialStatistics geospatialStatistics = null; + if (currentStatistics instanceof BinaryStatistics) + geospatialStatistics = ((BinaryStatistics) currentStatistics).getGeospatialStatistics(); Review Comment: Please refactor the change to be similar to `currentSizeStatistics`. ########## parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/GeospatialStatistics.java: ########## @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.statistics.geometry; + +import org.apache.parquet.Preconditions; +import org.apache.parquet.io.api.Binary; +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.io.ParseException; +import org.locationtech.jts.io.WKBReader; + +public class GeospatialStatistics { + + private static final BoundingBox DUMMY_BOUNDING_BOX = new DummyBoundingBox(); + + // Metadata that may impact the statistics calculation + private final String crs; + + private final BoundingBox boundingBox; + private final String edgeAlgorithm; + private final GeometryTypes geometryTypes; + private final WKBReader reader = new WKBReader(); + + public GeospatialStatistics(String crs, BoundingBox boundingBox, GeometryTypes geometryTypes) { + this.crs = crs; + this.boundingBox = supportsBoundingBox() ? boundingBox : DUMMY_BOUNDING_BOX; + this.geometryTypes = geometryTypes; + this.edgeAlgorithm = null; + } + + public GeospatialStatistics(String crs) { + this(crs, new BoundingBox(), new GeometryTypes()); + } + + public GeospatialStatistics(String crs, String edgeAlgorithm) { + this.crs = crs; + this.boundingBox = DUMMY_BOUNDING_BOX; + this.geometryTypes = new GeometryTypes(); + this.edgeAlgorithm = edgeAlgorithm; + } + + public BoundingBox getBoundingBox() { + return boundingBox; + } + + public GeometryTypes getGeometryTypes() { + return geometryTypes; + } + + public void update(Binary value) { + if (value == null) { + return; + } + try { + Geometry geom = reader.read(value.getBytes()); + update(geom); + } catch (ParseException e) { + abort(); + } + } + + private void update(Geometry geom) { + if (supportsBoundingBox()) { + boundingBox.update(geom, crs); + } + geometryTypes.update(geom); + } + + /** + * A bounding box is a rectangular region defined by two points, the lower left + * and upper right corners. It is used to represent the minimum and maximum + * coordinates of a geometry. Only planar geometries can have a bounding box. + */ + private boolean supportsBoundingBox() { Review Comment: Do we still need this method given that we support both geometry and geography types? ########## parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java: ########## @@ -1091,6 +1139,156 @@ public int hashCode() { } } + public static class GeometryLogicalTypeAnnotation extends LogicalTypeAnnotation { + private final String crs; + private final ByteBuffer metadata; + + private GeometryLogicalTypeAnnotation(String crs, ByteBuffer metadata) { + this.crs = crs; + this.metadata = metadata; + } + + @Override + @Deprecated + public OriginalType toOriginalType() { + return null; + } + + @Override + public <T> Optional<T> accept(LogicalTypeAnnotationVisitor<T> logicalTypeAnnotationVisitor) { + return logicalTypeAnnotationVisitor.visit(this); + } + + @Override + LogicalTypeToken getType() { + return LogicalTypeToken.GEOMETRY; + } + + @Override + protected String typeParametersAsString() { + StringBuilder sb = new StringBuilder(); + sb.append("("); + sb.append(","); + if (crs != null && !crs.isEmpty()) { + sb.append(","); + sb.append(crs); + } + if (metadata != null) { + sb.append(","); + sb.append(metadata); + } + sb.append(")"); + return sb.toString(); + } + + public String getCrs() { + return crs; + } + + public ByteBuffer getMetadata() { + return metadata; + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof GeometryLogicalTypeAnnotation)) { + return false; + } + GeometryLogicalTypeAnnotation other = (GeometryLogicalTypeAnnotation) obj; + return crs.equals(other.crs); + } + + @Override + public int hashCode() { + return Objects.hash(crs); + } + + @Override + PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) { + return PrimitiveStringifier.WKB_STRINGIFIER; + } + } + + public static class GeographyLogicalTypeAnnotation extends LogicalTypeAnnotation { + private final String crs; + private final String edgeAlgorithm; Review Comment: I think we do. We should not expose thrift definition to the downstream. ########## parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java: ########## @@ -178,6 +183,30 @@ public Statistics<?> build() { } } + // Builder for GEOMETRY type to handle GeometryStatistics + private static class GeometryBuilder extends Builder { Review Comment: Similar to my comment in BinaryStatistics.java, we need to revert all changes in this file as well. ########## parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/GeospatialStatistics.java: ########## @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.statistics.geometry; + +import org.apache.parquet.Preconditions; +import org.apache.parquet.io.api.Binary; +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.io.ParseException; +import org.locationtech.jts.io.WKBReader; + +public class GeospatialStatistics { + + private static final BoundingBox DUMMY_BOUNDING_BOX = new DummyBoundingBox(); + + // Metadata that may impact the statistics calculation + private final String crs; + + private final BoundingBox boundingBox; + private final String edgeAlgorithm; Review Comment: Should this be an enum? ########## parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java: ########## @@ -30,6 +32,7 @@ public class BinaryStatistics extends Statistics<Binary> { private Binary max; private Binary min; + private GeospatialStatistics geospatialStatistics = null; Review Comment: We need to leave `BinaryStatistics` as is without any change. This has been discussed in the spec PR that `GeospatialStatistics` should be similar to the implementation of `SizeStatistics`. Users can directly call `ColumnChunkMetaData.getGeospatialStatitics()` to get it. Please revert all changes in this file. ########## parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/GeometryTypes.java: ########## @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.statistics.geometry; + +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; +import org.apache.parquet.Preconditions; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Geometry; + +public class GeometryTypes { + + private static final int UNKNOWN_TYPE_ID = -1; + private Set<Integer> types = new HashSet<>(); + private boolean valid = true; + + public GeometryTypes(Set<Integer> types) { + this.types = types; + } + + public GeometryTypes() {} + + public Set<Integer> getTypes() { + return types; + } + + void update(Geometry geometry) { + if (!valid) { + return; + } + int code = getGeometryTypeCode(geometry); + if (code != UNKNOWN_TYPE_ID) { + types.add(code); + } else { + valid = false; + types.clear(); + } + } + + public void merge(GeometryTypes other) { + Preconditions.checkArgument(other != null, "Cannot merge with null GeometryTypes"); + if (!valid) { + return; + } + if (!other.valid) { + valid = false; + types.clear(); + return; + } + types.addAll(other.types); + } + + public void reset() { + types.clear(); + valid = true; + } + + public void abort() { + valid = false; + types.clear(); + } + + public GeometryTypes copy() { + return new GeometryTypes(new HashSet<>(types)); + } + + @Override + public String toString() { + return "GeometryTypes{" + "types=" Review Comment: ```suggestion return "GeospatialTypes{" + "types=" ``` ########## parquet-column/src/main/java/org/apache/parquet/column/statistics/geometry/GeometryTypes.java: ########## @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.column.statistics.geometry; + +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; +import org.apache.parquet.Preconditions; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Geometry; + +public class GeometryTypes { Review Comment: ```suggestion public class GeospatialTypes { ``` It is consistent with the spec. ########## parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java: ########## @@ -395,6 +445,12 @@ public SizeStatistics getSizeStatistics() { throw new UnsupportedOperationException("SizeStatistics is not implemented"); } + /** @return the geometry stats for this column */ Review Comment: ```suggestion /** @return the geospatial stats for this column */ ``` ########## parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndex.java: ########## @@ -71,4 +72,12 @@ default List<Long> getRepetitionLevelHistogram() { default List<Long> getDefinitionLevelHistogram() { throw new UnsupportedOperationException("Definition level histogram is not implemented"); } + + /** + * @return the unmodifiable list of the geometry statistics for each page; + * used for converting to the related thrift object + */ + default List<GeospatialStatistics> getGeometryStatistics() { Review Comment: Please revert all changes in this file as we no longer support geo stats in the page index. ########## pom.xml: ########## @@ -111,8 +110,9 @@ <brotli-codec.version>0.1.1</brotli-codec.version> <mockito.version>1.10.19</mockito.version> <powermock.version>2.0.9</powermock.version> - <net.openhft.version>0.27ea0</net.openhft.version> + <net.openhft.version>0.26ea0</net.openhft.version> Review Comment: ditto ########## parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java: ########## @@ -770,6 +802,37 @@ public static Statistics toParquetStatistics( return formatStats; } + private static GeospatialStatistics toParquetStatistics( Review Comment: ```suggestion private static GeospatialStatistics toParquetGeospatialStatistics( ``` ########## parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java: ########## @@ -1142,7 +1269,8 @@ LogicalTypeAnnotation getLogicalTypeAnnotation(ConvertedType type, SchemaElement } LogicalTypeAnnotation getLogicalTypeAnnotation(LogicalType type) { - switch (type.getSetField()) { + LogicalType._Fields setField = type.getSetField(); Review Comment: Please revert these two lines. ########## parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java: ########## @@ -875,6 +938,64 @@ public org.apache.parquet.column.statistics.Statistics fromParquetStatistics( return fromParquetStatisticsInternal(createdBy, statistics, type, expectedOrder); } + private GeospatialStatistics toParquetGeometryStatistics( Review Comment: ```suggestion private GeospatialStatistics toParquetGeospatialStatistics( ``` ########## parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java: ########## @@ -113,12 +119,8 @@ import org.apache.parquet.format.UUIDType; import org.apache.parquet.format.Uncompressed; import org.apache.parquet.format.XxHash; -import org.apache.parquet.hadoop.metadata.BlockMetaData; -import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; -import org.apache.parquet.hadoop.metadata.ColumnPath; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.hadoop.metadata.*; Review Comment: Please avoid wildcard import ########## pom.xml: ########## @@ -92,9 +92,8 @@ <javax.annotation.version>1.3.2</javax.annotation.version> <spotless.version>2.30.0</spotless.version> <shade.prefix>shaded.parquet</shade.prefix> - <!-- Guarantees no newer classes/methods/constants are used by parquet. --> - <hadoop.version>3.3.0</hadoop.version> - <parquet.format.version>2.10.0</parquet.format.version> + <hadoop.version>3.3.6</hadoop.version> Review Comment: We should not bump hadoop version. ########## parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java: ########## @@ -850,4 +906,105 @@ public SizeStatistics getSizeStatistics() { public boolean isEncrypted() { return true; } + + public GeospatialStatistics getGeospatialStatistics() { + return shadowColumnChunkMetaData.getGeospatialStatistics(); + } +} + +class GeometryColumnChunkMetaData extends ColumnChunkMetaData { Review Comment: We should not create another ColumnChunkMetaData subclass. Please follow what we did for `SizeStatistics`. ########## parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java: ########## @@ -1032,6 +1153,12 @@ public Optional<SortOrder> visit( LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { return of(SortOrder.SIGNED); } + + @Override + public Optional<SortOrder> visit( + LogicalTypeAnnotation.GeometryLogicalTypeAnnotation geometryLogicalType) { Review Comment: Missing GeographyLogicalTypeAnnotation ########## parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java: ########## @@ -30,6 +32,7 @@ public class BinaryStatistics extends Statistics<Binary> { private Binary max; private Binary min; + private GeospatialStatistics geospatialStatistics = null; Review Comment: Please take a look at `ColumnValueCollector` class and follow the same approach of collecting `SizeStatistics` for `GeospatialStatistics`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
