nknize commented on code in PR #1017:
URL: https://github.com/apache/lucene/pull/1017#discussion_r924723448


##########
lucene/core/src/java/org/apache/lucene/document/ShapeDocValuesField.java:
##########
@@ -0,0 +1,896 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.document;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.ShapeField.DecodedTriangle.TYPE;
+import org.apache.lucene.document.ShapeField.QueryRelation;
+import org.apache.lucene.document.SpatialQuery.EncodedRectangle;
+import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.IndexableFieldType;
+import org.apache.lucene.index.PointValues.Relation;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.ByteBuffersDataOutput;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+
+/** A doc values field representation for {@link LatLonShape} and {@link 
XYShape} */
+public final class ShapeDocValuesField extends Field {
+  private final ShapeComparator shapeComparator;
+
+  private static final FieldType FIELD_TYPE = new FieldType();
+
+  static {
+    FIELD_TYPE.setDocValuesType(DocValuesType.BINARY);
+    FIELD_TYPE.setOmitNorms(true);
+    FIELD_TYPE.freeze();
+  }
+
+  /**
+   * Creates a {@ShapeDocValueField} instance from a shape tessellation
+   *
+   * @param name The Field Name (must not be null)
+   * @param tessellation The tessellation (must not be null)
+   */
+  ShapeDocValuesField(String name, List<ShapeField.DecodedTriangle> 
tessellation) {
+    super(name, FIELD_TYPE);
+    BytesRef b = computeBinaryValue(tessellation);
+    this.fieldsData = b;
+    try {
+      this.shapeComparator = new ShapeComparator(b);
+    } catch (IOException e) {
+      throw new IllegalArgumentException("unable to read binary shape doc 
value field. ", e);
+    }
+  }
+
+  /** Creates a {@code ShapeDocValue} field from a given serialized value */
+  ShapeDocValuesField(String name, BytesRef binaryValue) {
+    super(name, FIELD_TYPE);
+    this.fieldsData = binaryValue;
+    try {
+      this.shapeComparator = new ShapeComparator(binaryValue);
+    } catch (IOException e) {
+      throw new IllegalArgumentException("unable to read binary shape doc 
value field. ", e);
+    }
+  }
+
+  /** The name of the field */
+  @Override
+  public String name() {
+    return name;
+  }
+
+  /** Gets the {@code IndexableFieldType} for this ShapeDocValue field */
+  @Override
+  public IndexableFieldType fieldType() {
+    return FIELD_TYPE;
+  }
+
+  /** Currently there is no string representation for the ShapeDocValueField */
+  @Override
+  public String stringValue() {
+    return null;
+  }
+
+  /** TokenStreams are not yet supported */
+  @Override
+  public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
+    return null;
+  }
+
+  /** create a shape docvalue field from indexable fields */
+  public static ShapeDocValuesField createDocValueField(String fieldName, 
Field[] indexableFields) {
+    ArrayList<ShapeField.DecodedTriangle> tess = new 
ArrayList<>(indexableFields.length);
+    final byte[] scratch = new byte[7 * Integer.BYTES];
+    for (Field f : indexableFields) {
+      BytesRef br = f.binaryValue();
+      assert br.length == 7 * ShapeField.BYTES;
+      System.arraycopy(br.bytes, br.offset, scratch, 0, 7 * ShapeField.BYTES);
+      ShapeField.DecodedTriangle t = new ShapeField.DecodedTriangle();
+      ShapeField.decodeTriangle(scratch, t);
+      tess.add(t);
+    }
+    return new ShapeDocValuesField(fieldName, tess);
+  }
+
+  /** Returns the number of terms (tessellated triangles) for this shape */
+  public int numberOfTerms() {
+    return shapeComparator.numberOfTerms();
+  }
+
+  /** Creates a geometry query for shape docvalues */
+  public static Query newGeometryQuery(
+      final String field, final QueryRelation relation, Object... geometries) {
+    return null;
+    // TODO
+    //  return new ShapeDocValuesQuery(field, relation, geometries);
+  }
+
+  /** Compute the spatial relation of this shape and a bounding box (in 
encoded space) */
+  public Relation relate(final int minX, final int maxX, final int minY, final 
int maxY)
+      throws IOException {
+    return shapeComparator.relate(minX, maxX, minY, maxY);
+  }
+
+  /** returns the min x value for the shape's bounding box */
+  public int getMinX() {
+    return shapeComparator.getMinX();
+  }
+
+  /** returns the min y value for the shape's bounding box */
+  public int getMinY() {
+    return shapeComparator.getMinY();
+  }
+
+  /** returns the max x value for the shape's bounding box */
+  public int getMaxX() {
+    return shapeComparator.getMaxX();
+  }
+
+  /** returns the max y value for the shape's bounding box */
+  public int getMaxY() {
+    return shapeComparator.getMaxY();
+  }
+
+  /** Retrieves the x centroid location for the geometry(s) */
+  public int getCentroidX() {
+    return shapeComparator.getCentroidX();
+  }
+
+  /** Retrieves the y centroid location for the geometry(s) */
+  public int getCentroidY() {
+    return shapeComparator.getCentroidY();
+  }
+
+  /**
+   * Retrieves the highest dimensional type (POINT, LINE, TRIANGLE) for 
computing the geometry(s)
+   * centroid
+   */
+  public TYPE getHighestDimensionType() {
+    return shapeComparator.getHighestDimension();
+  }
+
+  private BytesRef computeBinaryValue(List<ShapeField.DecodedTriangle> 
tessellation) {
+    try {
+      // dfs order serialization
+      List<TreeNode> dfsSerialized = new ArrayList<>(tessellation.size());
+      buildTree(tessellation, dfsSerialized);
+      Writer w = new Writer(dfsSerialized);
+      return w.getBytesRef();
+    } catch (IOException e) {
+      throw new RuntimeException("Internal error building 
LatLonShapeDocValues. Got ", e);
+    }
+  }
+
+  /** main entry point to build the tessellation tree * */
+  public TreeNode buildTree(
+      List<ShapeField.DecodedTriangle> tessellation, List<TreeNode> 
dfsSerialized)
+      throws IOException {
+    if (tessellation.size() == 1) {
+      ShapeField.DecodedTriangle t = tessellation.get(0);
+      TreeNode node = new TreeNode(t);
+      if (t.type == TYPE.LINE) {
+        node.midX /= node.length;
+        node.midY /= node.length;
+      } else if (t.type == TYPE.TRIANGLE) {
+        node.midX /= node.signedArea;
+        node.midY /= node.signedArea;
+      }
+      node.highestType = t.type;
+      dfsSerialized.add(node);
+      return node;
+    }
+    TreeNode[] triangles = new TreeNode[tessellation.size()];
+    int i = 0;
+    int minY = Integer.MAX_VALUE;
+    int minX = Integer.MAX_VALUE;
+    int maxY = Integer.MIN_VALUE;
+    int maxX = Integer.MIN_VALUE;
+
+    // running stats for computing centroid
+    double totalSignedArea = 0;
+    double totalLength = 0;
+    double numXPnt = 0;
+    double numYPnt = 0;
+    double numXLin = 0;
+    double numYLin = 0;
+    double numXPly = 0;
+    double numYPly = 0;
+    TYPE highestType = TYPE.POINT;
+
+    for (ShapeField.DecodedTriangle t : tessellation) {
+      TreeNode node = new TreeNode(t);
+      triangles[i++] = node;
+      // compute the bbox values up front
+      minY = Math.min(minY, node.minY);
+      minX = Math.min(minX, node.minX);
+      maxY = Math.max(maxY, node.maxY);
+      maxX = Math.max(maxX, node.maxX);
+
+      // compute the running centroid stats
+      totalSignedArea += node.signedArea; // non-zero if any components are 
triangles
+      totalLength += node.length; // non-zero if any components are line 
segments
+      if (t.type == TYPE.POINT) {
+        numXPnt += node.midX;
+        numYPnt += node.midY;
+      } else if (t.type == TYPE.LINE) {
+        if (highestType == TYPE.POINT) {
+          highestType = TYPE.LINE;
+        }
+        numXLin += node.midX;
+        numYLin += node.midY;
+      } else {
+        if (highestType != TYPE.TRIANGLE) {
+          highestType = TYPE.TRIANGLE;
+        }
+        numXPly += node.midX;
+        numYPly += node.midY;
+      }
+    }
+    TreeNode root = createTree(triangles, 0, triangles.length - 1, false, 
null, dfsSerialized);
+
+    // pull up min values for the root node so the bbox is consistent
+    root.minY = minY;
+    root.minX = minX;
+
+    // set the highest dimensional type
+    root.highestType = highestType;
+
+    // compute centroid values for the root node so the centroid is consistent
+    if (highestType == TYPE.POINT) {
+      root.midX = numXPnt / i;
+      root.midY = numYPnt / i;
+    } else if (highestType == TYPE.LINE) {
+      // numerator is sum of segment midPoints times segment length
+      // divide by total length per
+      // https://www.ae.msstate.edu/vlsm/shape/centroid_of_a_line/straight.htm
+      root.midX = numXLin / totalLength;
+      root.midY = numYLin / totalLength;
+    } else {
+      // numerator is sum of triangle centroids times triangle signed area
+      // divide by total signed area per 
http://www.faqs.org/faqs/graphics/algorithms-faq/
+      root.midX = numXPly / totalSignedArea;
+      root.midY = numYPly / totalSignedArea;
+    }
+
+    return root;
+  }
+
+  /** creates the tree */
+  private TreeNode createTree(
+      TreeNode[] triangles,
+      int low,
+      int high,
+      boolean splitX,
+      TreeNode parent,
+      List<TreeNode> dfsSerialized) {
+    if (low > high) {
+      return null;
+    }
+    // add midpoint
+    int mid = (low + high) >>> 1;
+    if (low < high) {
+      Comparator<TreeNode> comparator =
+          splitX
+              ? Comparator.comparingInt((TreeNode left) -> left.minX)
+                  .thenComparingInt(left -> left.maxX)
+              : Comparator.comparingInt((TreeNode left) -> left.minY)
+                  .thenComparingInt(left -> left.maxY);
+      ArrayUtil.select(triangles, low, high + 1, mid, comparator);
+    }
+    TreeNode newNode = triangles[mid];
+    dfsSerialized.add(newNode);
+    // set parent
+    newNode.parent = parent;
+
+    // add children
+    newNode.left = createTree(triangles, low, mid - 1, !splitX, newNode, 
dfsSerialized);
+    newNode.right = createTree(triangles, mid + 1, high, !splitX, newNode, 
dfsSerialized);
+    // pull up values to this node
+    if (newNode.left != null) {
+      newNode.maxX = Math.max(newNode.maxX, newNode.left.maxX);
+      newNode.maxY = Math.max(newNode.maxY, newNode.left.maxY);
+    }
+    if (newNode.right != null) {
+      newNode.maxX = Math.max(newNode.maxX, newNode.right.maxX);
+      newNode.maxY = Math.max(newNode.maxY, newNode.right.maxY);
+    }
+
+    // adjust byteSize based on new parent bbox values
+    if (newNode.left != null) {
+      // bounding box size
+      newNode.left.byteSize += vLongSize((long) newNode.maxX - 
newNode.left.maxX);
+      newNode.left.byteSize += vLongSize((long) newNode.maxY - 
newNode.left.maxY);
+      // component size
+      newNode.left.byteSize += computeComponentSize(newNode.left, 
newNode.maxX, newNode.maxY);
+      // include byte size (if needed to be skipped)
+      newNode.byteSize += vIntSize(newNode.left.byteSize) + 
newNode.left.byteSize;
+    }
+    if (newNode.right != null) {
+      // bounding box size
+      newNode.right.byteSize += vLongSize((long) newNode.maxX - 
newNode.right.maxX);
+      newNode.right.byteSize += vLongSize((long) newNode.maxY - 
newNode.right.maxY);
+      // component size
+      newNode.right.byteSize += computeComponentSize(newNode.right, 
newNode.maxX, newNode.maxY);
+      // include byte size (if needed to be skipped)
+      newNode.byteSize += vIntSize(newNode.right.byteSize) + 
newNode.right.byteSize;
+    }
+    return newNode;
+  }
+
+  private int computeComponentSize(TreeNode node, int maxX, int maxY) {
+    int size = 0;
+    ShapeField.DecodedTriangle t = node.triangle;
+    size += vLongSize((long) maxX - t.aX);
+    size += vLongSize((long) maxY - t.aY);
+    if (t.type == TYPE.LINE || t.type == TYPE.TRIANGLE) {
+      size += vLongSize((long) maxX - t.bX);
+      size += vLongSize((long) maxY - t.bY);
+    }
+    if (t.type == TYPE.TRIANGLE) {
+      size += vLongSize((long) maxX - t.cX);
+      size += vLongSize((long) maxY - t.cY);
+    }
+    return size;
+  }
+
+  /**
+   * Builds an in memory binary tree of tessellated triangles. This logic 
comes from {@code
+   * org.apache.lucene.geo.ComponentTree} which originated from {@code
+   * org.apache.lucene.geo.EdgeTree}
+   *
+   * <p>The tree is serialized on disk in a variable format which becomes a 
compressed
+   * representation of the doc value format for the Geometry Component Tree
+   */
+  private final class TreeNode {
+    /** the triangle for this tree node */
+    private final ShapeField.DecodedTriangle triangle;
+
+    /** centroid running stats (in encoded space) for this tree node */
+    private double midX;
+
+    private double midY;
+
+    private final double
+        signedArea; // Units are encoded space. This is only used to compute 
centroid
+    // in encoded space. DO NOT USE THIS FOR GEOGRAPHICAL SPATIAL AREA
+    // triangles are guaranteed CCW so this will always be positive
+    // unless the component is a point or line
+    private final double length; // Units are encoded space. This is only used 
to compute centroid
+    // in encoded space. DO NOT USE THIS FOR GEOGRAPHICAL DISTANCE
+    // this will always be positive unless the component is a
+    // point or triangle
+    private TYPE highestType; // the highest dimensional type
+
+    /** the bounding box for the tree */
+    private int minX;
+
+    private int maxX;
+    private int minY;
+    private int maxY;
+
+    // left and right branch
+    private TreeNode left;
+    private TreeNode right;
+    private TreeNode parent;
+
+    private int byteSize = 1; // header size is one byte; remaining is 
accumulated on construction
+
+    private TreeNode(ShapeField.DecodedTriangle t) {
+      this.minX = StrictMath.min(StrictMath.min(t.aX, t.bX), t.cX);
+      this.minY = StrictMath.min(StrictMath.min(t.aY, t.bY), t.cY);
+      this.maxX = StrictMath.max(StrictMath.max(t.aX, t.bX), t.cX);
+      this.maxY = StrictMath.max(StrictMath.max(t.aY, t.bY), t.cY);
+      this.triangle = t;
+      this.left = null;
+      this.right = null;
+
+      // compute the component level centroid, encoded area, or length based 
on type
+      if (t.type == TYPE.POINT) {
+        this.midX = t.aX;
+        this.midY = t.aY;
+        this.signedArea = 0;
+        this.length = 0;
+      } else if (t.type == TYPE.LINE) {
+        this.length = Math.hypot(t.aX - t.bX, t.aY - t.bY);
+        this.midX = (0.5d * (t.aX + t.bX)) * length; // weight by length
+        this.midY = (0.5d * (t.aY + t.bY)) * length; // weight by length
+        this.signedArea = 0;
+      } else {
+        this.signedArea =
+            Math.abs(0.5d * ((t.bX - t.aX) * (t.cY - t.aY) - (t.cX - t.aX) * 
(t.bY - t.aY)));
+        this.midX = ((t.aX + t.bX + t.cX) / 3) * signedArea; // weight by 
signed area
+        this.midY = ((t.aY + t.bY + t.cY) / 3) * signedArea; // weight by 
signed area
+        this.length = 0;
+      }
+    }
+  }
+
+  /** Writes data from a ShapeDocValues field to a data output array */
+  public final class Writer {
+    private final ByteBuffersDataOutput output;
+    private BytesRef bytesRef;
+
+    Writer(List<TreeNode> dfsSerialized) throws IOException {
+      this.output = new ByteBuffersDataOutput();
+      writeTree(dfsSerialized);
+      this.bytesRef = new BytesRef(output.toArrayCopy(), 0, 
Math.toIntExact(output.size()));
+    }
+
+    BytesRef getBytesRef() {
+      return bytesRef;
+    }
+
+    private void writeTree(List<TreeNode> dfsSerialized) throws IOException {
+      assert output != null : "passed null datastream to ShapeDocValuesField 
tessellation tree";
+      // write number of terms (triangles)
+      output.writeVInt(dfsSerialized.size());
+      // write root
+      TreeNode root = dfsSerialized.remove(0);
+      // write bounding box; convert to variable long by translating
+      output.writeVLong((long) root.minX - Integer.MIN_VALUE);

Review Comment:
   For multi geometries (and geometry collections) crossing dateline check 
(xdl) is handled by syntactic sugar `minX > maxX` to avoid superfluous 
encoding. I couldn't think of a case where explicitly encoding the xdl check 
would be needed; but I could've certainly missed it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to