zentol commented on code in PR #1:
URL: https://github.com/apache/flink-connector-mongodb/pull/1#discussion_r1030486881
##########
flink-connector-mongodb/src/main/java/org/apache/flink/connector/mongodb/table/MongoKeyExtractor.java:
##########
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.connector.mongodb.table;
+
+import org.apache.flink.annotation.Internal;
+import org.apache.flink.connector.mongodb.common.utils.MongoValidationUtils;
+import org.apache.flink.connector.mongodb.table.converter.RowDataToBsonConverters;
+import org.apache.flink.table.catalog.Column;
+import org.apache.flink.table.catalog.ResolvedSchema;
+import org.apache.flink.table.catalog.UniqueConstraint;
+import org.apache.flink.table.connector.Projection;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.utils.ProjectedRowData;
+import org.apache.flink.table.types.DataType;
+import org.apache.flink.table.types.logical.LogicalType;
+import org.apache.flink.util.function.SerializableFunction;
+
+import org.bson.BsonObjectId;
+import org.bson.BsonValue;
+import org.bson.types.ObjectId;
+
+import java.util.Optional;
+
+import static org.apache.flink.util.Preconditions.checkNotNull;
+
+/** An extractor for a MongoDB key from a {@link RowData}. */
+@Internal
+public class MongoKeyExtractor implements SerializableFunction<RowData, BsonValue> {
+
+    public static final String RESERVED_ID = "_id";
+
+    private final LogicalType primaryKeyType;
+
+    private final int[] primaryKeyIndexes;
+
+    private final RowDataToBsonConverters.RowDataToBsonConverter primaryKeyConverter;
+
+    private MongoKeyExtractor(LogicalType primaryKeyType, int[] primaryKeyIndexes) {
+        this.primaryKeyType = primaryKeyType;
+        this.primaryKeyIndexes = primaryKeyIndexes;
+        this.primaryKeyConverter = RowDataToBsonConverters.createNullableConverter(primaryKeyType);
+    }
+
+    @Override
+    public BsonValue apply(RowData rowData) {
+        BsonValue keyValue;
+        if (isCompoundPrimaryKey(primaryKeyIndexes)) {
+            RowData keyRow = ProjectedRowData.from(primaryKeyIndexes).replaceRow(rowData);
+            keyValue = primaryKeyConverter.convert(keyRow);
+        } else {
+            RowData.FieldGetter fieldGetter =
+                    RowData.createFieldGetter(primaryKeyType, primaryKeyIndexes[0]);
+            keyValue = primaryKeyConverter.convert(fieldGetter.getFieldOrNull(rowData));
+            if (keyValue.isString()) {
+                String keyString = keyValue.asString().getValue();
+                // Try to restore MongoDB's ObjectId from string.
+                if (ObjectId.isValid(keyString)) {
+                    keyValue = new BsonObjectId(new ObjectId(keyString));
+                }
+            }
+        }
+        return checkNotNull(keyValue, "Primary key value is null of RowData: " + rowData);
+    }
+
+    public static SerializableFunction<RowData, BsonValue> createKeyExtractor(
+            ResolvedSchema resolvedSchema) {
+
+        Optional<UniqueConstraint> primaryKey = resolvedSchema.getPrimaryKey();
+        int[] primaryKeyIndexes = resolvedSchema.getPrimaryKeyIndexes();
+        Optional<Column> reversedId = resolvedSchema.getColumn(RESERVED_ID);
+
+        // It behaves as append-only when no primary key is declared and no reversed _id is present.
+        // We use anonymous classes instead of lambdas for a reason here. It is
+        // necessary because the maven shade plugin cannot relocate classes in SerializedLambdas
+        // (MSHADE-260).
+        if (!primaryKey.isPresent() && !reversedId.isPresent()) {
+            return new SerializableFunction<RowData, BsonValue>() {
+                private static final long serialVersionUID = 1L;
+
+                @Override
+                public BsonValue apply(RowData rowData) {
+                    return null;
+                }
+            };
+        }
+
+        if (reversedId.isPresent()) {
+            // Primary key should be declared as (_id) when the mongo reversed _id is present.
+            if (!primaryKey.isPresent()
+                    || isCompoundPrimaryKey(primaryKeyIndexes)
+                    || !primaryKeyContainsReversedId(primaryKey.get())) {
+                throw new IllegalArgumentException(
+                        "The primary key should be declared as (_id) when mongo reversed _id field is present");

Review Comment:
   That's an interesting question. We don't seem to handle this case in a special way in the ES connector; we use the `PRIMARY KEY` as the id and insert the document as is with its `_id` field; the final behavior will depend on Elasticsearch.

   I don't really like that behavior because, as you said, it is ambiguous; we should fail early if the schema contains an `_id` field and it's not the sole primary key.

   My comment was rather about the error message. You are explicitly suggesting `(_id)` as an alternative in this case; but is that really the best option? Maybe they just made a mistake w.r.t. keys. Maybe we shouldn't even give such suggestions, and rather just describe the problem of ambiguous keys being used due to the presence of an `_id` field.
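To make the ambiguity concrete, here is a minimal sketch of the problematic case: a schema that declares the reserved `_id` column but uses a different column as the primary key. Only `MongoKeyExtractor.createKeyExtractor` comes from the hunk above; the class name, constraint name, and column names are invented for illustration.

```java
import org.apache.flink.connector.mongodb.table.MongoKeyExtractor;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.catalog.Column;
import org.apache.flink.table.catalog.ResolvedSchema;
import org.apache.flink.table.catalog.UniqueConstraint;

import java.util.Arrays;
import java.util.Collections;

public class AmbiguousKeyExample {
    public static void main(String[] args) {
        // The schema contains the reserved `_id` column, but `user_id` is declared as the
        // primary key, so it is unclear which value should become the MongoDB document key.
        ResolvedSchema ambiguousSchema =
                new ResolvedSchema(
                        Arrays.asList(
                                Column.physical("_id", DataTypes.STRING()),
                                Column.physical("user_id", DataTypes.STRING().notNull()),
                                Column.physical("name", DataTypes.STRING())),
                        Collections.emptyList(),
                        UniqueConstraint.primaryKey(
                                "PK_user_id", Collections.singletonList("user_id")));

        try {
            MongoKeyExtractor.createKeyExtractor(ambiguousSchema);
        } catch (IllegalArgumentException e) {
            // Fails early: `_id` is present but is not the sole primary key.
            System.out.println(e.getMessage());
        }
    }
}
```

In this sketch, declaring `PRIMARY KEY (_id)` instead, or omitting the `_id` column altogether, would pass the check above and create the extractor normally.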
