yyanyy commented on a change in pull request #2701: URL: https://github.com/apache/iceberg/pull/2701#discussion_r660892173
########## File path: mr/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergConfigTextParser.java ########## @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.mr.hive; + +import java.util.Locale; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +class HiveIcebergConfigTextParser { + private static final int PARTITIONING_TEXT_MAX_LENGTH = 1000; + + private HiveIcebergConfigTextParser() { + } + + public static PartitionSpec fromPartitioningText(Schema schema, String text) { + Preconditions.checkArgument(text.length() < PARTITIONING_TEXT_MAX_LENGTH, + "Partition spec text too long: max allowed length %s, but got %s", + PARTITIONING_TEXT_MAX_LENGTH, text.length()); + PartitionSpec.Builder builder = PartitionSpec.builderFor(schema); + + StringBuilder sb = new StringBuilder(); + boolean inTransform = false; + boolean inEscape = false; + boolean needBuild = false; + String transformPart = null; + String[] argParts = {null, null}; + int argIndex = 0; + for (int i = 0; i < text.length(); i++) { + char curr = text.charAt(i); + if (needBuild) { + 
Preconditions.checkArgument(curr == ' ' || curr == '|', + "Cannot parse partitioning text: end of bracket not followed by ' ' or '|' at position %s in %s", i, text); + } + + switch (curr) { + case '(': + Preconditions.checkArgument(!inTransform, + "Cannot parse partitioning text: found multiple ( at position %s in %s", i, text); + inTransform = true; + transformPart = sb.toString(); + sb.delete(0, sb.length()); + break; + case ')': Review comment: We might want to do some sanity check for ')' if we go with char by char parsing, since it looks like these three cases would currently pass but ideally they shouldn't: - `day(created|employee_info.employer` - `day(created|employee_info.employer)` - `day(created|employee_info.employer)|bucket(16,id` ########## File path: mr/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergConfigTextParser.java ########## @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iceberg.mr.hive; + +import org.apache.iceberg.AssertHelpers; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Types; +import org.junit.Assert; +import org.junit.Test; + +import static org.apache.iceberg.types.Types.NestedField.optional; + +public class TestHiveIcebergConfigTextParser { + + private static final Schema SCHEMA = new Schema( + optional(1, "id", Types.LongType.get()), + optional(2, "name", Types.StringType.get()), + optional(3, "employee_info", Types.StructType.of( + optional(6, "employer", Types.StringType.get()), + optional(7, "id", Types.LongType.get()), + optional(8, "address", Types.StringType.get()) + )), + optional(4, "created", Types.TimestampType.withoutZone()), + optional(5, "updated", Types.TimestampType.withoutZone()) + ); + + @Test + public void testInvalidInputText() { + AssertHelpers.assertThrows("should fail when input exceeding max allowed length", + IllegalArgumentException.class, + "Partition spec text too long: max allowed length 1000, but got 1024", + () -> HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, new String(new char[1024]).replace("\0", "s"))); + + AssertHelpers.assertThrows("should fail with multiple left brackets", + IllegalArgumentException.class, + "found multiple (", + () -> HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "bucket(col(col2))")); + + AssertHelpers.assertThrows("should fail with multiple right brackets", + IllegalArgumentException.class, + "end of bracket not followed by ' ' or '|'", + () -> HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "bucket(16,id))")); + + AssertHelpers.assertThrows("should fail with no delimiter between transforms", + IllegalArgumentException.class, + "end of bracket not followed by ' ' or '|'", + () -> HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "bucket(16,id)bucket(16,col)")); + + AssertHelpers.assertThrows("should fail with no delimiter between transform 
and column", + IllegalArgumentException.class, + "end of bracket not followed by ' ' or '|'", + () -> HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "bucket(16,id)name")); + + AssertHelpers.assertThrows("should fail with more than 2 transform arguments", + IllegalArgumentException.class, + "more than 2 transform arguments used", + () -> HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "bucket(1,2,3)")); + + AssertHelpers.assertThrows("should fail with wrong delimiter", + IllegalArgumentException.class, + "detected multiple args for identity transform", + () -> HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "name,id")); + } + + @Test + public void testParsingColumns() { + PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) + .identity("id").identity("name").identity("created").build(); + Assert.assertEquals(expected, HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "id|name|created")); + Assert.assertEquals("space on the right of pipe should not matter", + expected, HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "id| name|created")); + Assert.assertEquals("space on the left of pipe should not matter", + expected, HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "id |name|created")); + Assert.assertEquals("space on both sides of pipe should not matter", + expected, HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "id | name|created")); + } + + @Test + public void testParsingColumnsFailure() { + AssertHelpers.assertThrows("should fail when column does not exist", + IllegalArgumentException.class, + "Cannot find source column: col", + () -> HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "col")); + } + + @Test + public void testParsing2ArgTransforms() { + PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) + .bucket("name", 16).truncate("employee_info.address", 8).build(); + Assert.assertEquals(expected, HiveIcebergConfigTextParser.fromPartitioningText( + SCHEMA, 
"bucket(16,name)|truncate(employee_info.address,8)")); + Assert.assertEquals("space in args should not matter", + expected, HiveIcebergConfigTextParser.fromPartitioningText( + SCHEMA, "bucket( 16, name)| truncate(employee_info.address , 8 )")); + } + + @Test + public void testParsing2ArgTransformsFailure() { + AssertHelpers.assertThrows("should fail when transform not exist", + IllegalArgumentException.class, + "unknown 2-arg transform", + () -> HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "unknown(16,col)")); + + AssertHelpers.assertThrows("should fail when there is only 1 arg", + IllegalArgumentException.class, + "unknown 1-arg transform", + () -> HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "bucket(a)")); + + AssertHelpers.assertThrows("should fail when column does not exist", + IllegalArgumentException.class, + "Cannot find source column", + () -> HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "bucket(8, col)")); + + AssertHelpers.assertThrows("should fail when input to transform is wrong", + IllegalArgumentException.class, + "fail to parse integer argument in 2-arg transform", + () -> HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "bucket(name, 8)")); + } + + @Test + public void testParsing1ArgTransforms() { + PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) + .day("created").hour("updated").alwaysNull("id").build(); + Assert.assertEquals(expected, HiveIcebergConfigTextParser.fromPartitioningText( + SCHEMA, "day(created)|hour(updated)|alwaysNull(id)")); + Assert.assertEquals("space in args should not matter", + expected, HiveIcebergConfigTextParser.fromPartitioningText( + SCHEMA, "day(created )| hour( updated)| alwaysNull(id)")); + Assert.assertEquals("case sensitivity should not matter", + expected, HiveIcebergConfigTextParser.fromPartitioningText( + SCHEMA, "day(created)|hour(updated)|alwaysnull(id)")); + } + + @Test + public void testParsing1ArgTransformsFailure() { + AssertHelpers.assertThrows("should 
fail when transform not exist", + IllegalArgumentException.class, + "unknown 1-arg transform", + () -> HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "unknown(col)")); + + AssertHelpers.assertThrows("should fail when column does not exist", + IllegalArgumentException.class, + "Cannot find source column: col", + () -> HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "year(col)")); + + AssertHelpers.assertThrows("should treat additional arguments as plain text column and fail", + IllegalArgumentException.class, + "unknown 2-arg transform", + () -> HiveIcebergConfigTextParser.fromPartitioningText(SCHEMA, "year(8,col)")); + } + + @Test + public void textMixedTransforms() { + PartitionSpec expected = PartitionSpec.builderFor(SCHEMA) + .day("created") + .bucket("id", 16) + .identity("employee_info.employer") + .build(); + Assert.assertEquals(expected, HiveIcebergConfigTextParser.fromPartitioningText( + SCHEMA, "day(created)|bucket(16,id)|employee_info.employer")); + + expected = PartitionSpec.builderFor(SCHEMA) + .day("created") + .identity("employee_info.employer") + .bucket("id", 16) + .build(); + Assert.assertEquals(expected, HiveIcebergConfigTextParser.fromPartitioningText( + SCHEMA, "day(created)|employee_info.employer|bucket(16,id)")); + + expected = PartitionSpec.builderFor(SCHEMA) + .bucket("id", 16) + .identity("employee_info.employer") + .day("created") + .build(); + Assert.assertEquals(expected, HiveIcebergConfigTextParser.fromPartitioningText( + SCHEMA, "bucket(16,id)|employee_info.employer|day(created)")); + + expected = PartitionSpec.builderFor(SCHEMA) + .bucket("id", 16) + .day("created") + .identity("employee_info.employer") + .build(); + Assert.assertEquals(expected, HiveIcebergConfigTextParser.fromPartitioningText( + SCHEMA, "bucket(16,id)|day(created)|employee_info.employer")); + + expected = PartitionSpec.builderFor(SCHEMA) + .identity("employee_info.employer") + .day("created") + .bucket("id", 16) + .build(); + 
Assert.assertEquals(expected, HiveIcebergConfigTextParser.fromPartitioningText( + SCHEMA, "employee_info.employer|day(created)|bucket(16,id)")); + + expected = PartitionSpec.builderFor(SCHEMA) + .identity("employee_info.employer") + .bucket("id", 16) + .day("created") + .build(); + Assert.assertEquals(expected, HiveIcebergConfigTextParser.fromPartitioningText( + SCHEMA, "employee_info.employer|bucket(16,id)|day(created)")); + } + + @Test + public void testEscapedColumns() { + Schema schema = new Schema( + optional(1, "i|d", Types.LongType.get()), Review comment: minor: might want to add an additional case to test other special characters like ";" in column names. It looks like we currently don't require escaping them (e.g. `bucket(16,i;d)` works); I think it might be fine, but I'm not sure if we want to make things consistent -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
