vinothchandar commented on a change in pull request #623: Hudi Test Suite URL: https://github.com/apache/incubator-hudi/pull/623#discussion_r276877479
########## File path: hoodie-bench/src/main/java/com/uber/hoodie/integrationsuite/helpers/AvroReader.java ########## @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2019 Uber Technologies, Inc. ([email protected]) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * + */ + +package com.uber.hoodie.integrationsuite.helpers; + +import com.uber.hoodie.AvroConversionUtils; +import java.util.List; +import java.util.Optional; +import org.apache.avro.generic.GenericRecord; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import scala.collection.JavaConverters; + +/** + * Helper class to read avro files and generate an RDD of {@link GenericRecord} + */ +public class AvroReader { + + public static final String SPARK_AVRO_FORMAT = "com.databricks.spark.avro"; + private static final String AVRO_SCHEMA_OPTION_KEY = "avroSchema"; + private static final String DEFAULT_STRUCT_NAME = "test.struct"; + private static final String DEFAULT_NAMESPACE_NAME = "test.namespace"; + + // Spark anyways globs the path and gets all the paths in memory so take the List<filePaths> as an argument. 
+ // https://github.com/apache/spark/.../org/apache/spark/sql/execution/datasources/DataSource.scala#L251 + public static JavaRDD<GenericRecord> read(SparkSession sparkSession, String schemaStr, List<String> listOfPaths, + Optional<String> structName, Optional<String> nameSpace) { + + Dataset<Row> dataSet = sparkSession.read() Review comment: Following on from the previous comment: can we just use sparkSession.write() to write out to the sink? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected] With regards, Apache Git Services
