wzx140 commented on code in PR #6745: URL: https://github.com/apache/hudi/pull/6745#discussion_r981303513
########## hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/HoodieSparkRecordSerializer.scala: ########## @@ -38,25 +40,20 @@ import scala.collection.mutable * schema, as to reduce network IO. * Actions like parsing or compressing schemas are computationally expensive so the serializer * caches all previously seen values as to reduce the amount of work needed to do. - * @param schemas a map where the keys are unique IDs for spark schemas and the values are the - * string representation of the Avro schema, used to decrease the amount of data - * that needs to be serialized. */ -class SparkStructTypeSerializer(schemas: Map[Long, StructType]) extends KSerializer[HoodieSparkRecord] { +class HoodieSparkRecordSerializer extends KSerializer[HoodieSparkRecord] { /** Used to reduce the amount of effort to compress the schema */ private val compressCache = new mutable.HashMap[StructType, Array[Byte]]() private val decompressCache = new mutable.HashMap[ByteBuffer, StructType]() - /** Fingerprinting is very expensive so this alleviates most of the work */ - private val fingerprintCache = new mutable.HashMap[StructType, Long]() - private val schemaCache = new mutable.HashMap[Long, StructType]() - // GenericAvroSerializer can't take a SparkConf in the constructor b/c then it would become // a member of KryoSerializer, which would make KryoSerializer not Serializable. We make // the codec lazy here just b/c in some unit tests, we use a KryoSerializer w/out having // the SparkEnv set (note those tests would fail if they tried to serialize avro data). private lazy val codec = CompressionCodec.createCodec(SparkEnv.get.conf) + private var objSerializerMap = new ConcurrentHashMap[Kryo, FieldSerializer[HoodieSparkRecord]] Review Comment: The current implementation works as follows: it writes the fingerprint first, and then serializes the HoodieSparkRecord (without its schema) with Kryo's default FieldSerializer. 
The original implementation used the Java serializer, which performs poorly. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org