akashrn5 commented on a change in pull request #4044: URL: https://github.com/apache/carbondata/pull/4044#discussion_r537288377
########## File path: integration/spark/src/main/scala/org/apache/carbondata/spark/rdd/CarbonDataRDDFactory.scala ########## @@ -577,38 +556,46 @@ object CarbonDataRDDFactory { LOGGER.info("Data load is successful for " + s"${ carbonLoadModel.getDatabaseName }.${ carbonLoadModel.getTableName }") } - - // code to handle Pre-Priming cache for loading - - if (!StringUtils.isEmpty(carbonLoadModel.getSegmentId)) { - DistributedRDDUtils.triggerPrepriming(sqlContext.sparkSession, carbonTable, Seq(), - operationContext, hadoopConf, List(carbonLoadModel.getSegmentId)) - } - try { - // compaction handling - if (carbonTable.isHivePartitionTable) { - carbonLoadModel.setFactTimeStamp(System.currentTimeMillis()) - } - val compactedSegments = new util.ArrayList[String]() - handleSegmentMerging(sqlContext, - carbonLoadModel - .getCopyWithPartition(carbonLoadModel.getCsvHeader, carbonLoadModel.getCsvDelimiter), - carbonTable, - compactedSegments, - operationContext) - carbonLoadModel.setMergedSegmentIds(compactedSegments) - writtenSegment - } catch { - case e: Exception => - LOGGER.error( - "Auto-Compaction has failed. 
Ignoring this exception because the" + - " load is passed.", e) - writtenSegment - } + isLoadingCommitted = true + writtenSegment } } finally { // Release the segment lock, once table status is finally updated segmentLock.unlock() + if (isLoadingCommitted) { + triggerEventsAfterLoading(sqlContext, carbonLoadModel, hadoopConf, operationContext) + } + } + } + + private def triggerEventsAfterLoading( + sqlContext: SQLContext, + carbonLoadModel: CarbonLoadModel, + hadoopConf: Configuration, + operationContext: OperationContext): Unit = { + val carbonTable = carbonLoadModel.getCarbonDataLoadSchema.getCarbonTable + // code to handle Pre-Priming cache for loading + if (!StringUtils.isEmpty(carbonLoadModel.getSegmentId)) { + DistributedRDDUtils.triggerPrepriming(sqlContext.sparkSession, carbonTable, Seq(), Review comment: Calling this twice will increase the time taken; it would be better to add logic to determine whether the segments were compacted and, based on that, send the segments for pre-priming only once. Also, in `DistributedRDDUtils.scala` at line 376, a new `SegmentUpdateStatusManager` is created but never used — it simply reads the table status file and the update status — so please check whether it can be removed. Just another suggestion for optimization. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org