[ https://issues.apache.org/jira/browse/HIVE-24718?focusedWorklogId=555925&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-555925 ]
ASF GitHub Bot logged work on HIVE-24718: ----------------------------------------- Author: ASF GitHub Bot Created on: 22/Feb/21 18:02 Start Date: 22/Feb/21 18:02 Worklog Time Spent: 10m Work Description: ArkoSharma commented on a change in pull request #1936: URL: https://github.com/apache/hive/pull/1936#discussion_r580465458 ########## File path: ql/src/java/org/apache/hadoop/hive/ql/exec/repl/util/FileList.java ########## @@ -18,158 +18,176 @@ package org.apache.hadoop.hive.ql.exec.repl.util; -import com.google.common.annotations.VisibleForTesting; +import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.ErrorMsg; +import org.apache.hadoop.hive.ql.exec.util.Retryable; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; +import java.io.StringWriter; +import java.io.UncheckedIOException; import java.util.Iterator; import java.util.NoSuchElementException; -import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.Callable; - -/** - * A file backed list of Strings which is in-memory till the threshold. 
- */ public class FileList implements AutoCloseable, Iterator<String> { private static final Logger LOG = LoggerFactory.getLogger(FileList.class); - private static int fileListStreamerID = 0; - private static final String FILE_LIST_STREAMER_PREFIX = "file-list-streamer-"; - - private LinkedBlockingQueue<String> cache; - private volatile boolean thresholdHit = false; - private int thresholdPoint; - private float thresholdFactor = 0.9f; - private Path backingFile; - private FileListStreamer fileListStreamer; - private String nextElement; - private boolean noMoreElement; + private final Path backingFile; + private String nextElement = null; private HiveConf conf; + private volatile boolean retryMode; private BufferedReader backingFileReader; + private volatile FSDataOutputStream backingFileWriter; - - public FileList(Path backingFile, int cacheSize, HiveConf conf) { + public FileList(Path backingFile, HiveConf conf) { this.backingFile = backingFile; this.conf = conf; - if (cacheSize > 0) { - // Cache size must be > 0 for this list to be used for the write operation. - this.cache = new LinkedBlockingQueue<>(cacheSize); - fileListStreamer = new FileListStreamer(cache, backingFile, conf); - thresholdPoint = getThreshold(cacheSize); - LOG.debug("File list backed by {} can be used for write operation.", backingFile); - } else { - thresholdHit = true; + this.retryMode = false; + } + + public void add(String entry) throws IOException { + Retryable retryable = buildRetryable(); + try { + retryable.executeCallable((Callable<Void>) ()-> { + synchronized (backingFile) { + try{ + if (backingFileWriter == null) { + backingFileWriter = initWriter(); + } + backingFileWriter.writeBytes(getEntryWithNewline(entry)); + backingFileWriter.hflush(); Review comment: This patch assumes that each flush operation either writes data completely (including ack) or does not write it at all. 
If this assumption changes, then further investigation may be needed to determine the boundaries of each entry that is written to the file. Consequently, checks for duplicate or invalid entries are currently not performed. Hflush guarantees that, after a successful return, every new reader will see the flushed data. This guarantee seems sufficient, as we do not perform reads during writes. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking ------------------- Worklog Id: (was: 555925) Time Spent: 4h (was: 3h 50m) > Moving to file based iteration for copying data > ----------------------------------------------- > > Key: HIVE-24718 > URL: https://issues.apache.org/jira/browse/HIVE-24718 > Project: Hive > Issue Type: Bug > Reporter: Arko Sharma > Assignee: Arko Sharma > Priority: Major > Labels: pull-request-available > Attachments: HIVE-24718.01.patch, HIVE-24718.02.patch, > HIVE-24718.04.patch > > Time Spent: 4h > Remaining Estimate: 0h > -- This message was sent by Atlassian Jira (v8.3.4#803005)