HDFS-8722. Optimize datanode writes for small writes and flushes. Contributed by Kihwal Lee
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/59388a80 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/59388a80 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/59388a80 Branch: refs/heads/HDFS-7240 Commit: 59388a801514d6af64ef27fbf246d8054f1dcc74 Parents: b7fb6ec Author: Kihwal Lee <kih...@apache.org> Authored: Tue Jul 14 14:04:06 2015 -0500 Committer: Kihwal Lee <kih...@apache.org> Committed: Tue Jul 14 14:04:06 2015 -0500 ---------------------------------------------------------------------- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 2 ++ .../hdfs/server/datanode/BlockReceiver.java | 34 +++++++++++++------- 2 files changed, 24 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/59388a80/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 86b1ea1..14f3403 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -1053,6 +1053,8 @@ Release 2.7.2 - UNRELEASED OPTIMIZATIONS + HDFS-8722. 
Optimize datanode writes for small writes and flushes (kihwal) + BUG FIXES Release 2.7.1 - 2015-07-06 http://git-wip-us.apache.org/repos/asf/hadoop/blob/59388a80/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java index 2468f43..55c9d57 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java @@ -598,14 +598,19 @@ class BlockReceiver implements Closeable { // bytes should be skipped when writing the data and checksum // buffers out to disk. long partialChunkSizeOnDisk = onDiskLen % bytesPerChecksum; + long lastChunkBoundary = onDiskLen - partialChunkSizeOnDisk; boolean alignedOnDisk = partialChunkSizeOnDisk == 0; boolean alignedInPacket = firstByteInBlock % bytesPerChecksum == 0; - // Since data is always appended, not overwritten, partial CRC - // recalculation is necessary if the on-disk data is not chunk- - // aligned, regardless of whether the beginning of the data in - // the packet is chunk-aligned. - boolean doPartialCrc = !alignedOnDisk && !shouldNotWriteChecksum; + // If the end of the on-disk data is not chunk-aligned, the last + // checksum needs to be overwritten. + boolean overwriteLastCrc = !alignedOnDisk && !shouldNotWriteChecksum; + // If the starting offset of the packet data is at the last chunk + // boundary of the data on disk, the partial checksum recalculation + // can be skipped and the checksum supplied by the client can be used + // instead. This reduces disk reads and cpu load. 
+ boolean doCrcRecalc = overwriteLastCrc && + (lastChunkBoundary != firstByteInBlock); // If this is a partial chunk, then verify that this is the only // chunk in the packet. If the starting offset is not chunk @@ -621,9 +626,10 @@ class BlockReceiver implements Closeable { // If the last portion of the block file is not a full chunk, // then read in pre-existing partial data chunk and recalculate // the checksum so that the checksum calculation can continue - // from the right state. + // from the right state. If the client provided the checksum for + // the whole chunk, this is not necessary. Checksum partialCrc = null; - if (doPartialCrc) { + if (doCrcRecalc) { if (LOG.isDebugEnabled()) { LOG.debug("receivePacket for " + block + ": previous write did not end at the chunk boundary." @@ -659,8 +665,15 @@ class BlockReceiver implements Closeable { int skip = 0; byte[] crcBytes = null; - // First, overwrite the partial crc at the end, if necessary. - if (doPartialCrc) { // not chunk-aligned on disk + // First, prepare to overwrite the partial crc at the end. + if (overwriteLastCrc) { // not chunk-aligned on disk + // prepare to overwrite last checksum + adjustCrcFilePosition(); + } + + // The CRC was recalculated for the last partial chunk. Update the + // CRC by reading the rest of the chunk, then write it out. + if (doCrcRecalc) { // Calculate new crc for this chunk. int bytesToReadForRecalc = (int)(bytesPerChecksum - partialChunkSizeOnDisk); @@ -673,8 +686,6 @@ class BlockReceiver implements Closeable { byte[] buf = FSOutputSummer.convertToByteStream(partialCrc, checksumSize); crcBytes = copyLastChunkChecksum(buf, checksumSize, buf.length); - // prepare to overwrite last checksum - adjustCrcFilePosition(); checksumOut.write(buf); if(LOG.isDebugEnabled()) { LOG.debug("Writing out partial crc for data len " + len + @@ -687,7 +698,6 @@ class BlockReceiver implements Closeable { // boundary. The checksum after the boundary was already counted // above. 
Only count the number of checksums skipped up to the // boundary here. - long lastChunkBoundary = onDiskLen - (onDiskLen%bytesPerChecksum); long skippedDataBytes = lastChunkBoundary - firstByteInBlock; if (skippedDataBytes > 0) {