tgravescs commented on a change in pull request #30062: URL: https://github.com/apache/spark/pull/30062#discussion_r511095790
########## File path: common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java ########## @@ -363,4 +363,26 @@ public boolean useOldFetchProtocol() { return conf.getBoolean("spark.shuffle.useOldFetchProtocol", false); } + /** + * The minimum size of a chunk when dividing a merged shuffle file into multiple chunks during + * push-based shuffle. + * A merged shuffle file consists of multiple small shuffle blocks. Fetching the + * complete merged shuffle file in a single response increases the memory requirements for the + * clients. Instead of serving the entire merged file, the shuffle service serves the + * merged file in `chunks`. A `chunk` constitutes few shuffle blocks in entirety and this + * configuration controls how big a chunk can get. A corresponding index file for each merged + * shuffle file will be generated indicating chunk boundaries. + */ + public int minChunkSizeInMergedShuffleFile() { + return Ints.checkedCast(JavaUtils.byteStringAsBytes( + conf.get("spark.shuffle.server.minChunkSizeInMergedShuffleFile", "2m"))); + } + + /** + * The size of cache used in push-based shuffle for storing merged index files. Review comment: we should specify if cache is memory or disk. ########## File path: common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java ########## @@ -0,0 +1,899 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network.shuffle; + +import java.io.BufferedOutputStream; +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executor; +import java.util.concurrent.Executors; + +import com.google.common.base.Objects; +import com.google.common.base.Preconditions; +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; +import com.google.common.cache.Weigher; +import com.google.common.collect.Maps; +import com.google.common.primitives.Ints; +import com.google.common.primitives.Longs; +import io.netty.buffer.ByteBuf; +import io.netty.buffer.Unpooled; +import org.roaringbitmap.RoaringBitmap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.spark.network.buffer.FileSegmentManagedBuffer; +import org.apache.spark.network.buffer.ManagedBuffer; +import org.apache.spark.network.client.StreamCallbackWithID; +import org.apache.spark.network.protocol.Encoders; +import org.apache.spark.network.shuffle.protocol.FinalizeShuffleMerge; +import 
org.apache.spark.network.shuffle.protocol.MergeStatuses; +import org.apache.spark.network.shuffle.protocol.PushBlockStream; +import org.apache.spark.network.util.JavaUtils; +import org.apache.spark.network.util.NettyUtils; +import org.apache.spark.network.util.TransportConf; + +/** + * An implementation of {@link MergedShuffleFileManager} that provides the most essential shuffle + * service processing logic to support push based shuffle. + */ +public class RemoteBlockPushResolver implements MergedShuffleFileManager { + + private static final Logger logger = LoggerFactory.getLogger(RemoteBlockPushResolver.class); + private static final String MERGE_MANAGER_DIR = "merge_manager"; + + private final ConcurrentMap<String, AppPathsInfo> appsPathInfo; + private final ConcurrentMap<AppShufflePartitionId, AppShufflePartitionInfo> partitions; + + private final Executor directoryCleaner; + private final TransportConf conf; + private final int minChunkSize; + private final String relativeMergeDirPathPattern; + private final ErrorHandler.BlockPushErrorHandler errorHandler; + + @SuppressWarnings("UnstableApiUsage") + private final LoadingCache<File, ShuffleIndexInformation> indexCache; + + @SuppressWarnings("UnstableApiUsage") + public RemoteBlockPushResolver(TransportConf conf, String relativeMergeDirPathPattern) { + this.conf = conf; + this.partitions = Maps.newConcurrentMap(); + this.appsPathInfo = Maps.newConcurrentMap(); + this.directoryCleaner = Executors.newSingleThreadExecutor( + // Add `spark` prefix because it will run in NM in Yarn mode. 
+ NettyUtils.createThreadFactory("spark-shuffle-merged-shuffle-directory-cleaner")); + this.minChunkSize = conf.minChunkSizeInMergedShuffleFile(); + CacheLoader<File, ShuffleIndexInformation> indexCacheLoader = + new CacheLoader<File, ShuffleIndexInformation>() { + public ShuffleIndexInformation load(File file) throws IOException { + return new ShuffleIndexInformation(file); + } + }; + indexCache = CacheBuilder.newBuilder() + .maximumWeight(conf.mergedIndexCacheSize()) + .weigher((Weigher<File, ShuffleIndexInformation>) (file, indexInfo) -> indexInfo.getSize()) + .build(indexCacheLoader); + this.relativeMergeDirPathPattern = relativeMergeDirPathPattern; + this.errorHandler = new ErrorHandler.BlockPushErrorHandler(); + } + + /** + * Given an ID that uniquely identifies a given shuffle partition of an application, retrieves the + * associated metadata. If not present and the corresponding merged shuffle does not exist, + * initializes the metadata. + */ + private AppShufflePartitionInfo getOrCreateAppShufflePartitionInfo(AppShufflePartitionId id) { + return partitions.computeIfAbsent(id, key -> { + // It only gets here when the key is not present in the map. This could either + // be the first time the merge manager receives a pushed block for a given application + // shuffle partition, or after the merged shuffle file is finalized. We handle these + // two cases accordingly by checking if the file already exists. 
+ File dataFile = getMergedShuffleDataFile(id); + File indexFile = getMergedShuffleIndexFile(id); + File metaFile = getMergedShuffleMetaFile(id); + try { + if (dataFile.exists()) { + return null; + } else { + return new AppShufflePartitionInfo(id, dataFile, indexFile, metaFile); + } + } catch (IOException e) { + logger.error( + "Cannot create merged shuffle partition {} with shuffle file {}, index file {}, and " + + "meta file {}", key, indexFile.getAbsolutePath(), + indexFile.getAbsolutePath(), metaFile.getAbsolutePath()); + throw new RuntimeException( + String.format("Cannot initialize merged shuffle partition %s", key.toString()), e); + } + }); + } + + @Override + public MergedBlockMeta getMergedBlockMeta(String appId, int shuffleId, int reduceId) { + AppShufflePartitionId id = new AppShufflePartitionId(appId, shuffleId, reduceId); + File indexFile = getMergedShuffleIndexFile(id); + if (!indexFile.exists()) { + throw new RuntimeException(String.format( + "Merged shuffle index file %s of %s not found", indexFile.getPath(), id.toString())); + } + int size = (int) indexFile.length(); + // First entry is the zero offset + int numChunks = (size / Long.BYTES) - 1; + File metaFile = getMergedShuffleMetaFile(id); + if (!metaFile.exists()) { + throw new RuntimeException(String.format("Merged shuffle meta file %s of %s not found", + metaFile.getPath(), id.toString())); + } + FileSegmentManagedBuffer chunkBitMaps = + new FileSegmentManagedBuffer(conf, metaFile, 0L, metaFile.length()); + logger.trace( + "{} shuffleId {} reduceId {} num chunks {}", appId, shuffleId, reduceId, numChunks); + return new MergedBlockMeta(numChunks, chunkBitMaps); + } + + @SuppressWarnings("UnstableApiUsage") + @Override + public ManagedBuffer getMergedBlockData(String appId, int shuffleId, int reduceId, int chunkId) { + AppShufflePartitionId id = new AppShufflePartitionId(appId, shuffleId, reduceId); + File dataFile = getMergedShuffleDataFile(id); + if (!dataFile.exists()) { + throw new 
RuntimeException(String.format("Merged shuffle data file %s of %s not found", + dataFile.getPath(), id.toString())); + } + File indexFile = getMergedShuffleIndexFile(id); + try { + // If we get here, the merged shuffle file should have been properly finalized. Thus we can + // use the file length to determine the size of the merged shuffle block. + ShuffleIndexInformation shuffleIndexInformation = indexCache.get(indexFile); + ShuffleIndexRecord shuffleIndexRecord = shuffleIndexInformation.getIndex(chunkId); + return new FileSegmentManagedBuffer( + conf, dataFile, shuffleIndexRecord.getOffset(), shuffleIndexRecord.getLength()); + } catch (ExecutionException e) { + throw new RuntimeException(String.format( + "Failed to open merged shuffle index file %s of %s", indexFile.getPath(), id.toString()), + e); + } + } + + /** + * The logic here is consistent with + * org.apache.spark.storage.DiskBlockManager#getMergedShuffleFile + */ + // TODO should we use subDirsPerLocalDir to potentially reduce inode size? + private File getFile(String appId, String filename) { + int hash = JavaUtils.nonNegativeHash(filename); + // TODO: Change the message when this service is able to handle NM restart Review comment: Is there a follow-on JIRA to do this? What is the exact behavior on restart then? ########## File path: common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java ########## @@ -172,7 +178,9 @@ protected void serviceInit(Configuration conf) throws Exception { } TransportConf transportConf = new TransportConf("shuffle", new HadoopConfigProvider(conf)); - blockHandler = new ExternalBlockHandler(transportConf, registeredExecutorFile); + shuffleMergeManager = new RemoteBlockPushResolver(transportConf, APP_BASE_RELATIVE_PATH); Review comment: Yeah, I think that is the question - are there any side effects that I don't want if I'm not using it? We are registering applications and executors with it even if it is not in use. 
I assume that will at least use a bit more memory. Personally, I think it's safer to have a way to totally turn it off. I think we could leave that config unpublished, though really I think all of them are that way. This also allows someone to turn it off in case they don't want clients to be able to do it at all. I was also wondering if someone might want to have their own custom implementation for this. So if we made it configurable such that they could provide a class, that would also be supported. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org