[jira] [Commented] (FLINK-7057) move BLOB ref-counting from LibraryCacheManager to BlobCache

ASF GitHub Bot (JIRA) Thu, 17 Aug 2017 03:19:12 -0700

    [ https://issues.apache.org/jira/browse/FLINK-7057?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16130202#comment-16130202 ]


ASF GitHub Bot commented on FLINK-7057:
---------------------------------------

Github user tillrohrmann commented on a diff in the pull request:

    https://github.com/apache/flink/pull/4238#discussion_r133673650
  
    --- Diff: 
flink-runtime/src/test/java/org/apache/flink/runtime/jobmanager/JobManagerCleanupITCase.java
 ---
    @@ -0,0 +1,298 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *     http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.flink.runtime.jobmanager;
    +
    +import akka.actor.ActorSystem;
    +import akka.testkit.JavaTestKit;
    +import org.apache.flink.api.common.JobID;
    +import org.apache.flink.configuration.AkkaOptions;
    +import org.apache.flink.configuration.BlobServerOptions;
    +import org.apache.flink.configuration.ConfigConstants;
    +import org.apache.flink.configuration.Configuration;
    +import org.apache.flink.runtime.akka.AkkaUtils;
    +import org.apache.flink.runtime.akka.ListeningBehaviour;
    +import org.apache.flink.runtime.blob.BlobClient;
    +import org.apache.flink.runtime.blob.BlobKey;
    +import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
    +import org.apache.flink.runtime.instance.ActorGateway;
    +import org.apache.flink.runtime.instance.AkkaActorGateway;
    +import org.apache.flink.runtime.jobgraph.JobGraph;
    +import org.apache.flink.runtime.jobgraph.JobVertex;
    +import org.apache.flink.runtime.messages.JobManagerMessages;
    +import org.apache.flink.runtime.testingUtils.TestingCluster;
    +import org.apache.flink.runtime.testingUtils.TestingUtils;
    +import org.apache.flink.runtime.testtasks.FailingBlockingInvokable;
    +import org.apache.flink.runtime.testtasks.NoOpInvokable;
    +import org.junit.AfterClass;
    +import org.junit.BeforeClass;
    +import org.junit.Rule;
    +import org.junit.Test;
    +import org.junit.rules.TemporaryFolder;
    +import scala.concurrent.Await;
    +import scala.concurrent.Future;
    +import scala.concurrent.duration.FiniteDuration;
    +
    +import java.io.File;
    +import java.io.FilenameFilter;
    +import java.io.IOException;
    +import java.net.InetSocketAddress;
    +import java.util.Arrays;
    +
    +import static 
org.apache.flink.runtime.testingUtils.TestingUtils.DEFAULT_AKKA_ASK_TIMEOUT;
    +import static org.junit.Assert.assertArrayEquals;
    +import static org.junit.Assert.assertNotNull;
    +import static org.junit.Assert.fail;
    +
    +/**
    + * Small test to check that the {@link 
org.apache.flink.runtime.blob.BlobServer} cleanup is executed
    + * after job termination.
    + */
    +public class JobManagerCleanupITCase {
    +
    +   @Rule
    +   public TemporaryFolder tmpFolder = new TemporaryFolder();
    +
    +   private static ActorSystem system;
    +
    +   @BeforeClass
    +   public static void setup() {
    +           system = AkkaUtils.createLocalActorSystem(new Configuration());
    +   }
    +
    +   @AfterClass
    +   public static void teardown() {
    +           JavaTestKit.shutdownActorSystem(system);
    +   }
    +
    +   /**
    +    * Specifies which test case to run in {@link 
#testBlobServerCleanup(TestCase)}.
    +    */
    +   private enum TestCase {
    +           JOB_FINISHES_SUCESSFULLY,
    +           JOB_IS_CANCELLED,
    +           JOB_FAILS,
    +           JOB_SUBMISSION_FAILS
    +   }
    +
    +   /**
    +    * Test cleanup for a job that finishes ordinarily.
    +    */
    +   @Test
    +   public void testBlobServerCleanupFinishedJob() throws IOException {
    +           testBlobServerCleanup(TestCase.JOB_FINISHES_SUCESSFULLY);
    +   }
    +
    +   /**
    +    * Test cleanup for a job which is cancelled after submission.
    +    */
    +   @Test
    +   public void testBlobServerCleanupCancelledJob() throws IOException {
    +           testBlobServerCleanup(TestCase.JOB_IS_CANCELLED);
    +   }
    +
    +   /**
    +    * Test cleanup for a job that fails (first a task fails, then the job 
recovers, then the whole
    +    * job fails due to a limited restart policy).
    +    */
    +   @Test
    +   public void testBlobServerCleanupFailedJob() throws IOException {
    +           testBlobServerCleanup(TestCase.JOB_FAILS);
    +   }
    +
    +   /**
    +    * Test cleanup for a job that fails job submission (emulated by an 
additional BLOB not being
    +    * present).
    +    */
    +   @Test
    +   public void testBlobServerCleanupFailedSubmission() throws IOException {
    +           testBlobServerCleanup(TestCase.JOB_SUBMISSION_FAILS);
    +   }
    +
    +   private void testBlobServerCleanup(final TestCase testCase) throws 
IOException {
    +           final int num_tasks = 2;
    +           final File blobBaseDir = tmpFolder.newFolder();
    +
    +           new JavaTestKit(system) {{
    +                   new Within(duration("30 seconds")) {
    +                           @Override
    +                           protected void run() {
    +                                   // Setup
    +
    +                                   TestingCluster cluster = null;
    +                                   BlobClient bc = null;
    +
    +                                   try {
    +                                           Configuration config = new 
Configuration();
    +                                           
config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 2);
    +                                           
config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);
    +                                           
config.setString(AkkaOptions.ASK_TIMEOUT, DEFAULT_AKKA_ASK_TIMEOUT());
    +                                           
config.setString(BlobServerOptions.STORAGE_DIRECTORY, 
blobBaseDir.getAbsolutePath());
    +
    +                                           
config.setString(ConfigConstants.RESTART_STRATEGY, "fixeddelay");
    +                                           
config.setInteger(ConfigConstants.RESTART_STRATEGY_FIXED_DELAY_ATTEMPTS, 1);
    +                                           
config.setString(ConfigConstants.RESTART_STRATEGY_FIXED_DELAY_DELAY, "1 s");
    +                                           // BLOBs are deleted from 
BlobCache between 1s and 2s after last reference
    +                                           // -> the BlobCache may still 
have the BLOB or not (let's test both cases randomly)
    +                                           
config.setLong(BlobServerOptions.CLEANUP_INTERVAL, 1L);
    +
    +                                           cluster = new 
TestingCluster(config);
    +                                           cluster.start();
    +
    +                                           final ActorGateway 
jobManagerGateway = cluster.getLeaderGateway(
    +                                                   
TestingUtils.TESTING_DURATION());
    +
    +                                           // we can set the leader 
session ID to None because we don't use this gateway to send messages
    +                                           final ActorGateway 
testActorGateway = new AkkaActorGateway(getTestActor(),
    +                                                   
HighAvailabilityServices.DEFAULT_LEADER_ID);
    +
    +                                           // Create a task
    +
    +                                           JobVertex source = new 
JobVertex("Source");
    +                                           if (testCase == 
TestCase.JOB_FAILS || testCase == TestCase.JOB_IS_CANCELLED) {
    +                                                   
source.setInvokableClass(FailingBlockingInvokable.class);
    +                                           } else {
    +                                                   
source.setInvokableClass(NoOpInvokable.class);
    +                                           }
    +                                           
source.setParallelism(num_tasks);
    +
    +                                           JobGraph jobGraph = new 
JobGraph("BlobCleanupTest", source);
    +                                           final JobID jid = 
jobGraph.getJobID();
    +
    +                                           // request the blob port from 
the job manager
    +                                           Future<Object> future = 
jobManagerGateway
    +                                                   
.ask(JobManagerMessages.getRequestBlobManagerPort(), remaining());
    +                                           int blobPort = (Integer) 
Await.result(future, remaining());
    +
    +                                           // upload a blob
    +                                           BlobKey key1;
    +                                           bc = new BlobClient(new 
InetSocketAddress("localhost", blobPort),
    +                                                   config);
    +                                           try {
    +                                                   key1 = bc.put(jid, new 
byte[10]);
    +                                           } finally {
    +                                                   bc.close();
    +                                           }
    +                                           jobGraph.addBlob(key1);
    +
    +                                           if (testCase == 
TestCase.JOB_SUBMISSION_FAILS) {
    +                                                   // add an invalid key 
so that the submission fails
    +                                                   jobGraph.addBlob(new 
BlobKey());
    +                                           }
    +
    +                                           // Submit the job and wait for 
all vertices to be running
    +                                           jobManagerGateway.tell(
    +                                                   new 
JobManagerMessages.SubmitJob(
    +                                                           jobGraph,
    +                                                           
ListeningBehaviour.EXECUTION_RESULT),
    +                                                   testActorGateway);
    +                                           if (testCase == 
TestCase.JOB_SUBMISSION_FAILS) {
    +                                                   
expectMsgClass(JobManagerMessages.JobResultFailure.class);
    +                                           } else {
    +                                                   
expectMsgClass(JobManagerMessages.JobSubmitSuccess.class);
    +
    +                                                   if (testCase == 
TestCase.JOB_FAILS) {
    +                                                           // fail a task 
so that the job is going to be recovered (we actually do not
    +                                                           // need the 
blocking part of the invokable and can start throwing right away)
    +                                                           
FailingBlockingInvokable.unblock();
    +
    +                                                           // job will get 
restarted, BlobCache may re-download the BLOB if already deleted
    +                                                           // then the 
tasks will fail again and the restart strategy will finalise the job
    +
    +                                                           
expectMsgClass(JobManagerMessages.JobResultFailure.class);
    +                                                   } else if (testCase == 
TestCase.JOB_IS_CANCELLED) {
    +                                                           
jobManagerGateway.tell(
    +                                                                   new 
JobManagerMessages.CancelJob(jid),
    +                                                                   
testActorGateway);
    +                                                           
expectMsgClass(JobManagerMessages.CancellationResponse.class);
    +
    +                                                           // job will be 
cancelled and everything should be cleaned up
    +
    +                                                           
expectMsgClass(JobManagerMessages.JobResultFailure.class);
    +                                                   } else {
    +                                                           
expectMsgClass(JobManagerMessages.JobResultSuccess.class);
    +                                                   }
    +                                           }
    +
    +                                           // both BlobServer and 
BlobCache should eventually delete all files
    +
    +                                           File[] blobDirs = 
blobBaseDir.listFiles(new FilenameFilter() {
    +                                                   @Override
    +                                                   public boolean 
accept(File dir, String name) {
    +                                                           return 
name.startsWith("blobStore-");
    +                                                   }
    +                                           });
    +                                           assertNotNull(blobDirs);
    +                                           for (File blobDir : blobDirs) {
    +                                                   
waitForEmptyBlobDir(blobDir, remaining());
    +                                           }
    +
    +                                   } catch (Exception e) {
    +                                           e.printStackTrace();
    +                                           fail(e.getMessage());
    --- End diff --
    
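    Shall we let this exception simply bubble up instead of catching it, printing the stack trace, and calling `fail(e.getMessage())`? That way the test would fail with the original exception and its full stack trace.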


> move BLOB ref-counting from LibraryCacheManager to BlobCache
> ------------------------------------------------------------
>
>                 Key: FLINK-7057
>                 URL: https://issues.apache.org/jira/browse/FLINK-7057
>             Project: Flink
>          Issue Type: Sub-task
>          Components: Distributed Coordination, Network
>    Affects Versions: 1.4.0
>            Reporter: Nico Kruber
>            Assignee: Nico Kruber
>
> Currently, the {{LibraryCacheManager}} is doing some ref-counting for JAR 
> files managed by it. Instead, we want the {{BlobCache}} to do that itself for 
> all job-related BLOBs. Also, we do not want to operate on a per-{{BlobKey}} 
> level but rather per job. Therefore, the cleanup process should be adapted, 
> too.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

[jira] [Commented] (FLINK-7057) move BLOB ref-counting from LibraryCacheManager to BlobCache

Reply via email to