zhijiangW commented on a change in pull request #9984: [FLINK-9495][kubernetes] 
Implement ResourceManager for Kubernetes.
URL: https://github.com/apache/flink/pull/9984#discussion_r354707730
 
 

 ##########
 File path: 
flink-kubernetes/src/test/java/org/apache/flink/kubernetes/KubernetesResourceManagerTest.java
 ##########
 @@ -0,0 +1,372 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.kubernetes;
+
+import org.apache.flink.api.common.JobID;
+import org.apache.flink.api.common.time.Time;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.configuration.TaskManagerOptions;
+import org.apache.flink.kubernetes.configuration.KubernetesConfigOptions;
+import org.apache.flink.kubernetes.kubeclient.FlinkKubeClient;
+import org.apache.flink.kubernetes.kubeclient.TaskManagerPodParameter;
+import org.apache.flink.kubernetes.kubeclient.resources.KubernetesPod;
+import org.apache.flink.kubernetes.utils.Constants;
+import org.apache.flink.runtime.clusterframework.types.AllocationID;
+import org.apache.flink.runtime.clusterframework.types.ResourceID;
+import org.apache.flink.runtime.clusterframework.types.ResourceProfile;
+import org.apache.flink.runtime.clusterframework.types.SlotID;
+import org.apache.flink.runtime.entrypoint.ClusterInformation;
+import org.apache.flink.runtime.heartbeat.HeartbeatServices;
+import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
+import org.apache.flink.runtime.instance.HardwareDescription;
+import org.apache.flink.runtime.messages.Acknowledge;
+import org.apache.flink.runtime.metrics.groups.ResourceManagerMetricGroup;
+import org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups;
+import org.apache.flink.runtime.registration.RegistrationResponse;
+import org.apache.flink.runtime.resourcemanager.JobLeaderIdService;
+import org.apache.flink.runtime.resourcemanager.ResourceManagerGateway;
+import org.apache.flink.runtime.resourcemanager.SlotRequest;
+import org.apache.flink.runtime.resourcemanager.slotmanager.SlotManager;
+import 
org.apache.flink.runtime.resourcemanager.utils.MockResourceManagerRuntimeServices;
+import org.apache.flink.runtime.rpc.FatalErrorHandler;
+import org.apache.flink.runtime.rpc.RpcService;
+import org.apache.flink.runtime.rpc.TestingRpcService;
+import org.apache.flink.runtime.taskexecutor.SlotReport;
+import org.apache.flink.runtime.taskexecutor.SlotStatus;
+import org.apache.flink.runtime.taskexecutor.TaskExecutorGateway;
+import org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationSuccess;
+import org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder;
+import org.apache.flink.runtime.util.TestingFatalErrorHandler;
+
+import io.fabric8.kubernetes.api.model.Container;
+import io.fabric8.kubernetes.api.model.ContainerStateBuilder;
+import io.fabric8.kubernetes.api.model.ContainerStatusBuilder;
+import io.fabric8.kubernetes.api.model.EnvVarBuilder;
+import io.fabric8.kubernetes.api.model.Pod;
+import io.fabric8.kubernetes.api.model.PodList;
+import io.fabric8.kubernetes.api.model.PodStatusBuilder;
+import io.fabric8.kubernetes.client.KubernetesClient;
+import org.hamcrest.Matchers;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.UUID;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CompletableFuture;
+import java.util.stream.Collectors;
+
+import static junit.framework.TestCase.assertEquals;
+import static org.hamcrest.Matchers.instanceOf;
+import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Test cases for {@link KubernetesResourceManager}.
+ */
+public class KubernetesResourceManagerTest extends KubernetesTestBase {
+
+       private static final Time TIMEOUT = Time.seconds(10L);
+
+       private TestingFatalErrorHandler testingFatalErrorHandler;
+
+       private final String jobManagerHost = "jm-host1";
+
+       private Configuration flinkConfig;
+
+       private TestingKubernetesResourceManager resourceManager;
+
+       private FlinkKubeClient flinkKubeClient;
+
+       /**
+        * Builds the per-test fixtures: a fatal error handler, a configuration created from
+        * {@code FLINK_CONFIG} with the total process memory set to {@code 1024m}, the kube
+        * client, and finally a started resource manager.
+        */
+       @Before
+       public void setup() throws Exception {
+               testingFatalErrorHandler = new TestingFatalErrorHandler();
+
+               final Configuration testConfig = new Configuration(FLINK_CONFIG);
+               testConfig.setString(TaskManagerOptions.TOTAL_PROCESS_MEMORY, "1024m");
+               flinkConfig = testConfig;
+
+               // The kube client must be in place before the resource manager is created,
+               // because the testing resource manager returns this field from createFlinkKubeClient().
+               flinkKubeClient = getFabric8FlinkKubeClient();
+               resourceManager = createAndStartResourceManager(testConfig);
+       }
+
+       /**
+        * Calls {@code rethrowError()} on the fatal error handler and closes the resource manager.
+        *
+        * <p>The close happens in a {@code finally} block so that the resource manager is
+        * released even when {@code rethrowError()} throws. Both fields are null-checked because
+        * JUnit still invokes this method when {@link #setup()} failed before assigning them.
+        */
+       @After
+       public void teardown() throws Exception {
+               try {
+                       if (testingFatalErrorHandler != null) {
+                               testingFatalErrorHandler.rethrowError();
+                       }
+               } finally {
+                       if (resourceManager != null) {
+                               resourceManager.close();
+                       }
+               }
+       }
+
+       /**
+        * {@link KubernetesResourceManager} variant used by the tests in this class.
+        *
+        * <p>It hands out the enclosing test's {@code flinkKubeClient}, executes {@code runAsync}
+        * runnables directly on the calling thread, and exposes the slot manager it was
+        * constructed with.
+        */
+       class TestingKubernetesResourceManager extends KubernetesResourceManager {
+
+               /** Slot manager passed to the constructor; assigned once and never replaced. */
+               private final SlotManager slotManager;
+
+               /**
+                * Forwards all arguments unchanged to the {@link KubernetesResourceManager}
+                * constructor and keeps a reference to the slot manager for {@link #getSlotManager()}.
+                */
+               public TestingKubernetesResourceManager(
+                               RpcService rpcService,
+                               String resourceManagerEndpointId,
+                               ResourceID resourceId,
+                               Configuration flinkConfig,
+                               HighAvailabilityServices highAvailabilityServices,
+                               HeartbeatServices heartbeatServices,
+                               SlotManager slotManager,
+                               JobLeaderIdService jobLeaderIdService,
+                               ClusterInformation clusterInformation,
+                               FatalErrorHandler fatalErrorHandler,
+                               ResourceManagerMetricGroup resourceManagerMetricGroup) {
+                       super(
+                               rpcService,
+                               resourceManagerEndpointId,
+                               resourceId,
+                               flinkConfig,
+                               highAvailabilityServices,
+                               heartbeatServices,
+                               slotManager,
+                               jobLeaderIdService,
+                               clusterInformation,
+                               fatalErrorHandler,
+                               resourceManagerMetricGroup
+                       );
+                       this.slotManager = slotManager;
+               }
+
+               /** Returns the kube client prepared by the enclosing test instead of creating one. */
+               @Override
+               protected FlinkKubeClient createFlinkKubeClient() {
+                       return flinkKubeClient;
+               }
+
+               /**
+                * Submits the callable through {@code callAsync} with the test {@code TIMEOUT}.
+                *
+                * @param callable work to execute
+                * @return future holding the callable's result
+                */
+               <T> CompletableFuture<T> runInMainThread(Callable<T> callable) {
+                       return callAsync(callable, TIMEOUT);
+               }
+
+               /** Runs the runnable immediately on the calling thread rather than scheduling it. */
+               @Override
+               protected void runAsync(Runnable runnable) {
+                       runnable.run();
+               }
+
+               /** Exposes the superclass's main thread executor to the tests. */
+               MainThreadExecutor getMainThreadExecutorForTesting() {
+                       return super.getMainThreadExecutor();
+               }
+
+               /** Returns the slot manager this resource manager was constructed with. */
+               SlotManager getSlotManager() {
+                       return this.slotManager;
+               }
+       }
+
+       @Test
+       public void testStartAndStopWorker() throws Exception {
+               registerSlotRequest(resourceManager);
+
+               final KubernetesClient client = getKubeClient();
+               final PodList list = client.pods().list();
+               assertEquals(1, list.getItems().size());
+               final Pod pod = list.getItems().get(0);
+
+               final Map<String, String> labels = getCommonLabels();
+               labels.put(Constants.LABEL_COMPONENT_KEY, 
Constants.LABEL_COMPONENT_TASK_MANAGER);
+               assertEquals(labels, pod.getMetadata().getLabels());
+
+               assertEquals(1, pod.getSpec().getContainers().size());
+               final Container tmContainer = 
pod.getSpec().getContainers().get(0);
+               assertEquals(CONTAINER_IMAGE, tmContainer.getImage());
+
+               final String podName = CLUSTER_ID + "-taskmanager-1-1";
+               assertEquals(podName, pod.getMetadata().getName());
+
+               // Check environments
+               assertThat(tmContainer.getEnv(), Matchers.contains(
+                       new 
EnvVarBuilder().withName(Constants.ENV_FLINK_POD_NAME).withValue(podName).build()));
+
+               // Check task manager main class args.
+               assertEquals(3, tmContainer.getArgs().size());
+               final String confDirOption = "--configDir " + 
flinkConfig.getString(KubernetesConfigOptions.FLINK_CONF_DIR);
+               
assertTrue(tmContainer.getArgs().get(2).contains(confDirOption));
+
+               resourceManager.onAdded(Collections.singletonList(new 
KubernetesPod(flinkConfig, pod)));
+               final ResourceID resourceID = new ResourceID(podName);
+               assertThat(resourceManager.getWorkerNodes().keySet(), 
Matchers.contains(resourceID));
+
+               registerTaskExecutor(resourceID);
+
+               // Unregister all task executors and release all task managers.
+               CompletableFuture<?> unregisterAndReleaseFuture = 
resourceManager.runInMainThread(() -> {
+                       
resourceManager.getSlotManager().unregisterTaskManagersAndReleaseResources();
+                       return null;
+               });
+               unregisterAndReleaseFuture.get();
+               assertEquals(0, client.pods().list().getItems().size());
+               assertEquals(0, resourceManager.getWorkerNodes().size());
+       }
+
+       /**
+        * Checks that a terminated task manager pod is replaced by a new one, whether the
+        * termination is reported through a modified, error or deleted event. Each replacement
+        * is expected to carry the next sequence number in its name.
+        */
+       @Test
+       public void testTaskManagerPodTerminated() throws Exception {
+               registerSlotRequest(resourceManager);
+
+               final KubernetesClient kubeClient = getKubeClient();
+               final Pod firstPod = kubeClient.pods().list().getItems().get(0);
+               final String namePrefix = CLUSTER_ID + "-taskmanager-1-";
+
+               resourceManager.onAdded(Collections.singletonList(new KubernetesPod(flinkConfig, firstPod)));
+
+               // A plain modification event on a pod that was not terminated must not replace it.
+               resourceManager.onModified(Collections.singletonList(new KubernetesPod(flinkConfig, firstPod)));
+               assertEquals(1, kubeClient.pods().list().getItems().size());
+               final String nameAfterModification =
+                       kubeClient.pods().list().getItems().get(0).getMetadata().getName();
+               assertEquals(namePrefix + "1", nameAfterModification);
+
+               // Modification event after termination: the old pod goes away and pod 2 is created.
+               terminatePod(firstPod);
+               resourceManager.onModified(Collections.singletonList(new KubernetesPod(flinkConfig, firstPod)));
+               assertEquals(1, kubeClient.pods().list().getItems().size());
+               final Pod secondPod = kubeClient.pods().list().getItems().get(0);
+               assertEquals(namePrefix + "2", secondPod.getMetadata().getName());
+
+               // Error event after termination: pod 3 is created.
+               resourceManager.onAdded(Collections.singletonList(new KubernetesPod(flinkConfig, secondPod)));
+               terminatePod(secondPod);
+               resourceManager.onError(Collections.singletonList(new KubernetesPod(flinkConfig, secondPod)));
+               final Pod thirdPod = kubeClient.pods().list().getItems().get(0);
+               assertEquals(namePrefix + "3", thirdPod.getMetadata().getName());
+
+               // Deleted event after termination: pod 4 is created.
+               resourceManager.onAdded(Collections.singletonList(new KubernetesPod(flinkConfig, thirdPod)));
+               terminatePod(thirdPod);
+               resourceManager.onDeleted(Collections.singletonList(new KubernetesPod(flinkConfig, thirdPod)));
+               final String nameAfterDeletion =
+                       kubeClient.pods().list().getItems().get(0).getMetadata().getName();
+               assertEquals(namePrefix + "4", nameAfterDeletion);
+       }
+
+       /**
+        * Checks that a task manager pod that exists before {@code initialize()} is called is
+        * reused for the first slot request, and that a further slot request creates a second pod
+        * whose name carries the prefix {@code 2-1}.
+        */
+       @Test
+       public void testGetWorkerNodesFromPreviousAttempts() throws Exception {
+               // Create a pod that stands in for one left over from a previous attempt.
+               final String previousPodName = CLUSTER_ID + "-taskmanager-1-1";
+               final TaskManagerPodParameter previousPodParameter = new TaskManagerPodParameter(
+                       previousPodName,
+                       new ArrayList<>(),
+                       1024,
+                       1,
+                       new HashMap<>());
+               flinkKubeClient.createTaskManagerPod(previousPodParameter);
+
+               final KubernetesClient kubeClient = getKubeClient();
+               assertEquals(1, kubeClient.pods().list().getItems().size());
+
+               // initialize() is expected to recover the worker nodes of the previous attempt.
+               resourceManager.initialize();
+
+               final String namePrefix = CLUSTER_ID + "-taskmanager-";
+
+               // Once the previous task manager has registered, a slot request must not create a pod.
+               registerTaskExecutor(new ResourceID(previousPodName));
+               registerSlotRequest(resourceManager);
+               assertEquals(1, kubeClient.pods().list().getItems().size());
+
+               // A second slot request must create a new task manager pod, named with prefix 2-1.
+               registerSlotRequest(resourceManager);
+               assertEquals(2, kubeClient.pods().list().getItems().size());
+               assertThat(
+                       kubeClient.pods().list().getItems().stream()
+                               .map(pod -> pod.getMetadata().getName())
+                               .collect(Collectors.toList()),
+                       Matchers.containsInAnyOrder(namePrefix + "1-1", namePrefix + "2-1"));
+
+               resourceManager.getRpcService().stopService().get();
+       }
+
+       /**
+        * Creates a {@link TestingKubernetesResourceManager} on top of a {@link TestingRpcService}
+        * and {@link MockResourceManagerRuntimeServices}, starts it and grants leadership.
+        *
+        * @param configuration configuration handed to both the RPC service and the resource manager
+        * @return the started resource manager
+        */
+       private TestingKubernetesResourceManager createAndStartResourceManager(Configuration configuration) throws Exception {
+               final TestingRpcService testingRpcService = new TestingRpcService(configuration);
+               final MockResourceManagerRuntimeServices runtimeServices =
+                       new MockResourceManagerRuntimeServices(testingRpcService, TIMEOUT);
+
+               final ResourceID resourceManagerId = ResourceID.generate();
+               final ClusterInformation clusterInformation = new ClusterInformation("localhost", 1234);
+               final ResourceManagerMetricGroup metricGroup =
+                       UnregisteredMetricGroups.createUnregisteredResourceManagerMetricGroup();
+
+               final TestingKubernetesResourceManager startedManager = new TestingKubernetesResourceManager(
+                       testingRpcService,
+                       "kubernetesResourceManager",
+                       resourceManagerId,
+                       configuration,
+                       runtimeServices.highAvailabilityServices,
+                       runtimeServices.heartbeatServices,
+                       runtimeServices.slotManager,
+                       runtimeServices.jobLeaderIdService,
+                       clusterInformation,
+                       testingFatalErrorHandler,
+                       metricGroup
+               );
+
+               startedManager.start();
+               runtimeServices.grantLeadership();
+               return startedManager;
+       }
+
+       private void registerSlotRequest(TestingKubernetesResourceManager 
resourceManager) throws Exception {
+
+               CompletableFuture<?> registerSlotRequestFuture = 
resourceManager.runInMainThread(() -> {
+                       resourceManager.getSlotManager().registerSlotRequest(
+                               new SlotRequest(new JobID(), new 
AllocationID(), ResourceProfile.UNKNOWN, jobManagerHost));
+                       return null;
+               });
+               registerSlotRequestFuture.get();
+       }
+
+       private void registerTaskExecutor(ResourceID resourceID) throws 
Exception {
+               final TaskExecutorGateway taskExecutorGateway = new 
TestingTaskExecutorGatewayBuilder()
+                       .setAddress(UUID.randomUUID().toString())
 
 Review comment:
   This `setAddress(...)` call is unnecessary.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

Reply via email to