shwstppr commented on a change in pull request #3680: [WIP: DO NOT MERGE] CloudStack Kubernetes Service URL: https://github.com/apache/cloudstack/pull/3680#discussion_r364079259
########## File path: plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetescluster/KubernetesClusterManagerImpl.java ########## @@ -0,0 +1,3061 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package com.cloud.kubernetescluster; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.lang.reflect.Field; +import java.math.BigInteger; +import java.net.InetAddress; +import java.net.InetSocketAddress; +import java.net.MalformedURLException; +import java.net.Socket; +import java.net.URL; +import java.net.UnknownHostException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.security.SecureRandom; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import javax.inject.Inject; +import javax.naming.ConfigurationException; + +import org.apache.cloudstack.acl.ControlledEntity; +import org.apache.cloudstack.acl.SecurityChecker; +import org.apache.cloudstack.api.ApiConstants; +import org.apache.cloudstack.api.ApiErrorCode; +import org.apache.cloudstack.api.BaseCmd; +import org.apache.cloudstack.api.ServerApiException; +import org.apache.cloudstack.api.command.user.firewall.CreateFirewallRuleCmd; +import org.apache.cloudstack.api.command.user.kubernetescluster.CreateKubernetesClusterCmd; +import org.apache.cloudstack.api.command.user.kubernetescluster.DeleteKubernetesClusterCmd; +import org.apache.cloudstack.api.command.user.kubernetescluster.GetKubernetesClusterConfigCmd; +import org.apache.cloudstack.api.command.user.kubernetescluster.ListKubernetesClustersCmd; +import org.apache.cloudstack.api.command.user.kubernetescluster.ScaleKubernetesClusterCmd; +import org.apache.cloudstack.api.command.user.kubernetescluster.StartKubernetesClusterCmd; +import org.apache.cloudstack.api.command.user.kubernetescluster.StopKubernetesClusterCmd; +import org.apache.cloudstack.api.command.user.kubernetescluster.UpgradeKubernetesClusterCmd; +import org.apache.cloudstack.api.command.user.vm.StartVMCmd; +import org.apache.cloudstack.api.response.KubernetesClusterConfigResponse; +import org.apache.cloudstack.api.response.KubernetesClusterResponse; +import org.apache.cloudstack.api.response.ListResponse; +import org.apache.cloudstack.ca.CAManager; +import org.apache.cloudstack.context.CallContext; +import org.apache.cloudstack.engine.orchestration.service.NetworkOrchestrationService; +import org.apache.cloudstack.engine.subsystem.api.storage.ObjectInDataStoreStateMachine; +import org.apache.cloudstack.framework.ca.Certificate; +import org.apache.cloudstack.framework.config.dao.ConfigurationDao; +import org.apache.cloudstack.managed.context.ManagedContextRunnable; +import org.apache.cloudstack.utils.security.CertUtils; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.io.IOUtils; +import org.apache.log4j.Logger; + +import com.cloud.api.ApiDBUtils; +import com.cloud.api.query.dao.TemplateJoinDao; +import com.cloud.api.query.vo.TemplateJoinVO; +import com.cloud.capacity.CapacityManager; +import com.cloud.dc.ClusterDetailsDao; +import com.cloud.dc.ClusterDetailsVO; +import com.cloud.dc.ClusterVO; +import com.cloud.dc.DataCenter; +import com.cloud.dc.DataCenterVO; +import com.cloud.dc.Pod; +import com.cloud.dc.Vlan; +import com.cloud.dc.VlanVO; +import com.cloud.dc.dao.ClusterDao; +import com.cloud.dc.dao.DataCenterDao; +import com.cloud.dc.dao.VlanDao; +import com.cloud.deploy.DeployDestination; +import com.cloud.exception.ConcurrentOperationException; +import com.cloud.exception.InsufficientCapacityException; +import com.cloud.exception.InsufficientServerCapacityException; +import com.cloud.exception.InvalidParameterValueException; +import com.cloud.exception.ManagementServerException; +import com.cloud.exception.NetworkRuleConflictException; +import com.cloud.exception.PermissionDeniedException; +import com.cloud.exception.ResourceAllocationException; +import com.cloud.exception.ResourceUnavailableException; +import com.cloud.host.Host.Type; +import com.cloud.host.HostVO; +import com.cloud.hypervisor.Hypervisor; +import com.cloud.kubernetescluster.dao.KubernetesClusterDao; +import com.cloud.kubernetescluster.dao.KubernetesClusterDetailsDao; +import com.cloud.kubernetescluster.dao.KubernetesClusterVmMapDao; +import com.cloud.kubernetesversion.KubernetesSupportedVersion; +import com.cloud.kubernetesversion.KubernetesSupportedVersionVO; +import com.cloud.kubernetesversion.KubernetesVersionManagerImpl; +import com.cloud.kubernetesversion.dao.KubernetesSupportedVersionDao; +import com.cloud.network.IpAddress; +import com.cloud.network.IpAddressManager; +import com.cloud.network.Network; +import com.cloud.network.Network.Service; +import com.cloud.network.NetworkModel; +import com.cloud.network.NetworkService; +import com.cloud.network.PhysicalNetwork; +import com.cloud.network.addr.PublicIp; +import com.cloud.network.dao.FirewallRulesDao; +import com.cloud.network.dao.IPAddressDao; +import com.cloud.network.dao.IPAddressVO; +import com.cloud.network.dao.NetworkDao; +import com.cloud.network.dao.NetworkVO; +import com.cloud.network.dao.PhysicalNetworkDao; +import com.cloud.network.firewall.FirewallService; +import com.cloud.network.lb.LoadBalancingRulesService; +import com.cloud.network.rules.FirewallRule; +import com.cloud.network.rules.FirewallRuleVO; +import com.cloud.network.rules.LoadBalancer; +import com.cloud.network.rules.PortForwardingRuleVO; +import com.cloud.network.rules.RulesService; +import com.cloud.network.rules.dao.PortForwardingRulesDao; +import com.cloud.offering.NetworkOffering; +import com.cloud.offering.ServiceOffering; +import com.cloud.offerings.NetworkOfferingVO; +import com.cloud.offerings.dao.NetworkOfferingDao; +import com.cloud.offerings.dao.NetworkOfferingServiceMapDao; +import com.cloud.org.Grouping; +import com.cloud.resource.ResourceManager; +import com.cloud.service.ServiceOfferingVO; +import com.cloud.service.dao.ServiceOfferingDao; +import com.cloud.storage.Storage; +import com.cloud.storage.VMTemplateVO; +import com.cloud.storage.VMTemplateZoneVO; +import com.cloud.storage.dao.VMTemplateDao; +import com.cloud.storage.dao.VMTemplateZoneDao; +import com.cloud.template.TemplateApiService; +import com.cloud.template.VirtualMachineTemplate; +import com.cloud.user.Account; +import com.cloud.user.AccountManager; +import com.cloud.user.AccountService; +import com.cloud.user.SSHKeyPairVO; +import com.cloud.user.User; +import com.cloud.user.dao.AccountDao; +import com.cloud.user.dao.SSHKeyPairDao; +import com.cloud.uservm.UserVm; +import com.cloud.utils.Pair; +import com.cloud.utils.component.ComponentContext; +import com.cloud.utils.component.ManagerBase; +import com.cloud.utils.concurrency.NamedThreadFactory; +import com.cloud.utils.db.Filter; +import com.cloud.utils.db.GlobalLock; +import com.cloud.utils.db.SearchCriteria; +import com.cloud.utils.db.Transaction; +import com.cloud.utils.db.TransactionCallback; +import com.cloud.utils.db.TransactionCallbackNoReturn; +import com.cloud.utils.db.TransactionCallbackWithException; +import com.cloud.utils.db.TransactionStatus; +import com.cloud.utils.exception.CloudRuntimeException; +import com.cloud.utils.fsm.NoTransitionException; +import com.cloud.utils.fsm.StateMachine2; +import com.cloud.utils.net.Ip; +import com.cloud.utils.net.NetUtils; +import com.cloud.utils.ssh.SshHelper; +import com.cloud.vm.Nic; +import com.cloud.vm.ReservationContext; +import com.cloud.vm.ReservationContextImpl; +import com.cloud.vm.UserVmManager; +import com.cloud.vm.UserVmService; +import com.cloud.vm.UserVmVO; +import com.cloud.vm.VMInstanceVO; +import com.cloud.vm.VirtualMachine; +import com.cloud.vm.dao.UserVmDao; +import com.cloud.vm.dao.VMInstanceDao; +import com.google.common.base.Strings; + +public class KubernetesClusterManagerImpl extends ManagerBase implements KubernetesClusterService { + + private static final Logger LOGGER = Logger.getLogger(KubernetesClusterManagerImpl.class); + + protected StateMachine2<KubernetesCluster.State, KubernetesCluster.Event, KubernetesCluster> _stateMachine = KubernetesCluster.State.getStateMachine(); + + ScheduledExecutorService _gcExecutor; + ScheduledExecutorService _stateScanner; + + @Inject + protected KubernetesClusterDao kubernetesClusterDao; + @Inject + protected KubernetesClusterVmMapDao kubernetesClusterVmMapDao; + @Inject + protected KubernetesClusterDetailsDao kubernetesClusterDetailsDao; + @Inject + protected KubernetesSupportedVersionDao kubernetesSupportedVersionDao; + @Inject + protected CAManager caManager; + @Inject + protected SSHKeyPairDao sshKeyPairDao; + @Inject + protected DataCenterDao dataCenterDao; + @Inject + protected ClusterDao clusterDao; + @Inject + protected ClusterDetailsDao clusterDetailsDao; + @Inject + protected ServiceOfferingDao serviceOfferingDao; + @Inject + protected VMTemplateDao templateDao; + @Inject + protected TemplateApiService templateService; + @Inject + protected VMTemplateZoneDao templateZoneDao; + @Inject + protected TemplateJoinDao templateJoinDao; + @Inject + protected AccountService accountService; + @Inject + protected AccountDao accountDao; + @Inject + protected AccountManager accountManager; + @Inject + protected VMInstanceDao vmInstanceDao; + @Inject + protected UserVmDao userVmDao; + @Inject + protected UserVmService userVmService; + @Inject + protected UserVmManager userVmManager; + @Inject + protected ConfigurationDao globalConfigDao; + @Inject + protected NetworkOfferingDao networkOfferingDao; + @Inject + protected NetworkService networkService; + @Inject + protected NetworkModel networkModel; + @Inject + protected PhysicalNetworkDao physicalNetworkDao; + @Inject + protected NetworkOrchestrationService networkMgr; + @Inject + protected NetworkDao networkDao; + @Inject + protected IPAddressDao ipAddressDao; + @Inject + protected PortForwardingRulesDao portForwardingRulesDao; + @Inject + protected FirewallService firewallService; + @Inject + protected RulesService rulesService; + @Inject + protected NetworkOfferingServiceMapDao networkOfferingServiceMapDao; + @Inject + protected CapacityManager capacityManager; + @Inject + protected ResourceManager resourceManager; + @Inject + protected FirewallRulesDao firewallRulesDao; + @Inject + protected IpAddressManager ipAddressManager; + @Inject + protected LoadBalancingRulesService lbService; + @Inject + protected VlanDao vlanDao; + + private static final String CLUSTER_NODE_VM_USER = "core"; + private static final int CLUSTER_API_PORT = 6443; + private static final int CLUSTER_NODES_DEFAULT_START_SSH_PORT = 2222; + + private static String getStackTrace(final Throwable throwable) { + final StringWriter sw = new StringWriter(); + final PrintWriter pw = new PrintWriter(sw, true); + throwable.printStackTrace(pw); + return sw.getBuffer().toString(); + } + + private String readResourceFile(String resource) throws IOException { + return IOUtils.toString(Objects.requireNonNull(Thread.currentThread().getContextClassLoader().getResourceAsStream(resource)), Charset.defaultCharset().name()); + } + + private boolean isKubernetesServiceConfigured(DataCenter zone) { + // Check Kubernetes VM template for zone + String templateName = globalConfigDao.getValue(KubernetesServiceConfig.KubernetesClusterTemplateName.key()); + if (templateName == null || templateName.isEmpty()) { + LOGGER.warn(String.format("Global setting %s is empty. Template name need to be specified for Kubernetes service to function", KubernetesServiceConfig.KubernetesClusterTemplateName.key())); + return false; + } + final VMTemplateVO template = templateDao.findByTemplateName(templateName); + if (template == null) { + LOGGER.warn(String.format("Unable to find the template %s to be used for provisioning Kubernetes cluster", templateName)); + return false; + } + // Check network offering + String networkOfferingName = globalConfigDao.getValue(KubernetesServiceConfig.KubernetesClusterNetworkOffering.key()); + if (networkOfferingName == null || networkOfferingName.isEmpty()) { + LOGGER.warn(String.format("Global setting %s is empty. Admin has not yet specified the network offering to be used for provisioning isolated network for the cluster", KubernetesServiceConfig.KubernetesClusterNetworkOffering.key())); + return false; + } + NetworkOfferingVO networkOffering = networkOfferingDao.findByUniqueName(networkOfferingName); + if (networkOffering == null) { + LOGGER.warn(String.format("Unable to find the network offering %s to be used for provisioning Kubernetes cluster", networkOfferingName)); + return false; + } + if (networkOffering.getState() == NetworkOffering.State.Disabled) { + LOGGER.warn(String.format("Network offering ID: %s is not enabled", networkOffering.getUuid())); + return false; + } + List<String> services = networkOfferingServiceMapDao.listServicesForNetworkOffering(networkOffering.getId()); + if (services == null || services.isEmpty() || !services.contains("SourceNat")) { + LOGGER.warn(String.format("Network offering ID: %s does not have necessary services to provision Kubernetes cluster", networkOffering.getUuid())); + return false; + } + if (!networkOffering.isEgressDefaultPolicy()) { + LOGGER.warn(String.format("Network offering ID: %s has egress default policy turned off should be on to provision Kubernetes cluster", networkOffering.getUuid())); + return false; + } + long physicalNetworkId = networkModel.findPhysicalNetworkId(zone.getId(), networkOffering.getTags(), networkOffering.getTrafficType()); + PhysicalNetwork physicalNetwork = physicalNetworkDao.findById(physicalNetworkId); + if (physicalNetwork == null) { + LOGGER.warn(String.format("Unable to find physical network with tag: %s", networkOffering.getTags())); + return false; + } + return true; + } + + private File getManagementServerSshPublicKeyFile() { + boolean devel = Boolean.parseBoolean(globalConfigDao.getValue("developer")); + String keyFile = String.format("%s/.ssh/id_rsa", System.getProperty("user.home")); + if (devel) { + keyFile += ".cloud"; + } + return new File(keyFile); + } + + private String generateClusterToken(KubernetesCluster kubernetesCluster) { + if (kubernetesCluster == null) return ""; + String token = kubernetesCluster.getUuid(); + token = token.replaceAll("-", ""); + token = token.substring(0, 22); + token = token.substring(0, 6) + "." + token.substring(6); + return token; + } + + private String generateClusterHACertificateKey(KubernetesCluster kubernetesCluster) { + if (kubernetesCluster == null) return ""; + String uuid = kubernetesCluster.getUuid(); + StringBuilder token = new StringBuilder(uuid.replaceAll("-", "")); + while (token.length() < 64) { + token.append(token); + } + return token.toString().substring(0, 64); + } + + private KubernetesClusterVmMapVO addKubernetesClusterVm(final long kubernetesClusterId, final long vmId) { + return Transaction.execute(new TransactionCallback<KubernetesClusterVmMapVO>() { + @Override + public KubernetesClusterVmMapVO doInTransaction(TransactionStatus status) { + KubernetesClusterVmMapVO newClusterVmMap = new KubernetesClusterVmMapVO(kubernetesClusterId, vmId); + kubernetesClusterVmMapDao.persist(newClusterVmMap); + return newClusterVmMap; + } + }); + } + + private boolean isKubernetesClusterServerRunning(KubernetesCluster kubernetesCluster, String ipAddress, int retries, long waitDuration) { + int retryCounter = 0; + boolean k8sApiServerSetup = false; + while (retryCounter < retries) { + try { + String versionOutput = IOUtils.toString(new URL(String.format("https://%s:%d/version", ipAddress, CLUSTER_API_PORT)), StandardCharsets.UTF_8); + if (!Strings.isNullOrEmpty(versionOutput)) { + LOGGER.debug(String.format("Kubernetes cluster ID: %s API has been successfully provisioned, %s", kubernetesCluster.getUuid(), versionOutput)); + k8sApiServerSetup = true; + break; + } + } catch (Exception e) { + LOGGER.warn(String.format("API endpoint for Kubernetes cluster ID: %s not available. Attempt: %d/%d", kubernetesCluster.getUuid(), retryCounter+1, retries), e); + } + try { + Thread.sleep(waitDuration); + } catch (InterruptedException ie) { + LOGGER.error(String.format("Error while waiting for Kubernetes cluster ID: %s API endpoint to be available", kubernetesCluster.getUuid()), ie); + } + retryCounter++; + } + return k8sApiServerSetup; + } + + private String getKubernetesClusterConfig(KubernetesCluster kubernetesCluster, String ipAddress, int port, int retries) { + int retryCounter = 0; + String kubeConfig = ""; + while (retryCounter < retries) { + try { + Pair<Boolean, String> result = SshHelper.sshExecute(ipAddress, port, CLUSTER_NODE_VM_USER, + getManagementServerSshPublicKeyFile(), null, "sudo cat /etc/kubernetes/admin.conf", + 10000, 10000, 10000); + + if (result.first() && !Strings.isNullOrEmpty(result.second())) { + kubeConfig = result.second(); + break; + } + } catch (Exception e) { + LOGGER.warn(String.format("Failed to retrieve kube-config file for Kubernetes cluster ID: %s. Attempt: %d/%d", kubernetesCluster.getUuid(), retryCounter+1, retries), e); + } + retryCounter++; + } + return kubeConfig; + } + + private boolean isKubernetesClusterAddOnServiceRunning(KubernetesCluster kubernetesCluster, final String ipAddress, final int port, final String namespace, String serviceName) { + try { + String cmd = "sudo kubectl get pods --all-namespaces"; + if (!Strings.isNullOrEmpty(namespace)) { + cmd = String.format("sudo kubectl get pods --namespace=%s", namespace); + } + Pair<Boolean, String> result = SshHelper.sshExecute(ipAddress, port, CLUSTER_NODE_VM_USER, + getManagementServerSshPublicKeyFile(), null, cmd, + 10000, 10000, 10000); + if (result.first() && !Strings.isNullOrEmpty(result.second())) { + String[] lines = result.second().split("\n"); + for (String line : + lines) { + if (line.contains(serviceName) && line.contains("Running")) { + LOGGER.debug(String.format("Service : %s in namespace: %s for the Kubernetes cluster ID: %s is running",serviceName, namespace, kubernetesCluster.getUuid())); + return true; + } + } + } + } catch (Exception e) { + LOGGER.warn(String.format("Unable to retrieve service: %s running status in namespace %s for Kubernetes cluster ID: %s", serviceName, namespace, kubernetesCluster.getUuid()), e); + } + return false; + } + + private boolean isKubernetesClusterDashboardServiceRunning(KubernetesCluster kubernetesCluster, String ipAddress, int port, int retries, long waitDuration) { + boolean running = false; + int retryCounter = 0; + // Check if dashboard service is up running. + while (retryCounter < retries) { + LOGGER.debug(String.format("Checking dashboard service for the Kubernetes cluster ID: %s to come up. Attempt: %d/%d", kubernetesCluster.getUuid(), retryCounter+1, retries)); + if (isKubernetesClusterAddOnServiceRunning(kubernetesCluster, ipAddress, port, "kubernetes-dashboard", "kubernetes-dashboard")) { + running = true; + break; + } + try { + Thread.sleep(waitDuration); + } catch (InterruptedException ex) { + LOGGER.error(String.format("Error while waiting for Kubernetes cluster: %s API dashboard service to be available", kubernetesCluster.getUuid()), ex); + } + retryCounter++; + } + return running; + } + + private Pair<String, Integer> getKubernetesClusterServerIpSshPort(KubernetesCluster kubernetesCluster, UserVm masterVm) { + int port = CLUSTER_NODES_DEFAULT_START_SSH_PORT; + KubernetesClusterDetailsVO detail = kubernetesClusterDetailsDao.findDetail(kubernetesCluster.getId(), ApiConstants.EXTERNAL_LOAD_BALANCER_IP_ADDRESS); + if (detail != null && !Strings.isNullOrEmpty(detail.getValue())) { + return new Pair<>(detail.getValue(), port); + } + Network network = networkDao.findById(kubernetesCluster.getNetworkId()); + if (network == null) { + LOGGER.warn(String.format("Network for Kubernetes cluster ID: %s cannot be found", kubernetesCluster.getUuid())); + return new Pair<>(null, port); + } + if (Network.GuestType.Isolated.equals(network.getGuestType())) { + List<? extends IpAddress> addresses = networkModel.listPublicIpsAssignedToGuestNtwk(network.getId(), true); + if (CollectionUtils.isEmpty(addresses)) { + LOGGER.warn(String.format("No public IP addresses found for network ID: %s, Kubernetes cluster ID: %s", network.getUuid(), kubernetesCluster.getUuid())); + return new Pair<>(null, port); + } + for (IpAddress address : addresses) { + if (address.isSourceNat()) { + return new Pair<>(address.getAddress().addr(), port); + } + } + LOGGER.warn(String.format("No source NAT IP addresses found for network ID: %s, Kubernetes cluster ID: %s", network.getUuid(), kubernetesCluster.getUuid())); + return new Pair<>(null, port); + } else if (Network.GuestType.Shared.equals(network.getGuestType())) { + port = 22; + if (masterVm == null) { + List<KubernetesClusterVmMapVO> clusterVMs = kubernetesClusterVmMapDao.listByClusterId(kubernetesCluster.getId()); + if (CollectionUtils.isEmpty(clusterVMs)) { + LOGGER.warn(String.format("Unable to retrieve VMs for Kubernetes cluster ID: %s", kubernetesCluster.getUuid())); + return new Pair<>(null, port); + } + List<Long> vmIds = new ArrayList<>(); + for (KubernetesClusterVmMapVO vmMap : clusterVMs) { + vmIds.add(vmMap.getVmId()); + } + Collections.sort(vmIds); + masterVm = userVmDao.findById(vmIds.get(0)); + } + if (masterVm == null) { + LOGGER.warn(String.format("Unable to retrieve master VM for Kubernetes cluster ID: %s", kubernetesCluster.getUuid())); + return new Pair<>(null, port); + } + return new Pair<>(masterVm.getPrivateIpAddress(), port); + } + LOGGER.warn(String.format("Unable to retrieve server IP address for Kubernetes cluster ID: %s", kubernetesCluster.getUuid())); + return new Pair<>(null, port); + } + + private Pair<String, Integer> getKubernetesClusterServerIpSshPort(KubernetesCluster kubernetesCluster) { + return getKubernetesClusterServerIpSshPort(kubernetesCluster, null); + } + + private int getKubernetesClusterReadyNodesCount(KubernetesCluster kubernetesCluster, String ipAddress, int port) throws Exception { + Pair<Boolean, String> result = SshHelper.sshExecute(ipAddress, port, + CLUSTER_NODE_VM_USER, getManagementServerSshPublicKeyFile(), null, + "sudo kubectl get nodes | awk '{if ($2 == \"Ready\") print $1}' | wc -l", + 10000, 10000, 20000); + if (result.first()) { + return Integer.parseInt(result.second().trim().replace("\"", "")); + } + return 0; + } + + private boolean isKubernetesClusterNodeReady(KubernetesCluster kubernetesCluster, String ipAddress, int port, String nodeName) throws Exception { + Pair<Boolean, String> result = SshHelper.sshExecute(ipAddress, port, + CLUSTER_NODE_VM_USER, getManagementServerSshPublicKeyFile(), null, + String.format("sudo kubectl get nodes | awk '{if ($1 == \"%s\" && $2 == \"Ready\") print $1}'", nodeName), + 10000, 10000, 20000); + return result.first() && nodeName.equals(result.second().trim()); + } + + private boolean isKubernetesClusterNodeReady(KubernetesCluster kubernetesCluster, String ipAddress, int port, String nodeName, int retries, int waitDuration) { + int retryCounter = 0; + while (retryCounter < retries) { + boolean ready = false; + try { + ready = isKubernetesClusterNodeReady(kubernetesCluster, ipAddress, port, nodeName); + } catch (Exception e) { + LOGGER.warn(String.format("Failed to retrieve state of node: %s in Kubernetes cluster ID: %s", nodeName, kubernetesCluster.getUuid()), e); + } + if (ready) { + return true; + } + try { + Thread.sleep(waitDuration); + } catch (InterruptedException ie) { + LOGGER.error(String.format("Error while waiting for Kubernetes cluster ID: %s node: %s to become ready", kubernetesCluster.getUuid(), nodeName), ie); + } + retryCounter++; + } + return false; + } + + private int getKubernetesClusterReadyNodesCount(KubernetesCluster kubernetesCluster) throws Exception { + Pair<String, Integer> ipSshPort = getKubernetesClusterServerIpSshPort(kubernetesCluster); + String ipAddress = ipSshPort.first(); + int sshPort = ipSshPort.second(); + if (Strings.isNullOrEmpty(ipAddress)) { + String msg = String.format("No public IP found for Kubernetes cluster ID: %s" , kubernetesCluster.getUuid()); + LOGGER.warn(msg); + throw new ManagementServerException(msg); + } + return getKubernetesClusterReadyNodesCount(kubernetesCluster, ipAddress, sshPort); + } + + private boolean validateKubernetesClusterReadyNodesCount(KubernetesCluster kubernetesCluster, String ipAddress, int port, int retries, long waitDuration) { + int retryCounter = 0; + while (retryCounter < retries) { + // "sudo kubectl get nodes -o json | jq \".items[].metadata.name\" | wc -l" + LOGGER.debug(String.format("Checking ready nodes for the Kubernetes cluster ID: %s with total %d provisioned nodes. Attempt: %d/%d", kubernetesCluster.getUuid(), kubernetesCluster.getTotalNodeCount(), retryCounter+1, retries)); + try { + int nodesCount = getKubernetesClusterReadyNodesCount(kubernetesCluster, ipAddress, port); + if (nodesCount == kubernetesCluster.getTotalNodeCount()) { + LOGGER.debug(String.format("Kubernetes cluster ID: %s has %d ready now", kubernetesCluster.getUuid(), kubernetesCluster.getTotalNodeCount())); + return true; + } else { + LOGGER.debug(String.format("Kubernetes cluster ID: %s has total %d provisioned nodes while %d ready now", kubernetesCluster.getUuid(), kubernetesCluster.getTotalNodeCount(), nodesCount)); + } + } catch (Exception e) { + LOGGER.warn(String.format("Failed to retrieve ready node count for Kubernetes cluster ID: %s", kubernetesCluster.getUuid()), e); + } + try { + Thread.sleep(waitDuration); + } catch (InterruptedException ex) { + LOGGER.warn(String.format("Error while waiting during Kubernetes cluster ID: %s ready node check. %d/%d", kubernetesCluster.getUuid(), retryCounter+1, retries), ex); + } + retryCounter++; + } + return false; + } + + private boolean removeKubernetesClusterNode(KubernetesCluster kubernetesCluster, String ipAddress, int port, UserVm userVm, int retries, int waitDuration) { + File pkFile = getManagementServerSshPublicKeyFile(); + int retryCounter = 0; + while (retryCounter < retries) { + retryCounter++; + try { + Pair<Boolean, String> result = SshHelper.sshExecute(ipAddress, port, CLUSTER_NODE_VM_USER, + pkFile, null, String.format("sudo kubectl drain %s --ignore-daemonsets --delete-local-data", userVm.getHostName()), + 10000, 10000, 60000); + if (!result.first()) { + LOGGER.warn(String.format("Draining node: %s on VM ID: %s in Kubernetes cluster ID: %s unsuccessful", userVm.getHostName(), userVm.getUuid(), kubernetesCluster.getUuid())); + } else { + result = SshHelper.sshExecute(ipAddress, port, CLUSTER_NODE_VM_USER, + pkFile, null, String.format("sudo kubectl delete node %s", userVm.getHostName()), + 10000, 10000, 30000); + if (result.first()) { + return true; + } else { + LOGGER.warn(String.format("Deleting node: %s on VM ID: %s in Kubernetes cluster ID: %s unsuccessful", userVm.getHostName(), userVm.getUuid(), kubernetesCluster.getUuid())); + } + } + break; + } catch (Exception e) { + String msg = String.format("Failed to remove Kubernetes cluster ID: %s node: %s on VM ID: %s", kubernetesCluster.getUuid(), userVm.getHostName(), userVm.getUuid()); + LOGGER.warn(msg, e); + } + try { + Thread.sleep(waitDuration); + } catch (InterruptedException ie) { + LOGGER.error(String.format("Error while waiting for Kubernetes cluster ID: %s node: %s on VM ID: %s removal", kubernetesCluster.getUuid(), userVm.getHostName(), userVm.getUuid()), ie); + } + retryCounter++; + } + return false; + } + + private boolean uncordonKubernetesClusterNode(KubernetesCluster kubernetesCluster, String ipAddress, int port, UserVm userVm, int retries, int waitDuration) { + int retryCounter = 0; + while (retryCounter < retries) { + Pair<Boolean, String> result = null; + try { + result = SshHelper.sshExecute(ipAddress, port, CLUSTER_NODE_VM_USER, getManagementServerSshPublicKeyFile(), null, + String.format("sudo kubectl uncordon %s", userVm.getHostName()), + 10000, 10000, 30000); + if (result.first()) { + return true; + } + } catch (Exception e) { + LOGGER.warn(String.format("Failed to uncordon node: %s on VM ID: %s in Kubernetes cluster ID: %s", userVm.getHostName(), userVm.getUuid(), kubernetesCluster.getUuid()), e); + } + try { + Thread.sleep(waitDuration); + } catch (InterruptedException ie) { + LOGGER.warn(String.format("Error while waiting for uncordon Kubernetes cluster ID: %s node: %s on VM ID: %s", kubernetesCluster.getUuid(), userVm.getHostName(), userVm.getUuid()), ie); + } + retryCounter++; + } + return false; + } + + // perform a cold start (which will provision resources as well) + private boolean startKubernetesClusterOnCreate(final long kubernetesClusterId) throws ManagementServerException { + + // Starting a Kubernetes cluster has below workflow + // - start the network + // - provision the master /node VM + // - provision node VM's (as many as cluster size) + // - update the book keeping data of the VM's provisioned for the cluster + // - setup networking (add Firewall and PF rules) + // - wait till Kubernetes API server on master VM to come up + // - wait till addon services (dashboard etc) to come up + // - update API and dashboard URL endpoints in Kubernetes cluster details + + KubernetesClusterVO kubernetesCluster = kubernetesClusterDao.findById(kubernetesClusterId); + final DataCenter zone = dataCenterDao.findById(kubernetesCluster.getZoneId()); + if (zone == null) { + throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, String.format("Unable to find zone for Kubernetes cluster ID: %s", kubernetesCluster.getUuid())); + } + LOGGER.debug(String.format("Starting Kubernetes cluster ID: %s", kubernetesCluster.getUuid())); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.StartRequested); + Account account = accountDao.findById(kubernetesCluster.getAccountId()); + + DeployDestination dest = null; + try { + dest = plan(kubernetesCluster, zone); + } catch (InsufficientCapacityException e) { + String msg = String.format("Provisioning the cluster failed due to insufficient capacity in the Kubernetes cluster: %s", kubernetesCluster.getUuid()); + LOGGER.error(msg, e); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.CreateFailed); + throw new ManagementServerException(msg, e); + } + final ReservationContext context = new ReservationContextImpl(null, null, null, account); + + Network network = networkDao.findById(kubernetesCluster.getNetworkId()); + if (network == null) { + String msg = String.format("Network for Kubernetes cluster ID: %s not found", kubernetesCluster.getUuid()); + LOGGER.warn(msg); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.CreateFailed); + throw new ManagementServerException(msg); + } + try { + networkMgr.startNetwork(network.getId(), dest, context); + LOGGER.debug(String.format("Network ID: %s is started for the Kubernetes cluster ID: %s", network.getUuid(), kubernetesCluster.getUuid())); + } catch (Exception e) { + String msg = String.format("Failed to start Kubernetes cluster ID: %s as unable to start associated network ID: %s" , kubernetesCluster.getUuid(), network.getUuid()); + LOGGER.error(msg, e); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.CreateFailed); + throw new ManagementServerException(msg, e); + } + + Pair<String, Integer> publicIpSshPort = getKubernetesClusterServerIpSshPort(kubernetesCluster); + String publicIpAddress = publicIpSshPort.first(); + if (Strings.isNullOrEmpty(publicIpAddress) && + (Network.GuestType.Isolated.equals(network.getGuestType()) || kubernetesCluster.getMasterNodeCount() > 1)) { // Shared network, single-master cluster won't have an IP yet + String msg = String.format("Failed to start Kubernetes cluster ID: %s as no public IP found for the cluster" , kubernetesCluster.getUuid()); + LOGGER.warn(msg); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.CreateFailed); + throw new ManagementServerException(msg); + } + + List<Long> clusterVMIds = new ArrayList<>(); + + UserVm k8sMasterVM = null; + try { + k8sMasterVM = createKubernetesMaster(kubernetesCluster, dest.getPod(), network, account, publicIpAddress); + addKubernetesClusterVm(kubernetesCluster.getId(), k8sMasterVM.getId()); + startKubernetesVM(k8sMasterVM, kubernetesCluster); + clusterVMIds.add(k8sMasterVM.getId()); + k8sMasterVM = userVmDao.findById(k8sMasterVM.getId()); + LOGGER.debug(String.format("Provisioned the master VM ID: %s in to the Kubernetes cluster ID: %s", k8sMasterVM.getUuid(), kubernetesCluster.getUuid())); + } catch (Exception e) { + String msg = String.format("Provisioning the master VM failed in the Kubernetes cluster ID: %s", kubernetesCluster.getUuid()); + LOGGER.warn(msg, e); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.CreateFailed); + throw new ManagementServerException(msg, e); + } + + if (Strings.isNullOrEmpty(publicIpAddress)) { + publicIpSshPort = getKubernetesClusterServerIpSshPort(kubernetesCluster, k8sMasterVM); + publicIpAddress = publicIpSshPort.first(); + if (Strings.isNullOrEmpty(publicIpAddress)) { + String msg = String.format("Failed to start Kubernetes cluster ID: %s as no public IP found for the cluster", kubernetesCluster.getUuid()); + LOGGER.warn(msg); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.CreateFailed); + throw new ManagementServerException(msg); + } + } + + if (kubernetesCluster.getMasterNodeCount() > 1) { + for (int i = 1; i < kubernetesCluster.getMasterNodeCount(); i++) { + UserVm vm = null; + try { + vm = createKubernetesAdditionalMaster(kubernetesCluster, publicIpAddress, i); + addKubernetesClusterVm(kubernetesCluster.getId(), vm.getId()); + startKubernetesVM(vm, kubernetesCluster); + clusterVMIds.add(vm.getId()); + LOGGER.debug(String.format("Provisioned additional master VM ID: %s in to the Kubernetes cluster ID: %s", vm.getUuid(), kubernetesCluster.getUuid())); + } catch (Exception e) { + String msg = String.format("Provisioning additional master VM %d/%d failed in the Kubernetes cluster ID: %s", i+1, kubernetesCluster.getMasterNodeCount(), kubernetesCluster.getUuid()); + LOGGER.warn(msg, e); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.CreateFailed); + throw new ManagementServerException(msg, e); + } + } + } + + for (int i = 1; i <= kubernetesCluster.getNodeCount(); i++) { + UserVm vm = null; + try { + vm = createKubernetesNode(kubernetesCluster, publicIpAddress, i); + addKubernetesClusterVm(kubernetesCluster.getId(), vm.getId()); + startKubernetesVM(vm, kubernetesCluster); + clusterVMIds.add(vm.getId()); + LOGGER.debug(String.format("Provisioned node master VM ID: %s in to the Kubernetes cluster ID: %s", vm.getUuid(), kubernetesCluster.getUuid())); + } catch (Exception e) { + String msg = String.format("Provisioning node VM %d/%d failed in the Kubernetes cluster ID: %s", i, kubernetesCluster.getNodeCount(), kubernetesCluster.getUuid()); + LOGGER.warn(msg, e); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.CreateFailed); + throw new ManagementServerException(msg, e); + } + } + LOGGER.debug(String.format("Kubernetes cluster ID: %s VMs successfully provisioned", kubernetesCluster.getUuid())); + + setupKubernetesClusterNetworkRules(kubernetesCluster, network, account, clusterVMIds); + attachIsoKubernetesVMs(kubernetesCluster, clusterVMIds); + + boolean masterVmRunning = false; + long startTime = System.currentTimeMillis(); + while (!masterVmRunning && System.currentTimeMillis() - startTime < 10 * 60 * 1000) { + try (Socket socket = new Socket()) { + socket.connect(new InetSocketAddress(publicIpAddress, publicIpSshPort.second()), 10000); + masterVmRunning = true; + } catch (IOException e) { + LOGGER.debug(String.format("Waiting for Kubernetes cluster ID: %s master node VMs to be accessible", kubernetesCluster.getUuid())); + try { + Thread.sleep(10000); + } catch (InterruptedException ex) { + LOGGER.warn(String.format("Error while waiting for Kubernetes cluster ID: %s master node VMs to be accessible", kubernetesCluster.getUuid()), ex); + } + } + } + if (!masterVmRunning) { + String msg = String.format("Failed to setup Kubernetes cluster ID: %s in usable state as unable to access master node VMs of the cluster", kubernetesCluster.getUuid()); + if (kubernetesCluster.getMasterNodeCount() > 1 && Network.GuestType.Shared.equals(network.getGuestType())) { + msg = String.format("%s. Make sure external load-balancer has port forwarding rules for SSH access on ports %d-%d and API access on port %d", + msg, + CLUSTER_NODES_DEFAULT_START_SSH_PORT, + CLUSTER_NODES_DEFAULT_START_SSH_PORT + kubernetesCluster.getTotalNodeCount() - 1, + CLUSTER_API_PORT); + } + LOGGER.error(msg); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.CreateFailed); + detachIsoKubernetesVMs(kubernetesCluster, clusterVMIds); + throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, msg); + } + + boolean k8sApiServerSetup = isKubernetesClusterServerRunning(kubernetesCluster, publicIpAddress, 20, 30000); + if (!k8sApiServerSetup) { + String msg = String.format("Failed to setup Kubernetes cluster ID: %s in usable state as unable to provision API endpoint for the cluster", kubernetesCluster.getUuid()); + LOGGER.error(msg); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.CreateFailed); + detachIsoKubernetesVMs(kubernetesCluster, clusterVMIds); + throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, msg); + } + kubernetesCluster = kubernetesClusterDao.findById(kubernetesClusterId); + kubernetesCluster.setEndpoint(String.format("https://%s:%d/", publicIpAddress, CLUSTER_API_PORT)); + kubernetesClusterDao.update(kubernetesCluster.getId(), kubernetesCluster); + + int sshPort = publicIpSshPort.second(); + boolean readyNodesCountValid = validateKubernetesClusterReadyNodesCount(kubernetesCluster, publicIpAddress, sshPort, 30, 30000); + + // Detach binaries ISO from new VMs + detachIsoKubernetesVMs(kubernetesCluster, clusterVMIds); + + // Throw exception if nodes count for k8s cluster timed out + if (!readyNodesCountValid) { // Scaling failed + String msg = String.format("Failed to setup Kubernetes cluster ID: %s as it does not have desired number of nodes in ready state", kubernetesCluster.getUuid()); + LOGGER.warn(msg); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.CreateFailed); + throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, msg); + } + + boolean k8sKubeConfigCopied = false; + String kubeConfig = getKubernetesClusterConfig(kubernetesCluster, publicIpAddress, sshPort, 5); + if (!Strings.isNullOrEmpty(kubeConfig)) { + k8sKubeConfigCopied = true; + } + if (!k8sKubeConfigCopied) { + String msg = String.format("Failed to setup Kubernetes cluster ID: %s in usable state as unable to retrieve kube-config for the cluster", kubernetesCluster.getUuid()); + LOGGER.error(msg); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.OperationFailed); + throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, msg); + } + kubeConfig = kubeConfig.replace(String.format("server: https://%s:%d", k8sMasterVM.getPrivateIpAddress(), CLUSTER_API_PORT), + String.format("server: https://%s:%d", publicIpAddress, CLUSTER_API_PORT)); + kubernetesClusterDetailsDao.addDetail(kubernetesCluster.getId(), "kubeConfigData", Base64.encodeBase64String(kubeConfig.getBytes(Charset.forName("UTF-8"))), false); + + boolean dashboardServiceRunning = isKubernetesClusterDashboardServiceRunning(kubernetesCluster, publicIpAddress, sshPort, 10, 20000); + if (!dashboardServiceRunning) { + String msg = String.format("Failed to setup Kubernetes cluster ID: %s in usable state as unable to get Dashboard service running for the cluster", kubernetesCluster.getUuid()); + LOGGER.error(msg); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.OperationFailed); + throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, msg); + } + kubernetesClusterDetailsDao.addDetail(kubernetesCluster.getId(), "dashboardServiceRunning", String.valueOf(dashboardServiceRunning), false); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.OperationSucceeded); + return true; + } + + private boolean startStoppedKubernetesCluster(long kubernetesClusterId) throws ManagementServerException, + ResourceAllocationException, ResourceUnavailableException, InsufficientCapacityException { + final KubernetesClusterVO kubernetesCluster = kubernetesClusterDao.findById(kubernetesClusterId); + if (kubernetesCluster == null) { + throw new ManagementServerException("Invalid Kubernetes cluster ID"); + } + if (kubernetesCluster.getRemoved() != null) { + throw new ManagementServerException(String.format("Kubernetes cluster ID: %s is already deleted", kubernetesCluster.getUuid())); + } + if (kubernetesCluster.getState().equals(KubernetesCluster.State.Running)) { + LOGGER.debug(String.format("Kubernetes cluster ID: %s is in running state", kubernetesCluster.getUuid())); + return true; + } + if (kubernetesCluster.getState().equals(KubernetesCluster.State.Starting)) { + LOGGER.debug(String.format("Kubernetes cluster ID: %s is already in starting state", kubernetesCluster.getUuid())); + return true; + } + LOGGER.debug(String.format("Starting Kubernetes cluster ID: %s", kubernetesCluster.getUuid())); + + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.StartRequested); + + for (final KubernetesClusterVmMapVO vmMapVO : kubernetesClusterVmMapDao.listByClusterId(kubernetesClusterId)) { + final UserVmVO vm = userVmDao.findById(vmMapVO.getVmId()); + try { + if (vm == null) { + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.OperationFailed); + throw new ManagementServerException("Failed to start all VMs in Kubernetes cluster ID: " + kubernetesClusterId); + } + startKubernetesVM(vm, kubernetesCluster); + } catch (ServerApiException ex) { + LOGGER.warn("Failed to start VM in Kubernetes cluster ID:" + kubernetesClusterId + " due to " + ex); + // dont bail out here. proceed further to stop the reset of the VM's + } + } + + for (final KubernetesClusterVmMapVO vmMapVO : kubernetesClusterVmMapDao.listByClusterId(kubernetesClusterId)) { + final UserVmVO vm = userVmDao.findById(vmMapVO.getVmId()); + if (vm == null || !vm.getState().equals(VirtualMachine.State.Running)) { + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.OperationFailed); + throw new ManagementServerException("Failed to start all VMs in Kubernetes cluster ID: " + kubernetesClusterId); + } + } + + InetAddress address = null; + try { + address = InetAddress.getByName(new URL(kubernetesCluster.getEndpoint()).getHost()); + } catch (MalformedURLException | UnknownHostException ex) { + String msg = String.format("Kubernetes cluster ID: %s has invalid API endpoint. Can not verify if cluster is in ready state", kubernetesCluster.getUuid()); + LOGGER.warn(msg, ex); + throw new ManagementServerException(msg, ex); + } + + Pair<String, Integer> publicIpSshPort = getKubernetesClusterServerIpSshPort(kubernetesCluster); + String publicIpAddress = publicIpSshPort.first(); + if (Strings.isNullOrEmpty(publicIpAddress)) { + String msg = String.format("Failed to start Kubernetes cluster ID: %s as no public IP found for the cluster" , kubernetesCluster.getUuid()); + LOGGER.warn(msg); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.OperationFailed); + throw new ManagementServerException(msg); + } + + boolean k8sApiServerSetup = isKubernetesClusterServerRunning(kubernetesCluster, publicIpAddress, 10, 30000); + if (!k8sApiServerSetup) { + String msg = String.format("Failed to start Kubernetes cluster ID: %s in usable state", kubernetesCluster.getUuid()); + LOGGER.error(msg); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.OperationFailed); + throw new ManagementServerException(msg); + } + + int sshPort = publicIpSshPort.second(); + KubernetesClusterDetailsVO kubeConfigDetail = kubernetesClusterDetailsDao.findDetail(kubernetesCluster.getId(), "kubeConfigData"); + if (kubeConfigDetail == null || Strings.isNullOrEmpty(kubeConfigDetail.getValue())) { + boolean k8sKubeConfigCopied = false; + String kubeConfig = getKubernetesClusterConfig(kubernetesCluster, publicIpAddress, sshPort, 5); + if (!Strings.isNullOrEmpty(kubeConfig)) { + k8sKubeConfigCopied = true; + } + if (!k8sKubeConfigCopied) { + String msg = String.format("Failed to start Kubernetes cluster ID: %s in usable state as unable to retrieve kube-config for the cluster", kubernetesCluster.getUuid()); + LOGGER.error(msg); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.OperationFailed); + throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, msg); + } + kubernetesClusterDetailsDao.addDetail(kubernetesCluster.getId(), "kubeConfigData", Base64.encodeBase64String(kubeConfig.getBytes(Charset.forName("UTF-8"))), false); + } + KubernetesClusterDetailsVO dashboardServiceRunningDetail = kubernetesClusterDetailsDao.findDetail(kubernetesCluster.getId(), "dashboardServiceRunning"); + if (kubeConfigDetail == null || !Boolean.parseBoolean(dashboardServiceRunningDetail.getValue())) { + boolean dashboardServiceRunning = isKubernetesClusterDashboardServiceRunning(kubernetesCluster, publicIpAddress, sshPort, 10, 20000); + if (!dashboardServiceRunning) { + String msg = String.format("Failed to start Kubernetes cluster ID: %s in usable state as unable to get Dashboard service running for the cluster", kubernetesCluster.getUuid()); + LOGGER.error(msg); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.OperationFailed); + throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, msg); + } + kubernetesClusterDetailsDao.addDetail(kubernetesCluster.getId(), "dashboardServiceRunning", String.valueOf(dashboardServiceRunning), false); + } + + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.OperationSucceeded); + LOGGER.debug(String.format("Kubernetes cluster ID: %s successfully started", kubernetesCluster.getUuid())); + return true; + } + + // Open up firewall port CLUSTER_API_PORT, secure port on which Kubernetes API server is running. Also create port-forwarding + // rule to forward public IP traffic to master VM private IP + // Open up firewall ports NODES_DEFAULT_START_SSH_PORT to NODES_DEFAULT_START_SSH_PORT+n for SSH access. Also create port-forwarding + // rule to forward public IP traffic to all node VM private IP + private void setupKubernetesClusterNetworkRules(KubernetesCluster kubernetesCluster, + Network network, Account account, + List<Long> clusterVMIds) throws ManagementServerException { + if (!Network.GuestType.Isolated.equals(network.getGuestType())) { + LOGGER.debug(String.format("Network ID: %s for Kubernetes cluster ID: %s is not an isolated network, therefore, no need for network rules", network.getUuid(), kubernetesCluster.getUuid())); + return; + } + IpAddress publicIp = null; + List<? extends IpAddress> addresses = networkModel.listPublicIpsAssignedToGuestNtwk(network.getId(), true); + if (CollectionUtils.isEmpty(addresses)) { + LOGGER.error(String.format("No public IP addresses found for network ID: %s, Kubernetes cluster ID: %s", network.getUuid(), kubernetesCluster.getUuid())); + return; + } + for (IpAddress address : addresses) { + if (address.isSourceNat()) { + publicIp = address; + break; + } + } + if (publicIp == null) { + LOGGER.error(String.format("No source NAT IP addresses found for network ID: %s, Kubernetes cluster ID: %s", network.getUuid(), kubernetesCluster.getUuid())); + return; + } + List<String> sourceCidrList = new ArrayList<String>(); + sourceCidrList.add("0.0.0.0/0"); + + try { + CreateFirewallRuleCmd rule = new CreateFirewallRuleCmd(); + rule = ComponentContext.inject(rule); + + Field addressField = rule.getClass().getDeclaredField("ipAddressId"); + addressField.setAccessible(true); + addressField.set(rule, publicIp.getId()); + + Field protocolField = rule.getClass().getDeclaredField("protocol"); + protocolField.setAccessible(true); + protocolField.set(rule, "TCP"); + + Field startPortField = rule.getClass().getDeclaredField("publicStartPort"); + startPortField.setAccessible(true); + startPortField.set(rule, CLUSTER_API_PORT); + + Field endPortField = rule.getClass().getDeclaredField("publicEndPort"); + endPortField.setAccessible(true); + endPortField.set(rule, CLUSTER_API_PORT); + + Field cidrField = rule.getClass().getDeclaredField("cidrlist"); + cidrField.setAccessible(true); + cidrField.set(rule, sourceCidrList); + + firewallService.createIngressFirewallRule(rule); + firewallService.applyIngressFwRules(publicIp.getId(), account); + + LOGGER.debug(String.format("Provisioned firewall rule to open up port %d on %s for Kubernetes cluster ID: %s", + CLUSTER_API_PORT, publicIp.getAddress().addr(), kubernetesCluster.getUuid())); + } catch (Exception e) { + String msg = String.format("Failed to provision firewall rules for API access for the Kubernetes cluster ID: %s", kubernetesCluster.getUuid()); + LOGGER.warn(msg, e); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.CreateFailed); + throw new ManagementServerException(msg, e); + } + + try { + CreateFirewallRuleCmd rule = new CreateFirewallRuleCmd(); + rule = ComponentContext.inject(rule); + + Field addressField = rule.getClass().getDeclaredField("ipAddressId"); + addressField.setAccessible(true); + addressField.set(rule, publicIp.getId()); + + Field protocolField = rule.getClass().getDeclaredField("protocol"); + protocolField.setAccessible(true); + protocolField.set(rule, "TCP"); + + Field startPortField = rule.getClass().getDeclaredField("publicStartPort"); + startPortField.setAccessible(true); + startPortField.set(rule, CLUSTER_NODES_DEFAULT_START_SSH_PORT); + + Field endPortField = rule.getClass().getDeclaredField("publicEndPort"); + endPortField.setAccessible(true); + int endPort = CLUSTER_NODES_DEFAULT_START_SSH_PORT + clusterVMIds.size() - 1; + endPortField.set(rule, endPort); // clusterVMIds contains all nodes including master + + Field cidrField = rule.getClass().getDeclaredField("cidrlist"); + cidrField.setAccessible(true); + cidrField.set(rule, sourceCidrList); + + firewallService.createIngressFirewallRule(rule); + firewallService.applyIngressFwRules(publicIp.getId(), account); + + LOGGER.debug(String.format("Provisioned firewall rule to open up port %d to %d on %s for Kubernetes cluster ID: %s", CLUSTER_NODES_DEFAULT_START_SSH_PORT, endPort, publicIp.getAddress().addr(), kubernetesCluster.getUuid())); + } catch (Exception e) { + String msg = String.format("Failed to provision firewall rules for SSH access for the Kubernetes cluster ID: %s", kubernetesCluster.getUuid()); + LOGGER.warn(msg, e); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.CreateFailed); + throw new ManagementServerException(msg, e); + } + + // Load balancer rule fo API access for master node VMs + try { + LoadBalancer lb = lbService.createPublicLoadBalancerRule(null, "api-lb", "LB rule for API access", + CLUSTER_API_PORT, CLUSTER_API_PORT, CLUSTER_API_PORT, CLUSTER_API_PORT, + publicIp.getId(), NetUtils.TCP_PROTO, "roundrobin", kubernetesCluster.getNetworkId(), + kubernetesCluster.getAccountId(), false, NetUtils.TCP_PROTO, true); + + Map<Long, List<String>> vmIdIpMap = new HashMap<>(); + for (int i=0; i<kubernetesCluster.getMasterNodeCount(); ++i) { + List<String> ips = new ArrayList<>(); + Nic masterVmNic = networkModel.getNicInNetwork(clusterVMIds.get(i), kubernetesCluster.getNetworkId()); + ips.add(masterVmNic.getIPv4Address()); + vmIdIpMap.put(clusterVMIds.get(i), ips); + } + lbService.assignToLoadBalancer(lb.getId(), null, vmIdIpMap); + } catch (Exception e) { + String msg = String.format("Failed to provision load balancer rule for API access for the Kubernetes cluster ID: %s", kubernetesCluster.getUuid()); + LOGGER.warn(msg, e); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.CreateFailed); + throw new ManagementServerException(msg, e); + } + + // Port forwarding rule fo SSH access on each node VM + final long publicIpId = publicIp.getId(); + final long networkId = kubernetesCluster.getNetworkId(); + final long accountId = account.getId(); + final long domainId = account.getDomainId(); + + for (int i = 0; i < clusterVMIds.size(); ++i) { + long vmId = clusterVMIds.get(i); + Nic vmNic = networkModel.getNicInNetwork(vmId, kubernetesCluster.getNetworkId()); + final Ip vmIp = new Ip(vmNic.getIPv4Address()); + final long vmIdFinal = vmId; + final int srcPortFinal = CLUSTER_NODES_DEFAULT_START_SSH_PORT + i; + try { + PortForwardingRuleVO pfRule = Transaction.execute(new TransactionCallbackWithException<PortForwardingRuleVO, NetworkRuleConflictException>() { + @Override + public PortForwardingRuleVO doInTransaction(TransactionStatus status) throws NetworkRuleConflictException { + PortForwardingRuleVO newRule = + new PortForwardingRuleVO(null, publicIpId, + srcPortFinal, srcPortFinal, + vmIp, + 22, 22, + "tcp", networkId, accountId, domainId, vmIdFinal); + newRule.setDisplay(true); + newRule.setState(FirewallRule.State.Add); + newRule = portForwardingRulesDao.persist(newRule); + return newRule; + } + }); + rulesService.applyPortForwardingRules(publicIp.getId(), account); + LOGGER.debug(String.format("Provisioned SSH port forwarding rule from port %d to 22 on %s to the VM IP: %s in Kubernetes cluster ID: %s", srcPortFinal, publicIp.getAddress().addr(), vmIp, kubernetesCluster.getUuid())); + } catch (Exception e) { + String msg = String.format("Failed to activate SSH port forwarding rules for the Kubernetes cluster ID: %s", kubernetesCluster.getUuid()); + LOGGER.warn(msg, e); + stateTransitTo(kubernetesCluster.getId(), KubernetesCluster.Event.CreateFailed); + throw new ManagementServerException(msg, e); + } + } + } + + // Open up firewall ports NODES_DEFAULT_START_SSH_PORT to NODES_DEFAULT_START_SSH_PORT+n for SSH access. Also create port-forwarding + // rule to forward public IP traffic to all node VM private IP. Existing node VMs before scaling + // will already be having these rules + private void scaleKubernetesClusterNetworkRules(KubernetesCluster kubernetesCluster, Network network, Account account, + List<Long> clusterVMIds, List<Long> removedVMIds) throws ManagementServerException { + if (!Network.GuestType.Isolated.equals(network.getGuestType())) { + LOGGER.debug(String.format("Network ID: %s for Kubernetes cluster ID: %s is not an isolated network, therefore, no need for network rules", network.getUuid(), kubernetesCluster.getUuid())); + return; + } + IpAddress publicIp = null; + List<? extends IpAddress> addresses = networkModel.listPublicIpsAssignedToGuestNtwk(network.getId(), true); + if (CollectionUtils.isEmpty(addresses)) { + String msg = String.format("No public IP addresses found for network ID: %s, Kubernetes cluster ID: %s", network.getUuid(), kubernetesCluster.getUuid()); + LOGGER.error(msg); + throw new ManagementServerException(msg); + } + for (IpAddress address : addresses) { + if (address.isSourceNat()) { + publicIp = address; + break; + } + } + if (publicIp == null) { + String msg = String.format("No source NAT IP addresses found for network ID: %s, Kubernetes cluster ID: %s", network.getUuid(), kubernetesCluster.getUuid()); + LOGGER.error(msg); + throw new ManagementServerException(msg); + } + + List<String> sourceCidrList Review comment: Refactored ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services