http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
new file mode 100644
index 0000000..61b8ce5
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
@@ -0,0 +1,297 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSet;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.Shell;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Locates the GPU discovery executable and runs it to collect information
+ * about the GPUs of this node. One shared instance is available through
+ * {@link #getInstance()}; {@link #initialize(Configuration)} has to be
+ * called before any of the query methods.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+public class GpuDiscoverer {
+  public static final Logger LOG = LoggerFactory.getLogger(
+      GpuDiscoverer.class);
+  @VisibleForTesting
+  protected static final String DEFAULT_BINARY_NAME = "nvidia-smi";
+
+  // When executable path not set, try to search default dirs
+  // By default search /usr/bin, /bin, and /usr/local/nvidia/bin (when
+  // launched by nvidia-docker).
+  private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
+      "/usr/bin", "/bin", "/usr/local/nvidia/bin");
+
+  // command should not run more than 10 sec.
+  private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
+  // Consecutive failures after which the binary is not executed again.
+  private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
+  private static GpuDiscoverer instance;
+
+  static {
+    instance = new GpuDiscoverer();
+  }
+
+  private Configuration conf = null;
+  private String pathOfGpuBinary = null;
+  private Map<String, String> environment = new HashMap<>();
+  private GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
+
+  private int numOfErrorExecutionSinceLastSucceed = 0;
+  // Result of the latest successful discovery; null until one succeeds.
+  GpuDeviceInformation lastDiscoveredGpuInformation = null;
+
+  private void validateConfOrThrowException() throws YarnException {
+    if (conf == null) {
+      throw new YarnException("Please initialize (call initialize) before use "
+          + GpuDiscoverer.class.getSimpleName());
+    }
+  }
+
+  /**
+   * Get GPU device information from system.
+   * This needs to be called after initialize.
+   *
+   * Please note that this only works on *NIX platform, so external caller
+   * needs to make sure of this.
+   *
+   * @return GpuDeviceInformation
+   * @throws YarnException when any error happens
+   */
+  public synchronized GpuDeviceInformation getGpuDeviceInformation()
+      throws YarnException {
+    validateConfOrThrowException();
+
+    if (null == pathOfGpuBinary) {
+      throw new YarnException(
+          "Failed to find GPU discovery executable, please double check "
+              + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting.");
+    }
+
+    // ">=" instead of "==": keep refusing even if the counter ever ends up
+    // above the limit.
+    if (numOfErrorExecutionSinceLastSucceed >= MAX_REPEATED_ERROR_ALLOWED) {
+      String msg =
+          "Failed to execute GPU device information detection script for "
+              + MAX_REPEATED_ERROR_ALLOWED
+              + " times, skip following executions.";
+      LOG.error(msg);
+      throw new YarnException(msg);
+    }
+
+    try {
+      String output = Shell.execCommand(environment,
+          new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS);
+      GpuDeviceInformation info = parser.parseXml(output);
+      numOfErrorExecutionSinceLastSucceed = 0;
+      lastDiscoveredGpuInformation = info;
+      return info;
+    } catch (IOException e) {
+      numOfErrorExecutionSinceLastSucceed++;
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Failed to execute " + pathOfGpuBinary
+            + " exception message:" + e.getMessage() + ", continue ...");
+      }
+      throw new YarnException(e);
+    } catch (YarnException e) {
+      numOfErrorExecutionSinceLastSucceed++;
+      // Logged at debug level, consistent with the isDebugEnabled() guard
+      // and with the IOException branch; the exception is rethrown anyway.
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Failed to parse xml output, exception message:"
+            + e.getMessage(), e);
+      }
+      throw e;
+    }
+  }
+
+  /**
+   * Get list of minor device numbers of Gpu devices usable by YARN.
+   *
+   * @return List of minor device numbers of Gpu devices.
+   * @throws YarnException when automatic discovery has not succeeded yet, or
+   *         when the configured device list contains a non-numeric entry
+   */
+  public synchronized List<Integer> getMinorNumbersOfGpusUsableByYarn()
+      throws YarnException {
+    validateConfOrThrowException();
+
+    String allowedDevicesStr = conf.get(
+        YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
+        YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
+
+    List<Integer> minorNumbers = new ArrayList<>();
+
+    if (allowedDevicesStr.equals(
+        YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
+      // Get gpu device information from system.
+      if (null == lastDiscoveredGpuInformation) {
+        String msg = YarnConfiguration.NM_GPU_ALLOWED_DEVICES + " is set to "
+            + YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES
+            + ", however automatically discovering "
+            + "GPU information failed, please check NodeManager log for more"
+            + " details, as an alternative, admin can specify "
+            + YarnConfiguration.NM_GPU_ALLOWED_DEVICES
+            + " manually to enable GPU isolation.";
+        LOG.error(msg);
+        throw new YarnException(msg);
+      }
+
+      if (lastDiscoveredGpuInformation.getGpus() != null) {
+        for (PerGpuDeviceInformation gpu : lastDiscoveredGpuInformation
+            .getGpus()) {
+          minorNumbers.add(gpu.getMinorNumber());
+        }
+      }
+    } else {
+      for (String s : allowedDevicesStr.split(",")) {
+        String minorNumber = s.trim();
+        if (minorNumber.isEmpty()) {
+          continue;
+        }
+        try {
+          minorNumbers.add(Integer.valueOf(minorNumber));
+        } catch (NumberFormatException e) {
+          // Surface a bad configuration value as the declared checked
+          // exception instead of an unchecked NumberFormatException.
+          throw new YarnException("Illegal GPU minor number \"" + minorNumber
+              + "\" found in " + YarnConfiguration.NM_GPU_ALLOWED_DEVICES
+              + "=" + allowedDevicesStr, e);
+        }
+      }
+      LOG.info("Allowed GPU devices with minor numbers:" + allowedDevicesStr);
+    }
+
+    return minorNumbers;
+  }
+
+  /**
+   * Store the configuration, resolve the path of the GPU discovery binary and
+   * try one discovery run. A failed discovery run is only logged.
+   *
+   * @param conf configuration to read the binary path from
+   * @throws YarnException declared by the signature; a failure of the
+   *         discovery run is caught and logged, not propagated
+   */
+  public synchronized void initialize(Configuration conf) throws YarnException {
+    this.conf = conf;
+    numOfErrorExecutionSinceLastSucceed = 0;
+    // Drop a path resolved by an earlier initialize() call, so a binary that
+    // cannot be found any more is not silently reused.
+    pathOfGpuBinary = null;
+    String pathToExecutable = conf.get(YarnConfiguration.NM_GPU_PATH_TO_EXEC,
+        YarnConfiguration.DEFAULT_NM_GPU_PATH_TO_EXEC);
+    if (pathToExecutable.isEmpty()) {
+      pathToExecutable = DEFAULT_BINARY_NAME;
+    }
+
+    // Validate file existence
+    File binaryPath = new File(pathToExecutable);
+
+    if (!binaryPath.exists()) {
+      // Configured binary does not exist, look into the default dirs.
+      File fallback = findBinaryInDefaultDirs();
+      if (fallback != null) {
+        pathOfGpuBinary = fallback.getAbsolutePath();
+      } else {
+        // Report the configured path (not the last directory searched) and
+        // do not claim a default binary is used: no binary is selected.
+        LOG.warn("Failed to locate binary at:" + binaryPath.getAbsolutePath()
+            + " or as " + DEFAULT_BINARY_NAME + " under "
+            + DEFAULT_BINARY_SEARCH_DIRS + ", please double check ["
+            + YarnConfiguration.NM_GPU_PATH_TO_EXEC + "] setting.");
+      }
+    } else {
+      // If path specified by user is a directory, use the default binary
+      // name under that directory.
+      if (binaryPath.isDirectory()) {
+        binaryPath = new File(binaryPath, DEFAULT_BINARY_NAME);
+        LOG.warn("Specified path is a directory, use " + DEFAULT_BINARY_NAME
+            + " under the directory, updated path-to-executable:" + binaryPath
+            .getAbsolutePath());
+      }
+      // Validated
+      pathOfGpuBinary = binaryPath.getAbsolutePath();
+    }
+
+    // Try to discover GPU information once and print
+    try {
+      LOG.info("Trying to discover GPU information ...");
+      GpuDeviceInformation info = getGpuDeviceInformation();
+      LOG.info(info.toString());
+    } catch (YarnException e) {
+      String msg =
+          "Failed to discover GPU information from system, exception message:"
+              + e.getMessage() + " continue...";
+      LOG.warn(msg);
+    }
+  }
+
+  /**
+   * Look for the default binary in the default search directories.
+   *
+   * @return the first existing candidate, or null when there is none
+   */
+  private static File findBinaryInDefaultDirs() {
+    for (String dir : DEFAULT_BINARY_SEARCH_DIRS) {
+      File candidate = new File(dir, DEFAULT_BINARY_NAME);
+      if (candidate.exists()) {
+        return candidate;
+      }
+    }
+    return null;
+  }
+
+  @VisibleForTesting
+  protected Map<String, String> getEnvironmentToRunCommand() {
+    return environment;
+  }
+
+  @VisibleForTesting
+  protected String getPathOfGpuBinary() {
+    return pathOfGpuBinary;
+  }
+
+  public static GpuDiscoverer getInstance() {
+    return instance;
+  }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
new file mode 100644
index 0000000..f6bf506
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceInformation;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
+import org.apache.hadoop.yarn.util.resource.ResourceUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.List;
+import java.util.Map;
+
+import static org.apache.hadoop.yarn.api.records.ResourceInformation.GPU_URI;
+
+/**
+ * Node resource updater that stores the number of GPUs usable by YARN as the
+ * GPU value of the resource it is given.
+ */
+public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
+  private static final Logger LOG =
+      LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
+
+  /**
+   * Sets the {@code GPU_URI} value of {@code res} to the number of GPUs
+   * reported by {@link GpuDiscoverer#getMinorNumbersOfGpusUsableByYarn()}.
+   * The resource is left untouched when that list is null or empty.
+   *
+   * @param res resource to update
+   * @throws YarnException when usable GPUs exist but {@code GPU_URI} is not a
+   *         configured resource type, or when the discoverer call fails
+   */
+  @Override
+  public void updateConfiguredResource(Resource res) throws YarnException {
+    LOG.info("Initializing configured GPU resources for the NodeManager.");
+
+    List<Integer> gpuMinorNumbers =
+        GpuDiscoverer.getInstance().getMinorNumbersOfGpusUsableByYarn();
+    boolean noUsableGpu =
+        gpuMinorNumbers == null || gpuMinorNumbers.isEmpty();
+    if (noUsableGpu) {
+      // No gpu can be used by YARN.
+      LOG.info("Didn't find any usable GPUs on the NodeManager.");
+      return;
+    }
+
+    long gpuCount = gpuMinorNumbers.size();
+
+    Map<String, ResourceInformation> knownResourceTypes =
+        ResourceUtils.getResourceTypes();
+    if (knownResourceTypes.containsKey(GPU_URI)) {
+      res.setResourceValue(GPU_URI, gpuCount);
+      return;
+    }
+
+    throw new YarnException("Found " + gpuCount + " usable GPUs, however "
+        + GPU_URI
+        + " resource-type is not configured inside"
+        + " resource-types.xml, please configure it to enable GPU feature or"
+        + " remove " + GPU_URI + " from "
+        + YarnConfiguration.NM_RESOURCE_PLUGINS);
+  }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
new file mode 100644
index 0000000..9576ce7
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.Context;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandler;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu.GpuResourceHandlerImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.NodeResourceUpdaterPlugin;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
+
+/**
+ * Resource plugin for GPUs: initializes the shared {@link GpuDiscoverer} and
+ * hands out the GPU resource handler and the GPU node resource updater.
+ */
+public class GpuResourcePlugin implements ResourcePlugin {
+  // Created by the first createResourceHandler() call.
+  private ResourceHandler gpuResourceHandler;
+  // Created by initialize(); null before that.
+  private GpuNodeResourceUpdateHandler resourceDiscoverHandler;
+
+  /**
+   * Create the node resource updater and initialize the shared
+   * {@link GpuDiscoverer} with the configuration of the given context.
+   *
+   * @param context context providing the configuration
+   * @throws YarnException when initializing the discoverer fails
+   */
+  @Override
+  public synchronized void initialize(Context context) throws YarnException {
+    this.resourceDiscoverHandler = new GpuNodeResourceUpdateHandler();
+    GpuDiscoverer discoverer = GpuDiscoverer.getInstance();
+    discoverer.initialize(context.getConf());
+  }
+
+  /**
+   * Return the GPU resource handler. It is created from the given arguments
+   * on the first call; later calls return that same instance.
+   */
+  @Override
+  public synchronized ResourceHandler createResourceHandler(
+      Context context, CGroupsHandler cGroupsHandler,
+      PrivilegedOperationExecutor privilegedOperationExecutor) {
+    if (this.gpuResourceHandler != null) {
+      return this.gpuResourceHandler;
+    }
+
+    this.gpuResourceHandler = new GpuResourceHandlerImpl(context,
+        cGroupsHandler, privilegedOperationExecutor);
+    return this.gpuResourceHandler;
+  }
+
+  /**
+   * @return the node resource updater created by
+   *         {@link #initialize(Context)}, or null before it was called
+   */
+  @Override
+  public synchronized NodeResourceUpdaterPlugin getNodeResourceHandlerInstance() {
+    return this.resourceDiscoverHandler;
+  }
+
+  @Override
+  public void cleanup() throws YarnException {
+    // Do nothing.
+  }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java
new file mode 100644
index 0000000..977032a
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformation.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlRootElement;
+import java.util.List;
+
+/**
+ * All GPU Device Information in the system, bound to the
+ * {@code nvidia_smi_log} XML root element.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "nvidia_smi_log")
+public class GpuDeviceInformation {
+  // One entry per "gpu" element; stays null until setGpus() is called.
+  List<PerGpuDeviceInformation> gpus;
+
+  // "N/A" until setDriverVersion() is called.
+  String driverVersion = "N/A";
+
+  // More fields like topology information could be added when needed.
+  // ...
+
+  /**
+   * @return the per-GPU entries, may be null
+   */
+  @javax.xml.bind.annotation.XmlElement(name = "gpu")
+  public List<PerGpuDeviceInformation> getGpus() {
+    return this.gpus;
+  }
+
+  public void setGpus(List<PerGpuDeviceInformation> gpus) {
+    this.gpus = gpus;
+  }
+
+  /**
+   * @return the driver version, "N/A" when it was never set
+   */
+  @javax.xml.bind.annotation.XmlElement(name = "driver_version")
+  public String getDriverVersion() {
+    return this.driverVersion;
+  }
+
+  public void setDriverVersion(String driverVersion) {
+    this.driverVersion = driverVersion;
+  }
+
+  /**
+   * Render a header line, the driver version and one tab-indented line per
+   * GPU entry.
+   */
+  @Override
+  public String toString() {
+    StringBuilder text = new StringBuilder();
+    text.append("=== Gpus in the system ===\n");
+    text.append("\tDriver Version:");
+    text.append(getDriverVersion());
+    text.append("\n");
+
+    if (gpus == null) {
+      return text.toString();
+    }
+
+    for (PerGpuDeviceInformation perGpu : gpus) {
+      text.append("\t").append(perGpu.toString()).append("\n");
+    }
+    return text.toString();
+  }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
----------------------------------------------------------------------
diff --git
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
new file mode 100644
index 0000000..1bd92f6
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/GpuDeviceInformationParser.java
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+import javax.xml.bind.JAXBContext;
+import javax.xml.bind.JAXBException;
+import javax.xml.bind.Unmarshaller;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParserFactory;
+import javax.xml.transform.sax.SAXSource;
+import java.io.StringReader;
+
+/**
+ * Parse XML and get GPU device information. The XML reader and the JAXB
+ * unmarshaller are created lazily by the first {@link #parseXml(String)} call.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+public class GpuDeviceInformationParser {
+  private static final Logger LOG = LoggerFactory.getLogger(
+      GpuDeviceInformationParser.class);
+
+  private Unmarshaller unmarshaller = null;
+  private XMLReader xmlReader = null;
+
+  /**
+   * Create the XML reader and the JAXB unmarshaller used by
+   * {@link #parseXml(String)}.
+   */
+  private void init()
+      throws SAXException, ParserConfigurationException, JAXBException {
+    SAXParserFactory spf = SAXParserFactory.newInstance();
+    // Disable external-dtd since by default nvidia-smi output contains
+    // <!DOCTYPE nvidia_smi_log SYSTEM "nvsmi_device_v8.dtd"> in header
+    spf.setFeature(
+        "http://apache.org/xml/features/nonvalidating/load-external-dtd",
+        false);
+    spf.setFeature("http://xml.org/sax/features/validation", false);
+    // The DOCTYPE itself has to stay allowed (see above), so also refuse to
+    // resolve external entities that a document could declare in it.
+    spf.setFeature(
+        "http://xml.org/sax/features/external-general-entities", false);
+    spf.setFeature(
+        "http://xml.org/sax/features/external-parameter-entities", false);
+
+    JAXBContext jaxbContext = JAXBContext.newInstance(
+        GpuDeviceInformation.class);
+
+    this.xmlReader = spf.newSAXParser().getXMLReader();
+    this.unmarshaller = jaxbContext.createUnmarshaller();
+  }
+
+  /**
+   * Parse the given XML into a {@link GpuDeviceInformation}.
+   *
+   * @param xmlContent XML document to parse
+   * @return the unmarshalled GPU device information
+   * @throws YarnException when the content is null, the parser cannot be
+   *         initialized or the content cannot be unmarshalled
+   */
+  public synchronized GpuDeviceInformation parseXml(String xmlContent)
+      throws YarnException {
+    if (xmlContent == null) {
+      // new StringReader(null) would fail with a NullPointerException.
+      throw new YarnException("No XML content to parse");
+    }
+
+    if (unmarshaller == null) {
+      try {
+        init();
+      } catch (SAXException | ParserConfigurationException | JAXBException e) {
+        LOG.error("Exception while initialize parser", e);
+        throw new YarnException(e);
+      }
+    }
+
+    InputSource inputSource = new InputSource(new StringReader(xmlContent));
+    SAXSource source = new SAXSource(xmlReader, inputSource);
+    try {
+      return (GpuDeviceInformation) unmarshaller.unmarshal(source);
+    } catch (JAXBException e) {
+      LOG.error("Exception while parsing xml", e);
+      throw new YarnException(e);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java
new file mode 100644
index 0000000..f315313
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuDeviceInformation.java
@@ -0,0 +1,165 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlAdapter;
+
+/**
+ * Capture single GPU device information such as memory size, temperature,
+ * utilization.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "gpu")
+public class PerGpuDeviceInformation {
+
+  private String productName = "N/A";
+  private String uuid = "N/A";
+  private int minorNumber = -1;
+
+  private PerGpuUtilizations gpuUtilizations;
+  private PerGpuMemoryUsage gpuMemoryUsage;
+  private PerGpuTemperature temperature;
+
+  /**
+   * Return the part of the trimmed value in front of the first space, e.g.
+   * "34" for "34 C". Blank input yields an empty string.
+   */
+  private static String leadingToken(String v) {
+    // Trim first: a leading space would otherwise make the first token
+    // empty, and an all-space value would split into an empty array.
+    return v.trim().split(" ")[0];
+  }
+
+  /**
+   * Convert formats like "34 C", "75.6 %" to float.
+   */
+  @InterfaceAudience.Private
+  @InterfaceStability.Unstable
+  static class StrToFloatBeforeSpaceAdapter extends
+      XmlAdapter<String, Float> {
+    @Override
+    public String marshal(Float v) throws Exception {
+      if (v == null) {
+        return "";
+      }
+      return String.valueOf(v);
+    }
+
+    /**
+     * @return the leading number of the value, or -1 when the value is null
+     *         or does not start with a number
+     */
+    @Override
+    public Float unmarshal(String v) throws Exception {
+      if (v == null) {
+        return -1f;
+      }
+
+      try {
+        return Float.valueOf(leadingToken(v));
+      } catch (NumberFormatException e) {
+        // Non-numeric text (for example a placeholder such as "N/A") is
+        // reported with the same sentinel as a missing value.
+        return -1f;
+      }
+    }
+  }
+
+  /**
+   * Convert formats like "725 MiB" to long.
+   */
+  @InterfaceAudience.Private
+  @InterfaceStability.Unstable
+  static class StrToMemAdapter extends XmlAdapter<String, Long> {
+    @Override
+    public String marshal(Long v) throws Exception {
+      if (v == null) {
+        return "";
+      }
+      return String.valueOf(v) + " MiB";
+    }
+
+    /**
+     * @return the leading number of the value, or -1 when the value is null
+     *         or does not start with an integer number
+     */
+    @Override
+    public Long unmarshal(String v) throws Exception {
+      if (v == null) {
+        return -1L;
+      }
+
+      try {
+        return Long.valueOf(leadingToken(v));
+      } catch (NumberFormatException e) {
+        // Same sentinel as for a missing value.
+        return -1L;
+      }
+    }
+  }
+
+  @XmlElement(name = "temperature")
+  public PerGpuTemperature getTemperature() {
+    return temperature;
+  }
+
+  public void setTemperature(PerGpuTemperature temperature) {
+    this.temperature = temperature;
+  }
+
+  @XmlElement(name = "uuid")
+  public String getUuid() {
+    return uuid;
+  }
+
+  public void setUuid(String uuid) {
+    this.uuid = uuid;
+  }
+
+  @XmlElement(name = "product_name")
+  public String getProductName() {
+    return productName;
+  }
+
+  public void setProductName(String productName) {
+    this.productName = productName;
+  }
+
+  @XmlElement(name = "minor_number")
+  public int getMinorNumber() {
+    return minorNumber;
+  }
+
+  public void setMinorNumber(int minorNumber) {
+    this.minorNumber = minorNumber;
+  }
+
+  @XmlElement(name = "utilization")
+  public PerGpuUtilizations getGpuUtilizations() {
+    return gpuUtilizations;
+  }
+
+  public void setGpuUtilizations(PerGpuUtilizations utilizations) {
+    this.gpuUtilizations = utilizations;
+  }
+
+  @XmlElement(name = "bar1_memory_usage")
+  public PerGpuMemoryUsage getGpuMemoryUsage() {
+    return gpuMemoryUsage;
+  }
+
+  public void setGpuMemoryUsage(PerGpuMemoryUsage gpuMemoryUsage) {
+    this.gpuMemoryUsage = gpuMemoryUsage;
+  }
+
+
+  /**
+   * Render product name and minor number, followed by total memory and
+   * utilization when the corresponding objects are set.
+   */
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("ProductName=").append(productName).append(", MinorNumber=")
+        .append(minorNumber);
+
+    if (getGpuMemoryUsage() != null) {
+      sb.append(", TotalMemory=").append(
+          getGpuMemoryUsage().getTotalMemoryMiB()).append("MiB");
+    }
+
+    if (getGpuUtilizations() != null) {
+      sb.append(", Utilization=").append(
+          getGpuUtilizations().getOverallGpuUtilization()).append("%");
+    }
+    return sb.toString();
+  }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java
new file mode 100644
index 0000000..3964c4e
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuMemoryUsage.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+import javax.xml.bind.annotation.XmlElement;
+import javax.xml.bind.annotation.XmlRootElement;
+import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter;
+
+/**
+ * Used and free memory in MiB, bound to the {@code bar1_memory_usage} XML
+ * element. A value of -1 means the value is not known.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+@XmlRootElement(name = "bar1_memory_usage")
+public class PerGpuMemoryUsage {
+  // Sentinel for a value that was never set or could not be read.
+  private static final long UNKNOWN = -1L;
+
+  long usedMemoryMiB = UNKNOWN;
+  long availMemoryMiB = UNKNOWN;
+
+  @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class)
+  @XmlElement(name = "used")
+  public Long getUsedMemoryMiB() {
+    return usedMemoryMiB;
+  }
+
+  /**
+   * @param usedMemoryMiB used memory in MiB; null is stored as -1 (unknown)
+   */
+  public void setUsedMemoryMiB(Long usedMemoryMiB) {
+    // Unboxing a null Long would throw a NullPointerException.
+    this.usedMemoryMiB = usedMemoryMiB == null ? UNKNOWN : usedMemoryMiB;
+  }
+
+  @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToMemAdapter.class)
+  @XmlElement(name = "free")
+  public Long getAvailMemoryMiB() {
+    return availMemoryMiB;
+  }
+
+  /**
+   * @param availMemoryMiB free memory in MiB; null is stored as -1 (unknown)
+   */
+  public void setAvailMemoryMiB(Long availMemoryMiB) {
+    // Unboxing a null Long would throw a NullPointerException.
+    this.availMemoryMiB = availMemoryMiB == null ? UNKNOWN : availMemoryMiB;
+  }
+
+  /**
+   * @return used plus free memory in MiB, or -1 when either of them is
+   *         unknown
+   */
+  public long getTotalMemoryMiB() {
+    // Do not add the -1 sentinels into the sum (that gave -2, or used - 1).
+    if (usedMemoryMiB < 0 || availMemoryMiB < 0) {
+      return UNKNOWN;
+    }
+    return usedMemoryMiB + availMemoryMiB;
+  }
+}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java
new file mode 100644
index 0000000..ccd60cb
--- /dev/null
+++
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuTemperature.java @@ -0,0 +1,80 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * <p> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +import javax.xml.bind.annotation.XmlElement; +import javax.xml.bind.annotation.XmlRootElement; +import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter; + +/** + * Temperature of GPU + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +@XmlRootElement(name = "temperature") +public class PerGpuTemperature { + private float currentGpuTemp = Float.MIN_VALUE; + private float maxGpuTemp = Float.MIN_VALUE; + private float slowThresholdGpuTemp = Float.MIN_VALUE; + + /** + * Get current celsius GPU temperature + * @return temperature + */ + @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class) + @XmlElement(name = "gpu_temp") + public Float getCurrentGpuTemp() { + return currentGpuTemp; + } + + public void setCurrentGpuTemp(Float currentGpuTemp) { + this.currentGpuTemp = currentGpuTemp; + } + + /** + * Get max possible celsius GPU temperature + * @return temperature + */ + @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class) + @XmlElement(name = "gpu_temp_max_threshold") + public Float getMaxGpuTemp() { + return maxGpuTemp; + } + + public void setMaxGpuTemp(Float maxGpuTemp) { + this.maxGpuTemp = maxGpuTemp; + } + + /** + * Get celsius GPU temperature which could make GPU runs slower + * @return temperature + */ + @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class) + @XmlElement(name = "gpu_temp_slow_threshold") + public Float getSlowThresholdGpuTemp() { + return slowThresholdGpuTemp; + } + + public void setSlowThresholdGpuTemp(Float slowThresholdGpuTemp) { + this.slowThresholdGpuTemp = slowThresholdGpuTemp; + } +} 
http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java new file mode 100644 index 0000000..4ef218b --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/dao/gpu/PerGpuUtilizations.java @@ -0,0 +1,50 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +import javax.xml.bind.annotation.XmlElement; +import javax.xml.bind.annotation.XmlRootElement; +import javax.xml.bind.annotation.adapters.XmlJavaTypeAdapter; + +/** + * GPU utilizations + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +@XmlRootElement(name = "utilization") +public class PerGpuUtilizations { + private float overallGpuUtilization; + + /** + * Overall percent GPU utilization + * @return utilization + */ + @XmlJavaTypeAdapter(PerGpuDeviceInformation.StrToFloatBeforeSpaceAdapter.class) + @XmlElement(name = "gpu_util") + public Float getOverallGpuUtilization() { + return overallGpuUtilization; + } + + public void setOverallGpuUtilization(Float overallGpuUtilization) { + this.overallGpuUtilization = overallGpuUtilization; + } +} http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java new file mode 100644 index 0000000..13b3ee9 --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/NodeManagerTestBase.java @@ -0,0 +1,164 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.net.ServerSocketUtil; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.event.Dispatcher; +import org.apache.hadoop.yarn.exceptions.YarnException; +import org.apache.hadoop.yarn.factories.RecordFactory; +import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; +import org.apache.hadoop.yarn.server.api.ResourceTracker; +import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest; +import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse; +import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest; +import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse; +import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerRequest; +import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerResponse; +import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.NodeHeartbeatResponsePBImpl; +import org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.RegisterNodeManagerResponsePBImpl; +import 
org.apache.hadoop.yarn.server.api.protocolrecords.impl.pb.UnRegisterNodeManagerResponsePBImpl; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl; +import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics; +import org.junit.Assert; +import org.junit.Before; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.net.InetAddress; +import java.net.UnknownHostException; + +public class NodeManagerTestBase { + // temp fix until metrics system can auto-detect itself running in unit test: + static { + DefaultMetricsSystem.setMiniClusterMode(true); + } + + protected static final Logger LOG = + LoggerFactory.getLogger(TestNodeStatusUpdater.class); + protected static final File basedir = + new File("target", TestNodeStatusUpdater.class.getName()); + protected static final File nmLocalDir = new File(basedir, "nm0"); + protected static final File tmpDir = new File(basedir, "tmpDir"); + protected static final File remoteLogsDir = new File(basedir, "remotelogs"); + protected static final File logsDir = new File(basedir, "logs"); + protected static final RecordFactory recordFactory = RecordFactoryProvider + .getRecordFactory(null); + protected Configuration conf; + + protected YarnConfiguration createNMConfig() throws IOException { + return createNMConfig(ServerSocketUtil.getPort(49170, 10)); + } + + protected YarnConfiguration createNMConfig(int port) throws IOException { + YarnConfiguration conf = new YarnConfiguration(); + String localhostAddress = null; + try { + localhostAddress = InetAddress.getByName("localhost") + .getCanonicalHostName(); + } catch (UnknownHostException e) { + Assert.fail("Unable to get localhost address: " + e.getMessage()); + } + conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB + conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port); + conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + 
":" + + ServerSocketUtil.getPort(49160, 10)); + conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath()); + conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR, + remoteLogsDir.getAbsolutePath()); + conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath()); + conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1); + return conf; + } + + public static class BaseResourceTrackerForTest implements ResourceTracker { + @Override + public RegisterNodeManagerResponse registerNodeManager( + RegisterNodeManagerRequest request) throws YarnException, IOException { + return new RegisterNodeManagerResponsePBImpl(); + } + + @Override + public NodeHeartbeatResponse nodeHeartbeat(NodeHeartbeatRequest request) + throws YarnException, IOException { + return new NodeHeartbeatResponsePBImpl(); + } + + @Override + public UnRegisterNodeManagerResponse unRegisterNodeManager( + UnRegisterNodeManagerRequest request) + throws YarnException, IOException { + return new UnRegisterNodeManagerResponsePBImpl(); + } + } + + protected static class BaseNodeStatusUpdaterForTest extends NodeStatusUpdaterImpl { + public ResourceTracker resourceTracker; + protected Context context; + + public BaseNodeStatusUpdaterForTest(Context context, Dispatcher dispatcher, + NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics, + ResourceTracker resourceTracker) { + super(context, dispatcher, healthChecker, metrics); + this.context = context; + this.resourceTracker = resourceTracker; + } + @Override + protected ResourceTracker getRMClient() { + return resourceTracker; + } + + @Override + protected void stopRMProxy() { + return; + } + } + + public class MyContainerManager extends ContainerManagerImpl { + public boolean signaled = false; + + public MyContainerManager(Context context, ContainerExecutor exec, + DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater, + NodeManagerMetrics metrics, + LocalDirsHandlerService dirsHandler) { + super(context, exec, 
deletionContext, nodeStatusUpdater, + metrics, dirsHandler); + } + + @Override + public void handle(ContainerManagerEvent event) { + if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) { + signaled = true; + } + } + } + + @Before + public void setUp() throws IOException { + nmLocalDir.mkdirs(); + tmpDir.mkdirs(); + logsDir.mkdirs(); + remoteLogsDir.mkdirs(); + conf = createNMConfig(); + } +} http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java index 2e9eff5..9b180c7 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java @@ -178,7 +178,7 @@ public class TestDefaultContainerExecutor { FileContext lfs = FileContext.getLocalFSFileContext(conf); DefaultContainerExecutor executor = new DefaultContainerExecutor(lfs); executor.setConf(conf); - executor.init(); + executor.init(null); try { executor.createUserLocalDirs(localDirs, user); @@ -317,7 +317,7 @@ public class TestDefaultContainerExecutor { Path workDir = localDir; Path pidFile = new Path(workDir, "pid.txt"); - mockExec.init(); + mockExec.init(null); mockExec.activateContainer(cId, pidFile); int ret = 
mockExec.launchContainer(new ContainerStartContext.Builder() .setContainer(container) http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java index cf8d977..95c8f5e 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java @@ -628,7 +628,7 @@ public class TestLinuxContainerExecutor { LinuxContainerExecutor lce = new LinuxContainerExecutor(); lce.setConf(conf); try { - lce.init(); + lce.init(null); } catch (IOException e) { // expected if LCE isn't setup right, but not necessary for this test } http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java index 79b88cf..249e017 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java @@ -426,7 +426,7 @@ public class TestLinuxContainerExecutorWithMocks { @Test public void testInit() throws Exception { - mockExec.init(); + mockExec.init(mock(Context.class)); assertEquals(Arrays.asList("--checksetup"), readMockParams()); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java index 9279711..b31215b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManager.java @@ -37,7 +37,7 @@ public class TestNodeManager { public static final class InvalidContainerExecutor extends DefaultContainerExecutor { @Override - public void init() throws IOException { + 
public void init(Context nmContext) throws IOException { throw new IOException("dummy executor init called"); } } http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java index 055dab4..533cf2a 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeStatusUpdater.java @@ -20,16 +20,14 @@ package org.apache.hadoop.yarn.server.nodemanager; import static org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils.newNodeHeartbeatResponse; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.EOFException; import java.io.File; import java.io.IOException; import java.net.InetAddress; import java.net.InetSocketAddress; -import java.net.UnknownHostException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collections; @@ -80,8 +78,6 @@ import org.apache.hadoop.yarn.event.Dispatcher; import org.apache.hadoop.yarn.event.EventHandler; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; -import 
org.apache.hadoop.yarn.factories.RecordFactory; -import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NodeHeartbeatResponseProto; import org.apache.hadoop.yarn.security.ContainerTokenIdentifier; import org.apache.hadoop.yarn.server.api.ResourceTracker; @@ -117,41 +113,14 @@ import org.junit.Before; import org.junit.Test; @SuppressWarnings("rawtypes") -public class TestNodeStatusUpdater { - - // temp fix until metrics system can auto-detect itself running in unit test: - static { - DefaultMetricsSystem.setMiniClusterMode(true); - } - - static final Logger LOG = - LoggerFactory.getLogger(TestNodeStatusUpdater.class); - static final File basedir = - new File("target", TestNodeStatusUpdater.class.getName()); - static final File nmLocalDir = new File(basedir, "nm0"); - static final File tmpDir = new File(basedir, "tmpDir"); - static final File remoteLogsDir = new File(basedir, "remotelogs"); - static final File logsDir = new File(basedir, "logs"); - private static final RecordFactory recordFactory = RecordFactoryProvider - .getRecordFactory(null); - +public class TestNodeStatusUpdater extends NodeManagerTestBase { volatile int heartBeatID = 0; volatile Throwable nmStartError = null; private final List<NodeId> registeredNodes = new ArrayList<NodeId>(); private boolean triggered = false; - private Configuration conf; private NodeManager nm; private AtomicBoolean assertionFailedInThread = new AtomicBoolean(false); - @Before - public void setUp() throws IOException { - nmLocalDir.mkdirs(); - tmpDir.mkdirs(); - logsDir.mkdirs(); - remoteLogsDir.mkdirs(); - conf = createNMConfig(); - } - @After public void tearDown() { this.registeredNodes.clear(); @@ -332,29 +301,7 @@ public class TestNodeStatusUpdater { } } - private class MyContainerManager extends ContainerManagerImpl { - public boolean signaled = false; - - public MyContainerManager(Context context, ContainerExecutor exec, - 
DeletionService deletionContext, NodeStatusUpdater nodeStatusUpdater, - NodeManagerMetrics metrics, - LocalDirsHandlerService dirsHandler) { - super(context, exec, deletionContext, nodeStatusUpdater, - metrics, dirsHandler); - } - - @Override - public void handle(ContainerManagerEvent event) { - if (event.getType() == ContainerManagerEventType.SIGNAL_CONTAINERS) { - signaled = true; - } - } - } - - private class MyNodeStatusUpdater extends NodeStatusUpdaterImpl { - public ResourceTracker resourceTracker; - private Context context; - + private class MyNodeStatusUpdater extends BaseNodeStatusUpdaterForTest { public MyNodeStatusUpdater(Context context, Dispatcher dispatcher, NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) { this(context, dispatcher, healthChecker, metrics, false); @@ -363,19 +310,8 @@ public class TestNodeStatusUpdater { public MyNodeStatusUpdater(Context context, Dispatcher dispatcher, NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics, boolean signalContainer) { - super(context, dispatcher, healthChecker, metrics); - this.context = context; - resourceTracker = new MyResourceTracker(this.context, signalContainer); - } - - @Override - protected ResourceTracker getRMClient() { - return resourceTracker; - } - - @Override - protected void stopRMProxy() { - return; + super(context, dispatcher, healthChecker, metrics, + new MyResourceTracker(context, signalContainer)); } } @@ -1818,7 +1754,6 @@ public class TestNodeStatusUpdater { Assert.assertTrue("Test failed with exception(s)" + exceptions, exceptions.isEmpty()); } - // Add new containers info into NM context each time node heart beats. 
private class MyNMContext extends NMContext { @@ -1922,31 +1857,6 @@ public class TestNodeStatusUpdater { this.registeredNodes.size()); } - private YarnConfiguration createNMConfig(int port) throws IOException { - YarnConfiguration conf = new YarnConfiguration(); - String localhostAddress = null; - try { - localhostAddress = InetAddress.getByName("localhost") - .getCanonicalHostName(); - } catch (UnknownHostException e) { - Assert.fail("Unable to get localhost address: " + e.getMessage()); - } - conf.setInt(YarnConfiguration.NM_PMEM_MB, 5 * 1024); // 5GB - conf.set(YarnConfiguration.NM_ADDRESS, localhostAddress + ":" + port); - conf.set(YarnConfiguration.NM_LOCALIZER_ADDRESS, localhostAddress + ":" - + ServerSocketUtil.getPort(49160, 10)); - conf.set(YarnConfiguration.NM_LOG_DIRS, logsDir.getAbsolutePath()); - conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR, - remoteLogsDir.getAbsolutePath()); - conf.set(YarnConfiguration.NM_LOCAL_DIRS, nmLocalDir.getAbsolutePath()); - conf.setLong(YarnConfiguration.NM_LOG_RETAIN_SECONDS, 1); - return conf; - } - - private YarnConfiguration createNMConfig() throws IOException { - return createNMConfig(ServerSocketUtil.getPort(49170, 10)); - } - private NodeManager getNodeManager(final NodeAction nodeHeartBeatAction) { return new NodeManager() { @Override http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java index 0958191..c9c0a38 100644 
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java @@ -18,26 +18,6 @@ package org.apache.hadoop.yarn.server.nodemanager.amrmproxy; -import java.io.IOException; -import java.security.PrivilegedExceptionAction; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; -import java.util.concurrent.Callable; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.ConcurrentMap; -import java.util.concurrent.ExecutorCompletionService; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; - -import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.security.Credentials; import org.apache.hadoop.security.UserGroupInformation; @@ -65,6 +45,7 @@ import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport; import org.apache.hadoop.yarn.server.api.records.AppCollectorData; import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus; import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor; +import org.apache.hadoop.yarn.server.nodemanager.ContainerStateTransitionListener; import org.apache.hadoop.yarn.server.nodemanager.Context; import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService; import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext; @@ -73,18 +54,37 @@ import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater; import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager; import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application; import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredAMRMProxyState; -import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator; import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager; import org.apache.hadoop.yarn.server.nodemanager.security.NMTokenSecretManagerInNM; import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher; +import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator; import org.apache.hadoop.yarn.server.security.ApplicationACLsManager; import org.apache.hadoop.yarn.util.Records; import org.junit.After; import org.junit.Assert; import org.junit.Before; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.security.PrivilegedExceptionAction; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.concurrent.Callable; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; /** * Base class for all the AMRMProxyService test cases. 
It provides utility @@ -803,5 +803,9 @@ public abstract class BaseAMRMProxyTest { public NMTimelinePublisher getNMTimelinePublisher() { return null; } + + public ResourcePluginManager getResourcePluginManager() { + return null; + } } } http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java index e5414a5..0563694 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestResourceHandlerModule.java @@ -22,6 +22,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resourc import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.nodemanager.Context; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -30,6 +31,8 @@ import org.slf4j.LoggerFactory; import java.util.List; +import static org.mockito.Mockito.mock; + public class TestResourceHandlerModule { private static final Logger LOG = 
LoggerFactory.getLogger(TestResourceHandlerModule.class); @@ -62,7 +65,7 @@ public class TestResourceHandlerModule { //Ensure that outbound bandwidth resource handler is present in the chain ResourceHandlerChain resourceHandlerChain = ResourceHandlerModule - .getConfiguredResourceHandlerChain(networkEnabledConf); + .getConfiguredResourceHandlerChain(networkEnabledConf, mock(Context.class)); List<ResourceHandler> resourceHandlers = resourceHandlerChain .getResourceHandlerList(); //Exactly one resource handler in chain @@ -88,7 +91,8 @@ public class TestResourceHandlerModule { Assert.assertNotNull(handler); ResourceHandlerChain resourceHandlerChain = - ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf); + ResourceHandlerModule.getConfiguredResourceHandlerChain(diskConf, + mock(Context.class)); List<ResourceHandler> resourceHandlers = resourceHandlerChain.getResourceHandlerList(); // Exactly one resource handler in chain http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java new file mode 100644 index 0000000..5c70f7a --- /dev/null +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandler.java @@ -0,0 +1,382 @@ +/** + * 
Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.yarn.api.protocolrecords.ResourceTypes; +import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; +import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.ContainerId; +import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; +import org.apache.hadoop.yarn.api.records.Resource; +import org.apache.hadoop.yarn.api.records.ResourceInformation; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.nodemanager.Context; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ResourceMappings; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperation; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationException; +import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.privileged.PrivilegedOperationExecutor; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.CGroupsHandler; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer; +import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService; +import org.apache.hadoop.yarn.util.resource.ResourceUtils; +import org.apache.hadoop.yarn.util.resource.TestResourceUtils; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +import static org.mockito.Matchers.any; +import static org.mockito.Matchers.anyList; +import static org.mockito.Matchers.anyString; +import static org.mockito.Matchers.eq; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class TestGpuResourceHandler { + private CGroupsHandler mockCGroupsHandler; + private PrivilegedOperationExecutor mockPrivilegedExecutor; + private GpuResourceHandlerImpl gpuResourceHandler; + private NMStateStoreService mockNMStateStore; + private ConcurrentHashMap<ContainerId, Container> runningContainersMap; + + @Before + public void setup() { + TestResourceUtils.addNewTypesToResources(ResourceInformation.GPU_URI); + + mockCGroupsHandler = mock(CGroupsHandler.class); + mockPrivilegedExecutor = mock(PrivilegedOperationExecutor.class); + mockNMStateStore = mock(NMStateStoreService.class); + + Context nmctx = mock(Context.class); + 
when(nmctx.getNMStateStore()).thenReturn(mockNMStateStore); + runningContainersMap = new ConcurrentHashMap<>(); + when(nmctx.getContainers()).thenReturn(runningContainersMap); + + gpuResourceHandler = new GpuResourceHandlerImpl(nmctx, mockCGroupsHandler, + mockPrivilegedExecutor); + } + + @Test + public void testBootStrap() throws Exception { + Configuration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0"); + + GpuDiscoverer.getInstance().initialize(conf); + + gpuResourceHandler.bootstrap(conf); + verify(mockCGroupsHandler, times(1)).initializeCGroupController( + CGroupsHandler.CGroupController.DEVICES); + } + + private static ContainerId getContainerId(int id) { + return ContainerId.newContainerId(ApplicationAttemptId + .newInstance(ApplicationId.newInstance(1234L, 1), 1), id); + } + + private static Container mockContainerWithGpuRequest(int id, + int numGpuRequest) { + Container c = mock(Container.class); + when(c.getContainerId()).thenReturn(getContainerId(id)); + + Resource res = Resource.newInstance(1024, 1); + ResourceMappings resMapping = new ResourceMappings(); + + res.setResourceValue(ResourceInformation.GPU_URI, numGpuRequest); + when(c.getResource()).thenReturn(res); + when(c.getResourceMappings()).thenReturn(resMapping); + return c; + } + + private void verifyDeniedDevices(ContainerId containerId, + List<Integer> deniedDevices) + throws ResourceHandlerException, PrivilegedOperationException { + verify(mockCGroupsHandler, times(1)).createCGroup( + CGroupsHandler.CGroupController.DEVICES, containerId.toString()); + + if (null != deniedDevices && !deniedDevices.isEmpty()) { + verify(mockPrivilegedExecutor, times(1)).executePrivilegedOperation( + new PrivilegedOperation(PrivilegedOperation.OperationType.GPU, Arrays + .asList(GpuResourceHandlerImpl.CONTAINER_ID_CLI_OPTION, + containerId.toString(), + GpuResourceHandlerImpl.EXCLUDED_GPUS_CLI_OPTION, + StringUtils.join(",", deniedDevices))), true); + } + } + + @Test + 
public void testAllocation() throws Exception { + Configuration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4"); + GpuDiscoverer.getInstance().initialize(conf); + + gpuResourceHandler.bootstrap(conf); + Assert.assertEquals(4, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + + /* Start container 1, asks for 3 GPUs */ + gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3)); + + // Only device=4 will be blocked. + verifyDeniedDevices(getContainerId(1), Arrays.asList(4)); + + /* Start container 2, asks for 2 GPUs. Expected to fail */ + boolean failedToAllocate = false; + try { + gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 2)); + } catch (ResourceHandlerException e) { + failedToAllocate = true; + } + Assert.assertTrue(failedToAllocate); + + /* Start container 3, asks for 1 GPU, succeeded */ + gpuResourceHandler.preStart(mockContainerWithGpuRequest(3, 1)); + + // devices = 0/1/3 will be blocked + verifyDeniedDevices(getContainerId(3), Arrays.asList(0, 1, 3)); + + /* Start container 4, asks for 0 GPUs, succeeded */ + gpuResourceHandler.preStart(mockContainerWithGpuRequest(4, 0)); + + // All devices will be blocked + verifyDeniedDevices(getContainerId(4), Arrays.asList(0, 1, 3, 4)); + + /* Release container-1, expect cgroups deleted */ + gpuResourceHandler.postComplete(getContainerId(1)); + + verify(mockCGroupsHandler, times(1)).createCGroup( + CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString()); + Assert.assertEquals(3, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + + /* Release container-3, expect cgroups deleted */ + gpuResourceHandler.postComplete(getContainerId(3)); + + verify(mockCGroupsHandler, times(1)).createCGroup( + CGroupsHandler.CGroupController.DEVICES, getContainerId(3).toString()); + Assert.assertEquals(4, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + } + + @SuppressWarnings("unchecked") + @Test + public void 
testAssignedGpuWillBeCleanedupWhenStoreOpFails() + throws Exception { + Configuration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4"); + GpuDiscoverer.getInstance().initialize(conf); + + gpuResourceHandler.bootstrap(conf); + Assert.assertEquals(4, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + + doThrow(new IOException("Exception ...")).when(mockNMStateStore) + .storeAssignedResources( + any(ContainerId.class), anyString(), anyList()); + + boolean exception = false; + /* Start container 1, asks 3 containers */ + try { + gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 3)); + } catch (ResourceHandlerException e) { + exception = true; + } + + Assert.assertTrue("preStart should throw exception", exception); + + // After preStart, we still have 4 available GPU since the store op fails. + Assert.assertEquals(4, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + } + + @Test + public void testAllocationWithoutAllowedGpus() throws Exception { + Configuration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " "); + GpuDiscoverer.getInstance().initialize(conf); + + gpuResourceHandler.bootstrap(conf); + Assert.assertEquals(0, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + + /* Start container 1, asks 0 containers */ + gpuResourceHandler.preStart(mockContainerWithGpuRequest(1, 0)); + verifyDeniedDevices(getContainerId(1), Collections.emptyList()); + + /* Start container 2, asks 1 containers. 
Expected to fail */ + boolean failedToAllocate = false; + try { + gpuResourceHandler.preStart(mockContainerWithGpuRequest(2, 1)); + } catch (ResourceHandlerException e) { + failedToAllocate = true; + } + Assert.assertTrue(failedToAllocate); + + /* Release container 1, expect cgroups deleted */ + gpuResourceHandler.postComplete(getContainerId(1)); + + verify(mockCGroupsHandler, times(1)).createCGroup( + CGroupsHandler.CGroupController.DEVICES, getContainerId(1).toString()); + Assert.assertEquals(0, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + } + + @Test + public void testAllocationStored() throws Exception { + Configuration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4"); + GpuDiscoverer.getInstance().initialize(conf); + + gpuResourceHandler.bootstrap(conf); + Assert.assertEquals(4, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + + /* Start container 1, asks for 3 GPUs */ + Container container = mockContainerWithGpuRequest(1, 3); + gpuResourceHandler.preStart(container); + + verify(mockNMStateStore).storeAssignedResources(getContainerId(1), + ResourceInformation.GPU_URI, + Arrays.asList("0", "1", "3")); + + Assert.assertEquals(3, container.getResourceMappings() + .getAssignedResources(ResourceInformation.GPU_URI).size()); + + // Only device=4 will be blocked. + verifyDeniedDevices(getContainerId(1), Arrays.asList(4)); + + /* Start container 2, asks for 0 GPUs, succeeded */ + container = mockContainerWithGpuRequest(2, 0); + gpuResourceHandler.preStart(container); + + verifyDeniedDevices(getContainerId(2), Arrays.asList(0, 1, 3, 4)); + Assert.assertEquals(0, container.getResourceMappings() + .getAssignedResources(ResourceInformation.GPU_URI).size()); + + // Store assigned resource will not be invoked. 
+ verify(mockNMStateStore, never()).storeAssignedResources( + eq(getContainerId(2)), eq(ResourceInformation.GPU_URI), anyList()); + } + + @Test + public void testRecoverResourceAllocation() throws Exception { + Configuration conf = new YarnConfiguration(); + conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0,1,3,4"); + GpuDiscoverer.getInstance().initialize(conf); + + gpuResourceHandler.bootstrap(conf); + Assert.assertEquals(4, + gpuResourceHandler.getGpuAllocator().getAvailableGpus()); + + Container nmContainer = mock(Container.class); + ResourceMappings rmap = new ResourceMappings(); + ResourceMappings.AssignedResources ar = + new ResourceMappings.AssignedResources(); + ar.updateAssignedResources(Arrays.asList("1", "3")); + rmap.addAssignedResources(ResourceInformation.GPU_URI, ar); + when(nmContainer.getResourceMappings()).thenReturn(rmap); + + runningContainersMap.put(getContainerId(1), nmContainer); + + // TEST CASE + // Reacquire container restore state of GPU Resource Allocator. + gpuResourceHandler.reacquireContainer(getContainerId(1)); + + Map<Integer, ContainerId> deviceAllocationMapping = + gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping(); + Assert.assertEquals(2, deviceAllocationMapping.size()); + Assert.assertTrue( + deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3))); + Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1)); + + // TEST CASE + // Try to reacquire a container but requested device is not in allowed list. + nmContainer = mock(Container.class); + rmap = new ResourceMappings(); + ar = new ResourceMappings.AssignedResources(); + // id=5 is not in allowed list. 
+ ar.updateAssignedResources(Arrays.asList("4", "5")); + rmap.addAssignedResources(ResourceInformation.GPU_URI, ar); + when(nmContainer.getResourceMappings()).thenReturn(rmap); + + runningContainersMap.put(getContainerId(2), nmContainer); + + boolean caughtException = false; + try { + gpuResourceHandler.reacquireContainer(getContainerId(1)); + } catch (ResourceHandlerException e) { + caughtException = true; + } + Assert.assertTrue( + "Should fail since requested device Id is not in allowed list", + caughtException); + + // Make sure internal state not changed. + deviceAllocationMapping = + gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping(); + Assert.assertEquals(2, deviceAllocationMapping.size()); + Assert.assertTrue( + deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3))); + Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1)); + + // TEST CASE + // Try to reacquire a container but requested device is already assigned. + nmContainer = mock(Container.class); + rmap = new ResourceMappings(); + ar = new ResourceMappings.AssignedResources(); + // id=3 is already assigned + ar.updateAssignedResources(Arrays.asList("4", "3")); + rmap.addAssignedResources("gpu", ar); + when(nmContainer.getResourceMappings()).thenReturn(rmap); + + runningContainersMap.put(getContainerId(2), nmContainer); + + caughtException = false; + try { + gpuResourceHandler.reacquireContainer(getContainerId(1)); + } catch (ResourceHandlerException e) { + caughtException = true; + } + Assert.assertTrue( + "Should fail since requested device Id is not in allowed list", + caughtException); + + // Make sure internal state not changed. 
+ deviceAllocationMapping = + gpuResourceHandler.getGpuAllocator().getDeviceAllocationMapping(); + Assert.assertEquals(2, deviceAllocationMapping.size()); + Assert.assertTrue( + deviceAllocationMapping.keySet().containsAll(Arrays.asList(1, 3))); + Assert.assertEquals(deviceAllocationMapping.get(1), getContainerId(1)); + } +} http://git-wip-us.apache.org/repos/asf/hadoop/blob/f17a2e47/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java index 318ae6b..a147afb 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java @@ -70,7 +70,7 @@ public class TestContainersMonitorResourceChange { private static class MockExecutor extends ContainerExecutor { @Override - public void init() throws IOException { + public void init(Context nmContext) throws IOException { } @Override public void startLocalizer(LocalizerStartContext ctx) --------------------------------------------------------------------- To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org For additional 
commands, e-mail: common-commits-h...@hadoop.apache.org