sodonnel commented on code in PR #4614:
URL: https://github.com/apache/ozone/pull/4614#discussion_r1175130849
##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/placement/algorithms/SCMContainerPlacementRackAware.java:
##########
@@ -99,7 +99,177 @@ public SCMContainerPlacementRackAware(final NodeManager
nodeManager,
*/
@Override
protected List<DatanodeDetails> chooseDatanodesInternal(
- List<DatanodeDetails> usedNodes,
+ List<DatanodeDetails> usedNodes,
+ List<DatanodeDetails> excludedNodes,
+ List<DatanodeDetails> favoredNodes, int nodesRequired,
+ long metadataSizeRequired, long dataSizeRequired)
+ throws SCMException {
+ if (!usedNodesPassed(usedNodes)) {
+ // If interface is called without used nodes
+ // In this case consider only exclude nodes to determine racks
+ return chooseDatanodesInternalLegacy(excludedNodes,
+ favoredNodes, nodesRequired, metadataSizeRequired, dataSizeRequired);
+ }
+ Preconditions.checkArgument(nodesRequired > 0);
+ metrics.incrDatanodeRequestCount(nodesRequired);
+ int datanodeCount = networkTopology.getNumOfLeafNode(NetConstants.ROOT);
+ int excludedNodesCount = excludedNodes == null ? 0 : excludedNodes.size();
+ int usedNodesCount = usedNodes == null ? 0 : usedNodes.size();
+ if (datanodeCount < nodesRequired + excludedNodesCount + usedNodesCount) {
+ throw new SCMException("No enough datanodes to choose. " +
+ "TotalNode = " + datanodeCount +
+ " RequiredNode = " + nodesRequired +
+ " ExcludedNode = " + excludedNodesCount +
+ " UsedNode = " + usedNodesCount, null);
+ }
+ List<DatanodeDetails> mutableFavoredNodes = favoredNodes;
+ // sanity check of favoredNodes
+ if (mutableFavoredNodes != null && excludedNodes != null) {
+ mutableFavoredNodes = new ArrayList<>();
+ mutableFavoredNodes.addAll(favoredNodes);
+ mutableFavoredNodes.removeAll(excludedNodes);
+ }
+ int favoredNodeNum = mutableFavoredNodes == null ? 0 :
+ mutableFavoredNodes.size();
+
+ List<DatanodeDetails> chosenNodes = new ArrayList<>();
+ List<DatanodeDetails> mutableUsedNodes = new ArrayList<>();
+ mutableUsedNodes.addAll(usedNodes);
+ List<DatanodeDetails> mutableExcludedNodes = new ArrayList<>();
+ if (excludedNodes != null) {
+ mutableExcludedNodes.addAll(excludedNodes);
+ }
+
+ DatanodeDetails favoredNode;
+ int favorIndex = 0;
+ if (mutableUsedNodes.size() == 0) {
+ // choose all nodes for a new pipeline case
+ // choose first datanode from scope ROOT or from favoredNodes if not null
+ favoredNode = favoredNodeNum > favorIndex ?
+ mutableFavoredNodes.get(favorIndex) : null;
+ DatanodeDetails firstNode;
+ if (favoredNode != null) {
+ firstNode = favoredNode;
+ favorIndex++;
+ } else {
+ firstNode = chooseNode(mutableExcludedNodes, null,
metadataSizeRequired,
+ dataSizeRequired);
+ }
+ chosenNodes.add(firstNode);
+ nodesRequired--;
+ if (nodesRequired == 0) {
+ return Arrays.asList(chosenNodes.toArray(new DatanodeDetails[0]));
+ }
+
+ // choose second datanode on the same rack as first one
+ favoredNode = favoredNodeNum > favorIndex ?
+ mutableFavoredNodes.get(favorIndex) : null;
+ DatanodeDetails secondNode;
+ if (favoredNode != null &&
+ networkTopology.isSameParent(firstNode, favoredNode)) {
+ secondNode = favoredNode;
+ favorIndex++;
+ } else {
+ mutableExcludedNodes.add(firstNode);
+ secondNode = chooseNode(mutableExcludedNodes, Arrays.asList(firstNode),
+ metadataSizeRequired, dataSizeRequired);
+ }
+ chosenNodes.add(secondNode);
+ nodesRequired--;
+ if (nodesRequired == 0) {
+ return Arrays.asList(chosenNodes.toArray(new DatanodeDetails[0]));
+ }
+ // choose remaining datanodes on different rack as first and second
+ mutableExcludedNodes.add(secondNode);
+ } else {
+
+ // choose node to meet replication requirement
+ // case 1: one used node, choose one on the same rack as the used
+ // node, choose others on different racks.
+ if (mutableUsedNodes.size() == 1) {
+ favoredNode = favoredNodeNum > favorIndex ?
+ mutableFavoredNodes.get(favorIndex) : null;
+ DatanodeDetails firstNode;
+ if (favoredNode != null &&
+ networkTopology.isSameParent(mutableUsedNodes.get(0),
+ favoredNode)) {
+ firstNode = favoredNode;
+ favorIndex++;
+ } else {
+ firstNode = chooseNode(mutableExcludedNodes, mutableUsedNodes,
+ metadataSizeRequired, dataSizeRequired);
+ }
+ chosenNodes.add(firstNode);
+ nodesRequired--;
+ if (nodesRequired == 0) {
+ return Arrays.asList(chosenNodes.toArray(new DatanodeDetails[0]));
+ }
+
+ // choose remaining nodes on different racks
+ mutableExcludedNodes.add(firstNode);
+ mutableExcludedNodes.addAll(mutableUsedNodes);
+ } else {
+ // case 2: two or more used nodes, if these two nodes are
+ // in the same rack, then choose nodes on different racks, otherwise,
+ // choose one on the same rack as one of used
+ // nodes, remaining chosen
+ // are on different racks.
+ for (int i = 0; i < usedNodesCount; i++) {
+ for (int j = i + 1; j < usedNodesCount; j++) {
+ if (networkTopology.isSameParent(
+ usedNodes.get(i), usedNodes.get(j))) {
+ // choose remaining nodes on different racks
+ mutableExcludedNodes.addAll(mutableUsedNodes);
+ return chooseNodes(mutableExcludedNodes, chosenNodes,
+ mutableFavoredNodes, favorIndex, nodesRequired,
+ metadataSizeRequired, dataSizeRequired);
+ }
+ }
+ }
+ // choose one data on the same rack with one used node
+ favoredNode = favoredNodeNum > favorIndex ?
+ mutableFavoredNodes.get(favorIndex) : null;
+ DatanodeDetails secondNode;
+ if (favoredNode != null && networkTopology.isSameParent(
+ chosenNodes.get(0), favoredNode)) {
+ secondNode = favoredNode;
+ favorIndex++;
+ } else {
+ secondNode =
+ chooseNode(mutableExcludedNodes, mutableUsedNodes,
+ metadataSizeRequired, dataSizeRequired);
+ }
+ chosenNodes.add(secondNode);
+ mutableExcludedNodes.add(secondNode);
+ mutableExcludedNodes.addAll(mutableUsedNodes);
+ nodesRequired--;
+ if (nodesRequired == 0) {
+ return Arrays.asList(chosenNodes.toArray(new DatanodeDetails[0]));
+ }
+ }
+ }
+ // choose remaining nodes on different racks
+ return chooseNodes(mutableExcludedNodes, chosenNodes, mutableFavoredNodes,
Review Comment:
I am not sure if this is going to do what we want. The first few lines of
that method are:
```
private List<DatanodeDetails> chooseNodes(List<DatanodeDetails>
excludedNodes,
List<DatanodeDetails> chosenNodes, List<DatanodeDetails> favoredNodes,
int favorIndex, int nodesRequired, long metadataSizeRequired,
long dataSizeRequired) throws SCMException {
Preconditions.checkArgument(chosenNodes != null);
List<DatanodeDetails> excludedNodeList = excludedNodes != null ?
excludedNodes : chosenNodes;
```
If we have excluded nodes and used nodes, then it sets the list to just
excludedNodes - this is fine as you have added the chosen nodes to excluded on
the way through this method.
Then it passes just the excludedNodes to:
```
chosenNode = chooseNode(excludedNodeList, null,
metadataSizeRequired, dataSizeRequired);
```
Which eventually calls:
```
node =
(DatanodeDetails)networkTopology.chooseRandom(NetConstants.ROOT,
excludedNodesForCapacity, excludedNodes, null, ancestorGen);
```
This is what I am not sure about - does that call above attempt to ensure
that the picked node does not share a rack with any of the excluded nodes? If
we had a couple of excluded nodes (eg decommissioning nodes) and then we pick
some new used nodes already - that exlude list will container <decommissioning
ones> + <already chosen>, and then it will attempt to pick a new node that does
not share any of those racks, which is too restrictive, as we are happy to use
the decommissioning racks again.
The Javadoc is not really clear to me:
```
* @param ancestorGen If 0, then no same generation ancestor enforcement on
* both excludedNodes and affinityNode. If greater
than 0,
* then apply to affinityNode(if not null), or apply to
* excludedNodes if affinityNode is null
```
ancesorGen is getting passed as 1 so it is greater than zero.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]