tflobbe commented on code in PR #561:
URL: https://github.com/apache/solr-operator/pull/561#discussion_r1195591555


##########
controllers/solr_cluster_ops_util.go:
##########
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package controllers
+
+import (
+       "context"
+       "errors"
+       solrv1beta1 "github.com/apache/solr-operator/api/v1beta1"
+       "github.com/apache/solr-operator/controllers/util"
+       "github.com/go-logr/logr"
+       appsv1 "k8s.io/api/apps/v1"
+       corev1 "k8s.io/api/core/v1"
+       "k8s.io/utils/pointer"
+       "sigs.k8s.io/controller-runtime/pkg/client"
+       "strconv"
+       "time"
+)
+
+func determineScaleClusterOpLockIfNecessary(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, podList []corev1.Pod, logger logr.Logger) (clusterOpLock string, clusterOpMetadata string, retryLaterDuration time.Duration, err error) {
+       desiredPods := int(*instance.Spec.Replicas)
+       configuredPods := int(*statefulSet.Spec.Replicas)
+       if desiredPods != configuredPods {
+               scaleTo := -1
+               // Start a scaling operation
+               if desiredPods < configuredPods {
+                       // Scale down!
+                       // The option is enabled by default, so treat "nil" like "true"
+                       if instance.Spec.Autoscaling.VacatePodsOnScaleDown == nil || *instance.Spec.Autoscaling.VacatePodsOnScaleDown {
+                               if desiredPods > 0 {
+                                       // We only support scaling down one pod at a time if not scaling down to 0 pods
+                                       scaleTo = configuredPods - 1
+                               } else {
+                                       scaleTo = 0
+                               }
+                       } else {
+                               // The cloud is not set up to use managed scale-down
+                               err = scaleCloudUnmanaged(ctx, r, statefulSet, desiredPods, logger)
+                       }
+               } else if desiredPods > configuredPods {
+                       // Scale up!
+                       // TODO: replicasScaleUp is not supported, so do not make a clusterOp out of it, just do the patch
+                       err = scaleCloudUnmanaged(ctx, r, statefulSet, desiredPods, logger)
+               }
+               if scaleTo > -1 {
+                       clusterOpLock = util.ScaleLock
+                       clusterOpMetadata = strconv.Itoa(scaleTo)
+               }
+       }
+       return
+}
+
+func handleLockedClusterOpScale(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, podList []corev1.Pod, logger logr.Logger) (retryLaterDuration time.Duration, err error) {
+       if scalingToNodes, hasAnn := statefulSet.Annotations[util.ClusterOpsMetadataAnnotation]; hasAnn {
+               if scalingToNodesInt, convErr := strconv.Atoi(scalingToNodes); convErr != nil {
+                       logger.Error(convErr, "Could not convert statefulSet annotation to int for scale-down-to information", "annotation", util.ClusterOpsMetadataAnnotation, "value", scalingToNodes)
+                       err = convErr
+               } else {
+                       replicaManagementComplete := false
+                       if scalingToNodesInt < int(*statefulSet.Spec.Replicas) {
+                               // Manage scaling down the SolrCloud
+                               replicaManagementComplete, err = handleManagedCloudScaleDown(ctx, r, instance, statefulSet, scalingToNodesInt, podList, logger)
+                               // } else if scalingToNodesInt > int(*statefulSet.Spec.Replicas) {
+                               // TODO: Utilize the scaled-up nodes in the future, however Solr does not currently have APIs for this.
+                               // TODO: Think about the order of scale-up and restart when individual nodeService IPs are injected into the pods.
+                               // TODO: Will likely want to do a scale-up of the service first, then do the rolling restart of the cluster, then utilize the node.
+                       } else {
+                               // This shouldn't happen. The ScalingToNodesAnnotation is removed when the statefulSet size changes, through a Patch.
+                               // But if it does happen, we should just remove the annotation and move forward.
+                               patchedStatefulSet := statefulSet.DeepCopy()
+                               delete(patchedStatefulSet.Annotations, util.ClusterOpsLockAnnotation)
+                               delete(patchedStatefulSet.Annotations, util.ClusterOpsMetadataAnnotation)
+                               if err = r.Patch(ctx, patchedStatefulSet, client.StrategicMergeFrom(statefulSet)); err != nil {
+                                       logger.Error(err, "Error while patching StatefulSet to remove unneeded clusterLockOp annotation for scaling to the current amount of nodes")
+                               } else {
+                                       statefulSet = patchedStatefulSet
+                               }
+                       }
+
+                       // Scale down the statefulSet to represent the new number of utilizedPods, if it is lower than the current number of pods
+                       // Also remove the "scalingToNodes" annotation, as that acts as a lock on the cluster, so that other operations,
+                       // such as scale-up, pod updates and further scale-down cannot happen at the same time.
+                       if replicaManagementComplete {
+                               patchedStatefulSet := statefulSet.DeepCopy()
+                               patchedStatefulSet.Spec.Replicas = pointer.Int32(int32(scalingToNodesInt))
+                               delete(patchedStatefulSet.Annotations, util.ClusterOpsLockAnnotation)
+                               delete(patchedStatefulSet.Annotations, util.ClusterOpsMetadataAnnotation)
+                               if err = r.Patch(ctx, patchedStatefulSet, client.StrategicMergeFrom(statefulSet)); err != nil {
+                                       logger.Error(err, "Error while patching StatefulSet to scale down SolrCloud", "newUtilizedNodes", scalingToNodesInt)
+                               }
+
+                               // TODO: Create event for the CRD.
+                       } else {
+                               // Retry after five seconds to check if the replica management commands have been completed
+                               retryLaterDuration = time.Second * 5
+                       }
+               }
+               // If everything succeeded, the statefulSet will have an annotation updated
+               // and the reconcile loop will be called again.
+
+               return
+       } else {
+               err = errors.New("no clusterOpMetadata annotation is present in the statefulSet")
+               logger.Error(err, "Cannot perform scaling operation when no scale-to-nodes is provided via the clusterOpMetadata")
+               return time.Second * 10, err
+       }
+}
+
+// handleManagedCloudScaleDown does the logic of a managed and "locked" cloud scale down operation.
+// This will likely take many reconcile loops to complete, as it is moving replicas away from the nodes that will be scaled down.
+func handleManagedCloudScaleDown(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, scaleDownTo int, podList []corev1.Pod, logger logr.Logger) (replicaManagementComplete bool, err error) {
+       // Before doing anything to the pod, make sure that users cannot send requests to the pod anymore.
+       podStoppedReadinessConditions := map[corev1.PodConditionType]podReadinessConditionChange{
+               util.SolrIsNotStoppedReadinessCondition: {
+                       reason:  ScaleDown,
+                       message: "Pod is being deleted, traffic to the pod must be stopped",
+                       status:  false,
+               },
+       }
+
+       if scaleDownTo == 0 {
+               // Delete all collections & data, the user wants no data left if scaling the solrcloud down to 0
+               // This is a much different operation from deleting the SolrCloud/StatefulSet altogether
+               replicaManagementComplete, err = evictAllPods(ctx, r, instance, podList, podStoppedReadinessConditions, logger)

Review Comment:
   Question: Why is this so different from deleting the SolrCloud/StatefulSet? Isn't this going to delete all data and stop the pods?



##########
controllers/util/solr_update_util.go:
##########
@@ -503,82 +503,65 @@ func GetAllManagedSolrNodeNames(solrCloud *solr.SolrCloud) map[string]bool {
 }
 
 // EvictReplicasForPodIfNecessary takes a solr Pod and migrates all replicas off of that Pod, if the Pod is using ephemeral storage.
-// If the pod is using persistent storage, this function is a no-op.
-// This function MUST be idempotent and return the same list of pods given the same kubernetes/solr state.
-func EvictReplicasForPodIfNecessary(ctx context.Context, solrCloud *solr.SolrCloud, pod *corev1.Pod, podHasReplicas bool, logger logr.Logger) (err error, canDeletePod bool) {
-       var solrDataVolume *corev1.Volume
-
-       // TODO: Remove these checks after v0.7.0, since it will be taken care by the evictReplicas podReadinessCondition
-       dataVolumeName := solrCloud.DataVolumeName()
-       for _, volume := range pod.Spec.Volumes {
-               if volume.Name == dataVolumeName {
-                       solrDataVolume = &volume
-                       break
-               }
-       }
-
-       // Only evict if the Data volume is not persistent
-       if solrDataVolume != nil && solrDataVolume.VolumeSource.PersistentVolumeClaim == nil {
-               // If the Cloud has 1 or zero pods, and this is the "-0" pod, then delete the data since we can't move it anywhere else
-               // Otherwise, move the replicas to other pods
-               if (solrCloud.Spec.Replicas == nil || *solrCloud.Spec.Replicas < 2) && strings.HasSuffix(pod.Name, "-0") {
-                       queryParams := url.Values{}
-                       queryParams.Add("action", "DELETENODE")
-                       queryParams.Add("node", SolrNodeName(solrCloud, pod.Name))
-                       // TODO: Figure out a way to do this, since DeleteNode will not delete the last replica of every type...
-                       canDeletePod = true
-               } else {
-                       requestId := "move-replicas-" + pod.Name
-
-                       // First check to see if the Async Replace request has started
-                       if asyncState, message, asyncErr := solr_api.CheckAsyncRequest(ctx, solrCloud, requestId); asyncErr != nil {
-                               err = asyncErr
-                       } else if asyncState == "notfound" {
-                               if podHasReplicas {
-                                       // Submit new Replace Node request
-                                       replaceResponse := &solr_api.SolrAsyncResponse{}
-                                       queryParams := url.Values{}
-                                       queryParams.Add("action", "REPLACENODE")
-                                       queryParams.Add("parallel", "true")
-                                       queryParams.Add("sourceNode", SolrNodeName(solrCloud, pod.Name))
-                                       queryParams.Add("async", requestId)
-                                       err = solr_api.CallCollectionsApi(ctx, solrCloud, queryParams, replaceResponse)
-                                       if hasError, apiErr := solr_api.CheckForCollectionsApiError("REPLACENODE", replaceResponse.ResponseHeader); hasError {
-                                               err = apiErr
-                                       }
-                                       if err == nil {
-                                               logger.Info("Migrating all replicas off of pod before deletion.", "requestId", requestId, "pod", pod.Name)
-                                       } else {
-                                               logger.Error(err, "Could not migrate all replicas off of pod before deletion. Will try again later.", "requestId", requestId, "message", message)
-                                       }
+// For updates this will only be called for pods using ephemeral data.
+// For scale-down operations, this can be called for pods using ephemeral or persistent data.
+func EvictReplicasForPodIfNecessary(ctx context.Context, solrCloud *solr.SolrCloud, pod *corev1.Pod, podHasReplicas bool, evictionReason string, logger logr.Logger) (err error, canDeletePod bool) {
+       logger = logger.WithValues("evictionReason", evictionReason)
+       // If the Cloud has 1 or zero pods, and this is the "-0" pod, then delete the data since we can't move it anywhere else
+       // Otherwise, move the replicas to other pods
+       if (solrCloud.Spec.Replicas == nil || *solrCloud.Spec.Replicas < 2) && strings.HasSuffix(pod.Name, "-0") {
+               queryParams := url.Values{}
+               queryParams.Add("action", "DELETENODE")
+               queryParams.Add("node", SolrNodeName(solrCloud, pod.Name))
+               // TODO: Figure out a way to do this, since DeleteNode will not delete the last replica of every type...
+               canDeletePod = true
+       } else {
+               requestId := "move-replicas-" + pod.Name
+
+               // First check to see if the Async Replace request has started
+               if asyncState, message, asyncErr := solr_api.CheckAsyncRequest(ctx, solrCloud, requestId); asyncErr != nil {
+                       err = asyncErr
+               } else if asyncState == "notfound" {
+                       if podHasReplicas {
+                               // Submit new Replace Node request
+                               replaceResponse := &solr_api.SolrAsyncResponse{}
+                               queryParams := url.Values{}
+                               queryParams.Add("action", "REPLACENODE")
+                               queryParams.Add("parallel", "true")
+                               queryParams.Add("sourceNode", SolrNodeName(solrCloud, pod.Name))
+                               queryParams.Add("async", requestId)
+                               err = solr_api.CallCollectionsApi(ctx, solrCloud, queryParams, replaceResponse)
+                               if hasError, apiErr := solr_api.CheckForCollectionsApiError("REPLACENODE", replaceResponse.ResponseHeader); hasError {
+                                       err = apiErr
+                               }
+                               if err == nil {
+                                       logger.Info("Migrating all replicas off of pod before deletion.", "requestId", requestId, "pod", pod.Name)
                                } else {
-                                       canDeletePod = true
+                                       logger.Error(err, "Could not migrate all replicas off of pod before deletion. Will try again later.", "requestId", requestId, "message", message)
                                }
-
                        } else {
-                               logger.Info("Found async status", "requestId", requestId, "state", asyncState)
-                               // Only continue to delete the pod if the ReplaceNode request is complete and successful
-                               if asyncState == "completed" {
-                                       canDeletePod = true
-                                       logger.Info("Migration of all replicas off of pod before deletion complete. Pod can now be deleted.", "pod", pod.Name)
-                               } else if asyncState == "failed" {
-                                       logger.Info("Migration of all replicas off of pod before deletion failed. Will try again.", "pod", pod.Name, "message", message)
-                               }
+                               canDeletePod = true
+                       }
 
-                               // Delete the async request Id if the async request is successful or failed.
-                               // If the request failed, this will cause a retry since the next reconcile won't find the async requestId in Solr.
-                               if asyncState == "completed" || asyncState == "failed" {
-                                       if message, err = solr_api.DeleteAsyncRequest(ctx, solrCloud, requestId); err != nil {
-                                               logger.Error(err, "Could not delete Async request status.", "requestId", requestId, "message", message)
-                                       } else {
-                                               canDeletePod = false
-                                       }
+               } else {
+                       logger.Info("Found async status", "requestId", requestId, "state", asyncState)
+                       // Only continue to delete the pod if the ReplaceNode request is complete and successful
+                       if asyncState == "completed" {
+                               canDeletePod = true

Review Comment:
   Do you think it makes sense to check the pod again before proceeding to delete it? Maybe keep doing `REPLACENODE` until it's empty? There is still a race condition, but a much shorter one.
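   
   For illustration only (not part of this PR), a rough Go sketch of that idea, assuming a hypothetical `countReplicasOnNode` helper that would read CLUSTERSTATUS and count the replicas still assigned to the node:
   ```go
   // Illustrative sketch, not code from this PR.
   // countReplicasOnNode is a hypothetical helper that would query CLUSTERSTATUS
   // and count the replicas still assigned to the given Solr node.
   func canDeletePodAfterReplaceNode(ctx context.Context, solrCloud *solr.SolrCloud, podName string) (bool, error) {
       remaining, err := countReplicasOnNode(ctx, solrCloud, SolrNodeName(solrCloud, podName))
       if err != nil {
           return false, err
       }
       // If replicas landed back on the node after the async REPLACENODE finished,
       // keep the pod around and let the next reconcile submit another REPLACENODE.
       return remaining == 0, nil
   }
   ```
   In `EvictReplicasForPodIfNecessary`, the `asyncState == "completed"` branch could call something like this before setting `canDeletePod = true`.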



##########
controllers/solr_cluster_ops_util.go:
##########
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package controllers
+
+import (
+       "context"
+       "errors"
+       solrv1beta1 "github.com/apache/solr-operator/api/v1beta1"
+       "github.com/apache/solr-operator/controllers/util"
+       "github.com/go-logr/logr"
+       appsv1 "k8s.io/api/apps/v1"
+       corev1 "k8s.io/api/core/v1"
+       "k8s.io/utils/pointer"
+       "sigs.k8s.io/controller-runtime/pkg/client"
+       "strconv"
+       "time"
+)
+
+func determineScaleClusterOpLockIfNecessary(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, podList []corev1.Pod, logger logr.Logger) (clusterOpLock string, clusterOpMetadata string, retryLaterDuration time.Duration, err error) {
+       desiredPods := int(*instance.Spec.Replicas)
+       configuredPods := int(*statefulSet.Spec.Replicas)
+       if desiredPods != configuredPods {
+               scaleTo := -1
+               // Start a scaling operation
+               if desiredPods < configuredPods {
+                       // Scale down!
+                       // The option is enabled by default, so treat "nil" like "true"
+                       if instance.Spec.Autoscaling.VacatePodsOnScaleDown == nil || *instance.Spec.Autoscaling.VacatePodsOnScaleDown {
+                               if desiredPods > 0 {
+                                       // We only support scaling down one pod at a time if not scaling down to 0 pods
+                                       scaleTo = configuredPods - 1
+                               } else {
+                                       scaleTo = 0
+                               }
+                       } else {
+                               // The cloud is not set up to use managed scale-down
+                               err = scaleCloudUnmanaged(ctx, r, statefulSet, desiredPods, logger)
+                       }
+               } else if desiredPods > configuredPods {
+                       // Scale up!
+                       // TODO: replicasScaleUp is not supported, so do not make a clusterOp out of it, just do the patch
+                       err = scaleCloudUnmanaged(ctx, r, statefulSet, desiredPods, logger)
+               }
+               if scaleTo > -1 {
+                       clusterOpLock = util.ScaleLock
+                       clusterOpMetadata = strconv.Itoa(scaleTo)
+               }
+       }
+       return
+}
+
+func handleLockedClusterOpScale(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, podList []corev1.Pod, logger logr.Logger) (retryLaterDuration time.Duration, err error) {
+       if scalingToNodes, hasAnn := statefulSet.Annotations[util.ClusterOpsMetadataAnnotation]; hasAnn {
+               if scalingToNodesInt, convErr := strconv.Atoi(scalingToNodes); convErr != nil {
+                       logger.Error(convErr, "Could not convert statefulSet annotation to int for scale-down-to information", "annotation", util.ClusterOpsMetadataAnnotation, "value", scalingToNodes)
+                       err = convErr
+               } else {
+                       replicaManagementComplete := false
+                       if scalingToNodesInt < int(*statefulSet.Spec.Replicas) {
+                               // Manage scaling down the SolrCloud
+                               replicaManagementComplete, err = handleManagedCloudScaleDown(ctx, r, instance, statefulSet, scalingToNodesInt, podList, logger)
+                               // } else if scalingToNodesInt > int(*statefulSet.Spec.Replicas) {
+                               // TODO: Utilize the scaled-up nodes in the future, however Solr does not currently have APIs for this.
+                               // TODO: Think about the order of scale-up and restart when individual nodeService IPs are injected into the pods.
+                               // TODO: Will likely want to do a scale-up of the service first, then do the rolling restart of the cluster, then utilize the node.
+                       } else {
+                               // This shouldn't happen. The ScalingToNodesAnnotation is removed when the statefulSet size changes, through a Patch.
+                               // But if it does happen, we should just remove the annotation and move forward.
+                               patchedStatefulSet := statefulSet.DeepCopy()
+                               delete(patchedStatefulSet.Annotations, util.ClusterOpsLockAnnotation)
+                               delete(patchedStatefulSet.Annotations, util.ClusterOpsMetadataAnnotation)
+                               if err = r.Patch(ctx, patchedStatefulSet, client.StrategicMergeFrom(statefulSet)); err != nil {
+                                       logger.Error(err, "Error while patching StatefulSet to remove unneeded clusterLockOp annotation for scaling to the current amount of nodes")
+                               } else {
+                                       statefulSet = patchedStatefulSet
+                               }
+                       }
+
+                       // Scale down the statefulSet to represent the new number of utilizedPods, if it is lower than the current number of pods
+                       // Also remove the "scalingToNodes" annotation, as that acts as a lock on the cluster, so that other operations,
+                       // such as scale-up, pod updates and further scale-down cannot happen at the same time.
+                       if replicaManagementComplete {
+                               patchedStatefulSet := statefulSet.DeepCopy()
+                               patchedStatefulSet.Spec.Replicas = pointer.Int32(int32(scalingToNodesInt))
+                               delete(patchedStatefulSet.Annotations, util.ClusterOpsLockAnnotation)
+                               delete(patchedStatefulSet.Annotations, util.ClusterOpsMetadataAnnotation)
+                               if err = r.Patch(ctx, patchedStatefulSet, client.StrategicMergeFrom(statefulSet)); err != nil {
+                                       logger.Error(err, "Error while patching StatefulSet to scale down SolrCloud", "newUtilizedNodes", scalingToNodesInt)
+                               }
+
+                               // TODO: Create event for the CRD.
+                       } else {
+                               // Retry after five seconds to check if the replica management commands have been completed
+                               retryLaterDuration = time.Second * 5
+                       }
+               }
+               // If everything succeeded, the statefulSet will have an annotation updated
+               // and the reconcile loop will be called again.
+
+               return
+       } else {
+               err = errors.New("no clusterOpMetadata annotation is present in the statefulSet")
+               logger.Error(err, "Cannot perform scaling operation when no scale-to-nodes is provided via the clusterOpMetadata")
+               return time.Second * 10, err
+       }
+}
+
+// handleManagedCloudScaleDown does the logic of a managed and "locked" cloud scale down operation.
+// This will likely take many reconcile loops to complete, as it is moving replicas away from the nodes that will be scaled down.
+func handleManagedCloudScaleDown(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, scaleDownTo int, podList []corev1.Pod, logger logr.Logger) (replicaManagementComplete bool, err error) {
+       // Before doing anything to the pod, make sure that users cannot send requests to the pod anymore.
+       podStoppedReadinessConditions := map[corev1.PodConditionType]podReadinessConditionChange{
+               util.SolrIsNotStoppedReadinessCondition: {
+                       reason:  ScaleDown,
+                       message: "Pod is being deleted, traffic to the pod must be stopped",
+                       status:  false,
+               },
+       }
+
+       if scaleDownTo == 0 {
+               // Delete all collections & data, the user wants no data left if scaling the solrcloud down to 0
+               // This is a much different operation from deleting the SolrCloud/StatefulSet altogether
+               replicaManagementComplete, err = evictAllPods(ctx, r, instance, podList, podStoppedReadinessConditions, logger)
+       } else {
+               // Only evict the last pod, even if we are trying to scale down multiple pods.
+               // Scale down will happen one pod at a time.
+               replicaManagementComplete, err = evictSinglePod(ctx, r, instance, scaleDownTo, podList, podStoppedReadinessConditions, logger)
+       }
+       // TODO: It would be great to support a multi-node scale down when Solr supports evicting many SolrNodes at once.
+
+       return
+}
+
+// scaleCloudUnmanaged does simple scaling of a SolrCloud without moving replicas.
+// This is not a "locked" cluster operation, and does not block other cluster operations from taking place.
+func scaleCloudUnmanaged(ctx context.Context, r *SolrCloudReconciler, statefulSet *appsv1.StatefulSet, scaleTo int, logger logr.Logger) (err error) {
+       // Before doing anything to the pod, make sure that users cannot send requests to the pod anymore.
+       patchedStatefulSet := statefulSet.DeepCopy()
+       patchedStatefulSet.Spec.Replicas = pointer.Int32(int32(scaleTo))
+       if err = r.Patch(ctx, patchedStatefulSet, client.StrategicMergeFrom(statefulSet)); err != nil {
+               logger.Error(err, "Error while patching StatefulSet to scale SolrCloud.", "fromNodes", *statefulSet.Spec.Replicas, "toNodes", scaleTo)
+       }
+       return err
+}
+
+func evictAllPods(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, podList []corev1.Pod, readinessConditions map[corev1.PodConditionType]podReadinessConditionChange, logger logr.Logger) (podsAreEmpty bool, err error) {
+       // If there are no pods, we can't empty them. Just return true
+       if len(podList) == 0 {
+               return true, nil
+       }
+
+       for i, pod := range podList {
+               if updatedPod, e := EnsurePodReadinessConditions(ctx, r, &pod, readinessConditions, logger); e != nil {
+                       err = e
+                       return
+               } else {
+                       podList[i] = *updatedPod
+               }
+       }
+
+       // Delete all collections & data, the user wants no data left if scaling the solrcloud down to 0
+       // This is a much different operation from deleting the SolrCloud/StatefulSet altogether
+       // TODO: Implement delete all collections. Currently just leave the data
+       //if err, podsAreEmpty = util.DeleteAllCollectionsIfNecessary(ctx, instance, "scaleDown", logger); err != nil {
+       //      logger.Error(err, "Error while evicting all collections in SolrCloud, when scaling down SolrCloud to 0 pods")
+       //}
+       podsAreEmpty = true
+
+       return
+}
+
+func evictSinglePod(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, scaleDownTo int, podList []corev1.Pod, readinessConditions map[corev1.PodConditionType]podReadinessConditionChange, logger logr.Logger) (podIsEmpty bool, err error) {
+       var pod *corev1.Pod
+       podName := instance.GetSolrPodName(scaleDownTo)
+       for _, p := range podList {
+               if p.Name == podName {
+                       pod = &p
+                       break
+               }
+       }
+
+       // TODO: Get whether the pod has replicas or not
+       podHasReplicas := true
+
+       // The pod doesn't exist, we cannot empty it

Review Comment:
   Question: How can this happen?
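   
   For illustration only (not from this PR), one defensive way `evictSinglePod` could treat a missing pod, for example if it was already deleted out-of-band or the cached pod list is stale, is to report it as already empty and let the next reconcile re-verify:
   ```go
   // Illustrative sketch, not code from this PR.
   if pod == nil {
       // Nothing to evict: the pod is already gone, so there are no replicas to move off of it.
       logger.Info("Pod to scale down does not exist, skipping eviction", "podName", podName)
       return true, nil
   }
   ```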



##########
docs/solr-cloud/autoscaling.md:
##########
@@ -0,0 +1,85 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+ -->
+
+# SolrCloud Scaling
+_Since v0.8.0_
+
+Solr Clouds are complex distributed systems, and thus require additional help when trying to scale up or down.
+
+Scaling/Autoscaling can mean different things in different situations, and this is true even within the `SolrCloud.spec.autoscaling` section.
+- Replicas can be moved when new nodes are added or when nodes need to be taken down
+- Nodes can be added/removed if more or less resources are desired.
+
+The following sections describe all the features that the Solr Operator currently supports to aid in scaling & autoscaling SolrClouds.
+
+## Configuration
+
+The `autoscaling` section in the SolrCloud CRD can be configured in the following ways
+
+```yaml
+spec:
+  autoscaling:
+    vacatePodsOnScaleDown: true # Default: true
+```
+
+## Replica Movement
+
+Solr can be scaled up & down either manually or by a `HorizontalPodAutoscaler`; however, no matter how the `SolrCloud.Spec.Replicas` value
+changes, the Solr Operator must implement this change the same way.
+
+For now, replicas are not scaled up and down themselves; they are just moved to utilize new Solr pods or vacate soon-to-be-deleted Solr pods.
+
+### Solr Pod Scale-Down
+
+When the desired number of Solr Pods that should be run (`SolrCloud.Spec.Replicas`) is decreased,
+the `SolrCloud.spec.autoscaling.vacatePodsOnScaleDown` option determines whether the Solr Operator should move replicas
+off of the pods that are about to be deleted.
+
+When a StatefulSet, which the Solr Operator uses to run Solr pods, has its size decreased by `x` pods, it's the last
+`x` pods that are deleted. So if a StatefulSet `tmp` has size 4, it will have pods `tmp-0`, `tmp-1`, `tmp-2` and `tmp-3`.
+If `tmp` is then scaled down to size 2, then pods `tmp-2` and `tmp-3` will be deleted because they are `tmp`'s last pods numerically.

Review Comment:
   > then pods `tmp-2` and `tmp-3` will be deleted because they are `tmp`'s last pods numerically
   
   Maybe:
   > then pod `tmp-3` will be deleted first, followed by `tmp-2`, because they are `tmp`'s last pods numerically



##########
docs/solr-cloud/autoscaling.md:
##########
@@ -0,0 +1,85 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+ -->
+
+# SolrCloud Scaling
+_Since v0.8.0_
+
+Solr Clouds are complex distributed systems, and thus require additional help when trying to scale up or down.
+
+Scaling/Autoscaling can mean different things in different situations, and this is true even within the `SolrCloud.spec.autoscaling` section.
+- Replicas can be moved when new nodes are added or when nodes need to be taken down
+- Nodes can be added/removed if more or less resources are desired.
+
+The following sections describe all the features that the Solr Operator currently supports to aid in scaling & autoscaling SolrClouds.
+
+## Configuration
+
+The `autoscaling` section in the SolrCloud CRD can be configured in the following ways
+
+```yaml
+spec:
+  autoscaling:
+    vacatePodsOnScaleDown: true # Default: true
+```
+
+## Replica Movement
+
+Solr can be scaled up & down either manually or by a `HorizontalPodAutoscaler`; however, no matter how the `SolrCloud.Spec.Replicas` value
+changes, the Solr Operator must implement this change the same way.
+
+For now, replicas are not scaled up and down themselves; they are just moved to utilize new Solr pods or vacate soon-to-be-deleted Solr pods.
+
+### Solr Pod Scale-Down
+
+When the desired number of Solr Pods that should be run (`SolrCloud.Spec.Replicas`) is decreased,
+the `SolrCloud.spec.autoscaling.vacatePodsOnScaleDown` option determines whether the Solr Operator should move replicas
+off of the pods that are about to be deleted.
+
+When a StatefulSet, which the Solr Operator uses to run Solr pods, has its size decreased by `x` pods, it's the last
+`x` pods that are deleted. So if a StatefulSet `tmp` has size 4, it will have pods `tmp-0`, `tmp-1`, `tmp-2` and `tmp-3`.
+If `tmp` is then scaled down to size 2, then pods `tmp-2` and `tmp-3` will be deleted because they are `tmp`'s last pods numerically.
+
+If Solr has replicas placed on the pods that will be deleted as a part of the scale-down, then it has a problem.
+Solr will expect that these replicas will eventually come back online, because they are a part of the clusterState.
+However, the Solr Operator has no expectation that these replicas will come back, because the cloud has been scaled down.
+Therefore, the safest option is to move the replicas off of these pods before the scale-down operation occurs.

Review Comment:
   I don't think this is very clear. A simpler explanation of what the operator will do would be better, something like:
   ```
   The Solr Operator can update the cluster state to handle the scale-down operation by moving replicas off of the soon-to-be-deleted pods
   ```
   Or something like that. I feel all the explanation of the difference between Solr and the Solr Operator just adds confusion.



##########
controllers/solr_cluster_ops_util.go:
##########
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package controllers
+
+import (
+       "context"
+       "errors"
+       solrv1beta1 "github.com/apache/solr-operator/api/v1beta1"
+       "github.com/apache/solr-operator/controllers/util"
+       "github.com/go-logr/logr"
+       appsv1 "k8s.io/api/apps/v1"
+       corev1 "k8s.io/api/core/v1"
+       "k8s.io/utils/pointer"
+       "sigs.k8s.io/controller-runtime/pkg/client"
+       "strconv"
+       "time"
+)
+
+func determineScaleClusterOpLockIfNecessary(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, podList []corev1.Pod, logger logr.Logger) (clusterOpLock string, clusterOpMetadata string, retryLaterDuration time.Duration, err error) {
+       desiredPods := int(*instance.Spec.Replicas)
+       configuredPods := int(*statefulSet.Spec.Replicas)
+       if desiredPods != configuredPods {
+               scaleTo := -1
+               // Start a scaling operation
+               if desiredPods < configuredPods {
+                       // Scale down!
+                       // The option is enabled by default, so treat "nil" like "true"
+                       if instance.Spec.Autoscaling.VacatePodsOnScaleDown == nil || *instance.Spec.Autoscaling.VacatePodsOnScaleDown {
+                               if desiredPods > 0 {
+                                       // We only support scaling down one pod at a time if not scaling down to 0 pods
+                                       scaleTo = configuredPods - 1
+                               } else {
+                                       scaleTo = 0
+                               }
+                       } else {
+                               // The cloud is not set up to use managed scale-down
+                               err = scaleCloudUnmanaged(ctx, r, statefulSet, desiredPods, logger)
+                       }
+               } else if desiredPods > configuredPods {
+                       // Scale up!
+                       // TODO: replicasScaleUp is not supported, so do not make a clusterOp out of it, just do the patch
+                       err = scaleCloudUnmanaged(ctx, r, statefulSet, desiredPods, logger)
+               }
+               if scaleTo > -1 {
+                       clusterOpLock = util.ScaleLock
+                       clusterOpMetadata = strconv.Itoa(scaleTo)
+               }
+       }
+       return
+}
+
+func handleLockedClusterOpScale(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, podList []corev1.Pod, logger logr.Logger) (retryLaterDuration time.Duration, err error) {
+       if scalingToNodes, hasAnn := statefulSet.Annotations[util.ClusterOpsMetadataAnnotation]; hasAnn {
+               if scalingToNodesInt, convErr := strconv.Atoi(scalingToNodes); convErr != nil {
+                       logger.Error(convErr, "Could not convert statefulSet annotation to int for scale-down-to information", "annotation", util.ClusterOpsMetadataAnnotation, "value", scalingToNodes)
+                       err = convErr
+               } else {
+                       replicaManagementComplete := false
+                       if scalingToNodesInt < int(*statefulSet.Spec.Replicas) {
+                               // Manage scaling down the SolrCloud
+                               replicaManagementComplete, err = handleManagedCloudScaleDown(ctx, r, instance, statefulSet, scalingToNodesInt, podList, logger)
+                               // } else if scalingToNodesInt > int(*statefulSet.Spec.Replicas) {
+                               // TODO: Utilize the scaled-up nodes in the future, however Solr does not currently have APIs for this.
+                               // TODO: Think about the order of scale-up and restart when individual nodeService IPs are injected into the pods.
+                               // TODO: Will likely want to do a scale-up of the service first, then do the rolling restart of the cluster, then utilize the node.
+                       } else {
+                               // This shouldn't happen. The ScalingToNodesAnnotation is removed when the statefulSet size changes, through a Patch.
+                               // But if it does happen, we should just remove the annotation and move forward.
+                               patchedStatefulSet := statefulSet.DeepCopy()
+                               delete(patchedStatefulSet.Annotations, util.ClusterOpsLockAnnotation)
+                               delete(patchedStatefulSet.Annotations, util.ClusterOpsMetadataAnnotation)
+                               if err = r.Patch(ctx, patchedStatefulSet, client.StrategicMergeFrom(statefulSet)); err != nil {
+                                       logger.Error(err, "Error while patching StatefulSet to remove unneeded clusterLockOp annotation for scaling to the current amount of nodes")
+                               } else {
+                                       statefulSet = patchedStatefulSet
+                               }
+                       }
+
+                       // Scale down the statefulSet to represent the new number of utilizedPods, if it is lower than the current number of pods
+                       // Also remove the "scalingToNodes" annotation, as that acts as a lock on the cluster, so that other operations,
+                       // such as scale-up, pod updates and further scale-down cannot happen at the same time.

Review Comment:
   Isn't there a `lock` annotation being used (as I see below)?



##########
docs/solr-cloud/autoscaling.md:
##########
@@ -0,0 +1,85 @@
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+ -->
+
+# SolrCloud Scaling
+_Since v0.8.0_
+
+Solr Clouds are complex distributed systems, and thus require additional help when trying to scale up or down.
+
+Scaling/Autoscaling can mean different things in different situations, and this is true even within the `SolrCloud.spec.autoscaling` section.
+- Replicas can be moved when new nodes are added or when nodes need to be taken down
+- Nodes can be added/removed if more or less resources are desired.
+
+The following sections describe all the features that the Solr Operator currently supports to aid in scaling & autoscaling SolrClouds.
+
+## Configuration
+
+The `autoscaling` section in the SolrCloud CRD can be configured in the following ways
+
+```yaml
+spec:
+  autoscaling:
+    vacatePodsOnScaleDown: true # Default: true
+```
+
+## Replica Movement
+
+Solr can be scaled up & down either manually or by a `HorizontalPodAutoscaler`; however, no matter how the `SolrCloud.Spec.Replicas` value
+changes, the Solr Operator must implement this change the same way.
+
+For now, replicas are not scaled up and down themselves; they are just moved to utilize new Solr pods or vacate soon-to-be-deleted Solr pods.
+
+### Solr Pod Scale-Down
+
+When the desired number of Solr Pods that should be run (`SolrCloud.Spec.Replicas`) is decreased,
+the `SolrCloud.spec.autoscaling.vacatePodsOnScaleDown` option determines whether the Solr Operator should move replicas
+off of the pods that are about to be deleted.
+
+When a StatefulSet, which the Solr Operator uses to run Solr pods, has its size decreased by `x` pods, it's the last
+`x` pods that are deleted. So if a StatefulSet `tmp` has size 4, it will have pods `tmp-0`, `tmp-1`, `tmp-2` and `tmp-3`.
+If `tmp` is then scaled down to size 2, then pods `tmp-2` and `tmp-3` will be deleted because they are `tmp`'s last pods numerically.
+
+If Solr has replicas placed on the pods that will be deleted as a part of the scale-down, then it has a problem.
+Solr will expect that these replicas will eventually come back online, because they are a part of the clusterState.
+However, the Solr Operator has no expectation that these replicas will come back, because the cloud has been scaled down.
+Therefore, the safest option is to move the replicas off of these pods before the scale-down operation occurs.
+
+If the `autoscaling.vacatePodsOnScaleDown` option is not enabled, then whenever the `SolrCloud.Spec.Replicas` is decreased,
+that change will be reflected in the StatefulSet immediately.
+Pods will be deleted even if replicas live on those pods.
+
+If the `autoscaling.vacatePodsOnScaleDown` option is enabled, which it is by default, then the following steps occur:
+1. Acquire a cluster-ops lock on the SolrCloud. (This means other cluster operations, such as a rolling restart, cannot occur during the scale down operation)
+1. Scale down the last pod.
+   1. Mark the pod as "notReady" so that traffic is diverted away from this pod.

Review Comment:
   Question: Is this only an ingress change? Traffic that goes directly to the pod should continue to succeed, right? Otherwise, there will be update errors and replicas going DOWN unnecessarily.
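   
   For context, a hedged sketch of the Kubernetes mechanics involved (the names and condition type below are illustrative assumptions, not taken from this PR): a readiness-gate condition set to `False` makes the pod not-ready, so Services drop it from their ready endpoints, while connections made directly to the pod IP are not blocked; per-pod DNS behavior depends on whether the headless Service publishes not-ready addresses.
   ```yaml
   # Illustrative only: names and the condition type are assumptions, not from this PR.
   # Roughly what `kubectl get pod -o yaml` might show for a pod being scaled down.
   apiVersion: v1
   kind: Pod
   metadata:
     name: example-solrcloud-2                           # hypothetical pod being scaled down
   spec:
     readinessGates:
       - conditionType: "solr.apache.org/isNotStopped"   # assumed custom condition type
     containers:
       - name: solrcloud-node
         image: solr:9.2
   status:
     conditions:
       - type: "solr.apache.org/isNotStopped"
         status: "False"                                 # operator signals that traffic should stop
       - type: Ready
         status: "False"                                 # pod is removed from Service endpoints; direct pod-IP traffic still works
   ```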



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@solr.apache.org
For additional commands, e-mail: issues-h...@solr.apache.org

