From f355ee5b92b7cf105ba35a318335cf4a3f1daf07 Mon Sep 17 00:00:00 2001
From: pasanw
Date: Tue, 27 Jan 2026 12:01:14 -0800
Subject: [PATCH] Update nodeSet naming (#4378)

The upgrade of k8s.io/apimachinery to v0.34.3 changed how k8s objects
serialize, causing the same PVC template to produce different hashes
across tigera-operator versions. Since NodeSet names were derived from
PVC template hashes, this triggered unnecessary StatefulSet recreation
during Calico Enterprise upgrades, making upgrades slow (dynamic
provisioning) or stalling them outright (static provisioning with no
available PVs).

This change replaces hash-based NodeSet naming with logic that retains
the existing name when the PVC template matches the LogStorage
configuration, and uses a random string when a new name is needed.
---
 .../elastic/elastic_controller_test.go | 156 +++++++++++++++++-
 pkg/render/logstorage.go               |  83 +++++++---
 pkg/render/logstorage_test.go          |  18 +-
 3 files changed, 232 insertions(+), 25 deletions(-)

diff --git a/pkg/controller/logstorage/elastic/elastic_controller_test.go b/pkg/controller/logstorage/elastic/elastic_controller_test.go
index e8b951209e..7ed0e0ade9 100644
--- a/pkg/controller/logstorage/elastic/elastic_controller_test.go
+++ b/pkg/controller/logstorage/elastic/elastic_controller_test.go
@@ -20,6 +20,8 @@ import (
 	. "github.com/onsi/ginkgo"
 	. "github.com/onsi/gomega"
 
+	"github.com/tigera/operator/pkg/ptr"
+
 	"k8s.io/apimachinery/pkg/api/resource"
 
 	"github.com/stretchr/testify/mock"
@@ -609,6 +611,97 @@ var _ = Describe("LogStorage controller", func() {
 			test.VerifyCert(kbSecret, kbDNSNames...)
 		})
 
+		It("should set the nodeset name as expected", func() {
+			// Setup
+			Expect(cli.Create(ctx, &storagev1.StorageClass{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: storageClassName,
+				},
+			})).ShouldNot(HaveOccurred())
+			logStorage := &operatorv1.LogStorage{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "tigera-secure",
+				},
+				Spec: operatorv1.LogStorageSpec{
+					Nodes: &operatorv1.Nodes{
+						Count: int64(1),
+					},
+					StorageClassName: storageClassName,
+				},
+				Status: operatorv1.LogStorageStatus{
+					State: operatorv1.TigeraStatusReady,
+				},
+			}
+			CreateLogStorage(cli, logStorage)
+			Expect(cli.Create(ctx, &corev1.ConfigMap{
+				ObjectMeta: metav1.ObjectMeta{Namespace: eck.OperatorNamespace, Name: eck.LicenseConfigMapName},
+				Data:       map[string]string{"eck_license_level": string(render.ElasticsearchLicenseTypeEnterprise)},
+			})).ShouldNot(HaveOccurred())
+			r, err := NewReconcilerWithShims(cli, scheme, mockStatus, operatorv1.ProviderNone, MockESCLICreator, dns.DefaultClusterDomain, readyFlag)
+			Expect(err).ShouldNot(HaveOccurred())
+			mockStatus.On("SetDegraded", operatorv1.ResourceNotReady, "Waiting for Elasticsearch cluster to be operational", mock.Anything, mock.Anything).Return()
+
+			// Perform the initial reconcile, and capture the generated name.
+			_, err = r.Reconcile(ctx, reconcile.Request{})
+			Expect(err).ShouldNot(HaveOccurred())
+			es := &esv1.Elasticsearch{}
+			Expect(cli.Get(ctx, esObjKey, es)).ShouldNot(HaveOccurred())
+			Expect(es.Spec.NodeSets).To(HaveLen(1))
+			nodeSetName := es.Spec.NodeSets[0].Name
+
+			// Check that the nodeset name stays the same after a reconcile with no change.
+			_, err = r.Reconcile(ctx, reconcile.Request{})
+			Expect(err).ShouldNot(HaveOccurred())
+			Expect(cli.Get(ctx, esObjKey, es)).ShouldNot(HaveOccurred())
+			Expect(es.Spec.NodeSets).To(HaveLen(1))
+			Expect(es.Spec.NodeSets[0].Name).To(Equal(nodeSetName))
+
+			// Modify the storage requests - this should trigger a change in the nodeset name
+			Expect(cli.Get(ctx, types.NamespacedName{Name: "tigera-secure"}, logStorage)).ShouldNot(HaveOccurred())
+			logStorage.Spec.Nodes.ResourceRequirements = &corev1.ResourceRequirements{
+				Requests: corev1.ResourceList{
+					corev1.ResourceStorage: resource.MustParse("100Gi"),
+				},
+			}
+			Expect(cli.Update(ctx, logStorage)).ShouldNot(HaveOccurred())
+			_, err = r.Reconcile(ctx, reconcile.Request{})
+			Expect(err).ShouldNot(HaveOccurred())
+			Expect(cli.Get(ctx, esObjKey, es)).ShouldNot(HaveOccurred())
+			Expect(es.Spec.NodeSets[0].Name).ToNot(Equal(nodeSetName))
+
+			// Check that the nodeset name stays the same after a reconcile with no change.
+			nodeSetName = es.Spec.NodeSets[0].Name
+			_, err = r.Reconcile(ctx, reconcile.Request{})
+			Expect(err).ShouldNot(HaveOccurred())
+			Expect(cli.Get(ctx, esObjKey, es)).ShouldNot(HaveOccurred())
+			Expect(es.Spec.NodeSets).To(HaveLen(1))
+			Expect(es.Spec.NodeSets[0].Name).To(Equal(nodeSetName))
+
+			// Modify the storage class name - this should trigger a change in the nodeset name
+			Expect(cli.Create(ctx, &storagev1.StorageClass{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "new-storage-class",
+				},
+			})).ShouldNot(HaveOccurred())
+			Expect(cli.Get(ctx, types.NamespacedName{Name: "tigera-secure"}, logStorage)).ShouldNot(HaveOccurred())
+			logStorage.Spec.StorageClassName = "new-storage-class"
+			Expect(cli.Update(ctx, logStorage)).ShouldNot(HaveOccurred())
+			_, err = r.Reconcile(ctx, reconcile.Request{})
+			Expect(err).ShouldNot(HaveOccurred())
+			Expect(cli.Get(ctx, esObjKey, es)).ShouldNot(HaveOccurred())
+			Expect(es.Spec.NodeSets[0].Name).ToNot(Equal(nodeSetName))
+
+			// Check that the nodeset name stays the same after a reconcile with no change.
+			nodeSetName = es.Spec.NodeSets[0].Name
+			_, err = r.Reconcile(ctx, reconcile.Request{})
+			Expect(err).ShouldNot(HaveOccurred())
+			Expect(cli.Get(ctx, esObjKey, es)).ShouldNot(HaveOccurred())
+			Expect(es.Spec.NodeSets).To(HaveLen(1))
+			Expect(es.Spec.NodeSets[0].Name).To(Equal(nodeSetName))
+
+			mockStatus.AssertExpectations(GinkgoT())
+		})
+
 		It("should not add OwnerReference to user supplied kibana TLS cert", func() {
 			mockStatus.On("ClearDegraded", mock.Anything)
 
@@ -681,6 +774,20 @@ var _ = Describe("LogStorage controller", func() {
 					Name:      render.ElasticsearchName,
 					Namespace: render.ElasticsearchNamespace,
 				},
+				Spec: esv1.ElasticsearchSpec{
+					NodeSets: []esv1.NodeSet{
+						{
+							Name: "default",
+							VolumeClaimTemplates: []corev1.PersistentVolumeClaim{
+								{
+									Spec: corev1.PersistentVolumeClaimSpec{
+										StorageClassName: ptr.ToPtr("tigera-elasticsearch"),
+									},
+								},
+							},
+						},
+					},
+				},
 				Status: esv1.ElasticsearchStatus{
 					Phase: esv1.ElasticsearchReadyPhase,
 				},
@@ -802,6 +909,20 @@ var _ = Describe("LogStorage controller", func() {
 					Name:      render.ElasticsearchName,
 					Namespace: render.ElasticsearchNamespace,
 				},
+				Spec: esv1.ElasticsearchSpec{
+					NodeSets: []esv1.NodeSet{
+						{
+							Name: "default",
+							VolumeClaimTemplates: []corev1.PersistentVolumeClaim{
+								{
+									Spec: corev1.PersistentVolumeClaimSpec{
+										StorageClassName: ptr.ToPtr("tigera-elasticsearch"),
+									},
+								},
+							},
+						},
+					},
+				},
 				Status: esv1.ElasticsearchStatus{
 					Phase: esv1.ElasticsearchReadyPhase,
 				},
@@ -854,6 +975,20 @@ var _ = Describe("LogStorage controller", func() {
 					Name:      render.ElasticsearchName,
 					Namespace: render.ElasticsearchNamespace,
 				},
+				Spec: esv1.ElasticsearchSpec{
+					NodeSets: []esv1.NodeSet{
+						{
+							Name: "default",
+							VolumeClaimTemplates: []corev1.PersistentVolumeClaim{
+								{
+									Spec: corev1.PersistentVolumeClaimSpec{
+										StorageClassName: ptr.ToPtr("tigera-elasticsearch"),
+									},
+								},
+							},
+						},
+					},
+				},
 			}
 			Expect(test.GetResource(cli, &escfg)).To(BeNil())
 			Expect(escfg.Spec.NodeSets).To(HaveLen(1))
@@ -1219,7 +1354,26 @@ func setUpLogStorageComponents(cli client.Client, ctx context.Context, storageCl
 				KubernetesProvider: operatorv1.ProviderNone,
 				Registry:           "testregistry.com/",
 			},
-			Elasticsearch: &esv1.Elasticsearch{ObjectMeta: metav1.ObjectMeta{Name: render.ElasticsearchName, Namespace: render.ElasticsearchNamespace}},
+			Elasticsearch: &esv1.Elasticsearch{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      render.ElasticsearchName,
+					Namespace: render.ElasticsearchNamespace,
+				},
+				Spec: esv1.ElasticsearchSpec{
+					NodeSets: []esv1.NodeSet{
+						{
+							Name: "default",
+							VolumeClaimTemplates: []corev1.PersistentVolumeClaim{
+								{
+									Spec: corev1.PersistentVolumeClaimSpec{
+										StorageClassName: ptr.ToPtr("tigera-elasticsearch"),
+									},
+								},
+							},
+						},
+					},
+				},
+			},
 			ClusterConfig:        relasticsearch.NewClusterConfig("cluster", 1, 1, 1),
 			ElasticsearchKeyPair: esKeyPair,
 			TrustedBundle:        trustedBundle,
diff --git a/pkg/render/logstorage.go b/pkg/render/logstorage.go
index 37dc18d0af..72e14908a0 100644
--- a/pkg/render/logstorage.go
+++ b/pkg/render/logstorage.go
@@ -15,15 +15,14 @@
 package render
 
 import (
-	"encoding/hex"
-	"encoding/json"
 	"fmt"
-	"hash/fnv"
+	"reflect"
 	"strings"
 
 	cmnv1 "github.com/elastic/cloud-on-k8s/v2/pkg/apis/common/v1"
 	esv1 "github.com/elastic/cloud-on-k8s/v2/pkg/apis/elasticsearch/v1"
 	"gopkg.in/inf.v0"
+	"k8s.io/apimachinery/pkg/util/rand"
 
 	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
@@ -244,7 +243,7 @@ func (es *elasticsearchComponent) Objects() ([]client.Object, []client.Object) {
 	toCreate = append(toCreate, es.elasticsearchServiceAccount())
 	toCreate = append(toCreate, es.cfg.ClusterConfig.ConfigMap())
-	toCreate = append(toCreate, es.elasticsearchCluster())
+	toCreate = append(toCreate, es.elasticsearchCluster(es.cfg.Elasticsearch))
 
 	if es.cfg.Installation.KubernetesProvider.IsOpenShift() {
 		toCreate = append(toCreate, es.elasticsearchClusterRole(), es.elasticsearchClusterRoleBinding())
@@ -546,7 +545,7 @@ func (es *elasticsearchComponent) podTemplate() corev1.PodTemplateSpec {
 }
 
 // render the Elasticsearch CR that the ECK operator uses to create elasticsearch cluster
-func (es *elasticsearchComponent) elasticsearchCluster() *esv1.Elasticsearch {
+func (es *elasticsearchComponent) elasticsearchCluster(current *esv1.Elasticsearch) *esv1.Elasticsearch {
 	elasticsearch := &esv1.Elasticsearch{
 		TypeMeta: metav1.TypeMeta{Kind: "Elasticsearch", APIVersion: "elasticsearch.k8s.elastic.co/v1"},
 		ObjectMeta: metav1.ObjectMeta{
@@ -563,7 +562,7 @@ func (es *elasticsearchComponent) elasticsearchCluster() *esv1.Elasticsearch {
 				},
 			},
 		},
-			NodeSets: es.nodeSets(),
+			NodeSets: es.nodeSets(current),
 		},
 	}
@@ -639,7 +638,7 @@ func memoryQuantityToJVMHeapSize(q *resource.Quantity) string {
 // nodeSets calculates the number of NodeSets needed for the Elasticsearch cluster. Multiple NodeSets are returned only
 // if the "nodeSets" field has been set in the LogStorage CR. The number of Nodes for the cluster will be distributed as
 // evenly as possible between the NodeSets.
-func (es *elasticsearchComponent) nodeSets() []esv1.NodeSet {
+func (es *elasticsearchComponent) nodeSets(currentES *esv1.Elasticsearch) []esv1.NodeSet {
 	nodeConfig := es.cfg.LogStorage.Spec.Nodes
 	pvcTemplate := es.pvcTemplate()
 
@@ -651,10 +650,12 @@
 		return nil
 	}
 
+	name := nodeSetName(pvcTemplate, currentES)
+
 	var nodeSets []esv1.NodeSet
 	if len(nodeConfig.NodeSets) < 1 {
 		nodeSet := es.nodeSetTemplate(pvcTemplate)
-		nodeSet.Name = nodeSetName(pvcTemplate)
+		nodeSet.Name = name
 		nodeSet.Count = int32(nodeConfig.Count)
 		nodeSet.PodTemplate = es.podTemplate()
 
@@ -678,7 +679,7 @@
 			nodeSet := es.nodeSetTemplate(pvcTemplate)
 			// Each NodeSet needs a unique name, so just add the index as a suffix
-			nodeSet.Name = fmt.Sprintf("%s-%d", nodeSetName(pvcTemplate), i)
+			nodeSet.Name = fmt.Sprintf("%s-%d", name, i)
 			nodeSet.Count = int32(numNodes)
 
 			podTemplate := es.podTemplate()
@@ -769,24 +770,60 @@
 	}
 }
 
-// nodeSetName returns thumbprint of PersistentVolumeClaim object as string.
-// As storage requirements of NodeSets are immutable,
-// renaming a NodeSet automatically creates a new StatefulSet with new PersistentVolumeClaim.
-// https://www.elastic.co/guide/en/cloud-on-k8s/current/k8s-orchestration.html#k8s-orchestration-limitations
-func nodeSetName(pvcTemplate corev1.PersistentVolumeClaim) string {
-	pvcTemplateHash := fnv.New64a()
-	templateBytes, err := json.Marshal(pvcTemplate)
-	if err != nil {
-		log.V(5).Info("Failed to create unique name for ElasticSearch NodeSet.", "err", err)
-		return "es"
+// nodeSetName resolves the name to configure on the NodeSet: it retains the existing name when the current NodeSet
+// already satisfies the storage requirements in the LogStorage CR, and otherwise generates a random name to force a
+// rename. The rename causes the ECK operator to create a new StatefulSet, which is required
+// because VolumeClaimTemplates on a StatefulSet are immutable.
+// https://www.elastic.co/guide/en/cloud-on-k8s/current/k8s-orchestration.html
+func nodeSetName(pvcTemplate corev1.PersistentVolumeClaim, currentES *esv1.Elasticsearch) string {
+	// We expect the following preconditions to be met, but we check them explicitly so that a violation clearly
+	// indicates developer error.
+	if currentES != nil && len(currentES.Spec.NodeSets) == 0 {
+		panic("Elasticsearch CR has no NodeSets")
+	}
+	if currentES != nil && len(currentES.Spec.NodeSets[0].VolumeClaimTemplates) != 1 {
+		panic(fmt.Sprintf("Elasticsearch CR has %d VolumeClaimTemplates in its NodeSets, expected 1", len(currentES.Spec.NodeSets[0].VolumeClaimTemplates)))
+	}
+	if currentES != nil && currentES.Spec.NodeSets[0].VolumeClaimTemplates[0].Spec.StorageClassName == nil {
+		panic("Elasticsearch CR's VolumeClaimTemplate has no StorageClassName")
+	}
+	if pvcTemplate.Spec.StorageClassName == nil {
+		panic("Rendered VolumeClaimTemplate has no StorageClassName")
+	}
+
+	// Determine whether we need to generate a new name.
+	var nameMustBeGenerated bool
+	var existingName *string
+	if currentES == nil {
+		nameMustBeGenerated = true
+	} else {
+		// Check if the existing PVC template matches our LogStorage CR. If it does not, we need to force a rename.
+		// We assume that the VolumeClaimTemplate is the same across all NodeSets (this is how we configure it).
+		currentPVCTemplate := currentES.Spec.NodeSets[0].VolumeClaimTemplates[0]
+		storageClassNamesMatch := *pvcTemplate.Spec.StorageClassName == *currentPVCTemplate.Spec.StorageClassName
+		resourceRequirementsMatch := reflect.DeepEqual(currentPVCTemplate.Spec.Resources, pvcTemplate.Spec.Resources)
+		if !storageClassNamesMatch || !resourceRequirementsMatch {
+			nameMustBeGenerated = true
+		}
+
+		// Capture the existing name. If there are multiple NodeSets, there will be a suffix, so we strip it.
+		splitExistingName := strings.Split(currentES.Spec.NodeSets[0].Name, "-")[0]
+		existingName = &splitExistingName
 	}
 
-	if _, err := pvcTemplateHash.Write(templateBytes); err != nil {
-		log.V(5).Info("Failed to create unique name for ElasticSearch NodeSet.", "err", err)
-		return "es"
+	// Generate a new name if necessary.
+	var name string
+	if nameMustBeGenerated {
+		for {
+			name = rand.String(12)
+			if existingName == nil || name != *existingName {
+				break
+			}
+		}
+	} else {
+		name = *existingName
 	}
 
-	return hex.EncodeToString(pvcTemplateHash.Sum(nil))
+	return name
 }
 
 // This is a list of components that belong to Curator which has been decommissioned since it is no longer supported
diff --git a/pkg/render/logstorage_test.go b/pkg/render/logstorage_test.go
index 4ace5b70c3..a19f819b14 100644
--- a/pkg/render/logstorage_test.go
+++ b/pkg/render/logstorage_test.go
@@ -22,6 +22,7 @@ import (
 	. "github.com/onsi/ginkgo"
 	. "github.com/onsi/ginkgo/extensions/table"
 	. "github.com/onsi/gomega"
+	"github.com/tigera/operator/pkg/ptr"
 
 	esv1 "github.com/elastic/cloud-on-k8s/v2/pkg/apis/elasticsearch/v1"
 
@@ -503,7 +504,22 @@ var _ = Describe("Elasticsearch rendering tests", func() {
 					},
 				},
 			}
-			cfg.Elasticsearch = &esv1.Elasticsearch{}
+			cfg.Elasticsearch = &esv1.Elasticsearch{
+				Spec: esv1.ElasticsearchSpec{
+					NodeSets: []esv1.NodeSet{
+						{
+							Name: "default",
+							VolumeClaimTemplates: []corev1.PersistentVolumeClaim{
+								{
+									Spec: corev1.PersistentVolumeClaimSpec{
+										StorageClassName: ptr.ToPtr("tigera-elasticsearch"),
+									},
+								},
+							},
+						},
+					},
+				},
+			}
 			component := render.LogStorage(cfg)