From 728897aba629b23fd5ed2f110f63931a170dad17 Mon Sep 17 00:00:00 2001 From: Utku Ozdemir Date: Fri, 3 Jan 2025 12:59:45 +0100 Subject: [PATCH] fix: wait for infra machine info to be collected before powering off There were cases when an infra machine was accepted, it was powered off by the infra provider too quickly, before its specs were populated by the `MachineStatus` poller. This caused additional issues as some other resources like `SchematicConfiguration` were also never created, blocking cluster creation. Address this by setting the preferred power state of the infra machine to ON until its status is populated (we check this by checking the secure boot status field). Additionally, clean up `InfraMachineConfig` resources (user-managed) when a machine is deleted. Signed-off-by: Utku Ozdemir --- .../omni/controllers/omni/infra_machine.go | 24 ++++++++++++++++--- .../controllers/omni/infra_machine_test.go | 19 ++++++++++++++- .../omni/controllers/omni/machine_cleanup.go | 7 ++++-- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/internal/backend/runtime/omni/controllers/omni/infra_machine.go b/internal/backend/runtime/omni/controllers/omni/infra_machine.go index 6fd650e1..007e6750 100644 --- a/internal/backend/runtime/omni/controllers/omni/infra_machine.go +++ b/internal/backend/runtime/omni/controllers/omni/infra_machine.go @@ -64,6 +64,9 @@ func NewInfraMachineController() *InfraMachineController { qtransform.WithExtraMappedInput( qtransform.MapperSameID[*omni.ClusterMachine, *siderolink.Link](), ), + qtransform.WithExtraMappedInput( + qtransform.MapperSameID[*omni.MachineStatus, *siderolink.Link](), + ), qtransform.WithExtraMappedInput( func(ctx context.Context, _ *zap.Logger, runtime controller.QRuntime, res *infra.ProviderStatus) ([]resource.Pointer, error) { linkList, err := safe.ReaderListAll[*siderolink.Link](ctx, runtime, state.WithLabelQuery(resource.LabelEqual(omni.LabelInfraProviderID, res.Metadata().ID()))) @@ -94,6 +97,11 @@ func (h *infraMachineControllerHelper) transformExtraOutput(ctx context.Context, return err } + machineStatus, err := helpers.HandleInput[*omni.MachineStatus](ctx, r, InfraMachineControllerName, link) + if err != nil { + return err + } + providerID, ok := link.Metadata().Annotations().Get(omni.LabelInfraProviderID) if !ok { return xerrors.NewTaggedf[qtransform.SkipReconcileTag]("the link is not created by an infra provider") @@ -108,7 +116,9 @@ func (h *infraMachineControllerHelper) transformExtraOutput(ctx context.Context, return xerrors.NewTaggedf[qtransform.SkipReconcileTag]("the link is not created by a static infra provider") } - if err = h.applyInfraMachineConfig(infraMachine, config); err != nil { + machineInfoCollected := machineStatus != nil && machineStatus.TypedSpec().Value.SecureBootStatus != nil + + if err = h.applyInfraMachineConfig(infraMachine, config, machineInfoCollected); err != nil { return err } @@ -168,13 +178,17 @@ func (h *infraMachineControllerHelper) finalizerRemovalExtraOutput(ctx context.C return err } - _, err := helpers.HandleInput[*omni.ClusterMachine](ctx, r, InfraMachineControllerName, link) + if _, err := helpers.HandleInput[*omni.ClusterMachine](ctx, r, InfraMachineControllerName, link); err != nil { + return err + } + + _, err := helpers.HandleInput[*omni.MachineStatus](ctx, r, InfraMachineControllerName, link) return err } // applyInfraMachineConfig applies the user-managed configuration from the omni.InfraMachineConfig resource into the infra.Machine. -func (h *infraMachineControllerHelper) applyInfraMachineConfig(infraMachine *infra.Machine, config *omni.InfraMachineConfig) error { +func (h *infraMachineControllerHelper) applyInfraMachineConfig(infraMachine *infra.Machine, config *omni.InfraMachineConfig, machineInfoCollected bool) error { const defaultPreferredPowerState = specs.InfraMachineSpec_POWER_STATE_OFF // todo: introduce a resource to configure this globally or per-provider level // reset the user-override fields except the "Accepted" field @@ -209,6 +223,10 @@ func (h *infraMachineControllerHelper) applyInfraMachineConfig(infraMachine *inf infraMachine.Metadata().Labels().Delete(omni.LabelMachinePendingAccept) } + if !machineInfoCollected { // we need the machine to stay powered on even if it is accepted, until Omni collects the machine information + infraMachine.TypedSpec().Value.PreferredPowerState = specs.InfraMachineSpec_POWER_STATE_ON + } + return nil } diff --git a/internal/backend/runtime/omni/controllers/omni/infra_machine_test.go b/internal/backend/runtime/omni/controllers/omni/infra_machine_test.go index 3a920e10..93da6c70 100644 --- a/internal/backend/runtime/omni/controllers/omni/infra_machine_test.go +++ b/internal/backend/runtime/omni/controllers/omni/infra_machine_test.go @@ -47,13 +47,26 @@ func (suite *InfraMachineControllerSuite) TestReconcile() { assertion.True(ok) assertion.Equal("bare-metal", infraProviderID) - assertion.Equal(specs.InfraMachineSpec_POWER_STATE_OFF, r.TypedSpec().Value.PreferredPowerState) + assertion.Equal(specs.InfraMachineSpec_POWER_STATE_ON, r.TypedSpec().Value.PreferredPowerState) // MachineStatus is not populated yet assertion.Equal(specs.InfraMachineConfigSpec_PENDING, r.TypedSpec().Value.AcceptanceStatus) assertion.Empty(r.TypedSpec().Value.ClusterTalosVersion) assertion.Empty(r.TypedSpec().Value.Extensions) assertion.Empty(r.TypedSpec().Value.WipeId) }) + machineStatus := omni.NewMachineStatus(resources.DefaultNamespace, "machine-1") + machineStatus.TypedSpec().Value.SecureBootStatus = &specs.SecureBootStatus{} + + suite.Require().NoError(suite.state.Create(suite.ctx, machineStatus)) + + assertResource[*omni.MachineStatus](&suite.OmniSuite, machineStatus.Metadata(), func(r *omni.MachineStatus, assertion *assert.Assertions) { + assertion.True(r.Metadata().Finalizers().Has(omnictrl.InfraMachineControllerName)) + }) + + assertResource[*infra.Machine](&suite.OmniSuite, infraMachineMD, func(r *infra.Machine, assertion *assert.Assertions) { + assertion.Equal(specs.InfraMachineSpec_POWER_STATE_OFF, r.TypedSpec().Value.PreferredPowerState) // expect the default state of "OFF" + }) + // accept the machine, set its preferred power state to on config := omni.NewInfraMachineConfig(resources.DefaultNamespace, "machine-1") config.TypedSpec().Value.AcceptanceStatus = specs.InfraMachineConfigSpec_ACCEPTED @@ -148,6 +161,10 @@ func (suite *InfraMachineControllerSuite) TestReconcile() { assertion.False(r.Metadata().Finalizers().Has(omnictrl.InfraMachineControllerName)) }) + assertResource[*omni.MachineStatus](&suite.OmniSuite, infraMachineMD, func(r *omni.MachineStatus, assertion *assert.Assertions) { + assertion.False(r.Metadata().Finalizers().Has(omnictrl.InfraMachineControllerName)) + }) + // assert that infra.Machine is removed assertNoResource[*infra.Machine](&suite.OmniSuite, infraMachine) } diff --git a/internal/backend/runtime/omni/controllers/omni/machine_cleanup.go b/internal/backend/runtime/omni/controllers/omni/machine_cleanup.go index f68c255c..d346232c 100644 --- a/internal/backend/runtime/omni/controllers/omni/machine_cleanup.go +++ b/internal/backend/runtime/omni/controllers/omni/machine_cleanup.go @@ -21,8 +21,11 @@ type MachineCleanupController = cleanup.Controller[*omni.Machine] func NewMachineCleanupController() *MachineCleanupController { return cleanup.NewController( cleanup.Settings[*omni.Machine]{ - Name: "MachineCleanupController", - Handler: &helpers.SameIDHandler[*omni.Machine, *omni.MachineSetNode]{}, + Name: "MachineCleanupController", + Handler: cleanup.Combine( + &helpers.SameIDHandler[*omni.Machine, *omni.MachineSetNode]{}, + &helpers.SameIDHandler[*omni.Machine, *omni.InfraMachineConfig]{}, + ), }, ) }