From 652f5d79d4ed019db2efd66afa3041f77626074b Mon Sep 17 00:00:00 2001 From: Andrew Doran Date: Thu, 18 Jul 2024 09:56:55 -0700 Subject: [PATCH] use consistent watchdog logic Summary: When it's impossible to extend the watchdog timeout because the character device is held open by another process, we have a backup plan: wait 10s before each step when erasing/writing flash chips so that whatever thread manages the watchdog has an opportunity to pet it. For some reason this logic was skipped on LF OpenBMC. Re-enable it because regardless of whether healthd / fscd / systemd is managing the watchdog we should do the same thing. Test Plan: ``` 0 ~/local/openbmc/tools/flashy $ ./build.sh && ./build_dev.sh && go test ./... ? github.com/facebook/openbmc/tools/flashy/flash_procedure [no test files] ? github.com/facebook/openbmc/tools/flashy/lib/logger [no test files] ? github.com/facebook/openbmc/tools/flashy/tests [no test files] ? github.com/facebook/openbmc/tools/flashy/utilities [no test files] ok github.com/facebook/openbmc/tools/flashy 2.116s ok github.com/facebook/openbmc/tools/flashy/checks_and_remediations/bletchley 0.006s ok github.com/facebook/openbmc/tools/flashy/checks_and_remediations/common 0.250s ok github.com/facebook/openbmc/tools/flashy/checks_and_remediations/galaxy100 0.009s ok github.com/facebook/openbmc/tools/flashy/checks_and_remediations/grandteton 0.009s ok github.com/facebook/openbmc/tools/flashy/checks_and_remediations/wedge100 0.009s ok github.com/facebook/openbmc/tools/flashy/checks_and_remediations/yamp 0.009s ok github.com/facebook/openbmc/tools/flashy/install 0.009s ok github.com/facebook/openbmc/tools/flashy/lib/fileutils (cached) ok github.com/facebook/openbmc/tools/flashy/lib/flash 0.008s ok github.com/facebook/openbmc/tools/flashy/lib/flash/flashcp 0.008s ok github.com/facebook/openbmc/tools/flashy/lib/flash/flashutils 0.010s ok github.com/facebook/openbmc/tools/flashy/lib/flash/flashutils/devices 0.009s ok 
github.com/facebook/openbmc/tools/flashy/lib/step 0.008s ok github.com/facebook/openbmc/tools/flashy/lib/utils 0.443s ok github.com/facebook/openbmc/tools/flashy/lib/validate 0.008s ok github.com/facebook/openbmc/tools/flashy/lib/validate/image 0.008s ok github.com/facebook/openbmc/tools/flashy/lib/validate/partition 0.022s ``` Build ephemeral fbpkg with these changes and force-flash a bletchley that's in repair (first chip only): ``` 1 ~ $ oobgrader --host macbmc1r0036p0009-oob.03.pci1 --wait --flashy-tag 9640644 --force --allow-downgrade ... Host Workflow ID Progress Status Result ----------------------------- ------------------------------------ ---------- ----------------------- ---------------------- macbmc1r0036p0009-oob.03.pci1 041870c1-4e8b-4194-9d69-ef27dfb42286 finished WorkflowStatus.FINISHED FinishStatus.SUCCEEDED ``` -> https://fburl.com/scuba/openbmc_upgrades/5ogr4a5q Reviewed By: williamspatrick Differential Revision: D59917900 fbshipit-source-id: 7cd7050c75226b1badc98fb043fd9f1f98b6f3a0 --- .../common/13_restart_services.go | 5 +---- tools/flashy/lib/utils/system.go | 18 +++++------------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/tools/flashy/checks_and_remediations/common/13_restart_services.go b/tools/flashy/checks_and_remediations/common/13_restart_services.go index d50127726df1..01dcbf17802d 100644 --- a/tools/flashy/checks_and_remediations/common/13_restart_services.go +++ b/tools/flashy/checks_and_remediations/common/13_restart_services.go @@ -61,7 +61,7 @@ func restartServices(stepParams step.StepParams) step.StepExitError { // healthd to be stopped). If healthd is not in use, perform the // watchdog step directly. The timeout increase stops the BMC // rebooting during the following heavyweight steps, like image - // validation. + // validation. See utils.PetWatchdog() for the gory details. 
if utils.HealthdExists() { log.Printf("Healthd exists, attempting to restart healthd...") err = utils.RestartHealthd(true, supervisor) @@ -70,9 +70,6 @@ func restartServices(stepParams step.StepParams) step.StepExitError { } else { log.Printf("Finished restarting healthd") } - // Linux Foundation uses systemd to pet the watchdog directly. - } else if utils.IsLFOpenBMC() { - log.Printf("LF-OpenBMC, letting systemd maintain the watchdog") } else { utils.PetWatchdog() } diff --git a/tools/flashy/lib/utils/system.go b/tools/flashy/lib/utils/system.go index 0d4d05179ef7..3bae1f06e27a 100644 --- a/tools/flashy/lib/utils/system.go +++ b/tools/flashy/lib/utils/system.go @@ -529,21 +529,13 @@ func tryPetWatchdog() bool { // Try to pet the watchdog and increase its timeout. This works in two ways: // -// - When /dev/watchdog is busy because it's held open by healthd, the delay -// here will hopefully allow healthd's watchdog thread to get some CPU -// time. +// - When /dev/watchdog is busy because it's held open by healthd / systemd +// / fscd, the delay here will hopefully allow the thread petting the +// watchdog to get some CPU time. // -// - When /dev/watchdog it NOT busy because healthd is not running and there -// are no concurrent instances of wdtcli, the watchdog timeout will be -// extended and the watchdog petted. +// - When /dev/watchdog is NOT busy and there are no concurrent instances of +// wdtcli, the watchdog timeout will be extended and the watchdog petted. var PetWatchdog = func() { - // LF-OpenBMC relies on systemd to pet the watchdog, so there is nothing - // to do here. - if IsLFOpenBMC() { - log.Printf("Watchdog not petted; LF OpenBMC") - return - } - if IsBMCLite() { log.Printf("Watchdog not petted; BMC Lite") return