From 09a0586db3b5540975e5efb4faac077f071b7264 Mon Sep 17 00:00:00 2001 From: Igal Tsoiref Date: Sun, 24 May 2026 00:08:16 -0400 Subject: [PATCH] feat(dpuagent): write done marker file after all operations complete In OCP, we need to create a systemd dependency between the dpu-agent and kubelet so that kubelet only starts after the dpu-agent finishes all its configuration tasks. To enable this cleanly in systemd, the dpu-agent now writes a marker file at /run/dpu-agent/configuration-complete once all operations have completed successfully. Other systemd units can use a PathExists= or ConditionPathExists= directive on this file to gate their startup on the dpu-agent being done. The marker is written to /run (tmpfs), so it is automatically cleared on reboot. It is written before the final status update to avoid blocking on a potentially long-running status push. Co-authored-by: Cursor --- internal/provisioning/dpuagent/dpuagent.go | 23 ++++++ .../provisioning/dpuagent/dpuagent_test.go | 82 +++++++++++++++++++ 2 files changed, 105 insertions(+) diff --git a/internal/provisioning/dpuagent/dpuagent.go b/internal/provisioning/dpuagent/dpuagent.go index f616f250d..13e8aa222 100644 --- a/internal/provisioning/dpuagent/dpuagent.go +++ b/internal/provisioning/dpuagent/dpuagent.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "os" + "path/filepath" "strings" "time" @@ -58,10 +59,16 @@ const defaultRetryInterval = 30 * time.Second const bootIDFile = "/proc/sys/kernel/random/boot_id" +const ( + defaultRunDir = "/run/dpu-agent" + doneMarkerFileName = "configuration-complete" +) + type DPUAgent struct { optCtx *operations.Context operations []operations.Operation retryInterval time.Duration + runDir string // rebootMethodDiscoveryFunc, if non-nil, replaces MFT tool probing (tests only). rebootMethodDiscoveryFunc func(context.Context) bool @@ -99,6 +106,7 @@ func NewDPUAgent(optCtx *operations.Context) *DPUAgent { return &DPUAgent{ optCtx: optCtx, operations: operations, + runDir: defaultRunDir, } } @@ -143,6 +151,9 @@ func (d *DPUAgent) Run(ctx context.Context) error { return fmt.Errorf("execution of operator %s aborted: %v", op.Name(), err) } } + if err := writeDoneMarker(d.runDir); err != nil { + klog.Errorf("Failed to write done marker: %v", err) + } d.updateStatusUntilSuccess(ctx) return nil } @@ -177,3 +188,15 @@ func (d *DPUAgent) initCurrentBootID() error { d.optCtx.CurrentBootID = strings.TrimSpace(string(currentBootID)) return nil } + +func writeDoneMarker(dir string) error { + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("create run directory %s: %w", dir, err) + } + markerPath := filepath.Join(dir, doneMarkerFileName) + if err := os.WriteFile(markerPath, []byte(time.Now().UTC().Format(time.RFC3339)+"\n"), 0644); err != nil { + return fmt.Errorf("write done marker file: %w", err) + } + klog.Infof("Configuration complete, marker written to %s", markerPath) + return nil +} diff --git a/internal/provisioning/dpuagent/dpuagent_test.go b/internal/provisioning/dpuagent/dpuagent_test.go index aaa9ff985..a90b013e6 100644 --- a/internal/provisioning/dpuagent/dpuagent_test.go +++ b/internal/provisioning/dpuagent/dpuagent_test.go @@ -19,6 +19,8 @@ package dpuagent import ( "context" "fmt" + "os" + "path/filepath" "time" provisioningv1 "github.com/nvidia/doca-platform/api/provisioning/v1alpha1" @@ -35,6 +37,86 @@ import ( const testRetryInterval = 1 * time.Millisecond var _ = Describe("DPUAgent", func() { + Describe("Done marker", func() { + var tmpDir string + + BeforeEach(func() { + var err error + tmpDir, err = os.MkdirTemp("", "dpu-agent-run-*") + Expect(err).NotTo(HaveOccurred()) + }) + + AfterEach(func() { + Expect(os.RemoveAll(tmpDir)).To(Succeed()) + }) + + It("should write the done marker file after all operations complete successfully", func() { + agent := &DPUAgent{ + retryInterval: testRetryInterval, + runDir: tmpDir, + optCtx: &operations.Context{ + Client: &mockClient{}, + DiscoverPorts: func() ([]pciutil.NICPort, error) { + return []pciutil.NICPort{ + {Netdev: "p0", PCIAddress: "0000:03:00.0", MSTDevice: "/dev/mst/mt41692_pciconf0"}, + {Netdev: "p1", PCIAddress: "0000:03:00.1", MSTDevice: "/dev/mst/mt41692_pciconf0.1"}, + }, nil + }, + }, + operations: []operations.Operation{ + &mockOperation{ + name: "op1", + conditionType: "Op1Condition", + executeFunc: func(execCtx context.Context, optCtx *operations.Context) error { + return nil + }, + }, + }, + } + + Expect(agent.Run(ctx)).To(Succeed()) + + markerPath := filepath.Join(tmpDir, "configuration-complete") + Expect(markerPath).To(BeAnExistingFile()) + content, err := os.ReadFile(markerPath) + Expect(err).NotTo(HaveOccurred()) + Expect(string(content)).NotTo(BeEmpty()) + }) + + It("should not write the done marker when the run is aborted", func() { + cancelCtx, cancelFunc := context.WithCancel(ctx) + agent := &DPUAgent{ + retryInterval: testRetryInterval, + runDir: tmpDir, + optCtx: &operations.Context{ + Client: &mockClient{}, + DiscoverPorts: func() ([]pciutil.NICPort, error) { + return []pciutil.NICPort{ + {Netdev: "p0", PCIAddress: "0000:03:00.0", MSTDevice: "/dev/mst/mt41692_pciconf0"}, + {Netdev: "p1", PCIAddress: "0000:03:00.1", MSTDevice: "/dev/mst/mt41692_pciconf0.1"}, + }, nil + }, + }, + operations: []operations.Operation{ + &mockOperation{ + name: "cancel-op", + conditionType: "CancelOpCondition", + executeFunc: func(execCtx context.Context, optCtx *operations.Context) error { + cancelFunc() + return fmt.Errorf("error that triggers context check") + }, + }, + }, + } + + err := agent.Run(cancelCtx) + Expect(err).To(HaveOccurred()) + + markerPath := filepath.Join(tmpDir, "configuration-complete") + Expect(markerPath).NotTo(BeAnExistingFile()) + }) + }) + Describe("Run", func() { It("should include package installation after static file verification", func() { agent := NewDPUAgent(&operations.Context{})