From 460216ca580f6a79a75ac71e2605416aa2044631 Mon Sep 17 00:00:00 2001 From: Vitaly Grinberg Date: Wed, 27 May 2026 22:47:55 +0300 Subject: [PATCH] Fix BC stuck on holdover Currently holdover timer is not started if the corresponding interface can't be associated with a PTP profile. When a TR is lost, the holdover state is published as event and set in metrics, but the timer start is skipped due to the unresolved ptpOpts. 1. Fix nil PtpProcessOpts preventing holdover timer recovery. Populate process options after config file create and resolve profile lookups on holdover entry so the timer always starts even when the one-shot configmap watcher missed the node profile at startup. 2. Ensure the holdover goroutine always starts after HOLDOVER state is published and set in metrics. Close orphaned timer channels on reset. Add lifecycle logging for cluster diagnosis. Co-authored-by: Cursor Signed-off-by: Vitaly Grinberg --- plugins/ptp_operator/config/config.go | 99 ++++++- plugins/ptp_operator/metrics/manager.go | 75 ++++-- plugins/ptp_operator/metrics/metrics.go | 2 +- plugins/ptp_operator/metrics/ptp4lParse.go | 71 +++-- .../ptp_operator/metrics/ptp4lParse_test.go | 254 ++++++++++++++++++ plugins/ptp_operator/metrics/tbc_test.go | 12 +- 6 files changed, 451 insertions(+), 62 deletions(-) create mode 100644 plugins/ptp_operator/metrics/ptp4lParse_test.go diff --git a/plugins/ptp_operator/config/config.go b/plugins/ptp_operator/config/config.go index 944c1af9..0c2a1e39 100644 --- a/plugins/ptp_operator/config/config.go +++ b/plugins/ptp_operator/config/config.go @@ -142,7 +142,9 @@ func (pt *PtpClockThreshold) SafeClose() (justClosed bool) { }() select { case <-pt.Close: + log.Debugf("cancelling holdover timer: channel already closed") default: + log.Debugf("cancelling holdover timer") close(pt.Close) // close any holdover go routines } return true // <=> justClosed = true; return @@ -177,7 +179,7 @@ func (l *LinuxPTPConfigMapUpdate) SetAppliedNodeProfileJSON(appliedNodeProfileJS func NewLinuxPTPConfUpdate() *LinuxPTPConfigMapUpdate { ptpProfileUpdate := &LinuxPTPConfigMapUpdate{ lock: sync.RWMutex{}, - UpdateCh: make(chan bool), + UpdateCh: make(chan bool, 1), profilePath: DefaultProfilePath, intervalUpdate: DefaultUpdateInterval, EventThreshold: make(map[string]*PtpClockThreshold), @@ -219,24 +221,39 @@ func (p *PtpProfile) GetInterface() (interfaces []*string) { // DeletePTPThreshold ... delete threshold for profile func (l *LinuxPTPConfigMapUpdate) DeletePTPThreshold(name string) { + l.lock.Lock() + defer l.lock.Unlock() if t, found := l.EventThreshold[name]; found { - l.lock.Lock() closeHoldover(t) delete(l.EventThreshold, name) - l.lock.Unlock() } } // DeleteAllPTPThreshold ... delete all threshold per config func (l *LinuxPTPConfigMapUpdate) DeleteAllPTPThreshold() { + l.lock.Lock() + defer l.lock.Unlock() for k, t := range l.EventThreshold { - l.lock.Lock() closeHoldover(t) delete(l.EventThreshold, k) - l.lock.Unlock() } } +// LookupEventThreshold returns the PtpClockThreshold for profileName under read +// lock, falling back to the first available entry when profileName is not found. +// Returns nil if EventThreshold is empty. +func (l *LinuxPTPConfigMapUpdate) LookupEventThreshold(profileName string) *PtpClockThreshold { + l.lock.RLock() + defer l.lock.RUnlock() + if t, found := l.EventThreshold[profileName]; found { + return t + } + for _, t := range l.EventThreshold { + return t + } + return nil +} + func closeHoldover(t *PtpClockThreshold) { defer func() { if err := recover(); err != nil { @@ -366,7 +383,7 @@ func (l *LinuxPTPConfigMapUpdate) UpdateConfig(nodeProfilesJSON []byte) (bool, e l.NodeProfiles[index].Interfaces = np.GetInterface() l.NodeProfiles[index].PtpClockThreshold = np.PtpClockThreshold } - l.UpdateCh <- true + l.notifyUpdate() return true, nil } @@ -380,12 +397,21 @@ func (l *LinuxPTPConfigMapUpdate) UpdateConfig(nodeProfilesJSON []byte) (bool, e log.Info("load profiles using old method") l.appliedNodeProfileJSON = nodeProfilesJSON l.NodeProfiles = nodeProfiles - l.UpdateCh <- true + l.notifyUpdate() return true, nil } return false, fmt.Errorf("unable to load profile config") } +// notifyUpdate signals config watchers without blocking when no receiver is ready. +func (l *LinuxPTPConfigMapUpdate) notifyUpdate() { + select { + case l.UpdateCh <- true: + default: + log.Debug("coalescing config update notification") + } +} + // Try to load the multiple policy config func tryToLoadConfig(nodeProfilesJSON []byte) ([]PtpProfile, bool) { var ptpConfig []PtpProfile @@ -438,7 +464,7 @@ func (l *LinuxPTPConfigMapUpdate) updatePtpConfig(nodeName string) (updated bool if _, err := filesystem.Stat(nodeProfile); err != nil { if os.IsNotExist(err) { log.Infof("ptp profile %s doesn't exist for node: %v , error %s", nodeProfile, nodeName, err.Error()) - l.UpdateCh <- true // if profile doesn't exist let the caller know + l.notifyUpdate() // if profile doesn't exist let the caller know return } log.Errorf("error finding node profile %v: %v", nodeName, err) @@ -461,6 +487,63 @@ func (l *LinuxPTPConfigMapUpdate) updatePtpConfig(nodeName string) (updated bool return } +// EnsureProcessOptions reloads the node profile file when needed and populates +// PtpProcessOpts if NodeProfiles are available but the process-options cache is empty. +// This handles startup ordering where the one-shot configmap watcher exits before the +// profile mount is ready, leaving PtpProcessOpts unpopulated when ptp4l configs arrive. +func (l *LinuxPTPConfigMapUpdate) EnsureProcessOptions(nodeName string) { + if len(l.NodeProfiles) == 0 { + l.updatePtpConfig(nodeName) + } + + l.lock.Lock() + defer l.lock.Unlock() + + if len(l.NodeProfiles) == 0 { + return + } + if len(l.PtpProcessOpts) > 0 { + return + } + l.UpdatePTPProcessOptions() + l.UpdatePTPThreshold() + l.UpdatePTPSetting() +} + +// LookupPtpProcessOpts returns cached process options for a profile under read lock. +func (l *LinuxPTPConfigMapUpdate) LookupPtpProcessOpts(profileName string) *PtpProcessOpts { + if profileName == "" { + return nil + } + l.lock.RLock() + opts := l.PtpProcessOpts[profileName] + l.lock.RUnlock() + return opts +} + +// LookupOrEnsurePtpProcessOpts returns process options for profileName, repopulating +// the cache under lock when it is still empty. updatePtpConfig runs outside the lock +// because it performs filesystem I/O. +func (l *LinuxPTPConfigMapUpdate) LookupOrEnsurePtpProcessOpts(nodeName, profileName string) *PtpProcessOpts { + if profileName == "" { + return nil + } + + l.lock.RLock() + if opts, ok := l.PtpProcessOpts[profileName]; ok { + l.lock.RUnlock() + return opts + } + cacheEmpty := len(l.PtpProcessOpts) == 0 + l.lock.RUnlock() + + if cacheEmpty { + l.EnsureProcessOptions(nodeName) + } + + return l.LookupPtpProcessOpts(profileName) +} + // GetDefaultThreshold ... get default threshold func GetDefaultThreshold() PtpClockThreshold { return PtpClockThreshold{ diff --git a/plugins/ptp_operator/metrics/manager.go b/plugins/ptp_operator/metrics/manager.go index c9673f3f..414cb9d8 100644 --- a/plugins/ptp_operator/metrics/manager.go +++ b/plugins/ptp_operator/metrics/manager.go @@ -66,30 +66,20 @@ func NewPTPEventManager(resourcePrefix string, publisherTypes map[ptp.EventType] // PtpThreshold ... return ptp threshold // resetCh will reset any closed channel func (p *PTPEventManager) PtpThreshold(profileName string, resetCh bool) ptpConfig.PtpClockThreshold { - if t, found := p.PtpConfigMapUpdates.EventThreshold[profileName]; found { - if resetCh { - t.Close = make(chan struct{}) // reset channel to new - } - return ptpConfig.PtpClockThreshold{ - HoldOverTimeout: t.HoldOverTimeout, - MaxOffsetThreshold: t.MaxOffsetThreshold, - MinOffsetThreshold: t.MinOffsetThreshold, - Close: t.Close, - } - } else if len(p.PtpConfigMapUpdates.EventThreshold) > 0 { // if not found get the first item since one per config) - for _, t := range p.PtpConfigMapUpdates.EventThreshold { - if resetCh { - t.Close = make(chan struct{}) - } - return ptpConfig.PtpClockThreshold{ - HoldOverTimeout: t.HoldOverTimeout, - MaxOffsetThreshold: t.MaxOffsetThreshold, - MinOffsetThreshold: t.MinOffsetThreshold, - Close: t.Close, - } - } + t := p.PtpConfigMapUpdates.LookupEventThreshold(profileName) + if t == nil { + return ptpConfig.GetDefaultThreshold() + } + if resetCh { + t.SafeClose() + t.Close = make(chan struct{}) + } + return ptpConfig.PtpClockThreshold{ + HoldOverTimeout: t.HoldOverTimeout, + MaxOffsetThreshold: t.MaxOffsetThreshold, + MinOffsetThreshold: t.MinOffsetThreshold, + Close: t.Close, } - return ptpConfig.GetDefaultThreshold() } // MockTest ... use for test only @@ -472,6 +462,7 @@ func (p *PTPEventManager) GenPTPEvent(ptpProfileName string, oStats *stats.Stats // previous state was HOLDOVER, now it is in LOCKED state, cancel any HOLDOVER if isOffsetInRange(ptpOffset, threshold.MaxOffsetThreshold, threshold.MinOffsetThreshold) { log.Infof("interface %s is in LOCKED state, cancel any holdover states", eventResourceName) + log.Debugf("cancelling holdover timer: profile=%s resource=%s", ptpProfileName, eventResourceName) threshold.SafeClose() log.Infof(" publishing event for ( profile %s) %s with last state %s and current clock state %s and offset %d for ( Max/Min Threshold %d/%d )", ptpProfileName, eventResourceName, lastClockState, clockState, ptpOffset, threshold.MaxOffsetThreshold, threshold.MinOffsetThreshold) @@ -540,6 +531,44 @@ func (p *PTPEventManager) NodeName() string { return p.nodeName } +// GetPtpProcessOpts returns process options for the given profile, resolving the +// profile from the registered config when needed and refreshing the node profile +// cache when PtpProcessOpts was never populated due to startup ordering. +func (p *PTPEventManager) GetPtpProcessOpts(profileName, configName string) *ptpConfig.PtpProcessOpts { + profileName = p.resolveProfileName(profileName, configName) + if !p.mock { + if opts := p.PtpConfigMapUpdates.LookupOrEnsurePtpProcessOpts(p.nodeName, profileName); opts != nil { + return opts + } + } else if opts := p.PtpConfigMapUpdates.LookupPtpProcessOpts(profileName); opts != nil { + return opts + } + if configName != "" { + if cfg := p.GetPTPConfig(types.ConfigName(configName)); cfg != nil && cfg.Profile != "" { + if !p.mock { + if opts := p.PtpConfigMapUpdates.LookupOrEnsurePtpProcessOpts(p.nodeName, cfg.Profile); opts != nil { + return opts + } + } + return p.PtpConfigMapUpdates.LookupPtpProcessOpts(cfg.Profile) + } + } + return nil +} + +func (p *PTPEventManager) resolveProfileName(profileName, configName string) string { + if profileName != "" { + return profileName + } + if configName == "" { + return "" + } + if cfg := p.GetPTPConfig(types.ConfigName(configName)); cfg != nil { + return cfg.Profile + } + return "" +} + // GetMockEvent ... func (p *PTPEventManager) GetMockEvent() []ptp.EventType { return p.mockEvent diff --git a/plugins/ptp_operator/metrics/metrics.go b/plugins/ptp_operator/metrics/metrics.go index 910ebe20..cd6dbea0 100644 --- a/plugins/ptp_operator/metrics/metrics.go +++ b/plugins/ptp_operator/metrics/metrics.go @@ -391,7 +391,7 @@ func (p *PTPEventManager) processDownEvent(profileName, processName string, ptpS } } if s, ok := ptpStats[ClockRealTime]; ok { - if t, ok2 := p.PtpConfigMapUpdates.PtpProcessOpts[profileName]; ok2 && t.Phc2SysEnabled() { + if t := p.PtpConfigMapUpdates.LookupPtpProcessOpts(profileName); t != nil && t.Phc2SysEnabled() { p.GenPTPEvent(profileName, s, ClockRealTime, FreeRunOffsetValue, ptp.FREERUN, ptp.OsClockSyncStateChange) } } diff --git a/plugins/ptp_operator/metrics/ptp4lParse.go b/plugins/ptp_operator/metrics/ptp4lParse.go index ae19e7f9..18a9f8fe 100644 --- a/plugins/ptp_operator/metrics/ptp4lParse.go +++ b/plugins/ptp_operator/metrics/ptp4lParse.go @@ -142,20 +142,34 @@ func (p *PTPEventManager) ParsePTP4l(processName, configName, profileName, outpu ptpStats[master].SetLastSyncState(syncState) p.PublishEvent(syncState, ptpStats[master].LastOffset(), masterResource, ptp.PtpStateChange) UpdateSyncStateMetrics(ptpStats[master].ProcessName(), alias, syncState) - if ptpOpts, ok := p.PtpConfigMapUpdates.PtpProcessOpts[profileName]; ok && ptpOpts != nil { - p.maybePublishOSClockSyncStateChangeEvent(ptpOpts, configName, profileName) - threshold := p.PtpThreshold(profileName, true) - if p.mock { - log.Infof("mock holdover is set to %s", ptpStats[MasterClockType].Alias()) - } else { - go handleHoldOverState(p, ptpOpts, configName, profileName, threshold.HoldOverTimeout, ptpStats[MasterClockType].Alias(), threshold.Close) - } - } + ptpOpts := p.GetPtpProcessOpts(profileName, configName) + p.startHoldoverTimer(ptpOpts, configName, profileName, alias) + } else { + log.Warnf("holdover timer not started: master ProcessName %s != masterOffsetSource %s (profile=%s config=%s)", + ptpStats[master].ProcessName(), masterOffsetSource, profileName, configName) } } } } +func (p *PTPEventManager) startHoldoverTimer( + ptpOpts *ptpConfig.PtpProcessOpts, configName, profileName, alias string) { + if ptpOpts != nil { + p.maybePublishOSClockSyncStateChangeEvent(ptpOpts, configName, profileName) + } else { + log.Warnf("holdover timer: PtpProcessOpts missing for profile=%s config=%s (available=%v), continuing with default threshold", + profileName, configName, p.PtpConfigMapUpdates.PtpProcessOpts) + } + if p.mock { + log.Infof("holdover timer not started: mock mode (profile=%s config=%s alias=%s)", profileName, configName, alias) + return + } + threshold := p.PtpThreshold(profileName, true) + log.Infof("starting holdover timer: profile=%s config=%s alias=%s timeout=%ds", + profileName, configName, alias, threshold.HoldOverTimeout) + go handleHoldOverState(p, ptpOpts, configName, profileName, threshold.HoldOverTimeout, alias, threshold.Close) +} + func handleHoldOverState(ptpManager *PTPEventManager, ptpOpts *ptpConfig.PtpProcessOpts, configName, ptpProfileName string, holdoverTimeout int64, @@ -168,25 +182,34 @@ func handleHoldOverState(ptpManager *PTPEventManager, }() select { case <-c: - log.Infof("call received to close holderover timeout") + log.Infof("holdover timer cancelled: profile=%s config=%s alias=%s", ptpProfileName, configName, ptpIFace) return case <-time.After(time.Duration(holdoverTimeout) * time.Second): - log.Infof("holdover time expired for interface %s", ptpIFace) + log.Infof("holdover timer expired: profile=%s config=%s alias=%s timeout=%ds", ptpProfileName, configName, ptpIFace, holdoverTimeout) ptpStats := ptpManager.GetStats(types.ConfigName(configName)) - if mStats, found := ptpStats[master]; found { - if mStats.LastSyncState() == ptp.HOLDOVER { // if it was still in holdover while timing out then switch to FREERUN - log.Infof("HOLDOVER timeout after %d secs,setting clock state to FREERUN from HOLDOVER state for %s", - holdoverTimeout, master) - masterResource := fmt.Sprintf("%s/%s", mStats.Alias(), MasterClockType) - ptpStats[MasterClockType].SetLastSyncState(ptp.FREERUN) - ptpManager.PublishEvent(ptp.FREERUN, ptpStats[MasterClockType].LastOffset(), masterResource, ptp.PtpStateChange) - UpdateSyncStateMetrics(mStats.ProcessName(), mStats.Alias(), ptp.FREERUN) - // don't check of os clock sync state if phc2 not enabled - ptpManager.maybePublishOSClockSyncStateChangeEvent(ptpOpts, configName, ptpProfileName) - } - } else { - log.Errorf("failed to switch from holdover, could not find ptpStats for interface %s", ptpIFace) + mStats, found := ptpStats[master] + if !found { + log.Errorf("failed to switch from holdover, could not find ptpStats for config=%s profile=%s alias=%s", configName, ptpProfileName, ptpIFace) + return + } + if mStats.LastSyncState() != ptp.HOLDOVER { + log.Infof("holdover timer expired but state is %s (not HOLDOVER), no-op: profile=%s config=%s alias=%s", + mStats.LastSyncState(), ptpProfileName, configName, ptpIFace) + return + } + alias := mStats.Alias() + if alias == "" { + log.Errorf("failed to switch from holdover, empty alias for config=%s profile=%s", configName, ptpProfileName) + return } + log.Infof("holdover expired, transitioning to FREERUN: profile=%s config=%s alias=%s timeout=%ds", + ptpProfileName, configName, alias, holdoverTimeout) + masterResource := fmt.Sprintf("%s/%s", alias, MasterClockType) + mStats.SetLastSyncState(ptp.FREERUN) + ptpManager.PublishEvent(ptp.FREERUN, mStats.LastOffset(), masterResource, ptp.PtpStateChange) + UpdateSyncStateMetrics(mStats.ProcessName(), alias, ptp.FREERUN) + // don't check of os clock sync state if phc2 not enabled + ptpManager.maybePublishOSClockSyncStateChangeEvent(ptpOpts, configName, ptpProfileName) } } diff --git a/plugins/ptp_operator/metrics/ptp4lParse_test.go b/plugins/ptp_operator/metrics/ptp4lParse_test.go new file mode 100644 index 00000000..7b482076 --- /dev/null +++ b/plugins/ptp_operator/metrics/ptp4lParse_test.go @@ -0,0 +1,254 @@ +package metrics + +import ( + "fmt" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus/testutil" + ptpConfig "github.com/redhat-cne/cloud-event-proxy/plugins/ptp_operator/config" + "github.com/redhat-cne/cloud-event-proxy/plugins/ptp_operator/ptp4lconf" + "github.com/redhat-cne/cloud-event-proxy/plugins/ptp_operator/stats" + "github.com/redhat-cne/cloud-event-proxy/plugins/ptp_operator/types" + "github.com/redhat-cne/sdk-go/pkg/event/ptp" + "github.com/stretchr/testify/assert" +) + +const ( + testNode = "mynode" + testConfigName = "ptp4l.0.config" + testProfileName = "boundary" +) + +func ensureTestNode(t *testing.T) { + t.Helper() + if ptpNodeName == "" { + ptpNodeName = testNode + } +} + +func TestHandleHoldOverStateExpiryTransitionsToFreerun(t *testing.T) { + ensureTestNode(t) + mgr := NewPTPEventManager("", nil, testNode, nil) + mgr.MockTest(true) + + configName := testConfigName + profileName := testProfileName + mgr.Stats[types.ConfigName(configName)] = make(stats.PTPStats) + ptpStats := mgr.GetStats(types.ConfigName(configName)) + ptpStats.CheckSource(master, configName, ptp4lProcessName) + ptpStats[master].SetAlias("ens5fx") + ptpStats[master].SetLastSyncState(ptp.HOLDOVER) + + closeCh := make(chan struct{}) + go handleHoldOverState(mgr, nil, configName, profileName, 1, "ens5fx", closeCh) + time.Sleep(1500 * time.Millisecond) + + assert.Equal(t, ptp.FREERUN, ptpStats[master].LastSyncState()) + syncState := testutil.ToFloat64(SyncState.With(map[string]string{ + "process": ptp4lProcessName, + "node": ptpNodeName, + "iface": "ens5fx", + })) + assert.Equal(t, float64(GetSyncStateID(string(ptp.FREERUN))), syncState) + assert.Contains(t, mgr.GetMockEvent(), ptp.PtpStateChange) +} + +func TestHandleHoldOverStateFaultHoldoverExpiry(t *testing.T) { + t.Parallel() + ensureTestNode(t) + mgr := NewPTPEventManager("", nil, testNode, nil) + mgr.MockTest(true) + + configName := testConfigName + profileName := testProfileName + mgr.Stats[types.ConfigName(configName)] = make(stats.PTPStats) + ptpStats := mgr.GetStats(types.ConfigName(configName)) + ptpStats.CheckSource(master, configName, ptp4lProcessName) + ptpStats[master].SetAlias("ens3fx") + ptpStats[master].SetLastSyncState(ptp.HOLDOVER) + + closeCh := make(chan struct{}) + go handleHoldOverState(mgr, nil, configName, profileName, 1, "ens3fx", closeCh) + time.Sleep(1500 * time.Millisecond) + + assert.Equal(t, ptp.FREERUN, ptpStats[master].LastSyncState()) +} + +func TestHandleHoldOverStateCancelled(t *testing.T) { + t.Parallel() + ensureTestNode(t) + mgr := NewPTPEventManager("", nil, testNode, nil) + mgr.MockTest(true) + + configName := testConfigName + mgr.Stats[types.ConfigName(configName)] = make(stats.PTPStats) + ptpStats := mgr.GetStats(types.ConfigName(configName)) + ptpStats.CheckSource(master, configName, ptp4lProcessName) + ptpStats[master].SetAlias("ens5fx") + ptpStats[master].SetLastSyncState(ptp.HOLDOVER) + + closeCh := make(chan struct{}) + close(closeCh) + go handleHoldOverState(mgr, nil, configName, testProfileName, 1, "ens5fx", closeCh) + time.Sleep(200 * time.Millisecond) + + assert.Equal(t, ptp.HOLDOVER, ptpStats[master].LastSyncState()) +} + +func TestHandleHoldOverStateExpiryNoOpWhenNotHoldover(t *testing.T) { + t.Parallel() + ensureTestNode(t) + mgr := NewPTPEventManager("", nil, testNode, nil) + mgr.MockTest(true) + + configName := testConfigName + mgr.Stats[types.ConfigName(configName)] = make(stats.PTPStats) + ptpStats := mgr.GetStats(types.ConfigName(configName)) + ptpStats.CheckSource(master, configName, ptp4lProcessName) + ptpStats[master].SetAlias("ens5fx") + ptpStats[master].SetLastSyncState(ptp.LOCKED) + + closeCh := make(chan struct{}) + go handleHoldOverState(mgr, nil, configName, testProfileName, 1, "ens5fx", closeCh) + time.Sleep(1500 * time.Millisecond) + + assert.Equal(t, ptp.LOCKED, ptpStats[master].LastSyncState()) +} + +func TestPtpThresholdResetClosesPreviousChannel(t *testing.T) { + t.Parallel() + ensureTestNode(t) + mgr := NewPTPEventManager("", nil, testNode, nil) + profileName := testProfileName + oldClose := make(chan struct{}) + mgr.PtpConfigMapUpdates.EventThreshold[profileName] = &ptpConfig.PtpClockThreshold{ + HoldOverTimeout: 5, + Close: oldClose, + } + + cancelled := make(chan struct{}) + go func() { + <-oldClose + close(cancelled) + }() + + threshold := mgr.PtpThreshold(profileName, true) + assert.NotNil(t, threshold.Close) + assert.NotEqual(t, oldClose, threshold.Close) + + select { + case <-cancelled: + case <-time.After(500 * time.Millisecond): + t.Fatal("previous holdover timer channel was not closed on reset") + } +} + +func TestSlaveToMasterHoldoverTimerRecovery(t *testing.T) { + ensureTestNode(t) + SetMasterOffsetSource(ptp4lProcessName) + + mgr := NewPTPEventManager("", nil, testNode, nil) + mgr.MockTest(true) + + configName := testConfigName + profileName := testProfileName + mgr.Stats[types.ConfigName(configName)] = make(stats.PTPStats) + ptpStats := mgr.GetStats(types.ConfigName(configName)) + ptpStats.CheckSource(master, configName, ptp4lProcessName) + ptpStats[master].SetAlias("ens5fx") + ptpStats[master].SetLastSyncState(ptp.LOCKED) + + ptp4lCfg := &ptp4lconf.PTP4lConfig{ + Interfaces: []*ptp4lconf.PTPInterface{ + {Name: "ens5f0", PortID: 1, PortName: "port 1", Role: types.SLAVE}, + }, + } + output := fmt.Sprintf("ptp4l[72444.514]: [%s:5] port 1 (ens5f0): SLAVE to MASTER on ANNOUNCE_RECEIPT_TIMEOUT_EXPIRES", testConfigName) + fields := []string{"ptp4l", "1646672953", testConfigName, "port", "1", "(ens5f0)", "SLAVE to MASTER on ANNOUNCE_RECEIPT_TIMEOUT_EXPIRES"} + + mgr.ParsePTP4l(ptp4lProcessName, configName, profileName, output, fields, + ptp4lconf.PTPInterface{Name: "ens5f0", PortID: 1, Role: types.SLAVE}, ptp4lCfg, ptpStats) + assert.Equal(t, ptp.HOLDOVER, ptpStats[master].LastSyncState()) + + closeCh := make(chan struct{}) + go handleHoldOverState(mgr, nil, configName, profileName, 1, "ens5fx", closeCh) + time.Sleep(1500 * time.Millisecond) + assert.Equal(t, ptp.FREERUN, ptpStats[master].LastSyncState()) +} + +func TestClockClass248WithoutHoldoverNoStateChange(t *testing.T) { + ensureTestNode(t) + mgr := NewPTPEventManager("", nil, testNode, nil) + mgr.MockTest(true) + + configName := testConfigName + profileName := testProfileName + mgr.Stats[types.ConfigName(configName)] = make(stats.PTPStats) + ptpStats := mgr.GetStats(types.ConfigName(configName)) + ptpStats.CheckSource(master, configName, ptp4lProcessName) + ptpStats[master].SetAlias("ens5fx") + ptpStats[master].SetLastSyncState(ptp.FREERUN) + ptpStats[master].SetClockClass(6) + + ptp4lCfg := &ptp4lconf.PTP4lConfig{ + Name: configName, + Profile: profileName, + Interfaces: []*ptp4lconf.PTPInterface{ + {Name: "ens5f0", PortID: 1, PortName: "port 1", Role: types.MASTER}, + }, + } + + output := fmt.Sprintf("ptp4l 1646672953 %s CLOCK_CLASS_CHANGE 248.000000", testConfigName) + fields := []string{"ptp4l", "1646672953", testConfigName, "CLOCK_CLASS_CHANGE", "248.000000"} + mgr.ParsePTP4l(ptp4lProcessName, configName, profileName, output, fields, + ptp4lconf.PTPInterface{Name: "ens5f0"}, ptp4lCfg, ptpStats) + + assert.Equal(t, ptp.FREERUN, ptpStats[master].LastSyncState()) + assert.Equal(t, int64(248), ptpStats[master].ClockClass()) + clockClass := testutil.ToFloat64(ClockClassMetrics.With(map[string]string{ + "process": ptp4lProcessName, + "config": configName, + "node": ptpNodeName, + })) + assert.Equal(t, float64(248), clockClass) +} + +func TestGetPtpProcessOptsPopulatesEmptyCache(t *testing.T) { + ensureTestNode(t) + mgr := NewPTPEventManager("", nil, testNode, nil) + profileName := "boundary" + ptp4lOpts := "foo" + mgr.PtpConfigMapUpdates.NodeProfiles = []ptpConfig.PtpProfile{ + {Name: &profileName, Ptp4lOpts: &ptp4lOpts}, + } + mgr.AddPTPConfig(types.ConfigName("ptp4l.0.config"), &ptp4lconf.PTP4lConfig{ + Name: "ptp4l.0.config", + Profile: profileName, + }) + + opts := mgr.GetPtpProcessOpts("", "ptp4l.0.config") + assert.NotNil(t, opts) + assert.True(t, opts.Ptp4lEnabled()) +} + +func TestStartHoldoverTimerWithNilPtpOptsStillRecovers(t *testing.T) { + ensureTestNode(t) + mgr := NewPTPEventManager("", nil, testNode, nil) + configName := "ptp4l.0.config" + profileName := "boundary" + mgr.PtpConfigMapUpdates.EventThreshold[profileName] = &ptpConfig.PtpClockThreshold{ + HoldOverTimeout: 1, + Close: make(chan struct{}), + } + mgr.Stats[types.ConfigName(configName)] = make(stats.PTPStats) + ptpStats := mgr.GetStats(types.ConfigName(configName)) + ptpStats.CheckSource(master, configName, ptp4lProcessName) + ptpStats[master].SetAlias("ens5fx") + ptpStats[master].SetLastSyncState(ptp.HOLDOVER) + + mgr.startHoldoverTimer(nil, configName, profileName, "ens5fx") + time.Sleep(1500 * time.Millisecond) + + assert.Equal(t, ptp.FREERUN, ptpStats[master].LastSyncState()) +} diff --git a/plugins/ptp_operator/metrics/tbc_test.go b/plugins/ptp_operator/metrics/tbc_test.go index 9c2d884c..75f39ad1 100644 --- a/plugins/ptp_operator/metrics/tbc_test.go +++ b/plugins/ptp_operator/metrics/tbc_test.go @@ -1,6 +1,7 @@ package metrics_test import ( + "fmt" "strings" "testing" @@ -150,12 +151,12 @@ func TestTBCPtp4lMasterOffsetNoHoldover(t *testing.T) { }{ { name: "TBC ptp4l with s0 should be FREERUN", - logLine: "ptp4l[5196819.100]: [ptp4l.0.config] master offset -100 s0 freq +1000 path delay 1000", + logLine: fmt.Sprintf("ptp4l[5196819.100]: [%s] master offset -100 s0 freq +1000 path delay 1000", configName), expectedState: ptp.FREERUN, }, { name: "TBC ptp4l with s2 should be LOCKED", - logLine: "ptp4l[5196819.100]: [ptp4l.0.config] master offset -50 s2 freq +500 path delay 1000", + logLine: fmt.Sprintf("ptp4l[5196819.100]: [%s] master offset -50 s2 freq +500 path delay 1000", configName), expectedState: ptp.LOCKED, }, } @@ -346,7 +347,6 @@ func TestTBCOffsetMetricUpdatedEveryLog(t *testing.T) { eventManager := metrics.NewPTPEventManager("", initPubSubTypes(), "testnode", &common.SCConfiguration{StorePath: "/tmp/store"}) eventManager.MockTest(true) - configName = "ptp4l.0.config" eventManager.Stats[types.ConfigName(configName)] = make(stats.PTPStats) ptpStats := eventManager.GetStats(types.ConfigName(configName)) @@ -360,19 +360,19 @@ func TestTBCOffsetMetricUpdatedEveryLog(t *testing.T) { }{ { name: "T-BC s0 with offset 123", - logLine: "T-BC[1743005894]:[ptp4l.0.config] ens2f0 offset 123 T-BC-STATUS s0", + logLine: fmt.Sprintf("T-BC[1743005894]:[%s] ens2f0 offset 123 T-BC-STATUS s0", configName), expectedOffset: 123, expectedState: ptp.FREERUN, }, { name: "T-BC s2 with offset 55", - logLine: "T-BC[1743005894]:[ptp4l.0.config] ens2f0 offset 55 T-BC-STATUS s2", + logLine: fmt.Sprintf("T-BC[1743005894]:[%s] ens2f0 offset 55 T-BC-STATUS s2", configName), expectedOffset: 55, expectedState: ptp.LOCKED, }, { name: "T-BC s2 with offset 0 (still locked)", - logLine: "T-BC[1743005894]:[ptp4l.0.config] ens2f0 offset 0 T-BC-STATUS s2", + logLine: fmt.Sprintf("T-BC[1743005894]:[%s] ens2f0 offset 0 T-BC-STATUS s2", configName), expectedOffset: 0, expectedState: ptp.LOCKED, },