-
Notifications
You must be signed in to change notification settings - Fork 54
Description
Hi Team,
When submitting a simple GPU job on the Slurm cluster, the job fails immediately. The underlying problem appears to be the following:
On the slurm cluster, Slurm nodes are reporting GPU GRES incorrectly. Instead of detecting the GPU type, the GRES field shows (null). This prevents job submission with --gres=gpu:1.
Expected behavior:
GRES should correctly display gpu:nvidia_h100_80gb_hbm3:1.
Job requesting --gres=gpu:1 should be scheduled successfully.
Observed behavior:
GRES = (null) in sinfo.
Node remains idle but unusable.
Jobs fail immediately due to missing GPU configuration.
All the components seem to be up. Am I missing something here?
Also, please find the output of the Slurm configuration below:
root@login-0:~# scontrol show config
Configuration data as of 2025-08-29 16:22:49.UTC
AccountingStorageBackupHost = (null)
AccountingStorageEnforce = none
AccountingStorageHost = localhost
AccountingStorageExternalHost = (null)
AccountingStorageParameters = (null)
AccountingStoragePort = 0
AccountingStorageTRES = cpu,mem,energy,node,billing,fs/disk,vmem,pages
AccountingStorageType = (null)
AccountingStorageUser = root
AccountingStoreFlags = (null)
AcctGatherEnergyType = (null)
AcctGatherFilesystemType = (null)
AcctGatherInterconnectType = (null)
AcctGatherNodeFreq = 0 sec
AcctGatherProfileType = (null)
AllowSpecResourcesUsage = no
AuthAltTypes = (null)
AuthAltParameters = (null)
AuthInfo = (null)
AuthType = auth/munge
BatchStartTimeout = 10 sec
BcastExclude = /lib,/usr/lib,/lib64,/usr/lib64
BcastParameters = (null)
BOOT_TIME = 2025-08-29 16:10:16.UTC
BurstBufferType = (null)
CertmgrParameters = (null)
CertmgrType = (null)
CliFilterPlugins = cli_filter/user_defaults
ClusterName = avesha-slurm-k8s
CommunicationParameters = (null)
CompleteWait = 5 sec
CpuFreqDef = Unknown
CpuFreqGovernors = OnDemand,Performance,UserSpace
CredType = cred/munge
DataParserParameters = (null)
DebugFlags = Script
DefMemPerNode = 1048576
DependencyParameters = (null)
DisableRootJobs = no
EioTimeout = 60
EnforcePartLimits = NO
Epilog[0] = /opt/slurm_scripts/epilog.sh
EpilogMsgTime = 2000 usec
FairShareDampeningFactor = 1
FederationParameters = (null)
FirstJobId = 1
GetEnvTimeout = 2 sec
GresTypes = gpu
GpuFreqDef = (null)
GroupUpdateForce = 1
GroupUpdateTime = 600 sec
HASH_VAL = Match
HashPlugin = hash/k12
HealthCheckInterval = 0 sec
HealthCheckNodeState = ANY
HealthCheckProgram = (null)
InactiveLimit = 0 sec
InteractiveStepOptions = --interactive --preserve-env --pty $SHELL
JobAcctGatherFrequency = 30
JobAcctGatherType = (null)
JobAcctGatherParams = (null)
JobCompHost = localhost
JobCompLoc = (null)
JobCompParams = (null)
JobCompPort = 0
JobCompType = (null)
JobCompUser = root
JobContainerType = (null)
JobDefaults = DefCpuPerGPU=16
JobFileAppend = 0
JobRequeue = 1
JobSubmitPlugins = (null)
KillOnBadExit = 1
KillWait = 180 sec
LaunchParameters = use_interactive_step
Licenses = (null)
LogTimeFormat = iso8601_ms
MailDomain = (null)
MailProg = /usr/bin/true
MaxArraySize = 1024
MaxBatchRequeue = 5
MaxDBDMsgs = 0
MaxJobCount = 10000
MaxJobId = 67043328
MaxMemPerNode = UNLIMITED
MaxNodeCount = 1024
MaxStepCount = 40000
MaxTasksPerNode = 512
MCSPlugin = (null)
MCSParameters = (null)
MessageTimeout = 60 sec
MinJobAge = 86400 sec
MpiDefault = pmix
MpiParams = (null)
NEXT_JOB_ID = 4
NodeFeaturesPlugins = (null)
OverTimeLimit = 0 min
PluginDir = /usr/lib/x86_64-linux-gnu/slurm
PlugStackConfig = (null)
PreemptMode = REQUEUE
PreemptParameters = (null)
PreemptType = preempt/partition_prio
PreemptExemptTime = 00:00:00
PrEpParameters = (null)
PrEpPlugins = prep/script
PriorityParameters = (null)
PrioritySiteFactorParameters = (null)
PrioritySiteFactorPlugin = (null)
PriorityDecayHalfLife = 7-00:00:00
PriorityCalcPeriod = 00:05:00
PriorityFavorSmall = no
PriorityFlags =
PriorityMaxAge = 7-00:00:00
PriorityType = priority/multifactor
PriorityUsageResetPeriod = NONE
PriorityWeightAge = 0
PriorityWeightAssoc = 0
PriorityWeightFairShare = 0
PriorityWeightJobSize = 0
PriorityWeightPartition = 0
PriorityWeightQOS = 0
PriorityWeightTRES = (null)
PrivateData = none
ProctrackType = proctrack/cgroup
Prolog[0] = /opt/slurm_scripts/prolog.sh
PrologEpilogTimeout = 65534
PrologFlags = (null)
PropagatePrioProcess = 0
PropagateResourceLimits = NONE
PropagateResourceLimitsExcept = (null)
RebootProgram = (null)
ReconfigFlags = (null)
RequeueExit = (null)
RequeueExitHold = (null)
ResumeFailProgram = (null)
ResumeProgram = (null)
ResumeRate = 300 nodes/min
ResumeTimeout = 60 sec
ResvEpilog = (null)
ResvOverRun = 0 min
ResvProlog = (null)
ReturnToService = 2
SchedulerParameters = nohold_on_prolog_fail,extra_constraints
SchedulerTimeSlice = 30 sec
SchedulerType = sched/backfill
ScronParameters = enable,explicit_scancel
SelectType = select/cons_tres
SelectTypeParameters = CR_CORE_MEMORY,CR_CORE_DEFAULT_DIST_BLOCK
SlurmUser = root(0)
SlurmctldAddr = (null)
SlurmctldDebug = verbose
SlurmctldHost[0] = controller-0
SlurmctldLogFile = /dev/null
SlurmctldPort = 6817
SlurmctldSyslogDebug = (null)
SlurmctldPrimaryOffProg = (null)
SlurmctldPrimaryOnProg = (null)
SlurmctldTimeout = 30 sec
SlurmctldParameters = conmgr_max_connections=512,conmgr_threads=16
SlurmdDebug = verbose
SlurmdLogFile = /dev/null
SlurmdParameters = (null)
SlurmdPidFile = /var/run/slurmd.pid
SlurmdPort = 6818
SlurmdSpoolDir = /var/spool/slurmd
SlurmdSyslogDebug = (null)
SlurmdTimeout = 180 sec
SlurmdUser = root(0)
SlurmSchedLogFile = (null)
SlurmSchedLogLevel = 0
SlurmctldPidFile = /var/run/slurmctld.pid
SLURM_CONF = /etc/slurm/slurm.conf
SLURM_VERSION = 24.11.5
SrunEpilog = (null)
SrunPortRange = 0-0
SrunProlog = (null)
StateSaveLocation = /var/spool/slurmctld
SuspendExcNodes = (null)
SuspendExcParts = (null)
SuspendExcStates = (null)
SuspendProgram = (null)
SuspendRate = 60 nodes/min
SuspendTime = INFINITE
SuspendTimeout = 30 sec
SwitchParameters = (null)
SwitchType = (null)
TaskEpilog = (null)
TaskPlugin = task/cgroup,task/affinity
TaskPluginParam = (null type)
TaskProlog = (null)
TCPTimeout = 15 sec
TLSParameters = (null)
TLSType = tls/none
TmpFS = /tmp
TopologyParam = SwitchAsNodeRank
TopologyPlugin = topology/tree
TrackWCKey = no
TreeWidth = 16
UsePam = no
UnkillableStepProgram = (null)
UnkillableStepTimeout = 600 sec
VSizeFactor = 0 percent
WaitTime = 0 sec
X11Parameters = (null)
Cgroup Support Configuration:
AllowedRAMSpace = 100.0%
AllowedSwapSpace = 0.0%
CgroupMountpoint = /sys/fs/cgroup
CgroupPlugin = cgroup/v2
ConstrainCores = yes
ConstrainDevices = yes
ConstrainRAMSpace = yes
ConstrainSwapSpace = no
EnableControllers = yes
IgnoreSystemd = yes
IgnoreSystemdOnFailure = yes
MaxRAMPercent = 100.0%
MaxSwapPercent = 100.0%
MemorySwappiness = (null)
MinRAMSpace = 30MB
SystemdTimeout = 1000 ms
Slurmctld(primary) at controller-0 is UP