Skip to content

Cannot submit GPU job – Slurm reports Requested node configuration is not available #1521

@richiesebastian

Description

@richiesebastian

Hi Team,

When submitting a simple GPU job on the Slurm cluster, the job fails immediately with the following error:

Image

On the Slurm cluster, nodes are reporting the GPU GRES incorrectly. Instead of detecting the GPU type, the GRES field shows (null). This prevents job submission with --gres=gpu:1.

Image

Expected behavior:

GRES should correctly display gpu:nvidia_h100_80gb_hbm3:1.

Job requesting --gres=gpu:1 should be scheduled successfully.

Observed behavior:

GRES = (null) in sinfo.

Node remains idle but unusable.

Jobs fail immediately due to missing GPU configuration.

All the components seem to be up; am I missing something here?

Image

Also, please find the output of the Slurm configuration below:

root@login-0:~# scontrol show config
Configuration data as of 2025-08-29 16:22:49.UTC
AccountingStorageBackupHost = (null)
AccountingStorageEnforce = none
AccountingStorageHost = localhost
AccountingStorageExternalHost = (null)
AccountingStorageParameters = (null)
AccountingStoragePort = 0
AccountingStorageTRES = cpu,mem,energy,node,billing,fs/disk,vmem,pages
AccountingStorageType = (null)
AccountingStorageUser = root
AccountingStoreFlags = (null)
AcctGatherEnergyType = (null)
AcctGatherFilesystemType = (null)
AcctGatherInterconnectType = (null)
AcctGatherNodeFreq = 0 sec
AcctGatherProfileType = (null)
AllowSpecResourcesUsage = no
AuthAltTypes = (null)
AuthAltParameters = (null)
AuthInfo = (null)
AuthType = auth/munge
BatchStartTimeout = 10 sec
BcastExclude = /lib,/usr/lib,/lib64,/usr/lib64
BcastParameters = (null)
BOOT_TIME = 2025-08-29 16:10:16.UTC
BurstBufferType = (null)
CertmgrParameters = (null)
CertmgrType = (null)
CliFilterPlugins = cli_filter/user_defaults
ClusterName = avesha-slurm-k8s
CommunicationParameters = (null)
CompleteWait = 5 sec
CpuFreqDef = Unknown
CpuFreqGovernors = OnDemand,Performance,UserSpace
CredType = cred/munge
DataParserParameters = (null)
DebugFlags = Script
DefMemPerNode = 1048576
DependencyParameters = (null)
DisableRootJobs = no
EioTimeout = 60
EnforcePartLimits = NO
Epilog[0] = /opt/slurm_scripts/epilog.sh
EpilogMsgTime = 2000 usec
FairShareDampeningFactor = 1
FederationParameters = (null)
FirstJobId = 1
GetEnvTimeout = 2 sec
GresTypes = gpu
GpuFreqDef = (null)
GroupUpdateForce = 1
GroupUpdateTime = 600 sec
HASH_VAL = Match
HashPlugin = hash/k12
HealthCheckInterval = 0 sec
HealthCheckNodeState = ANY
HealthCheckProgram = (null)
InactiveLimit = 0 sec
InteractiveStepOptions = --interactive --preserve-env --pty $SHELL
JobAcctGatherFrequency = 30
JobAcctGatherType = (null)
JobAcctGatherParams = (null)
JobCompHost = localhost
JobCompLoc = (null)
JobCompParams = (null)
JobCompPort = 0
JobCompType = (null)
JobCompUser = root
JobContainerType = (null)
JobDefaults = DefCpuPerGPU=16
JobFileAppend = 0
JobRequeue = 1
JobSubmitPlugins = (null)
KillOnBadExit = 1
KillWait = 180 sec
LaunchParameters = use_interactive_step
Licenses = (null)
LogTimeFormat = iso8601_ms
MailDomain = (null)
MailProg = /usr/bin/true
MaxArraySize = 1024
MaxBatchRequeue = 5
MaxDBDMsgs = 0
MaxJobCount = 10000
MaxJobId = 67043328
MaxMemPerNode = UNLIMITED
MaxNodeCount = 1024
MaxStepCount = 40000
MaxTasksPerNode = 512
MCSPlugin = (null)
MCSParameters = (null)
MessageTimeout = 60 sec
MinJobAge = 86400 sec
MpiDefault = pmix
MpiParams = (null)
NEXT_JOB_ID = 4
NodeFeaturesPlugins = (null)
OverTimeLimit = 0 min
PluginDir = /usr/lib/x86_64-linux-gnu/slurm
PlugStackConfig = (null)
PreemptMode = REQUEUE
PreemptParameters = (null)
PreemptType = preempt/partition_prio
PreemptExemptTime = 00:00:00
PrEpParameters = (null)
PrEpPlugins = prep/script
PriorityParameters = (null)
PrioritySiteFactorParameters = (null)
PrioritySiteFactorPlugin = (null)
PriorityDecayHalfLife = 7-00:00:00
PriorityCalcPeriod = 00:05:00
PriorityFavorSmall = no
PriorityFlags =
PriorityMaxAge = 7-00:00:00
PriorityType = priority/multifactor
PriorityUsageResetPeriod = NONE
PriorityWeightAge = 0
PriorityWeightAssoc = 0
PriorityWeightFairShare = 0
PriorityWeightJobSize = 0
PriorityWeightPartition = 0
PriorityWeightQOS = 0
PriorityWeightTRES = (null)
PrivateData = none
ProctrackType = proctrack/cgroup
Prolog[0] = /opt/slurm_scripts/prolog.sh
PrologEpilogTimeout = 65534
PrologFlags = (null)
PropagatePrioProcess = 0
PropagateResourceLimits = NONE
PropagateResourceLimitsExcept = (null)
RebootProgram = (null)
ReconfigFlags = (null)
RequeueExit = (null)
RequeueExitHold = (null)
ResumeFailProgram = (null)
ResumeProgram = (null)
ResumeRate = 300 nodes/min
ResumeTimeout = 60 sec
ResvEpilog = (null)
ResvOverRun = 0 min
ResvProlog = (null)
ReturnToService = 2
SchedulerParameters = nohold_on_prolog_fail,extra_constraints
SchedulerTimeSlice = 30 sec
SchedulerType = sched/backfill
ScronParameters = enable,explicit_scancel
SelectType = select/cons_tres
SelectTypeParameters = CR_CORE_MEMORY,CR_CORE_DEFAULT_DIST_BLOCK
SlurmUser = root(0)
SlurmctldAddr = (null)
SlurmctldDebug = verbose
SlurmctldHost[0] = controller-0
SlurmctldLogFile = /dev/null
SlurmctldPort = 6817
SlurmctldSyslogDebug = (null)
SlurmctldPrimaryOffProg = (null)
SlurmctldPrimaryOnProg = (null)
SlurmctldTimeout = 30 sec
SlurmctldParameters = conmgr_max_connections=512,conmgr_threads=16
SlurmdDebug = verbose
SlurmdLogFile = /dev/null
SlurmdParameters = (null)
SlurmdPidFile = /var/run/slurmd.pid
SlurmdPort = 6818
SlurmdSpoolDir = /var/spool/slurmd
SlurmdSyslogDebug = (null)
SlurmdTimeout = 180 sec
SlurmdUser = root(0)
SlurmSchedLogFile = (null)
SlurmSchedLogLevel = 0
SlurmctldPidFile = /var/run/slurmctld.pid
SLURM_CONF = /etc/slurm/slurm.conf
SLURM_VERSION = 24.11.5
SrunEpilog = (null)
SrunPortRange = 0-0
SrunProlog = (null)
StateSaveLocation = /var/spool/slurmctld
SuspendExcNodes = (null)
SuspendExcParts = (null)
SuspendExcStates = (null)
SuspendProgram = (null)
SuspendRate = 60 nodes/min
SuspendTime = INFINITE
SuspendTimeout = 30 sec
SwitchParameters = (null)
SwitchType = (null)
TaskEpilog = (null)
TaskPlugin = task/cgroup,task/affinity
TaskPluginParam = (null type)
TaskProlog = (null)
TCPTimeout = 15 sec
TLSParameters = (null)
TLSType = tls/none
TmpFS = /tmp
TopologyParam = SwitchAsNodeRank
TopologyPlugin = topology/tree
TrackWCKey = no
TreeWidth = 16
UsePam = no
UnkillableStepProgram = (null)
UnkillableStepTimeout = 600 sec
VSizeFactor = 0 percent
WaitTime = 0 sec
X11Parameters = (null)

Cgroup Support Configuration:
AllowedRAMSpace = 100.0%
AllowedSwapSpace = 0.0%
CgroupMountpoint = /sys/fs/cgroup
CgroupPlugin = cgroup/v2
ConstrainCores = yes
ConstrainDevices = yes
ConstrainRAMSpace = yes
ConstrainSwapSpace = no
EnableControllers = yes
IgnoreSystemd = yes
IgnoreSystemdOnFailure = yes
MaxRAMPercent = 100.0%
MaxSwapPercent = 100.0%
MemorySwappiness = (null)
MinRAMSpace = 30MB
SystemdTimeout = 1000 ms

Slurmctld(primary) at controller-0 is UP

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions