From 982fc5d7450ae660981edf079b76866de323b158 Mon Sep 17 00:00:00 2001 From: Christopher Irving Date: Tue, 2 Jul 2024 15:29:32 +1000 Subject: [PATCH 01/15] First (incomplete) draft of improved pools Signed-off-by: Christopher Irving --- scripts/enqueue | 217 +++++++++++++++++++++++++++++++----------------- 1 file changed, 141 insertions(+), 76 deletions(-) diff --git a/scripts/enqueue b/scripts/enqueue index f47e8b9..b13b6dc 100755 --- a/scripts/enqueue +++ b/scripts/enqueue @@ -9,6 +9,108 @@ fi # 'if anything at all happens try and shut down our job cleanly' approach SIGNALS="INT TERM HUP QUIT KILL PIPE" + +# 1 2 3 4 5 6 7 8 9 +# system retry_period total_retries key no_lock_mods completion completion_timeout errortxt logfile keep_alive linux dtbflag file_count files + +EnqueueOneSystem() { + + system="$1" + retry_period="$2" + total_retries="$3" + key="$4" + no_lock_mods="$5" + completion="$6" + completion_timeout="$7" + errortxt="$8" + logfile="$9" + shift 9 + keep_alive="$1" + linux="$2" + dtbflag="$3" + file_count="$4" + files="$5" + + + if [ ! -z "${linux}" ]; then + if ! SystemBootsLinuxPXE "${system}" ; then + echo "System cannot boot Linux over the network." + exit 1 + fi + fi + + if [ ! -z "$dtbflag" ]; then + if ! SystemAcceptsDtb "${system}" ; then + echo "System does not accept a dtb" + exit 1 + fi + fi + + if [ ! -z "$dtbflag" ] && [ -z "$linux" ]; then + echo "A dtb may only be supplied when booting a linux image over the network." + exit 1 + fi + + # Check that the number of files specified is correct + if [ "$interact" != "-r" ] ; then + if ! SystemCorrectNumberOfFiles ${linux} "${system}" "${file_count}"; then + echo "Wrong number of files specified for system ${system}" + exit 1 + fi + fi + + if ${no_lock_mods} ; then + if ! LockIOwn "${system}"; then + echo "Failed to run because you said you own the lock (-n flag), but you don't!" + LockSystemPrintInfo "${system}" + exit 1 + fi + else + echo "Acquiring lock for ${system}" + + # Print out potentially useful information to the user about + # the current status of the lock + LockSystemPrintInfo "${system}" + + # We can setup the trap early as we check if we actually own the + # the lock before releasing it + trap "UnlockSystem \"${system}\" 0 \"${key}\"; exit 1" ${SIGNALS} + + if ! LockSystem "${system}" "${retry_period}" "${total_retries}" "${key}"; then + echo "Failed to acquire lock for system (${system})" + exit 2 + fi + fi + + echo "Lock acquired, we are allowed to run" + local ret=0 + + if [ "${interact}" = "-r" ]; then + echo "This is a reservation. You now own ${system} and" + echo "can do what you want. Press ctrl+d or enter here when done" + read line + echo "Attempting to power off machine now that you are done" + SystemPowerOff "${system}" + else + # Run the image. 'files' is deliberately not in quotes so the multiple + # files expands to multiple arguments + # 'dtbflag' is deliberately not in quotes so it expands to multiple + # parameters '-b' and 'my.dtb' + # 'linux' is deliberately not in quotes so it will not create an empty + # parameter if the flag is not set + SystemRunImage "${system}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" ${linux} ${dtbflag} $files + ret=$? + fi + + if ! ${no_lock_mods} ; then + UnlockSystem "${system}" 0 "${key}" + trap "exit 1" ${SIGNALS} + fi + + exit $ret + +} + EnqueueUsage() { echo "$0 run -r|-c -l logfile -s system [-w retry-time] [-t retry-count] [-n] [-a] [-d timeout] [-e ] [-k ] [-L] [-b ] -f file1 -f file2 .. -f filen" echo @@ -159,97 +261,60 @@ Enqueue() { exit 1 fi + + local ret=0 + # Verify the requested system exists - if ! IsSystemValid "${system}" ; then - # If it's not a system, is it a pool? - if ! IsPoolValid "${system}" ; then - echo "System or pool '$system' does not exist. Valid systems are" - SystemList - echo "Valid pools are" - PoolList - exit 1 - fi - # Select a system from the pool + if IsSystemValid "${system}" ; then + + # Everything is quoted so we'll pass on the exact same number of parameters every time + EnqueueOneSystem "${system}" "${retry_period}" "${total_retries}" "${key}" "${no_lock_mods}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" "${linux}" "${dtbflag}" "${file_count}" "${files}" + + ret=$? + + # If it's not a system, is it a pool? + elif IsPoolValid "${system}" ; then pool=${system} - system=$(GetRandomSystemFromPool_"${pool}") - echo "Selecting system '${system}' from pool '${pool}'" - fi - if [ ! -z "${linux}" ]; then - if ! SystemBootsLinuxPXE "${system}" ; then - echo "System cannot boot Linux over the network." - exit 1 - fi - fi + # When attempting to lock a pool, we use the retry time and retry count + # given by the user for how often we attempt locking on the whole pool, + # not for lock attempts on a specific system. + # Each time the user-configured period elapses, we try (but do not wait + # on) locking each system in the pool. - if [ ! -z "$dtbflag" ]; then - if ! SystemAcceptsDtb "${system}" ; then - echo "System does not accept a dtb" - exit 1 - fi - fi + lock_tries=0 - if [ ! -z "$dtbflag" ] && [ -z "$linux" ]; then - echo "A dtb may only be supplied when booting a linux image over the network." - exit 1 - fi + # TODO obviously broken, sh doesn't do maths + while "${total_retries}" == -1 || "${lock_tries}" < "${total_retries}" ; do - # Check that the number of files specified is correct - if [ "$interact" != "-r" ] ; then - if ! SystemCorrectNumberOfFiles ${linux} "${system}" "${file_count}"; then - echo "Wrong number of files specified for system ${system}" - exit 1 - fi - fi - if ${no_lock_mods} ; then - if ! LockIOwn "${system}"; then - echo "Failed to run because you said you own the lock (-n flag), but you don't!" - LockSystemPrintInfo "${system}" - exit 1 - fi - else - echo "Acquiring lock for ${system}" + # Loop through and try all systems in the pool. + for system in $(Systems_"${pool}"); do - # Print out potentially useful information to the user about - # the current status of the lock - LockSystemPrintInfo "${system}" + # Note we do not retry and give a small retry period (which is unused) + # We only want to attempt locking, not wait on it + EnqueueOneSystem "${system}" "1" "0" "${key}" "${no_lock_mods}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" "${linux}" "${dtbflag}" "${file_count}" "${files}" - # We can setup the trap early as we check if we actually own the - # the lock before releasing it - trap "UnlockSystem \"${system}\" 0 \"${key}\"; exit 1" ${SIGNALS} + ret=$? - if ! LockSystem "${system}" "${retry_period}" "${total_retries}" "${key}"; then - echo "Failed to acquire lock for system (${system})" - exit 2 - fi - fi + if [ "${ret}" == 0 ]; then + # Running succeeded on a board + exit "${ret}" + fi + done - echo "Lock acquired, we are allowed to run" - local ret=0 + lock_tries=$lock_tries+1 + + done - if [ "${interact}" = "-r" ]; then - echo "This is a reservation. You now own ${system} and" - echo "can do what you want. Press ctrl+d or enter here when done" - read line - echo "Attempting to power off machine now that you are done" - SystemPowerOff "${system}" else - # Run the image. 'files' is deliberately not in quotes so the multiple - # files expands to multiple arguments - # 'dtbflag' is deliberately not in quotes so it expands to multiple - # parameters '-b' and 'my.dtb' - # 'linux' is deliberately not in quotes so it will not create an empty - # parameter if the flag is not set - SystemRunImage "${system}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" ${linux} ${dtbflag} $files - ret=$? - fi - if ! ${no_lock_mods} ; then - UnlockSystem "${system}" 0 "${key}" - trap "exit 1" ${SIGNALS} + echo "System or pool '$system' does not exist. Valid systems are" + SystemList + echo "Valid pools are" + PoolList + exit 1 fi exit $ret - } From 4df6784329362d4bf80a9b523e9a4a19ee05bd6d Mon Sep 17 00:00:00 2001 From: Christopher Irving Date: Tue, 2 Jul 2024 15:54:34 +1000 Subject: [PATCH 02/15] Fixed arithmetic Signed-off-by: Christopher Irving --- scripts/enqueue | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/enqueue b/scripts/enqueue index b13b6dc..3d7660b 100755 --- a/scripts/enqueue +++ b/scripts/enqueue @@ -284,9 +284,7 @@ Enqueue() { lock_tries=0 - # TODO obviously broken, sh doesn't do maths - while "${total_retries}" == -1 || "${lock_tries}" < "${total_retries}" ; do - + while [ "${total_retries}" -eq -1 ] || [ "${lock_tries}" -lt "${total_retries}" ] ; do # Loop through and try all systems in the pool. for system in $(Systems_"${pool}"); do @@ -303,7 +301,7 @@ Enqueue() { fi done - lock_tries=$lock_tries+1 + lock_tries=$(($lock_tries+1)) done From e38b444dd73ef87a2fb170911b0f652d84ba9ed4 Mon Sep 17 00:00:00 2001 From: Christopher Irving Date: Tue, 2 Jul 2024 16:19:19 +1000 Subject: [PATCH 03/15] Adding retry wait Signed-off-by: Christopher Irving --- scripts/enqueue | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/enqueue b/scripts/enqueue index 3d7660b..7ec3f42 100755 --- a/scripts/enqueue +++ b/scripts/enqueue @@ -303,6 +303,8 @@ Enqueue() { lock_tries=$(($lock_tries+1)) + sleep "${retry_period}" + done else From 51f78d9de24887da6c872afc89517f3f746ff242 Mon Sep 17 00:00:00 2001 From: Christopher Irving Date: Wed, 3 Jul 2024 10:34:12 +1000 Subject: [PATCH 04/15] Compressing the if statement Signed-off-by: Christopher Irving --- scripts/enqueue | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/scripts/enqueue b/scripts/enqueue index 7ec3f42..b72b55f 100755 --- a/scripts/enqueue +++ b/scripts/enqueue @@ -291,12 +291,9 @@ Enqueue() { # Note we do not retry and give a small retry period (which is unused) # We only want to attempt locking, not wait on it - EnqueueOneSystem "${system}" "1" "0" "${key}" "${no_lock_mods}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" "${linux}" "${dtbflag}" "${file_count}" "${files}" - - ret=$? - - if [ "${ret}" == 0 ]; then + if EnqueueOneSystem "${system}" "1" "0" "${key}" "${no_lock_mods}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" "${linux}" "${dtbflag}" "${file_count}" "${files}" ; then # Running succeeded on a board + ret=$? exit "${ret}" fi done From de0cac74948c45896fd93e9dba8e40c30adc4e2a Mon Sep 17 00:00:00 2001 From: Christopher Irving Date: Wed, 3 Jul 2024 10:34:21 +1000 Subject: [PATCH 05/15] Preparing sem for pools Signed-off-by: Christopher Irving --- scripts/lock | 130 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 91 insertions(+), 39 deletions(-) diff --git a/scripts/lock b/scripts/lock index 063a38e..815e78b 100755 --- a/scripts/lock +++ b/scripts/lock @@ -282,47 +282,99 @@ UserLock() { *);; esac - IsSystemValid "${system}" - if [ $? -ne 0 ]; then - # If it's not a system, is it a pool? - IsPoolValid "${system}" - if [ $? -ne 0 ] ; then - echo "System or pool '$system' does not exist. Valid systems are" - SystemList - echo "Valid pools are" - PoolList - exit 1 - fi - # Select a system from the pool + if IsSystemValid "${system}" ; then + + case "${action}" in + -info) + LockSystemPrintInfo "${system}" + ;; + -mr-info) + LockSystemDumpInfo "${system}" + ;; + -signal) + UnlockSystem "${system}" "${force}" "${key}" + ;; + -wait) + if ! LockSystem "${system}" "${retry_period}" "${total_retries}" "${key}" $timeout; then + echo "Failed to acquire lock for system (${system})" + exit 2 + fi + ;; + -cancel) + CancelWait "${system}" "${key}" + ;; + *) + echo "Unknown usage" + UserLockUsage + exit 1 + ;; + esac + + elif IsPoolValid "${system}" ; then pool=${system} - system=$(GetRandomSystemFromPool_"${pool}") - echo "${system}" + + case "${action}" in + -info) + for system in $(Systems_"${pool}"); do + LockSystemPrintInfo "${system}" + done + ;; + -mr-info) + for system in $(Systems_"${pool}"); do + LockSystemDumpInfo "${system}" + done + ;; + -signal) + for system in $(Systems_"${pool}"); do + UnlockSystem "${system}" "${force}" "${key}" + done + ;; + -wait) + + # When attempting to lock a pool, we use the retry time and retry count + # given by the user for how often we attempt locking on the whole pool, + # not for lock attempts on a specific system. + # Each time the user-configured period elapses, we try (but do not wait + # on) locking each system in the pool. + + lock_tries=0 + local ret=0 + + while [ "${total_retries}" -eq -1 ] || [ "${lock_tries}" -lt "${total_retries}" ] ; do + + for system in $(Systems_"${pool}"); do + + # Note we do not retry and give a small retry period (which is unused) + # We only want to attempt locking, not wait on it + if LockSystem "${system}" "1" "0" "${key}" $timeout ; then + # Locking succeeded on a board + ret=$? + exit "${ret}" + fi + done + + lock_tries=$(($lock_tries+1)) + + sleep "${retry_period}" + done + ;; + -cancel) + CancelWait "${system}" "${key}" + ;; + *) + echo "Unknown usage" + UserLockUsage + exit 1 + ;; + esac + + else + echo "System or pool '$system' does not exist. Valid systems are" + SystemList + echo "Valid pools are" + PoolList + exit 1 fi - case "${action}" in - -info) - LockSystemPrintInfo "${system}" - ;; - -mr-info) - LockSystemDumpInfo "${system}" - ;; - -signal) - UnlockSystem "${system}" "${force}" "${key}" - ;; - -wait) - if ! LockSystem "${system}" "${retry_period}" "${total_retries}" "${key}" $timeout; then - echo "Failed to acquire lock for system (${system})" - exit 2 - fi - ;; - -cancel) - CancelWait "${system}" "${key}" - ;; - *) - echo "Unknown usage" - UserLockUsage - exit 1 - ;; - esac exit 0 } From 39ee83906101d79cefda063905bfa8a19d152fa7 Mon Sep 17 00:00:00 2001 From: Christopher Irving Date: Wed, 3 Jul 2024 10:50:26 +1000 Subject: [PATCH 06/15] Improving error messages Signed-off-by: Christopher Irving --- scripts/enqueue | 81 ++++++++++++++++++++++++++++--------------------- scripts/lock | 5 +++ 2 files changed, 51 insertions(+), 35 deletions(-) diff --git a/scripts/enqueue b/scripts/enqueue index b72b55f..9a90d12 100755 --- a/scripts/enqueue +++ b/scripts/enqueue @@ -34,14 +34,14 @@ EnqueueOneSystem() { if [ ! -z "${linux}" ]; then if ! SystemBootsLinuxPXE "${system}" ; then - echo "System cannot boot Linux over the network." + echo "System ${system} cannot boot Linux over the network." exit 1 fi fi if [ ! -z "$dtbflag" ]; then if ! SystemAcceptsDtb "${system}" ; then - echo "System does not accept a dtb" + echo "System ${system} does not accept a dtb" exit 1 fi fi @@ -76,39 +76,41 @@ EnqueueOneSystem() { # the lock before releasing it trap "UnlockSystem \"${system}\" 0 \"${key}\"; exit 1" ${SIGNALS} - if ! LockSystem "${system}" "${retry_period}" "${total_retries}" "${key}"; then - echo "Failed to acquire lock for system (${system})" - exit 2 + if LockSystem "${system}" "${retry_period}" "${total_retries}" "${key}"; then + + echo "Lock acquired, we are allowed to run" + local ret=0 + + if [ "${interact}" = "-r" ]; then + echo "This is a reservation. You now own ${system} and" + echo "can do what you want. Press ctrl+d or enter here when done" + read line + echo "Attempting to power off machine now that you are done" + SystemPowerOff "${system}" + else + # Run the image. 'files' is deliberately not in quotes so the multiple + # files expands to multiple arguments + # 'dtbflag' is deliberately not in quotes so it expands to multiple + # parameters '-b' and 'my.dtb' + # 'linux' is deliberately not in quotes so it will not create an empty + # parameter if the flag is not set + SystemRunImage "${system}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" ${linux} ${dtbflag} $files + ret=$? + fi + + if ! ${no_lock_mods} ; then + UnlockSystem "${system}" 0 "${key}" + trap "exit 1" ${SIGNALS} + fi + + exit $ret + + else + # Locking failed; reset the trap + # The outer Enqueue function will handle reporting the lock failure + trap "exit 1" ${SIGNALS} fi fi - - echo "Lock acquired, we are allowed to run" - local ret=0 - - if [ "${interact}" = "-r" ]; then - echo "This is a reservation. You now own ${system} and" - echo "can do what you want. Press ctrl+d or enter here when done" - read line - echo "Attempting to power off machine now that you are done" - SystemPowerOff "${system}" - else - # Run the image. 'files' is deliberately not in quotes so the multiple - # files expands to multiple arguments - # 'dtbflag' is deliberately not in quotes so it expands to multiple - # parameters '-b' and 'my.dtb' - # 'linux' is deliberately not in quotes so it will not create an empty - # parameter if the flag is not set - SystemRunImage "${system}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" ${linux} ${dtbflag} $files - ret=$? - fi - - if ! ${no_lock_mods} ; then - UnlockSystem "${system}" 0 "${key}" - trap "exit 1" ${SIGNALS} - fi - - exit $ret - } EnqueueUsage() { @@ -268,9 +270,14 @@ Enqueue() { if IsSystemValid "${system}" ; then # Everything is quoted so we'll pass on the exact same number of parameters every time - EnqueueOneSystem "${system}" "${retry_period}" "${total_retries}" "${key}" "${no_lock_mods}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" "${linux}" "${dtbflag}" "${file_count}" "${files}" + if EnqueueOneSystem "${system}" "${retry_period}" "${total_retries}" "${key}" "${no_lock_mods}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" "${linux}" "${dtbflag}" "${file_count}" "${files}" ; then + ret=$? + exit "${ret}" + fi - ret=$? + # If we get here, the enqueing failed + echo "Failed to acquire lock for system (${system})" + exit 2 # If it's not a system, is it a pool? elif IsPoolValid "${system}" ; then @@ -304,6 +311,10 @@ Enqueue() { done + # If we get here, we've exhausted the specified number of retries + echo "Failed to acquire lock for any system in pool (${pool})" + exit 2 + else echo "System or pool '$system' does not exist. Valid systems are" diff --git a/scripts/lock b/scripts/lock index 815e78b..1a9eff9 100755 --- a/scripts/lock +++ b/scripts/lock @@ -357,6 +357,11 @@ UserLock() { sleep "${retry_period}" done + + # If we reach this point, we've exhausted all retry attempts + echo "Failed to acquire lock for any system in pool (${pool})" + exit 2 + ;; -cancel) CancelWait "${system}" "${key}" From 9314fd707c4d0349c30fd60ee3bd69b327850ea1 Mon Sep 17 00:00:00 2001 From: Christopher Irving Date: Wed, 3 Jul 2024 11:55:04 +1000 Subject: [PATCH 07/15] Some improvements to changed structure for pools Signed-off-by: Christopher Irving --- scripts/enqueue | 7 ++++--- scripts/lock | 12 +++++++++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/scripts/enqueue b/scripts/enqueue index 9a90d12..0e61f25 100755 --- a/scripts/enqueue +++ b/scripts/enqueue @@ -109,6 +109,7 @@ EnqueueOneSystem() { # Locking failed; reset the trap # The outer Enqueue function will handle reporting the lock failure trap "exit 1" ${SIGNALS} + return 1 fi fi } @@ -291,7 +292,9 @@ Enqueue() { lock_tries=0 - while [ "${total_retries}" -eq -1 ] || [ "${lock_tries}" -lt "${total_retries}" ] ; do + # Loop until total_retries + 1 because we want to make the initial attempt, + # and then retry the correct number of times + while [ "${total_retries}" -eq -1 -o "${lock_tries}" -lt "$((${total_retries}+1))" ] ; do # Loop through and try all systems in the pool. for system in $(Systems_"${pool}"); do @@ -323,6 +326,4 @@ Enqueue() { PoolList exit 1 fi - - exit $ret } diff --git a/scripts/lock b/scripts/lock index 1a9eff9..4169eef 100755 --- a/scripts/lock +++ b/scripts/lock @@ -340,11 +340,15 @@ UserLock() { lock_tries=0 local ret=0 - while [ "${total_retries}" -eq -1 ] || [ "${lock_tries}" -lt "${total_retries}" ] ; do + # Loop until total_retries + 1 because we want to make the initial attempt, + # and then retry the correct number of times + while [ "${total_retries}" -eq -1 -o "${lock_tries}" -lt "$((${total_retries}+1))" ] ; do + # Loop through and try all systems in the pool. for system in $(Systems_"${pool}"); do - # Note we do not retry and give a small retry period (which is unused) + # Note we do not retry the individual lock attempt, + # and give only a small retry period (which is unused) # We only want to attempt locking, not wait on it if LockSystem "${system}" "1" "0" "${key}" $timeout ; then # Locking succeeded on a board @@ -364,7 +368,9 @@ UserLock() { ;; -cancel) - CancelWait "${system}" "${key}" + for system in $(Systems_"${pool}"); do + CancelWait "${system}" "${key}" + done ;; *) echo "Unknown usage" From 1d215250e24752ccb2cd933fd3901211fba1b838 Mon Sep 17 00:00:00 2001 From: Christopher Irving Date: Wed, 3 Jul 2024 12:22:55 +1000 Subject: [PATCH 08/15] Changed lock success message to be solely machine name Signed-off-by: Christopher Irving --- scripts/lock | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/lock b/scripts/lock index 4169eef..c7c9b03 100755 --- a/scripts/lock +++ b/scripts/lock @@ -352,6 +352,8 @@ UserLock() { # We only want to attempt locking, not wait on it if LockSystem "${system}" "1" "0" "${key}" $timeout ; then # Locking succeeded on a board + # We need to tell the user which one + echo "${system}" ret=$? exit "${ret}" fi From 35e6d89bf991847cc047ce74ee7a3ee32127bee3 Mon Sep 17 00:00:00 2001 From: Christopher Irving Date: Tue, 9 Jul 2024 16:29:49 +1000 Subject: [PATCH 09/15] Some fixes to pool locking loop Signed-off-by: Christopher Irving --- scripts/enqueue | 8 ++++++-- scripts/lock | 9 +++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/scripts/enqueue b/scripts/enqueue index 0e61f25..100d927 100755 --- a/scripts/enqueue +++ b/scripts/enqueue @@ -292,9 +292,13 @@ Enqueue() { lock_tries=0 + # Need to use a different name so as not to collide with other local variables + loop_retries="$total_retries" + loop_period="$retry_period" + # Loop until total_retries + 1 because we want to make the initial attempt, # and then retry the correct number of times - while [ "${total_retries}" -eq -1 -o "${lock_tries}" -lt "$((${total_retries}+1))" ] ; do + while [ "${loop_retries}" -eq -1 -o "${lock_tries}" -lt "$((${loop_retries}+1))" ] ; do # Loop through and try all systems in the pool. for system in $(Systems_"${pool}"); do @@ -310,7 +314,7 @@ Enqueue() { lock_tries=$(($lock_tries+1)) - sleep "${retry_period}" + sleep "${loop_period}" done diff --git a/scripts/lock b/scripts/lock index c7c9b03..b861a77 100755 --- a/scripts/lock +++ b/scripts/lock @@ -340,9 +340,13 @@ UserLock() { lock_tries=0 local ret=0 + # Need to use a different name so as not to collide with other local variables + loop_retries="$total_retries" + loop_period="$retry_period" + # Loop until total_retries + 1 because we want to make the initial attempt, # and then retry the correct number of times - while [ "${total_retries}" -eq -1 -o "${lock_tries}" -lt "$((${total_retries}+1))" ] ; do + while [ "${loop_retries}" -eq -1 ] || [ "${lock_tries}" -lt "$((${loop_retries}+1))" ] ; do # Loop through and try all systems in the pool. for system in $(Systems_"${pool}"); do @@ -361,7 +365,8 @@ UserLock() { lock_tries=$(($lock_tries+1)) - sleep "${retry_period}" + sleep "${loop_period}" + done # If we reach this point, we've exhausted all retry attempts From 1fa53c0545146f808ddf8abfb40d9e27d73e6cb5 Mon Sep 17 00:00:00 2001 From: Christopher Irving Date: Wed, 10 Jul 2024 10:34:30 +1000 Subject: [PATCH 10/15] Spelling correction Signed-off-by: Christopher Irving --- scripts/lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lock b/scripts/lock index b861a77..ff744ae 100755 --- a/scripts/lock +++ b/scripts/lock @@ -190,7 +190,7 @@ UserLockUsage() { echo " -wait SYSTEM Acquire the lock for the specified SYSTEM" echo " -cancel SYSTEM Cancel '-wait' processes on the server that are waiting for specified SYSTEM and key" echo " -w TIME Number of seconds to wait between each attempt to acquire the lock (default 8)" - echo " -t RETRIES Number of retries to preform for acquiring the lock (default -1)" + echo " -t RETRIES Number of retries to perform for acquiring the lock (default -1)" echo " -f Forcefully releases a lock even if you are not the owner" echo " -k LOCK_KEY Set a key inside the lock" echo " -T timeout Allow lock to be reclaimed after timeout seconds" From b0bf0c88bd0729c46d01fc5c51a50b3c08b9dde0 Mon Sep 17 00:00:00 2001 From: Christopher Irving Date: Wed, 10 Jul 2024 11:32:24 +1000 Subject: [PATCH 11/15] Flattening out the ifs in enqueueOneSystem Signed-off-by: Christopher Irving --- scripts/enqueue | 60 ++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/scripts/enqueue b/scripts/enqueue index 100d927..2578dad 100755 --- a/scripts/enqueue +++ b/scripts/enqueue @@ -63,7 +63,7 @@ EnqueueOneSystem() { if ! LockIOwn "${system}"; then echo "Failed to run because you said you own the lock (-n flag), but you don't!" LockSystemPrintInfo "${system}" - exit 1 + return 1 fi else echo "Acquiring lock for ${system}" @@ -76,42 +76,40 @@ EnqueueOneSystem() { # the lock before releasing it trap "UnlockSystem \"${system}\" 0 \"${key}\"; exit 1" ${SIGNALS} - if LockSystem "${system}" "${retry_period}" "${total_retries}" "${key}"; then - - echo "Lock acquired, we are allowed to run" - local ret=0 - - if [ "${interact}" = "-r" ]; then - echo "This is a reservation. You now own ${system} and" - echo "can do what you want. Press ctrl+d or enter here when done" - read line - echo "Attempting to power off machine now that you are done" - SystemPowerOff "${system}" - else - # Run the image. 'files' is deliberately not in quotes so the multiple - # files expands to multiple arguments - # 'dtbflag' is deliberately not in quotes so it expands to multiple - # parameters '-b' and 'my.dtb' - # 'linux' is deliberately not in quotes so it will not create an empty - # parameter if the flag is not set - SystemRunImage "${system}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" ${linux} ${dtbflag} $files - ret=$? - fi - - if ! ${no_lock_mods} ; then - UnlockSystem "${system}" 0 "${key}" - trap "exit 1" ${SIGNALS} - fi - - exit $ret - - else + if ! LockSystem "${system}" "${retry_period}" "${total_retries}" "${key}"; then # Locking failed; reset the trap # The outer Enqueue function will handle reporting the lock failure trap "exit 1" ${SIGNALS} return 1 fi fi + + echo "Lock acquired, we are allowed to run" + local ret=0 + + if [ "${interact}" = "-r" ]; then + echo "This is a reservation. You now own ${system} and" + echo "can do what you want. Press ctrl+d or enter here when done" + read line + echo "Attempting to power off machine now that you are done" + SystemPowerOff "${system}" + else + # Run the image. 'files' is deliberately not in quotes so the multiple + # files expands to multiple arguments + # 'dtbflag' is deliberately not in quotes so it expands to multiple + # parameters '-b' and 'my.dtb' + # 'linux' is deliberately not in quotes so it will not create an empty + # parameter if the flag is not set + SystemRunImage "${system}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" ${linux} ${dtbflag} $files + ret=$? + fi + + if ! ${no_lock_mods} ; then + UnlockSystem "${system}" 0 "${key}" + trap "exit 1" ${SIGNALS} + fi + + exit $ret } EnqueueUsage() { From 0e091d01bd67e5216355c2a3770d7b2b53155c76 Mon Sep 17 00:00:00 2001 From: Christopher Irving Date: Wed, 10 Jul 2024 11:59:04 +1000 Subject: [PATCH 12/15] Added prints to show which board is currently being attempted Signed-off-by: Christopher Irving --- scripts/enqueue | 5 +++++ scripts/lock | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/scripts/enqueue b/scripts/enqueue index 2578dad..8727def 100755 --- a/scripts/enqueue +++ b/scripts/enqueue @@ -301,6 +301,8 @@ Enqueue() { # Loop through and try all systems in the pool. for system in $(Systems_"${pool}"); do + echo "Attempting system ${system} in pool ${pool}" + # Note we do not retry and give a small retry period (which is unused) # We only want to attempt locking, not wait on it if EnqueueOneSystem "${system}" "1" "0" "${key}" "${no_lock_mods}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" "${linux}" "${dtbflag}" "${file_count}" "${files}" ; then @@ -308,8 +310,11 @@ Enqueue() { ret=$? exit "${ret}" fi + done + echo "Done attempting system ${system} in pool ${pool}" + lock_tries=$(($lock_tries+1)) sleep "${loop_period}" diff --git a/scripts/lock b/scripts/lock index ff744ae..9ceedd4 100755 --- a/scripts/lock +++ b/scripts/lock @@ -351,6 +351,8 @@ UserLock() { # Loop through and try all systems in the pool. for system in $(Systems_"${pool}"); do + echo "Attempting system ${system} in pool ${pool}" + # Note we do not retry the individual lock attempt, # and give only a small retry period (which is unused) # We only want to attempt locking, not wait on it @@ -363,6 +365,8 @@ UserLock() { fi done + echo "Done attempting system ${system} in pool ${pool}" + lock_tries=$(($lock_tries+1)) sleep "${loop_period}" From 37250783c25955ee38282dd1506582968e05dd38 Mon Sep 17 00:00:00 2001 From: Christopher Irving Date: Wed, 31 Jul 2024 10:30:23 +1000 Subject: [PATCH 13/15] First attempt at requiring key in run -n Signed-off-by: Christopher Irving --- scripts/enqueue | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/scripts/enqueue b/scripts/enqueue index 8727def..7599ead 100755 --- a/scripts/enqueue +++ b/scripts/enqueue @@ -65,6 +65,19 @@ EnqueueOneSystem() { LockSystemPrintInfo "${system}" return 1 fi + lockkey=$(LockGetKey "${system}") + if [ "${lockkey}" = "0" -a "${key}" = "0" ] + then + # No key on lock or user is fine + echo "Attempting to grab lock you already own. Please explicitly release" + return 1 + fi + if [ "${lockkey}" != "${key}" ] + then + echo "Failed to run because you said you own the lock (-n flag), but you gave the wrong key!" + return 2 + fi + else echo "Acquiring lock for ${system}" From f54fead209c0de1d237851d3c994f3d535cb0be1 Mon Sep 17 00:00:00 2001 From: Christopher Irving Date: Wed, 31 Jul 2024 11:01:15 +1000 Subject: [PATCH 14/15] Fixing name collision which broke pools on repeated attempts Signed-off-by: Christopher Irving --- scripts/lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/lock b/scripts/lock index 9ceedd4..30f9b4a 100755 --- a/scripts/lock +++ b/scripts/lock @@ -97,7 +97,7 @@ LockSystem() { total_retries="$3" key="$4" newtimeout="$5" - timeout="" + oldtimeout="" owner=$(LockOwner "${system}") # Error checking - the same person can't relock the same lock @@ -120,21 +120,21 @@ LockSystem() { then # Ensure timeout is either an integer or ignored # to avoid lockfile syntax errors - timeout="$(LockTimeout $system)" || timeout="" - case $timeout in + oldtimeout="$(LockTimeout $system)" || oldtimeout="" + case $oldtimeout in ''|*[!0-9]*) # no valid timeout - timeout="" + oldtimeout="" ;; *) # valid timeout - timeout="-l ${timeout}" + oldtimeout="-l ${oldtimeout}" ;; esac fi lockname="$(LockName ${system})" keyname="$(KeyName ${system})" - RemoteCommand "umask 0111 && lockfile -'${retry_period}' $timeout -r '${total_retries}' '$lockname' && rm -f '$keyname' && printf '${key}' > '$keyname' && chmod a-w,g+r '$keyname' && chmod u+w $lockname && printf '$newtimeout' > $lockname && chmod a-w $lockname" + RemoteCommand "umask 0111 && lockfile -'${retry_period}' $oldtimeout -r '${total_retries}' '$lockname' && rm -f '$keyname' && printf '${key}' > '$keyname' && chmod a-w,g+r '$keyname' && chmod u+w $lockname && printf '$newtimeout' > $lockname && chmod a-w $lockname" [ "$?" -ne 0 ] && return 3 From bed55ab84509184aed025dea63b6abec6f2419d9 Mon Sep 17 00:00:00 2001 From: Christopher Irving Date: Wed, 31 Jul 2024 12:30:09 +1000 Subject: [PATCH 15/15] Removing incorrect lock check Signed-off-by: Christopher Irving --- scripts/enqueue | 6 ------ 1 file changed, 6 deletions(-) diff --git a/scripts/enqueue b/scripts/enqueue index 7599ead..217cd0c 100755 --- a/scripts/enqueue +++ b/scripts/enqueue @@ -66,12 +66,6 @@ EnqueueOneSystem() { return 1 fi lockkey=$(LockGetKey "${system}") - if [ "${lockkey}" = "0" -a "${key}" = "0" ] - then - # No key on lock or user is fine - echo "Attempting to grab lock you already own. Please explicitly release" - return 1 - fi if [ "${lockkey}" != "${key}" ] then echo "Failed to run because you said you own the lock (-n flag), but you gave the wrong key!"