From c5e5d99b07a3638453922a2ca18f96c10909d270 Mon Sep 17 00:00:00 2001 From: Janek Date: Fri, 5 Dec 2025 16:41:09 +0100 Subject: [PATCH 01/38] claude gh workflow self-hosted-runner on entropy --- .github/scripts/run_tiny_remote.sh | 35 ++++++++++++++++++++++++++++++ .github/workflows/tiny_remote.yml | 27 +++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 .github/scripts/run_tiny_remote.sh create mode 100644 .github/workflows/tiny_remote.yml diff --git a/.github/scripts/run_tiny_remote.sh b/.github/scripts/run_tiny_remote.sh new file mode 100644 index 00000000..479edb47 --- /dev/null +++ b/.github/scripts/run_tiny_remote.sh @@ -0,0 +1,35 @@ +#!/bin/bash -l + +#SBATCH --cpus-per-gpu=16 +#SBATCH --gres=gpu:1 +#SBATCH --job-name=tiny_remote_ci +#SBATCH --mem-per-gpu=125G +#SBATCH --nodes=1 +#SBATCH --partition=a100 +#SBATCH --time=00:10:00 + +#---------- SCRIPT ---------- +export PROJECT_HOME_PATH=/storage_ssd_1/nano +export HF_HOME=$PROJECT_HOME_PATH/hf_cache +export HYDRA_FULL_ERROR=1 +export PIXI_HOME=/storage_ssd_1/nano/pixi +export PATH="$PIXI_HOME/bin:$PATH" +export XDG_DATA_HOME="$PIXI_HOME/data" +export XDG_CACHE_HOME="$PIXI_HOME/cache" +export XDG_STATE_HOME="$PIXI_HOME/state" +cd "$PIXI_HOME" +eval "$(pixi shell-hook)" +cd - +#-------- SCRIPT END -------- + +export MASTER_ADDR=$(scontrol show hostname ${SLURM_NODELIST} | head -n 1) +export MASTER_PORT=$((40000 + ${SLURM_JOB_ID} % 10000)) + +srun torchrun --nnodes=${SLURM_NNODES}\ + --nproc-per-node=${SLURM_GPUS_ON_NODE} \ + --rdzv-id=${SLURM_JOBID} \ + --rdzv-backend=c10d \ + --rdzv-endpoint=${MASTER_ADDR}:${MASTER_PORT} \ + main.py \ + --config-path=configs \ + --config-name=tiny_remote diff --git a/.github/workflows/tiny_remote.yml b/.github/workflows/tiny_remote.yml new file mode 100644 index 00000000..c28944d0 --- /dev/null +++ b/.github/workflows/tiny_remote.yml @@ -0,0 +1,27 @@ +name: Run tiny_remote on PR + +on: + pull_request: + branches: + - main + +jobs: + tiny_remote: + 
runs-on: self-hosted + steps: + - uses: actions/checkout@v4 + + - name: Submit tiny_remote to SLURM + run: | + chmod +x .github/scripts/run_tiny_remote.sh + sbatch --wait .github/scripts/run_tiny_remote.sh + + - name: Check results + if: always() + run: | + # Display the latest slurm output file + latest_out=$(ls -t slurm-*.out 2>/dev/null | head -1) + if [ -n "$latest_out" ]; then + echo "=== SLURM Job Output ===" + cat "$latest_out" + fi From fa3f7f7b83f1d560073bfdde97a1cbc937f50a7c Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 8 Dec 2025 11:06:06 +0100 Subject: [PATCH 02/38] added manual runner trigger for testing --- .github/workflows/tiny_remote.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tiny_remote.yml b/.github/workflows/tiny_remote.yml index c28944d0..dc2b7b79 100644 --- a/.github/workflows/tiny_remote.yml +++ b/.github/workflows/tiny_remote.yml @@ -4,6 +4,7 @@ on: pull_request: branches: - main + workflow_dispatch: jobs: tiny_remote: From 4c7fb9eeb534d452c2872738a892ea65160d30ae Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 8 Dec 2025 16:46:25 +0100 Subject: [PATCH 03/38] refactor pr remote tests --- ..._tiny_remote.sh => run_remote_pr_check.sh} | 16 ++++-- .github/workflows/run_tests_on_entropy.yml | 50 +++++++++++++++++++ .github/workflows/tiny_remote.yml | 28 ----------- configs/pr_tests/test_core.yaml | 39 +++++++++++++++ 4 files changed, 101 insertions(+), 32 deletions(-) rename .github/scripts/{run_tiny_remote.sh => run_remote_pr_check.sh} (71%) create mode 100644 .github/workflows/run_tests_on_entropy.yml delete mode 100644 .github/workflows/tiny_remote.yml create mode 100644 configs/pr_tests/test_core.yaml diff --git a/.github/scripts/run_tiny_remote.sh b/.github/scripts/run_remote_pr_check.sh similarity index 71% rename from .github/scripts/run_tiny_remote.sh rename to .github/scripts/run_remote_pr_check.sh index 479edb47..d15c5d9e 100644 --- a/.github/scripts/run_tiny_remote.sh +++ 
b/.github/scripts/run_remote_pr_check.sh @@ -1,13 +1,21 @@ #!/bin/bash -l #SBATCH --cpus-per-gpu=16 -#SBATCH --gres=gpu:1 -#SBATCH --job-name=tiny_remote_ci +#SBATCH --gres=gpu:2 +#SBATCH --job-name=pr_check_entropy #SBATCH --mem-per-gpu=125G #SBATCH --nodes=1 #SBATCH --partition=a100 #SBATCH --time=00:10:00 +# PR_TEST_CONFIG_NAME is passed from the workflow via --export=ALL +if [ -z "$PR_TEST_CONFIG_NAME" ]; then + echo "Error: PR_TEST_CONFIG_NAME not set" + exit 1 +fi + +echo "Running CI check for config: $PR_TEST_CONFIG_NAME" + #---------- SCRIPT ---------- export PROJECT_HOME_PATH=/storage_ssd_1/nano export HF_HOME=$PROJECT_HOME_PATH/hf_cache @@ -31,5 +39,5 @@ srun torchrun --nnodes=${SLURM_NNODES}\ --rdzv-backend=c10d \ --rdzv-endpoint=${MASTER_ADDR}:${MASTER_PORT} \ main.py \ - --config-path=configs \ - --config-name=tiny_remote + --config-path=configs/pr_tests \ + --config-name=$PR_TEST_CONFIG_NAME diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml new file mode 100644 index 00000000..d8139067 --- /dev/null +++ b/.github/workflows/run_tests_on_entropy.yml @@ -0,0 +1,50 @@ +name: PR Tests + +on: + pull_request: + branches: + - main + workflow_dispatch: + +jobs: + # First, discover all PR test configs + discover-configs: + runs-on: [self-hosted, entropy] + outputs: + configs: ${{ steps.find-configs.outputs.configs }} + steps: + - uses: actions/checkout@v4 + + - name: Find PR test configs + id: find-configs + run: | + # Find all yaml files in configs/pr_tests/ and output as JSON array + configs=$(ls configs/pr_tests/*.yaml 2>/dev/null | xargs -I {} basename {} .yaml | jq -R -s -c 'split("\n") | map(select(length > 0))') + echo "configs=$configs" >> $GITHUB_OUTPUT + echo "Found configs: $configs" + + # Run each config as a separate job + run-pr-test: + needs: discover-configs + runs-on: [self-hosted, entropy] + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.discover-configs.outputs.configs) 
}} + steps: + - uses: actions/checkout@v4 + + - name: Run PR test - ${{ matrix.config }} + run: | + chmod +x .github/scripts/run_remote_pr_check.sh + export PR_TEST_CONFIG_NAME="${{ matrix.config }}" + sbatch --wait --job-name="pr_test_${{ matrix.config }}" --export=ALL .github/scripts/run_remote_pr_check.sh + + - name: Display SLURM output + if: always() + run: | + latest_out=$(ls -t slurm-*.out 2>/dev/null | head -1) + if [ -n "$latest_out" ]; then + echo "=== SLURM Output for ${{ matrix.config }} ===" + cat "$latest_out" + fi diff --git a/.github/workflows/tiny_remote.yml b/.github/workflows/tiny_remote.yml deleted file mode 100644 index dc2b7b79..00000000 --- a/.github/workflows/tiny_remote.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Run tiny_remote on PR - -on: - pull_request: - branches: - - main - workflow_dispatch: - -jobs: - tiny_remote: - runs-on: self-hosted - steps: - - uses: actions/checkout@v4 - - - name: Submit tiny_remote to SLURM - run: | - chmod +x .github/scripts/run_tiny_remote.sh - sbatch --wait .github/scripts/run_tiny_remote.sh - - - name: Check results - if: always() - run: | - # Display the latest slurm output file - latest_out=$(ls -t slurm-*.out 2>/dev/null | head -1) - if [ -n "$latest_out" ]; then - echo "=== SLURM Job Output ===" - cat "$latest_out" - fi diff --git a/configs/pr_tests/test_core.yaml b/configs/pr_tests/test_core.yaml new file mode 100644 index 00000000..a16732be --- /dev/null +++ b/configs/pr_tests/test_core.yaml @@ -0,0 +1,39 @@ +defaults: + - _cluster@_here_: entropy_a100 + - _model@_here_: tiny + - _trainer@_here_: llama + - _dataset@_here_: c4 + - _checkpoints@_here_: none + - _misc@_here_: default + - _eval@_here_: default + +common: + sequence_length: 128 + batch_size: 16 + +trainer: + gradient_accumulation_steps: 1 + n_steps: 100 + learning_rate: 1e-3 + + checkpoint: + save: + type: huggingface + path: checkpoint + +infrastructure: + + metric_logger: + name: test_core + tags: + - nano + - pr_test + - core + + slurm: 
+ time: "00:10:00" + gres: gpu:2 + job-name: ${infrastructure.metric_logger.name} + +evaluator: + limit: 5 \ No newline at end of file From 516f47acec5f714a557a9eb1785d2fea48d00f4b Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 8 Dec 2025 16:53:40 +0100 Subject: [PATCH 04/38] claude fix --- .github/scripts/run_remote_pr_check.sh | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/.github/scripts/run_remote_pr_check.sh b/.github/scripts/run_remote_pr_check.sh index d15c5d9e..e40994fc 100644 --- a/.github/scripts/run_remote_pr_check.sh +++ b/.github/scripts/run_remote_pr_check.sh @@ -8,6 +8,9 @@ #SBATCH --partition=a100 #SBATCH --time=00:10:00 +set -e # Exit on error +set -x # Print commands for debugging + # PR_TEST_CONFIG_NAME is passed from the workflow via --export=ALL if [ -z "$PR_TEST_CONFIG_NAME" ]; then echo "Error: PR_TEST_CONFIG_NAME not set" @@ -25,14 +28,25 @@ export PATH="$PIXI_HOME/bin:$PATH" export XDG_DATA_HOME="$PIXI_HOME/data" export XDG_CACHE_HOME="$PIXI_HOME/cache" export XDG_STATE_HOME="$PIXI_HOME/state" -cd "$PIXI_HOME" -eval "$(pixi shell-hook)" -cd - + +# Save current directory and setup pixi +ORIGINAL_DIR="$(pwd)" +cd "$PIXI_HOME" || { echo "Failed to cd to $PIXI_HOME"; exit 1; } +eval "$(pixi shell-hook)" || { echo "Failed to run pixi shell-hook"; exit 1; } +cd "$ORIGINAL_DIR" || { echo "Failed to return to original directory"; exit 1; } #-------- SCRIPT END -------- +# Change to project directory +echo "Changing to project directory: $PROJECT_HOME_PATH" +cd "$PROJECT_HOME_PATH" || { echo "Failed to cd to $PROJECT_HOME_PATH"; exit 1; } + export MASTER_ADDR=$(scontrol show hostname ${SLURM_NODELIST} | head -n 1) export MASTER_PORT=$((40000 + ${SLURM_JOB_ID} % 10000)) +echo "Running training with config: $PR_TEST_CONFIG_NAME" +echo "MASTER_ADDR: $MASTER_ADDR" +echo "MASTER_PORT: $MASTER_PORT" + srun torchrun --nnodes=${SLURM_NNODES}\ --nproc-per-node=${SLURM_GPUS_ON_NODE} \ --rdzv-id=${SLURM_JOBID} \ From 
6b04c31bca6ee84635d955108ee9274a358bf57d Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 8 Dec 2025 16:57:02 +0100 Subject: [PATCH 05/38] claude debug messages --- .github/workflows/run_tests_on_entropy.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml index d8139067..935d6379 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -19,9 +19,15 @@ jobs: id: find-configs run: | # Find all yaml files in configs/pr_tests/ and output as JSON array + echo "Searching for configs in configs/pr_tests/..." + ls -la configs/pr_tests/ || echo "Directory not found or empty" + configs=$(ls configs/pr_tests/*.yaml 2>/dev/null | xargs -I {} basename {} .yaml | jq -R -s -c 'split("\n") | map(select(length > 0))') echo "configs=$configs" >> $GITHUB_OUTPUT - echo "Found configs: $configs" + + echo "=== Found PR test configs ===" + echo "$configs" | jq -r '.[]' + echo "Total: $(echo "$configs" | jq 'length') config(s)" # Run each config as a separate job run-pr-test: @@ -36,9 +42,12 @@ jobs: - name: Run PR test - ${{ matrix.config }} run: | + echo "=== Starting PR test for config: ${{ matrix.config }} ===" chmod +x .github/scripts/run_remote_pr_check.sh export PR_TEST_CONFIG_NAME="${{ matrix.config }}" + echo "Submitting SLURM job..." sbatch --wait --job-name="pr_test_${{ matrix.config }}" --export=ALL .github/scripts/run_remote_pr_check.sh + echo "SLURM job completed with exit code: $?" 
- name: Display SLURM output if: always() From 91a6675963adc174acbbb6c826bfd9192c6ab8c1 Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 8 Dec 2025 17:00:33 +0100 Subject: [PATCH 06/38] claude more debug messages --- .github/workflows/run_tests_on_entropy.yml | 24 ++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml index 935d6379..06f34583 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -43,11 +43,31 @@ jobs: - name: Run PR test - ${{ matrix.config }} run: | echo "=== Starting PR test for config: ${{ matrix.config }} ===" + echo "Current directory: $(pwd)" chmod +x .github/scripts/run_remote_pr_check.sh export PR_TEST_CONFIG_NAME="${{ matrix.config }}" echo "Submitting SLURM job..." - sbatch --wait --job-name="pr_test_${{ matrix.config }}" --export=ALL .github/scripts/run_remote_pr_check.sh - echo "SLURM job completed with exit code: $?" + + # Submit job and capture the job ID + job_output=$(sbatch --wait --job-name="pr_test_${{ matrix.config }}" --output="slurm-%j.out" --export=ALL .github/scripts/run_remote_pr_check.sh) + exit_code=$? + job_id=$(echo "$job_output" | grep -oP 'Submitted batch job \K\d+') + + echo "$job_output" + echo "Job ID: $job_id" + echo "SLURM job completed with exit code: $exit_code" + + # Try to display the output immediately + if [ -f "slurm-${job_id}.out" ]; then + echo "=== SLURM Output (slurm-${job_id}.out) ===" + cat "slurm-${job_id}.out" + else + echo "Warning: SLURM output file slurm-${job_id}.out not found in $(pwd)" + echo "Looking for any slurm output files..." 
+ ls -la slurm-*.out 2>/dev/null || echo "No slurm output files found" + fi + + exit $exit_code - name: Display SLURM output if: always() From 32f748b7f8f00b49022cc8997f212536b769676e Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 8 Dec 2025 17:02:59 +0100 Subject: [PATCH 07/38] jm fix --- .github/scripts/run_remote_pr_check.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/run_remote_pr_check.sh b/.github/scripts/run_remote_pr_check.sh index e40994fc..2008e753 100644 --- a/.github/scripts/run_remote_pr_check.sh +++ b/.github/scripts/run_remote_pr_check.sh @@ -33,7 +33,7 @@ export XDG_STATE_HOME="$PIXI_HOME/state" ORIGINAL_DIR="$(pwd)" cd "$PIXI_HOME" || { echo "Failed to cd to $PIXI_HOME"; exit 1; } eval "$(pixi shell-hook)" || { echo "Failed to run pixi shell-hook"; exit 1; } -cd "$ORIGINAL_DIR" || { echo "Failed to return to original directory"; exit 1; } +cd - #-------- SCRIPT END -------- # Change to project directory From 84baafe6ed40cd5c3f7e8351405da76e9563a439 Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 8 Dec 2025 17:07:49 +0100 Subject: [PATCH 08/38] jm fix --- .github/scripts/run_remote_pr_check.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/scripts/run_remote_pr_check.sh b/.github/scripts/run_remote_pr_check.sh index 2008e753..d7d527ae 100644 --- a/.github/scripts/run_remote_pr_check.sh +++ b/.github/scripts/run_remote_pr_check.sh @@ -30,16 +30,11 @@ export XDG_CACHE_HOME="$PIXI_HOME/cache" export XDG_STATE_HOME="$PIXI_HOME/state" # Save current directory and setup pixi -ORIGINAL_DIR="$(pwd)" cd "$PIXI_HOME" || { echo "Failed to cd to $PIXI_HOME"; exit 1; } eval "$(pixi shell-hook)" || { echo "Failed to run pixi shell-hook"; exit 1; } cd - #-------- SCRIPT END -------- -# Change to project directory -echo "Changing to project directory: $PROJECT_HOME_PATH" -cd "$PROJECT_HOME_PATH" || { echo "Failed to cd to $PROJECT_HOME_PATH"; exit 1; } - export MASTER_ADDR=$(scontrol show hostname 
${SLURM_NODELIST} | head -n 1) export MASTER_PORT=$((40000 + ${SLURM_JOB_ID} % 10000)) From ff5b6ad97450bb24dbe0654e1f8853c2fdbd78e9 Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 8 Dec 2025 17:11:52 +0100 Subject: [PATCH 09/38] jm fix config default paths --- configs/pr_tests/test_core.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/configs/pr_tests/test_core.yaml b/configs/pr_tests/test_core.yaml index a16732be..c35d017a 100644 --- a/configs/pr_tests/test_core.yaml +++ b/configs/pr_tests/test_core.yaml @@ -1,11 +1,11 @@ defaults: - - _cluster@_here_: entropy_a100 - - _model@_here_: tiny - - _trainer@_here_: llama - - _dataset@_here_: c4 - - _checkpoints@_here_: none - - _misc@_here_: default - - _eval@_here_: default + - ../_cluster@_here_: entropy_a100 + - ../_model@_here_: tiny + - ../_trainer@_here_: llama + - ../_dataset@_here_: c4 + - ../_checkpoints@_here_: none + - ../_misc@_here_: default + - ../_eval@_here_: default common: sequence_length: 128 From c5dd9c9cc175a00f778a80ca36166b313e013fc6 Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 8 Dec 2025 17:41:31 +0100 Subject: [PATCH 10/38] jm remove eval from config - it doesn't work on > 1 gpu --- configs/pr_tests/test_core.yaml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/configs/pr_tests/test_core.yaml b/configs/pr_tests/test_core.yaml index c35d017a..3ff73e72 100644 --- a/configs/pr_tests/test_core.yaml +++ b/configs/pr_tests/test_core.yaml @@ -5,7 +5,7 @@ defaults: - ../_dataset@_here_: c4 - ../_checkpoints@_here_: none - ../_misc@_here_: default - - ../_eval@_here_: default + - ../_eval@_here_: none common: sequence_length: 128 @@ -33,7 +33,4 @@ infrastructure: slurm: time: "00:10:00" gres: gpu:2 - job-name: ${infrastructure.metric_logger.name} - -evaluator: - limit: 5 \ No newline at end of file + job-name: ${infrastructure.metric_logger.name} \ No newline at end of file From 23e0a1cabeee6e6ae4550e112caa9ffdd9b3f0e4 Mon Sep 17 00:00:00 
2001 From: Janek Date: Mon, 8 Dec 2025 18:48:39 +0100 Subject: [PATCH 11/38] add eval test --- configs/pr_tests/test_core.yaml | 1 + configs/pr_tests/test_eval.yaml | 40 +++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 configs/pr_tests/test_eval.yaml diff --git a/configs/pr_tests/test_core.yaml b/configs/pr_tests/test_core.yaml index 3ff73e72..0f638689 100644 --- a/configs/pr_tests/test_core.yaml +++ b/configs/pr_tests/test_core.yaml @@ -29,6 +29,7 @@ infrastructure: - nano - pr_test - core + - train slurm: time: "00:10:00" diff --git a/configs/pr_tests/test_eval.yaml b/configs/pr_tests/test_eval.yaml new file mode 100644 index 00000000..ee6fea52 --- /dev/null +++ b/configs/pr_tests/test_eval.yaml @@ -0,0 +1,40 @@ +defaults: + - ../_cluster@_here_: entropy_a100 + - ../_model@_here_: tiny + - ../_trainer@_here_: llama + - ../_dataset@_here_: c4 + - ../_checkpoints@_here_: none + - ../_misc@_here_: default + - ../_eval@_here_: default + +common: + sequence_length: 128 + batch_size: 16 + +trainer: + gradient_accumulation_steps: 1 + n_steps: 100 + learning_rate: 1e-3 + + checkpoint: + save: + type: huggingface + path: checkpoint + +infrastructure: + + metric_logger: + name: test_core + tags: + - nano + - pr_test + - core + - eval_1gpu + + slurm: + time: "00:10:00" + gres: gpu:1 + job-name: ${infrastructure.metric_logger.name} + +eval: + limit: 10 \ No newline at end of file From 561fc890a57098f38609f9418fdc375750b72c9c Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 8 Dec 2025 18:52:56 +0100 Subject: [PATCH 12/38] add eval dummy --- configs/pr_tests/test_eval.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/pr_tests/test_eval.yaml b/configs/pr_tests/test_eval.yaml index ee6fea52..169bba66 100644 --- a/configs/pr_tests/test_eval.yaml +++ b/configs/pr_tests/test_eval.yaml @@ -5,7 +5,7 @@ defaults: - ../_dataset@_here_: c4 - ../_checkpoints@_here_: none - ../_misc@_here_: default - - 
../_eval@_here_: default + - ../_eval@_here_: none common: sequence_length: 128 @@ -36,5 +36,5 @@ infrastructure: gres: gpu:1 job-name: ${infrastructure.metric_logger.name} -eval: - limit: 10 \ No newline at end of file +# eval: +# limit: 10 \ No newline at end of file From c0a6c14348fd35dd64330a8dd8399d2024ff2956 Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 8 Dec 2025 18:57:48 +0100 Subject: [PATCH 13/38] add eval dummy --- configs/pr_tests/test_eval.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/pr_tests/test_eval.yaml b/configs/pr_tests/test_eval.yaml index 169bba66..1835f843 100644 --- a/configs/pr_tests/test_eval.yaml +++ b/configs/pr_tests/test_eval.yaml @@ -33,7 +33,7 @@ infrastructure: slurm: time: "00:10:00" - gres: gpu:1 + gres: gpu:2 job-name: ${infrastructure.metric_logger.name} # eval: From e359840cc0b7d85e686d6c2ae975a3c7330e03d7 Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 8 Dec 2025 19:10:23 +0100 Subject: [PATCH 14/38] claude cancel old commit jobs --- .github/workflows/run_tests_on_entropy.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml index 06f34583..49f7fd93 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -6,6 +6,11 @@ on: - main workflow_dispatch: +# Cancel previous runs when new commits are pushed +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: # First, discover all PR test configs discover-configs: From dfa14f1858847d42ce411a31ed9bca1c9b625d99 Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 8 Dec 2025 19:14:22 +0100 Subject: [PATCH 15/38] back to 1 test --- configs/{pr_tests => }/test_eval.yaml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename configs/{pr_tests => }/test_eval.yaml (100%) diff --git a/configs/pr_tests/test_eval.yaml 
b/configs/test_eval.yaml similarity index 100% rename from configs/pr_tests/test_eval.yaml rename to configs/test_eval.yaml From 5f1afaf392a6b590edca5515db6689c0689d7ce3 Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 8 Dec 2025 19:17:05 +0100 Subject: [PATCH 16/38] scancel on cancel --- .github/workflows/run_tests_on_entropy.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml index 49f7fd93..5c1818d6 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -46,6 +46,7 @@ jobs: - uses: actions/checkout@v4 - name: Run PR test - ${{ matrix.config }} + id: run-test run: | echo "=== Starting PR test for config: ${{ matrix.config }} ===" echo "Current directory: $(pwd)" @@ -60,6 +61,7 @@ jobs: echo "$job_output" echo "Job ID: $job_id" + echo "job_id=$job_id" >> $GITHUB_OUTPUT echo "SLURM job completed with exit code: $exit_code" # Try to display the output immediately @@ -74,6 +76,12 @@ jobs: exit $exit_code + - name: Cancel SLURM job on workflow cancellation + if: cancelled() && steps.run-test.outputs.job_id + run: | + echo "Workflow cancelled, cancelling SLURM job ${{ steps.run-test.outputs.job_id }}" + scancel ${{ steps.run-test.outputs.job_id }} || echo "Failed to cancel SLURM job (may have already completed)" + - name: Display SLURM output if: always() run: | From 136122dd3292d0b7111c158bee58dbcad9cba5fa Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 8 Dec 2025 19:46:40 +0100 Subject: [PATCH 17/38] sth --- configs/test_eval.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/test_eval.yaml b/configs/test_eval.yaml index 1835f843..363f716c 100644 --- a/configs/test_eval.yaml +++ b/configs/test_eval.yaml @@ -24,12 +24,12 @@ trainer: infrastructure: metric_logger: - name: test_core + name: test_core2 tags: - nano - pr_test - core - - eval_1gpu + - core2 slurm: time: "00:10:00" From 
ecf8198d0797e7446935ff459fbd06f811e24d7e Mon Sep 17 00:00:00 2001 From: Janek Date: Mon, 8 Dec 2025 21:41:43 +0100 Subject: [PATCH 18/38] 2 test configs --- configs/{ => pr_tests}/test_eval.yaml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename configs/{ => pr_tests}/test_eval.yaml (100%) diff --git a/configs/test_eval.yaml b/configs/pr_tests/test_eval.yaml similarity index 100% rename from configs/test_eval.yaml rename to configs/pr_tests/test_eval.yaml From 94c4f87162bcf1950ad19165cf9dfeeaad71e627 Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 10:26:07 +0100 Subject: [PATCH 19/38] gpt: fix slurm canceling on test cancel --- .github/workflows/run_tests_on_entropy.yml | 90 ++++++++++++++-------- 1 file changed, 59 insertions(+), 31 deletions(-) diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml index 5c1818d6..1dae9b08 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -44,44 +44,70 @@ jobs: config: ${{ fromJson(needs.discover-configs.outputs.configs) }} steps: - uses: actions/checkout@v4 - - - name: Run PR test - ${{ matrix.config }} - id: run-test + # 1) Submit SLURM job and expose job_id + - name: Submit SLURM job - ${{ matrix.config }} + - id: submit run: | - echo "=== Starting PR test for config: ${{ matrix.config }} ===" - echo "Current directory: $(pwd)" + echo "=== Submitting PR test for config: ${{ matrix.config }} ===" chmod +x .github/scripts/run_remote_pr_check.sh - export PR_TEST_CONFIG_NAME="${{ matrix.config }}" - echo "Submitting SLURM job..." - - # Submit job and capture the job ID - job_output=$(sbatch --wait --job-name="pr_test_${{ matrix.config }}" --output="slurm-%j.out" --export=ALL .github/scripts/run_remote_pr_check.sh) - exit_code=$? 
- job_id=$(echo "$job_output" | grep -oP 'Submitted batch job \K\d+') - - echo "$job_output" - echo "Job ID: $job_id" - echo "job_id=$job_id" >> $GITHUB_OUTPUT - echo "SLURM job completed with exit code: $exit_code" - - # Try to display the output immediately - if [ -f "slurm-${job_id}.out" ]; then - echo "=== SLURM Output (slurm-${job_id}.out) ===" - cat "slurm-${job_id}.out" - else - echo "Warning: SLURM output file slurm-${job_id}.out not found in $(pwd)" - echo "Looking for any slurm output files..." - ls -la slurm-*.out 2>/dev/null || echo "No slurm output files found" - fi - exit $exit_code + # --parsable makes sbatch output just the job ID (or jobID;something) + job_output=$(sbatch \ + --parsable \ + --job-name="pr_test_${{ matrix.config }}" \ + --output="slurm-%j.out" \ + --export=ALL \ + .github/scripts/run_remote_pr_check.sh) + + # Usually it's either "12345" or "12345;stuff" + job_id="${job_output%%;*}" + + echo "Submitted job: $job_output" + echo "Parsed job_id: $job_id" + + # Expose as step output + echo "job_id=$job_id" >> "$GITHUB_OUTPUT" + # 2) Wait for SLURM job to finish (polling) + - name: Wait for SLURM job + id: wait + run: | + job_id="${{ steps.submit.outputs.job_id }}" + echo "Waiting for job $job_id to finish..." + + # Simple polling loop; you can tune sleep / commands as you like + while true; do + state=$(sacct -j "$job_id" --format=State --noheader | head -1 | awk '{print $1}') + echo "Current state: $state" + + case "$state" in + COMPLETED) + echo "Job completed successfully." 
+ exit 0 + ;; + FAILED|CANCELLED|TIMEOUT) + echo "Job ended with state: $state" + exit 1 + ;; + ""|PENDING|RUNNING|CONFIGURING|SUSPENDED) + sleep 10 + ;; + *) + echo "Unknown state: $state" + sleep 10 + ;; + esac + done + + # 3) Cancel SLURM job if workflow is cancelled - name: Cancel SLURM job on workflow cancellation - if: cancelled() && steps.run-test.outputs.job_id + if: cancelled() && steps.submit.outputs.job_id run: | - echo "Workflow cancelled, cancelling SLURM job ${{ steps.run-test.outputs.job_id }}" - scancel ${{ steps.run-test.outputs.job_id }} || echo "Failed to cancel SLURM job (may have already completed)" + job_id="${{ steps.submit.outputs.job_id }}" + echo "Workflow cancelled, cancelling SLURM job $job_id" + scancel "$job_id" || echo "Failed to cancel SLURM job (may have already completed)" + # 4) Always try to show SLURM output - name: Display SLURM output if: always() run: | @@ -89,4 +115,6 @@ jobs: if [ -n "$latest_out" ]; then echo "=== SLURM Output for ${{ matrix.config }} ===" cat "$latest_out" + else + echo "No slurm-*.out files found." 
fi From 4bc43678d7e78c2cb69c022521857961507e0ce2 Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 10:26:55 +0100 Subject: [PATCH 20/38] sth --- .github/workflows/run_tests_on_entropy.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml index 1dae9b08..c41b308e 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -17,6 +17,7 @@ jobs: runs-on: [self-hosted, entropy] outputs: configs: ${{ steps.find-configs.outputs.configs }} + steps: - uses: actions/checkout@v4 @@ -42,6 +43,7 @@ jobs: fail-fast: false matrix: config: ${{ fromJson(needs.discover-configs.outputs.configs) }} + steps: - uses: actions/checkout@v4 # 1) Submit SLURM job and expose job_id From 67a4d7750a68f101fd2eeb9c9c7f3a6b10363586 Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 10:34:58 +0100 Subject: [PATCH 21/38] fix --- .github/workflows/run_tests_on_entropy.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml index c41b308e..22567708 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -43,12 +43,11 @@ jobs: fail-fast: false matrix: config: ${{ fromJson(needs.discover-configs.outputs.configs) }} - steps: - uses: actions/checkout@v4 # 1) Submit SLURM job and expose job_id - name: Submit SLURM job - ${{ matrix.config }} - - id: submit + id: submit run: | echo "=== Submitting PR test for config: ${{ matrix.config }} ===" chmod +x .github/scripts/run_remote_pr_check.sh From 816bf2d8ecb4170840c6042c2a14dd26c5bb53be Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 10:40:16 +0100 Subject: [PATCH 22/38] fix --- .github/workflows/run_tests_on_entropy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_tests_on_entropy.yml 
b/.github/workflows/run_tests_on_entropy.yml index 22567708..529e42db 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -57,7 +57,7 @@ jobs: --parsable \ --job-name="pr_test_${{ matrix.config }}" \ --output="slurm-%j.out" \ - --export=ALL \ + --export=ALL,PR_TEST_CONFIG_NAME=${{ matrix.config }} \ .github/scripts/run_remote_pr_check.sh) # Usually it's either "12345" or "12345;stuff" From 7f675a9d0779511b462c5c472ec191b769fa650f Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 10:43:50 +0100 Subject: [PATCH 23/38] test --- .github/workflows/run_tests_on_entropy.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml index 529e42db..b9652446 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -43,6 +43,7 @@ jobs: fail-fast: false matrix: config: ${{ fromJson(needs.discover-configs.outputs.configs) }} + steps: - uses: actions/checkout@v4 # 1) Submit SLURM job and expose job_id From e9188bc9cf01b715fbc9af5b178ae63d89d7e0da Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 10:44:01 +0100 Subject: [PATCH 24/38] test --- .github/workflows/run_tests_on_entropy.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml index b9652446..529e42db 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -43,7 +43,6 @@ jobs: fail-fast: false matrix: config: ${{ fromJson(needs.discover-configs.outputs.configs) }} - steps: - uses: actions/checkout@v4 # 1) Submit SLURM job and expose job_id From 25837939ba3168126404cb39522489c14526c3ab Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 11:05:23 +0100 Subject: [PATCH 25/38] gpt: parallel jobs (slurm array) --- .github/scripts/run_remote_pr_check.sh | 35 ++++-- 
.github/workflows/run_tests_on_entropy.yml | 118 +++++++++++++-------- 2 files changed, 101 insertions(+), 52 deletions(-) diff --git a/.github/scripts/run_remote_pr_check.sh b/.github/scripts/run_remote_pr_check.sh index d7d527ae..42febe54 100644 --- a/.github/scripts/run_remote_pr_check.sh +++ b/.github/scripts/run_remote_pr_check.sh @@ -8,12 +8,27 @@ #SBATCH --partition=a100 #SBATCH --time=00:10:00 -set -e # Exit on error -set -x # Print commands for debugging +set -euo pipefail # exit on error, treat unset vars as error +set -x # print commands for debugging + +# --- Resolve PR_TEST_CONFIG_NAME from array index --- + +if [ -z "${PR_TEST_CONFIGS_FILE:-}" ]; then + echo "Error: PR_TEST_CONFIGS_FILE not set" + exit 1 +fi + +if [ -z "${SLURM_ARRAY_TASK_ID:-}" ]; then + echo "Error: SLURM_ARRAY_TASK_ID not set (are you running as an array job?)" + exit 1 +fi + +# SLURM_ARRAY_TASK_ID is 0-based; sed is 1-based +line=$((SLURM_ARRAY_TASK_ID + 1)) +PR_TEST_CONFIG_NAME=$(sed -n "${line}p" "$PR_TEST_CONFIGS_FILE" || true) -# PR_TEST_CONFIG_NAME is passed from the workflow via --export=ALL if [ -z "$PR_TEST_CONFIG_NAME" ]; then - echo "Error: PR_TEST_CONFIG_NAME not set" + echo "Error: failed to resolve config for SLURM_ARRAY_TASK_ID=$SLURM_ARRAY_TASK_ID from $PR_TEST_CONFIGS_FILE" exit 1 fi @@ -35,18 +50,18 @@ eval "$(pixi shell-hook)" || { echo "Failed to run pixi shell-hook"; exit 1; } cd - #-------- SCRIPT END -------- -export MASTER_ADDR=$(scontrol show hostname ${SLURM_NODELIST} | head -n 1) +export MASTER_ADDR=$(scontrol show hostname "${SLURM_NODELIST}" | head -n 1) export MASTER_PORT=$((40000 + ${SLURM_JOB_ID} % 10000)) echo "Running training with config: $PR_TEST_CONFIG_NAME" echo "MASTER_ADDR: $MASTER_ADDR" echo "MASTER_PORT: $MASTER_PORT" -srun torchrun --nnodes=${SLURM_NNODES}\ - --nproc-per-node=${SLURM_GPUS_ON_NODE} \ - --rdzv-id=${SLURM_JOBID} \ +srun torchrun --nnodes="${SLURM_NNODES}" \ + --nproc-per-node="${SLURM_GPUS_ON_NODE}" \ + 
--rdzv-id="${SLURM_JOBID}" \ --rdzv-backend=c10d \ - --rdzv-endpoint=${MASTER_ADDR}:${MASTER_PORT} \ + --rdzv-endpoint="${MASTER_ADDR}:${MASTER_PORT}" \ main.py \ --config-path=configs/pr_tests \ - --config-name=$PR_TEST_CONFIG_NAME + --config-name="$PR_TEST_CONFIG_NAME" diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml index 529e42db..5b8d0a40 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -6,13 +6,12 @@ on: - main workflow_dispatch: -# Cancel previous runs when new commits are pushed concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true jobs: - # First, discover all PR test configs + # 1) Discover configs discover-configs: runs-on: [self-hosted, entropy] outputs: @@ -24,70 +23,80 @@ jobs: - name: Find PR test configs id: find-configs run: | - # Find all yaml files in configs/pr_tests/ and output as JSON array echo "Searching for configs in configs/pr_tests/..." 
ls -la configs/pr_tests/ || echo "Directory not found or empty" configs=$(ls configs/pr_tests/*.yaml 2>/dev/null | xargs -I {} basename {} .yaml | jq -R -s -c 'split("\n") | map(select(length > 0))') - echo "configs=$configs" >> $GITHUB_OUTPUT + echo "configs=$configs" >> "$GITHUB_OUTPUT" echo "=== Found PR test configs ===" echo "$configs" | jq -r '.[]' echo "Total: $(echo "$configs" | jq 'length') config(s)" - # Run each config as a separate job - run-pr-test: + # 2) Submit & manage one SLURM array for all configs + run-pr-tests: needs: discover-configs runs-on: [self-hosted, entropy] - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.discover-configs.outputs.configs) }} + env: + CONFIGS_JSON: ${{ needs.discover-configs.outputs.configs }} + steps: - uses: actions/checkout@v4 - # 1) Submit SLURM job and expose job_id - - name: Submit SLURM job - ${{ matrix.config }} + + - name: Submit SLURM array id: submit run: | - echo "=== Submitting PR test for config: ${{ matrix.config }} ===" - chmod +x .github/scripts/run_remote_pr_check.sh + echo "Configs JSON: $CONFIGS_JSON" + + # Write config names (one per line) into a file + configs_file="$PWD/pr_tests_configs.txt" + echo "$CONFIGS_JSON" | jq -r '.[]' > "$configs_file" + + num_configs=$(wc -l < "$configs_file") + echo "Found $num_configs config(s)" + + if [ "$num_configs" -eq 0 ]; then + echo "No PR test configs found, nothing to run." 
+ # No job_id set -> later steps will do nothing + exit 0 + fi + + last_index=$((num_configs - 1)) + echo "Submitting SLURM array 0-$last_index" - # --parsable makes sbatch output just the job ID (or jobID;something) job_output=$(sbatch \ --parsable \ - --job-name="pr_test_${{ matrix.config }}" \ - --output="slurm-%j.out" \ - --export=ALL,PR_TEST_CONFIG_NAME=${{ matrix.config }} \ + --array=0-"$last_index" \ + --job-name="pr_tests" \ + --output="slurm-%A_%a.out" \ + --export=ALL,PR_TEST_CONFIGS_FILE="$configs_file" \ .github/scripts/run_remote_pr_check.sh) - # Usually it's either "12345" or "12345;stuff" job_id="${job_output%%;*}" - echo "Submitted job: $job_output" + echo "Submitted array job: $job_output" echo "Parsed job_id: $job_id" - # Expose as step output echo "job_id=$job_id" >> "$GITHUB_OUTPUT" - # 2) Wait for SLURM job to finish (polling) - - name: Wait for SLURM job + - name: Wait for SLURM array + if: steps.submit.outputs.job_id id: wait run: | job_id="${{ steps.submit.outputs.job_id }}" - echo "Waiting for job $job_id to finish..." + echo "Waiting for SLURM array job $job_id to finish..." - # Simple polling loop; you can tune sleep / commands as you like while true; do state=$(sacct -j "$job_id" --format=State --noheader | head -1 | awk '{print $1}') - echo "Current state: $state" + echo "Current array state: $state" case "$state" in COMPLETED) - echo "Job completed successfully." + echo "Array job completed successfully." 
exit 0 ;; FAILED|CANCELLED|TIMEOUT) - echo "Job ended with state: $state" + echo "Array job ended with state: $state" exit 1 ;; ""|PENDING|RUNNING|CONFIGURING|SUSPENDED) @@ -100,22 +109,47 @@ jobs: esac done - # 3) Cancel SLURM job if workflow is cancelled - - name: Cancel SLURM job on workflow cancellation - if: cancelled() && steps.submit.outputs.job_id + - name: Summarize array results and show failed logs + if: always() && steps.submit.outputs.job_id run: | job_id="${{ steps.submit.outputs.job_id }}" - echo "Workflow cancelled, cancelling SLURM job $job_id" - scancel "$job_id" || echo "Failed to cancel SLURM job (may have already completed)" + configs_file="$PWD/pr_tests_configs.txt" - # 4) Always try to show SLURM output - - name: Display SLURM output - if: always() - run: | - latest_out=$(ls -t slurm-*.out 2>/dev/null | head -1) - if [ -n "$latest_out" ]; then - echo "=== SLURM Output for ${{ matrix.config }} ===" - cat "$latest_out" + if [ ! -f "$configs_file" ]; then + echo "No configs file found at $configs_file, nothing to summarize." + exit 0 + fi + + echo "=== Per-config SLURM task states ===" + + i=0 + failures=0 + while read -r cfg; do + # SLURM child job id: _ + task_job="${job_id}_$i" + + state=$(sacct -j "$task_job" --format=State --noheader | head -1 | awk '{print $1}') + echo "[$i] config=${cfg} state=${state}" + + if [ "$state" != "COMPLETED" ]; then + failures=$((failures + 1)) + out_file="slurm-${job_id}_${i}.out" + + echo "---- BEGIN log for FAILED config: ${cfg} (task ${i}) ----" + if [ -f "$out_file" ]; then + cat "$out_file" + else + echo "No slurm output file found (expected: ${out_file})" + fi + echo "---- END log for FAILED config: ${cfg} ----" + fi + + i=$((i + 1)) + done < "$configs_file" + + if [ "$failures" -gt 0 ]; then + echo "Some configs failed: $failures failing task(s)." + exit 1 else - echo "No slurm-*.out files found." + echo "All configs completed successfully." 
fi From 9095beb0405cbc17020bafa7edff8efcf48b80ff Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 11:11:15 +0100 Subject: [PATCH 26/38] sth --- .github/workflows/run_tests_on_entropy.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml index 5b8d0a40..c093b4fb 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -16,7 +16,6 @@ jobs: runs-on: [self-hosted, entropy] outputs: configs: ${{ steps.find-configs.outputs.configs }} - steps: - uses: actions/checkout@v4 From 1f7c1641981297c7e3687e2a96841a3eb098adfd Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 11:15:58 +0100 Subject: [PATCH 27/38] gpt fix --- .github/workflows/run_tests_on_entropy.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml index c093b4fb..1caf8417 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -71,10 +71,17 @@ jobs: --export=ALL,PR_TEST_CONFIGS_FILE="$configs_file" \ .github/scripts/run_remote_pr_check.sh) - job_id="${job_output%%;*}" + echo "Raw sbatch output: $job_output" - echo "Submitted array job: $job_output" - echo "Parsed job_id: $job_id" + # Handle cases like: + # 123456 + # 123456;cluster + # 123456_0 + # 123456_0;cluster + tmp="${job_output%%;*}" # strip anything after ';' + job_id="${tmp%%_*}" # strip anything after '_' (array task suffix) + + echo "Parsed base job_id: $job_id" echo "job_id=$job_id" >> "$GITHUB_OUTPUT" From c437fb90994a7092fe48b8849ea502cf347f256e Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 11:19:21 +0100 Subject: [PATCH 28/38] sth --- .github/workflows/run_tests_on_entropy.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml 
index 1caf8417..dd8091b1 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -16,6 +16,7 @@ jobs: runs-on: [self-hosted, entropy] outputs: configs: ${{ steps.find-configs.outputs.configs }} + steps: - uses: actions/checkout@v4 From 08c06d09e968d8de7242bc90ad63574b4fe7b62b Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 11:22:06 +0100 Subject: [PATCH 29/38] sth --- .github/workflows/run_tests_on_entropy.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml index dd8091b1..1caf8417 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -16,7 +16,6 @@ jobs: runs-on: [self-hosted, entropy] outputs: configs: ${{ steps.find-configs.outputs.configs }} - steps: - uses: actions/checkout@v4 From 94da876b91f30be3a03b4470b2aca9072930d94b Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 11:24:45 +0100 Subject: [PATCH 30/38] sth --- .github/workflows/run_tests_on_entropy.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml index 1caf8417..7020922d 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -16,6 +16,7 @@ jobs: runs-on: [self-hosted, entropy] outputs: configs: ${{ steps.find-configs.outputs.configs }} + steps: - uses: actions/checkout@v4 @@ -110,7 +111,8 @@ jobs: ;; *) echo "Unknown state: $state" - sleep 10 + echo "Array job ended with state: $state" + exit 1 ;; esac done From 5b6b210e69df1eff99875749806ccb932f8d435c Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 12:15:30 +0100 Subject: [PATCH 31/38] move runner to H100 --- .github/scripts/run_remote_pr_check.sh | 2 +- configs/pr_tests/test_core.yaml | 2 +- configs/pr_tests/test_eval.yaml | 2 +- 3 files changed, 3 insertions(+), 3 
deletions(-) diff --git a/.github/scripts/run_remote_pr_check.sh b/.github/scripts/run_remote_pr_check.sh index 42febe54..fce28dce 100644 --- a/.github/scripts/run_remote_pr_check.sh +++ b/.github/scripts/run_remote_pr_check.sh @@ -5,7 +5,7 @@ #SBATCH --job-name=pr_check_entropy #SBATCH --mem-per-gpu=125G #SBATCH --nodes=1 -#SBATCH --partition=a100 +#SBATCH --partition=h100 #SBATCH --time=00:10:00 set -euo pipefail # exit on error, treat unset vars as error diff --git a/configs/pr_tests/test_core.yaml b/configs/pr_tests/test_core.yaml index 0f638689..49f6ee57 100644 --- a/configs/pr_tests/test_core.yaml +++ b/configs/pr_tests/test_core.yaml @@ -1,5 +1,5 @@ defaults: - - ../_cluster@_here_: entropy_a100 + - ../_cluster@_here_: entropy - ../_model@_here_: tiny - ../_trainer@_here_: llama - ../_dataset@_here_: c4 diff --git a/configs/pr_tests/test_eval.yaml b/configs/pr_tests/test_eval.yaml index 363f716c..85dd68bb 100644 --- a/configs/pr_tests/test_eval.yaml +++ b/configs/pr_tests/test_eval.yaml @@ -1,5 +1,5 @@ defaults: - - ../_cluster@_here_: entropy_a100 + - ../_cluster@_here_: entropy - ../_model@_here_: tiny - ../_trainer@_here_: llama - ../_dataset@_here_: c4 From 6974f2380242ca49f06fd3ffb06ad03197593717 Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 12:55:50 +0100 Subject: [PATCH 32/38] reduce ram limit --- .github/scripts/run_remote_pr_check.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/run_remote_pr_check.sh b/.github/scripts/run_remote_pr_check.sh index fce28dce..1f2664ca 100644 --- a/.github/scripts/run_remote_pr_check.sh +++ b/.github/scripts/run_remote_pr_check.sh @@ -3,7 +3,7 @@ #SBATCH --cpus-per-gpu=16 #SBATCH --gres=gpu:2 #SBATCH --job-name=pr_check_entropy -#SBATCH --mem-per-gpu=125G +#SBATCH --mem-per-gpu=100G #SBATCH --nodes=1 #SBATCH --partition=h100 #SBATCH --time=00:10:00 From 50c7a205c1bc46b3dcdd2a8a0c24947e4c7ac203 Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 13:02:05 +0100 
Subject: [PATCH 33/38] copy h100 setup script to pr test bash --- .github/scripts/run_remote_pr_check.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/scripts/run_remote_pr_check.sh b/.github/scripts/run_remote_pr_check.sh index 1f2664ca..ecfe3b11 100644 --- a/.github/scripts/run_remote_pr_check.sh +++ b/.github/scripts/run_remote_pr_check.sh @@ -35,10 +35,14 @@ fi echo "Running CI check for config: $PR_TEST_CONFIG_NAME" #---------- SCRIPT ---------- -export PROJECT_HOME_PATH=/storage_ssd_1/nano -export HF_HOME=$PROJECT_HOME_PATH/hf_cache +export PROJECT_HOME_PATH=/storage_nvme_4/nano +export HF_HOME="$PROJECT_HOME_PATH/hf_cache" + +# hydra errors export HYDRA_FULL_ERROR=1 -export PIXI_HOME=/storage_ssd_1/nano/pixi + +# pixi variables +export PIXI_HOME=/storage_nvme_4/nano/pixi export PATH="$PIXI_HOME/bin:$PATH" export XDG_DATA_HOME="$PIXI_HOME/data" export XDG_CACHE_HOME="$PIXI_HOME/cache" From 51e0e5e7aa77d70ddca31c73baa39ac35b3d88e4 Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 13:12:20 +0100 Subject: [PATCH 34/38] limit mem_per_gpu in configs --- configs/pr_tests/test_core.yaml | 3 ++- configs/pr_tests/test_eval.yaml | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/configs/pr_tests/test_core.yaml b/configs/pr_tests/test_core.yaml index 49f6ee57..f5650e01 100644 --- a/configs/pr_tests/test_core.yaml +++ b/configs/pr_tests/test_core.yaml @@ -34,4 +34,5 @@ infrastructure: slurm: time: "00:10:00" gres: gpu:2 - job-name: ${infrastructure.metric_logger.name} \ No newline at end of file + job-name: ${infrastructure.metric_logger.name} + mem_per_gpu: 100G \ No newline at end of file diff --git a/configs/pr_tests/test_eval.yaml b/configs/pr_tests/test_eval.yaml index 85dd68bb..ff5064d2 100644 --- a/configs/pr_tests/test_eval.yaml +++ b/configs/pr_tests/test_eval.yaml @@ -35,6 +35,7 @@ infrastructure: time: "00:10:00" gres: gpu:2 job-name: ${infrastructure.metric_logger.name} + mem_per_gpu: 100G # 
eval: # limit: 10 \ No newline at end of file From 3c42bc873c918b867002a4c5e5b7027ce8c81a6c Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 14:17:48 +0100 Subject: [PATCH 35/38] fix failure check --- .github/workflows/run_tests_on_entropy.yml | 45 +++++++++++++++------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml index 7020922d..7a5e1281 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -139,18 +139,37 @@ jobs: state=$(sacct -j "$task_job" --format=State --noheader | head -1 | awk '{print $1}') echo "[$i] config=${cfg} state=${state}" - if [ "$state" != "COMPLETED" ]; then - failures=$((failures + 1)) - out_file="slurm-${job_id}_${i}.out" - - echo "---- BEGIN log for FAILED config: ${cfg} (task ${i}) ----" - if [ -f "$out_file" ]; then - cat "$out_file" - else - echo "No slurm output file found (expected: ${out_file})" - fi - echo "---- END log for FAILED config: ${cfg} ----" - fi + out_file="slurm-${job_id}_${i}.out" + + case "$state" in + COMPLETED) + # ok, nothing to do + ;; + FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY) + failures=$((failures + 1)) + echo "---- BEGIN log for FAILED config: ${cfg} (task ${i}) ----" + if [ -f "$out_file" ]; then + cat "$out_file" + else + echo "No slurm output file found (expected: ${out_file})" + fi + echo "---- END log for FAILED config: ${cfg} ----" + ;; + RUNNING|PENDING|"") + echo "Task $i ($cfg) still ${state:-} at summarize time; not counting as failure." + ;; + *) + echo "Task $i ($cfg) in unexpected state $state; counting as failure." 
+ failures=$((failures + 1)) + echo "---- BEGIN log for FAILED config: ${cfg} (task ${i}) ----" + if [ -f "$out_file" ]; then + cat "$out_file" + else + echo "No slurm output file found (expected: ${out_file})" + fi + echo "---- END log for FAILED config: ${cfg} ----" + ;; + esac i=$((i + 1)) done < "$configs_file" @@ -160,4 +179,4 @@ jobs: exit 1 else echo "All configs completed successfully." - fi + fi \ No newline at end of file From 51bc049157ec6399d02c26f2f5ea0ebf8768f060 Mon Sep 17 00:00:00 2001 From: Janek Date: Tue, 9 Dec 2025 18:23:11 +0100 Subject: [PATCH 36/38] change mv to cp + rm in update_pixi.py --- update_pixi.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/update_pixi.py b/update_pixi.py index 79f6b8e7..8e1068ca 100644 --- a/update_pixi.py +++ b/update_pixi.py @@ -168,16 +168,18 @@ def update_remote_pixi(cfg: OmegaConf): f"mkdir -p {pixi_home}/old_pixi_files/obsolete_since_${{ts}}; " f"if [ -f {pixi_home}/pixi.toml ]; then " f"mv -f {pixi_home}/pixi.toml {pixi_home}/old_pixi_files/obsolete_since_${{ts}}/; " - f"fi; " + "fi; " f"if [ -f {pixi_home}/pixi.lock ]; then " f"mv -f {pixi_home}/pixi.lock {pixi_home}/old_pixi_files/obsolete_since_${{ts}}/; " - f"fi; " + "fi; " "fi && " - # move new files from $HOME temp dir into PIXI_HOME - f"mv -f {remote_tmp_dir}/pixi.toml {pixi_home}/ && " + # move new files from $HOME temp dir into PIXI_HOME (cross-FS safe) + f"cp {remote_tmp_dir}/pixi.toml {pixi_home}/ && " + f"rm -f {remote_tmp_dir}/pixi.toml && " f"if [ -f {remote_tmp_dir}/pixi.lock ]; then " - f"mv -f {remote_tmp_dir}/pixi.lock {pixi_home}/; " - f"fi && " + f"cp {remote_tmp_dir}/pixi.lock {pixi_home}/ && " + f"rm -f {remote_tmp_dir}/pixi.lock; " + "fi && " # run pixi install f"cd {pixi_home} && " "pixi install" From e45a3744790d69d05fa623c8397bd953e948ee7a Mon Sep 17 00:00:00 2001 From: Janek Date: Wed, 10 Dec 2025 11:36:05 +0100 Subject: [PATCH 37/38] gpt: resolve uninstalled pixi from update_pixi --- 
update_pixi.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 71 insertions(+), 4 deletions(-) diff --git a/update_pixi.py b/update_pixi.py index 8e1068ca..1ef8ec6a 100644 --- a/update_pixi.py +++ b/update_pixi.py @@ -34,6 +34,44 @@ def get_project_root() -> Path: raise FileNotFoundError("Could not find pixi.toml in parent directories") +def resolve_pixi_ci(connection, shared_pixi_bin: str, shared_pixi_bin_dir: str): + # Ensure shared Pixi CLI exists on the cluster + print(f"\nEnsuring shared Pixi CLI at {shared_pixi_bin} ...") + + check_cli = connection.run( + f'[ -x "{shared_pixi_bin}" ] && echo OK || echo MISSING', + hide=True, + ) + + if check_cli.stdout.strip() == "MISSING": + print("Shared Pixi CLI missing, bootstrapping from $HOME/.pixi/bin/pixi ...") + + bootstrap_cmd = f""" + set -e + mkdir -p "{shared_pixi_bin_dir}" + if [ ! -x "$HOME/.pixi/bin/pixi" ]; then + echo "ERROR: $HOME/.pixi/bin/pixi not found; install pixi for your user first." + exit 1 + fi + cp "$HOME/.pixi/bin/pixi" "{shared_pixi_bin}" + chmod a+rx "{shared_pixi_bin}" + """ + connection.run(bootstrap_cmd, pty=True) + + verify_cli = connection.run( + f'[ -x "{shared_pixi_bin}" ] && echo OK || echo MISSING', + hide=True, + ) + if verify_cli.stdout.strip() != "OK": + print( + f"ERROR: Failed to bootstrap shared Pixi CLI at {shared_pixi_bin}. " + "Please check permissions / paths." 
+ ) + sys.exit(1) + + print("✔ Shared Pixi CLI available.") + + @hydra.main(version_base=None) def update_remote_pixi(cfg: OmegaConf): """ @@ -55,6 +93,14 @@ def update_remote_pixi(cfg: OmegaConf): print(f"Error: {e}") sys.exit(1) + # Derive shared Pixi CLI location from PIXI_HOME: + # PIXI_HOME = /storage_hdd_1/llm-random/nano/pixi + # SHARED_CLI = /storage_hdd_1/llm-random/nano/pixi_cli/bin/pixi + pixi_home_path = Path(pixi_home) + shared_pixi_root = pixi_home_path.parent / "pixi_cli" + shared_pixi_bin_dir = shared_pixi_root / "bin" + shared_pixi_bin = shared_pixi_bin_dir / "pixi" + # This job is just "pixi install" → no GPU needed. # Remove GPU-related keys inherited from the main cluster config. for key in ("gres", "cpus_per_gpu", "mem_per_gpu"): @@ -75,10 +121,6 @@ def update_remote_pixi(cfg: OmegaConf): pixi_toml = project_root / "pixi.toml" pixi_lock = project_root / "pixi.lock" - if not pixi_toml.exists(): - print(f"Error: pixi.toml not found at {pixi_toml}") - sys.exit(1) - print(f"\nLocal pixi files:") print(f" - {pixi_toml}") if pixi_lock.exists(): @@ -107,6 +149,13 @@ def update_remote_pixi(cfg: OmegaConf): with ConnectWithPassphrase(host=server, inline_ssh_env=True) as connection: print("Connected successfully!") + # verifies existence of pixi ci, creates it if empty + resolve_pixi_ci( + connection, + shared_pixi_bin=shared_pixi_bin, + shared_pixi_bin_dir=shared_pixi_bin_dir, + ) + # Figure out remote $HOME and temp dir for pixi config home_dir = connection.run("cd && pwd", hide=True).stdout.strip() timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") @@ -129,6 +178,8 @@ def update_remote_pixi(cfg: OmegaConf): # Run pixi install on compute node using srun print(f"\nRunning 'pixi install' on compute node...") + print(f"ts: {timestamp}") + # Build srun command with SLURM parameters slurm_params = create_slurm_parameters(slurm_config) # Convert #SBATCH flags to srun flags (remove #SBATCH prefix) @@ -138,6 +189,10 @@ def update_remote_pixi(cfg: 
OmegaConf): # Set up PATH and other environment variables from the cluster config text = "\n".join(script_lines) env_setup = [] + + # Always put shared Pixi CLI first on PATH + env_setup.append(f'export PATH="{shared_pixi_bin_dir}:$PATH"') + for raw in text.splitlines(): line = raw.strip() @@ -157,6 +212,11 @@ def update_remote_pixi(cfg: OmegaConf): env_setup.append(line) continue + # # keep pixi shell-hook if present in config + # if "pixi shell-hook" in line: + # env_setup.append(line) + # continue + env_commands = " && ".join(env_setup) if env_setup else "" # mkdir + copy happen on the compute node, via srun @@ -182,6 +242,12 @@ def update_remote_pixi(cfg: OmegaConf): "fi && " # run pixi install f"cd {pixi_home} && " + "echo dupa &&" + "ls -a &&" + "echo pixi version &&" + "echo path $PATH &&" + "command -v pixi || { echo 'ERROR: pixi not found in PATH'; exit 127; } && " + "pixi --version && " "pixi install" ) @@ -191,6 +257,7 @@ def update_remote_pixi(cfg: OmegaConf): install_command = base_command cmd_quoted = shlex.quote(install_command) + print(f"command:\n{cmd_quoted}") full_command = f"{srun_cmd} bash -lc {cmd_quoted}" result = connection.run(full_command, pty=True) From e5306491267090c36edc73cb318048031b0b496a Mon Sep 17 00:00:00 2001 From: Janek Date: Wed, 10 Dec 2025 11:56:18 +0100 Subject: [PATCH 38/38] giving up running on entropy common - each node has different storage --- .github/workflows/run_tests_on_entropy.yml | 45 ++++------- update_pixi.py | 89 +++------------------- 2 files changed, 23 insertions(+), 111 deletions(-) diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml index 7a5e1281..7020922d 100644 --- a/.github/workflows/run_tests_on_entropy.yml +++ b/.github/workflows/run_tests_on_entropy.yml @@ -139,37 +139,18 @@ jobs: state=$(sacct -j "$task_job" --format=State --noheader | head -1 | awk '{print $1}') echo "[$i] config=${cfg} state=${state}" - out_file="slurm-${job_id}_${i}.out" - - case 
"$state" in - COMPLETED) - # ok, nothing to do - ;; - FAILED|CANCELLED|TIMEOUT|OUT_OF_MEMORY) - failures=$((failures + 1)) - echo "---- BEGIN log for FAILED config: ${cfg} (task ${i}) ----" - if [ -f "$out_file" ]; then - cat "$out_file" - else - echo "No slurm output file found (expected: ${out_file})" - fi - echo "---- END log for FAILED config: ${cfg} ----" - ;; - RUNNING|PENDING|"") - echo "Task $i ($cfg) still ${state:-} at summarize time; not counting as failure." - ;; - *) - echo "Task $i ($cfg) in unexpected state $state; counting as failure." - failures=$((failures + 1)) - echo "---- BEGIN log for FAILED config: ${cfg} (task ${i}) ----" - if [ -f "$out_file" ]; then - cat "$out_file" - else - echo "No slurm output file found (expected: ${out_file})" - fi - echo "---- END log for FAILED config: ${cfg} ----" - ;; - esac + if [ "$state" != "COMPLETED" ]; then + failures=$((failures + 1)) + out_file="slurm-${job_id}_${i}.out" + + echo "---- BEGIN log for FAILED config: ${cfg} (task ${i}) ----" + if [ -f "$out_file" ]; then + cat "$out_file" + else + echo "No slurm output file found (expected: ${out_file})" + fi + echo "---- END log for FAILED config: ${cfg} ----" + fi i=$((i + 1)) done < "$configs_file" @@ -179,4 +160,4 @@ jobs: exit 1 else echo "All configs completed successfully." 
- fi \ No newline at end of file + fi diff --git a/update_pixi.py b/update_pixi.py index 1ef8ec6a..79f6b8e7 100644 --- a/update_pixi.py +++ b/update_pixi.py @@ -34,44 +34,6 @@ def get_project_root() -> Path: raise FileNotFoundError("Could not find pixi.toml in parent directories") -def resolve_pixi_ci(connection, shared_pixi_bin: str, shared_pixi_bin_dir: str): - # Ensure shared Pixi CLI exists on the cluster - print(f"\nEnsuring shared Pixi CLI at {shared_pixi_bin} ...") - - check_cli = connection.run( - f'[ -x "{shared_pixi_bin}" ] && echo OK || echo MISSING', - hide=True, - ) - - if check_cli.stdout.strip() == "MISSING": - print("Shared Pixi CLI missing, bootstrapping from $HOME/.pixi/bin/pixi ...") - - bootstrap_cmd = f""" - set -e - mkdir -p "{shared_pixi_bin_dir}" - if [ ! -x "$HOME/.pixi/bin/pixi" ]; then - echo "ERROR: $HOME/.pixi/bin/pixi not found; install pixi for your user first." - exit 1 - fi - cp "$HOME/.pixi/bin/pixi" "{shared_pixi_bin}" - chmod a+rx "{shared_pixi_bin}" - """ - connection.run(bootstrap_cmd, pty=True) - - verify_cli = connection.run( - f'[ -x "{shared_pixi_bin}" ] && echo OK || echo MISSING', - hide=True, - ) - if verify_cli.stdout.strip() != "OK": - print( - f"ERROR: Failed to bootstrap shared Pixi CLI at {shared_pixi_bin}. " - "Please check permissions / paths." - ) - sys.exit(1) - - print("✔ Shared Pixi CLI available.") - - @hydra.main(version_base=None) def update_remote_pixi(cfg: OmegaConf): """ @@ -93,14 +55,6 @@ def update_remote_pixi(cfg: OmegaConf): print(f"Error: {e}") sys.exit(1) - # Derive shared Pixi CLI location from PIXI_HOME: - # PIXI_HOME = /storage_hdd_1/llm-random/nano/pixi - # SHARED_CLI = /storage_hdd_1/llm-random/nano/pixi_cli/bin/pixi - pixi_home_path = Path(pixi_home) - shared_pixi_root = pixi_home_path.parent / "pixi_cli" - shared_pixi_bin_dir = shared_pixi_root / "bin" - shared_pixi_bin = shared_pixi_bin_dir / "pixi" - # This job is just "pixi install" → no GPU needed. 
# Remove GPU-related keys inherited from the main cluster config. for key in ("gres", "cpus_per_gpu", "mem_per_gpu"): @@ -121,6 +75,10 @@ def update_remote_pixi(cfg: OmegaConf): pixi_toml = project_root / "pixi.toml" pixi_lock = project_root / "pixi.lock" + if not pixi_toml.exists(): + print(f"Error: pixi.toml not found at {pixi_toml}") + sys.exit(1) + print(f"\nLocal pixi files:") print(f" - {pixi_toml}") if pixi_lock.exists(): @@ -149,13 +107,6 @@ def update_remote_pixi(cfg: OmegaConf): with ConnectWithPassphrase(host=server, inline_ssh_env=True) as connection: print("Connected successfully!") - # verifies existence of pixi ci, creates it if empty - resolve_pixi_ci( - connection, - shared_pixi_bin=shared_pixi_bin, - shared_pixi_bin_dir=shared_pixi_bin_dir, - ) - # Figure out remote $HOME and temp dir for pixi config home_dir = connection.run("cd && pwd", hide=True).stdout.strip() timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") @@ -178,8 +129,6 @@ def update_remote_pixi(cfg: OmegaConf): # Run pixi install on compute node using srun print(f"\nRunning 'pixi install' on compute node...") - print(f"ts: {timestamp}") - # Build srun command with SLURM parameters slurm_params = create_slurm_parameters(slurm_config) # Convert #SBATCH flags to srun flags (remove #SBATCH prefix) @@ -189,10 +138,6 @@ def update_remote_pixi(cfg: OmegaConf): # Set up PATH and other environment variables from the cluster config text = "\n".join(script_lines) env_setup = [] - - # Always put shared Pixi CLI first on PATH - env_setup.append(f'export PATH="{shared_pixi_bin_dir}:$PATH"') - for raw in text.splitlines(): line = raw.strip() @@ -212,11 +157,6 @@ def update_remote_pixi(cfg: OmegaConf): env_setup.append(line) continue - # # keep pixi shell-hook if present in config - # if "pixi shell-hook" in line: - # env_setup.append(line) - # continue - env_commands = " && ".join(env_setup) if env_setup else "" # mkdir + copy happen on the compute node, via srun @@ -228,26 +168,18 @@ def 
update_remote_pixi(cfg: OmegaConf): f"mkdir -p {pixi_home}/old_pixi_files/obsolete_since_${{ts}}; " f"if [ -f {pixi_home}/pixi.toml ]; then " f"mv -f {pixi_home}/pixi.toml {pixi_home}/old_pixi_files/obsolete_since_${{ts}}/; " - "fi; " + f"fi; " f"if [ -f {pixi_home}/pixi.lock ]; then " f"mv -f {pixi_home}/pixi.lock {pixi_home}/old_pixi_files/obsolete_since_${{ts}}/; " - "fi; " + f"fi; " "fi && " - # move new files from $HOME temp dir into PIXI_HOME (cross-FS safe) - f"cp {remote_tmp_dir}/pixi.toml {pixi_home}/ && " - f"rm -f {remote_tmp_dir}/pixi.toml && " + # move new files from $HOME temp dir into PIXI_HOME + f"mv -f {remote_tmp_dir}/pixi.toml {pixi_home}/ && " f"if [ -f {remote_tmp_dir}/pixi.lock ]; then " - f"cp {remote_tmp_dir}/pixi.lock {pixi_home}/ && " - f"rm -f {remote_tmp_dir}/pixi.lock; " - "fi && " + f"mv -f {remote_tmp_dir}/pixi.lock {pixi_home}/; " + f"fi && " # run pixi install f"cd {pixi_home} && " - "echo dupa &&" - "ls -a &&" - "echo pixi version &&" - "echo path $PATH &&" - "command -v pixi || { echo 'ERROR: pixi not found in PATH'; exit 127; } && " - "pixi --version && " "pixi install" ) @@ -257,7 +189,6 @@ def update_remote_pixi(cfg: OmegaConf): install_command = base_command cmd_quoted = shlex.quote(install_command) - print(f"command:\n{cmd_quoted}") full_command = f"{srun_cmd} bash -lc {cmd_quoted}" result = connection.run(full_command, pty=True)