llm-random · j321m · Dec 5, 2025 · Dec 8, 2025 · Dec 8, 2025 · Dec 8, 2025
diff --git a/.github/scripts/run_remote_pr_check.sh b/.github/scripts/run_remote_pr_check.sh
@@ -0,0 +1,71 @@
+#!/bin/bash -l
+
+#SBATCH --cpus-per-gpu=16
+#SBATCH --gres=gpu:2
+#SBATCH --job-name=pr_check_entropy
+#SBATCH --mem-per-gpu=100G
+#SBATCH --nodes=1
+#SBATCH --partition=h100
+#SBATCH --time=00:10:00
+
+set -euo pipefail  # exit on error, on unset vars, and when any pipeline stage fails
+set -x             # print commands for debugging
+
+# --- Resolve PR_TEST_CONFIG_NAME from array index ---
+# The configs file lists one config name per line; array task N runs the name on line N+1.
+# Diagnostics go to stderr so they are not mistaken for regular job output.
+if [ -z "${PR_TEST_CONFIGS_FILE:-}" ]; then
+    echo "Error: PR_TEST_CONFIGS_FILE not set" >&2
+    exit 1
+fi
+if [ ! -r "$PR_TEST_CONFIGS_FILE" ]; then
+    echo "Error: cannot read PR_TEST_CONFIGS_FILE=$PR_TEST_CONFIGS_FILE" >&2
+    exit 1
+fi
+if [ -z "${SLURM_ARRAY_TASK_ID:-}" ]; then
+    echo "Error: SLURM_ARRAY_TASK_ID not set (are you running as an array job?)" >&2
+    exit 1
+fi
+line=$((SLURM_ARRAY_TASK_ID + 1))  # SLURM_ARRAY_TASK_ID is 0-based; sed is 1-based
+PR_TEST_CONFIG_NAME=$(sed -n "${line}p" "$PR_TEST_CONFIGS_FILE")
+if [ -z "$PR_TEST_CONFIG_NAME" ]; then
+    echo "Error: failed to resolve config for SLURM_ARRAY_TASK_ID=$SLURM_ARRAY_TASK_ID from $PR_TEST_CONFIGS_FILE" >&2
+    exit 1
+fi
+echo "Running CI check for config: $PR_TEST_CONFIG_NAME"
+
+#---------- SCRIPT ----------
+export PROJECT_HOME_PATH=/storage_nvme_4/nano
+export HF_HOME="$PROJECT_HOME_PATH/hf_cache"  # Hugging Face cache directory, kept under the project home
+
+# hydra errors
+export HYDRA_FULL_ERROR=1  # have Hydra print complete stack traces instead of the shortened error
+
+# pixi variables
+export PIXI_HOME=/storage_nvme_4/nano/pixi  # pixi root; the XDG dirs below are redirected under it
+export PATH="$PIXI_HOME/bin:$PATH"  # entries in $PIXI_HOME/bin take precedence on PATH
+export XDG_DATA_HOME="$PIXI_HOME/data"
+export XDG_CACHE_HOME="$PIXI_HOME/cache"
+export XDG_STATE_HOME="$PIXI_HOME/state"
+
+# Save current directory and setup pixi
+cd "$PIXI_HOME" || { echo "Failed to cd to $PIXI_HOME"; exit 1; }  # run the hook from the pixi root (presumably where the manifest lives — confirm)
+eval "$(pixi shell-hook)" || { echo "Failed to run pixi shell-hook"; exit 1; }  # apply the commands printed by pixi to this shell
+cd -  # return to the previous working directory (cd - also prints it)
+#-------- SCRIPT END --------
+
+# Rendezvous endpoint for torchrun: first host of the allocation, port derived from the job id.
+# MASTER_ADDR is assigned before it is exported so a scontrol failure aborts under `set -e`.
+MASTER_ADDR=$(scontrol show hostname "${SLURM_NODELIST}" | head -n 1)
+export MASTER_ADDR MASTER_PORT=$((40000 + SLURM_JOB_ID % 10000))
+echo "Running training with config: $PR_TEST_CONFIG_NAME"
+echo "MASTER_ADDR: $MASTER_ADDR"
+echo "MASTER_PORT: $MASTER_PORT"
+srun torchrun --nnodes="${SLURM_NNODES}" \
+  --nproc-per-node="${SLURM_GPUS_ON_NODE}" \
+  --rdzv-id="${SLURM_JOB_ID}" \
+  --rdzv-backend=c10d \
+  --rdzv-endpoint="${MASTER_ADDR}:${MASTER_PORT}" \
+  main.py \
+    --config-path=configs/pr_tests \
+    --config-name="$PR_TEST_CONFIG_NAME"
diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml
@@ -0,0 +1,163 @@
+name: PR Tests
+
+on:
+  pull_request:
+    branches:
+      - main  # only pull requests targeting main trigger the tests
+  workflow_dispatch:  # manual runs are allowed as well
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}  # one group per PR, or per ref when there is no PR number
+  cancel-in-progress: true  # NOTE(review): this cancels the workflow run only; no step in this file scancels an already-submitted SLURM array
+
+jobs:
+  # 1) Discover configs
+  discover-configs:
+    runs-on: [self-hosted, entropy]
+    outputs:
+      configs: ${{ steps.find-configs.outputs.configs }}  # JSON array of config names (file names without .yaml)
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Find PR test configs
+        id: find-configs
+        run: |
+          echo "Searching for configs in configs/pr_tests/..."
+          ls -la configs/pr_tests/ || echo "Directory not found or empty"
+          # Glob instead of parsing `ls`: safe for unusual file names; nullglob makes "no match" produce [].
+          shopt -s nullglob
+          configs=$(for f in configs/pr_tests/*.yaml; do basename "$f" .yaml; done | jq -R -s -c 'split("\n") | map(select(length > 0))')
+          echo "configs=$configs" >> "$GITHUB_OUTPUT"
+          echo "=== Found PR test configs ==="
+          echo "$configs" | jq -r '.[]'
+          echo "Total: $(echo "$configs" | jq 'length') config(s)"
+
+  # 2) Submit & manage one SLURM array for all configs
+  run-pr-tests:
+    needs: discover-configs
+    runs-on: [self-hosted, entropy]
+    env:
+      CONFIGS_JSON: ${{ needs.discover-configs.outputs.configs }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Submit SLURM array
+        id: submit
+        run: |
+          echo "Configs JSON: $CONFIGS_JSON"
+
+          # Materialise the JSON array as a plain list: one config name per
+          # line, so array task N can pick up line N+1.
+          list_path="$PWD/pr_tests_configs.txt"
+          jq -r '.[]' <<< "$CONFIGS_JSON" > "$list_path"
+
+          total=$(wc -l < "$list_path")
+          echo "Found $total config(s)"
+
+          # Empty list: leave job_id unset so the later steps are skipped.
+          if [ "$total" -eq 0 ]; then
+            echo "No PR test configs found, nothing to run."
+            exit 0
+          fi
+
+          max_task=$((total - 1))
+          echo "Submitting SLURM array 0-$max_task"
+
+          # Array indices are 0-based; the list path reaches the tasks via --export.
+          sbatch_args=(
+            --parsable
+            --array=0-"$max_task"
+            --job-name="pr_tests"
+            --output="slurm-%A_%a.out"
+            --export=ALL,PR_TEST_CONFIGS_FILE="$list_path"
+          )
+          submitted=$(sbatch "${sbatch_args[@]}" .github/scripts/run_remote_pr_check.sh)
+
+          echo "Raw sbatch output: $submitted"
+
+          # Reduce the reply to the bare array job id. Forms handled:
+          #   123456   123456;cluster   123456_0   123456_0;cluster
+          base_id="${submitted%%;*}"   # cut a trailing ";cluster"
+          base_id="${base_id%%_*}"     # cut a trailing "_<task>"
+
+          echo "Parsed base job_id: $base_id"
+
+          echo "job_id=$base_id" >> "$GITHUB_OUTPUT"
+
+      - name: Wait for SLURM array
+        if: steps.submit.outputs.job_id
+        id: wait
+        run: |
+          # Polls sacct until EVERY task of the array is in a terminal state, then
+          # succeeds only if all tasks are COMPLETED. Judging by the first sacct row
+          # alone would report success/failure as soon as one task finished.
+          job_id="${{ steps.submit.outputs.job_id }}"
+          echo "Waiting for SLURM array job $job_id to finish..."
+          # States in which a task may still make progress.
+          active='PENDING|RUNNING|CONFIGURING|COMPLETING|SUSPENDED|REQUEUED|RESIZING'
+
+          while true; do
+            # -X: allocation rows only (no .batch/.extern steps), i.e. one state per
+            # array task; tasks that have not started yet are reported as PENDING.
+            # awk keeps the first word, so "CANCELLED by <uid>" counts as non-COMPLETED.
+            states=$(sacct -X -j "$job_id" --format=State --noheader | awk 'NF {print $1}' | sort | uniq -c)
+            echo "Current array task states:"
+            echo "${states:-  (none reported yet)}"
+            # Empty output means accounting has not registered the job yet.
+            if [ -z "$states" ] || echo "$states" | grep -Eq "$active"; then
+              sleep 10
+            elif echo "$states" | awk '{print $2}' | grep -qvx 'COMPLETED'; then
+              echo "Array job ended with non-COMPLETED task state(s)."
+              exit 1
+            else
+              echo "Array job completed successfully."
+              exit 0
+            fi
+          done
+
+      - name: Summarize array results and show failed logs
+        if: always() && steps.submit.outputs.job_id
+        run: |
+          # Reports the final sacct state of every array task next to its config
+          # name, dumps the SLURM log of each task that is not COMPLETED, and fails
+          # the step if there was at least one such task.
+          job_id="${{ steps.submit.outputs.job_id }}"
+          list_path="$PWD/pr_tests_configs.txt"
+
+          [ -f "$list_path" ] || {
+            echo "No configs file found at $list_path, nothing to summarize."
+            exit 0
+          }
+
+          echo "=== Per-config SLURM task states ==="
+
+          idx=-1
+          bad=0
+          while read -r cfg; do
+            idx=$((idx + 1))
+            # Array task ids look like <arrayJobId>_<taskIndex>; line order = task order.
+            state=$(sacct -j "${job_id}_${idx}" --format=State --noheader | head -1 | awk '{print $1}')
+            echo "[$idx] config=${cfg}  state=${state}"
+
+            [ "$state" = "COMPLETED" ] && continue
+
+            bad=$((bad + 1))
+            log="slurm-${job_id}_${idx}.out"
+
+            echo "---- BEGIN log for FAILED config: ${cfg} (task ${idx}) ----"
+            if [ ! -f "$log" ]; then
+              echo "No slurm output file found (expected: ${log})"
+            else
+              cat "$log"
+            fi
+            echo "---- END log for FAILED config: ${cfg} ----"
+          done < "$list_path"
+
+          if [ "$bad" -eq 0 ]; then
+            echo "All configs completed successfully."
+          else
+            echo "Some configs failed: $bad failing task(s)."
+            exit 1
+          fi
diff --git a/configs/pr_tests/test_core.yaml b/configs/pr_tests/test_core.yaml
@@ -0,0 +1,38 @@
+defaults:  # Hydra defaults list; @_here_ places each selected option into this config's own package
+  - ../_cluster@_here_: entropy
+  - ../_model@_here_: tiny
+  - ../_trainer@_here_: llama
+  - ../_dataset@_here_: c4
+  - ../_checkpoints@_here_: none
+  - ../_misc@_here_: default
+  - ../_eval@_here_: none
+
+common:
+  sequence_length: 128
+  batch_size: 16
+
+trainer:
+  gradient_accumulation_steps: 1
+  n_steps: 100  # the SLURM job running this config is limited to 10 minutes
+  learning_rate: 1e-3
+
+  checkpoint:  # NOTE(review): save settings are given although the _checkpoints default is "none" — confirm how the two interact
+    save:
+      type: huggingface
+      path: checkpoint
+
+infrastructure:
+
+  metric_logger:
+    name: test_core
+    tags:
+      - nano
+      - pr_test
+      - core
+      - train
+
+  slurm:  # NOTE(review): mirrors the #SBATCH values in .github/scripts/run_remote_pr_check.sh; whether this section is read on the CI path is not visible here
+    time: "00:10:00"
+    gres: gpu:2
+    job-name: ${infrastructure.metric_logger.name}  # interpolation; resolves to the metric_logger name above (test_core)
+    mem_per_gpu: 100G
diff --git a/configs/pr_tests/test_eval.yaml b/configs/pr_tests/test_eval.yaml
@@ -0,0 +1,41 @@
+defaults:  # Hydra defaults list; @_here_ places each selected option into this config's own package
+  - ../_cluster@_here_: entropy
+  - ../_model@_here_: tiny
+  - ../_trainer@_here_: llama
+  - ../_dataset@_here_: c4
+  - ../_checkpoints@_here_: none
+  - ../_misc@_here_: default
+  - ../_eval@_here_: none  # NOTE(review): file is named test_eval, yet eval is "none" and the eval section at the bottom is commented out — confirm intent
+
+common:
+  sequence_length: 128
+  batch_size: 16
+
+trainer:
+  gradient_accumulation_steps: 1
+  n_steps: 100  # the SLURM job running this config is limited to 10 minutes
+  learning_rate: 1e-3
+
+  checkpoint:  # NOTE(review): save settings are given although the _checkpoints default is "none" — confirm how the two interact
+    save:
+      type: huggingface
+      path: checkpoint
+
+infrastructure:
+
+  metric_logger:
+    name: test_core2  # NOTE(review): run name and tags say core/core2 rather than eval — looks copied from test_core.yaml; confirm
+    tags:
+      - nano
+      - pr_test
+      - core
+      - core2
+
+  slurm:  # NOTE(review): mirrors the #SBATCH values in .github/scripts/run_remote_pr_check.sh; whether this section is read on the CI path is not visible here
+    time: "00:10:00"
+    gres: gpu:2
+    job-name: ${infrastructure.metric_logger.name}  # interpolation; resolves to the metric_logger name above (test_core2)
+    mem_per_gpu: 100G
+
+# eval:
+#   limit: 10