diff --git a/.github/scripts/run_remote_pr_check.sh b/.github/scripts/run_remote_pr_check.sh
new file mode 100644
index 00000000..ecfe3b11
--- /dev/null
+++ b/.github/scripts/run_remote_pr_check.sh
@@ -0,0 +1,85 @@
+#!/bin/bash -l
+
+#SBATCH --cpus-per-gpu=16
+#SBATCH --gres=gpu:2
+#SBATCH --job-name=pr_check_entropy
+#SBATCH --mem-per-gpu=100G
+#SBATCH --nodes=1
+#SBATCH --partition=h100
+#SBATCH --time=00:10:00
+
+# SLURM array-task entry point for the PR checks.
+#
+# Each array task reads one config name from PR_TEST_CONFIGS_FILE (line
+# SLURM_ARRAY_TASK_ID + 1) and launches main.py on it through srun/torchrun.
+#
+# Required environment:
+#   PR_TEST_CONFIGS_FILE  file with one config name per line
+#   SLURM_ARRAY_TASK_ID   0-based array index (set for array jobs)
+
+set -euo pipefail  # exit on error, treat unset vars as error
+set -x             # print commands for debugging
+
+# --- Resolve PR_TEST_CONFIG_NAME from array index ---
+
+if [ -z "${PR_TEST_CONFIGS_FILE:-}" ]; then
+  echo "Error: PR_TEST_CONFIGS_FILE not set" >&2
+  exit 1
+fi
+
+if [ -z "${SLURM_ARRAY_TASK_ID:-}" ]; then
+  echo "Error: SLURM_ARRAY_TASK_ID not set (are you running as an array job?)" >&2
+  exit 1
+fi
+
+# SLURM_ARRAY_TASK_ID is 0-based; sed is 1-based
+line=$((SLURM_ARRAY_TASK_ID + 1))
+# "|| true": a sed failure leaves the name empty and is reported by the check below
+PR_TEST_CONFIG_NAME=$(sed -n "${line}p" "$PR_TEST_CONFIGS_FILE" || true)
+
+if [ -z "$PR_TEST_CONFIG_NAME" ]; then
+  echo "Error: failed to resolve config for SLURM_ARRAY_TASK_ID=$SLURM_ARRAY_TASK_ID from $PR_TEST_CONFIGS_FILE" >&2
+  exit 1
+fi
+
+echo "Running CI check for config: $PR_TEST_CONFIG_NAME"
+
+#---------- SCRIPT ----------
+export PROJECT_HOME_PATH=/storage_nvme_4/nano
+export HF_HOME="$PROJECT_HOME_PATH/hf_cache"
+
+# hydra errors
+export HYDRA_FULL_ERROR=1
+
+# pixi variables
+export PIXI_HOME=/storage_nvme_4/nano/pixi
+export PATH="$PIXI_HOME/bin:$PATH"
+export XDG_DATA_HOME="$PIXI_HOME/data"
+export XDG_CACHE_HOME="$PIXI_HOME/cache"
+export XDG_STATE_HOME="$PIXI_HOME/state"
+
+# Run the pixi shell-hook from PIXI_HOME, then return to the previous directory
+cd "$PIXI_HOME" || { echo "Failed to cd to $PIXI_HOME" >&2; exit 1; }
+eval "$(pixi shell-hook)" || { echo "Failed to run pixi shell-hook" >&2; exit 1; }
+cd -
+#-------- SCRIPT END --------
+
+# Assign first, export second: "export VAR=$(cmd)" would hide a failing cmd from set -e
+MASTER_ADDR=$(scontrol show hostnames "${SLURM_NODELIST}" | head -n 1)
+export MASTER_ADDR
+# Derive a port in the range 40000-49999 from the job id
+MASTER_PORT=$((40000 + SLURM_JOB_ID % 10000))
+export MASTER_PORT
+
+echo "Running training with config: $PR_TEST_CONFIG_NAME"
+echo "MASTER_ADDR: $MASTER_ADDR"
+echo "MASTER_PORT: $MASTER_PORT"
+
+srun torchrun --nnodes="${SLURM_NNODES}" \
+  --nproc-per-node="${SLURM_GPUS_ON_NODE}" \
+  --rdzv-id="${SLURM_JOB_ID}" \
+  --rdzv-backend=c10d \
+  --rdzv-endpoint="${MASTER_ADDR}:${MASTER_PORT}" \
+  main.py \
+  --config-path=configs/pr_tests \
+  --config-name="$PR_TEST_CONFIG_NAME"
diff --git a/.github/workflows/run_tests_on_entropy.yml b/.github/workflows/run_tests_on_entropy.yml
new file mode 100644
index 00000000..7020922d
--- /dev/null
+++ b/.github/workflows/run_tests_on_entropy.yml
@@ -0,0 +1,172 @@
+name: PR Tests
+
+on:
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # 1) Discover configs
+  discover-configs:
+    runs-on: [self-hosted, entropy]
+    outputs:
+      configs: ${{ steps.find-configs.outputs.configs }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Find PR test configs
+        id: find-configs
+        run: |
+          echo "Searching for configs in configs/pr_tests/..."
+          ls -la configs/pr_tests/ || echo "Directory not found or empty"
+
+          # Collect config names (file name minus .yaml) as a JSON array.
+          # A glob loop replaces parsing `ls` output; no match yields [].
+          configs=$(for f in configs/pr_tests/*.yaml; do if [ -e "$f" ]; then basename "$f" .yaml; fi; done | jq -R -s -c 'split("\n") | map(select(length > 0))')
+          echo "configs=$configs" >> "$GITHUB_OUTPUT"
+
+          echo "=== Found PR test configs ==="
+          echo "$configs" | jq -r '.[]'
+          echo "Total: $(echo "$configs" | jq 'length') config(s)"
+
+  # 2) Submit & manage one SLURM array for all configs
+  run-pr-tests:
+    needs: discover-configs
+    runs-on: [self-hosted, entropy]
+    env:
+      CONFIGS_JSON: ${{ needs.discover-configs.outputs.configs }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Submit SLURM array
+        id: submit
+        run: |
+          echo "Configs JSON: $CONFIGS_JSON"
+
+          # Write config names (one per line) into a file
+          configs_file="$PWD/pr_tests_configs.txt"
+          echo "$CONFIGS_JSON" | jq -r '.[]' > "$configs_file"
+
+          num_configs=$(wc -l < "$configs_file")
+          echo "Found $num_configs config(s)"
+
+          if [ "$num_configs" -eq 0 ]; then
+            echo "No PR test configs found, nothing to run."
+            # No job_id set -> later steps will do nothing
+            exit 0
+          fi
+
+          last_index=$((num_configs - 1))
+          echo "Submitting SLURM array 0-$last_index"
+
+          job_output=$(sbatch \
+            --parsable \
+            --array=0-"$last_index" \
+            --job-name="pr_tests" \
+            --output="slurm-%A_%a.out" \
+            --export=ALL,PR_TEST_CONFIGS_FILE="$configs_file" \
+            .github/scripts/run_remote_pr_check.sh)
+
+          echo "Raw sbatch output: $job_output"
+
+          # Handle cases like:
+          #   123456
+          #   123456;cluster
+          #   123456_0
+          #   123456_0;cluster
+          tmp="${job_output%%;*}"   # strip anything after ';'
+          job_id="${tmp%%_*}"       # strip anything after '_' (array task suffix)
+
+          echo "Parsed base job_id: $job_id"
+
+          echo "job_id=$job_id" >> "$GITHUB_OUTPUT"
+
+      - name: Wait for SLURM array
+        if: steps.submit.outputs.job_id
+        id: wait
+        run: |
+          job_id="${{ steps.submit.outputs.job_id }}"
+          echo "Waiting for SLURM array job $job_id to finish..."
+
+          while true; do
+            # Collect the distinct states of ALL array tasks: -X drops the job-step
+            # rows and -P prints untruncated values. Looking only at the first row
+            # would report the array as finished as soon as that one task is done.
+            states=$(sacct -X -n -P -j "$job_id" --format=State | awk 'NF {print $1}' | LC_ALL=C sort -u | tr '\n' ' ')
+            echo "Current array task states: ${states:-(none reported yet)}"
+
+            case " $states" in
+              " "|*" PENDING "*|*" RUNNING "*|*" CONFIGURING "*|*" COMPLETING "*|*" SUSPENDED "*|*" REQUEUED "*|*" RESIZING "*)
+                # No accounting record yet, or at least one task is not in a final state
+                sleep 10
+                ;;
+              " COMPLETED ")
+                echo "Array job completed successfully."
+                exit 0
+                ;;
+              *)
+                echo "Array job ended with state(s): $states"
+                exit 1
+                ;;
+            esac
+          done
+
+      # cancel-in-progress stops this workflow when a newer run starts; without
+      # this step the already-submitted array would keep running on the cluster.
+      - name: Cancel SLURM array when the run is cancelled
+        if: cancelled() && steps.submit.outputs.job_id
+        run: |
+          scancel "${{ steps.submit.outputs.job_id }}"
+
+      - name: Summarize array results and show failed logs
+        if: always() && steps.submit.outputs.job_id
+        run: |
+          job_id="${{ steps.submit.outputs.job_id }}"
+          configs_file="$PWD/pr_tests_configs.txt"
+
+          if [ ! -f "$configs_file" ]; then
+            echo "No configs file found at $configs_file, nothing to summarize."
+            exit 0
+          fi
+
+          echo "=== Per-config SLURM task states ==="
+
+          i=0
+          failures=0
+          while IFS= read -r cfg; do
+            # SLURM child job id: <job_id>_<array_index>
+            task_job="${job_id}_$i"
+
+            # -X: allocation row only (no job steps); -P: untruncated state value
+            state=$(sacct -X -n -P -j "$task_job" --format=State | head -1 | awk '{print $1}')
+            echo "[$i] config=${cfg} state=${state}"
+
+            if [ "$state" != "COMPLETED" ]; then
+              failures=$((failures + 1))
+              out_file="slurm-${job_id}_${i}.out"
+
+              echo "---- BEGIN log for FAILED config: ${cfg} (task ${i}) ----"
+              if [ -f "$out_file" ]; then
+                cat "$out_file"
+              else
+                echo "No slurm output file found (expected: ${out_file})"
+              fi
+              echo "---- END log for FAILED config: ${cfg} ----"
+            fi
+
+            i=$((i + 1))
+          done < "$configs_file"
+
+          if [ "$failures" -gt 0 ]; then
+            echo "Some configs failed: $failures failing task(s)."
+            exit 1
+          else
+            echo "All configs completed successfully."
+          fi
diff --git a/configs/pr_tests/test_core.yaml b/configs/pr_tests/test_core.yaml
new file mode 100644
index 00000000..f5650e01
--- /dev/null
+++ b/configs/pr_tests/test_core.yaml
@@ -0,0 +1,38 @@
+defaults:
+  - ../_cluster@_here_: entropy
+  - ../_model@_here_: tiny
+  - ../_trainer@_here_: llama
+  - ../_dataset@_here_: c4
+  - ../_checkpoints@_here_: none
+  - ../_misc@_here_: default
+  - ../_eval@_here_: none
+
+common:
+  sequence_length: 128
+  batch_size: 16
+
+trainer:
+  gradient_accumulation_steps: 1
+  n_steps: 100
+  learning_rate: 1e-3
+
+  checkpoint:
+    save:
+      type: huggingface
+      path: checkpoint
+
+infrastructure:
+
+  metric_logger:
+    name: test_core
+    tags:
+      - nano
+      - pr_test
+      - core
+      - train
+
+  slurm:
+    time: "00:10:00"
+    gres: gpu:2
+    job-name: ${infrastructure.metric_logger.name}
+    mem_per_gpu: 100G
\ No newline at end of file
diff --git a/configs/pr_tests/test_eval.yaml b/configs/pr_tests/test_eval.yaml
new file mode 100644
index 00000000..ff5064d2
--- /dev/null
+++ b/configs/pr_tests/test_eval.yaml
@@ -0,0 +1,41 @@
+defaults:
+  - ../_cluster@_here_: entropy
+  - ../_model@_here_: tiny
+  - ../_trainer@_here_: llama
+  - ../_dataset@_here_: c4
+  - ../_checkpoints@_here_: none
+  - ../_misc@_here_: default
+  - ../_eval@_here_: none
+
+common:
+  sequence_length: 128
+  batch_size: 16
+
+trainer:
+  gradient_accumulation_steps: 1
+  n_steps: 100
+  learning_rate: 1e-3
+
+  checkpoint:
+    save:
+      type: huggingface
+      path: checkpoint
+
+infrastructure:
+
+  metric_logger:
+    name: test_core2
+    tags:
+      - nano
+      - pr_test
+      - core
+      - core2
+
+  slurm:
+    time: "00:10:00"
+    gres: gpu:2
+    job-name: ${infrastructure.metric_logger.name}
+    mem_per_gpu: 100G
+
+# eval:
+#   limit: 10
\ No newline at end of file