Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
c5e5d99
claude gh workflow self-hosted-runner on entropy
j321m Dec 5, 2025
fa3f7f7
added manual runner triger for testing
j321m Dec 8, 2025
4c7fb9e
refactor pr remote tests
j321m Dec 8, 2025
516f47a
claude fix
j321m Dec 8, 2025
6b04c31
claude debug messages
j321m Dec 8, 2025
91a6675
claude more debug messages
j321m Dec 8, 2025
32f748b
jm fix
j321m Dec 8, 2025
84baafe
jm fix
j321m Dec 8, 2025
ff5b6ad
jm fix config default paths
j321m Dec 8, 2025
c5dd9c9
jm remove eval from config - it doesn't work on > 1 gpu
j321m Dec 8, 2025
23e0a1c
add eval test
j321m Dec 8, 2025
561fc89
add eval dummy
j321m Dec 8, 2025
c0a6c14
add eval dummy
j321m Dec 8, 2025
e359840
claude cancle old commit jobs
j321m Dec 8, 2025
dfa14f1
back to 1 test
j321m Dec 8, 2025
5f1afaf
scancel on cancel
j321m Dec 8, 2025
136122d
sth
j321m Dec 8, 2025
ecf8198
2 test configs
j321m Dec 8, 2025
94c4f87
gpt: fix slurm canceling on test cancel
j321m Dec 9, 2025
4bc4367
sth
j321m Dec 9, 2025
67a4d77
fix
j321m Dec 9, 2025
816bf2d
fix
j321m Dec 9, 2025
7f675a9
test
j321m Dec 9, 2025
e9188bc
test
j321m Dec 9, 2025
2583793
gpt: paralel jobs (slurm array)
j321m Dec 9, 2025
9095beb
sth
j321m Dec 9, 2025
1f7c164
gpt fix
j321m Dec 9, 2025
c437fb9
sth
j321m Dec 9, 2025
08c06d0
sth
j321m Dec 9, 2025
94da876
sth
j321m Dec 9, 2025
5b6b210
move runner to H100
j321m Dec 9, 2025
6974f23
reduce ram limit
j321m Dec 9, 2025
50c7a20
copy h100 setup script to pr test bash
j321m Dec 9, 2025
51e0e5e
limit mem_per_gpu in configs
j321m Dec 9, 2025
3c42bc8
fix failure check
j321m Dec 9, 2025
51bc049
change mv to cp + rm in update_pixi.py
j321m Dec 9, 2025
e45a374
gpt: resolve uninstalled pixi from update_pixi
j321m Dec 10, 2025
e530649
giving up running on entropy common - each node has different storage
j321m Dec 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions .github/scripts/run_remote_pr_check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/bin/bash -l
#
# SLURM batch script for the PR checks. Each array task resolves one config
# name from the file named by PR_TEST_CONFIGS_FILE and runs main.py on it
# through torchrun with --config-path=configs/pr_tests.
#
# Required environment:
#   PR_TEST_CONFIGS_FILE  text file with one config name per line
#   SLURM_ARRAY_TASK_ID   0-based index of this array task
#
# NOTE(review): the CI workflow also passes --job-name on the sbatch command
# line; that presumably takes precedence over the directive below — confirm.

# Resource request applied to every array task.
#SBATCH --cpus-per-gpu=16
#SBATCH --gres=gpu:2
#SBATCH --job-name=pr_check_entropy
#SBATCH --mem-per-gpu=100G
#SBATCH --nodes=1
#SBATCH --partition=h100
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think another partition might be more suitable

#SBATCH --time=00:10:00

set -euo pipefail # exit on error, treat unset vars as error
set -x # print commands for debugging

# --- Resolve PR_TEST_CONFIG_NAME from array index ---
#
# Array task N runs the config named on line N+1 of PR_TEST_CONFIGS_FILE.
# Every failure is reported on stderr and aborts the job with status 1.

# Print an error message on stderr and abort the job.
fail() {
  echo "Error: $*" >&2
  exit 1
}

[[ -n "${PR_TEST_CONFIGS_FILE:-}" ]] || fail "PR_TEST_CONFIGS_FILE not set"

[[ -n "${SLURM_ARRAY_TASK_ID:-}" ]] \
  || fail "SLURM_ARRAY_TASK_ID not set (are you running as an array job?)"

# The arithmetic below would evaluate a non-numeric value as a variable name.
[[ "$SLURM_ARRAY_TASK_ID" =~ ^[0-9]+$ ]] \
  || fail "SLURM_ARRAY_TASK_ID is not a non-negative integer: $SLURM_ARRAY_TASK_ID"

# Check readability up front so a missing list (for example a path that is not
# visible from this node) is reported as such instead of as an empty lookup.
[[ -r "$PR_TEST_CONFIGS_FILE" ]] \
  || fail "cannot read PR_TEST_CONFIGS_FILE: $PR_TEST_CONFIGS_FILE"

# SLURM_ARRAY_TASK_ID is 0-based; sed is 1-based
line=$((SLURM_ARRAY_TASK_ID + 1))
# Print the requested line and stop reading the file.
PR_TEST_CONFIG_NAME=$(sed -n "${line}{p;q;}" "$PR_TEST_CONFIGS_FILE")

[[ -n "$PR_TEST_CONFIG_NAME" ]] \
  || fail "failed to resolve config for SLURM_ARRAY_TASK_ID=$SLURM_ARRAY_TASK_ID from $PR_TEST_CONFIGS_FILE"

echo "Running CI check for config: $PR_TEST_CONFIG_NAME"

#---------- SCRIPT ----------
# Environment for the run: project storage root, Hugging Face cache location,
# full Hydra error output, and the pixi installation used to provide the tools.
export PROJECT_HOME_PATH=/storage_nvme_4/nano
export HF_HOME="$PROJECT_HOME_PATH/hf_cache"

# hydra errors
export HYDRA_FULL_ERROR=1

# pixi variables (pixi lives under the project storage root)
export PIXI_HOME="$PROJECT_HOME_PATH/pixi"
export PATH="$PIXI_HOME/bin:$PATH"
export XDG_DATA_HOME="$PIXI_HOME/data"
export XDG_CACHE_HOME="$PIXI_HOME/cache"
export XDG_STATE_HOME="$PIXI_HOME/state"

# Generate the pixi activation hook from inside PIXI_HOME. The subshell keeps
# the job's own working directory untouched, so no "cd back" is needed.
# The hook text is captured before it is evaluated: `eval "$(cmd)"` returns the
# status of eval, so a failing `pixi shell-hook` would otherwise eval an empty
# string and succeed silently.
pixi_hook=$(
  cd "$PIXI_HOME" || { echo "Failed to cd to $PIXI_HOME" >&2; exit 1; }
  pixi shell-hook
) || { echo "Failed to run pixi shell-hook" >&2; exit 1; }

# eval is intentional here: the hook is shell code emitted by pixi itself.
eval "$pixi_hook" || { echo "Failed to run pixi shell-hook" >&2; exit 1; }
#-------- SCRIPT END --------

# Rendezvous endpoint for torchrun: the first host reported for the allocation
# and a port in 40000-49999 derived from the job id.
# Assignment is kept separate from export because `export VAR=$(cmd)` discards
# the exit status of cmd, which would hide a failing scontrol.
allocated_hosts=$(scontrol show hostname "${SLURM_NODELIST}")
# Keep only the first line; done in-shell so no pipeline stage can be cut short.
MASTER_ADDR=${allocated_hosts%%$'\n'*}
if [[ -z "$MASTER_ADDR" ]]; then
  echo "Error: could not determine MASTER_ADDR from SLURM_NODELIST=${SLURM_NODELIST}" >&2
  exit 1
fi
MASTER_PORT=$((40000 + SLURM_JOB_ID % 10000))
export MASTER_ADDR MASTER_PORT

echo "Running training with config: $PR_TEST_CONFIG_NAME"
echo "MASTER_ADDR: $MASTER_ADDR"
echo "MASTER_PORT: $MASTER_PORT"

# SLURM_JOB_ID is used for both the port and the rendezvous id.
srun torchrun --nnodes="${SLURM_NNODES}" \
  --nproc-per-node="${SLURM_GPUS_ON_NODE}" \
  --rdzv-id="${SLURM_JOB_ID}" \
  --rdzv-backend=c10d \
  --rdzv-endpoint="${MASTER_ADDR}:${MASTER_PORT}" \
  main.py \
  --config-path=configs/pr_tests \
  --config-name="$PR_TEST_CONFIG_NAME"
163 changes: 163 additions & 0 deletions .github/workflows/run_tests_on_entropy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
name: PR Tests

# Runs every config in configs/pr_tests/*.yaml as one SLURM array job that is
# submitted from the self-hosted "entropy" runner.
# NOTE(review): pull_request runs execute PR code on the self-hosted runner;
# confirm that runs from forks require approval.
on:
  pull_request:
    branches:
      - main
  workflow_dispatch:

# The jobs only check out the repository, so keep GITHUB_TOKEN read-only.
permissions:
  contents: read

# One active run per PR (or per ref for manual runs); a newer run cancels the
# run that is still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  # 1) Discover configs
  discover-configs:
    runs-on: [self-hosted, entropy]
    outputs:
      configs: ${{ steps.find-configs.outputs.configs }}

    steps:
      - uses: actions/checkout@v4

      - name: Find PR test configs
        id: find-configs
        run: |
          echo "Searching for configs in configs/pr_tests/..."
          ls -la configs/pr_tests/ || echo "Directory not found or empty"

          # Build a JSON array of config names (file names without ".yaml").
          # find + sed is used instead of parsing ls output through xargs,
          # which mangles names containing quotes or backslashes; sort keeps
          # the order stable. A missing directory yields an empty array.
          configs=$(
            find configs/pr_tests -maxdepth 1 -name '*.yaml' 2>/dev/null \
              | sed -e 's|.*/||' -e 's|\.yaml$||' \
              | sort \
              | jq -R -s -c 'split("\n") | map(select(length > 0))'
          )
          echo "configs=$configs" >> "$GITHUB_OUTPUT"

          echo "=== Found PR test configs ==="
          echo "$configs" | jq -r '.[]'
          echo "Total: $(echo "$configs" | jq 'length') config(s)"

  # 2) Submit & manage one SLURM array for all configs
  run-pr-tests:
    needs: discover-configs
    runs-on: [self-hosted, entropy]
    env:
      CONFIGS_JSON: ${{ needs.discover-configs.outputs.configs }}

    steps:
      - uses: actions/checkout@v4

      - name: Submit SLURM array
        id: submit
        run: |
          echo "Configs JSON: $CONFIGS_JSON"

          # Write config names (one per line) into a file; array task N reads
          # line N+1 of this file.
          configs_file="$PWD/pr_tests_configs.txt"
          echo "$CONFIGS_JSON" | jq -r '.[]' > "$configs_file"

          num_configs=$(wc -l < "$configs_file")
          echo "Found $num_configs config(s)"

          if [ "$num_configs" -eq 0 ]; then
            echo "No PR test configs found, nothing to run."
            # No job_id set -> later steps will do nothing
            exit 0
          fi

          last_index=$((num_configs - 1))
          echo "Submitting SLURM array 0-$last_index"

          job_output=$(sbatch \
            --parsable \
            --array=0-"$last_index" \
            --job-name="pr_tests" \
            --output="slurm-%A_%a.out" \
            --export=ALL,PR_TEST_CONFIGS_FILE="$configs_file" \
            .github/scripts/run_remote_pr_check.sh)

          echo "Raw sbatch output: $job_output"

          # Handle cases like:
          #   123456
          #   123456;cluster
          #   123456_0
          #   123456_0;cluster
          tmp="${job_output%%;*}"   # strip anything after ';'
          job_id="${tmp%%_*}"       # strip anything after '_' (array task suffix)

          # Refuse to publish anything that is not a plain numeric job id, so
          # the later sacct/scancel calls never receive garbage.
          case "$job_id" in
            ''|*[!0-9]*)
              echo "Could not parse a job id from sbatch output: $job_output" >&2
              exit 1
              ;;
          esac

          echo "Parsed base job_id: $job_id"

          echo "job_id=$job_id" >> "$GITHUB_OUTPUT"

      - name: Wait for SLURM array
        if: steps.submit.outputs.job_id
        id: wait
        env:
          JOB_ID: ${{ steps.submit.outputs.job_id }}
        run: |
          job_id="$JOB_ID"
          echo "Waiting for SLURM array job $job_id to finish..."

          while true; do
            # -X: one record per array task (no job steps); -P: unpadded,
            # untruncated output. Only the first word of each state is kept so
            # that "CANCELLED by <uid>" collapses to "CANCELLED".
            states=$(sacct -X -n -P -j "$job_id" --format=State | awk 'NF {print $1}' | sort -u)
            echo "Current array task states: $(echo "$states" | tr '\n' ' ')"

            # No accounting record yet: keep polling.
            if [ -z "$states" ]; then
              sleep 10
              continue
            fi

            # The array is finished only when no task is in a non-terminal
            # state; looking at a single record would end the wait as soon as
            # one task finished.
            if echo "$states" | grep -Eq '^(PENDING|RUNNING|CONFIGURING|COMPLETING|SUSPENDED|REQUEUED|RESIZING)$'; then
              sleep 10
              continue
            fi

            if [ "$states" = "COMPLETED" ]; then
              echo "Array job completed successfully."
              exit 0
            fi

            echo "Array job ended with state(s): $(echo "$states" | tr '\n' ' ')"
            exit 1
          done

      # Without this step a cancelled run (for example one superseded through
      # the concurrency group) would leave its SLURM tasks running.
      - name: Cancel SLURM array when the run is cancelled
        if: cancelled() && steps.submit.outputs.job_id
        env:
          JOB_ID: ${{ steps.submit.outputs.job_id }}
        run: |
          echo "Workflow run cancelled; cancelling SLURM array job $JOB_ID"
          scancel "$JOB_ID" || echo "scancel failed (the job may already have finished)"

      - name: Summarize array results and show failed logs
        if: always() && steps.submit.outputs.job_id
        env:
          JOB_ID: ${{ steps.submit.outputs.job_id }}
        run: |
          job_id="$JOB_ID"
          configs_file="$PWD/pr_tests_configs.txt"

          if [ ! -f "$configs_file" ]; then
            echo "No configs file found at $configs_file, nothing to summarize."
            exit 0
          fi

          echo "=== Per-config SLURM task states ==="

          i=0
          failures=0
          while IFS= read -r cfg; do
            # SLURM child job id: <arrayJobId>_<taskIndex>
            task_job="${job_id}_$i"

            # -X/-P: report the task record itself with the full state name
            # (the default column width truncates long states).
            state=$(sacct -X -n -P -j "$task_job" --format=State | head -1 | awk '{print $1}')
            echo "[$i] config=${cfg} state=${state}"

            if [ "$state" != "COMPLETED" ]; then
              failures=$((failures + 1))
              # Must match the --output pattern given to sbatch (slurm-%A_%a.out).
              out_file="slurm-${job_id}_${i}.out"

              echo "---- BEGIN log for FAILED config: ${cfg} (task ${i}) ----"
              if [ -f "$out_file" ]; then
                cat "$out_file"
              else
                echo "No slurm output file found (expected: ${out_file})"
              fi
              echo "---- END log for FAILED config: ${cfg} ----"
            fi

            i=$((i + 1))
          done < "$configs_file"

          if [ "$failures" -gt 0 ]; then
            echo "Some configs failed: $failures failing task(s)."
            exit 1
          else
            echo "All configs completed successfully."
          fi
38 changes: 38 additions & 0 deletions configs/pr_tests/test_core.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# PR test config "test_core": a short training run used by the PR checks.
# The defaults list selects the entropy cluster, tiny model, llama trainer and
# c4 dataset entries, with the checkpoints and eval entries set to "none".
defaults:
- ../_cluster@_here_: entropy
- ../_model@_here_: tiny
- ../_trainer@_here_: llama
- ../_dataset@_here_: c4
- ../_checkpoints@_here_: none
- ../_misc@_here_: default
- ../_eval@_here_: none

# Small sequence length and batch size.
common:
sequence_length: 128
batch_size: 16

# 100 steps, no gradient accumulation.
trainer:
gradient_accumulation_steps: 1
n_steps: 100
learning_rate: 1e-3

checkpoint:
save:
type: huggingface
path: checkpoint

infrastructure:

# Run name and tags passed to the metric logger.
metric_logger:
name: test_core
tags:
- nano
- pr_test
- core
- train

# Same limits as the #SBATCH directives in
# .github/scripts/run_remote_pr_check.sh (10 minutes, 2 GPUs, 100G per GPU).
slurm:
time: "00:10:00"
gres: gpu:2
# The job name is interpolated from the metric logger name above.
job-name: ${infrastructure.metric_logger.name}
mem_per_gpu: 100G
41 changes: 41 additions & 0 deletions configs/pr_tests/test_eval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# PR test config stored as test_eval.yaml.
# NOTE(review): despite the file name, eval is currently disabled here (the
# ../_eval entry is "none" and the eval section at the bottom is commented
# out) and the run is logged as "test_core2" — presumably a placeholder until
# an eval test is enabled; confirm.
defaults:
- ../_cluster@_here_: entropy
- ../_model@_here_: tiny
- ../_trainer@_here_: llama
- ../_dataset@_here_: c4
- ../_checkpoints@_here_: none
- ../_misc@_here_: default
- ../_eval@_here_: none

# Small sequence length and batch size.
common:
sequence_length: 128
batch_size: 16

# 100 steps, no gradient accumulation.
trainer:
gradient_accumulation_steps: 1
n_steps: 100
learning_rate: 1e-3

checkpoint:
save:
type: huggingface
path: checkpoint

infrastructure:

# Run name and tags passed to the metric logger.
metric_logger:
name: test_core2
tags:
- nano
- pr_test
- core
- core2

# Same limits as the #SBATCH directives in
# .github/scripts/run_remote_pr_check.sh (10 minutes, 2 GPUs, 100G per GPU).
slurm:
time: "00:10:00"
gres: gpu:2
# The job name is interpolated from the metric logger name above.
job-name: ${infrastructure.metric_logger.name}
mem_per_gpu: 100G

# Disabled eval settings, kept for reference.
# eval:
# limit: 10