diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 8f7bc741f..def6b99ac 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -38,7 +38,7 @@ /tests @andylin-hao /toolkits/auto_placement @i-Taozi -/toolkits/ckpt_convertor @Louis-J +/toolkits/ckpt_convertor @qurakchin /.pre-commit-config.yaml @andylin-hao /pyproject.toml @andylin-hao diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 0145478f0..b928e91b3 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -18,6 +18,22 @@ body: Please provide a clear and concise description of what the bug is. validations: required: true +- type: textarea + attributes: + label: Log file + description: | + Please upload the log file or paste the full log messages when the bug happens. + value: | + You can find the log file in the `logs/` folder or the $output_dir/$experiment_name folder (defined in the yaml config) if you are using our example scripts. + + Log file: + + If you cannot find the log, please provide the full log messages here. 
+ ``` + + ``` + validations: + required: true - type: textarea attributes: label: Environment @@ -28,13 +44,13 @@ body: pip list | grep -E "torch|sglang|vllm|ray|transformers" nvidia-smi ``` - Additionally, please provide the RLinf version, Megatron version if you are using Megatron, and docker image version if you are using our images + Additionally, please provide the RLinf version, Megatron version if you are using Megatron, and docker image tag if you are using our images value: | Python version: PIP list: RLinf version: Megatron version: - Docker image version: + Docker image tag: nvidia-smi: validations: required: true diff --git a/.github/workflows/auto_placement.yml b/.github/workflows/auto_placement.yml deleted file mode 100644 index 346a7226f..000000000 --- a/.github/workflows/auto_placement.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: Math Auto Placement - -on: - push: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '!*.yaml' - - '!*.toml' - - '!ray_utils/**' - - '!requirements/**' - - pull_request: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '*.yaml' - - '*.toml' - - '!ray_utils/**' - - '!requirements/**' - -permissions: - contents: read - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - qwen-grpo-test: - runs-on: rlinf - container: - image: rlinf/rlinf:math-rlinf0.1-torch2.6.0-sglang0.4.6.post5-vllm0.8.5-megatron0.13.0-te2.1 - volumes: - - /mnt/public/dataset:/workspace/dataset - - /mnt/public/tokenizer:/workspace/tokenizer - options: --gpus="all" --shm-size=80g - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: auto-placement - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/auto_placement/run_auto_placement.sh \ No newline at end of file diff --git 
a/.github/workflows/code-test.yml b/.github/workflows/code-test.yml new file mode 100644 index 000000000..0ec8ca3ef --- /dev/null +++ b/.github/workflows/code-test.yml @@ -0,0 +1,373 @@ +name: Code Test + +on: + push: + branches: + - "release/v[0-9].[0-9]" + - main + pull_request: + branches: [main] + types: [synchronize, labeled] + workflow_dispatch: + +concurrency: + group: code-test-${{ github.ref }} + cancel-in-progress: true + +jobs: + # =============================================== check changes ==================================================== + check-changes: + runs-on: ubuntu-latest + outputs: + file_filter: ${{ steps.filter.outputs.file_filter }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Fail if the PR does not have the 'run-ci' label + if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'run-ci') + run: | + echo "This pull request does not have the 'run-ci' label. Failing the workflow." + exit 1 + + - name: Fail if the PR is a draft + if: github.event_name == 'pull_request' && github.event.pull_request.draft == true + run: | + echo "This pull request is a draft. Failing the workflow." 
+ exit 1 + + - name: Detect file changes + id: filter + uses: dorny/paths-filter@v3 + with: + filters: | + file_filter: + - '**/*.py' + - 'tests/**' + - '.github/workflows/*.yml' + - '!docs/**' + - '!README.md' + - '*.yaml' + - '*.toml' + - '!ray_utils/**' + - '!requirements/**' + + # =============================================== unit tests ==================================================== + + unit-tests-cuda: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: reason + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Install pytest + run: | + source switch_env reason + uv pip install pytest + + - name: Run pytest + timeout-minutes: 20 + run: | + export PYTHONPATH=$(pwd):$(pwd)/megatron:$(pwd)/tests/unit_tests + source switch_env reason + pytest tests/unit_tests + + - name: Run doctest + timeout-minutes: 20 + run: | + source switch_env reason + pytest --doctest-modules rlinf/scheduler + + unit-tests-cpu: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Install dependencies + run: | + pip install uv + uv venv + source .venv/bin/activate + UV_TORCH_BACKEND=auto uv sync + uv pip install pytest + + - name: Run pytest + timeout-minutes: 20 + run: | + export PYTHONPATH=$(pwd):$(pwd)/megatron:$(pwd)/tests/unit_tests + source .venv/bin/activate + pytest tests/unit_tests + + # =============================================== reason e2e tests ==================================================== + + reason-qwen-grpo-test: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: reason + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Megatron SGLang Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-collocated-mg-sgl + + 
- name: Megatron vLLM Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-collocated-mg-vllm + + - name: Megatron SGLang Pipeline mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-pipeline-mg-sgl + + - name: Megatron vLLM Pipeline mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-pipeline-mg-vllm + + - name: FSDP SGLang Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-collocated-fsdp-sgl + + - name: FSDP vLLM Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-collocated-fsdp-vllm + + + reason-qwen-grpo-test-rollout-logprobs: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: reason + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Megatron SGLang Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-collocated-mg-sgl-rollout-logprobs + + - name: Megatron vLLM Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-collocated-mg-vllm-rollout-logprobs + + - name: Megatron SGLang Pipeline mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-pipeline-mg-sgl-rollout-logprobs + + - name: Megatron vLLM Pipeline mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash 
tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-pipeline-mg-vllm-rollout-logprobs + + - name: FSDP SGLang Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs + + - name: FSDP vLLM Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs + + coding-online-rl-qwen-ppo-test: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: reason + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Install dependencies + run: | + pip install httpx asyncio fuzzywuzzy + + - name: SGLang Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/coding_online_rl/run.sh + + qwen-vl-grpo-test: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: reason + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: FSDP SGLang Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/reasoning/run.sh qwen2.5-vl-3b-grpo-collocated-fsdp-sgl + + - name: FSDP vLLM Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/reasoning/run.sh qwen2.5-vl-3b-grpo-collocated-fsdp-vllm + + # =============================================== embodied e2e tests ==================================================== + + embodied-maniskill-ppo-openvla-test: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: embodied + steps: + - name: Checkout code + uses: actions/checkout@v5 + - name: OpenVLA test + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env openvla + bash 
tests/e2e_tests/embodied/run.sh maniskill_ppo_openvla + + embodied-maniskill-grpo-openvlaoft-test: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: embodied + steps: + - name: Checkout code + uses: actions/checkout@v5 + - name: OpenVLA test + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env openvla-oft + cp -r /workspace/dataset/maniskill_assets/assets ${REPO_PATH}/rlinf/envs/maniskill/ + bash tests/e2e_tests/embodied/run.sh maniskill_grpo_openvlaoft + + embodied-libero-goal-grpo-openvlaoft-test: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: embodied + steps: + - name: Checkout code + uses: actions/checkout@v5 + - name: OpenVLA-OFT test + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env openvla-oft + bash tests/e2e_tests/embodied/run.sh libero_goal_grpo_openvlaoft + + embodied-libero-130-grpo-openvlaoft-test: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: embodied + steps: + - name: Checkout code + uses: actions/checkout@v5 + - name: OpenVLA-OFT test + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env openvla-oft + bash tests/e2e_tests/embodied/run.sh libero_130_grpo_openvlaoft + + # =============================================== auto placement tests ==================================================== + + static-auto-placement-test: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: reason + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: auto-placement + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/auto_placement/run.sh + +# =============================================== finale ==================================================== + + pr-test-finish: + needs: [ + check-changes, + + # Unit tests + unit-tests-cuda, unit-tests-cpu, + + 
# Reason e2e tests + reason-qwen-grpo-test, reason-qwen-grpo-test-rollout-logprobs, + coding-online-rl-qwen-ppo-test, qwen-vl-grpo-test, + + # Embodied e2e tests + embodied-maniskill-ppo-openvla-test, embodied-maniskill-grpo-openvlaoft-test, embodied-libero-goal-grpo-openvlaoft-test,embodied-libero-130-grpo-openvlaoft-test, + + # Auto placement tests + static-auto-placement-test + ] + if: always() + runs-on: ubuntu-latest + steps: + # Refer to https://github.com/sgl-project/sglang/blob/main/.github/workflows/pr-test.yml + - name: Check all dependent job statuses + run: | + # Convert the 'needs' context to a JSON string + json_needs='${{ toJson(needs) }}' + + # Get a list of all job names from the JSON keys + job_names=$(echo "$json_needs" | jq -r 'keys_unsorted[]') + + for job in $job_names; do + # For each job, extract its result + result=$(echo "$json_needs" | jq -r --arg j "$job" '.[$j].result') + + # Print the job name and its result + echo "$job: $result" + + # Check for failure or cancellation and exit if found + if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then + echo "The above jobs failed." 
+ exit 1 + fi + done + + # If the loop completes, all jobs were successful + echo "All jobs completed successfully" + exit 0 diff --git a/.github/workflows/commit_check.yml b/.github/workflows/commit_check.yml index e1391abe6..7f17365ce 100644 --- a/.github/workflows/commit_check.yml +++ b/.github/workflows/commit_check.yml @@ -12,7 +12,7 @@ jobs: contents: read pull-requests: write steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: ref: ${{ github.event.pull_request.head.sha }} # checkout PR HEAD commit fetch-depth: 0 # required for merge-base check diff --git a/.github/workflows/embodied_e2e.yml b/.github/workflows/embodied_e2e.yml deleted file mode 100644 index 259c76d41..000000000 --- a/.github/workflows/embodied_e2e.yml +++ /dev/null @@ -1,57 +0,0 @@ -name: Embodied End2End - -on: - push: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '!*.yaml' - - '!*.toml' - - '!ray_utils/**' - - '!requirements/**' - - pull_request: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '!*.yaml' - - '!*.toml' - - '!ray_utils/**' - - '!requirements/**' - -permissions: - contents: read - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - openvla-ppo-test: - runs-on: rlinf - container: - image: rlinf/rlinf:agentic-openvla-rlinf0.1-torch2.5.1-transformer4.40 - volumes: - - /mnt/public/dataset:/workspace/dataset - - /mnt/public/tokenizer:/workspace/tokenizer - options: --gpus="all" --shm-size=2g -e NVIDIA_DRIVER_CAPABILITIES="compute,utility,graphics" - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - name: OpenVLA test - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/embodied/run_openvla.sh \ No newline at end of file diff --git a/.github/workflows/lint.yml 
b/.github/workflows/lint.yml index e21ced73b..921232e28 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -9,6 +9,6 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v5 - uses: actions/setup-python@v3 - uses: pre-commit/action@v3.0.1 \ No newline at end of file diff --git a/.github/workflows/math_e2e.yml b/.github/workflows/math_e2e.yml deleted file mode 100644 index 921dc4076..000000000 --- a/.github/workflows/math_e2e.yml +++ /dev/null @@ -1,86 +0,0 @@ -name: Math End2End - -on: - push: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '!*.yaml' - - '!*.toml' - - '!ray_utils/**' - - '!requirements/**' - - pull_request: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '*.yaml' - - '*.toml' - - '!ray_utils/**' - - '!requirements/**' - -permissions: - contents: read - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - qwen-grpo-test: - runs-on: rlinf - container: - image: rlinf/rlinf:math-rlinf0.1-torch2.6.0-sglang0.4.6.post5-vllm0.8.5-megatron0.13.0-te2.1 - volumes: - - /mnt/public/dataset:/workspace/dataset - - /mnt/public/tokenizer:/workspace/tokenizer - options: --gpus="all" --shm-size=80g - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: SGLang Collocated mode - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/math/sglang/run_collocated.sh - - - name: vLLM Collocated mode - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/math/vllm/run_collocated.sh - - - name: SGLang Pipeline mode - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/math/sglang/run_pipeline.sh - - qwen-grpo-test-sglang044: - runs-on: rlinf - container: - image: 
rlinf/rlinf:math-rlinf0.1-torch2.5.1-sglang0.4.4-vllm0.7.1-megatron0.11.0-te2.1 - volumes: - - /mnt/public/dataset:/workspace/dataset - - /mnt/public/tokenizer:/workspace/tokenizer - options: --gpus="all" --shm-size=80g - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: SGLang Collocated mode - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/math/sglang/run_collocated.sh diff --git a/.github/workflows/math_e2e_rollout_logprobs.yml b/.github/workflows/math_e2e_rollout_logprobs.yml deleted file mode 100644 index fde67e297..000000000 --- a/.github/workflows/math_e2e_rollout_logprobs.yml +++ /dev/null @@ -1,68 +0,0 @@ -name: Math End2End Rollout Logprobs - -on: - push: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '!*.yaml' - - '!*.toml' - - '!ray_utils/**' - - '!requirements/**' - - pull_request: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '*.yaml' - - '*.toml' - - '!ray_utils/**' - - '!requirements/**' - -permissions: - contents: read - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - qwen-grpo-test: - runs-on: rlinf - container: - image: rlinf/rlinf:math-rlinf0.1-torch2.6.0-sglang0.4.6.post5-vllm0.8.5-megatron0.13.0-te2.1 - volumes: - - /mnt/public/dataset:/workspace/dataset - - /mnt/public/tokenizer:/workspace/tokenizer - options: --gpus="all" --shm-size=80g - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: SGLang Collocated mode - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/math/sglang/run_collocated.sh qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml - - - name: vLLM Collocated mode - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/math/vllm/run_collocated.sh qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml - - - name: 
SGLang Pipeline mode - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/math/sglang/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml deleted file mode 100644 index aa7747de6..000000000 --- a/.github/workflows/unit_test.yml +++ /dev/null @@ -1,74 +0,0 @@ -name: Unit Tests - -on: - push: - branches: - - 'v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '!*.yaml' - - '!*.toml' - - pull_request: - branches: - - 'v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '*.yaml' - - '*.toml' - -permissions: - contents: read - -jobs: - unit-tests-cuda: - runs-on: rlinf - container: - image: rlinf/rlinf:math-rlinf0.1-torch2.6.0-sglang0.4.6.post5-vllm0.8.5-megatron0.13.0-te2.1 - volumes: - - /mnt/public/dataset:/workspace/dataset - - /mnt/public/tokenizer:/workspace/tokenizer - options: --gpus="all" --shm-size=80g - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install pytest - run: pip install pytest - - - name: Run pytest - run: | - export PYTHONPATH=$(pwd):$(pwd)/megatron:$(pwd)/tests/unit_tests - pytest tests/unit_tests - - - name: Run doctest - run: | - pytest --doctest-modules rlinf/scheduler - - unit-tests-cpu: - runs-on: ubuntu-latest - container: - image: rlinf/rlinf:math-rlinf0.1-torch2.6.0-sglang0.4.6.post5-vllm0.8.5-megatron0.13.0-te2.1 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install pytest - run: pip install pytest - - - name: Run pytest - run: | - export PYTHONPATH=$(pwd):$(pwd)/megatron:$(pwd)/tests/unit_tests - pytest tests/unit_tests \ No newline at end of file diff --git a/.gitignore b/.gitignore index c6d3a86ea..49fc83734 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,5 @@ logs/* *.so .venv uv.lock -assets/ \ No newline at end of file 
+assets/ +kernel_meta/ \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b78ba74a6..9230a8f13 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,9 +3,8 @@ repos: rev: "v0.12.9" hooks: - id: ruff - args: ["--preview"] + args: ["--preview", "--fix"] - id: ruff-format - args: ["--check"] - repo: https://github.com/commit-check/commit-check rev: "v0.10.2" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 314cd313a..d151594d5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,7 +21,7 @@ All types of contributions are encouraged and valued. See the [Table of Contents ## I Want To Contribute -> All contributions (including the project team's contribution) takes the form of [GitHub Pull Requests](https://github.com/RLinf/RLinf/pulls). +All contributions (including the project team's contribution) takes the form of [GitHub Pull Requests](https://github.com/RLinf/RLinf/pulls). To contribute, first you need to [fork the repository](https://github.com/RLinf/RLinf/fork) and clone it to your local machine. Then, create a new development branch from `main` for your contribution: ```bash @@ -29,7 +29,7 @@ git checkout main git pull origin main git checkout -b feature/your-feature-name ``` -After you have made your changes, commit them with a clear and descriptive commit message: +After you have made your changes, commit them with a clear and descriptive commit message. The `-s` flag is necessary, which adds a "Signed-off-by" line at the end of the commit message: ```bash git add . git commit -m "feat(embodied): add a clear and descriptive commit message" -s @@ -47,8 +47,6 @@ Where `` commonly includes the following (others can be found in the [Conv - `test`: adding missing tests, refactoring tests; no production code change - `chore`: updating build tasks, package manager configs, etc; no production code change. 
-The `-s` flag is necessary, which adds a "Signed-off-by" line at the end of the commit message, certifying that you have the right to submit this work under the project's license. - Finally, before pushing your changes to your fork, please run the pre-commit checks to ensure that your code adheres to the project's coding standards: ```bash pip install pre-commit diff --git a/README.md b/README.md index 9858cc577..41b8193a6 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,13 @@ +
+ +[![English](https://img.shields.io/badge/lang-English-blue.svg)](README.md) +[![简体中文](https://img.shields.io/badge/语言-简体中文-red.svg)](README.zh-CN.md) + +
+

RLinf: Reinforcement Learning Infrastructure for Agentic AI

@@ -23,8 +30,11 @@ RLinf is a flexible and scalable open-source infrastructure designed for post-tr ## What's NEW! +- [2025/10] The RLinf Algorithm Technical Report [RLinf-VLA: A Unified and Efficient Framework for VLA+RL Training](https://arxiv.org/abs/2510.06710) is released. +- [2025/09] [Example Gallery](https://rlinf.readthedocs.io/en/latest/rst_source/examples/index.html) is updated, users can find various off-the-shelf examples! +- [2025/09] The paper [RLinf: Flexible and Efficient Large-scale Reinforcement Learning via Macro-to-Micro Flow Transformation](https://arxiv.org/abs/2509.15965) is released. +- [2025/09] The [report on RLinf by Machine Heart](https://mp.weixin.qq.com/s/Xtv4gDu3lhDDGadLrzt6Aw) is released. - [2025/08] RLinf is open-sourced. The formal v0.1 will be released soon. -- [2025/09] The paper [RLinf: Flexible and Efficient Large-scale Reinforcement Learning via Macro-to-Micro Flow Transformation](https://arxiv.org/abs/2509.15965) is released. ## Key Features @@ -64,90 +74,149 @@ RLinf is a flexible and scalable open-source infrastructure designed for post-tr ## Main Results ### Embodied Intelligence +
- +
- + + +
OpenVLA-OFT model results on ManiSkill3 + mani_openvla +
+ OpenVLA +
+ mani_openvlaoft +
+ OpenVLA-OFT +
+
+ +- Training curves on ManiSkill “PutOnPlateInScene25Mani-v3” with OpenVLA and +OpenVLA-OFT models, using PPO and GRPO algorithms. PPO consistently outperforms GRPO +and exhibits greater stability. + +
+ - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - + + + + + + - - - - - + + + + + + - - - - - + - - - - - + + + + + + - - - - - + + + + + + + + + + + + + +
ModelVisionSemanticPositionAverage Evaluation results on ManiSkill. Values denote success rates
In-DistributionOut-Of-Distribution
VisionSemanticExecutionAvg.
OpenVLA (Base)53.91%38.75%35.94%42.11%39.10%
HFRL4VLA (PPO)93.75%80.47%75.00%81.77%79.15%
HFrl4vla76.6%75.4%77.6%76.1%HFOpenVLA (RLinf-GRPO)84.38%74.69%72.99%77.86%75.15%
HFGRPO-OpenVLA-OFT84.6%51.6%42.9%61.5%HFOpenVLA (RLinf-PPO)96.09%82.03%78.35%85.42%81.93%
HFPPO-OpenVLA-OFT80.5%56.6%56.1%64.5%
HFPPO-OpenVLA82.0%80.6%89.3%82.2%OpenVLA-OFT (Base)28.13%27.73%12.95%11.72%18.29%
HFGRPO-OpenVLA74.7%74.4%81.6%75.5%HFOpenVLA-OFT (RLinf-GRPO)94.14%84.69%45.54%44.66%60.64%
HFOpenVLA-OFT (RLinf-PPO)97.66%92.11%64.84%73.57%77.05%
+
+ - +
+
- + - - - - - - + + + + + + + - - - - - - + + + + + + + - - - - - - + + + + + + + - - - - - - + + + + + + +
OpenVLA-OFT model results on LIBEROEvaluation results of the unified model on the five LIBERO task groups
ModelSpatialGoalObjectLongAverageModelSpatialObjectGoal1090Avg.
OpenVLA-OFT-SFT (one-shot)56.5%45.6%25.6%9.7%34.4%HFOpenVLA-OFT (Base)72.18%71.48%64.06%48.44%70.97%65.43%
OpenVLA-OFT-RLinf99.0%99.0%99.0%94.4%97.9%HFOpenVLA-OFT (RLinf-GRPO)99.40%99.80%98.79%93.95%98.59%98.11%
Improvement+42.5%+53.4%+73.4%+84.7%+63.5%Δ Improvement+27.22+28.32+34.73+45.51+27.62+32.68
@@ -159,15 +228,15 @@ RLinf is a flexible and scalable open-source infrastructure designed for post-tr ### Math Reasoning
- +
- - - + + + @@ -204,15 +273,15 @@ RLinf is a flexible and scalable open-source infrastructure designed for post-tr \* We retrain the model using the default settings for 600 steps.
-
1.5B model results
ModelAIME 24AIME 25GPQA-diamondAIME 24AIME 25GPQA-diamond Average
+
- - - + + + @@ -329,7 +398,20 @@ If you find **RLinf** helpful, please cite the paper: } ``` -If you use RL+VLA in RLinf, you can also cite our empirical study paper: +If you use RL+VLA in RLinf, you can also cite our technical report and empirical study paper: + +```bibtex +@misc{zang2025rlinfvlaunifiedefficientframework, + title={RLinf-VLA: A Unified and Efficient Framework for VLA+RL Training}, + author={Hongzhi Zang and Mingjie Wei and Si Xu and Yongji Wu and Zhen Guo and Yuanqing Wang and Hao Lin and Liangzhi Shi and Yuqing Xie and Zhexuan Xu and Zhihao Liu and Kang Chen and Wenhao Tang and Quanlu Zhang and Weinan Zhang and Chao Yu and Yu Wang}, + year={2025}, + eprint={2510.06710}, + archivePrefix={arXiv}, + primaryClass={cs.RO}, + url={https://arxiv.org/abs/2510.06710}, +} +``` + ```bibtex @misc{liu2025rlbringvlageneralization, title={What Can RL Bring to VLA Generalization? An Empirical Study}, diff --git a/README.zh-CN.md b/README.zh-CN.md new file mode 100644 index 000000000..4eb14f6da --- /dev/null +++ b/README.zh-CN.md @@ -0,0 +1,442 @@ +
+ RLinf-logo +
+ +
+ +Hugging Face + + +Ask DeepWiki + +
+ +
+ +[![English](https://img.shields.io/badge/lang-English-blue.svg)](README.md) +[![简体中文](https://img.shields.io/badge/语言-简体中文-red.svg)](README.zh-CN.md) + +
+ +

+ RLinf: 为Agentic AI而生的强化学习框架 +

+ +RLinf 是一个灵活且可扩展的开源框架,专为利用强化学习进行基础模型的后训练而设计。名称中的 “inf” 既代表 `Infrastructure`,强调其作为新一代训练坚实基础的作用;也代表 `Infinite`,寓意其支持开放式学习、持续泛化以及智能发展的无限可能。 + +
+ RLinf-overview +
+ + +## 最新动态 +- [2025/10] RLinf算法技术报告 [《RLinf-VLA:一个统一且高效的VLA+RL训练框架》](https://arxiv.org/abs/2510.06710) 已正式发布。 +- [2025/09] [示例库](https://rlinf.readthedocs.io/en/latest/rst_source/examples/index.html) 已更新,用户可以在其中找到多种可直接使用的示例! +- [2025/09] 我们的论文 [《RLinf: Flexible and Efficient Large-scale Reinforcement Learning via Macro-to-Micro Flow Transformation》](https://arxiv.org/abs/2509.15965)已正式发布。 +- [2025/09] 机器之心关于 RLinf 的报道[《首个为具身智能而生的大规模强化学习框架RLinf!清华、北京中关村学院、无问芯穹等重磅开源》](https://mp.weixin.qq.com/s/Xtv4gDu3lhDDGadLrzt6Aw)已经发布。 +- [2025/08] RLinf 已经开源,正式的 v0.1 版本即将发布。 + + +## ✨ 核心特性 + + +**RLinf 的独特之处在于:** +- 宏工作流到微执行流的映射机制(Macro-to-Micro Flow):一种全新的 M2Flow 范式,通过微观层次的执行流来驱动宏观层次的逻辑流,实现逻辑工作流构建(可编程)与物理通信和调度(高效性)的解耦。 + +- 灵活的执行模式 + + - 共享式(Collocated Mode):用户可以配置组件是否同时常驻于 GPU 内存,或通过卸载 / 重新加载机制交替使用 GPU。 + - 分离式(Disaggregated Mode):组件既可以顺序运行(可能导致 GPU 空闲),也可以以流水线方式执行,从而确保所有 GPU 都处于忙碌状态。 + - 混合式(Hybrid Mode):进一步扩展了灵活性,支持自定义组合不同的放置形式。典型案例是 Generator 和 GPU-based Simulator 执行分离式细粒度流水,二者与 Inference 和 Trainer 执行共享式。 + +- 自动调度策略: 根据训练任务自动选择最合适的执行模式,无需手动分配资源。 + +- 具身智能体支持 + - 主流 VLA 模型的快速自适应支持: [OpenVLA](https://github.com/openvla/openvla), [OpenVLA-OFT](https://github.com/moojink/openvla-oft), [π₀](https://github.com/Physical-Intelligence/openpi) 和 [π₀.₅](https://github.com/Physical-Intelligence/openpi). + - 支持主流基于 CPU 与 GPU 的模拟器(通过标准化 RL 接口): [ManiSkill3](https://github.com/haosulab/ManiSkill), [LIBERO](https://github.com/Lifelong-Robot-Learning/LIBERO). 
+ - 首次实现对带有 flow-matching action expert 的 $\pi_0$ 和 $\pi_{0.5}$ 模型家族的 RL 微调。 + +**RLinf 的高效性体现在:** + +- 细粒度流水化的混合式模式: 相较于其他框架,实现了 120%+ 的吞吐量提升。 +- 秒级显卡自动扩缩: 可动态扩展训练资源,支持在数秒内完成 GPU 切换,在保持 RL 算法 on-policy 特性的同时,进一步提升 20–40% 的效率。 + +**RLinf 的灵活性与易用性体现在:** + +- 多后端集成 + + - FSDP + Hugging Face: 快速适配新模型与新算法,非常适合初学者和快速原型开发。 + - Megatron + SGLang: 针对大规模训练进行了优化,为专家用户提供最大化效率。 + +- 自适应通信: 通过异步通信通道实现高效交互。 + +- 内置支持主流 RL 方法: 包括 [PPO](https://arxiv.org/abs/1707.06347), [GRPO](https://arxiv.org/abs/2402.03300), [DAPO](https://arxiv.org/abs/2503.14476), [Reinforce++](https://arxiv.org/abs/2501.03262) 等。 + +## 主要成果 +### 具身智能 + + +
+
7B model results
ModelAIME 24AIME 25GPQA-diamondAIME 24AIME 25GPQA-diamond Average
+ + + + +
+ mani_openvla +
+ OpenVLA +
+ mani_openvlaoft +
+ OpenVLA-OFT +
+
+ +- 在 ManiSkill 环境 “PutOnPlateInScene25Mani-v3” 上,使用 OpenVLA 与 OpenVLA-OFT 模型进行训练。结果显示,在 PPO 与 GRPO 算法的对比中,PPO 始终表现优于 GRPO,且训练过程更加稳定。 + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
在 ManiSkill 上的评测结果。表中数值表示任务的成功率(Success Rate)
In-DistributionOut-Of-Distribution
VisionSemanticExecutionAvg.
OpenVLA (Base)53.91%38.75%35.94%42.11%39.10%
HFRL4VLA (PPO)93.75%80.47%75.00%81.77%79.15%
HFOpenVLA (RLinf-GRPO)84.38%74.69%72.99%77.86%75.15%
HFOpenVLA (RLinf-PPO)96.09%82.03%78.35%85.42%81.93%
OpenVLA-OFT (Base)28.13%27.73%12.95%11.72%18.29%
HFOpenVLA-OFT (RLinf-GRPO)94.14%84.69%45.54%44.66%60.64%
HFOpenVLA-OFT (RLinf-PPO)97.66%92.11%64.84%73.57%77.05%
+
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
统一模型在五个 LIBERO 任务组上的评测结果
ModelSpatialObjectGoal1090Avg.
HFOpenVLA-OFT (Base)72.18%71.48%64.06%48.44%70.97%65.43%
HFOpenVLA-OFT (RLinf-GRPO)99.40%99.80%98.79%93.95%98.59%98.11%
Δ Improvement+27.22+28.32+34.73+45.51+27.62+32.68
+
+ + +- RLinf 同时支持 PPO 与 GRPO 算法,为视觉-语言-动作(Vision-Language-Action, VLA)模型提供最先进的训练能力。 +- 该框架与主流具身智能基准测试(如 ManiSkill3 与 LIBERO)无缝集成,并在多样化的评测指标上均取得了优异表现。 + + +### 数学推理 + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
1.5B model results
ModelAIME 24AIME 25GPQA-diamondAverage
HFDeepSeek-R1-Distill-Qwen-1.5B (base model)28.3324.9027.4526.89
HFDeepMath-1.5B37.8030.4232.1133.44
HFDeepScaleR-1.5B-Preview40.4130.9327.5432.96
HFAReaL-1.5B-Preview-Stage-340.7331.5628.1033.46
AReaL-1.5B-retrain*44.4234.2733.8137.50
HFFastCuRL-1.5B-V343.6532.4935.0037.05
HFRLinf-math-1.5B48.4435.6338.4640.84
+
+ +\* 我们使用默认设置对模型进行了 600 步的重新训练。 + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
7B model results
ModelAIME 24AIME 25GPQA-diamondAverage
HFDeepSeek-R1-Distill-Qwen-7B (base model)54.9040.2045.4846.86
HFAReaL-boba-RL-7B61.6649.3846.9352.66
HFSkywork-OR1-7B66.8752.4944.4354.60
HFPolaris-7B-Preview68.5551.2443.8854.56
HFAceMath-RL-Nemotron-7B67.3055.0045.5755.96
HFRLinf-math-7B68.3352.1948.1856.23
+
+ +- RLinf 在数学推理任务上实现了当前最先进的性能,在多个基准测试(AIME 24、AIME 25、GPQA-diamond)中,1.5B 与 7B 规模的模型均稳定超越现有方法。 + +## 路线图 + +### 1. 系统级增强 +- [ ] 支持异构 GPU + +- [ ] 支持异步流水线执行 + +- [ ] 支持专家混合(Mixture of Experts, MoE) + +- [ ] 支持 vLLM 推理后端 + +### 2. 应用级扩展 +- [ ] 支持视觉-语言模型(VLMs)训练 + +- [ ] 支持深度搜索智能体训练 + +- [ ] 支持多智能体训练 +- [ ] 支持更多具身模拟器的集成 (如 [Meta-World](https://github.com/Farama-Foundation/Metaworld), [GENESIS](https://github.com/Genesis-Embodied-AI/Genesis), [RoboTwin](https://github.com/RoboTwin-Platform/RoboTwin)) +- [ ] 支持更多VLA模型,比如 [GR00T](https://github.com/NVIDIA/Isaac-GR00T), [WALL-OSS](https://huggingface.co/x-square-robot/wall-oss-flow) +- [ ] 支持世界模型(World Model) + +- [ ] 支持真实世界的具身智能强化学习 + + +## 快速开始 + +完整的 RLinf 文档请见[**这里**](https://rlinf.readthedocs.io/en/latest/). + +**快速上手** + + - [安装指南](https://rlinf.readthedocs.io/en/latest/rst_source/start/installation.html) + - [快速上手 1:在 ManiSkill3 上进行 VLA 的 PPO 训练](https://rlinf.readthedocs.io/en/latest/rst_source/start/vla.html) + - [快速上手 2:在 MATH 上进行 LLM 的 GRPO 训练](https://rlinf.readthedocs.io/en/latest/rst_source/start/llm.html) + - [多节点训练](https://rlinf.readthedocs.io/en/latest/rst_source/start/distribute.html) + - [模型评估](https://rlinf.readthedocs.io/en/latest/rst_source/start/eval.html) + +**关键设计** + - [统一用户接口使用](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/user/index.html) + - [灵活的执行模式](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/mode/index.html) + - [自动调度支持](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/scheduler/index.html) + - [弹性通信](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/communication/index.html) + +**示例库** + + - [具身智能 VLA 模型训练](https://rlinf.readthedocs.io/en/latest/rst_source/examples/embodied.html) + - [数学推理模型训练](https://rlinf.readthedocs.io/en/latest/rst_source/examples/reasoning.html) + +**高级特性** + + - [Megatron-LM 的 5D 并行配置](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/advance/5D.html) + - [LoRA 
集成以实现高效微调](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/advance/lora.html) + - [在不同版本的 SGLang 之间切换](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/advance/version.html) + - [检查点恢复与重启支持](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/advance/resume.html) + +**框架扩展** + + - [添加新环境](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/extend/new_env.html) + - [基于 FSDP+Hugging Face 后端添加新模型](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/extend/new_model_fsdp.html) + - [基于 Megatron+SGLang 后端添加新模型](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/extend/new_model_megatron.html) + +**博客** + + - [与 VeRL 的对比](https://rlinf.readthedocs.io/en/latest/rst_source/blog/compare_with_verl.html) + +## 构建状态 + +| Type | Status | +| :--------------: | :----: | +| 推理 RL-MATH | [![Build Status](https://github.com/RLinf/RLinf/actions/workflows/math_e2e.yml/badge.svg)](https://github.com/RLinf/RLinf/actions/workflows/math_e2e.yml) | +| 具身 RL-VLA | [![Build Status](https://github.com/RLinf/RLinf/actions/workflows/embodied_e2e.yml/badge.svg)](https://github.com/RLinf/RLinf/actions/workflows/embodied_e2e.yml) | + + +## 贡献指南 +我们欢迎对 RLinf 的贡献。在参与之前,请先阅读 [贡献指南](https://rlinf.readthedocs.io/en/latest/index.html#contribution-guidelines)。 + +## 引用与致谢 + +如果您觉得 **RLinf** 对您的研究或工作有所帮助,请引用以下论文: + +```bibtex +@misc{yu2025rlinfflexibleefficientlargescale, + title={RLinf: Flexible and Efficient Large-scale Reinforcement Learning via Macro-to-Micro Flow Transformation}, + author={Chao Yu and Yuanqing Wang and Zhen Guo and Hao Lin and Si Xu and Hongzhi Zang and Quanlu Zhang and Yongji Wu and Chunyang Zhu and Junhao Hu and Zixiao Huang and Mingjie Wei and Yuqing Xie and Ke Yang and Bo Dai and Zhexuan Xu and Xiangyuan Wang and Xu Fu and Zhihao Liu and Kang Chen and Weilin Liu and Gang Liu and Boxun Li and Jianlei Yang and Zhi Yang and Guohao Dai and Yu Wang}, + year={2025}, + eprint={2509.15965}, + archivePrefix={arXiv}, + 
primaryClass={cs.LG}, + url={https://arxiv.org/abs/2509.15965}, +} +``` + +如果你在 RLinf 中使用了 RL+VLA,欢迎引用我们的算法技术报告和实证研究论文: + +```bibtex +@misc{zang2025rlinfvlaunifiedefficientframework, + title={RLinf-VLA: A Unified and Efficient Framework for VLA+RL Training}, + author={Hongzhi Zang and Mingjie Wei and Si Xu and Yongji Wu and Zhen Guo and Yuanqing Wang and Hao Lin and Liangzhi Shi and Yuqing Xie and Zhexuan Xu and Zhihao Liu and Kang Chen and Wenhao Tang and Quanlu Zhang and Weinan Zhang and Chao Yu and Yu Wang}, + year={2025}, + eprint={2510.06710}, + archivePrefix={arXiv}, + primaryClass={cs.RO}, + url={https://arxiv.org/abs/2510.06710}, +} +``` + +```bibtex +@misc{liu2025rlbringvlageneralization, + title={What Can RL Bring to VLA Generalization? An Empirical Study}, + author={Jijia Liu and Feng Gao and Bingwen Wei and Xinlei Chen and Qingmin Liao and Yi Wu and Chao Yu and Yu Wang}, + year={2025}, + eprint={2505.19789}, + archivePrefix={arXiv}, + primaryClass={cs.LG}, + url={https://arxiv.org/abs/2505.19789}, +} +``` + +**致谢** +RLinf 的灵感来源并受益于更广泛开源社区的思想与工具。 +我们特别感谢 VeRL、AReaL、Megatron-LM、SGLang 和 PyTorch Fully Sharded Data Parallel (FSDP) 的团队与贡献者。 +如果我们不慎遗漏了您的项目或贡献,请提交 issue 或 pull request,以便我们能够给予您应有的致谢。 + +**联系方式:** +我们欢迎博士后、博士/硕士研究生以及实习生的加入。 +诚邀您共同塑造强化学习基础设施与具身智能的未来! +- Chao Yu: zoeyuchao@gmail.com +- Yu Wang: yu-wang@tsinghua.edu.cn \ No newline at end of file diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 000000000..2610efcd6 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,24 @@ +## Building Docker Images + +RLinf provides a unified Dockerfile for both the math reasoning and embodied images, and can switch between the two images using the `BUILD_TARGET` build argument, which can be `reason` or `embodied`. 
+To build the Docker image, run the following command in the `docker/torch-x.x` directory, replacing `x.x` with the desired PyTorch version (e.g., `2.6` or `2.7`): + +```shell +export BUILD_TARGET=reason # or embodied for the embodied image +docker build --build-arg BUILD_TARGET=$BUILD_TARGET -t rlinf:$BUILD_TARGET . +``` + +If you are building the `reason` image and run into OOM during build, it might be because the `APEX` package's number of compile threads is set too high (default 24 and may require over 200 GB memory). +You can reduce the number of compile threads by adding `--build-arg APEX_BUILD_THREADS=<N>` to the `docker build` command, where `<N>` is the number of threads you want to use (e.g., 8 or 12). + +## Using the Docker Image + +The built Docker image contains one or multiple Python virtual environments (venv) in the `/opt/venv` directory, depending on the `BUILD_TARGET`. + +Currently, the reasoning image contains one venv named `reason` in `/opt/venv/reason`, while the embodied image contains three venvs named `openvla`, `openvla-oft` and `pi0` in `/opt/venv/`. + +To switch to the desired venv, we have a built-in script `switch_env` that can switch among venvs in a single command. + +```shell +source switch_env <env_name> # e.g., source switch_env openvla-oft, source switch_env pi0, etc. 
+``` \ No newline at end of file diff --git a/docker/embodied/Dockerfile.openvla.hf.fsdp b/docker/embodied/Dockerfile.openvla.hf.fsdp deleted file mode 100644 index b02a975f7..000000000 --- a/docker/embodied/Dockerfile.openvla.hf.fsdp +++ /dev/null @@ -1,91 +0,0 @@ -FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel - -ENV DEBIAN_FRONTEND=noninteractive \ - PATH=/opt/conda/bin:$PATH \ - CUDNN_PATH=/opt/conda/lib/python3.11/site-packages/nvidia/cudnn/ - -RUN sed -i 's|http://archive.ubuntu.com/ubuntu/|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list && \ - sed -i 's|http://security.ubuntu.com/ubuntu/|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - vim \ - libibverbs-dev \ - openssh-server \ - sudo \ - runit \ - runit-systemd \ - tmux \ - wget \ - curl \ - ca-certificates \ - mesa-utils \ - libosmesa6-dev \ - freeglut3-dev \ - libglew-dev \ - libegl1 \ - libgles2 \ - libglvnd-dev \ - libglfw3-dev \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglib2.0-0 \ - libsm6 \ - libxext6 \ - libxrender-dev \ - libgomp1 \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get clean - -RUN python -m pip install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple --upgrade pip && \ - pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - -RUN pip install \ - hydra-core==1.4.0.dev1 \ - torchdata \ - word2number \ - setuptools==69.5.1 \ - datasets \ - sentencepiece \ - regex \ - einops \ - scipy \ - wandb \ - tensorboard \ - nvitop \ - accelerate \ - pylatexenc \ - pybind11 \ - torch_memory_saver \ - ray[default]==2.47.0 \ - draccus \ - rich \ - tensorflow_graphics \ - peft==0.11.1 \ - timm==0.9.10 \ - tensordict \ - transformers==4.40.1 - -RUN pip install --no-build-isolation --use-pep517 flash-attn==2.5.5 - -WORKDIR /workspace - -RUN git clone https://github.com/openvla/openvla.git /workspace/openvla && \ - git clone 
https://github.com/Lifelong-Robot-Learning/LIBERO.git /workspace/libero && \ - git clone https://github.com/haosulab/ManiSkill.git /workspace/maniskill - -RUN cd /workspace/maniskill && \ - git checkout fa22a46ecf54a4035a762dade27f8cb3f907aa46 && \ - cd /workspace - -RUN pip install \ - -e /workspace/maniskill \ - -e /workspace/libero \ - -e /workspace/openvla - -RUN pip install -r /workspace/openvla/experiments/robot/libero/libero_requirements.txt - -RUN echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc -RUN echo "conda activate" >> ~/.bashrc - -CMD ["/bin/bash"] \ No newline at end of file diff --git a/docker/embodied/Dockerfile.openvlaoft.hf.fsdp b/docker/embodied/Dockerfile.openvlaoft.hf.fsdp deleted file mode 100644 index e5a24e24e..000000000 --- a/docker/embodied/Dockerfile.openvlaoft.hf.fsdp +++ /dev/null @@ -1,91 +0,0 @@ -FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel - -ENV DEBIAN_FRONTEND=noninteractive \ - PATH=/opt/conda/bin:$PATH \ - CUDNN_PATH=/opt/conda/lib/python3.11/site-packages/nvidia/cudnn/ - -RUN sed -i 's|http://archive.ubuntu.com/ubuntu/|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list && \ - sed -i 's|http://security.ubuntu.com/ubuntu/|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - vim \ - libibverbs-dev \ - openssh-server \ - sudo \ - runit \ - runit-systemd \ - tmux \ - wget \ - curl \ - ca-certificates \ - mesa-utils \ - libosmesa6-dev \ - freeglut3-dev \ - libglew-dev \ - libegl1 \ - libgles2 \ - libglvnd-dev \ - libglfw3-dev \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglib2.0-0 \ - libsm6 \ - libxext6 \ - libxrender-dev \ - libgomp1 \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get clean - -RUN python -m pip install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple --upgrade pip && \ - pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - -RUN pip install \ - 
hydra-core==1.4.0.dev1 \ - torchdata \ - word2number \ - setuptools==69.5.1 \ - datasets \ - sentencepiece \ - regex \ - einops \ - scipy \ - wandb \ - tensorboard \ - nvitop \ - accelerate \ - pylatexenc \ - pybind11 \ - torch_memory_saver \ - ray[default]==2.47.0 \ - draccus \ - rich \ - tensorflow_graphics \ - peft==0.11.1 \ - timm==0.9.10 \ - tensordict \ - transformers==4.40.1 - -RUN pip install --no-build-isolation --use-pep517 flash-attn==2.5.5 - -WORKDIR /workspace - -RUN git clone https://github.com/moojink/openvla-oft.git /workspace/openvla_oft && \ - git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git /workspace/libero && \ - git clone https://github.com/haosulab/ManiSkill.git /workspace/maniskill - -RUN cd /workspace/maniskill && \ - git checkout fa22a46ecf54a4035a762dade27f8cb3f907aa46 && \ - cd /workspace - -RUN pip install \ - -e /workspace/maniskill \ - -e /workspace/libero \ - -e /workspace/openvla_oft - -RUN pip install -r /workspace/openvla_oft/experiments/robot/libero/libero_requirements.txt - -RUN echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc -RUN echo "conda activate" >> ~/.bashrc - -CMD ["/bin/bash"] \ No newline at end of file diff --git a/docker/embodied/Dockerfile.pi0.hf.fsdp b/docker/embodied/Dockerfile.pi0.hf.fsdp deleted file mode 100644 index d85608a1e..000000000 --- a/docker/embodied/Dockerfile.pi0.hf.fsdp +++ /dev/null @@ -1,91 +0,0 @@ -FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel - -ENV DEBIAN_FRONTEND=noninteractive \ - PATH=/opt/conda/bin:$PATH \ - CUDNN_PATH=/opt/conda/lib/python3.11/site-packages/nvidia/cudnn/ - -RUN sed -i 's|http://archive.ubuntu.com/ubuntu/|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list && \ - sed -i 's|http://security.ubuntu.com/ubuntu/|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - vim \ - libibverbs-dev \ - openssh-server \ - sudo \ - runit \ - 
runit-systemd \ - tmux \ - wget \ - curl \ - ca-certificates \ - mesa-utils \ - libosmesa6-dev \ - freeglut3-dev \ - libglew-dev \ - libegl1 \ - libgles2 \ - libglvnd-dev \ - libglfw3-dev \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglib2.0-0 \ - libsm6 \ - libxext6 \ - libxrender-dev \ - libgomp1 \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get clean - -RUN python -m pip install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple --upgrade pip && \ - pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - -RUN pip install \ - hydra-core==1.4.0.dev1 \ - torchdata \ - word2number \ - setuptools==69.5.1 \ - datasets \ - sentencepiece \ - regex \ - einops \ - scipy \ - wandb \ - tensorboard \ - nvitop \ - accelerate \ - pylatexenc \ - pybind11 \ - torch_memory_saver \ - ray[default]==2.47.0 \ - draccus \ - rich \ - tensorflow_graphics \ - peft==0.11.1 \ - timm==0.9.10 \ - tensordict \ - transformers==4.53.0 \ - tokenizers==0.21.4 \ - av==15.0.0 - -RUN pip install --no-build-isolation --use-pep517 flash-attn==2.5.5 - -WORKDIR /workspace - -RUN git clone https://github.com/huggingface/lerobot.git /workspace/lerobot && \ - git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git /workspace/libero && \ - git clone https://github.com/haosulab/ManiSkill.git /workspace/maniskill - -RUN cd /workspace/maniskill && \ - git checkout fa22a46ecf54a4035a762dade27f8cb3f907aa46 && \ - cd /workspace - -RUN pip install \ - -e /workspace/maniskill \ - -e /workspace/libero \ - -e /workspace/lerobot - -RUN echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc -RUN echo "conda activate" >> ~/.bashrc - -CMD ["/bin/bash"] \ No newline at end of file diff --git a/docker/math/Dockerfile.sglang0.4.4.vllm0.7.1.megatron0.11.te2.1.0 b/docker/math/Dockerfile.sglang0.4.4.vllm0.7.1.megatron0.11.te2.1.0 deleted file mode 100644 index 41ffd17c2..000000000 --- a/docker/math/Dockerfile.sglang0.4.4.vllm0.7.1.megatron0.11.te2.1.0 +++ /dev/null @@ -1,32 
+0,0 @@ -FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel -ENV DEBIAN_FRONTEND=noninteractive -RUN apt update && apt install git vim libibverbs-dev openssh-server sudo runit runit-systemd tmux -y -RUN python -m pip install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple --upgrade pip -RUN pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple -WORKDIR /opt -RUN git clone https://github.com/NVIDIA/Megatron-LM.git && cd Megatron-LM && git checkout v0.11.0 -WORKDIR /opt -RUN git clone https://github.com/RLinf/latex2sympy2.git && cd latex2sympy2 && pip install -e . -RUN pip install hydra-core==1.4.0.dev1 -RUN pip install torchdata -RUN pip install word2number -RUN pip install vllm==0.7.1 -RUN pip install setuptools==69.5.1 datasets sentencepiece regex einops scipy wandb tensorboard nvitop accelerate pylatexenc pybind11 torch_memory_saver swanlab -RUN pip install ray[default]==2.47.0 -ENV CUDNN_PATH=/opt/conda/lib/python3.11/site-packages/nvidia/cudnn/ -RUN pip install transformer_engine[pytorch]==2.1.0 -WORKDIR /opt -RUN git clone https://github.com/NVIDIA/apex && cd apex && pip install -v \ - --disable-pip-version-check \ - --no-cache-dir \ - --no-build-isolation \ - --config-settings "--build-option=--cpp_ext" \ - --config-settings "--build-option=--cuda_ext" ./ -WORKDIR /opt -RUN pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl -# install sglang -RUN pip install sglang[all]==0.4.4 -ENV PATH=/opt/conda/bin:$PATH -ENV PYTHONPATH=/opt/Megatron-LM:$PYTHONPATH -WORKDIR /workspace -CMD [ "/bin/bash" ] \ No newline at end of file diff --git a/docker/math/Dockerfile.sglang0.4.6post5.vllm0.8.5.megatron0.13.te2.1.0 b/docker/math/Dockerfile.sglang0.4.6post5.vllm0.8.5.megatron0.13.te2.1.0 deleted file mode 100644 index 5e0a1bcb8..000000000 --- a/docker/math/Dockerfile.sglang0.4.6post5.vllm0.8.5.megatron0.13.te2.1.0 +++ 
/dev/null @@ -1,56 +0,0 @@ -FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel - -ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && apt-get install -y --no-install-recommends \ - git vim libibverbs-dev openssh-server sudo runit runit-systemd tmux \ - build-essential python3-dev cmake pkg-config \ - && rm -rf /var/lib/apt/lists/* - -RUN python -m pip install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple --upgrade pip setuptools wheel -RUN pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - -ENV HF_HOME=/opt/.cache/huggingface -RUN mkdir -p $HF_HOME - -WORKDIR /opt -RUN git clone https://github.com/NVIDIA/Megatron-LM.git && cd Megatron-LM && git checkout core_r0.13.0 -WORKDIR /opt -RUN git clone --depth=1 https://github.com/RLinf/latex2sympy2.git && cd latex2sympy2 && pip install -e . - -RUN pip install hydra-core==1.4.0.dev1 torchdata word2number vllm==0.8.5 \ - datasets sentencepiece regex einops scipy wandb tensorboard nvitop accelerate pylatexenc pybind11 \ - torch_memory_saver swanlab ray[default]==2.47.0 - -ENV CUDNN_PATH=/opt/conda/lib/python3.11/site-packages/nvidia/cudnn/ -RUN pip install 'transformer_engine[pytorch]==2.1.0' - -WORKDIR /opt -RUN git clone --depth=1 https://github.com/NVIDIA/apex && cd apex && \ - pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \ - --config-settings "--build-option=--cpp_ext" \ - --config-settings "--build-option=--cuda_ext" ./ - -# RUN pip install 'flash-attn'==2.7.4.post1 --no-build-isolation -RUN pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl - - -RUN pip install 'sglang[all]==0.4.6.post5' - -RUN pip install flashinfer-python==0.2.2 - -RUN pip install triton==3.1.0 - -RUN pip uninstall pynvml -y - -ENV PATH=/opt/conda/bin:$PATH -ENV PYTHONPATH=/opt/Megatron-LM:$PYTHONPATH -WORKDIR /workspace - -RUN python - <<'PY' -import 
torch, vllm, flash_attn_cuda -import apex -print("Torch:", torch.__version__, "CUDA:", torch.version.cuda) -print("CUDA available:", torch.cuda.is_available()) -PY - -CMD ["/bin/bash"] diff --git a/docker/torch-2.6/Dockerfile b/docker/torch-2.6/Dockerfile new file mode 100644 index 000000000..085d5c55d --- /dev/null +++ b/docker/torch-2.6/Dockerfile @@ -0,0 +1,145 @@ +ARG BUILD_TARGET=reason + +FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS base-image + +SHELL ["/bin/bash", "-c"] +ENV PATH=/opt/conda/bin:$PATH +ENV DEBIAN_FRONTEND=noninteractive +RUN sed -i 's@//.*archive.ubuntu.com@//mirrors.ustc.edu.cn@g' /etc/apt/sources.list +RUN apt-get update && apt-get install -y --no-install-recommends \ + git vim libibverbs-dev openssh-server sudo runit runit-systemd tmux \ + build-essential python3-dev cmake pkg-config iproute2 pciutils python3 python3-pip + +RUN pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple +RUN python3 -m pip install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple --upgrade pip setuptools wheel uv + +ENV HF_HOME=/opt/.cache/huggingface +RUN mkdir -p $HF_HOME + +# UV index +RUN mkdir -p /etc/uv +RUN cat < /etc/uv/uv.toml +[[index]] +url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +default = true +EOF + +# UV setup +ENV UV_PATH=/opt/venv +RUN mkdir $UV_PATH +WORKDIR $UV_PATH +COPY project $UV_PATH/pyproject.toml +ENV UV_LINK_MODE=symlink +ENV UV_CACHE_DIR=$UV_PATH/.cache + +FROM base-image AS reason-image + +ARG APEX_BUILD_THREADS=24 + +# Install Megatron-LM +RUN git clone https://github.com/NVIDIA/Megatron-LM.git -b core_r0.13.0 /opt/Megatron-LM +ENV PYTHONPATH=/opt/Megatron-LM:$PYTHONPATH + +# Install reasoning env +RUN uv venv reason && source ${UV_PATH}/reason/bin/activate && \ + UV_TORCH_BACKEND=auto uv sync --active && \ + uv sync --extra sglang-vllm --active && \ + APEX_CPP_EXT=1 APEX_CUDA_EXT=1 NVCC_APPEND_FLAGS="--threads $APEX_BUILD_THREADS" APEX_PARALLEL_BUILD=$APEX_BUILD_THREADS 
uv pip install git+https://github.com/NVIDIA/apex.git --no-build-isolation && \ + uv pip install transformer_engine[pytorch]==2.1.0 --no-build-isolation && \ + uv pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl && \ + uv pip uninstall pynvml + +# Set default env +RUN echo "source ${UV_PATH}/reason/bin/activate" >> ~/.bashrc + +FROM base-image AS embodied-image + +# Embodied NVIDIA_DRIVER_CAPABILITIES +ENV NVIDIA_DRIVER_CAPABILITIES="compute,utility,graphics" + +# Embodied system dependencies +RUN apt-get install -y --no-install-recommends \ + wget \ + unzip \ + mesa-utils \ + libosmesa6-dev \ + freeglut3-dev \ + libglew-dev \ + libegl1 \ + libgles2 \ + libglvnd-dev \ + libglfw3-dev \ + libgl1-mesa-dev \ + libgl1-mesa-glx \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender-dev \ + libgomp1 + +# Install openvla env +RUN uv venv openvla && source ${UV_PATH}/openvla/bin/activate && \ + UV_TORCH_BACKEND=auto uv sync --active && \ + uv sync --extra embodied --active && \ + uv pip install git+https://github.com/openvla/openvla.git --no-build-isolation && \ + uv pip install flash-attn==2.5.5 --no-build-isolation && \ + uv pip uninstall pynvml + +# Install openvla-oft env +RUN uv venv openvla-oft && source ${UV_PATH}/openvla-oft/bin/activate && \ + UV_TORCH_BACKEND=auto uv sync --active && \ + uv sync --extra embodied --active && \ + uv pip install git+https://github.com/moojink/openvla-oft.git --no-build-isolation && \ + uv pip install flash-attn==2.5.5 --no-build-isolation && \ + uv pip uninstall pynvml + +# Install pi0 env +RUN uv venv pi0 && source ${UV_PATH}/pi0/bin/activate && \ + UV_TORCH_BACKEND=auto uv sync --active && \ + uv sync --extra embodied --active && \ + uv pip install "lerobot>=0.3.3" && \ + uv pip install flash-attn==2.5.5 --no-build-isolation && \ + uv pip uninstall pynvml + +# Install maniskill assets +RUN source 
${UV_PATH}/openvla/bin/activate && \ + python -m mani_skill.utils.download_asset bridge_v2_real2sim -y && \ + python -m mani_skill.utils.download_asset widowx250s -y + +# Install SAPIEN PhysX Patch +RUN export PHYSX_VERSION=105.1-physx-5.3.1.patch0 && \ + export PHYSX_DIR=~/.sapien/physx/$PHYSX_VERSION && \ + mkdir -p $PHYSX_DIR && \ + wget -O $PHYSX_DIR/linux-so.zip https://github.com/sapien-sim/physx-precompiled/releases/download/$PHYSX_VERSION/linux-so.zip && \ + unzip $PHYSX_DIR/linux-so.zip -d $PHYSX_DIR && rm $PHYSX_DIR/linux-so.zip + +RUN git clone https://github.com/RLinf/LIBERO.git /opt/libero +ENV PYTHONPATH=/opt/libero:$PYTHONPATH + +# Set default env +RUN echo "source ${UV_PATH}/openvla/bin/activate" >> ~/.bashrc + +FROM ${BUILD_TARGET}-image AS final-image + +# switch_env utility +RUN cat < /usr/local/bin/switch_env +#!/bin/bash +if [ -z "\$1" ]; then + echo "Usage: switch_env " + exit 1 +fi +if [ ! -d "${UV_PATH}/\$1" ]; then + echo "Environment \$1 does not exist in ${UV_PATH}." 
+ exit 1 +fi +source ${UV_PATH}/\$1/bin/activate +EOF +RUN chmod +x /usr/local/bin/switch_env + +# Clean up +RUN uv clean prune +RUN rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/docker/torch-2.6/project b/docker/torch-2.6/project new file mode 100644 index 000000000..ebf2c57e4 --- /dev/null +++ b/docker/torch-2.6/project @@ -0,0 +1,128 @@ +[project] +name = "RLinf" +version = "0.1.0" +readme = {file = "README.md", content-type = "text/markdown"} +requires-python = "==3.11.10" +keywords = [ + "reinforcement-learning", + "embodied-intelligence", + "large-language-models", +] +classifiers = [ + # 2 - Pre-Alpha + # 3 - Alpha + # 4 - Beta + # 5 - Production/Stable + "Development Status :: 2 - Pre-Alpha", + "Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.4", + "Intended Audience :: Developers", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", +] + +dependencies = [ + # Core System + "accelerate", + "ray[default]>=2.47.0", + "torch==2.6.0", + + # Data processing + "pylatexenc", + "datasets", + "latex2sympy2 @ git+https://github.com/RLinf/latex2sympy2.git", + "sentencepiece", + "torchdata", + "wandb", + "word2number", + "regex", + "scipy", + + # Utilities + "hydra-core==1.4.0.dev1", + "einops", + "nvitop", + "pybind11", + "torch-memory-saver", + "setuptools>=69.5.1,<75.9", + "ninja", + + # Logging + "swanlab", + "tensorboard", +] + +[project.optional-dependencies] +sglang-vllm = [ + "transformers==4.51.1", + "sglang[all]==0.4.6.post5", + "vllm==0.8.5", +] +embodied = [ + "prismatic", + "transformers==4.40.1", + "draccus", + "rich", + "tensorflow_graphics", + "peft==0.11.1", + "timm==0.9.10", + "sapien==3.0.1;platform_system=='Linux'", + "mani_skill @ git+https://github.com/haosulab/ManiSkill.git", + "tensordict", + "libero @ git+https://github.com/RLinf/LIBERO.git", + "imageio[ffmpeg]", + "robosuite==1.4.1", + "bddl", + "easydict", + "cloudpickle", + "gym", +] + +[tool.uv] 
+prerelease = "allow" +conflicts = [ + [ + { extra = "sglang-vllm" }, + { extra = "embodied" }, + ], +] +override-dependencies = [ + "torch==2.6.0", + "torchvision==0.21.0", + "torchaudio==2.6.0", + "xgrammar==0.1.19" +] + +[tool.ruff] +line-length = 88 +indent-width = 4 +target-version = "py38" + +[tool.ruff.lint] +isort = {known-first-party = ["rlinf"]} +select = ["C", "E", "F", "I", "W", "CPY001", "RUF013", "UP006", "PERF102", "PLC1802", "PLC0208", "D", "RUF002"] +ignore = [ + "C901", # "complex-structure" + "E501", # "line-too-long" + "E741", # "ambiguous-variable-name" + "D100", # "Missing docstring in public module" + "D104", # "Missing docstring in public package" + "D203", # "incorrect-blank-line-before-class", conflict with D211, + "D213", # "multi-line-summary-second-line", conflict with D212 +] +fixable = ["ALL"] +unfixable = [] + +[tool.ruff.lint.per-file-ignores] +# Only enable docstring check for the scheduler module for now. +"!rlinf/scheduler/**.py" = ["D", "RUF002"] + +[tool.ruff.lint.pydocstyle] +convention = "google" + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" +docstring-code-format = true +docstring-code-line-length = "dynamic" \ No newline at end of file diff --git a/docs/source-en/rst_source/examples/coding_online_rl.rst b/docs/source-en/rst_source/examples/coding_online_rl.rst new file mode 100644 index 000000000..d8cc50daa --- /dev/null +++ b/docs/source-en/rst_source/examples/coding_online_rl.rst @@ -0,0 +1,180 @@ +Online Reinforcement Learning for Code Completion Agent +======================================================= + +Online Reinforcement Learning for Code Completion Agent is an important application scenario in the RLinf framework. 
+Through integration with code editors like Continue, we can collect user preference feedback on code completions, enabling near real-time code generation and feedback learning to quickly improve code completion quality and align with user preferences. +This example demonstrates how to use the RLinf framework to train a model capable of online code completion tasks. + +Overview +-------- + +The online reinforcement learning for code completion agent system works through the following process: + +1. **Real-time Interaction**: The system receives code completion requests from editors like Continue +2. **Model Inference**: Uses trained models to generate code completion suggestions +3. **User Feedback**: Collects user acceptance/rejection feedback on generated code +4. **Online Learning**: Updates model parameters in real-time based on user feedback + +This real-time learning mechanism allows the model to quickly adapt to user programming habits and preferences. + +Running the Script +------------------ + +**Environment Setup** + +First, ensure you have installed the RLinf framework and its dependencies: + +.. code-block:: bash + + # Install additional dependencies (asyncio is part of the Python standard library and does not need to be installed) + pip install httpx fuzzywuzzy + +**Configure Continue Integration** + +1. **Install Continue Extension** + + Since the current Continue does not support uploading user preference feedback on code completions, we have modified the Continue source code to add this capability. + Users can get the compiled modified Continue plugin from `here `_ or build it themselves. + + After downloading the compiled Continue plugin, install it in VS Code. + + Method 1: code --install-extension /path/to/continue-1.3.9.vsix + + Method 2: In VSCode, press Cmd+Shift+P, type 'Extensions: Install from VSIX', and select the above file + +2. **Configure Continue Settings** + + The Continue configuration file path is: + + ..
code-block:: bash + + ~/.continue/config.yaml + + Add the following settings to your Continue configuration file: + + .. code-block:: yaml + + # Please replace http://xxx:xx/ with the actual RLinf online code completion service address + + models: + # Add a model for code completion + - name: my-autocomplete + provider: openai + model: Qwen2.5-Coder-1.5B + apiBase: http://xxx:8081/v1 + apiKey: xxx + roles: + - autocomplete + + # Add sending user feedback on whether to accept code completions + tabAutocompleteOptions: + enableCompletionTracking: true + completionTrackingUrl: http://xxx:8082/api/training/submit + completionTrackingHeaders: + Authorization: "Bearer test-token" + X-Project-ID: "test-project" + maxPromptTokens: 1024 + debounceDelay: 350 + multilineCompletions: "auto" + + After modifying and saving, open the Continue extension from the left panel, click the "Settings" gear button in the top right corner, and ensure "Autocomplete Model" is set to my-autocomplete in the "Models" page. + +**Start Training Service** + +1. **Prepare Model and Configuration** + + Ensure you have pre-trained model weights and modify the configuration file to match model paths, ports to use, etc. + + .. code-block:: yaml + + rollout: + model_dir: /path/to/your/model/DeepSeek-R1-Distill-Qwen-1.5B/ + + actor: + tokenizer: + tokenizer_model: /path/to/your/model/DeepSeek-R1-Distill-Qwen-1.5B/ + +2. **Start RLinf Training Service** + + .. code-block:: bash + + # Navigate to project directory + cd /path/to/rlinf_online_rl + + # Start training service + bash examples/coding_online_rl/run_main_math_pipeline_grpo_megatron.sh qwen2.5-1.5b-ppo-megatron + + This will start the following services: + - **Inference Service**: Provides code completion API on port 8081 + - **Training Service**: Receives user feedback data on port 8082 + +**Integration with Continue** + +1. **Start Continue** + + Launch the Continue extension in VS Code, ensuring it connects to the correct API endpoints. + +2. 
**Begin Programming** + + Start writing code in Continue. The system will: + - Automatically send code completion requests to the inference service + - Receive model-generated code suggestions + - Collect your acceptance/rejection feedback on suggestions + +3. **Real-time Learning** + + The system processes your feedback in real-time: + - Accepted suggestions are marked as positive feedback + - Rejected suggestions are marked as negative feedback + - Model parameters are updated online based on feedback + +**Monitor Training Process** + +You can monitor the training process through the following methods: + +1. **View Log Output** + + .. code-block:: bash + + # View training logs + tail -f results/ppo-1.5b/train.log + +2. **Use TensorBoard** + + .. code-block:: bash + + # Start TensorBoard + tensorboard --logdir results/grpo-1.5b + +3. **Check Model Checkpoints** + + Model checkpoints are periodically saved to the `results/grpo-1.5b/checkpoints/` directory during training. + +**Test Client** + +You can use the provided test client to verify system functionality: + +.. code-block:: bash + + # Run test client + python examples/coding_online_rl/simple_test_client.py + +The test client simulates Continue behavior by sending code completion requests and submitting feedback data. + +**Troubleshooting** + +Common issues and solutions: + +1. **Port Conflicts** + + If ports 8081 or 8082 are occupied, modify the port settings in the configuration file. + +2. **Model Loading Failure** + + Check that the model path is correct and ensure model files exist and are accessible. + +3. **Continue Connection Failure** + + Ensure the API endpoint addresses in Continue configuration are correct and check network connectivity. You can also use simple_test_client to test if feedback data can be received normally. + +Through these steps, you can successfully run the online reinforcement learning for code completion agent system and achieve seamless integration with the Continue editor. 
diff --git a/docs/source-en/rst_source/examples/index.rst b/docs/source-en/rst_source/examples/index.rst index 678f2d68c..01b6fbd8a 100644 --- a/docs/source-en/rst_source/examples/index.rst +++ b/docs/source-en/rst_source/examples/index.rst @@ -4,20 +4,196 @@ Example Gallery This section presents the collection of **examples currently supported by RLinf**, showcasing how the framework can be applied across different scenarios and demonstrating its efficiency in practice. +This example gallery is continuously expanding, covering new scenarios and tasks to highlight RLinf's flexibility and efficiency. -At present, we provide two major categories of examples: +Embodied Intelligence Scenarios +------------------------------- -- **Embodied Agent Scenario**: Training **VLA** models for embodied intelligence. - See :doc:`embodied`. -- **Reasoner Scenario**: Training **LLM** models for advanced reasoning tasks. - See :doc:`reasoning`. +This category includes embodied training examples with SOTA models (e.g., pi0, pi0.5, OpenVLA-OFT) and different simulators (e.g., LIBERO, ManiSkill, RoboTwin), +as well as reinforcement learning training examples on real robots. + +.. raw:: html + +
+
+ +

+ + RL with ManiSkill Simulator +
+ ManiSkill + OpenVLA + PPO/GRPO achieves SOTA performance +

+
+ +
+ +

+ + RL with LIBERO Simulator +
+ LIBERO + OpenVLA-OFT + GRPO reaches 99% success rate +

+
+ +
+ +

+ [Ongoing]RL on π₀ Models
+ Significant improvement in RL training on π₀ +

+
+
+ +
+
+ +

+ [Ongoing]RL with RoboTwin
+ RoboTwin + OpenVLA-OFT + PPO achieves SOTA performance +

+
+ +
+ +

+ [Ongoing]Real-World RL with Franka
+ RLinf worker seamlessly integrates with the Franka robotic arm +

+
+ +
+ +

+ [Ongoing]RL with World Models
+ Training with integrated UnifoLM-WMA-0 world models +

+
+
+ + +Reasoning Scenarios +------------------- + +Reinforcement learning is a key approach to improving reasoning capabilities. RLinf supports mainstream models such as Qwen and Qwen-next for RL training in tasks like Math, achieving SOTA results. + +.. raw:: html + +
+
+ +

+ + RL Training for Math Reasoning +
+ Achieves SOTA results on AIME24/AIME25/GPQA-diamond benchmarks +

+
+ +
+ +

+ [Ongoing]RL Training for MoE Models
+ RL training speed improved by xx% compared to other tools +

+
+ +
+ +

+ [Ongoing]RL Training for Qwen-next
+ Achieves SOTA training performance with Qwen-next +

+
+
+ + +Agent Scenarios +--------------- + +RLinf's worker abstraction, flexible communication modules, and support for various accelerators make it naturally suited for building agent workflows and training agents. +The following examples include agent workflow construction, online RL training, and environment integration. + +.. raw:: html + +
+
+ +

+ + Open-Source Online RL for Code Completion +
+ End-to-end online RL with RLinf + Continue, improving model performance by xx% +

+
+ +
+ +

+ [Ongoing]rStar2-agent RL Training
+ Flexible resource allocation and scheduling across components +

+
+ +
+ +

+ [Ongoing]SWE-agent
+ Unified deployment, inference, and training with high flexibility and performance +

+
+
+ + +Practical System Features +------------------------- + +RLinf's overall design is simple and modular. +Workers abstract components for RL and agents, with a flexible and efficient communication library enabling inter-component interaction. +Thanks to this decoupled design, workers can be flexibly and dynamically scheduled to computing resources or assigned to the most suitable accelerators. + +.. raw:: html + +
+
+ +

+ [Ongoing]Hot Scaling/Switching of Workers (Components)
+ Hot switching reduces training time by 50%+ +

+
+ +
+ +

+ [Ongoing]Hybrid Training on Heterogeneous Accelerator
+ Flexible inter-operability between components on different accelerators to build training workflows +

+
+
-The example gallery will continue to expand with new scenarios and tasks over time, -illustrating the versatility and scalability of RLinf. .. toctree:: :hidden: :maxdepth: 2 - embodied + maniskill + libero reasoning + coding_online_rl diff --git a/docs/source-en/rst_source/examples/libero.rst b/docs/source-en/rst_source/examples/libero.rst new file mode 100644 index 000000000..7836ca9c8 --- /dev/null +++ b/docs/source-en/rst_source/examples/libero.rst @@ -0,0 +1,232 @@ +RL with LIBERO Simulator +======================== + +.. |huggingface| image:: /_static/svg/hf-logo.svg + :width: 16px + :height: 16px + :class: inline-icon + +This document provides a comprehensive guide to launching and managing the +Vision-Language-Action Models (VLAs) training task within the RLinf framework, +focusing on finetuning a VLA model for robotic manipulation in the LIBERO environment. + +The primary objective is to develop a model capable of performing robotic manipulation by: + +1. **Visual Understanding**: Processing RGB images from the robot's camera. +2. **Language Comprehension**: Interpreting natural-language task descriptions. +3. **Action Generation**: Producing precise robotic actions (position, rotation, gripper control). +4. **Reinforcement Learning**: Optimizing the policy via the PPO with environment feedback. + +Environment +----------------------- + +**LIBERO Environment** + +- **Environment**: LIBERO simulation benchmark built on top of *robosuite* (MuJoCo). +- **Task**: Command a 7-DoF robotic arm to perform a variety of household manipulation skills (pick-and-place, stacking, opening drawers, spatial rearrangement). +- **Observation**: RGB images (typical resolutions 128 × 128 or 224 × 224) captured by off-screen cameras placed around the workspace. +- **Action Space**: 7-dimensional continuous actions + - 3D end-effector position control (x, y, z) + - 3D rotation control (roll, pitch, yaw) + - Gripper control (open / close) + +**Task Description Format** + +.. 
code-block:: text + + In: What action should the robot take to [task_description]? + Out: + +**Data Structure** + +- **Images**: RGB tensors ``[batch_size, 3, 224, 224]`` +- **Task Descriptions**: Natural-language instructions +- **Actions**: Normalized continuous values converted to discrete tokens +- **Rewards**: Step-level rewards based on task completion + +Algorithm +----------------------------------------- + +**Core Algorithm Components** + +1. **PPO (Proximal Policy Optimization)** + + - Advantage estimation using GAE (Generalized Advantage Estimation) + + - Policy clipping with ratio limits + + - Value function clipping + + - Entropy regularization + +2. **GRPO (Group Relative Policy Optimization)** + + - For every state / prompt the policy generates *G* independent actions + + - Compute the advantage of each action by subtracting the group’s mean reward. + + +3. **Vision-Language-Action Model** + + - OpenVLA architecture with multimodal fusion + + - Action tokenization and de-tokenization + + - Value head for critic function + +Running the Script +------------------- + +**1. Key Parameters Configuration** + +.. code-block:: yaml + + cluster: + num_nodes: 2 + component_placement: + env: 0-7 + rollout: 8-15 + actor: 0-15 + + rollout: + pipeline_stage_num: 2 + +Here you can flexibly configure the GPU count for env, rollout, and actor components. +Using the above configuration, you can achieve pipeline overlap between env and rollout, and sharing with actor. +Additionally, by setting `pipeline_stage_num = 2` in the configuration, you can achieve pipeline overlap between rollout and actor, improving rollout efficiency. + +.. code-block:: yaml + + cluster: + num_nodes: 1 + component_placement: + env,rollout,actor: all + +You can also reconfigure the placement to achieve complete sharing, where env, rollout, and actor components all share all GPUs. + +.. 
code-block:: yaml + + cluster: + num_nodes: 2 + component_placement: + env: 0-3 + rollout: 4-7 + actor: 8-15 + +You can also reconfigure the placement to achieve complete separation, where env, rollout, and actor components each use their own GPUs without interference, eliminating the need for offload functionality. + +**2. Configuration Files** + +We currently support training in two environments: **ManiSkill3** and **LIBERO**. + +We support the **OpenVLA-OFT** model with both **PPO** and **GRPO** algorithms. +The corresponding configuration files are: + +- **OpenVLA-OFT + PPO**: ``examples/embodiment/config/libero_10_ppo_openvlaoft.yaml`` +- **OpenVLA-OFT + GRPO**: ``examples/embodiment/config/libero_10_grpo_openvlaoft.yaml`` + +**3. Launch Commands** + +To start training with a chosen configuration, run the following command: + +.. code-block:: bash + + bash examples/embodiment/run_embodiment.sh CHOSEN_CONFIG + +For example, to train the OpenVLA model using the PPO algorithm in the ManiSkill3 environment, run: + +.. code-block:: bash + + bash examples/embodiment/run_embodiment.sh libero_10_ppo_openvlaoft + + +Visualization and Results +------------------------- + +**1. TensorBoard Logging** + +.. code-block:: bash + + # Start TensorBoard + tensorboard --logdir ./logs --port 6006 + +**2. Key Metrics Tracked** + +- **Training Metrics**: + + - ``actor/loss``: PPO policy loss + - ``actor/value_loss``: Value function loss + - ``actor/entropy``: Policy entropy + - ``actor/grad_norm``: Gradient norm + - ``actor/lr``: Learning rate + +- **Rollout Metrics**: + + - ``rollout/reward_mean``: Average episode reward + - ``rollout/reward_std``: Reward standard deviation + - ``rollout/episode_length``: Average episode length + - ``rollout/success_rate``: Task completion rate + +- **Environment Metrics**: + + - ``env/success_rate``: Success rate across environments + - ``env/step_reward``: Step-by-step reward + - ``env/termination_rate``: Episode termination rate + +**3. 
Video Generation** + +.. code-block:: yaml + + video_cfg: + save_video: True + info_on_video: True + video_base_dir: ./logs/video/train + +**4. WandB Integration** + +.. code-block:: yaml + + trainer: + logger: + wandb: + enable: True + project_name: "RLinf" + experiment_name: "openvla-maniskill" + + +LIBERO Results +~~~~~~~~~~~~~~~~~~~ + +Furthermore, we trained OpenVLA-OFT in the LIBERO environment using the GRPO algorithm. The improvements achieved through our RL fine-tuning are shown below: + +.. list-table:: **OpenVLA-OFT model results on LIBERO** + :header-rows: 1 + + * - Model + - `Spatial `_ + - `Goal `_ + - `Object `_ + - `Long `_ + - Average + * - OpenVLA-OFT-SFT (one-shot) + - 56.5% + - 45.6% + - 25.6% + - 9.7% + - 34.4% + * - OpenVLA-OFT-RLinf + - **99.0%** + - **99.0%** + - **99.0%** + - **94.4%** + - **97.9%** + * - Improvement + - +42.5% + - +53.4% + - +73.4% + - +84.7% + - +63.5% + +For the Libero experiment, we were inspired by +`SimpleVLA `_, +with only minor modifications. We thank the authors for releasing their open-source code. diff --git a/docs/source-en/rst_source/examples/embodied.rst b/docs/source-en/rst_source/examples/maniskill.rst similarity index 74% rename from docs/source-en/rst_source/examples/embodied.rst rename to docs/source-en/rst_source/examples/maniskill.rst index e35f9c4ef..de1992441 100644 --- a/docs/source-en/rst_source/examples/embodied.rst +++ b/docs/source-en/rst_source/examples/maniskill.rst @@ -1,5 +1,5 @@ -Agentic RL-VLA -======================== +RL with ManiSkill Simulator +=========================== .. |huggingface| image:: /_static/svg/hf-logo.svg :width: 16px @@ -8,7 +8,7 @@ Agentic RL-VLA This document provides a comprehensive guide to launching and managing the Vision-Language-Action Models (VLAs) training task within the RLinf framework, -focusing on finetuning a VLA model for robotic manipulation in the ManiSkill3/LIBERO environment. 
+focusing on finetuning a VLA model for robotic manipulation in the ManiSkill3 environment. The primary objective is to develop a model capable of performing robotic manipulation by: @@ -18,7 +18,7 @@ The primary objective is to develop a model capable of performing robotic manipu 4. **Reinforcement Learning**: Optimizing the policy via the PPO with environment feedback. Environment ------------------------ +----------- **ManiSkill3 Environment** @@ -30,16 +30,6 @@ Environment - 3D rotation control (roll, pitch, yaw) - Gripper control (open/close) -**LIBERO Environment** - -- **Environment**: LIBERO simulation benchmark built on top of *robosuite* (MuJoCo). -- **Task**: Command a 7-DoF robotic arm to perform a variety of household manipulation skills (pick-and-place, stacking, opening drawers, spatial rearrangement). -- **Observation**: RGB images (typical resolutions 128 × 128 or 224 × 224) captured by off-screen cameras placed around the workspace. -- **Action Space**: 7-dimensional continuous actions - - 3D end-effector position control (x, y, z) - - 3D rotation control (roll, pitch, yaw) - - Gripper control (open / close) - **Task Description Format** .. code-block:: text @@ -129,23 +119,13 @@ You can also reconfigure the placement to achieve complete separation, where env We currently support training in two environments: **ManiSkill3** and **LIBERO**. -1. **ManiSkill3 Environment** - - We support two models: **OpenVLA** and **OpenVLA-OFT**, along with two algorithms: **PPO** and **GRPO**. 
- The corresponding configuration files are: - - - **OpenVLA + PPO**: ``examples/embodiment/config/maniskill_ppo_openvla.yaml`` - - **OpenVLA-OFT + PPO**: ``examples/embodiment/config/maniskill_ppo_openvlaoft.yaml`` - - **OpenVLA + GRPO**: ``examples/embodiment/config/maniskill_grpo_openvla.yaml`` - - **OpenVLA-OFT + GRPO**: ``examples/embodiment/config/maniskill_grpo_openvlaoft.yaml`` +We support two models: **OpenVLA** and **OpenVLA-OFT**, along with two algorithms: **PPO** and **GRPO**. +The corresponding configuration files are: -2. **LIBERO Environment** - - We support the **OpenVLA-OFT** model with both **PPO** and **GRPO** algorithms. - The corresponding configuration files are: - - - **OpenVLA-OFT + PPO**: ``examples/embodiment/config/libero_10_ppo_openvlaoft.yaml`` - - **OpenVLA-OFT + GRPO**: ``examples/embodiment/config/libero_10_grpo_openvlaoft.yaml`` +- **OpenVLA + PPO**: ``examples/embodiment/config/maniskill_ppo_openvla.yaml`` +- **OpenVLA-OFT + PPO**: ``examples/embodiment/config/maniskill_ppo_openvlaoft.yaml`` +- **OpenVLA + GRPO**: ``examples/embodiment/config/maniskill_grpo_openvla.yaml`` +- **OpenVLA-OFT + GRPO**: ``examples/embodiment/config/maniskill_grpo_openvlaoft.yaml`` **3. Launch Commands** @@ -216,7 +196,7 @@ Visualization and Results experiment_name: "openvla-maniskill" ManiSkill3 Results -~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~ As an illustrative example, we present the training results of the PPO algorithm in the ManiSkill3 environment. Running on a single 8-GPU H100 machine, OpenVLA (left) and OpenVLA-OFT (right) achieved up to 90% success on ManiSkill3’s plate-25-main task, after 48 and 24 hours of PPO training, respectively. @@ -289,41 +269,3 @@ using the PPO algorithm within the RLinf framework. Your browser does not support the video tag. - - -LIBERO Results -~~~~~~~~~~~~~~~~~~~ - -Furthermore, we trained OpenVLA-OFT in the LIBERO environment using the GRPO algorithm. 
The improvements achieved through our RL fine-tuning are shown below: - -.. list-table:: **OpenVLA-OFT model results on LIBERO** - :header-rows: 1 - - * - Model - - `Spatial `_ - - `Goal `_ - - `Object `_ - - `Long `_ - - Average - * - OpenVLA-OFT-SFT (one-shot) - - 56.5% - - 45.6% - - 25.6% - - 9.7% - - 34.4% - * - OpenVLA-OFT-RLinf - - **99.0%** - - **99.0%** - - **99.0%** - - **94.4%** - - **97.9%** - * - Improvement - - +42.5% - - +53.4% - - +73.4% - - +84.7% - - +63.5% - -For the Libero experiment, we were inspired by -`SimpleVLA `_, -with only minor modifications. We thank the authors for releasing their open-source code. \ No newline at end of file diff --git a/docs/source-en/rst_source/examples/reasoning.rst b/docs/source-en/rst_source/examples/reasoning.rst index d786791cf..fdac14524 100644 --- a/docs/source-en/rst_source/examples/reasoning.rst +++ b/docs/source-en/rst_source/examples/reasoning.rst @@ -1,5 +1,5 @@ -Reasoning RL-LLM -================= +Math Reasoning RL Training +========================== .. |huggingface| image:: /_static/svg/hf-logo.svg :width: 16px diff --git a/docs/source-en/rst_source/start/index.rst b/docs/source-en/rst_source/start/index.rst index 3d338d783..19a97f017 100644 --- a/docs/source-en/rst_source/start/index.rst +++ b/docs/source-en/rst_source/start/index.rst @@ -13,7 +13,7 @@ We present three concise examples to demonstrate the framework's workflow and he - **Distributed training:** Multi-node training for mathematical tasks (see :doc:`distribute`). -- **Evaluation:** Assessing model performance on embodied intelligence (see :doc:`vlm-eval`) and assessing model performance on long-chain-of-thought mathematical reasoning (see :doc:`llm-eval`). +- **Evaluation:** Assessing model performance on embodied intelligence (see :doc:`vla-eval`) and assessing model performance on long-chain-of-thought mathematical reasoning (see :doc:`llm-eval`). .. 
toctree:: :hidden: diff --git a/docs/source-en/rst_source/start/installation.rst b/docs/source-en/rst_source/start/installation.rst index fce45b6fa..534e467f9 100644 --- a/docs/source-en/rst_source/start/installation.rst +++ b/docs/source-en/rst_source/start/installation.rst @@ -21,12 +21,6 @@ Backend Engines - **Huggingface**: Easy to use, with native APIs provided by the Huggingface ecosystem. -Installation Methods --------------------- - -RLinf provides two installation options. We **recommend using Docker**, as it provides the fastest and most reproducible environment. -However, if your system is incompatible with the Docker image, you can also install RLinf manually in a Python environment. - Hardware Requirements ~~~~~~~~~~~~~~~~~~~~~~~ @@ -71,21 +65,25 @@ Software Requirements * - NVIDIA Container Toolkit - 1.17.8 +Installation Methods +-------------------- -Install from Docker Image +RLinf provides two installation options. We **recommend using Docker**, as it provides the fastest and most reproducible environment. +However, if your system is incompatible with the Docker image, you can also install RLinf manually in a Python environment. 
+ + +Installation Method 1: Docker Image ------------------------- We provide two official Docker images optimized for different backend configurations: -- **Megatron + SGLang/vLLM**: +- **Math reasoning with Megatron + SGLang/vLLM**: - ``rlinf/rlinf:math-rlinf0.1-torch2.5.1-sglang0.4.4-vllm0.7.1-megatron0.11.0-te2.1`` (used for enhancing LLM reasoning on MATH tasks) -- **FSDP + Huggingface**: - - - ``rlinf/rlinf:agentic-openvla-rlinf0.1-torch2.5.1`` (for the OpenVLA model) - - ``rlinf/rlinf:agentic-openvlaoft-rlinf0.1-torch2.5.1`` (for the OpenVLA-OFT model) +- **Embodied with FSDP + Huggingface**: + - ``rlinf/rlinf:agentic-rlinf0.1-torch2.6.0-openvla-openvlaoft-pi0`` (for the OpenVLA/OpenVLA-OFT/Pi0 model) Once you've identified the appropriate image for your setup, pull the Docker image: @@ -100,7 +98,6 @@ Then, start the container using the pulled image: docker run -it --gpus all \ --shm-size 100g \ --net=host \ - --env NVIDIA_DRIVER_CAPABILITIES=compute,utility,graphics \ --name rlinf \ rlinf/rlinf:CHOSEN_IMAGE /bin/bash @@ -111,25 +108,32 @@ Inside the container, clone the RLinf repository: git clone https://github.com/RLinf/RLinf.git cd RLinf -.. tip:: +The embodied image contains multiple Python virtual environments (venv) located in the `/opt/venv` directory for different models, namely ``openvla``, ``openvla-oft``, and ``pi0``. +The default environment is set to ``openvla``. +To switch to the desired venv, use the built-in script `switch_env`: + +.. code-block:: bash - For multi-node training, make sure to clone the repository in shared storage so that every node has access to it. + source switch_env # e.g., source switch_env openvla-oft, source switch_env pi0, etc. +.. tip:: + - For multi-node training, make sure to clone the repository in shared storage so that every node has access to it. + - To use ManiSkill settings, refer to the README at ``https://huggingface.co/datasets/RLinf/maniskill_assets`` for instructions on downloading the required files. 
-Install from Custom Environment +Installation Method 2: UV Custom Environment ------------------------------- +**If you have already used the Docker image, you can skip the following steps.** -Installation is divided into three parts depending on the type of experiments you plan to run. +Installation is divided into two parts depending on the type of experiments you plan to run. -First, for all experiments, follow the :ref:`Common Dependencies ` section to install the shared dependencies. -This already includes the full backend setup for **FSDP + Huggingface**. +First, for all experiments, follow the :ref:`Common Dependencies ` section to install the shared dependencies. -Second, for experiments using **Megatron** and **SGLang/vLLM** backends, -follow the :ref:`Megatron and SGLang/vLLM Dependencies ` section to install all required packages. +Next, install the specific dependencies based on your experiment type. -Third, for embodied intelligence experiments (e.g., OpenVLA, OpenVLA-OFT and Pi0), -follow the :ref:`Embodied Dependencies ` section to install their specific dependencies. +* For reasoning experiments using **Megatron** and **SGLang/vLLM** backends, follow the :ref:`Megatron and SGLang/vLLM Dependencies ` section to install all required packages. + +* For embodied intelligence experiments (e.g., OpenVLA, OpenVLA-OFT and Pi0), follow the :ref:`Embodied Dependencies ` section to install their specific dependencies. .. _common-dependencies: @@ -158,13 +162,17 @@ After installing ``uv``, create a virtual environment and install PyTorch along Megatron and SGLang/vLLM Dependencies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. note:: + If you are running embodied experiments, there is no need to install these dependencies. + Please proceed directly to the :ref:`Embodied Dependencies ` section. + Run the following commands to install Megatron, SGLang or vLLM, and their dependencies: .. 
code-block:: shell - uv sync --extra sgl_vllm + uv sync --extra sglang-vllm mkdir -p /opt && git clone https://github.com/NVIDIA/Megatron-LM.git -b core_r0.13.0 /opt/Megatron-LM - APEX_CPP_EXT=1 APEX_CUDA_EXT=1 uv pip install -r requirements/megatron.txt --no-build-isolation + APEX_CPP_EXT=1 APEX_CUDA_EXT=1 NVCC_APPEND_FLAGS="--threads 24" APEX_PARALLEL_BUILD=24 uv pip install -r requirements/megatron.txt --no-build-isolation Before using Megatron, ensure its path is added to the ``PYTHONPATH`` environment variable: @@ -172,37 +180,39 @@ Before using Megatron, ensure its path is added to the ``PYTHONPATH`` environmen export PYTHONPATH=/opt/Megatron-LM:$PYTHONPATH -SGLang installation: - -.. code-block:: shell - - uv sync --extra sglang - -vLLM installation: - -.. code-block:: shell - - uv sync --extra vllm - .. _embodied-dependencies: -Additional Embodied Dependencies +Embodied Dependencies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ For embodied experiments, first install the necessary system dependencies (currently only supported on Debian/Ubuntu via ``apt``): .. code-block:: shell - bash requirements/install_embodied_deps.sh uv sync --extra embodied + bash requirements/install_embodied_deps.sh # Must be run after the above command Then, depending on the experiment type, install the required packages for ``openvla``, ``openvla-oft`` and ``pi0``: .. code-block:: shell - # For OpenVLA/OpenVLA-oft experiments + # For OpenVLA experiments UV_TORCH_BACKEND=auto uv pip install -r requirements/openvla.txt --no-build-isolation + # For OpenVLA-oft experiment + UV_TORCH_BACKEND=auto uv pip install -r requirements/openvla_oft.txt --no-build-isolation + # For Pi0 experiments UV_TORCH_BACKEND=auto uv pip install -r requirements/pi0.txt --no-build-isolation +Finally, Run the following to install the libero dependency. + +.. 
code-block:: shell + + mkdir -p /opt && git clone https://github.com/RLinf/LIBERO.git /opt/libero + +Before using LIBERO, make sure its path is added to the `PYTHONPATH` environment variables. + +.. code-block:: shell + + export PYTHONPATH=/opt/libero:$PYTHONPATH \ No newline at end of file diff --git a/docs/source-en/rst_source/start/llm.rst b/docs/source-en/rst_source/start/llm.rst index 795b8c61a..5cdde520f 100644 --- a/docs/source-en/rst_source/start/llm.rst +++ b/docs/source-en/rst_source/start/llm.rst @@ -53,14 +53,15 @@ we highly recommend updating the following configuration option in ``cluster.component_placement``. -You can dynamically set it to **1, 2, 4, or 8** depending on your available resources. +You can set it to **0-1**, **0-3** or **0-7** to use 2/4/8 GPUs depending on your available resources. +Refer to :doc:`../tutorials/user/yaml` for a more detailed explanation of the placement configuration. .. code-block:: yaml cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout: 0 Finally, before running the script, you need to modify the corresponding configuration options in the YAML file according to the download paths of the model and dataset. 
Specifically, update: diff --git a/docs/source-en/rst_source/start/vla-eval.rst b/docs/source-en/rst_source/start/vla-eval.rst index fc1d93ae8..db9ca221b 100644 --- a/docs/source-en/rst_source/start/vla-eval.rst +++ b/docs/source-en/rst_source/start/vla-eval.rst @@ -100,9 +100,8 @@ Quick Start — LIBERO export PYTHONPATH=${REPO_PATH}:$PYTHONPATH # path to the LIBERO repo - export LIBERO_REPO_PATH="/root/LIBERO" - export LIBERO_CONFIG_PATH=${LIBERO_REPO_PATH} - export PYTHONPATH=${LIBERO_REPO_PATH}:$PYTHONPATH + export LIBERO_PATH="/opt/LIBERO" + export PYTHONPATH=${LIBERO_PATH}:$PYTHONPATH export CUDA_LAUNCH_BLOCKING=1 export HYDRA_FULL_ERROR=1 diff --git a/docs/source-en/rst_source/start/vla.rst b/docs/source-en/rst_source/start/vla.rst index 4f4646ae0..32357ba91 100644 --- a/docs/source-en/rst_source/start/vla.rst +++ b/docs/source-en/rst_source/start/vla.rst @@ -34,20 +34,29 @@ the model is cited in `paper `_ **Step 2: Execute the provided launch script:** -For user convenience, our configuration file is set up to run with a single GPU by default. +.. note:: + If you have installed RLinf via the Docker image (see :doc:`./installation`), please make sure you have switched to the right Python environment for the target model. + The default environment is set to ``openvla``. + To switch to OpenVLA-OFT or Pi0, use the built-in script `switch_env`: + ``source switch_env openvla-oft`` or ``source switch_env pi0``. + + If you have installed RLinf in a custom environment, please ensure that you have installed the model's corresponding dependencies as described in :doc:`./installation`. + +For user convenience, our configuration file is set up to run with at least two GPUs by default. However, if you have multiple GPUs and wish to accelerate the quickstart process, we highly recommend updating the following configuration option in ``./examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml``: ``cluster.component_placement``. 
-You can dynamically set it to **1, 2, 4, or 8** depending on your available resources. +You can set it to **0-3** or **0-7** to use 4/8 GPUs depending on your available resources. +Refer to :doc:`../tutorials/user/yaml` for a more detailed explanation of the placement configuration. .. code-block:: yaml cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout: 0-1 Finally, before running the script, you need to modify the corresponding configuration options in the YAML file according to the download paths of the model and dataset. Specifically, update: diff --git a/docs/source-en/rst_source/tutorials/user/yaml.rst b/docs/source-en/rst_source/tutorials/user/yaml.rst index f70366ed7..51c0c691e 100644 --- a/docs/source-en/rst_source/tutorials/user/yaml.rst +++ b/docs/source-en/rst_source/tutorials/user/yaml.rst @@ -750,7 +750,6 @@ algorithm n_chunk_steps: 10 n_eval_chunk_steps: 10 - rollout_micro_batch_size: 256 num_group_envs: 32 rollout_epoch: 1 @@ -766,7 +765,7 @@ algorithm ``algorithm.auto_reset``: Automatically reset environments when episodes terminate. -``algorithm.ignore_terminations``: Ignore episode terminations during training. +``algorithm.ignore_terminations``: Ignore episode terminations during training (if enabled, episode only ends when it reaches the ``max_episode_steps``). ``algorithm.use_fixed_reset_state_ids``: Use fixed reset state IDs (false for randomization). Always True for GRPO, default be False for PPO. @@ -774,19 +773,15 @@ algorithm ``algorithm.normalize_advantages``: Normalize advantages across the batch. -``algorithm.kl_penalty``: KL divergence estimation method (kl or kl_penalty). +``algorithm.n_chunk_steps``: Number of chunks (i.e., times the model is called to predict action chunks) within one rollout epoch. -``algorithm.n_chunk_steps``: Number of action steps per chunk. - -``algorithm.n_eval_chunk_steps``: Number of action steps per evaluation chunk. 
- -``algorithm.rollout_micro_batch_size``: Micro-batch size for rollout generation. +``algorithm.n_eval_chunk_steps``: Number of chunks in evaluation. ``algorithm.num_group_envs``: Number of environment groups. ``algorithm.rollout_epoch``: Number of rollout epochs per training step. -``algorithm.reward_type``: Reward aggregation level (chunk_level, token_level, step_level). +``algorithm.reward_type``: Reward aggregation level (chunk_level, action_level). ``algorithm.logprob_type``: Log probability computation level. @@ -850,10 +845,6 @@ rollout ``rollout.backend``: Model backend (huggingface, vllm). -``rollout.enforce_eager``: Disable CUDA graph capture for faster initialization. - -``rollout.enable_offload``: Enable model offloading to reduce memory usage. - ``rollout.pipeline_stage_num``: Number of pipeline stages for model parallelism. actor diff --git a/docs/source-zh/rst_source/examples/coding_online_rl.rst b/docs/source-zh/rst_source/examples/coding_online_rl.rst new file mode 100644 index 000000000..8917a5999 --- /dev/null +++ b/docs/source-zh/rst_source/examples/coding_online_rl.rst @@ -0,0 +1,181 @@ +代码补全在线强化学习 +================= + +代码补全在线强化学习(Online Coding RL)是 RLinf 框架中的一个重要应用场景。 +通过与 Continue 等代码编辑器的集成,获取用户对代码补全的偏好反馈,可以实现近乎实时的代码生成和反馈学习,快速提高代码补全的质量,和对齐用户的偏好。 +本示例展示了如何使用 RLinf 框架训练一个能够进行在线代码补全任务的模型。 + +概述 +---- + +代码补全在线强化学习系统通过以下方式工作: + +1. **实时交互**:系统接收来自 Continue 等编辑器的代码补全请求 +2. **模型推理**:使用训练好的模型生成代码补全建议 +3. **用户反馈**:收集用户对生成代码的接受/拒绝反馈 +4. **在线学习**:基于用户反馈实时更新模型参数 + +这种实时学习机制使得模型能够快速适应用户的编程习惯和偏好。 + +运行脚本 +------- + +**环境准备** + + +首先确保您已经安装了 RLinf 框架及其依赖: + +.. code-block:: bash + + # 安装额外依赖 + pip install httpx asyncio fuzzywuzzy + +**配置 Continue 集成** + +1. 
**安装 Continue 扩展** + + 由于当前 Continue 未支持上传用户对代码补全的偏好反馈,因此我们修改了 Continue 的源码,支持上传用户对代码补全的偏好反馈。 + 用户可从 `这里 `_ 获取编译好的修改后的 Continue 插件,或自行构建。 + + 下载编译好的 Continue 插件后,在 VS Code 中安装。 + + 方法1: code --install-extension /path/to/continue-1.3.9.vsix + + 方法2: 在 VSCode 中按 Cmd+Shift+P ,输入 'Extensions: Install from VSIX',选择上述文件 + +2. **配置 Continue 设置** + + Continue 的配置文件路径为: + + .. code-block:: bash + + ~/.continue/config.yaml + + 在 Continue 的配置文件中添加以下设置: + + .. code-block:: yaml + + # 请将 http://xxx:xx/ 替换为实际的 RLinf 在线代码补全服务地址 + + models: + # 添加一个模型,用于代码补全 + - name: my-autocomplete + provider: openai + model: Qwen2.5-Coder-1.5B + apiBase: http://xxx:8081/v1 + apiKey: xxx + roles: + - autocomplete + + # 添加发送用户是否接受代码补全的反馈 + tabAutocompleteOptions: + enableCompletionTracking: true + completionTrackingUrl: http://xxx:8082/api/training/submit + completionTrackingHeaders: + Authorization: "Bearer test-token" + X-Project-ID: "test-project" + maxPromptTokens: 1024 + debounceDelay: 350 + multilineCompletions: "auto" + + 修改并保存完成后,从左侧面板打开 Continue 扩展,点击右上角的 "设置" 齿轮按钮,在 "Models" 页面确保 "Autocomplete 模型" 选用 my-autocomplete。 + +**启动训练服务** + +1. **准备模型和配置** + + 确保您有预训练的模型权重,并修改配置文件,匹配模型路径、需要使用的端口等 + + .. code-block:: yaml + + rollout: + model_dir: /path/to/your/model/DeepSeek-R1-Distill-Qwen-1.5B/ + + actor: + tokenizer: + tokenizer_model: /path/to/your/model/DeepSeek-R1-Distill-Qwen-1.5B/ + +2. **启动 RLinf 训练服务** + + .. code-block:: bash + + # 进入项目目录 + cd /path/to/rlinf_online_rl + + # 启动训练服务 + bash examples/coding_online_rl/run_main_coding_online_rl.sh qwen2.5-1.5b-ppo + + 这将启动以下服务: + - **推理服务**:在端口 8081 提供代码补全 API + - **训练服务**:在端口 8082 接收用户反馈数据 + +**与 Continue 联动** + +1. **启动 Continue** + + 在 VS Code 中启动 Continue 扩展,确保它连接到正确的 API 端点。 + +2. **开始编程** + + 在 Continue 中开始编写代码,系统将: + - 自动发送代码补全请求到推理服务 + - 接收模型生成的代码建议 + - 收集您对建议的接受/拒绝反馈 + +3. **实时学习** + + 系统会实时处理您的反馈: + - 接受的建议被标记为正面反馈 + - 拒绝的建议被标记为负面反馈 + - 模型参数根据反馈进行在线更新 + +**监控训练过程** + +您可以通过以下方式监控训练过程: + +1. **查看日志输出** + + .. 
code-block:: bash + + # 查看训练日志 + tail -f results/ppo-1.5b/train.log + +2. **使用 TensorBoard** + + .. code-block:: bash + + # 启动 TensorBoard + tensorboard --logdir results/grpo-1.5b + +3. **检查模型检查点** + + 训练过程中会定期保存模型检查点到 `results/grpo-1.5b/checkpoints/` 目录。 + +**测试客户端** + +您可以使用提供的测试客户端来验证系统功能: + +.. code-block:: bash + + # 运行测试客户端 + python examples/coding_online_rl/simple_test_client.py + +测试客户端会模拟 Continue 的行为,发送代码补全请求并提交反馈数据。 + +**故障排除** + +常见问题及解决方案: + +1. **端口冲突** + + 如果端口 8081 或 8082 被占用,请修改配置文件中的端口设置。 + +2. **模型加载失败** + + 检查模型路径是否正确,确保模型文件存在且可访问。 + +3. **Continue 连接失败** + + 确保 Continue 配置中的 API 端点地址正确,检查网络连接。还可使用 simple_test_client 测试是否能正常收到反馈数据。 + +通过以上步骤,您就可以成功运行代码补全在线强化学习系统,并实现与 Continue 编辑器的无缝集成。 diff --git a/docs/source-zh/rst_source/examples/index.rst b/docs/source-zh/rst_source/examples/index.rst index b1f1c2122..00e7759c7 100644 --- a/docs/source-zh/rst_source/examples/index.rst +++ b/docs/source-zh/rst_source/examples/index.rst @@ -2,19 +2,192 @@ =============== 本节展示了 **RLinf 目前支持的示例集合**, -展示该框架如何应用于不同场景,并演示其在实际中的高效性。 +展示该框架如何应用于不同场景,并演示其在实际中的高效性。示例库会随着时间不断扩展,涵盖新的场景和任务,以展示 RLinf 的多样性和可扩展性。 -目前,我们提供两大类示例: +具身智能场景 +---------------- -- **具身智能场景**:训练 **VLA** 模型用于具身智能。参见 :doc:`embodied`。 -- **推理场景**:训练 **LLM** 模型用于高级推理任务。参见 :doc:`reasoning`。 +具身智能场景包含SOTA模型(如pi0、pi0.5、OpenVLA-OFT)和不同模拟器(如LIBERO、ManiSkill、RoboTwin)的训练示例,以及真机强化学习训练示例等。 + +.. raw:: html + +
+
+ +

+ + 基于ManiSkill的强化学习 +
+ ManiSkill+OpenVLA+PPO/GRPO达到SOTA训练效果 +

+
+ +
+ +

+ + 基于LIBERO的强化学习 +
+ LIBERO+OpenVLA-OFT+GRPO成功率达99% +

+
+ +
+ +

+ [开发中]π₀模型强化学习训练
+ 在π₀上实现强化学习的效果跃升 +

+
+
+ +
+
+ +

+ [开发中]基于RoboTwin的强化学习
+ RoboTwin+OpenVLA-OFT+PPO达到SOTA训练效果 +

+
+ +
+ +

+ [开发中]Franka真机强化学习
+ RLinf worker无缝对接Franka机械臂 +

+
+ +
+ +

+ [开发中]基于世界模型的强化学习
+ 集成UnifoLM-WMA-0世界模型的强化学习训练 +

+
+
+ + +推理场景 +-------------- + +强化学习是提升模型推理能力的关键手段,RLinf支持主流模型如Qwen、Qwen-next在Math等场景的强化学习训练,并达到SOTA的训练效果。 + +.. raw:: html + +
+
+ +

+ + Math推理的强化学习训练 +
+ AIME24/AIME25/GPQA-diamond评测结果达到SOTA +

+
+ +
+ +

+ [开发中]MoE模型强化学习训练
+ MoE RL训练速度相比同类工具提升xx% +

+
+ +
+ +

+ [开发中]Qwen-next强化学习训练
+ Qwen-next强化学习训练效果达到SOTA +

+
+
+ + +智能体场景 +-------------- + +RLinf的worker抽象、灵活的通信组件、以及对不同类型加速器的支持使RLinf天然支持智能体工作流的构建,以及智能体的训练。以下示例包含智能体工作流构建、在线强化学习训练、环境接入等示例。 + +.. raw:: html + +
+
+ +

+ + 代码补全在线强化学习开源版 +
+ 基于RLinf+continue实现端到端在线强化学习,模型效果提升xx% +

+
+ +
+ +

+ [适配中]rStar2-agent强化学习
+ 支持各组件所用资源量的灵活配置与调度 +

+
+ +
+ +

+ [适配中]SWE-agent
+ 部署、推理、训练一体,高灵活性、高性能 +

+
+
+ + +实用系统功能 +-------------------- + +RLinf的整体设计简洁且模块化,以Worker为抽象封装强化学习训练、智能体中的组件,提供灵活高效的通信库做组件间通信。基于这种解耦的设计,可以灵活调度Worker所使用的计算资源,也可以将Worker分配到更适配的加速器上。 + +.. raw:: html + +
+
+ +

+ [开发中]Worker(组件)间秒级热切换
+ 秒级热切换提升训练速度50%+ +

+
+ +
+ +

+ [开发中]异构加速器混合训练
+ 使用不同加速器运行的组件间灵活互通,构建训练工作流 +

+
+
-示例库会随着时间不断扩展,涵盖新的场景和任务, -以展示 RLinf 的多样性和可扩展性。 .. toctree:: :hidden: :maxdepth: 2 - embodied + maniskill + libero reasoning + coding_online_rl diff --git a/docs/source-zh/rst_source/examples/libero.rst b/docs/source-zh/rst_source/examples/libero.rst new file mode 100644 index 000000000..14b27a7c7 --- /dev/null +++ b/docs/source-zh/rst_source/examples/libero.rst @@ -0,0 +1,222 @@ +基于LIBERO模拟器的强化学习训练 +=========================================================== + +.. |huggingface| image:: /_static/svg/hf-logo.svg + :width: 16px + :height: 16px + :class: inline-icon + +本文档给出在 RLinf 框架内启动与管理 **Vision-Language-Action Models (VLAs)** 训练任务的完整指南, +在 LIBERO 环境中微调 VLA 模型以完成机器人操作。 + +主要目标是让模型具备以下能力: + +1. **视觉理解**:处理来自机器人相机的 RGB 图像。 +2. **语言理解**:理解自然语言的任务描述。 +3. **动作生成**:产生精确的机器人动作(位置、旋转、夹爪控制)。 +4. **强化学习**:结合环境反馈,使用 PPO 优化策略。 + +环境 +----------------------- + +**LIBERO 环境** + +- **Environment**:基于 *robosuite*(MuJoCo)的 LIBERO 仿真基准 +- **Task**:指挥一台 7 自由度机械臂完成多种家居操作技能(抓取放置、叠放、开抽屉、空间重排等) +- **Observation**:工作区周围离屏相机采集的 RGB 图像(常见分辨率 128×128 或 224×224) +- **Action Space**:7 维连续动作 + - 末端执行器三维位置控制(x, y, z) + - 三维旋转控制(roll, pitch, yaw) + - 夹爪控制(开/合) + +**任务描述格式** + +.. code-block:: text + + In: What action should the robot take to [task_description]? + Out: + +**数据结构** + +- **Images**:RGB 张量 ``[batch_size, 3, 224, 224]`` +- **Task Descriptions**:自然语言指令 +- **Actions**:归一化的连续值,转换为离散 tokens +- **Rewards**:基于任务完成度的逐步奖励 + +算法 +----------------------------------------- + +**核心算法组件** + +1. **PPO(Proximal Policy Optimization)** + + - 使用 GAE(Generalized Advantage Estimation)进行优势估计 + - 基于比率的策略裁剪 + - 价值函数裁剪 + - 熵正则化 + +2. **GRPO(Group Relative Policy Optimization)** + + - 对于每个状态/提示,策略生成 *G* 个独立动作 + - 以组内平均奖励为基线,计算每个动作的相对优势 + +3. **Vision-Language-Action 模型** + + - OpenVLA 架构,多模态融合 + - 动作 token 化与反 token 化 + - 带 Value Head 的 Critic 功能 + +运行脚本 +------------------- + +**1. 关键参数配置** + +.. 
code-block:: yaml + + cluster: + num_nodes: 2 + component_placement: + env: 0-7 + rollout: 8-15 + actor: 0-15 + + rollout: + pipeline_stage_num: 2 + +你可以灵活配置 env、rollout、actor 三个组件使用的 GPU 数量。 +使用上述配置,可以让 env 与 rollout 之间流水线重叠,并与 actor 共享。 +此外,在配置中设置 `pipeline_stage_num = 2`,可实现 **rollout 与 actor** 之间的流水线重叠,从而提升 rollout 效率。 + +.. code-block:: yaml + + cluster: + num_nodes: 1 + component_placement: + env,rollout,actor: all + +你也可以重新配置 Placement,实现 **完全共享**:env、rollout、actor 三个组件共享全部 GPU。 + +.. code-block:: yaml + + cluster: + num_nodes: 2 + component_placement: + env: 0-3 + rollout: 4-7 + actor: 8-15 + +你还可以重新配置 Placement,实现 **完全分离**:env、rollout、actor 各用各的 GPU、互不干扰, +这样就不需要 offload 功能。 + +**2. 配置文件** + + 支持 **OpenVLA-OFT** 模型,算法为 **PPO** 与 **GRPO**。 + 对应配置文件: + + - **OpenVLA-OFT + PPO**:``examples/embodiment/config/libero_10_ppo_openvlaoft.yaml`` + - **OpenVLA-OFT + GRPO**:``examples/embodiment/config/libero_10_grpo_openvlaoft.yaml`` + +**3. 启动命令** + +选择配置后,运行以下命令开始训练: + +.. code-block:: bash + + bash examples/embodiment/run_embodiment.sh CHOSEN_CONFIG + +例如,在 LIBERO 环境中使用 PPO 训练 OpenVLA 模型: + +.. code-block:: bash + + bash examples/embodiment/run_embodiment.sh libero_10_ppo_openvlaoft + +可视化与结果 +------------------------- + +**1. TensorBoard 日志** + +.. code-block:: bash + + # 启动 TensorBoard + tensorboard --logdir ./logs --port 6006 + +**2. 关键监控指标** + +- **训练指标**: + + - ``actor/loss``:PPO 策略损失 + - ``actor/value_loss``:价值函数损失 + - ``actor/entropy``:策略熵 + - ``actor/grad_norm``:梯度范数 + - ``actor/lr``:学习率 + +- **Rollout 指标**: + + - ``rollout/reward_mean``:平均回合奖励 + - ``rollout/reward_std``:奖励标准差 + - ``rollout/episode_length``:平均回合长度 + - ``rollout/success_rate``:任务完成率 + +- **环境指标**: + + - ``env/success_rate``:各环境的成功率 + - ``env/step_reward``:逐步奖励 + - ``env/termination_rate``:回合终止率 + +**3. 视频生成** + +.. code-block:: yaml + + video_cfg: + save_video: True + info_on_video: True + video_base_dir: ./logs/video/train + +**4. WandB 集成** + +.. 
code-block:: yaml + + trainer: + logger: + wandb: + enable: True + project_name: "RLinf" + experiment_name: "openvla-libero" + +LIBERO 结果 +~~~~~~~~~~~~~~~~~~~ + +此外,我们在 LIBERO 环境中使用 GRPO 训练了 OpenVLA-OFT。 +通过 RL 微调所获得的改进如下: + +.. list-table:: **LIBERO 上 OpenVLA-OFT 的模型结果** + :header-rows: 1 + + * - 模型 + - `Spatial `_ + - `Goal `_ + - `Object `_ + - `Long `_ + - 平均值 + * - OpenVLA-OFT-SFT (one-shot) + - 56.5% + - 45.6% + - 25.6% + - 9.7% + - 34.4% + * - OpenVLA-OFT-RLinf + - **99.0%** + - **99.0%** + - **99.0%** + - **94.4%** + - **97.9%** + * - 提升 + - +42.5% + - +53.4% + - +73.4% + - +84.7% + - +63.5% + +在 Libero 实验中,我们参考了 +`SimpleVLA `_,仅做了少量改动。 +感谢作者开源代码。 diff --git a/docs/source-zh/rst_source/examples/embodied.rst b/docs/source-zh/rst_source/examples/maniskill.rst similarity index 78% rename from docs/source-zh/rst_source/examples/embodied.rst rename to docs/source-zh/rst_source/examples/maniskill.rst index cf510b986..e9d00fb7f 100644 --- a/docs/source-zh/rst_source/examples/embodied.rst +++ b/docs/source-zh/rst_source/examples/maniskill.rst @@ -1,13 +1,13 @@ -具身智能 RL-VLA -======================== +基于ManiSkill模拟器的强化学习训练 +====================================================== .. |huggingface| image:: /_static/svg/hf-logo.svg :width: 16px :height: 16px :class: inline-icon -本文档给出在 RLinf 框架内启动与管理 **Vision-Language-Action Models (VLAs)** 训练任务的完整指南, -重点是在 ManiSkill3/LIBERO 环境中微调 VLA 模型以完成机器人操作。 +本文档给出在 RLinf 框架内启动与管理 **Vision-Language-Action Models (VLAs)** 训练任务的完整指南, +在ManiSkill3环境中微调VLA模型以完成机器人操作。 主要目标是让模型具备以下能力: @@ -29,16 +29,6 @@ - 三维旋转控制(roll, pitch, yaw) - 夹爪控制(开/合) -**LIBERO 环境** - -- **Environment**:基于 *robosuite*(MuJoCo)的 LIBERO 仿真基准 -- **Task**:指挥一台 7 自由度机械臂完成多种家居操作技能(抓取放置、叠放、开抽屉、空间重排等) -- **Observation**:工作区周围离屏相机采集的 RGB 图像(常见分辨率 128×128 或 224×224) -- **Action Space**:7 维连续动作 - - 末端执行器三维位置控制(x, y, z) - - 三维旋转控制(roll, pitch, yaw) - - 夹爪控制(开/合) - **任务描述格式** .. code-block:: text @@ -120,10 +110,6 @@ **2. 配置文件** -当前我们支持两个环境:**ManiSkill3** 与 **LIBERO**。 - -1. 
**ManiSkill3 环境** - 支持两种模型:**OpenVLA** 与 **OpenVLA-OFT**;两种算法:**PPO** 与 **GRPO**。 对应配置文件: @@ -132,14 +118,6 @@ - **OpenVLA + GRPO**:``examples/embodiment/config/maniskill_grpo_openvla.yaml`` - **OpenVLA-OFT + GRPO**:``examples/embodiment/config/maniskill_grpo_openvlaoft.yaml`` -2. **LIBERO 环境** - - 支持 **OpenVLA-OFT** 模型,算法为 **PPO** 与 **GRPO**。 - 对应配置文件: - - - **OpenVLA-OFT + PPO**:``examples/embodiment/config/libero_10_ppo_openvlaoft.yaml`` - - **OpenVLA-OFT + GRPO**:``examples/embodiment/config/libero_10_grpo_openvlaoft.yaml`` - **3. 启动命令** 选择配置后,运行以下命令开始训练: @@ -279,41 +257,3 @@ ManiSkill3 结果 Your browser does not support the video tag. - -LIBERO 结果 -~~~~~~~~~~~~~~~~~~~ - -此外,我们在 LIBERO 环境中使用 GRPO 训练了 OpenVLA-OFT。 -通过 RL 微调所获得的改进如下: - -.. list-table:: **LIBERO 上 OpenVLA-OFT 的模型结果** - :header-rows: 1 - - * - 模型 - - `Spatial `_ - - `Goal `_ - - `Object `_ - - `Long `_ - - 平均值 - * - OpenVLA-OFT-SFT (one-shot) - - 56.5% - - 45.6% - - 25.6% - - 9.7% - - 34.4% - * - OpenVLA-OFT-RLinf - - **99.0%** - - **99.0%** - - **99.0%** - - **94.4%** - - **97.9%** - * - 提升 - - +42.5% - - +53.4% - - +73.4% - - +84.7% - - +63.5% - -在 Libero 实验中,我们参考了 -`SimpleVLA `_,仅做了少量改动。 -感谢作者开源代码。 diff --git a/docs/source-zh/rst_source/examples/reasoning.rst b/docs/source-zh/rst_source/examples/reasoning.rst index 2ac1e1791..a0b5d4b93 100644 --- a/docs/source-zh/rst_source/examples/reasoning.rst +++ b/docs/source-zh/rst_source/examples/reasoning.rst @@ -1,5 +1,5 @@ -推理 RL-LLM -================= +Math推理的强化学习训练 +================================ .. 
|huggingface| image:: /_static/svg/hf-logo.svg :width: 16px diff --git a/docs/source-zh/rst_source/start/installation.rst b/docs/source-zh/rst_source/start/installation.rst index 55ada5280..84038f8de 100644 --- a/docs/source-zh/rst_source/start/installation.rst +++ b/docs/source-zh/rst_source/start/installation.rst @@ -21,12 +21,6 @@ RLinf 支持多种后端引擎,用于训练和推理。目前支持以下配 - **Huggingface**:简单易用,配套 Huggingface 生态提供的原生 API。 -安装方式 --------------------- - -RLinf 提供两种安装方式。我们 **推荐使用 Docker**,因为这可以提供最快速、最可复现的环境。 -如果你的系统无法使用 Docker 镜像,也可以选择在本地 Python 环境中手动安装。 - 硬件要求 ~~~~~~~~~~~~~~~~~~~~~~~ @@ -70,19 +64,25 @@ RLinf 提供两种安装方式。我们 **推荐使用 Docker**,因为这可 * - NVIDIA Container Toolkit - 1.17.8 -使用 Docker 镜像安装 + +安装方式 +-------------------- + +RLinf 提供两种安装方式。我们 **推荐使用 Docker**,因为这可以提供最快速、最可复现的环境。 +如果你的系统无法使用 Docker 镜像,也可以选择在本地 Python 环境中手动安装。 + +安装方式1: Docker 镜像 ------------------------- 我们提供了两个官方镜像,分别针对不同后端配置进行了优化: -- **Megatron + SGLang/vLLM**: +- **基于Megatron + SGLang/vLLM的数学推理镜像**: - ``rlinf/rlinf:math-rlinf0.1-torch2.5.1-sglang0.4.4-vllm0.7.1-megatron0.11.0-te2.1`` (用于增强大语言模型在 MATH 任务中的推理能力) -- **FSDP + Huggingface**: +- **基于FSDP + Huggingface的具身智能镜像**: - - ``rlinf/rlinf:agentic-openvla-rlinf0.1-torch2.5.1`` (适用于 OpenVLA 模型) - - ``rlinf/rlinf:agentic-openvlaoft-rlinf0.1-torch2.5.1`` (适用于 OpenVLA-OFT 模型) + - ``rlinf/rlinf:agentic-rlinf0.1-torch2.6.0-openvla-openvlaoft-pi0`` (适用于 OpenVLA/OpenVLA-OFT/Pi0 模型) 确认适合你任务的镜像后,拉取镜像: @@ -97,7 +97,6 @@ RLinf 提供两种安装方式。我们 **推荐使用 Docker**,因为这可 docker run -it --gpus all \ --shm-size 100g \ --net=host \ - --env NVIDIA_DRIVER_CAPABILITIES=compute,utility,graphics \ --name rlinf \ rlinf/rlinf:CHOSEN_IMAGE /bin/bash @@ -108,23 +107,30 @@ RLinf 提供两种安装方式。我们 **推荐使用 Docker**,因为这可 git clone https://github.com/RLinf/RLinf.git cd RLinf +具身智能镜像中包含多个 Python 虚拟环境(venv),位于 ``/opt/venv`` 目录下,分别对应不同模型,即 ``openvla``、``openvla-oft`` 和 ``pi0``。 +默认环境设置为 ``openvla``。 +要切换到所需的 venv,可以使用内置脚本 `switch_env`: +.. 
code-block:: bash + + source switch_env # 例如,source switch_env openvla-oft, source switch_env pi0 等 + .. tip:: 如果进行多节点训练,请将仓库克隆到共享存储路径,确保每个节点都能访问该代码。 -自定义环境安装 +安装方式2:UV 自定义环境 ------------------------------- +**如果你已经使用了 Docker 镜像,下面步骤可跳过。** -根据你的实验类型,安装分为三步进行: +根据你的实验类型,安装分为两步进行: -第一步,对于所有实验,请先完成 :ref:`共同依赖 ` 中的依赖安装, -这一步已经包括了 **FSDP + Huggingface** 的完整配置。 +第一步,对于所有实验类型,请先完成 :ref:`共同依赖 ` 中的依赖安装。 -第二步,如果你的实验使用的是 **Megatron 和 SGLang/vLLM** 后端, -请参考 :ref:`Megatron 及 SGLang/vLLM 依赖 ` 安装相应依赖。 +第二步,根据你的实验类型,安装对应的依赖。 -第三步,如果你要运行具身智能相关实验(如 OpenVLA、OpenVLA-OFT、Pi0), -请参考 :ref:`具身智能依赖 ` 安装专用依赖项。 +* 如果你要运行数学推理实验,需要安装 **Megatron 和 SGLang/vLLM** 后端,请参考 :ref:`Megatron 和 SGLang/vLLM 依赖 ` 安装相应依赖。 + +* 如果你要运行具身智能相关实验(如 OpenVLA、OpenVLA-OFT、Pi0),请参考 :ref:`具身智能相关依赖 ` 安装专用依赖项。 .. _common-dependencies: @@ -153,13 +159,17 @@ RLinf 提供两种安装方式。我们 **推荐使用 Docker**,因为这可 Megatron 和 SGLang/vLLM 依赖 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. note:: + 如果你运行的是具身智能实验,则无需安装这些依赖。 + 请直接跳转到 :ref:`具身智能相关依赖 ` 部分。 + 运行以下命令,安装 Megatron、SGLang/vLLM 及其所需依赖: .. code-block:: shell - uv sync --extra sgl_vllm + uv sync --extra sglang-vllm mkdir -p /opt && git clone https://github.com/NVIDIA/Megatron-LM.git -b core_r0.13.0 /opt/Megatron-LM - APEX_CPP_EXT=1 APEX_CUDA_EXT=1 uv pip install -r requirements/megatron.txt --no-build-isolation + APEX_CPP_EXT=1 APEX_CUDA_EXT=1 NVCC_APPEND_FLAGS="--threads 24" APEX_PARALLEL_BUILD=24 uv pip install -r requirements/megatron.txt --no-build-isolation 使用 Megatron 前,请将其路径加入 ``PYTHONPATH`` 环境变量: @@ -167,18 +177,6 @@ Megatron 和 SGLang/vLLM 依赖 export PYTHONPATH=/opt/Megatron-LM:$PYTHONPATH -SGLang 安装: - -.. code-block:: shell - - uv sync --extra sglang - -vLLM 安装: - -.. code-block:: shell - - uv sync --extra vllm - .. _embodied-dependencies: 具身智能相关依赖 @@ -188,15 +186,30 @@ vLLM 安装: .. 
code-block:: shell - bash requirements/install_embodied_deps.sh uv sync --extra embodied + bash requirements/install_embodied_deps.sh # Must be run after the above command 接着,根据具体实验类型安装对应的 Python 包: .. code-block:: shell - # OpenVLA / OpenVLA-OFT 实验所需依赖 + # OpenVLA 实验所需依赖 UV_TORCH_BACKEND=auto uv pip install -r requirements/openvla.txt --no-build-isolation + # OpenVLA-oft 实验所需依赖 + UV_TORCH_BACKEND=auto uv pip install -r requirements/openvla_oft.txt --no-build-isolation + # Pi0 实验所需依赖 UV_TORCH_BACKEND=auto uv pip install -r requirements/pi0.txt --no-build-isolation + +最后,运行以下命令安装 libero 依赖。 + +.. code-block:: shell + + mkdir -p /opt && git clone https://github.com/RLinf/LIBERO.git /opt/libero + +在使用 LIBERO 前,请确保将其路径添加到 ``PYTHONPATH`` 环境变量中: + +.. code-block:: shell + + export PYTHONPATH=/opt/libero:$PYTHONPATH diff --git a/docs/source-zh/rst_source/start/llm.rst b/docs/source-zh/rst_source/start/llm.rst index 4435b15bf..37155e02e 100644 --- a/docs/source-zh/rst_source/start/llm.rst +++ b/docs/source-zh/rst_source/start/llm.rst @@ -50,14 +50,15 @@ 我们推荐你修改配置文件 ``./examples/math/config/qwen2.5-1.5b-single-gpu.yaml`` 中的参数 ``cluster.component_placement``。 -你可以根据资源情况将其动态设置为 **1, 2, 4 或 8**。 +你可以根据实际资源将该项设置为 **0-1**, **0-3** 或 **0-7**来使用 2/4/8 张 GPU。 +查看 :doc:`../tutorials/user/yaml` 以获取有关 Placement 配置的更详细说明。 .. 
code-block:: yaml cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout: 0 在运行脚本之前,请根据你的模型和数据集下载路径, 在 YAML 配置文件中修改以下字段: diff --git a/docs/source-zh/rst_source/start/vla-eval.rst b/docs/source-zh/rst_source/start/vla-eval.rst index 499777f2f..e671ddc73 100644 --- a/docs/source-zh/rst_source/start/vla-eval.rst +++ b/docs/source-zh/rst_source/start/vla-eval.rst @@ -94,9 +94,8 @@ RLinf 提供了 **即开即用的评估脚本**,用于在 *训练分布内* export PYTHONPATH=${REPO_PATH}:$PYTHONPATH # LIBERO 仓库路径 - export LIBERO_REPO_PATH="/root/LIBERO" - export LIBERO_CONFIG_PATH=${LIBERO_REPO_PATH} - export PYTHONPATH=${LIBERO_REPO_PATH}:$PYTHONPATH + export LIBERO_PATH="/opt/LIBERO" + export PYTHONPATH=${LIBERO_PATH}:$PYTHONPATH export CUDA_LAUNCH_BLOCKING=1 export HYDRA_FULL_ERROR=1 diff --git a/docs/source-zh/rst_source/start/vla.rst b/docs/source-zh/rst_source/start/vla.rst index b92451c08..fc7b5a537 100644 --- a/docs/source-zh/rst_source/start/vla.rst +++ b/docs/source-zh/rst_source/start/vla.rst @@ -34,20 +34,29 @@ ManiSkill3 是一个基于 GPU 加速的机器人研究仿真平台, **步骤 2:运行官方提供的训练脚本** -为方便使用,我们提供的配置文件默认支持单卡训练。 +.. note:: + 如果你是通过 Docker 镜像安装的 **RLinf**(见 :doc:`./installation`),请确保已切换到目标模型对应的 Python 环境。 + 默认环境为 ``openvla``。 + 若使用 OpenVLA-OFT 或 Pi0,请使用内置脚本 `switch_env` 切换环境: + ``source switch_env openvla-oft`` 或 ``source switch_env pi0``。 + + 如果你是通过自定义环境安装的 **RLinf**,请确保已安装对应模型的依赖,详见 :doc:`./installation`。 + +为方便使用,我们提供的配置文件需要至少双卡进行训练。 如果你有多张 GPU 并希望加快训练速度, 建议你修改配置文件 ``./examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml`` 中的参数 ``cluster.component_placement``。 -你可以根据实际资源设置为 **1、2、4 或 8**。 +你可以根据实际资源将该项设置为 **0-3** 或 **0-7**来使用 4/8 张 GPU。 +查看 :doc:`../tutorials/user/yaml` 以获取有关 Placement 配置的更详细说明。 .. 
code-block:: yaml cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout: 0-1 运行脚本之前,请根据你下载的模型和数据集路径,修改 YAML 文件中的以下字段: diff --git a/docs/source-zh/rst_source/tutorials/user/yaml.rst b/docs/source-zh/rst_source/tutorials/user/yaml.rst index 82a7d61c4..b169c4ae8 100644 --- a/docs/source-zh/rst_source/tutorials/user/yaml.rst +++ b/docs/source-zh/rst_source/tutorials/user/yaml.rst @@ -695,7 +695,6 @@ algorithm n_chunk_steps: 10 n_eval_chunk_steps: 10 - rollout_micro_batch_size: 256 num_group_envs: 32 rollout_epoch: 1 @@ -710,27 +709,23 @@ algorithm ``algorithm.auto_reset``:是否在 episode 结束时自动重置环境。 -``algorithm.ignore_terminations``:训练时是否忽略 episode 的终止信号。 +``algorithm.ignore_terminations``:训练时是否忽略 episode 的终止信号(若开启,episode 仅在达到最大步数时结束)。 ``algorithm.use_fixed_reset_state_ids``:是否使用固定 reset 状态 ID(GRPO 推荐 True,PPO 默认为 False,旨在随机化)。 -``algorithm.require_values``:是否需要同时计算值函数。 +``algorithm.require_values``:是否需要同时计算价值函数。 ``algorithm.normalize_advantages``:是否对优势值归一化处理。 -``algorithm.kl_penalty``:KL 散度的估算方式(kl 或 kl_penalty)。 +``algorithm.n_chunk_steps``:每个 rollout epoch 中的 chunk 数量(调用模型 predict 的次数)。 -``algorithm.n_chunk_steps``:每个 chunk 的动作步数。 - -``algorithm.n_eval_chunk_steps``:评估模式下每个 chunk 的动作步数。 - -``algorithm.rollout_micro_batch_size``:Rollout 生成时的微批大小。 +``algorithm.n_eval_chunk_steps``:评估模式下的 chunk 数量。 ``algorithm.num_group_envs``:环境组数量(用于并行)。 ``algorithm.rollout_epoch``:每个训练步骤前的 rollout 轮数。 -``algorithm.reward_type``:奖励聚合层级(chunk_level、token_level、step_level)。 +``algorithm.reward_type``:奖励聚合层级(chunk_level、action_level)。 ``algorithm.logprob_type``:对数概率的计算层级。 @@ -793,10 +788,6 @@ rollout ``rollout.backend``:模型后端(huggingface、vllm)。 -``rollout.enforce_eager``:禁用 CUDA graph,以更快完成初始化。 - -``rollout.enable_offload``:启用模型下放以降低内存占用。 - ``rollout.pipeline_stage_num``:模型并行的流水线阶段数。 actor diff --git a/examples/coding_online_rl/config/qwen2.5-1.5b-ppo.yaml b/examples/coding_online_rl/config/qwen2.5-1.5b-ppo.yaml new file mode 100644 index 000000000..35d9b1833 --- 
/dev/null +++ b/examples/coding_online_rl/config/qwen2.5-1.5b-ppo.yaml @@ -0,0 +1,300 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + rollout: 0-3 + inference: 4-5 + actor: 6-7 + +runner: + task_type: coding_online_rl + logger: + log_path: ${runner.output_dir}/${runner.experiment_name} + project_name: rlinf + experiment_name: ${runner.experiment_name} + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 10 + max_steps: -1 + + val_check_interval: 1 + save_interval: 10 + + seq_length: 2560 + + enable_dynamic_batch_size: False + max_tokens_per_mbs: 2560 + + resume_dir: null + experiment_name: online-ppo-1.5b-pipeline + output_dir: /mnt/public/zhuchunyang_rl/logs + +algorithm: + group_size: 1 + + n_minibatches: 2 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. 
+ + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: True + shuffle_rollout: False + + max_num_gen_batches: 1 + + # PPO loss params (no critic model) + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + + # Control critic usage (similar to AReaL's disable_head) + use_critic: False # Disable critic model + use_value_loss: False # Disable value loss computation + + # PPO parameters for no-critic setup + gamma: 0.99 + gae_lambda: 0.95 + # value_clip and huber_delta not needed without critic + + # Use no-critic GAE advantage computation + adv_type: math_gae_no_critic + normalize_advantages: False + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 0.1 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + stop: [ + "<|endoftext|>", + "<|fim_prefix|>", + "<|fim_middle|>", + "<|fim_suffix|>", + "<|fim_pad|>", + "<|repo_name|>", + "<|file_sep|>", + "<|im_start|>", + "<|im_end|>", + ] + +inference: + model_arch: ${rollout.model_arch} + group_name: "InferenceGroup" + load_from_actor: True + model: + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + sequence_parallel: True + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /mnt/public/hf_models/Qwen2.5-Coder-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. 
+ distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: True # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: sglang # online_rl now only support sglang + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + # not used, but reserved to pass config validate + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. 
+ +data: + max_prompt_length: 1024 + rollout_batch_size: 16 + seed: 1234 + +actor: + group_name: "ActorGroup" + training_backend: megatron + mcore_gpt: True + spec_name: decoder_gpt + + checkpoint_load_path: null + + offload_optimizer: True + offload_weight: True + offload_grad: True + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: fp16 + add_bias_linear: False + + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + + activation: swiglu + sequence_parallel: True + # recompute_method: block + # recompute_granularity: selective + + recompute_method: block + recompute_granularity: full + recompute_num_layers: 20 + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + + normalization: rmsnorm + + position_embedding_type: rope + + apply_rope_fusion: True + bias_dropout_fusion: False + persist_layer_norm: False + bias_activation_fusion: False + attention_softmax_in_fp32: True + batch_p2p_comm: False + variable_seq_lengths: True + gradient_accumulation_fusion: False + moe_token_dispatcher_type: alltoall + use_cpu_initialization: False + + optim: + optimizer: adam + bf16: True + fp16: False + lr: 2e-06 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-7 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: False + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-6 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: ${rollout.model_dir} + use_fast: False + trust_remote_code: True + padding_side: 'right' + + megatron: + ddp_bucket_size: null + distributed_backend: nccl # Support 'nccl' and 'gloo' + distributed_timeout_minutes: 30 + ckpt_format: torch + use_dist_ckpt: False + tp_comm_bootstrap_backend: nccl 
+ tp_comm_overlap_cfg: null # tp_comm_overlap_cfg.yaml + use_hf_ckpt: True # if true, will transfer hf model to generate megatron checkpoint and use it for training. + use_profiler: False # if true, will enable torch profiler when training, pay attention it has influence on performance + + ckpt_convertor: # config for ckpt convertor + model: Qwen2.5-Coder-1.5B + model_type: null # will be set by hf model's config if null + hf_model_path: ${rollout.model_dir} # path to the hf model + save_path: ${runner.output_dir}/${runner.experiment_name}/converted_ckpts/actor + use_gpu_num : 0 + use_gpu_index: null + process_num: 16 # number of processes to use for checkpointing + tensor_model_parallel_size: ${actor.model.tensor_model_parallel_size} + pipeline_model_parallel_size: ${actor.model.pipeline_model_parallel_size} + + profiler: # profile megatron when inference and traning + output_dir: ${runner.output_dir}/${runner.experiment_name}/profiler + activities: ["cpu", "cuda"] + record_shapes: False + profile_memory: False + with_stack: False + with_flops: False + with_modules: True + export_tensorboard: True + export_chrome_trace: False + chrome_filename_prefix: "chrome_trace" + schedule_warmup: 2 + schedule_active: 1 + schedule_repeat: 1 # inference and training will repeat such times + # schedule_wait: it will be set at runtime + +reward: + use_reward_model: False + reward_type: fim_verify_call + reward_scale: 5.0 + +critic: + use_critic_model: False + +server: + # online serving and user reward track + online_router: + host: 0.0.0.0 + port: 8081 + + tracking_rollout: + host: 0.0.0.0 + port: 8082 + enable_dummy_data: True diff --git a/examples/math/config/tp_comm_overlap_cfg.yaml b/examples/coding_online_rl/config/tp_comm_overlap_cfg.yaml similarity index 100% rename from examples/math/config/tp_comm_overlap_cfg.yaml rename to examples/coding_online_rl/config/tp_comm_overlap_cfg.yaml diff --git a/examples/coding_online_rl/main_coding_online_rl.py 
b/examples/coding_online_rl/main_coding_online_rl.py new file mode 100644 index 000000000..c5246292c --- /dev/null +++ b/examples/coding_online_rl/main_coding_online_rl.py @@ -0,0 +1,106 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json + +import hydra +import torch.multiprocessing as mp +from omegaconf.omegaconf import OmegaConf + +from rlinf.config import validate_cfg +from rlinf.runners.coding_online_rl_runner import CodingOnlineRLRunner +from rlinf.scheduler import Cluster +from rlinf.scheduler.placement import PackedPlacementStrategy +from rlinf.utils.placement import ModelParallelComponentPlacement, PlacementMode +from rlinf.utils.utils import output_redirector +from rlinf.workers.actor.megatron_actor_worker import MegatronActor +from rlinf.workers.inference.megatron_inference_worker import MegatronInference +from rlinf.workers.rollout.server.online_router_worker import OnlineRouterWorker +from rlinf.workers.rollout.server.server_rollout_worker import ServerRolloutWorker +from rlinf.workers.rollout.utils import get_rollout_backend_worker + +"""Script to start GRPO training""" +mp.set_start_method("spawn", force=True) + + +@hydra.main(version_base="1.1") +@output_redirector +def main(cfg) -> None: + cfg = validate_cfg(cfg) + print(json.dumps(OmegaConf.to_container(cfg, resolve=True), indent=2)) + + cluster = Cluster(num_nodes=cfg.cluster.num_nodes) + component_placement = ModelParallelComponentPlacement(cfg, 
cluster) + + singleton_placement_strategy = PackedPlacementStrategy( + start_accelerator_id=0, end_accelerator_id=0 + ) + online_router = OnlineRouterWorker.create_group(cfg, component_placement).launch( + cluster=cluster, + name="OnlineRouterWorker", + placement_strategy=singleton_placement_strategy, + ) + server_rollout = ServerRolloutWorker.create_group(cfg).launch( + cluster=cluster, + name="ServerRolloutWorker", + placement_strategy=singleton_placement_strategy, + ) + + rollout_worker_cls = get_rollout_backend_worker(cfg, component_placement) + + # Rollout group + rollout_placement_strategy = component_placement.get_strategy("rollout") + rollout_group = rollout_worker_cls.create_group(cfg, component_placement).launch( + cluster, + name=cfg.rollout.group_name, + placement_strategy=rollout_placement_strategy, + ) + + # Inference group + inference_group = None + if ( + component_placement.placement_mode == PlacementMode.DISAGGREGATED + and cfg.algorithm.recompute_logprobs + ): + inference_placement_strategy = component_placement.get_strategy("inference") + inference_group = MegatronInference.create_group( + cfg, component_placement + ).launch( + cluster, + name=cfg.inference.group_name, + placement_strategy=inference_placement_strategy, + ) + + # GRPO Actor group + actor_placement_strategy = component_placement.get_strategy("actor") + actor_group = MegatronActor.create_group(cfg, component_placement).launch( + cluster, name=cfg.actor.group_name, placement_strategy=actor_placement_strategy + ) + + runner = CodingOnlineRLRunner( + cfg=cfg, + placement=component_placement, + rollout=rollout_group, + inference=inference_group, + actor=actor_group, + online_router=online_router, + server_rollout=server_rollout, + ) + + runner.init_workers() + runner.run() + + +if __name__ == "__main__": + main() diff --git a/examples/math/run_main_math_grpo_megatron.sh b/examples/coding_online_rl/run_main_coding_online_rl.sh similarity index 71% rename from 
examples/math/run_main_math_grpo_megatron.sh rename to examples/coding_online_rl/run_main_coding_online_rl.sh index f826f882f..a547d773f 100644 --- a/examples/math/run_main_math_grpo_megatron.sh +++ b/examples/coding_online_rl/run_main_coding_online_rl.sh @@ -13,9 +13,9 @@ MEGATRON_PATH=/opt/Megatron-LM export PYTHONPATH=${REPO_PATH}:${MEGATRON_PATH}:$PYTHONPATH if [ -z "$1" ]; then - CONFIG_NAME="qwen2.5-1.5b-grpo-megatron" + CONFIG_NAME="qwen2.5-1.5b-ppo" else CONFIG_NAME=$1 fi -python ${REPO_PATH}/examples/math/main_math.py --config-path ${CONFIG_PATH}/config/ --config-name $CONFIG_NAME \ No newline at end of file +python ${REPO_PATH}/examples/coding_online_rl/main_coding_online_rl.py --config-path ${CONFIG_PATH}/config/ --config-name $CONFIG_NAME \ No newline at end of file diff --git a/examples/coding_online_rl/simple_test_client.py b/examples/coding_online_rl/simple_test_client.py new file mode 100644 index 000000000..570e6e842 --- /dev/null +++ b/examples/coding_online_rl/simple_test_client.py @@ -0,0 +1,110 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import asyncio +import uuid +from datetime import datetime + +import httpx + +batch_size = 16 +epoch = 10 * 2 + + +async def agenerate(prefix, suffix): + TARGET_URL = "http://127.0.0.1:8081/v1/completions" + + headers = { + "Content-Type": "application/json", + "Authorization": "Bearer test-token", + } + body = { + "model": "test-model", + "prompt": f"<|fim_prefix|>{prefix}<|fim_suffix|>{suffix}<|fim_middle|>", + "max_tokens": 50, + "temperature": 0.7, + "stream": False, + } + + async with httpx.AsyncClient() as client: + response = await client.post( + TARGET_URL, + headers=headers, + json=body, + timeout=15.0, + ) + print(f"agenerate get response: {response.json()}") + return response.json()["choices"][0]["text"] + + +async def atrack(prefix, suffix, completion, accepted): + TARGET_URL = "http://127.0.0.1:8082/api/training/submit" + + headers = { + "Content-Type": "application/json", + "Authorization": "Bearer test-token", + } + + body = { + "completionId": str(uuid.uuid4()), + "filepath": "file:///Users/qurakchin/.vscode/extensions/continue.continue-1.2.3-darwin-arm64/continue_tutorial.py", + "prefix": prefix, + "suffix": suffix, + "prompt": f"<|fim_prefix|>{prefix}<|fim_suffix|>{suffix}<|fim_middle|>", + "completion": completion, + "modelProvider": "openai", + "modelName": "Qwen2.5-Coder-1.5B-Q8_0.gguf", + "accepted": accepted, + "timestamp": datetime.now().isoformat(), + "time": 4294, + "uniqueId": str(uuid.uuid4()), + "numLines": 1, + "cacheHit": False, + } + + async with httpx.AsyncClient() as client: + response = await client.post( + TARGET_URL, + headers=headers, + json=body, + timeout=15.0, + ) + print(f"atrack get response: {response.json()}") + + +async def single_iteration(prefix, suffix): + await asyncio.sleep(0.001) + completion = await agenerate(prefix=prefix, suffix=suffix) + await asyncio.sleep(0.001) + await atrack(prefix=prefix, suffix=suffix, completion=completion, accepted=True) + + +async def loop(): + prefix = "if x[j] > x[j + 1]:\n x[j], 
x[j + 1] = x[j + 1], x[j]\n return x\n\ndef han" + suffix = '\n# ————————————————————————————————————————————————— Agent ————————————————————————————————————————————————— #\n# Agent equips the Chat model with the tools needed to handle a wide range of coding tasks, allowing\n# the model to make decisions and save you the work of manually finding context and performing actions.\n\n# 1. Switch from "Chat" to "Agent" mode using the dropdown in the bottom left of the input box' + + tasks = [] + for i in range(batch_size * epoch): + task = asyncio.create_task(single_iteration(prefix, suffix)) + tasks.append(task) + + if i % batch_size == 0: + await asyncio.gather(*tasks) + tasks = [] + + await asyncio.gather(*tasks) + + +if __name__ == "__main__": + asyncio.run(loop()) diff --git a/examples/embodiment/config/env/eval/libero_130.yaml b/examples/embodiment/config/env/eval/libero_130.yaml new file mode 100644 index 000000000..c30f619af --- /dev/null +++ b/examples/embodiment/config/env/eval/libero_130.yaml @@ -0,0 +1,27 @@ +simulator_type: libero +task_suite_name: libero_130 + +auto_reset: True +ignore_terminations: True +max_episode_steps: 512 + +use_rel_reward: True +reward_coef: 5.0 +only_eval: True + +seed: 0 +num_group: ${env.eval.num_envs} +group_size: 1 +use_fixed_reset_state_ids: True +num_images_in_input: 1 + +num_envs: 500 + +video_cfg: + save_video: True + info_on_video: True + video_base_dir: ${runner.logger.log_path}/video/train + +init_params: + camera_heights: 256 + camera_widths: 256 \ No newline at end of file diff --git a/examples/embodiment/config/env/eval/libero_90.yaml b/examples/embodiment/config/env/eval/libero_90.yaml new file mode 100644 index 000000000..be380c000 --- /dev/null +++ b/examples/embodiment/config/env/eval/libero_90.yaml @@ -0,0 +1,27 @@ +simulator_type: libero +task_suite_name: libero_90 + +auto_reset: True +ignore_terminations: True +max_episode_steps: 512 + +use_rel_reward: True +reward_coef: 5.0 +only_eval: True + +seed: 0 
+num_group: ${env.eval.num_envs} +group_size: 1 +use_fixed_reset_state_ids: True +num_images_in_input: 1 + +num_envs: 500 + +video_cfg: + save_video: True + info_on_video: True + video_base_dir: ${runner.logger.log_path}/video/train + +init_params: + camera_heights: 256 + camera_widths: 256 \ No newline at end of file diff --git a/examples/embodiment/config/env/eval/maniskill_ood_template.yaml b/examples/embodiment/config/env/eval/maniskill_ood_template.yaml index d224576fb..42e1279a2 100644 --- a/examples/embodiment/config/env/eval/maniskill_ood_template.yaml +++ b/examples/embodiment/config/env/eval/maniskill_ood_template.yaml @@ -16,7 +16,7 @@ video_cfg: video_base_dir: ${runner.logger.log_path}/video/eval init_params: - id: null + id: "PutOnPlateInScene25Main-v3" num_envs: ${env.eval.num_envs} obs_mode: ${env.train.init_params.obs_mode} control_mode: None @@ -25,4 +25,4 @@ init_params: max_episode_steps: ${env.train.init_params.max_episode_steps} sensor_configs: ${env.train.init_params.sensor_configs} render_mode: sensors - obj_set: test \ No newline at end of file + obj_set: train \ No newline at end of file diff --git a/examples/embodiment/config/env/train/PutOnPlateInScene25Main.yaml b/examples/embodiment/config/env/train/PutOnPlateInScene25Main.yaml index 39083cefb..45fc9a424 100644 --- a/examples/embodiment/config/env/train/PutOnPlateInScene25Main.yaml +++ b/examples/embodiment/config/env/train/PutOnPlateInScene25Main.yaml @@ -10,7 +10,7 @@ only_eval: False max_episode_steps: 80 video_cfg: - save_video: True + save_video: False info_on_video: True video_base_dir: ${runner.logger.log_path}/video/train diff --git a/examples/embodiment/config/env/train/libero_10_ppo.yaml b/examples/embodiment/config/env/train/libero_10.yaml similarity index 91% rename from examples/embodiment/config/env/train/libero_10_ppo.yaml rename to examples/embodiment/config/env/train/libero_10.yaml index 0147d3b1a..d6dc7807a 100644 --- 
a/examples/embodiment/config/env/train/libero_10_ppo.yaml +++ b/examples/embodiment/config/env/train/libero_10.yaml @@ -24,4 +24,6 @@ video_cfg: init_params: camera_heights: 256 - camera_widths: 256 \ No newline at end of file + camera_widths: 256 + +use_ordered_reset_state_ids: True diff --git a/examples/embodiment/config/env/train/libero_130.yaml b/examples/embodiment/config/env/train/libero_130.yaml new file mode 100644 index 000000000..5be83a8a0 --- /dev/null +++ b/examples/embodiment/config/env/train/libero_130.yaml @@ -0,0 +1,29 @@ +simulator_type: libero +task_suite_name: libero_130 + +auto_reset: ${algorithm.auto_reset} +ignore_terminations: ${algorithm.ignore_terminations} +max_episode_steps: 512 + +use_rel_reward: True +reward_coef: 5.0 +only_eval: False + +seed: 0 +num_group: ${algorithm.num_group_envs} +group_size: ${algorithm.group_size} +use_fixed_reset_state_ids: ${algorithm.use_fixed_reset_state_ids} +num_images_in_input: 1 + +num_envs: ${multiply:${algorithm.group_size}, ${algorithm.num_group_envs}} + +video_cfg: + save_video: True + info_on_video: True + video_base_dir: ${runner.logger.log_path}/video/train + +init_params: + camera_heights: 256 + camera_widths: 256 + +use_ordered_reset_state_ids: True \ No newline at end of file diff --git a/examples/embodiment/config/env/train/libero_10_grpo.yaml b/examples/embodiment/config/env/train/libero_90.yaml similarity index 95% rename from examples/embodiment/config/env/train/libero_10_grpo.yaml rename to examples/embodiment/config/env/train/libero_90.yaml index d25e1a57b..97c074b87 100644 --- a/examples/embodiment/config/env/train/libero_10_grpo.yaml +++ b/examples/embodiment/config/env/train/libero_90.yaml @@ -1,5 +1,5 @@ simulator_type: libero -task_suite_name: libero_10 +task_suite_name: libero_90 auto_reset: ${algorithm.auto_reset} ignore_terminations: ${algorithm.ignore_terminations} diff --git a/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml 
b/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml index 3f15ff2ba..525b1c951 100644 --- a/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml @@ -1,5 +1,5 @@ defaults: - - env/train: libero_10_grpo + - env/train: libero_10 - env/eval: libero_10 - override hydra/job_logging: stdout @@ -45,7 +45,7 @@ algorithm: n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} num_group_envs: 8 rollout_epoch: 8 - reward_type: step_level # step_level or chunk_level + reward_type: action_level # action_level or chunk_level logprob_type: token_level entropy_type: token_level @@ -140,6 +140,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 2.0e-5 @@ -156,6 +157,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml b/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml index f409707d9..69708e7c0 100644 --- a/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml +++ b/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml @@ -1,5 +1,5 @@ defaults: - - env/train: libero_10_grpo + - env/train: libero_10 - env/eval: libero_10 - override hydra/job_logging: stdout @@ -45,7 +45,7 @@ algorithm: n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} num_group_envs: 8 rollout_epoch: 8 - reward_type: step_level # step_level or chunk_level + reward_type: action_level # action_level or chunk_level logprob_type: token_level entropy_type: token_level @@ -141,6 +141,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False 
optim: lr: 2.0e-5 @@ -157,6 +158,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/libero_10_ppo_openvlaoft.yaml b/examples/embodiment/config/libero_10_ppo_openvlaoft.yaml index ba0d90d0d..15b33bbbc 100644 --- a/examples/embodiment/config/libero_10_ppo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_10_ppo_openvlaoft.yaml @@ -1,5 +1,5 @@ defaults: - - env/train: libero_10_ppo + - env/train: libero_10 - env/eval: libero_10 - override hydra/job_logging: stdout @@ -135,6 +135,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 5e-6 @@ -151,6 +152,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/libero_130_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_130_grpo_openvlaoft.yaml new file mode 100644 index 000000000..1e71c0868 --- /dev/null +++ b/examples/embodiment/config/libero_130_grpo_openvlaoft.yaml @@ -0,0 +1,164 @@ +defaults: + - env/train: libero_130 + - env/eval: libero_130 + - override hydra/job_logging: stdout + +hydra: + run: + dir: . 
+ output_subdir: null + searchpath: + - file://${oc.env:EMBODIED_PATH}/config/ + +cluster: + num_nodes: 1 + component_placement: + actor,env,rollout: all + +runner: + task_type: embodied + logger: + log_path: "../results" + project_name: rlinf + experiment_name: "test_openvla" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1000 + max_steps: -1 + + only_eval: False + val_check_interval: -1 + save_interval: 25 + seq_length: 4096 + max_prompt_length: 128 + +algorithm: + auto_reset: False + ignore_terminations: False + use_fixed_reset_state_ids: True + require_values: False + shuffle_samples: True + normalize_advantages: True + kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty + group_size: 8 + n_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} + n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} + num_group_envs: 8 + rollout_epoch: 8 + reward_type: step_level # step_level or chunk_level + logprob_type: token_level + entropy_type: token_level + + adv_type: embodied_grpo + loss_type: embodied_grpo + loss_agg_func: "token-mean" + kl_beta: 0.0 + entropy_bonus: 0 + clip_ratio_high: 0.28 + clip_ratio_low: 0.2 + clip_ratio_c: 3.0 + value_clip: 0.2 + huber_delta: 10.0 + + gamma: 0.99 + gae_lambda: 0.95 + + sampling_params: + use_greedy: False + temperature_train: 1.6 + temperature_eval: 1.6 + top_k: -1 + top_p: 1.0 + repetition_penalty: 1.0 + + # length argument for autoregressive sampling + # max length means max amount of tokens to generate + length_params: + max_new_token: null + max_length: 1024 + min_length: 1 + + filter_rewards: True + rewards_lower_bound: 0.5 + rewards_upper_bound: 4.5 + +env: + group_name: "EnvGroup" + channel: + name: "env_buffer_list" + queue_name: "obs_buffer" + queue_size: 0 + enable_offload: False + +rollout: + group_name: "RolloutGroup" + channel: + name: ${env.channel.name} + queue_name: "action_buffer" + queue_size: 0 + 
mode: "colocate" + generation_backend: "huggingface" + model_dir: "/path/to/model/Openvla-oft-SFT-libero130-traj1/" + + enable_offload: False + pipeline_stage_num: 1 + +actor: + group_name: "ActorGroup" + channel: + name: ${env.channel.name} + queue_name: "replay_buffer" + queue_size: 0 + training_backend: "fsdp" + checkpoint_load_path: "/path/to/model/Openvla-oft-SFT-libero130-traj1/" + checkpoint_save_path: "../results" + micro_batch_size: 32 + global_batch_size: 8192 + seed: 1234 + enable_offload: False + + model: + model_name: "openvla_oft" + value_type: ${algorithm.reward_type} # 'action' or 'token' + action_dim: 7 + num_action_chunks: 8 + use_proprio: False + unnorm_key: libero_130_no_noops + center_crop: True + + precision: "bf16" + add_bias_linear: False + add_qkv_bias: True + vocab_size: 32000 + hidden_size: 4096 + policy_setup: "widowx_bridge" + vh_mode: "a0" + image_size: [224, 224] + is_lora: False + lora_rank: 32 + num_images_in_input: 1 + attn_implementation: "flash_attention_2" + low_cpu_mem_usage: True + trust_remote_code: True + gradient_checkpointing: False + + optim: + lr: 2.0e-5 + value_lr: 3.0e-3 + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1.0e-05 + clip_grad: 1.0 + + tokenizer: + tokenizer_type: "HuggingFaceTokenizer" + tokenizer_model: "/path/to/model/RLinf-OpenVLAOFT-GRPO-LIBERO-130/" + use_fast: False + trust_remote_code: True + padding_side: "right" + +reward: + use_reward_model: False + +critic: + use_critic_model: False \ No newline at end of file diff --git a/examples/embodiment/config/libero_90_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_90_grpo_openvlaoft.yaml new file mode 100644 index 000000000..ec400877f --- /dev/null +++ b/examples/embodiment/config/libero_90_grpo_openvlaoft.yaml @@ -0,0 +1,164 @@ +defaults: + - env/train: libero_90 + - env/eval: libero_90 + - override hydra/job_logging: stdout + +hydra: + run: + dir: . 
+ output_subdir: null + searchpath: + - file://${oc.env:EMBODIED_PATH}/config/ + +cluster: + num_nodes: 1 + component_placement: + actor,env,rollout: all + +runner: + task_type: embodied + logger: + log_path: "../results" + project_name: rlinf + experiment_name: "test_openvla" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1000 + max_steps: -1 + + only_eval: False + val_check_interval: -1 + save_interval: 25 + seq_length: 4096 + max_prompt_length: 128 + +algorithm: + auto_reset: False + ignore_terminations: False + use_fixed_reset_state_ids: True + require_values: False + shuffle_samples: True + normalize_advantages: True + kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty + group_size: 8 + n_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} + n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} + num_group_envs: 8 + rollout_epoch: 8 + reward_type: step_level # step_level or chunk_level + logprob_type: token_level + entropy_type: token_level + + adv_type: embodied_grpo + loss_type: embodied_grpo + loss_agg_func: "token-mean" + kl_beta: 0.0 + entropy_bonus: 0 + clip_ratio_high: 0.28 + clip_ratio_low: 0.2 + clip_ratio_c: 3.0 + value_clip: 0.2 + huber_delta: 10.0 + + gamma: 0.99 + gae_lambda: 0.95 + + sampling_params: + use_greedy: False + temperature_train: 1.6 + temperature_eval: 1.6 + top_k: -1 + top_p: 1.0 + repetition_penalty: 1.0 + + # length argument for autoregressive sampling + # max length means max amount of tokens to generate + length_params: + max_new_token: null + max_length: 1024 + min_length: 1 + + filter_rewards: True + rewards_lower_bound: 0.5 + rewards_upper_bound: 4.5 + +env: + group_name: "EnvGroup" + channel: + name: "env_buffer_list" + queue_name: "obs_buffer" + queue_size: 0 + enable_offload: False + +rollout: + group_name: "RolloutGroup" + channel: + name: ${env.channel.name} + queue_name: "action_buffer" + queue_size: 0 + 
mode: "colocate" + generation_backend: "huggingface" + model_dir: "/path/to/model/RLinf-OpenVLAOFT-GRPO-LIBERO-90/" + + enable_offload: False + pipeline_stage_num: 1 + +actor: + group_name: "ActorGroup" + channel: + name: ${env.channel.name} + queue_name: "replay_buffer" + queue_size: 0 + training_backend: "fsdp" + checkpoint_load_path: "/path/to/model/RLinf-OpenVLAOFT-GRPO-LIBERO-90/" + checkpoint_save_path: "../results" + micro_batch_size: 32 + global_batch_size: 8192 + seed: 1234 + enable_offload: False + + model: + model_name: "openvla_oft" + value_type: ${algorithm.reward_type} # 'action' or 'token' + action_dim: 7 + num_action_chunks: 8 + use_proprio: False + unnorm_key: libero_90_no_noops + center_crop: True + + precision: "bf16" + add_bias_linear: False + add_qkv_bias: True + vocab_size: 32000 + hidden_size: 4096 + policy_setup: "widowx_bridge" + vh_mode: "a0" + image_size: [224, 224] + is_lora: False + lora_rank: 32 + num_images_in_input: 1 + attn_implementation: "flash_attention_2" + low_cpu_mem_usage: True + trust_remote_code: True + gradient_checkpointing: False + + optim: + lr: 2.0e-5 + value_lr: 3.0e-3 + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1.0e-05 + clip_grad: 1.0 + + tokenizer: + tokenizer_type: "HuggingFaceTokenizer" + tokenizer_model: "/path/to/model/RLinf-OpenVLAOFT-GRPO-LIBERO-90/" + use_fast: False + trust_remote_code: True + padding_side: "right" + +reward: + use_reward_model: False + +critic: + use_critic_model: False \ No newline at end of file diff --git a/examples/embodiment/config/libero_goal_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_goal_grpo_openvlaoft.yaml index 80a691848..699d7ab40 100644 --- a/examples/embodiment/config/libero_goal_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_goal_grpo_openvlaoft.yaml @@ -45,7 +45,7 @@ algorithm: n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} num_group_envs: 8 rollout_epoch: 8 - reward_type: step_level # 
step_level or chunk_level + reward_type: action_level # action_level or chunk_level logprob_type: token_level entropy_type: token_level @@ -139,6 +139,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 2.0e-5 @@ -155,6 +156,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/libero_object_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_object_grpo_openvlaoft.yaml index 37326ef1e..d2bacd05e 100644 --- a/examples/embodiment/config/libero_object_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_object_grpo_openvlaoft.yaml @@ -45,7 +45,7 @@ algorithm: n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} num_group_envs: 8 rollout_epoch: 8 - reward_type: step_level # step_level or chunk_level + reward_type: action_level # action_level or chunk_level logprob_type: token_level entropy_type: token_level @@ -139,6 +139,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 2.0e-5 @@ -155,6 +156,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/libero_spatial_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_spatial_grpo_openvlaoft.yaml index eb6f1cf0c..9469166d9 100644 --- a/examples/embodiment/config/libero_spatial_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_spatial_grpo_openvlaoft.yaml @@ -45,7 +45,7 @@ algorithm: n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} num_group_envs: 8 
rollout_epoch: 8 - reward_type: step_level # step_level or chunk_level + reward_type: action_level # action_level or chunk_level logprob_type: token_level entropy_type: token_level @@ -139,6 +139,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 2.0e-5 @@ -155,6 +156,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/maniskill_grpo_openvla.yaml b/examples/embodiment/config/maniskill_grpo_openvla.yaml index f947e658a..3679d533d 100644 --- a/examples/embodiment/config/maniskill_grpo_openvla.yaml +++ b/examples/embodiment/config/maniskill_grpo_openvla.yaml @@ -1,6 +1,6 @@ defaults: - - env/train: PutCarrotOnPlateInScene - - env/eval: PutCarrotOnPlateInScene + - env/train: PutOnPlateInScene25Main + - env/eval: PutOnPlateInScene25Main - override hydra/job_logging: stdout hydra: @@ -12,10 +12,11 @@ hydra: cluster: num_nodes: 1 + num_gpus_per_node: 8 component_placement: - actor: 0-3 - env: 4-5 - rollout: 6-7 + actor: 0-7 + env: 0-3 + rollout: 4-7 runner: task_type: embodied @@ -38,16 +39,19 @@ algorithm: auto_reset: False ignore_terminations: False use_fixed_reset_state_ids: True + require_values: False - shuffle_samples: True normalize_advantages: True kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty group_size: 8 - n_chunk_steps: 80 - n_eval_steps: 80 - num_group_envs: 16 + + num_group_envs: 32 rollout_epoch: 1 - reward_type: step_level + + n_chunk_steps: 80 + n_eval_chunk_steps: 80 + + reward_type: action_level logprob_type: token_level entropy_type: token_level @@ -65,6 +69,7 @@ algorithm: gamma: 0.99 gae_lambda: 0.95 + # params for generation sampling_params: use_greedy: False temperature_train: 1.0 @@ -86,7 +91,7 @@ env: name: "env_buffer_list" queue_name: 
"obs_buffer" queue_size: 0 - enable_offload: True + enable_offload: False rollout: group_name: "RolloutGroup" @@ -123,7 +128,6 @@ actor: model: model_name: "openvla" - value_type: ${algorithm.reward_type} # 'action' or 'token' action_dim: 7 num_action_chunks: 1 use_proprio: False @@ -144,6 +148,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 1.0e-5 @@ -153,6 +158,12 @@ actor: adam_eps: 1.0e-05 clip_grad: 1.0 + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml b/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml index 2e2a6cc1f..7bd32855b 100644 --- a/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml @@ -1,6 +1,6 @@ defaults: - - env/train: libero_10_grpo - - env/eval: libero_10 + - env/train: PutOnPlateInScene25Main + - env/eval: maniskill_ood_template - override hydra/job_logging: stdout hydra: @@ -13,9 +13,9 @@ hydra: cluster: num_nodes: 1 component_placement: - actor: 0-3 - env: 4-5 - rollout: 6-7 + actor: 0-7 + env: 0-7 + rollout: 0-7 runner: task_type: embodied @@ -30,7 +30,7 @@ runner: only_eval: False val_check_interval: -1 - save_interval: 25 + save_interval: 40 seq_length: 4096 max_prompt_length: 30 @@ -39,15 +39,14 @@ algorithm: ignore_terminations: False use_fixed_reset_state_ids: True require_values: False - shuffle_samples: True normalize_advantages: True kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty group_size: 8 n_chunk_steps: 10 n_eval_chunk_steps: 10 - num_group_envs: 8 + num_group_envs: 32 rollout_epoch: 1 - reward_type: step_level # step_level or chunk_level + reward_type: action_level # action_level or chunk_level logprob_type: token_level entropy_type: token_level @@ -65,11 +64,12 @@ 
algorithm: gamma: 0.99 gae_lambda: 0.95 + # params for rollout sampling_params: use_greedy: False temperature_train: 1.0 temperature_eval: 0.6 - top_k: 50 + top_k: 0 top_p: 1.0 repetition_penalty: 1.0 @@ -86,7 +86,7 @@ env: name: "env_buffer_list" queue_name: "obs_buffer" queue_size: 0 - enable_offload: True + enable_offload: False rollout: group_name: "RolloutGroup" @@ -97,8 +97,8 @@ rollout: mode: "colocate" generation_backend: "huggingface" model_dir: "/path/to/model/Openvla-oft-SFT-libero10-trajall/" - enable_offload: False - pipeline_stage_num: 2 + enable_offload: True + pipeline_stage_num: 1 actor: group_name: "ActorGroup" @@ -109,8 +109,8 @@ actor: training_backend: "fsdp" checkpoint_load_path: "/path/to/model/Openvla-oft-SFT-libero10-trajall/" checkpoint_save_path: "../results" - micro_batch_size: 8 - global_batch_size: 160 + micro_batch_size: 40 + global_batch_size: 640 seed: 1234 enable_offload: True @@ -139,6 +139,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False tokenizer: tokenizer_type: "HuggingFaceTokenizer" @@ -148,12 +149,18 @@ actor: padding_side: "right" optim: - lr: 5.0e-6 + lr: 1.0e-4 value_lr: 3.0e-3 adam_beta1: 0.9 - adam_beta2: 0.95 + adam_beta2: 0.999 adam_eps: 1.0e-05 - clip_grad: 1.0 + clip_grad: 10.0 + + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False reward: use_reward_model: False diff --git a/examples/embodiment/config/maniskill_ppo_openvla.yaml b/examples/embodiment/config/maniskill_ppo_openvla.yaml index f76f18830..bf9cab8eb 100644 --- a/examples/embodiment/config/maniskill_ppo_openvla.yaml +++ b/examples/embodiment/config/maniskill_ppo_openvla.yaml @@ -1,6 +1,6 @@ defaults: - - env/train: PutCarrotOnPlateInScene - - env/eval: PutCarrotOnPlateInScene + - env/train: PutOnPlateInScene25Main + - env/eval: maniskill_ood_template - override hydra/job_logging: stdout hydra: @@ -36,18 +36,17 @@ 
runner: algorithm: auto_reset: True - ignore_terminations: True + ignore_terminations: False use_fixed_reset_state_ids: False require_values: True - shuffle_samples: True normalize_advantages: True kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty group_size: 1 - n_chunk_steps: 80 + n_chunk_steps: 160 n_eval_chunk_steps: 80 - num_group_envs: 120 + num_group_envs: 128 rollout_epoch: 1 - reward_type: step_level + reward_type: action_level logprob_type: action_level entropy_type: action_level @@ -70,7 +69,7 @@ algorithm: use_greedy: False temperature_train: 1.0 temperature_eval: 0.6 - top_k: 50 + top_k: 0 top_p: 1.0 repetition_penalty: 1.0 @@ -87,7 +86,7 @@ env: name: "env_buffer_list" queue_name: "obs_buffer" queue_size: 0 - enable_offload: True + enable_offload: False rollout: group_name: "RolloutGroup" @@ -110,8 +109,8 @@ actor: training_backend: "fsdp" checkpoint_load_path: "/path/to/model/rl4vla/openvla-7b-rlvla-warmup/" checkpoint_save_path: "../results" - micro_batch_size: 20 - global_batch_size: 160 + micro_batch_size: 40 + global_batch_size: 640 seed: 1234 enable_offload: True @@ -138,14 +137,15 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 1.0e-4 value_lr: 3.0e-3 adam_beta1: 0.9 - adam_beta2: 0.95 + adam_beta2: 0.999 adam_eps: 1.0e-05 - clip_grad: 1.0 + clip_grad: 10.0 tokenizer: tokenizer_type: "HuggingFaceTokenizer" @@ -154,6 +154,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/maniskill_ppo_openvla_eval.yaml b/examples/embodiment/config/maniskill_ppo_openvla_eval.yaml index d8aaac28c..034130c7b 100644 --- a/examples/embodiment/config/maniskill_ppo_openvla_eval.yaml +++ b/examples/embodiment/config/maniskill_ppo_openvla_eval.yaml @@ -144,6 +144,7 @@ 
actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 1.0e-4 diff --git a/examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml b/examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml index baad8b402..b81d1390f 100644 --- a/examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml +++ b/examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml @@ -23,7 +23,7 @@ runner: experiment_name: "test_openvla" logger_backends: ["tensorboard"] # wandb, swanlab - max_epochs: 5 + max_epochs: 1000 max_steps: -1 only_eval: False @@ -41,14 +41,14 @@ algorithm: kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty group_size: 1 - n_chunk_steps: 10 - n_eval_chunk_steps: 10 + n_chunk_steps: 80 + n_eval_chunk_steps: 80 # training rollout mbs rollout_micro_batch_size: 64 - num_group_envs: 8 + num_group_envs: 32 rollout_epoch: 1 - reward_type: step_level + reward_type: action_level logprob_type: action_level entropy_type: action_level @@ -123,7 +123,7 @@ actor: checkpoint_save_path: "../results" micro_batch_size: 20 - global_batch_size: 80 + global_batch_size: 160 seed: 1234 enable_offload: True @@ -160,6 +160,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 1.0e-4 @@ -169,6 +170,12 @@ actor: adam_eps: 1.0e-05 clip_grad: 1.0 + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/maniskill_ppo_openvlaoft.yaml b/examples/embodiment/config/maniskill_ppo_openvlaoft.yaml index 35680f9d6..f957f1997 100644 --- a/examples/embodiment/config/maniskill_ppo_openvlaoft.yaml +++ b/examples/embodiment/config/maniskill_ppo_openvlaoft.yaml @@ -1,6 +1,6 @@ defaults: - - env/train: PutCarrotOnPlateInScene - - env/eval: PutCarrotOnPlateInScene + - 
env/train: PutOnPlateInScene25Main + - env/eval: maniskill_ood_template - override hydra/job_logging: stdout hydra: @@ -13,9 +13,9 @@ hydra: cluster: num_nodes: 1 component_placement: - actor: 0-1 - env: 2-3 - rollout: 4-5 + actor: 0-7 + env: 0-3 + rollout: 4-7 runner: task_type: embodied @@ -36,18 +36,18 @@ runner: algorithm: auto_reset: True - ignore_terminations: True + ignore_terminations: False use_fixed_reset_state_ids: False require_values: True shuffle_samples: True normalize_advantages: True kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty group_size: 1 - n_chunk_steps: 10 + n_chunk_steps: 20 n_eval_chunk_steps: 10 - num_group_envs: 64 + num_group_envs: 128 rollout_epoch: 1 - reward_type: chunk_level + reward_type: step_level logprob_type: token_level entropy_type: token_level @@ -71,7 +71,7 @@ algorithm: use_greedy: False temperature_train: 1.0 temperature_eval: 0.6 - top_k: 50 + top_k: 0 top_p: 1.0 repetition_penalty: 1.0 @@ -88,7 +88,7 @@ env: name: "env_buffer_list" queue_name: "obs_buffer" queue_size: 0 - enable_offload: True + enable_offload: False rollout: group_name: "RolloutGroup" @@ -100,7 +100,7 @@ rollout: backend: "huggingface" model_dir: "/path/to/model/Openvla-oft-SFT-libero10-trajall/" enable_offload: True - pipeline_stage_num: 2 + pipeline_stage_num: 1 actor: group_name: "ActorGroup" @@ -111,8 +111,8 @@ actor: training_backend: "fsdp" checkpoint_load_path: "/path/to/model/Openvla-oft-SFT-libero10-trajall/" checkpoint_save_path: "../results" - micro_batch_size: 8 - global_batch_size: 160 + micro_batch_size: 40 + global_batch_size: 640 seed: 1234 enable_offload: True @@ -142,6 +142,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 1.0e-4 @@ -158,6 +159,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: 
use_reward_model: False diff --git a/examples/embodiment/config/robotwin_ppo_openvlaoft.yaml b/examples/embodiment/config/robotwin_ppo_openvlaoft.yaml index 48686107a..56d936659 100644 --- a/examples/embodiment/config/robotwin_ppo_openvlaoft.yaml +++ b/examples/embodiment/config/robotwin_ppo_openvlaoft.yaml @@ -141,6 +141,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 1.0e-4 @@ -157,6 +158,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/eval_embodiment.sh b/examples/embodiment/eval_embodiment.sh index eec38eee0..316076ae8 100644 --- a/examples/embodiment/eval_embodiment.sh +++ b/examples/embodiment/eval_embodiment.sh @@ -7,12 +7,10 @@ export SRC_FILE="${EMBODIED_PATH}/eval_embodied_agent.py" export MUJOCO_GL="osmesa" export PYOPENGL_PLATFORM="osmesa" export PYTHONPATH=${REPO_PATH}:$PYTHONPATH -# NOTE: set LIBERO_REPO_PATH to the path of the LIBERO repo -export LIBERO_REPO_PATH="/path/to/repo/LIBERO" -# NOTE: set LIBERO_CONFIG_PATH for libero/libero/__init__.py -export LIBERO_CONFIG_PATH=${LIBERO_REPO_PATH} -export PYTHONPATH=${LIBERO_REPO_PATH}:$PYTHONPATH +LIBERO_PATH=/opt/libero +export PYTHONPATH=${LIBERO_PATH}:$PYTHONPATH + export CUDA_LAUNCH_BLOCKING=1 export HYDRA_FULL_ERROR=1 diff --git a/examples/embodiment/eval_all.sh b/examples/embodiment/eval_mani_ood.sh similarity index 51% rename from examples/embodiment/eval_all.sh rename to examples/embodiment/eval_mani_ood.sh index 86d4a9c4d..a9b49a8fe 100644 --- a/examples/embodiment/eval_all.sh +++ b/examples/embodiment/eval_mani_ood.sh @@ -5,24 +5,12 @@ export EMBODIED_PATH="$( cd "$(dirname "${BASH_SOURCE[0]}" )" && pwd )" export REPO_PATH=$(dirname $(dirname "$EMBODIED_PATH")) export 
SRC_FILE="${EMBODIED_PATH}/eval_embodied_agent.py" -export MUJOCO_GL="egl" -export PYOPENGL_PLATFORM="egl" -export PYTHONPATH=${REPO_PATH}:$PYTHONPATH -# NOTE: set LIBERO_REPO_PATH to the path of the LIBERO repo -export LIBERO_REPO_PATH="/path/to/repo/LIBERO" -# NOTE: set LIBERO_CONFIG_PATH for libero/libero/__init__.py -export LIBERO_CONFIG_PATH=${LIBERO_REPO_PATH} - -export PYTHONPATH=${LIBERO_REPO_PATH}:$PYTHONPATH export CUDA_LAUNCH_BLOCKING=1 export HYDRA_FULL_ERROR=1 - -if [ -z "$1" ]; then - CONFIG_NAME="maniskill_ppo_openvla_eval" -else - CONFIG_NAME=$1 -fi +EVAL_NAME=YOUR_EVAL_NAME +CKPT_PATH=YOUR_CKPT_PATH # .pt file +CONFIG_NAME=YOUR_CFG_NAME # env.eval must be maniskill_ood_template for env_id in \ "PutOnPlateInScene25VisionImage-v1" "PutOnPlateInScene25VisionTexture03-v1" "PutOnPlateInScene25VisionTexture05-v1" \ @@ -32,14 +20,34 @@ for env_id in \ "PutOnPlateInScene25Position-v1" "PutOnPlateInScene25EEPose-v1" "PutOnPlateInScene25PositionChangeTo-v1" ; \ do obj_set="test" - LOG_DIR="${REPO_PATH}/logs/eval/$(date +'%Y%m%d-%H:%M:%S')-${env_id}-${obj_set}" + LOG_DIR="${REPO_PATH}/logs/eval/${EVAL_NAME}/$(date +'%Y%m%d-%H:%M:%S')-${env_id}-${obj_set}" MEGA_LOG_FILE="${LOG_DIR}/run_ppo.log" mkdir -p "${LOG_DIR}" - CMD="python ${SRC_FILE} --config-path ${EMBODIED_PATH}/config/ --config-name ${CONFIG_NAME} \ + CMD="python ${SRC_FILE} --config-path ${EMBODIED_PATH}/config/ \ + --config-name ${CONFIG_NAME} \ runner.logger.log_path=${LOG_DIR} \ env.eval.init_params.id=${env_id} \ - env.eval.init_params.obj_set=$obj_set" + env.eval.init_params.obj_set=${obj_set} \ + actor.model.ckpt_path=${CKPT_PATH}" echo ${CMD} > ${MEGA_LOG_FILE} ${CMD} 2>&1 | tee -a ${MEGA_LOG_FILE} +done + +for env_id in \ + "PutOnPlateInScene25Carrot-v1" "PutOnPlateInScene25MultiCarrot-v1" \ + "PutOnPlateInScene25MultiPlate-v1" ; \ +do + obj_set="train" + LOG_DIR="${REPO_PATH}/logs/eval/${EVAL_NAME}/$(date +'%Y%m%d-%H:%M:%S')-${env_id}-${obj_set}" + MEGA_LOG_FILE="${LOG_DIR}/run_ppo.log" + 
mkdir -p "${LOG_DIR}" + CMD="python ${SRC_FILE} --config-path ${EMBODIED_PATH}/config/ \ + --config-name ${CONFIG_NAME} \ + runner.logger.log_path=${LOG_DIR} \ + env.eval.init_params.id=${env_id} \ + env.eval.init_params.obj_set=${obj_set} \ + actor.model.ckpt_path=${CKPT_PATH}" + echo ${CMD} > ${MEGA_LOG_FILE} + ${CMD} 2>&1 | tee -a ${MEGA_LOG_FILE} done \ No newline at end of file diff --git a/examples/embodiment/run_embodiment.sh b/examples/embodiment/run_embodiment.sh index 19134dfbc..cf5d91d4f 100644 --- a/examples/embodiment/run_embodiment.sh +++ b/examples/embodiment/run_embodiment.sh @@ -7,12 +7,10 @@ export SRC_FILE="${EMBODIED_PATH}/train_embodied_agent.py" export MUJOCO_GL="egl" export PYOPENGL_PLATFORM="egl" export PYTHONPATH=${REPO_PATH}:$PYTHONPATH -# NOTE: set LIBERO_REPO_PATH to the path of the LIBERO repo -export LIBERO_REPO_PATH="/path/to/repo/LIBERO" -# NOTE: set LIBERO_CONFIG_PATH for libero/libero/__init__.py -export LIBERO_CONFIG_PATH=${LIBERO_REPO_PATH} -export PYTHONPATH=${LIBERO_REPO_PATH}:$PYTHONPATH +LIBERO_PATH=/opt/libero +export PYTHONPATH=${LIBERO_PATH}:$PYTHONPATH + export CUDA_LAUNCH_BLOCKING=1 export HYDRA_FULL_ERROR=1 @@ -23,6 +21,7 @@ else CONFIG_NAME=$1 fi +echo "Using Python at $(which python)" LOG_DIR="${REPO_PATH}/logs/$(date +'%Y%m%d-%H:%M:%S')" #/$(date +'%Y%m%d-%H:%M:%S')" MEGA_LOG_FILE="${LOG_DIR}/run_embodiment.log" mkdir -p "${LOG_DIR}" diff --git a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml new file mode 100644 index 000000000..c6e8251ac --- /dev/null +++ b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml @@ -0,0 +1,225 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . 
+ output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: ${runner.output_dir}/${runner.experiment_name} + project_name: rlinf + experiment_name: ${runner.experiment_name} + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 5 + max_steps: -1 + + val_check_interval: 1 + save_interval: 50 + + seq_length: 28672 + + enable_dynamic_batch_size: False + max_tokens_per_mbs: 28672 + + resume_dir: null + experiment_name: fsdp-sglang-512*8-16card-timeout + output_dir: ./logs +algorithm: + group_size: 8 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. + + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: True + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/ + model_arch: 
qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: sglang # here choose which backend to rollout,support [sglang, vllm] + + sglang: + attention_backend: ascend # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 4096 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 128 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. 
If the batch size is larger than this, cuda graph will not be used. + +data: + type: math + dataset_name: boba + max_prompt_length: 1024 + filter_prompt_by_length: True + rollout_batch_size: 32 + val_rollout_batch_size: null + num_workers: 2 + shuffle: True + validation_shuffle: True + seed: 1234 + train_data_paths: ["/home/dataset/boba/AReaL-boba-106k.jsonl"] + val_data_paths: ["/home/dataset/boba/AReaL-boba-106k.jsonl"] + prompt_key: prompt + image_keys: [image] + answer_key: answer + choice_key: choices + solution_key: null + use_chat_template: True + lazy_loading: True + +actor: + group_name: "ActorGroup" + training_backend: fsdp + mcore_gpt: True + spec_name: decoder_gpt + + enable_offload: True + checkpoint_load_path: null + + global_batch_size: 8 + micro_batch_size: 1 + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: bf16 + sharding_strategy: full_shard + is_lora: False + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + model_path: /home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/ + + optim: + optimizer: adam + bf16: True + fp16: False + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/ + use_fast: False + trust_remote_code: True + padding_side: 'right' + + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + +reward: + group_name: "RewardGroup" + use_reward_model: false + reward_type: 'math' + reward_scale: 5.0 + 
reward_weights: + qa_accuracy: 1.0 + think_format: 0.0 + answer_format: 0.0 + + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + +critic: + use_critic_model: false \ No newline at end of file diff --git a/examples/math/config/qwen2.5-1.5b-grpo-megatron-pipeline.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron-pipeline.yaml similarity index 92% rename from examples/math/config/qwen2.5-1.5b-grpo-megatron-pipeline.yaml rename to examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron-pipeline.yaml index a9ad232ef..ea4606003 100644 --- a/examples/math/config/qwen2.5-1.5b-grpo-megatron-pipeline.yaml +++ b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron-pipeline.yaml @@ -12,9 +12,10 @@ cluster: rollout: 0-15 inference: 16-23 actor: 24-63 + reward: 0-15 runner: - task_type: math + task_type: reasoning logger: log_path: ${runner.output_dir}/${runner.experiment_name} project_name: rlinf @@ -65,6 +66,8 @@ algorithm: entropy_bonus: 0.0 calculate_entropy: True clip_ratio_c: null # 3.0 + clip_ratio_low: null # if null or not set, will use ratio_clip_eps + clip_ratio_high: null # if null or not set, will use ratio_clip_eps adv_type: math_grpo normalize_advantages: False @@ -117,6 +120,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. 
return_logprobs: ${not:${algorithm.recompute_logprobs}} @@ -132,6 +137,7 @@ rollout: data: type: math + dataset_name: boba max_prompt_length: 1024 filter_prompt_by_length: True rollout_batch_size: 512 @@ -144,6 +150,7 @@ data: train_data_paths: ["/dataset/boba/AReaL-boba-106k.jsonl"] val_data_paths: ["/dataset/boba/AReaL-boba-106k.jsonl"] + actor: group_name: "ActorGroup" training_backend: megatron @@ -269,9 +276,15 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} critic: use_critic_model: false diff --git a/examples/math/config/qwen2.5-1.5b-grpo-megatron.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml similarity index 91% rename from examples/math/config/qwen2.5-1.5b-grpo-megatron.yaml rename to examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml index c63d1f7be..4d851f894 100644 --- a/examples/math/config/qwen2.5-1.5b-grpo-megatron.yaml +++ b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 16 component_placement: - actor,rollout: all + actor,rollout,reward: all runner: - task_type: math + task_type: reasoning logger: log_path: ${runner.output_dir}/${runner.experiment_name} project_name: rlinf @@ -61,6 +61,8 @@ algorithm: entropy_bonus: 0.0 calculate_entropy: False clip_ratio_c: null # 3.0 + clip_ratio_low: null # if null or not set, will use ratio_clip_eps + clip_ratio_high: null # if null or not set, will use ratio_clip_eps adv_type: math_grpo normalize_advantages: True @@ -104,6 +106,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. 
enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. return_logprobs: ${not:${algorithm.recompute_logprobs}} @@ -119,6 +123,7 @@ rollout: data: type: math + dataset_name: boba max_prompt_length: 1024 filter_prompt_by_length: True rollout_batch_size: 512 @@ -256,9 +261,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/examples/math/config/qwen2.5-1.5b-single-gpu.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-single-gpu.yaml similarity index 91% rename from examples/math/config/qwen2.5-1.5b-single-gpu.yaml rename to examples/reasoning/config/math/qwen2.5-1.5b-single-gpu.yaml index af2a94a13..1829050c1 100644 --- a/examples/math/config/qwen2.5-1.5b-single-gpu.yaml +++ b/examples/reasoning/config/math/qwen2.5-1.5b-single-gpu.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 1 component_placement: - actor,rollout: 0 + actor,rollout,reward: 0 runner: - task_type: math + task_type: reasoning logger: log_path: ${runner.output_dir}/${runner.experiment_name} project_name: rlinf @@ -61,6 +61,8 @@ algorithm: entropy_bonus: 0.0 calculate_entropy: False clip_ratio_c: null # 3.0 + clip_ratio_low: null # if null or not set, will use ratio_clip_eps + clip_ratio_high: null # if null or not set, will use ratio_clip_eps adv_type: math_grpo normalize_advantages: True @@ -90,7 +92,7 @@ rollout: detokenize: False # Whether to detokenize the output. 
During RL we actually don't need to detokenize it. Can be set to True for debugging. padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine eos: null # will be tokenizer.eos_token_id if null. - + rollout_backend: sglang # here choose which backend to rollout,support [sglang, vllm] sglang: @@ -104,6 +106,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. return_logprobs: ${not:${algorithm.recompute_logprobs}} @@ -237,7 +241,7 @@ actor: process_num: 16 # number of processes to use for checkpointing tensor_model_parallel_size: ${actor.model.tensor_model_parallel_size} pipeline_model_parallel_size: ${actor.model.pipeline_model_parallel_size} - + profiler: # profile megatron when inference and traning output_dir: ${runner.output_dir}/${runner.experiment_name}/profiler activities: ["cpu", "cuda"] @@ -256,9 +260,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/examples/math/config/qwen2.5-32b-grpo-megatron.yaml b/examples/reasoning/config/math/qwen2.5-32b-grpo-megatron.yaml similarity index 92% rename from examples/math/config/qwen2.5-32b-grpo-megatron.yaml rename to examples/reasoning/config/math/qwen2.5-32b-grpo-megatron.yaml index 
4aa80cb4c..e9eb2089e 100644 --- a/examples/math/config/qwen2.5-32b-grpo-megatron.yaml +++ b/examples/reasoning/config/math/qwen2.5-32b-grpo-megatron.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 32 component_placement: - actor,rollout: all + actor,rollout,reward: all runner: - task_type: math + task_type: reasoning logger: log_path: ${runner.output_dir}/${runner.experiment_name} project_name: rlinf @@ -61,6 +61,8 @@ algorithm: entropy_bonus: 0.0 calculate_entropy: False clip_ratio_c: null # 3.0 + clip_ratio_low: null # if null or not set, will use ratio_clip_eps + clip_ratio_high: null # if null or not set, will use ratio_clip_eps adv_type: math_grpo normalize_advantages: True @@ -104,6 +106,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. 
return_logprobs: ${not:${algorithm.recompute_logprobs}} @@ -257,5 +261,11 @@ reward: reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/examples/math/config/qwen2.5-7b-grpo-megatron.yaml b/examples/reasoning/config/math/qwen2.5-7b-grpo-megatron.yaml similarity index 91% rename from examples/math/config/qwen2.5-7b-grpo-megatron.yaml rename to examples/reasoning/config/math/qwen2.5-7b-grpo-megatron.yaml index 29c4f1d5b..b2a70d6f8 100644 --- a/examples/math/config/qwen2.5-7b-grpo-megatron.yaml +++ b/examples/reasoning/config/math/qwen2.5-7b-grpo-megatron.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 16 component_placement: - actor,rollout: all + actor,rollout,reward: all runner: - task_type: math + task_type: reasoning logger: log_path: ${runner.output_dir}/${runner.experiment_name} project_name: rlinf @@ -61,6 +61,8 @@ algorithm: entropy_bonus: 0.0 calculate_entropy: False clip_ratio_c: null # 3.0 + clip_ratio_low: null # if null or not set, will use ratio_clip_eps + clip_ratio_high: null # if null or not set, will use ratio_clip_eps adv_type: math_grpo normalize_advantages: True @@ -104,6 +106,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. 
return_logprobs: ${not:${algorithm.recompute_logprobs}} @@ -255,9 +259,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/examples/reasoning/config/tp_comm_overlap_cfg.yaml b/examples/reasoning/config/tp_comm_overlap_cfg.yaml new file mode 100644 index 000000000..97fc8b391 --- /dev/null +++ b/examples/reasoning/config/tp_comm_overlap_cfg.yaml @@ -0,0 +1,47 @@ +qkv_fprop: + method: ring_exchange + aggregate: 1 + +proj_fprop: + method: pipeline + num_sm: 4 + num_splits: 4 + set_sm_margin: 0 + +fc1_fprop: + method: ring_exchange + aggregate: 0 + +fc2_fprop: + method: pipeline + num_sm: 4 + num_splits: 4 + set_sm_margin: 0 + +fc2_dgrad: + method: ring_exchange + aggregate: 0 + +fc1_dgrad: + method: bulk + num_sm: 2 + set_sm_margin: 0 + +fc1_wgrad: + method: bulk + num_sm: 2 + set_sm_margin: 0 + +proj_dgrad: + method: ring_exchange + aggregate: 1 + +qkv_dgrad: + method: bulk + num_sm: 2 + set_sm_margin: 0 + +qkv_wgrad: + method: bulk + num_sm: 2 + set_sm_margin: 0 \ No newline at end of file diff --git a/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml b/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml new file mode 100644 index 000000000..17c192fa2 --- /dev/null +++ b/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml @@ -0,0 +1,228 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . 
+ output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: ${runner.output_dir}/${runner.experiment_name} + project_name: rlinf + experiment_name: ${runner.experiment_name} + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 5 + max_steps: -1 + + val_check_interval: 1 + save_interval: 50 + + seq_length: 2048 + + enable_dynamic_batch_size: False + max_tokens_per_mbs: 28672 + + resume_dir: null + experiment_name: grpo-1.5b + output_dir: ../results + +algorithm: + group_size: 8 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. + + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: False + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /home/x00922209/models/Qwen/Qwen2.5-VL-3B-Instruct + model_arch: qwen2.5_vl 
#qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: sglang # here choose which backend to rollout,support [sglang, vllm] + + sglang: + attention_backend: ascend # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. 
If the batch size is larger than this, cuda graph will not be used. + +data: + type: vision_language + dataset_name: robo2vlm + max_prompt_length: 1024 + filter_prompt_by_length: True + rollout_batch_size: 8 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + image_keys: ["image"] # some vlm datasets may have multiple image columns + choice_key: "choices" + answer_key: "answer" + solution_key: "solution" + use_chat_template: True + lazy_loading: True + shuffle: True + validation_shuffle: True + seed: 1234 + train_data_paths: ["/home/x00922209/datasets/advaitgupta/robo2VLM/data/"] + val_data_paths: ["/home/x00922209/datasets/advaitgupta/robo2VLM/data/"] + +actor: + group_name: "ActorGroup" + training_backend: fsdp + mcore_gpt: True + spec_name: decoder_gpt + + enable_offload: True + checkpoint_load_path: null + + global_batch_size: 8 + micro_batch_size: 1 + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: bf16 + sharding_strategy: full_shard + is_lora: False + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + model_path: /home/x00922209/models/Qwen/Qwen2.5-VL-3B-Instruct + + model_arch: ${rollout.model_arch} + + optim: + optimizer: adam + bf16: True #False + fp16: False #True + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /home/x00922209/models/Qwen/Qwen2.5-VL-3B-Instruct + use_fast: False + trust_remote_code: True + padding_side: 'right' + + fsdp: + forward_prefetch: True + limit_all_gathers: True + 
backward_prefetch: True + use_orig_params: True + +reward: + group_name: "RewardGroup" + use_reward_model: false + reward_type: 'vqa' + reward_scale: 1.0 + reward_weights: + qa_accuracy: 1.0 + think_format: 0.0 + answer_format: 0.0 + + tokenizer: + tokenizer_model: /home/x00922209/models/Qwen/Qwen2.5-VL-3B-Instruct + use_fast: False + trust_remote_code: True + padding_side: 'right' + +critic: + use_critic_model: false diff --git a/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp_baak.yaml b/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp_baak.yaml new file mode 100644 index 000000000..fa5b16080 --- /dev/null +++ b/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp_baak.yaml @@ -0,0 +1,222 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: ${runner.output_dir}/${runner.experiment_name} + project_name: rlinf + experiment_name: ${runner.experiment_name} + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 5 + max_steps: -1 + + val_check_interval: 1 + save_interval: 50 + + seq_length: 2048 + + enable_dynamic_batch_size: False + max_tokens_per_mbs: 28672 + + resume_dir: null + experiment_name: grpo-1.5b + output_dir: ../results + +algorithm: + group_size: 8 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. 
+ + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: False + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /home/x00922209/models/Qwen/Qwen2.5-VL-3B-Instruct + model_arch: qwen2.5_vl #qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. taoxu 1010 + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: sglang # here choose which backend to rollout,support [sglang, vllm] + + sglang: + attention_backend: ascend # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. 
+ torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. 
+ +data: + type: vision_language + dataset_name: robo2vlm + max_prompt_length: 1024 + filter_prompt_by_length: True + rollout_batch_size: 8 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + image_keys: ["image"] # some vlm datasets may have multiple image columns + choice_key: "choices" + answer_key: "answer" + solution_key: "solution" + use_chat_template: True + lazy_loading: True + shuffle: True + validation_shuffle: True + seed: 1234 + train_data_paths: ["/home/x00922209/datasets/advaitgupta/robo2VLM/data/"] + val_data_paths: ["/home/x00922209/datasets/advaitgupta/robo2VLM/data/"] + +actor: + group_name: "ActorGroup" + training_backend: fsdp + mcore_gpt: True + spec_name: decoder_gpt + + enable_offload: True + checkpoint_load_path: null + + global_batch_size: 8 + micro_batch_size: 1 + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: bf16 + sharding_strategy: full_shard + is_lora: False + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + model_path: /home/x00922209/models/Qwen/Qwen2.5-VL-3B-Instruct + + model_arch: ${rollout.model_arch} + + optim: + optimizer: adam + bf16: True #False + fp16: False #True + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /home/x00922209/models/Qwen/Qwen2.5-VL-3B-Instruct + use_fast: False + trust_remote_code: True + padding_side: 'right' + +reward: + group_name: "RewardGroup" + use_reward_model: false + reward_type: 'vqa' + reward_scale: 1.0 + reward_weights: + qa_accuracy: 1.0 
+ think_format: 0.0 + answer_format: 0.0 + + tokenizer: + tokenizer_model: /home/x00922209/models/Qwen/Qwen2.5-VL-3B-Instruct + use_fast: False + trust_remote_code: True + padding_side: 'right' + +critic: + use_critic_model: false \ No newline at end of file diff --git a/examples/math/main_math.py b/examples/reasoning/main_grpo.py similarity index 81% rename from examples/math/main_math.py rename to examples/reasoning/main_grpo.py index 7150566fb..30073d562 100644 --- a/examples/math/main_math.py +++ b/examples/reasoning/main_grpo.py @@ -21,12 +21,13 @@ from rlinf.config import validate_cfg from rlinf.data.datasets import create_rl_dataset from rlinf.data.tokenizers import hf_tokenizer -from rlinf.runners.math_runner import MathRunner +from rlinf.runners.reasoning_runner import ReasoningRunner from rlinf.scheduler import Cluster from rlinf.utils.placement import ModelParallelComponentPlacement, PlacementMode from rlinf.utils.utils import output_redirector -from rlinf.workers.actor.megatron_actor_worker import MegatronActor +from rlinf.workers.actor import get_actor_worker from rlinf.workers.inference.megatron_inference_worker import MegatronInference +from rlinf.workers.reward.reward_worker import RewardWorker from rlinf.workers.rollout.utils import get_rollout_backend_worker """Script to start GRPO training""" @@ -67,16 +68,25 @@ def main(cfg) -> None: placement_strategy=inference_placement_strategy, ) + # Reward group + reward_placement_strategy = component_placement.get_strategy("reward") + reward_group = RewardWorker.create_group(cfg, component_placement).launch( + cluster, + name=cfg.reward.group_name, + placement_strategy=reward_placement_strategy, + ) + # GRPO Actor group + actor_worker_cls = get_actor_worker(cfg) actor_placement_strategy = component_placement.get_strategy("actor") - actor_group = MegatronActor.create_group(cfg, component_placement).launch( + actor_group = actor_worker_cls.create_group(cfg, component_placement).launch( cluster, 
name=cfg.actor.group_name, placement_strategy=actor_placement_strategy ) tokenizer = hf_tokenizer(cfg.actor.tokenizer.tokenizer_model) - train_ds, val_ds = create_rl_dataset(cfg.data, tokenizer) + train_ds, val_ds = create_rl_dataset(cfg, tokenizer) - runner = MathRunner( + runner = ReasoningRunner( cfg=cfg, placement=component_placement, train_dataset=train_ds, @@ -84,6 +94,7 @@ def main(cfg) -> None: rollout=rollout_group, inference=inference_group, actor=actor_group, + reward=reward_group, ) runner.init_workers() diff --git a/examples/reasoning/run_main_grpo_math.sh b/examples/reasoning/run_main_grpo_math.sh new file mode 100644 index 000000000..81f4fcd56 --- /dev/null +++ b/examples/reasoning/run_main_grpo_math.sh @@ -0,0 +1,20 @@ +#! /bin/bash +set -x + +tabs 4 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TOKENIZERS_PARALLELISM=false +export RAY_DEDUP_LOGS=0 + +CONFIG_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +REPO_PATH=$(dirname $(dirname "$CONFIG_PATH")) +MEGATRON_PATH=/opt/Megatron-LM +export PYTHONPATH=${REPO_PATH}:${MEGATRON_PATH}:$PYTHONPATH + +if [ -z "$1" ]; then + CONFIG_NAME="qwen2.5-1.5b-grpo-megatron" +else + CONFIG_NAME="qwen2.5-1.5b-grpo-fsdp.yaml" +fi + +python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path ${CONFIG_PATH}/config/math/ --config-name $CONFIG_NAME \ No newline at end of file diff --git a/examples/math/run_main_math_pipeline_grpo_megatron.sh b/examples/reasoning/run_main_grpo_vqa.sh old mode 100644 new mode 100755 similarity index 64% rename from examples/math/run_main_math_pipeline_grpo_megatron.sh rename to examples/reasoning/run_main_grpo_vqa.sh index 7deb96519..3cc526f0e --- a/examples/math/run_main_math_pipeline_grpo_megatron.sh +++ b/examples/reasoning/run_main_grpo_vqa.sh @@ -2,7 +2,6 @@ set -x tabs 4 -export VLLM_ATTENTION_BACKEND=XFORMERS export CUDA_DEVICE_MAX_CONNECTIONS=1 export TOKENIZERS_PARALLELISM=false export RAY_DEDUP_LOGS=0 @@ -13,9 +12,9 @@ MEGATRON_PATH=/opt/Megatron-LM export 
PYTHONPATH=${REPO_PATH}:${MEGATRON_PATH}:$PYTHONPATH if [ -z "$1" ]; then - CONFIG_NAME="qwen2.5-1.5b-grpo-megatron-pipeline" + CONFIG_NAME="qwen2.5-vl-3b-grpo-fsdp" else CONFIG_NAME=$1 fi -python ${REPO_PATH}/examples/math/main_math.py --config-path ${CONFIG_PATH}/config/ --config-name $CONFIG_NAME \ No newline at end of file +python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path ${CONFIG_PATH}/config/vqa/ --config-name $CONFIG_NAME \ No newline at end of file diff --git a/examples/math/run_placement_autotune.sh b/examples/reasoning/run_placement_autotune.sh similarity index 100% rename from examples/math/run_placement_autotune.sh rename to examples/reasoning/run_placement_autotune.sh diff --git a/fusion_result.json b/fusion_result.json new file mode 100644 index 000000000..20b56950d --- /dev/null +++ b/fusion_result.json @@ -0,0 +1,21 @@ +{ + "session_and_graph_id_0_0": { + "graph_fusion": { + "IndexByTensorStaticFusionPass": { + "effect_times": "0", + "match_times": "1" + }, + "RefreshInt64ToInt32FusionPass": { + "effect_times": "0", + "match_times": "1" + } + }, + "ub_fusion": { + "AutomaticUbFusion": { + "effect_times": "0", + "match_times": "2", + "repository_hit_times": "0" + } + } + } +} \ No newline at end of file diff --git a/logs-lxs/cmd.sh b/logs-lxs/cmd.sh new file mode 100644 index 000000000..31c8fc1ee --- /dev/null +++ b/logs-lxs/cmd.sh @@ -0,0 +1,3 @@ +nohup bash examples/reasoning/run_main_grpo_math.sh examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml > test_math.log 2>&1 + +bash examples/reasoning/run_main_grpo_math.sh qwen2.5-1.5b-grpo-fsdp \ No newline at end of file diff --git a/logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/config.json b/logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/config.json new file mode 100644 index 000000000..1e6eeade1 --- /dev/null +++ b/logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/config.json @@ -0,0 +1,205 @@ +{ + "cluster": { + "num_nodes": 1, + 
"component_placement": { + "actor,rollout,reward": "all" + } + }, + "runner": { + "task_type": "reasoning", + "logger": { + "log_path": "./logs/fsdp-sglang-256*8-16card", + "project_name": "rlinf", + "experiment_name": "fsdp-sglang-256*8-16card", + "logger_backends": [ + "tensorboard" + ] + }, + "max_epochs": 5, + "max_steps": -1, + "val_check_interval": 1, + "save_interval": 50, + "seq_length": 28672, + "enable_dynamic_batch_size": false, + "max_tokens_per_mbs": 28672, + "resume_dir": null, + "experiment_name": "fsdp-sglang-256*8-16card", + "output_dir": "./logs" + }, + "algorithm": { + "group_size": 8, + "n_minibatches": 4, + "training_batch_size_per_gpu": 1, + "rollout_batch_size_per_gpu": null, + "logprob_forward_micro_batch_size": 1, + "val_rollout_batch_size_per_gpu": 4, + "recompute_logprobs": true, + "shuffle_rollout": false, + "loss_type": "math_ppo_actor", + "loss_agg_func": "token-mean", + "kl_beta": 0.0, + "kl_penalty_type": "low_var_kl", + "ratio_clip_eps": 0.2, + "entropy_bonus": 0.0, + "calculate_entropy": false, + "clip_ratio_c": null, + "clip_ratio_low": null, + "clip_ratio_high": null, + "adv_type": "math_grpo", + "normalize_advantages": true, + "early_stop_imp_ratio": 5.0, + "use_valid_token_scale": false, + "sampling_params": { + "use_greedy": false, + "temperature": 1.0, + "top_k": 1000000, + "top_p": 1.0, + "repetition_penalty": 1.0, + "max_new_tokens": 27648, + "min_new_tokens": 1 + }, + "max_num_gen_batches": 1 + }, + "rollout": { + "group_name": "RolloutGroup", + "gpu_memory_utilization": 0.55, + "model_dir": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "model_arch": "qwen2.5", + "enforce_eager": false, + "distributed_executor_backend": "mp", + "disable_log_stats": false, + "detokenize": false, + "padding": null, + "eos": null, + "rollout_backend": "sglang", + "sglang": { + "attention_backend": "ascend", + "decode_log_interval": 4096, + "use_torch_compile": false, + "torch_compile_max_bs": 128 + }, + "vllm": { + 
"attention_backend": "FLASH_ATTN", + "enable_chunked_prefill": true, + "enable_prefix_caching": true, + "enable_flash_infer_sampler": true, + "max_num_batched_tokens": null, + "torch_profiler_dir": null + }, + "return_logprobs": false, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "validate_weight": false, + "validate_save_dir": null, + "print_outputs": false, + "max_running_requests": 128, + "cuda_graph_max_bs": 128 + }, + "data": { + "type": "math", + "dataset_name": "boba", + "max_prompt_length": 1024, + "filter_prompt_by_length": true, + "rollout_batch_size": 256, + "val_rollout_batch_size": null, + "num_workers": 2, + "shuffle": true, + "validation_shuffle": true, + "seed": 1234, + "train_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "val_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "prompt_key": "prompt", + "image_keys": [ + "image" + ], + "answer_key": "answer", + "choice_key": "choices", + "solution_key": null, + "use_chat_template": true, + "lazy_loading": true + }, + "actor": { + "group_name": "ActorGroup", + "training_backend": "fsdp", + "mcore_gpt": true, + "spec_name": "decoder_gpt", + "enable_offload": true, + "checkpoint_load_path": null, + "global_batch_size": 512, + "micro_batch_size": 1, + "enable_dp_load_balance": false, + "calculate_flops": false, + "seed": 1234, + "model": { + "precision": "bf16", + "sharding_strategy": "full_shard", + "is_lora": false, + "seq_length": 28672, + "encoder_seq_length": 28672, + "model_path": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/" + }, + "optim": { + "optimizer": "adam", + "bf16": true, + "fp16": false, + "lr": 2e-05, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_eps": 1e-05, + "min_lr": 2e-06, + "weight_decay": 0.05, + "use_distributed_optimizer": true, + "overlap_grad_reduce": false, + "overlap_param_gather": false, + "optimizer_enable_pin": false, + "overlap_param_gather_with_optimizer_step": false, + "clip_grad": 0.8, + "loss_scale": 
65536 + }, + "lr_sched": { + "lr_warmup_fraction": 0.01, + "lr_warmup_init": 0.0, + "lr_warmup_iters": 0, + "max_lr": 2e-05, + "min_lr": 0.0, + "lr_decay_style": "constant", + "lr_decay_iters": 10 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + }, + "fsdp": { + "forward_prefetch": true, + "limit_all_gathers": true, + "backward_prefetch": true, + "use_orig_params": true + } + }, + "reward": { + "group_name": "RewardGroup", + "use_reward_model": false, + "reward_type": "math", + "reward_scale": 5.0, + "reward_weights": { + "qa_accuracy": 1.0, + "think_format": 0.0, + "answer_format": 0.0 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + } + }, + "critic": { + "use_critic_model": false + } +} \ No newline at end of file diff --git a/logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/events.out.tfevents.1761824220.hostname-kjuul.1028618.0 b/logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/events.out.tfevents.1761824220.hostname-kjuul.1028618.0 new file mode 100644 index 000000000..853a8b900 Binary files /dev/null and b/logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/events.out.tfevents.1761824220.hostname-kjuul.1028618.0 differ diff --git a/logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/events.out.tfevents.1761825588.hostname-kjuul.1562049.0 b/logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/events.out.tfevents.1761825588.hostname-kjuul.1562049.0 new file mode 100644 index 000000000..af500a464 Binary files /dev/null and b/logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/events.out.tfevents.1761825588.hostname-kjuul.1562049.0 differ diff --git a/logs-lxs/fsdp-sglang-512*8-16card-kernel-timeout/tensorboard/config.json 
b/logs-lxs/fsdp-sglang-512*8-16card-kernel-timeout/tensorboard/config.json new file mode 100644 index 000000000..13cd40304 --- /dev/null +++ b/logs-lxs/fsdp-sglang-512*8-16card-kernel-timeout/tensorboard/config.json @@ -0,0 +1,205 @@ +{ + "cluster": { + "num_nodes": 1, + "component_placement": { + "actor,rollout,reward": "all" + } + }, + "runner": { + "task_type": "reasoning", + "logger": { + "log_path": "./logs/fsdp-sglang-512*8-16card-timeout", + "project_name": "rlinf", + "experiment_name": "fsdp-sglang-512*8-16card-timeout", + "logger_backends": [ + "tensorboard" + ] + }, + "max_epochs": 5, + "max_steps": -1, + "val_check_interval": 1, + "save_interval": 50, + "seq_length": 28672, + "enable_dynamic_batch_size": false, + "max_tokens_per_mbs": 28672, + "resume_dir": null, + "experiment_name": "fsdp-sglang-512*8-16card-timeout", + "output_dir": "./logs" + }, + "algorithm": { + "group_size": 8, + "n_minibatches": 4, + "training_batch_size_per_gpu": 1, + "rollout_batch_size_per_gpu": null, + "logprob_forward_micro_batch_size": 1, + "val_rollout_batch_size_per_gpu": 4, + "recompute_logprobs": true, + "shuffle_rollout": false, + "loss_type": "math_ppo_actor", + "loss_agg_func": "token-mean", + "kl_beta": 0.0, + "kl_penalty_type": "low_var_kl", + "ratio_clip_eps": 0.2, + "entropy_bonus": 0.0, + "calculate_entropy": false, + "clip_ratio_c": null, + "clip_ratio_low": null, + "clip_ratio_high": null, + "adv_type": "math_grpo", + "normalize_advantages": true, + "early_stop_imp_ratio": 5.0, + "use_valid_token_scale": false, + "sampling_params": { + "use_greedy": false, + "temperature": 1.0, + "top_k": 1000000, + "top_p": 1.0, + "repetition_penalty": 1.0, + "max_new_tokens": 27648, + "min_new_tokens": 1 + }, + "max_num_gen_batches": 1 + }, + "rollout": { + "group_name": "RolloutGroup", + "gpu_memory_utilization": 0.55, + "model_dir": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "model_arch": "qwen2.5", + "enforce_eager": false, + "distributed_executor_backend": 
"mp", + "disable_log_stats": false, + "detokenize": false, + "padding": null, + "eos": null, + "rollout_backend": "sglang", + "sglang": { + "attention_backend": "ascend", + "decode_log_interval": 4096, + "use_torch_compile": false, + "torch_compile_max_bs": 128 + }, + "vllm": { + "attention_backend": "FLASH_ATTN", + "enable_chunked_prefill": true, + "enable_prefix_caching": true, + "enable_flash_infer_sampler": true, + "max_num_batched_tokens": null, + "torch_profiler_dir": null + }, + "return_logprobs": false, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "validate_weight": false, + "validate_save_dir": null, + "print_outputs": false, + "max_running_requests": 128, + "cuda_graph_max_bs": 128 + }, + "data": { + "type": "math", + "dataset_name": "boba", + "max_prompt_length": 1024, + "filter_prompt_by_length": true, + "rollout_batch_size": 512, + "val_rollout_batch_size": null, + "num_workers": 2, + "shuffle": true, + "validation_shuffle": true, + "seed": 1234, + "train_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "val_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "prompt_key": "prompt", + "image_keys": [ + "image" + ], + "answer_key": "answer", + "choice_key": "choices", + "solution_key": null, + "use_chat_template": true, + "lazy_loading": true + }, + "actor": { + "group_name": "ActorGroup", + "training_backend": "fsdp", + "mcore_gpt": true, + "spec_name": "decoder_gpt", + "enable_offload": true, + "checkpoint_load_path": null, + "global_batch_size": 1024, + "micro_batch_size": 1, + "enable_dp_load_balance": false, + "calculate_flops": false, + "seed": 1234, + "model": { + "precision": "bf16", + "sharding_strategy": "full_shard", + "is_lora": false, + "seq_length": 28672, + "encoder_seq_length": 28672, + "model_path": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/" + }, + "optim": { + "optimizer": "adam", + "bf16": true, + "fp16": false, + "lr": 2e-05, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + 
"adam_eps": 1e-05, + "min_lr": 2e-06, + "weight_decay": 0.05, + "use_distributed_optimizer": true, + "overlap_grad_reduce": false, + "overlap_param_gather": false, + "optimizer_enable_pin": false, + "overlap_param_gather_with_optimizer_step": false, + "clip_grad": 0.8, + "loss_scale": 65536 + }, + "lr_sched": { + "lr_warmup_fraction": 0.01, + "lr_warmup_init": 0.0, + "lr_warmup_iters": 0, + "max_lr": 2e-05, + "min_lr": 0.0, + "lr_decay_style": "constant", + "lr_decay_iters": 10 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + }, + "fsdp": { + "forward_prefetch": true, + "limit_all_gathers": true, + "backward_prefetch": true, + "use_orig_params": true + } + }, + "reward": { + "group_name": "RewardGroup", + "use_reward_model": false, + "reward_type": "math", + "reward_scale": 5.0, + "reward_weights": { + "qa_accuracy": 1.0, + "think_format": 0.0, + "answer_format": 0.0 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + } + }, + "critic": { + "use_critic_model": false + } +} \ No newline at end of file diff --git a/logs-lxs/fsdp-sglang-512*8-16card-kernel-timeout/tensorboard/events.out.tfevents.1761829334.hostname-kjuul.2113735.0 b/logs-lxs/fsdp-sglang-512*8-16card-kernel-timeout/tensorboard/events.out.tfevents.1761829334.hostname-kjuul.2113735.0 new file mode 100644 index 000000000..a4cd17a41 Binary files /dev/null and b/logs-lxs/fsdp-sglang-512*8-16card-kernel-timeout/tensorboard/events.out.tfevents.1761829334.hostname-kjuul.2113735.0 differ diff --git a/logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/config.json b/logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/config.json new file mode 100644 index 000000000..c1197da7b --- /dev/null +++ b/logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/config.json @@ 
-0,0 +1,205 @@ +{ + "cluster": { + "num_nodes": 1, + "component_placement": { + "actor,rollout,reward": "all" + } + }, + "runner": { + "task_type": "reasoning", + "logger": { + "log_path": "./logs/fsdp-sglang-512*8-16card-timeout", + "project_name": "rlinf", + "experiment_name": "fsdp-sglang-512*8-16card-timeout", + "logger_backends": [ + "tensorboard" + ] + }, + "max_epochs": 5, + "max_steps": -1, + "val_check_interval": 1, + "save_interval": 50, + "seq_length": 28672, + "enable_dynamic_batch_size": false, + "max_tokens_per_mbs": 28672, + "resume_dir": null, + "experiment_name": "fsdp-sglang-512*8-16card-timeout", + "output_dir": "./logs" + }, + "algorithm": { + "group_size": 8, + "n_minibatches": 4, + "training_batch_size_per_gpu": 1, + "rollout_batch_size_per_gpu": null, + "logprob_forward_micro_batch_size": 1, + "val_rollout_batch_size_per_gpu": 4, + "recompute_logprobs": true, + "shuffle_rollout": false, + "loss_type": "math_ppo_actor", + "loss_agg_func": "token-mean", + "kl_beta": 0.0, + "kl_penalty_type": "low_var_kl", + "ratio_clip_eps": 0.2, + "entropy_bonus": 0.0, + "calculate_entropy": false, + "clip_ratio_c": null, + "clip_ratio_low": null, + "clip_ratio_high": null, + "adv_type": "math_grpo", + "normalize_advantages": true, + "early_stop_imp_ratio": 5.0, + "use_valid_token_scale": false, + "sampling_params": { + "use_greedy": false, + "temperature": 1.0, + "top_k": 1000000, + "top_p": 1.0, + "repetition_penalty": 1.0, + "max_new_tokens": 27648, + "min_new_tokens": 1 + }, + "max_num_gen_batches": 1 + }, + "rollout": { + "group_name": "RolloutGroup", + "gpu_memory_utilization": 0.55, + "model_dir": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "model_arch": "qwen2.5", + "enforce_eager": false, + "distributed_executor_backend": "mp", + "disable_log_stats": false, + "detokenize": false, + "padding": null, + "eos": null, + "rollout_backend": "sglang", + "sglang": { + "attention_backend": "ascend", + "decode_log_interval": 4096, + 
"use_torch_compile": false, + "torch_compile_max_bs": 128 + }, + "vllm": { + "attention_backend": "FLASH_ATTN", + "enable_chunked_prefill": true, + "enable_prefix_caching": true, + "enable_flash_infer_sampler": true, + "max_num_batched_tokens": null, + "torch_profiler_dir": null + }, + "return_logprobs": false, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "validate_weight": false, + "validate_save_dir": null, + "print_outputs": false, + "max_running_requests": 128, + "cuda_graph_max_bs": 128 + }, + "data": { + "type": "math", + "dataset_name": "boba", + "max_prompt_length": 1024, + "filter_prompt_by_length": true, + "rollout_batch_size": 16, + "val_rollout_batch_size": null, + "num_workers": 2, + "shuffle": true, + "validation_shuffle": true, + "seed": 1234, + "train_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "val_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "prompt_key": "prompt", + "image_keys": [ + "image" + ], + "answer_key": "answer", + "choice_key": "choices", + "solution_key": null, + "use_chat_template": true, + "lazy_loading": true + }, + "actor": { + "group_name": "ActorGroup", + "training_backend": "fsdp", + "mcore_gpt": true, + "spec_name": "decoder_gpt", + "enable_offload": true, + "checkpoint_load_path": null, + "global_batch_size": 32, + "micro_batch_size": 1, + "enable_dp_load_balance": false, + "calculate_flops": false, + "seed": 1234, + "model": { + "precision": "bf16", + "sharding_strategy": "full_shard", + "is_lora": false, + "seq_length": 28672, + "encoder_seq_length": 28672, + "model_path": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/" + }, + "optim": { + "optimizer": "adam", + "bf16": true, + "fp16": false, + "lr": 2e-05, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_eps": 1e-05, + "min_lr": 2e-06, + "weight_decay": 0.05, + "use_distributed_optimizer": true, + "overlap_grad_reduce": false, + "overlap_param_gather": false, + "optimizer_enable_pin": false, + 
"overlap_param_gather_with_optimizer_step": false, + "clip_grad": 0.8, + "loss_scale": 65536 + }, + "lr_sched": { + "lr_warmup_fraction": 0.01, + "lr_warmup_init": 0.0, + "lr_warmup_iters": 0, + "max_lr": 2e-05, + "min_lr": 0.0, + "lr_decay_style": "constant", + "lr_decay_iters": 10 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + }, + "fsdp": { + "forward_prefetch": true, + "limit_all_gathers": true, + "backward_prefetch": true, + "use_orig_params": true + } + }, + "reward": { + "group_name": "RewardGroup", + "use_reward_model": false, + "reward_type": "math", + "reward_scale": 5.0, + "reward_weights": { + "qa_accuracy": 1.0, + "think_format": 0.0, + "answer_format": 0.0 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + } + }, + "critic": { + "use_critic_model": false + } +} \ No newline at end of file diff --git a/logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/events.out.tfevents.1761898947.hostname-kjuul.3557129.0 b/logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/events.out.tfevents.1761898947.hostname-kjuul.3557129.0 new file mode 100644 index 000000000..f39457f20 Binary files /dev/null and b/logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/events.out.tfevents.1761898947.hostname-kjuul.3557129.0 differ diff --git a/logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/events.out.tfevents.1761900889.hostname-kjuul.30866.0 b/logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/events.out.tfevents.1761900889.hostname-kjuul.30866.0 new file mode 100644 index 000000000..ddcafb574 Binary files /dev/null and b/logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/events.out.tfevents.1761900889.hostname-kjuul.30866.0 differ diff --git a/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/config.json 
b/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/config.json new file mode 100644 index 000000000..3ffdafacb --- /dev/null +++ b/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/config.json @@ -0,0 +1,205 @@ +{ + "cluster": { + "num_nodes": 1, + "component_placement": { + "actor,rollout,reward": "all" + } + }, + "runner": { + "task_type": "reasoning", + "logger": { + "log_path": "./logs/fsdp-sglang-512*8-16card", + "project_name": "rlinf", + "experiment_name": "fsdp-sglang-512*8-16card", + "logger_backends": [ + "tensorboard" + ] + }, + "max_epochs": 5, + "max_steps": -1, + "val_check_interval": 1, + "save_interval": 50, + "seq_length": 28672, + "enable_dynamic_batch_size": false, + "max_tokens_per_mbs": 28672, + "resume_dir": null, + "experiment_name": "fsdp-sglang-512*8-16card", + "output_dir": "./logs" + }, + "algorithm": { + "group_size": 8, + "n_minibatches": 4, + "training_batch_size_per_gpu": 1, + "rollout_batch_size_per_gpu": null, + "logprob_forward_micro_batch_size": 1, + "val_rollout_batch_size_per_gpu": 4, + "recompute_logprobs": true, + "shuffle_rollout": false, + "loss_type": "math_ppo_actor", + "loss_agg_func": "token-mean", + "kl_beta": 0.0, + "kl_penalty_type": "low_var_kl", + "ratio_clip_eps": 0.2, + "entropy_bonus": 0.0, + "calculate_entropy": false, + "clip_ratio_c": null, + "clip_ratio_low": null, + "clip_ratio_high": null, + "adv_type": "math_grpo", + "normalize_advantages": true, + "early_stop_imp_ratio": 5.0, + "use_valid_token_scale": false, + "sampling_params": { + "use_greedy": false, + "temperature": 1.0, + "top_k": 1000000, + "top_p": 1.0, + "repetition_penalty": 1.0, + "max_new_tokens": 27648, + "min_new_tokens": 1 + }, + "max_num_gen_batches": 1 + }, + "rollout": { + "group_name": "RolloutGroup", + "gpu_memory_utilization": 0.55, + "model_dir": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "model_arch": "qwen2.5", + "enforce_eager": false, + "distributed_executor_backend": "mp", + "disable_log_stats": false, + "detokenize": 
false, + "padding": null, + "eos": null, + "rollout_backend": "sglang", + "sglang": { + "attention_backend": "ascend", + "decode_log_interval": 4096, + "use_torch_compile": false, + "torch_compile_max_bs": 128 + }, + "vllm": { + "attention_backend": "FLASH_ATTN", + "enable_chunked_prefill": true, + "enable_prefix_caching": true, + "enable_flash_infer_sampler": true, + "max_num_batched_tokens": null, + "torch_profiler_dir": null + }, + "return_logprobs": false, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "validate_weight": false, + "validate_save_dir": null, + "print_outputs": false, + "max_running_requests": 128, + "cuda_graph_max_bs": 128 + }, + "data": { + "type": "math", + "dataset_name": "boba", + "max_prompt_length": 1024, + "filter_prompt_by_length": true, + "rollout_batch_size": 512, + "val_rollout_batch_size": null, + "num_workers": 2, + "shuffle": true, + "validation_shuffle": true, + "seed": 1234, + "train_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "val_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "prompt_key": "prompt", + "image_keys": [ + "image" + ], + "answer_key": "answer", + "choice_key": "choices", + "solution_key": null, + "use_chat_template": true, + "lazy_loading": true + }, + "actor": { + "group_name": "ActorGroup", + "training_backend": "fsdp", + "mcore_gpt": true, + "spec_name": "decoder_gpt", + "enable_offload": true, + "checkpoint_load_path": null, + "global_batch_size": 1024, + "micro_batch_size": 1, + "enable_dp_load_balance": false, + "calculate_flops": false, + "seed": 1234, + "model": { + "precision": "bf16", + "sharding_strategy": "full_shard", + "is_lora": false, + "seq_length": 28672, + "encoder_seq_length": 28672, + "model_path": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/" + }, + "optim": { + "optimizer": "adam", + "bf16": true, + "fp16": false, + "lr": 2e-05, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_eps": 1e-05, + "min_lr": 2e-06, + "weight_decay": 
0.05, + "use_distributed_optimizer": true, + "overlap_grad_reduce": false, + "overlap_param_gather": false, + "optimizer_enable_pin": false, + "overlap_param_gather_with_optimizer_step": false, + "clip_grad": 0.8, + "loss_scale": 65536 + }, + "lr_sched": { + "lr_warmup_fraction": 0.01, + "lr_warmup_init": 0.0, + "lr_warmup_iters": 0, + "max_lr": 2e-05, + "min_lr": 0.0, + "lr_decay_style": "constant", + "lr_decay_iters": 10 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + }, + "fsdp": { + "forward_prefetch": true, + "limit_all_gathers": true, + "backward_prefetch": true, + "use_orig_params": true + } + }, + "reward": { + "group_name": "RewardGroup", + "use_reward_model": false, + "reward_type": "math", + "reward_scale": 5.0, + "reward_weights": { + "qa_accuracy": 1.0, + "think_format": 0.0, + "answer_format": 0.0 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + } + }, + "critic": { + "use_critic_model": false + } +} \ No newline at end of file diff --git a/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761820416.hostname-kjuul.3651077.0 b/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761820416.hostname-kjuul.3651077.0 new file mode 100644 index 000000000..095a18e71 Binary files /dev/null and b/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761820416.hostname-kjuul.3651077.0 differ diff --git a/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761820811.hostname-kjuul.84956.0 b/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761820811.hostname-kjuul.84956.0 new file mode 100644 index 000000000..1ae80ab66 Binary files /dev/null and 
b/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761820811.hostname-kjuul.84956.0 differ diff --git a/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761821088.hostname-kjuul.261840.0 b/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761821088.hostname-kjuul.261840.0 new file mode 100644 index 000000000..96fc0177f Binary files /dev/null and b/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761821088.hostname-kjuul.261840.0 differ diff --git a/logs-lxs/fsdp-sglang/tensorboard/config.json b/logs-lxs/fsdp-sglang/tensorboard/config.json new file mode 100644 index 000000000..5299b4cdb --- /dev/null +++ b/logs-lxs/fsdp-sglang/tensorboard/config.json @@ -0,0 +1,205 @@ +{ + "cluster": { + "num_nodes": 1, + "component_placement": { + "actor,rollout,reward": "0-1" + } + }, + "runner": { + "task_type": "reasoning", + "logger": { + "log_path": "./logs/fsdp-sglang", + "project_name": "rlinf", + "experiment_name": "fsdp-sglang", + "logger_backends": [ + "tensorboard" + ] + }, + "max_epochs": 5, + "max_steps": -1, + "val_check_interval": 1, + "save_interval": 50, + "seq_length": 28672, + "enable_dynamic_batch_size": false, + "max_tokens_per_mbs": 28672, + "resume_dir": null, + "experiment_name": "fsdp-sglang", + "output_dir": "./logs" + }, + "algorithm": { + "group_size": 8, + "n_minibatches": 4, + "training_batch_size_per_gpu": 1, + "rollout_batch_size_per_gpu": null, + "logprob_forward_micro_batch_size": 1, + "val_rollout_batch_size_per_gpu": 4, + "recompute_logprobs": true, + "shuffle_rollout": false, + "loss_type": "math_ppo_actor", + "loss_agg_func": "token-mean", + "kl_beta": 0.0, + "kl_penalty_type": "low_var_kl", + "ratio_clip_eps": 0.2, + "entropy_bonus": 0.0, + "calculate_entropy": false, + "clip_ratio_c": null, + "clip_ratio_low": null, + "clip_ratio_high": null, + "adv_type": "math_grpo", + "normalize_advantages": true, + "early_stop_imp_ratio": 5.0, + "use_valid_token_scale": false, + 
"sampling_params": { + "use_greedy": false, + "temperature": 1.0, + "top_k": 1000000, + "top_p": 1.0, + "repetition_penalty": 1.0, + "max_new_tokens": 27648, + "min_new_tokens": 1 + }, + "max_num_gen_batches": 1 + }, + "rollout": { + "group_name": "RolloutGroup", + "gpu_memory_utilization": 0.55, + "model_dir": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "model_arch": "qwen2.5", + "enforce_eager": false, + "distributed_executor_backend": "mp", + "disable_log_stats": false, + "detokenize": false, + "padding": null, + "eos": null, + "rollout_backend": "sglang", + "sglang": { + "attention_backend": "ascend", + "decode_log_interval": 2000, + "use_torch_compile": false, + "torch_compile_max_bs": 128 + }, + "vllm": { + "attention_backend": "FLASH_ATTN", + "enable_chunked_prefill": true, + "enable_prefix_caching": true, + "enable_flash_infer_sampler": true, + "max_num_batched_tokens": null, + "torch_profiler_dir": null + }, + "return_logprobs": false, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "validate_weight": false, + "validate_save_dir": null, + "print_outputs": false, + "max_running_requests": 64, + "cuda_graph_max_bs": 128 + }, + "data": { + "type": "math", + "dataset_name": "boba", + "max_prompt_length": 1024, + "filter_prompt_by_length": true, + "rollout_batch_size": 16, + "val_rollout_batch_size": null, + "num_workers": 2, + "shuffle": true, + "validation_shuffle": true, + "seed": 1234, + "train_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "val_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "prompt_key": "prompt", + "image_keys": [ + "image" + ], + "answer_key": "answer", + "choice_key": "choices", + "solution_key": null, + "use_chat_template": true, + "lazy_loading": true + }, + "actor": { + "group_name": "ActorGroup", + "training_backend": "fsdp", + "mcore_gpt": true, + "spec_name": "decoder_gpt", + "enable_offload": true, + "checkpoint_load_path": null, + "global_batch_size": 32, + 
"micro_batch_size": 1, + "enable_dp_load_balance": false, + "calculate_flops": false, + "seed": 1234, + "model": { + "precision": "bf16", + "sharding_strategy": "full_shard", + "is_lora": false, + "seq_length": 28672, + "encoder_seq_length": 28672, + "model_path": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/" + }, + "optim": { + "optimizer": "adam", + "bf16": true, + "fp16": false, + "lr": 2e-05, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_eps": 1e-05, + "min_lr": 2e-06, + "weight_decay": 0.05, + "use_distributed_optimizer": true, + "overlap_grad_reduce": false, + "overlap_param_gather": false, + "optimizer_enable_pin": false, + "overlap_param_gather_with_optimizer_step": false, + "clip_grad": 0.8, + "loss_scale": 65536 + }, + "lr_sched": { + "lr_warmup_fraction": 0.01, + "lr_warmup_init": 0.0, + "lr_warmup_iters": 0, + "max_lr": 2e-05, + "min_lr": 0.0, + "lr_decay_style": "constant", + "lr_decay_iters": 10 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + }, + "fsdp": { + "forward_prefetch": true, + "limit_all_gathers": true, + "backward_prefetch": true, + "use_orig_params": true + } + }, + "reward": { + "group_name": "RewardGroup", + "use_reward_model": false, + "reward_type": "math", + "reward_scale": 5.0, + "reward_weights": { + "qa_accuracy": 1.0, + "think_format": 0.0, + "answer_format": 0.0 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + } + }, + "critic": { + "use_critic_model": false + } +} \ No newline at end of file diff --git a/logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761816870.hostname-kjuul.2287170.0 b/logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761816870.hostname-kjuul.2287170.0 new file mode 100644 index 000000000..af851bb2d Binary files /dev/null and 
b/logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761816870.hostname-kjuul.2287170.0 differ diff --git a/logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761817980.hostname-kjuul.2660597.0 b/logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761817980.hostname-kjuul.2660597.0 new file mode 100644 index 000000000..ebbf40543 Binary files /dev/null and b/logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761817980.hostname-kjuul.2660597.0 differ diff --git a/logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761819019.hostname-kjuul.2893398.0 b/logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761819019.hostname-kjuul.2893398.0 new file mode 100644 index 000000000..b3eb25312 Binary files /dev/null and b/logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761819019.hostname-kjuul.2893398.0 differ diff --git a/logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761819424.hostname-kjuul.3154814.0 b/logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761819424.hostname-kjuul.3154814.0 new file mode 100644 index 000000000..54327db79 Binary files /dev/null and b/logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761819424.hostname-kjuul.3154814.0 differ diff --git a/logs-lxs/fusion_result.json b/logs-lxs/fusion_result.json new file mode 100644 index 000000000..ec747fa47 --- /dev/null +++ b/logs-lxs/fusion_result.json @@ -0,0 +1 @@ +null \ No newline at end of file diff --git a/logs-lxs/run_sglang.py b/logs-lxs/run_sglang.py new file mode 100644 index 000000000..165e813e7 --- /dev/null +++ b/logs-lxs/run_sglang.py @@ -0,0 +1,32 @@ +import dataclasses +import sglang as sgl +from sglang.srt.server_args import ServerArgs +if __name__ == "__main__": + model_dir = "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/" + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + serve_args = ServerArgs( + model_path=model_dir, + attention_backend="ascend", + 
cuda_graph_max_bs=32, + enable_memory_saver=True, + ) + llm = sgl.Engine(**dataclasses.asdict(serve_args)) + sampling_params = {"temperature": 0.8, "top_p": 0.95, "max_new_tokens": 32} + outputs = llm.generate(prompt=prompts, sampling_params=sampling_params) + for prompt, output in zip(prompts, outputs): + print("===============================") + print(f"Prompt: {prompt}\nGenerated text: {output['text']}") + + llm.release_memory_occupation() + print("Memory occupation released.") + llm.resume_memory_occupation() + print("Memory occupation resumed.") + outputs = llm.generate(prompt=prompts, sampling_params=sampling_params) + for prompt, output in zip(prompts, outputs): + print("===============================") + print(f"Regenerate prompt: {prompt}\nGenerated text: {output['text']}") diff --git a/pyproject.toml b/pyproject.toml index 7adf0a799..ebf2c57e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ dependencies = [ "pybind11", "torch-memory-saver", "setuptools>=69.5.1,<75.9", + "ninja", # Logging "swanlab", @@ -51,12 +52,9 @@ dependencies = [ ] [project.optional-dependencies] -sglang = [ +sglang-vllm = [ "transformers==4.51.1", "sglang[all]==0.4.6.post5", -] -vllm = [ - "transformers==4.51.1", "vllm==0.8.5", ] embodied = [ @@ -67,20 +65,32 @@ embodied = [ "tensorflow_graphics", "peft==0.11.1", "timm==0.9.10", + "sapien==3.0.1;platform_system=='Linux'", "mani_skill @ git+https://github.com/haosulab/ManiSkill.git", "tensordict", - "libero @ git+https://github.com/Lifelong-Robot-Learning/LIBERO.git" + "libero @ git+https://github.com/RLinf/LIBERO.git", + "imageio[ffmpeg]", + "robosuite==1.4.1", + "bddl", + "easydict", + "cloudpickle", + "gym", ] [tool.uv] prerelease = "allow" conflicts = [ [ - { extra = "sglang" }, - { extra = "vllm" }, + { extra = "sglang-vllm" }, { extra = "embodied" }, ], ] +override-dependencies = [ + "torch==2.6.0", + "torchvision==0.21.0", + "torchaudio==2.6.0", + "xgrammar==0.1.19" +] [tool.ruff] line-length = 88 diff 
--git a/requirements/README.md b/requirements/README.md index c9be5abb4..d8f565a1e 100644 --- a/requirements/README.md +++ b/requirements/README.md @@ -26,38 +26,40 @@ UV_TORCH_BACKEND=auto uv sync ### Megatron and SGLang/vLLM Dependencies Run the following to install Megatron, SGLang or vLLM and their dependencies. -Megatron installation: ```shell -uv sync --extra sgl_vllm +uv sync --extra sglang-vllm mkdir -p /opt && git clone https://github.com/NVIDIA/Megatron-LM.git -b core_r0.13.0 /opt/Megatron-LM -APEX_CPP_EXT=1 APEX_CUDA_EXT=1 uv pip install -r requirements/megatron.txt --no-build-isolation +APEX_CPP_EXT=1 APEX_CUDA_EXT=1 NVCC_APPEND_FLAGS="--threads 24" APEX_PARALLEL_BUILD=24 uv pip install -r requirements/megatron.txt --no-build-isolation ``` Before using Megatron, make sure it's path is added to the `PYTHONPATH` environment variables. ```shell export PYTHONPATH=/opt/Megatron-LM:$PYTHONPATH ``` -SGLang installation: -```shell -uv sync --extra sglang -``` - -vLLM installation: -```shell -uv sync --extra vllm -``` - ### Embodied Dependencies For embodied experiments, first install the necessary system dependencies (currently only Debian/Ubuntu `apt` package management is supported). ```shell -bash requirements/install_embodied_deps.sh uv sync --extra embodied +bash requirements/install_embodied_deps.sh # Must be run after the above command ``` -Next, depending on the experiment types, install the `openvla` or `pi0` dependencies. +Next, depending on the experiment types, install the `openvla`, `openvla_oft` or `pi0` dependencies. 
```shell -# For OpenVLA/OpenVLA-oft experiments +# For OpenVLA experiments UV_TORCH_BACKEND=auto uv pip install -r requirements/openvla.txt --no-build-isolation +# For OpenVLA-oft experiment +UV_TORCH_BACKEND=auto uv pip install -r requirements/openvla_oft.txt --no-build-isolation + # For Pi0 experiment UV_TORCH_BACKEND=auto uv pip install -r requirements/pi0.txt --no-build-isolation +``` + +Finally, Run the following to install the libero dependency. + +```shell +mkdir -p /opt && git clone https://github.com/RLinf/LIBERO.git /opt/libero +``` +Before using LIBERO, make sure its path is added to the `PYTHONPATH` environment variables. +```shell +export PYTHONPATH=/opt/libero:$PYTHONPATH ``` \ No newline at end of file diff --git a/requirements/install_embodied_deps.sh b/requirements/install_embodied_deps.sh index caeaf4477..1bb90fd8b 100755 --- a/requirements/install_embodied_deps.sh +++ b/requirements/install_embodied_deps.sh @@ -3,6 +3,8 @@ # Embodied dependencies apt-get update -y apt-get install -y --no-install-recommends \ + wget \ + unzip \ libibverbs-dev \ mesa-utils \ libosmesa6-dev \ @@ -18,6 +20,13 @@ apt-get install -y --no-install-recommends \ libsm6 \ libxext6 \ libxrender-dev \ - libgomp1 \ + libgomp1 + +python -m mani_skill.utils.download_asset bridge_v2_real2sim -y +python -m mani_skill.utils.download_asset widowx250s -y + +PHYSX_VERSION=105.1-physx-5.3.1.patch0 +PHYSX_DIR=~/.sapien/physx/$PHYSX_VERSION +mkdir -p $PHYSX_DIR && wget -O $PHYSX_DIR/linux-so.zip https://github.com/sapien-sim/physx-precompiled/releases/download/$PHYSX_VERSION/linux-so.zip && unzip $PHYSX_DIR/linux-so.zip -d $PHYSX_DIR && rm $PHYSX_DIR/linux-so.zip diff --git a/requirements/openvla.txt b/requirements/openvla.txt index 7fdcdce13..c9d8a97cc 100644 --- a/requirements/openvla.txt +++ b/requirements/openvla.txt @@ -1,10 +1,2 @@ openvla @ git+https://github.com/openvla/openvla.git -openvla_oft @ git+https://github.com/moojink/openvla-oft.git -# 
https://github.com/openvla/openvla/blob/main/experiments/robot/libero/libero_requirements.txt -flash-attn==2.5.5 -imageio[ffmpeg] -robosuite==1.4.1 -bddl -easydict -cloudpickle -gym \ No newline at end of file +flash-attn==2.5.5 \ No newline at end of file diff --git a/requirements/openvla_oft.txt b/requirements/openvla_oft.txt new file mode 100644 index 000000000..886134df0 --- /dev/null +++ b/requirements/openvla_oft.txt @@ -0,0 +1,3 @@ +openvla_oft @ git+https://github.com/moojink/openvla-oft.git +# https://github.com/openvla/openvla/blob/main/experiments/robot/libero/libero_requirements.txt +flash-attn==2.5.5 \ No newline at end of file diff --git a/rlinf/algorithms/advantages.py b/rlinf/algorithms/advantages.py index abb654533..4ce0338e0 100644 --- a/rlinf/algorithms/advantages.py +++ b/rlinf/algorithms/advantages.py @@ -155,6 +155,168 @@ def compute_embodied_grpo_advantages( return advantages, advantages +@register_advantage("math_gae_no_critic") +def compute_math_gae_no_critic_advantages_and_returns(**kwargs): + """ + Calculate advantages and returns for math tasks using GAE without critic model. + + This function implements a simplified advantage estimation for math tasks + without requiring a value function, similar to AReaL's disable_head approach. 
+ + Args: + reward_scores (torch.Tensor): Reward scores for math responses + mask (torch.Tensor): Attention mask of shape [bsz, seq_len] or [bsz, max_seq_len] + gamma (float): Discount factor + gae_lambda (float): GAE lambda parameter + normalize_advantages (bool): Whether to normalize advantages + normalize_returns (bool): Whether to normalize returns + + Returns: + Tuple[torch.Tensor, torch.Tensor]: (advantages, returns) tensors + """ + reward_scores = kwargs["reward_scores"] + mask = kwargs["mask"] + gamma = kwargs.get("gamma", 1.0) + normalize_advantages = kwargs.get("normalize_advantages", True) + normalize_returns = kwargs.get("normalize_returns", False) + + # For math tasks without critic, we use reward-to-go as baseline + bsz, seq_len = mask.shape + + # Create reward structure: reward at the end of sequence + rewards = torch.zeros_like(mask, dtype=torch.float32) + rewards[:, -1] = reward_scores # Put reward at the end of sequence + + # Create done flags (episode ends at the last token) + dones = torch.zeros_like(mask, dtype=torch.bool) + dones[:, -1] = True + + # Compute reward-to-go (cumulative discounted rewards) + returns = torch.zeros_like(mask, dtype=torch.float32) + cumulative_reward = 0 + + for t in reversed(range(seq_len)): + cumulative_reward = rewards[:, t] + gamma * cumulative_reward * (~dones[:, t]) + returns[:, t] = cumulative_reward + + # For no-critic setup, advantages are computed using reward-to-go + # with a simple baseline subtraction + advantages = returns.clone() + + # Apply mask + advantages = advantages * mask + returns = returns * mask + + # Simple baseline subtraction (mean of valid advantages) + if normalize_advantages: + valid_advantages = advantages[mask.bool()] + if len(valid_advantages) > 0: + mean_advantages = valid_advantages.mean() + std_advantages = valid_advantages.std() + advantages = (advantages - mean_advantages) / (std_advantages + 1e-5) + + # Normalize returns if requested + if normalize_returns: + valid_returns = 
returns[mask.bool()] + if len(valid_returns) > 0: + mean_returns = valid_returns.mean() + std_returns = valid_returns.std() + returns = (returns - mean_returns) / (std_returns + 1e-5) + + return advantages, returns + + +@register_advantage("math_gae") +def compute_math_gae_advantages_and_returns(**kwargs): + """ + Calculate advantages and returns for math tasks using GAE. + + This function implements Generalized Advantage Estimation (GAE) specifically + designed for math tasks, which may have different data structures compared + to embodied tasks. + + Args: + reward_scores (torch.Tensor): Reward scores for math responses + values (torch.Tensor): Value predictions of shape [bsz, seq_len] or [bsz, max_seq_len] + mask (torch.Tensor): Attention mask of shape [bsz, seq_len] or [bsz, max_seq_len] + gamma (float): Discount factor + gae_lambda (float): GAE lambda parameter + normalize_advantages (bool): Whether to normalize advantages + normalize_returns (bool): Whether to normalize returns + + Returns: + Tuple[torch.Tensor, torch.Tensor]: (advantages, returns) tensors + """ + reward_scores = kwargs["reward_scores"] + values = kwargs["values"] + mask = kwargs["mask"] + gamma = kwargs.get("gamma", 1.0) + gae_lambda = kwargs.get("gae_lambda", 1.0) + normalize_advantages = kwargs.get("normalize_advantages", True) + normalize_returns = kwargs.get("normalize_returns", False) + + # For math tasks, we typically have [bsz, seq_len] tensors + bsz, seq_len = values.shape + + # Create a simple reward structure for math tasks + # The reward is typically given at the end of the sequence + rewards = torch.zeros_like(values) + rewards[:, -1] = reward_scores # Put reward at the end of sequence + + # Create done flags (episode ends at the last token) + dones = torch.zeros_like(values, dtype=torch.bool) + dones[:, -1] = True + + # Add bootstrap value for the next state (after the sequence) + next_values = torch.zeros(bsz, 1, device=values.device, dtype=values.dtype) + + # Compute GAE 
advantages + advantages = torch.zeros_like(values) + returns = torch.zeros_like(values) + + gae = 0 + for t in reversed(range(seq_len)): + if t == seq_len - 1: + # Last timestep + delta = ( + rewards[:, t] + + gamma * next_values[:, 0] * (~dones[:, t]) + - values[:, t] + ) + else: + # Regular timestep + delta = ( + rewards[:, t] + gamma * values[:, t + 1] * (~dones[:, t]) - values[:, t] + ) + + gae = delta + gamma * gae_lambda * (~dones[:, t]) * gae + advantages[:, t] = gae + returns[:, t] = gae + values[:, t] + + # Apply mask to advantages and returns + advantages = advantages * mask + returns = returns * mask + + # Normalize advantages if requested + if normalize_advantages: + # Only normalize over valid (masked) positions + valid_advantages = advantages[mask.bool()] + if len(valid_advantages) > 0: + mean_advantages = valid_advantages.mean() + std_advantages = valid_advantages.std() + advantages = (advantages - mean_advantages) / (std_advantages + 1e-5) + + # Normalize returns if requested + if normalize_returns: + valid_returns = returns[mask.bool()] + if len(valid_returns) > 0: + mean_returns = valid_returns.mean() + std_returns = valid_returns.std() + returns = (returns - mean_returns) / (std_returns + 1e-5) + + return advantages, returns + + @register_advantage("math_grpo") def compute_math_grpo_advantages(**kwargs): reward_scores = kwargs["reward_scores"] diff --git a/rlinf/algorithms/losses.py b/rlinf/algorithms/losses.py index 6900d5f68..72e9004c6 100644 --- a/rlinf/algorithms/losses.py +++ b/rlinf/algorithms/losses.py @@ -196,15 +196,20 @@ def compute_math_ppo_actor_loss(**kwargs): loss_agg_func = kwargs["loss_agg_func"] logprobs = kwargs["logprobs"] old_logprobs = kwargs["old_logprobs"] - eps_clip = kwargs["eps_clip"] + clip_ratio_low = kwargs["clip_ratio_low"] + clip_ratio_high = kwargs["clip_ratio_high"] advantages = kwargs["advantages"] loss_mask = kwargs.get("loss_mask", None) c_clip = kwargs.get("c_clip", None) - - assert logprobs.dtype == 
torch.float32 - assert old_logprobs.dtype == torch.float32 - assert advantages.dtype == torch.float32 - + if logprobs.dtype != torch.float32: + logprobs = logprobs.float() # 转换为 float32 + #assert logprobs.dtype == torch.float32 + #assert old_logprobs.dtype == torch.float32 + #assert advantages.dtype == torch.float32 + if old_logprobs.dtype != torch.float32: + old_logprobs = old_logprobs.float() + if advantages.dtype != torch.float32: + advantages = advantages.float() assert loss_mask is not None loss_mask_count = loss_mask.count_nonzero() or 1 @@ -212,7 +217,7 @@ def compute_math_ppo_actor_loss(**kwargs): ratio = torch.where(loss_mask, torch.exp(logprobs - old_logprobs), 0) approx_kl = torch.where(loss_mask, (logprobs - old_logprobs).detach(), 0.0) - clipped_ratio = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip) + clipped_ratio = torch.clamp(ratio, 1.0 - clip_ratio_low, 1.0 + clip_ratio_high) policy_loss1 = -advantages * ratio policy_loss2 = -advantages * clipped_ratio @@ -232,17 +237,21 @@ def compute_math_ppo_actor_loss(**kwargs): clip_mask = policy_loss1.detach() < policy_loss2.detach() dual_clip_mask.logical_and_(loss_mask) - clip_fraction = clip_mask.logical_and_(loss_mask).count_nonzero() / loss_mask_count - approx_kl = approx_kl.sum() / loss_mask_count + num_clipped = clip_mask.logical_and_(loss_mask).count_nonzero() + + clip_fraction = num_clipped.float() / float(loss_mask_count) + approx_kl = -approx_kl.sum() / float(loss_mask_count) dual_cliped_ratio = torch.where(dual_clip_mask, ratio, 0) # Compile metrics for logging metrics_data = { - "policy_loss": masked_mean(policy_loss.detach(), loss_mask), - "ratio": masked_mean(ratio.detach(), loss_mask), - "clipped_ratio": masked_mean(clipped_ratio.detach(), loss_mask), - "dual_cliped_ratio": masked_mean(dual_cliped_ratio.detach(), loss_mask), + "policy_loss": masked_mean(policy_loss.detach(), loss_mask).detach(), + "ratio": masked_mean(ratio.detach(), loss_mask).detach(), + "clipped_ratio": 
masked_mean(clipped_ratio.detach(), loss_mask).detach(), + "dual_cliped_ratio": masked_mean( + dual_cliped_ratio.detach(), loss_mask + ).detach(), "approx_kl": approx_kl.detach(), "clip_fraction": clip_fraction.detach(), } @@ -267,18 +276,9 @@ def compute_math_ppo_actor_loss(**kwargs): "loss_mask": loss_mask, "loss_agg_func": lambda x, mask: (x * mask).sum() / (mask.sum() or 1), } - ( - loss, - clip_fraction, - approx_kl, - ratio, - clipped_ratio, - dual_cliped_ratio, - ) = compute_math_ppo_actor_loss(**kwargs) - print(f"{loss=}, {clip_fraction=}, {approx_kl=}") - print(f"{ratio=}") - print(f"{clipped_ratio=}") - print(f"{dual_cliped_ratio=}") + loss, metrics_data = compute_math_ppo_actor_loss(**kwargs) + print(f"Policy loss: {loss=}") + print(f"Metrics: {metrics_data}") # test grpo_actor_loss_fn torch.manual_seed(0) @@ -298,6 +298,7 @@ def compute_math_ppo_actor_loss(**kwargs): "clip_ratio_high": clip_ratio_high, "loss_mask": loss_mask, "loss_mask_sum": loss_mask.sum(), + "max_episode_steps": 512, } loss, metrics_data = compute_embodied_grpo_actor_loss_fn(**kwargs) print(f"{loss=}, {metrics_data=}") diff --git a/rlinf/algorithms/rewards/__init__.py b/rlinf/algorithms/rewards/__init__.py new file mode 100644 index 000000000..380cfa102 --- /dev/null +++ b/rlinf/algorithms/rewards/__init__.py @@ -0,0 +1,34 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from rlinf.algorithms.rewards.code import CodeReward +from rlinf.algorithms.rewards.math import MathReward +from rlinf.algorithms.rewards.vqa import VQAReward + + +def register_reward(name: str, reward_class: type): + assert name not in reward_registry, f"Reward {name} already registered" + reward_registry[name] = reward_class + + +def get_reward_class(name: str): + assert name in reward_registry, f"Reward {name} not found" + return reward_registry[name] + + +reward_registry = {} + +register_reward("math", MathReward) +register_reward("vqa", VQAReward) +register_reward("code", CodeReward) diff --git a/rlinf/algorithms/rewards/code/__init__.py b/rlinf/algorithms/rewards/code/__init__.py new file mode 100644 index 000000000..0fc75f971 --- /dev/null +++ b/rlinf/algorithms/rewards/code/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List + +from omegaconf import DictConfig + +from toolkits.code_verifier.verify import fim_verify_call + + +class CodeReward: + def __init__(self, config: DictConfig): + self.scale = config.get("reward_scale", 1.0) + + def get_reward( + self, response: List[str], reference: List[List[str]] + ) -> List[float]: + rewards = fim_verify_call(response, reference) + return [float(reward) * self.scale for reward in rewards] diff --git a/rlinf/algorithms/rewards/math/__init__.py b/rlinf/algorithms/rewards/math/__init__.py new file mode 100644 index 000000000..1a67e80e1 --- /dev/null +++ b/rlinf/algorithms/rewards/math/__init__.py @@ -0,0 +1,41 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +from omegaconf import DictConfig + +from toolkits.math_verifier.verify import math_verify_call + + +class MathReward: + def __init__(self, config: DictConfig): + self.scale = config.get("reward_scale", 1.0) + + def get_reward( + self, response: List[str], reference: List[List[str]] + ) -> List[float]: + """ + Calculates reward scores for a list of responses compared to corresponding lists of reference answers. + For each response, the function checks if it matches any of the provided references using the `process_results` function. + The reward for each response is computed as the first element of the result (converted to float) multiplied by `self.scale`. 
+ Args: + response (List[str]): A list of response strings to be evaluated. + reference (List[List[str]]): A list where each element is a list of reference strings corresponding to each response. + Returns: + List[float]: A list of reward scores, one for each response. + """ + + rewards = math_verify_call(response, reference) + return [float(reward) * self.scale for reward in rewards] diff --git a/rlinf/algorithms/rewards/vqa/__init__.py b/rlinf/algorithms/rewards/vqa/__init__.py new file mode 100644 index 000000000..77b009369 --- /dev/null +++ b/rlinf/algorithms/rewards/vqa/__init__.py @@ -0,0 +1,62 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List + +import torch +from omegaconf import DictConfig + +from .format_rewards import answer_format_reward, think_format_reward +from .qa_rewards import qa_accuracy_reward + + +class VQAReward: + NEEDED_REWARD_FUNCTIONS = { + "qa_accuracy": qa_accuracy_reward, + "think_format": think_format_reward, + "answer_format": answer_format_reward, + } + + def __init__(self, config: DictConfig): + assert "reward_weights" in config, "VQAReward requires reward_weights in config" + + self.reward_weights_config = config.reward_weights + assert set(self.reward_weights_config.keys()) == set( + self.NEEDED_REWARD_FUNCTIONS.keys() + ), ( + f"Reward weights must contains all of: {self.NEEDED_REWARD_FUNCTIONS.keys()} but got {list(self.reward_weights_config.keys())}" + ) + assert all( + reward_weight >= 0 for reward_weight in self.reward_weights_config.values() + ), ( + f"All reward weights must be non-negative but got {list(self.reward_weights_config.values())}" + ) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + def get_reward(self, completions: List[str], answers: List[dict]) -> List[float]: + rewards = [] + reward_weights = [] + for reward_name, reward_function in self.NEEDED_REWARD_FUNCTIONS.items(): + if self.reward_weights_config[reward_name] > 0: + rewards.append(reward_function(completions, answers)) + else: + rewards.append([0.0] * len(completions)) + reward_weights.append(self.reward_weights_config[reward_name]) + + rewards_tensor = torch.tensor(rewards, device=self.device) + weights_tensor = torch.tensor(reward_weights, device=self.device) + + final_rewards = (rewards_tensor * weights_tensor.unsqueeze(1)).sum(dim=0) + + return final_rewards.tolist() diff --git a/rlinf/algorithms/rewards/vqa/format_rewards.py b/rlinf/algorithms/rewards/vqa/format_rewards.py new file mode 100644 index 000000000..205bbe336 --- /dev/null +++ b/rlinf/algorithms/rewards/vqa/format_rewards.py @@ -0,0 +1,67 @@ +# Copyright 2025 The RLinf 
Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from typing import List + + +def think_format_reward(completions, answers) -> List[float]: + """ + Think format reward function compatible with GRPO training. + + Reward function that checks if reasoning is enclosed within <think>...</think> tags. + + Args: + completions: List of model completions (text strings) + + Returns: + List of reward scores (1.0 for correct format, 0.0 otherwise) + """ + pattern = r"^<think>(?!.*<think>)(.*?)</think>.*$" + rewards = [] + + for completion in completions: + completion_text = str(completion).strip() + match = re.match(pattern, completion_text, re.DOTALL | re.MULTILINE) + rewards.append(1.0 if match else 0.0) + + return rewards + + +def answer_format_reward(completions, answers) -> List[float]: + """ + Reward function that checks for proper answer formatting. + + Expected format: <answer>X. content</answer> where X is a choice letter. + + Args: + completions: List of model completions (text strings) + + Returns: + List of reward scores (1.0 for correct format, 0.0 otherwise) + """ + rewards = [] + + for completion in completions: + completion_text = str(completion).strip() + + # Check for proper answer format: X. 
content + answer_pattern = r"<answer>\s*[A-E]\.\s*.+?\s*</answer>" + has_proper_answer = bool( + re.search(answer_pattern, completion_text, re.DOTALL | re.IGNORECASE) + ) + + rewards.append(1.0 if has_proper_answer else 0.0) + + return rewards diff --git a/rlinf/algorithms/rewards/vqa/qa_rewards.py b/rlinf/algorithms/rewards/vqa/qa_rewards.py new file mode 100644 index 000000000..2bc9540d3 --- /dev/null +++ b/rlinf/algorithms/rewards/vqa/qa_rewards.py @@ -0,0 +1,110 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from typing import List + + +def qa_accuracy_reward(completions, answers) -> List[float]: + """ + Reward function that evaluates question-answering accuracy for VQA tasks. + + Based on TRL's accuracy_reward pattern but adapted for multiple choice VQA. + + Args: + completions: List of model completions (text strings) + answers: List of correct answers (dict) + + Returns: + List of reward scores (1.0 for correct, 0.0 for incorrect) + """ + rewards = [] + + for completion, answer in zip(completions, answers): + completion_text = str(completion).strip() + + # Extract answer from completion - look for X. 
content + answer_match = re.search( + r"<answer>\s*([A-E])\.\s*(.*?)\s*</answer>", + completion_text, + re.DOTALL | re.IGNORECASE, + ) + + if not answer_match: + rewards.append(0.0) + continue + + predicted_letter = answer_match.group(1).upper() + predicted_content = answer_match.group(2).strip() + + # Get ground truth from kwargs + correct_answer = answer.get("correct_answer", None) + choices = answer.get("choices", None) + + if correct_answer is None or choices is None: + rewards.append(0.0) + continue + + # Normalize correct_answer to letter format + if isinstance(correct_answer, int): + correct_letter = chr(65 + correct_answer) # 0->A, 1->B, etc. + elif isinstance(correct_answer, str): + correct_letter = correct_answer.strip().upper() + else: + rewards.append(0.0) + continue + + # Parse choices if string format + if isinstance(choices, str): + try: + import ast + + choices = ast.literal_eval(choices) + except (ValueError, SyntaxError): + choices = [str(choices)] + + # Get correct choice content + letter_to_idx = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4} + if correct_letter in letter_to_idx and letter_to_idx[correct_letter] < len( + choices + ): + correct_content = choices[letter_to_idx[correct_letter]].strip() + else: + rewards.append(0.0) + continue + + # Check accuracy: both letter and content must match + letter_match = predicted_letter == correct_letter + content_match = _compare_choice_content(predicted_content, correct_content) + + rewards.append(1.0 if (letter_match and content_match) else 0.0) + + return rewards + + +def _compare_choice_content(predicted: str, correct: str) -> bool: + """Compare predicted choice content with correct content.""" + # Simple normalized comparison + pred_normalized = predicted.lower().strip() + correct_normalized = correct.lower().strip() + + # Direct match + if pred_normalized == correct_normalized: + return True + + # Partial match for more flexibility + if pred_normalized in correct_normalized or correct_normalized in pred_normalized: + 
return True + + return False diff --git a/rlinf/config.py b/rlinf/config.py index 5700a2172..05de15254 100644 --- a/rlinf/config.py +++ b/rlinf/config.py @@ -13,6 +13,7 @@ # limitations under the License. import dataclasses +import importlib.util import logging import os from dataclasses import asdict @@ -24,22 +25,19 @@ from omegaconf.dictconfig import DictConfig from transformers import AutoConfig +from rlinf.scheduler.cluster import Cluster +from rlinf.utils.placement import ModelParallelComponentPlacement, PlacementMode + if TYPE_CHECKING: from megatron.core.model_parallel_config import ModelParallelConfig from megatron.core.transformer.transformer_config import TransformerConfig logging.getLogger().setLevel(logging.INFO) -try: - import transformer_engine - - HAVE_TE = True -except ImportError: - transformer_engine = None - HAVE_TE = False - -SUPPORTED_MODEL_ARCHS = ["qwen2.5", "openvla", "openvla_oft"] +SUPPORTED_MODEL_ARCHS = ["qwen2.5", "qwen2.5_vl", "openvla", "openvla_oft"] SUPPORTED_ROLLOUT_BACKENDS = ["sglang", "vllm"] +SUPPORTED_TASK_TYPE = ["embodied", "reasoning", "coding_online_rl"] +SUPPORTED_TRAINING_BACKENDS = ["megatron", "fsdp"] __all__ = ["build_config"] @@ -164,6 +162,8 @@ def validate_vllm_cfg(cfg): cfg.enable_chunked_prefill = cfg.get("enable_chunked_prefill", True) cfg.enable_prefix_caching = cfg.get("enable_prefix_caching", True) cfg.enable_flash_infer_sampler = cfg.get("enable_flash_infer_sampler", True) + cfg.max_num_batched_tokens = cfg.get("max_num_batched_tokens", None) + cfg.torch_profiler_dir = cfg.get("torch_profiler_dir", None) return cfg with open_dict(cfg): @@ -186,7 +186,7 @@ def validate_vllm_cfg(cfg): def validate_model_cfg_by_hf_config(cfg, hf_model_path): # validate by hf config - hf_config = AutoConfig.from_pretrained(hf_model_path) + hf_config = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True) if "Qwen2ForCausalLM" in hf_config.architectures: qkv_bias = True @@ -194,8 +194,16 @@ def 
validate_model_cfg_by_hf_config(cfg, hf_model_path): qkv_bias = getattr(hf_config, "attention_bias", False) with open_dict(cfg): - if hf_config.rope_scaling is not None: - cfg.model.seq_len_interpolation_factor = hf_config.rope_scaling["factor"] + rs = getattr(hf_config, "rope_scaling", None) + if isinstance(rs, dict): + rtype = rs.get("type", "") + if rtype in {"linear", "dynamic", "ntk", "yarn"}: + f = rs.get("factor") + if f is not None: + cfg.model.seq_len_interpolation_factor = float(f) + else: + # mrope + cfg.model.seq_len_interpolation_factor = None cfg.model.override_vocab_size = hf_config.vocab_size cfg.model.max_position_embeddings = hf_config.max_position_embeddings cfg.model.rotary_base = hf_config.rope_theta @@ -215,6 +223,16 @@ def validate_model_cfg_by_hf_config(cfg, hf_model_path): return cfg +def validate_fsdp_cfg(cfg: DictConfig) -> DictConfig: + OmegaConf.set_struct(cfg, True) + with open_dict(cfg): + cfg.fsdp.forward_prefetch = cfg.fsdp.get("forward_prefetch", False) + cfg.fsdp.limit_all_gathers = cfg.fsdp.get("limit_all_gathers", False) + cfg.fsdp.backward_prefetch = cfg.fsdp.get("backward_prefetch", False) + cfg.fsdp.use_orig_params = cfg.fsdp.get("use_orig_params", False) + return cfg + + def validate_megatron_cfg(cfg: DictConfig) -> DictConfig: OmegaConf.set_struct(cfg, True) @@ -522,7 +540,7 @@ def get_robot_control_mode(robot: str): return cfg -def validate_math_cfg(cfg: DictConfig) -> DictConfig: +def validate_reasoning_cfg(cfg: DictConfig) -> DictConfig: assert cfg.rollout.model_arch in SUPPORTED_MODEL_ARCHS, ( f"Model {cfg.rollout.model_arch} is not supported" ) @@ -553,13 +571,63 @@ def validate_math_cfg(cfg: DictConfig) -> DictConfig: return cfg +def validate_coding_online_rl_cfg(cfg: DictConfig) -> DictConfig: + assert cfg.rollout.model_arch == "qwen2.5", ( + f"Model {cfg.rollout.model_arch} is not supported" + ) + + assert cfg.algorithm.recompute_logprobs != cfg.rollout.return_logprobs, ( + "Exactly one of 
`algorithm.recompute_logprobs` or `rollout.return_logprobs` must be True to compute `prev_logprobs`." + ) + + assert cfg.algorithm.recompute_logprobs, ( + "Online coding task must use recompute_logprobs" + ) + + assert cfg.actor.training_backend == "megatron", ( + "Online coding task must use megatron training backend" + ) + + cluster = Cluster(num_nodes=cfg.cluster.num_nodes) + component_placement = ModelParallelComponentPlacement(cfg, cluster) + assert component_placement.placement_mode == PlacementMode.DISAGGREGATED, ( + "Online coding task must use disaggregated placement mode" + ) + + with open_dict(cfg): + cfg.algorithm.training_batch_size_per_gpu = cfg.algorithm.get( + "training_batch_size_per_gpu", 1 + ) + cfg.algorithm.n_minibatches = cfg.algorithm.get("n_minibatches", 1) + cfg.algorithm.max_num_gen_batches = cfg.algorithm.get("max_num_gen_batches", 1) + cfg.actor.micro_batch_size = cfg.algorithm.training_batch_size_per_gpu + cfg.actor.global_batch_size = ( + cfg.data.rollout_batch_size + * cfg.algorithm.group_size + // cfg.algorithm.n_minibatches + ) + assert cfg.actor.micro_batch_size >= 1 + assert cfg.actor.global_batch_size >= 1 + assert cfg.runner.seq_length > cfg.data.max_prompt_length, ( + f"runner.seq_length ({cfg.runner.seq_length}) must be greater than data.max_prompt_length ({cfg.data.max_prompt_length})" + ) + + cfg.rollout = validate_rollout_cfg(cfg.rollout) + return cfg + + def validate_cfg(cfg: DictConfig) -> DictConfig: OmegaConf.set_struct(cfg, True) + assert cfg.runner.task_type in SUPPORTED_TASK_TYPE, ( + f"task_type must be one of {SUPPORTED_TASK_TYPE}" + ) if cfg.runner.task_type == "embodied": cfg = validate_embodied_cfg(cfg) - if cfg.runner.task_type == "math": - cfg = validate_math_cfg(cfg) + elif cfg.runner.task_type == "reasoning": + cfg = validate_reasoning_cfg(cfg) + elif cfg.runner.task_type == "coding_online_rl": + cfg = validate_coding_online_rl_cfg(cfg) if ( cfg.algorithm.adv_type == "embodied_grpo" @@ -567,13 +635,21 @@ def 
validate_cfg(cfg: DictConfig) -> DictConfig: ): assert cfg.algorithm.group_size > 1 + assert cfg.actor.training_backend in SUPPORTED_TRAINING_BACKENDS, ( + f"Unsupported training_backend {cfg.actor.training_backend}. Supported training backends are {SUPPORTED_TRAINING_BACKENDS}." + ) + if cfg.actor.training_backend == "megatron": cfg.actor = validate_megatron_cfg(cfg.actor) cfg.actor = validate_model_cfg_by_hf_config(cfg.actor, cfg.rollout.model_dir) + elif cfg.actor.training_backend == "fsdp": + cfg.actor = validate_fsdp_cfg(cfg.actor) if cfg.critic.use_critic_model and cfg.critic.training_backend == "megatron": cfg.critic = validate_megatron_cfg(cfg.critic) - cfg = validate_model_cfg_by_hf_config(cfg.critic, cfg.rollout.model_dir) + cfg.critic = validate_model_cfg_by_hf_config(cfg.critic, cfg.rollout.model_dir) + elif cfg.critic.use_critic_model and cfg.critic.training_backend == "fsdp": + cfg.critic = validate_fsdp_cfg(cfg.critic) return cfg @@ -704,7 +780,10 @@ def build_transformer_config(cfg) -> "TransformerConfig": tp_only_amax_red = cfg.get("tp_only_amax_red", False) if cfg.get("enable_cuda_graph", False): - assert HAVE_TE, "Transformer Engine is required for cudagraphs." + if importlib.util.find_spec("transformer_engine") is None: + raise ImportError( + "Can not import transformer_engine, which is required for cudagraphs." + ) assert cfg.get("use_te_rng_tracker", False), ( "Transformer engine's RNG tracker is required for cudagraphs, this can be enabled with \ 'use_te_rng_tracker=True'." diff --git a/rlinf/data/datasets/__init__.py b/rlinf/data/datasets/__init__.py new file mode 100644 index 000000000..f86f3576e --- /dev/null +++ b/rlinf/data/datasets/__init__.py @@ -0,0 +1,150 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from typing import Any, Dict, List, Tuple + +import torch +from omegaconf import DictConfig +from torch.utils.data import Dataset +from transformers import AutoTokenizer + +from rlinf.data.datasets.item import DatasetItem +from rlinf.data.datasets.math import MathDataset +from rlinf.data.datasets.vlm import VLMDatasetRegistry + + +def create_rl_dataset( + config: DictConfig, tokenizer: AutoTokenizer +) -> Tuple[Dataset, Dataset]: + """Create rl datasets. + + Arguments: + config: The RLinf config. + tokenizer (Tokenizer): The tokenizer. + + Returns: + train_dataset (Dataset): The training dataset. + + val_dataset (Dataset): The validation dataset. 
+ """ + + if config.data.type == "math": + dataset_cls = MathDataset + elif config.data.type == "vision_language": + # Prefer new factory-based VLM datasets; fallback to legacy if requested + dataset_name = getattr(config.data, "dataset_name", None) + lazy_loading = bool(getattr(config.data, "lazy_loading", False)) + + logging.info( + f"Using VLM dataset: name={dataset_name}, lazy_loading={lazy_loading}" + ) + + train_dataset = VLMDatasetRegistry.create( + dataset_name, + data_paths=config.data.train_data_paths, + config=config, + tokenizer=tokenizer, + ) + val_dataset = VLMDatasetRegistry.create( + dataset_name, + data_paths=config.data.val_data_paths, + config=config, + tokenizer=tokenizer, + ) + return train_dataset, val_dataset + else: + return None, None + + logging.info(f"Using dataset class: {dataset_cls.__name__}") + + # Instantiate the dataset using the determined dataset class + train_dataset = dataset_cls( + data_paths=config.data.train_data_paths, + config=config, + tokenizer=tokenizer, + ) + + val_dataset = dataset_cls( + data_paths=config.data.val_data_paths, + config=config, + tokenizer=tokenizer, + ) + + return train_dataset, val_dataset + + +def collate_fn(data_list: List["DatasetItem"]) -> Dict[str, Any]: + """ + Collate function for batching dataset items. 
+ """ + prompts = [] + lens = [] + for it in data_list: + p = ( + it.prompt + if isinstance(it.prompt, torch.Tensor) + else torch.as_tensor(it.prompt, dtype=torch.long) + ) + if p.dim() == 2 and p.size(0) == 1: + p = p.squeeze(0) + assert p.dim() == 1, ( + f"DatasetItem.prompt must be 1-D tensor, current shape is: {p.shape}" + ) + prompts.append(p) + lens.append(p.numel()) + + if len(set(lens)) == 1: + target_len = lens[0] + else: + target_len = min(lens) + prompts = [p[-target_len:] if p.numel() > target_len else p for p in prompts] + + batch_prompt = torch.stack(prompts, dim=0) # [B, L] + batch_length = torch.tensor( + [min(int(it.length), target_len) for it in data_list], dtype=torch.long + ) + + batch_idx = torch.tensor([int(it.idx) for it in data_list], dtype=torch.long) + + batch: Dict[str, Any] = { + "prompt": batch_prompt, # [B, L] + "length": batch_length, # [B] + "answer": [it.answer for it in data_list], # List[str] + "idx": batch_idx, # [B] + "solution": [it.solution for it in data_list], # List[Optional[str]] + "image_data": [ + it.image_data for it in data_list + ], # List[Optional[List[bytes|str]]] + "prompt_text": [it.prompt_text for it in data_list], # List[Optional[str]] + "meta": [it.meta for it in data_list], # List[Optional[dict]] + "multi_modal_inputs": [ + it.multi_modal_inputs for it in data_list + ], # List[Optional[dict]] + } + return batch diff --git a/rlinf/data/datasets/item.py b/rlinf/data/datasets/item.py new file mode 100644 index 000000000..e75155dcb --- /dev/null +++ b/rlinf/data/datasets/item.py @@ -0,0 +1,59 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Union + +import torch + + +@dataclass +class DatasetItem: + """ + A single item in processed dataset. + + Attributes: + prompt (torch.Tensor): Tokenized prompt input_ids tensor. + length (int): Length of the prompt input_ids. + answer (str | dict): The answer associated with the prompt. + idx (int): Index of the item in the dataset. + solution (Optional[str]): Optional solution text if exists. + prompt_text (Optional[str]): Optional original prompt text before tokenization. + meta (Optional[Dict[str, Any]]): Optional metadata dictionary. + multi_modal_inputs (Optional[Dict[str, Any]]): Optional dictionary for additional multi-modal inputs. 
+ """ + + prompt: torch.Tensor + length: int + answer: str | dict + idx: int + solution: Optional[str] = None + image_data: Optional[List[Union[bytes, str]]] = None + prompt_text: Optional[str] = None + meta: Optional[Dict[str, Any]] = None + multi_modal_inputs: Optional[Dict[str, Any]] = None diff --git a/rlinf/data/datasets.py b/rlinf/data/datasets/math.py similarity index 52% rename from rlinf/data/datasets.py rename to rlinf/data/datasets/math.py index fcce53f47..821074bf6 100644 --- a/rlinf/data/datasets.py +++ b/rlinf/data/datasets/math.py @@ -12,67 +12,53 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + import json import logging import os -from collections import defaultdict -from typing import List +from typing import Any, List, Tuple, Union -import numpy as np import torch +from omegaconf import DictConfig from torch.utils.data import Dataset +from transformers import AutoTokenizer - -def batch_pad_to_fixed_len( - batch: List[torch.Tensor], - max_batch_len: int, - pad_token: int, - left_pad: bool = False, -) -> torch.Tensor: - if left_pad: - batch_pad = torch.stack( - [ - torch.cat( - [ - torch.full( - (max_batch_len - len(seq),), pad_token, dtype=seq.dtype - ), # pad on the left - seq, - ] - ) - for seq in batch - ] - ) - else: - batch_pad = torch.stack( - [ - torch.cat( - [ - seq, - torch.full( - (max_batch_len - len(seq),), pad_token, dtype=seq.dtype - ), - ] - ) - for seq in batch - ] - ) - return batch_pad +from rlinf.data.datasets.item import DatasetItem +from rlinf.data.datasets.utils import batch_pad_to_fixed_len class MathDataset(Dataset): - def __init__(self, data_paths, config, tokenizer): + def __init__( + self, + data_paths: Union[str, List[str]], + config: DictConfig, + tokenizer: AutoTokenizer, + ): super().__init__() self.data_paths = data_paths if isinstance(self.data_paths, str): self.data_paths = [self.data_paths] - self.max_prompt_length = config.max_prompt_length + self.max_prompt_length = config.data.max_prompt_length self.tokenizer = tokenizer - self.prompt_key = config.prompt_key + self.prompt_key = config.data.prompt_key self.data = self._load_data() - if config.get("filter_prompt_by_length", False): + if config.data.get("filter_prompt_by_length", False): total = len(self.data) filtered = [] failed = 0 @@ -97,7 +83,10 @@ def __init__(self, data_paths, config, tokenizer): f"(kept {len(self.data)} / {total})." ) - def _load_data(self): + def _load_data(self) -> List[Any]: + """ + Load and merge data from multiple files(json or jsonl). 
+ """ merged_data = [] for path in self.data_paths: @@ -122,7 +111,10 @@ def _load_data(self): def __len__(self): return len(self.data) - def encode(self, text): + def encode(self, text: str) -> Tuple[List[int], int]: + """ + Use tokenizer to encode the text and return the token ids and length. + """ text_ids = self.tokenizer.encode(text) return text_ids, len(text_ids) @@ -151,78 +143,11 @@ def __getitem__(self, idx): self.tokenizer.eos_token_id, left_pad=True, )[0] - - output = { - "prompt": prompt_tokens_tensor, - "length": prompt_length, - "answer": answer, - "idx": idx, - } + output = DatasetItem( + prompt=prompt_tokens_tensor, + length=prompt_length, + answer=answer, + idx=idx, + image_data=[], + ) return output - - -def create_rl_dataset(data_config, tokenizer): - """Create rl datasets. - - Arguments: - data_config: The data config. - tokenizer (Tokenizer): The tokenizer. - - Returns: - train_dataset (Dataset): The training dataset. - - val_dataset (Dataset): The validation dataset. - """ - - if data_config.type == "math": - dataset_cls = MathDataset - else: - return None, None - - print(f"Using dataset class: {dataset_cls.__name__}") - - # Instantiate the dataset using the determined dataset class - train_dataset = dataset_cls( - data_paths=data_config.train_data_paths, - config=data_config, - tokenizer=tokenizer, - ) - - val_dataset = dataset_cls( - data_paths=data_config.val_data_paths, - config=data_config, - tokenizer=tokenizer, - ) - - return train_dataset, val_dataset - - -def collate_fn(data_list: list[dict]) -> dict: - r""" - Collate a batch of sample dicts into batched tensors and arrays. - - Args: - data_list: List of dicts mapping feature names to torch.Tensor or other values. - - Returns: - Dict where tensor entries are stacked into a torch.Tensor of shape - (batch_size, \*dims) and non-tensor entries are converted to - np.ndarray of dtype object with shape (batch_size,). 
- """ - tensors = defaultdict(list) - non_tensors = defaultdict(list) - - for data in data_list: - for key, val in data.items(): - if isinstance(val, torch.Tensor): - tensors[key].append(val) - else: - non_tensors[key].append(val) - - for key, val in tensors.items(): - tensors[key] = torch.stack(val, dim=0) - - for key, val in non_tensors.items(): - non_tensors[key] = np.array(val, dtype=object) - - return {**tensors, **non_tensors} diff --git a/rlinf/data/datasets/utils.py b/rlinf/data/datasets/utils.py new file mode 100644 index 000000000..db4dbdb58 --- /dev/null +++ b/rlinf/data/datasets/utils.py @@ -0,0 +1,68 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def batch_pad_to_fixed_len(
    batch: List[torch.Tensor],
    max_batch_len: int,
    pad_token: int,
    left_pad: bool = False,
) -> torch.Tensor:
    """Pad each 1-D sequence in ``batch`` to ``max_batch_len`` and stack them.

    Args:
        batch: List of 1-D token-id tensors, each no longer than
            ``max_batch_len``.
        max_batch_len: Target length of every padded row.
        pad_token: Token id used to fill padding positions.
        left_pad: If True, padding is prepended (typical for prompts);
            otherwise it is appended.

    Returns:
        A tensor of shape ``(len(batch), max_batch_len)`` with the dtype of
        the input sequences.
    """
    padded_rows = []
    for seq in batch:
        # Build the filler for this row; dtype follows the sequence itself.
        filler = torch.full((max_batch_len - len(seq),), pad_token, dtype=seq.dtype)
        pieces = (filler, seq) if left_pad else (seq, filler)
        padded_rows.append(torch.cat(pieces))
    return torch.stack(padded_rows)
+ + +import json +import logging +import os +from io import BytesIO +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import pandas as pd +import torch +from omegaconf import DictConfig +from PIL import Image +from torch.utils.data import Dataset +from transformers import AutoProcessor, AutoTokenizer + +from rlinf.data.datasets.item import DatasetItem +from rlinf.data.datasets.utils import batch_pad_to_fixed_len + + +class VLMBaseDataset(Dataset): + def __init__( + self, + data_paths: Union[List[str], str], + config: DictConfig, + tokenizer: AutoTokenizer, + ) -> None: + super().__init__() + self.cfg = config + raw_paths = [data_paths] if isinstance(data_paths, str) else list(data_paths) + # Expand directories into file lists recursively (json/jsonl/parquet) + self.data_paths = self._expand_data_paths(raw_paths) + self.tokenizer = tokenizer + # Delay processor creation; only needed when use_chat_template is True + self._processor = None + + self.system_prompt = config.data.get("system_prompt", None) + self.use_chat_template = bool(config.data.use_chat_template) + self.image_keys = list(config.data.image_keys or []) + self.prompt_key = config.data.prompt_key + self.choice_key = config.data.get("choice_key", None) + self.answer_key = config.data.get("answer_key", None) + self.solution_key = config.data.get("solution_key", None) + self.max_prompt_length = int(config.data.max_prompt_length) + self.eos_id = int(self.tokenizer.eos_token_id) + + # Loading mode + self.lazy_loading = bool(getattr(config.data, "lazy_loading", False)) + + self._records = [] + self._indices = [] # (path, fmt, row_index_or_offset) + + if self.lazy_loading: + self._build_lazy_indices() + else: + self._eager_load_all() + + def __len__(self) -> int: + return len(self._indices) if self.lazy_loading else len(self._records) + + def __getitem__(self, idx: int) -> DatasetItem: + if self.lazy_loading: + path, fmt, key = self._indices[idx] + raw = self._load_single_lazy(path, fmt, 
key) + return self._process_raw_record(raw, idx) + else: + raw = self._records[idx] + return self._process_raw_record(raw, idx) + + # Ensure dataset is picklable for multi-process DataLoader by removing + # unpicklable cache objects like pyarrow.ParquetFile from state. + def __getstate__(self): + state = self.__dict__.copy() + # Drop heavy/unpicklable caches; they will be rebuilt on-demand in workers + for k in ("_parquet_cache", "_parquet_df_cache"): + if k in state: + state[k] = {} + return state + + def __setstate__(self, state): + # Restore state and ensure cache dicts exist + self.__dict__.update(state) + self._parquet_cache = getattr(self, "_parquet_cache", {}) + self._parquet_df_cache = getattr(self, "_parquet_df_cache", {}) + + def get_image_list(self, dataitem: Dict[str, Any]) -> List[Union[bytes, str, None]]: + images: List[Union[bytes, str, None]] = [] + for k in self.image_keys: + v = dataitem.get(k, None) + if v is None: + continue + if isinstance(v, Image.Image): + images.append(v) + elif isinstance(v, dict) and "bytes" in v: + images.append(v["bytes"]) + else: + images.append(v) # path or url + if not images: + images = [None] + return images + + def build_prompt_text(self, data_item: Dict[str, Any]) -> str: + # Default: prompt + optional choices rendered inline + q = data_item.get(self.prompt_key, "") + choices = data_item.get(self.choice_key, []) if self.choice_key else [] + if not isinstance(choices, list): + choices = [choices] + if choices: + return f"{q}{choices}\n" + return str(q) + + def encode_prompt( + self, prompt_text: str, images + ) -> Tuple[torch.Tensor, int, Optional[str]]: + """ + Return (token_ids[L], length, prompt_text_used). If using chat template, encode with processor. + Subclasses may override to support alternative prompting. 
    def encode_prompt(
        self, prompt_text: str, images
    ) -> Tuple[torch.Tensor, int, Optional[str], Dict[str, Any]]:
        """
        Encode a prompt (optionally with images) into model input ids.

        Returns a 4-tuple ``(token_ids, length, prompt_text_used,
        multi_modal_inputs)``. When ``use_chat_template`` is False, only the
        tokenizer is used and ``multi_modal_inputs`` is an empty dict.
        Subclasses may override to support alternative prompting.
        """
        if self.use_chat_template:
            # Processor is created lazily so non-chat-template datasets never
            # pay for (or require) it.
            if self._processor is None:
                self._processor = AutoProcessor.from_pretrained(
                    self.cfg.actor.model.model_path
                )
            messages = []
            if self.system_prompt is not None:
                messages.append(
                    {
                        "role": "system",
                        "content": [{"type": "text", "text": self.system_prompt}],
                    }
                )

            # One image placeholder per provided image, then the text part.
            content: List[Dict[str, Any]] = []
            for _ in range(max(0, len(images))):
                content.append({"type": "image"})
            content.append({"type": "text", "text": prompt_text})
            messages.append({"role": "user", "content": content})
            rendered = self._processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            images_inputs = []
            for image in images:
                # NOTE(review): only PIL images and raw bytes are decoded here;
                # a path/URL string leaves image_obj as None, which is then
                # appended — confirm the processor accepts None entries.
                image_obj = None
                if isinstance(image, Image.Image):
                    image_obj = image.convert("RGB")
                if isinstance(image, (bytes, bytearray)):
                    image_obj = Image.open(BytesIO(image)).convert("RGB")
                images_inputs.append(image_obj)

            inputs = self._processor(
                text=[rendered], images=images_inputs, padding=True, return_tensors="pt"
            )
            inputs.pop("attention_mask")
            if self.cfg.rollout.rollout_backend == "sglang":
                ids = inputs.pop("input_ids")
            elif self.cfg.rollout.rollout_backend == "vllm":
                # vLLM expects text-only ids; re-tokenize without images.
                inputs.pop("input_ids")
                ids = self._processor(
                    text=[rendered], images=None, padding=True, return_tensors="pt"
                )["input_ids"]
            else:
                raise ValueError(
                    f"Unsupported rollout backend {self.cfg.rollout.rollout_backend}"
                )
            # Normalize to a 1-D long tensor regardless of processor output shape.
            if isinstance(ids, torch.Tensor):
                if ids.dim() == 2 and ids.size(0) == 1:
                    ids = ids.squeeze(0)
                ids = ids.to(dtype=torch.long)
            else:
                ids = torch.tensor(ids, dtype=torch.long)

            # Remaining processor outputs (e.g. pixel values) travel alongside ids.
            multi_modal_inputs = {}
            for k, v in inputs.items():
                multi_modal_inputs[k] = v
            return ids, int(ids.numel()), rendered, multi_modal_inputs
        else:
            # fallback: tokenizer only
            ids_list = self.tokenizer.encode(prompt_text)
            ids = torch.as_tensor(ids_list, dtype=torch.long)
            return ids, int(ids.numel()), prompt_text, {}
DatasetItem, raw: Dict[str, Any] + ) -> DatasetItem: + return item + + def _expand_data_paths(self, inputs: List[str]) -> List[str]: + exts = {".jsonl", ".json", ".parquet"} + files: List[str] = [] + for p in inputs: + if os.path.isdir(p): + for root, _, fnames in os.walk(p): + for fn in fnames: + ext = os.path.splitext(fn)[1].lower() + if ext in exts: + files.append(os.path.join(root, fn)) + else: + files.append(p) + files = sorted(set(files)) + return files + + def _eager_load_all(self) -> None: + merged: List[Dict[str, Any]] = [] + for path in self.data_paths: + fmt = os.path.splitext(path)[1].lower() + if fmt == ".jsonl": + with open(path, "r", encoding="utf-8") as f: + merged.extend(json.loads(l) for l in f) + elif fmt == ".json": + with open(path, "r", encoding="utf-8") as f: + content = json.load(f) + if isinstance(content, list): + merged.extend(content) + else: + merged.append(content) + elif fmt == ".parquet": + try: + merged.extend(pd.read_parquet(path).to_dict(orient="records")) + except Exception as e: + raise RuntimeError(f"Failed to load parquet eagerly: {path}: {e}") + else: + logging.warning(f"Unsupported format {fmt} for path {path}, skipping.") + self._records = merged + # Build indices for consistency + self._indices = [("", "eager", i) for i in range(len(self._records))] + + def _build_lazy_indices(self) -> None: + self._indices.clear() + for path in self.data_paths: + fmt = os.path.splitext(path)[1].lower() + if fmt == ".jsonl": + # index by byte offsets for each line + offsets: List[int] = [] + with open(path, "rb") as fb: + pos = 0 + for line in fb: + offsets.append(pos) + pos += len(line) + self._indices.extend((path, "jsonl", off) for off in offsets) + elif fmt == ".json": + try: + with open(path, "r", encoding="utf-8") as f: + content = json.load(f) + if not isinstance(content, list): + content = [content] + # store the content to avoid re-reading + # keep perfile cache + self._json_cache = getattr(self, "_json_cache", {}) + 
self._json_cache[path] = content + self._indices.extend((path, "json", i) for i in range(len(content))) + except Exception as e: + raise RuntimeError(f"Failed to index json lazily: {path}: {e}") + elif fmt == ".parquet": + try: + import pyarrow.parquet as pq # type: ignore + + pf = pq.ParquetFile(path) + num_rows = pf.metadata.num_rows + # file handle cache + self._parquet_cache = getattr(self, "_parquet_cache", {}) + self._parquet_cache[path] = pf + self._indices.extend((path, "parquet", i) for i in range(num_rows)) + except Exception: + df = pd.read_parquet(path) + self._parquet_df_cache = getattr(self, "_parquet_df_cache", {}) + self._parquet_df_cache[path] = df + self._indices.extend( + (path, "parquet_pd", i) for i in range(len(df)) + ) + else: + logging.warning(f"Unsupported format {fmt} for path {path}, skipping.") + + def _load_single_lazy(self, path: str, fmt: str, key: Any) -> Dict[str, Any]: + if fmt == "eager": + return self._records[int(key)] + if fmt == "jsonl": + with open(path, "rb") as fb: + fb.seek(int(key)) + line = fb.readline() + return json.loads(line.decode("utf-8").strip()) + if fmt == "json": + return self._json_cache[path][int(key)] # type: ignore[attr-defined] + if fmt == "parquet": + # Try to use pyarrow lazily; rebuild cache if missing + self._parquet_cache = getattr(self, "_parquet_cache", {}) + pf = self._parquet_cache.get(path) + if pf is None: + try: + import pyarrow.parquet as pq # type: ignore + + pf = pq.ParquetFile(path) + self._parquet_cache[path] = pf + except Exception: + # Fall back to pandas-based cache + self._parquet_df_cache = getattr(self, "_parquet_df_cache", {}) + df = self._parquet_df_cache.get(path) + if df is None: + df = pd.read_parquet(path) + self._parquet_df_cache[path] = df + return df.iloc[int(key)].to_dict() + table = pf.read_row_group(key // max(1, pf.metadata.num_rows), columns=None) + try: + df = table.to_pandas() + return df.iloc[int(key) % len(df)].to_dict() + except Exception: + df_all = 
class VLMDatasetRegistry:
    """Name -> dataset-class registry for VLM datasets.

    Dataset classes register themselves with the
    ``@VLMDatasetRegistry.register(name)`` decorator and are instantiated
    through :meth:`create`. Names are matched case-insensitively: ``register``
    lowercases the key it stores so that ``create`` (which lowercases the
    lookup) can always find it — previously a mixed-case registration was
    unreachable.
    """

    # Registered constructors keyed by lower-cased dataset name.
    registry: Dict[str, Callable[..., "VLMBaseDataset"]] = {}

    @classmethod
    def register(
        cls, name: str
    ) -> Callable[[Callable[..., "VLMBaseDataset"]], Callable[..., "VLMBaseDataset"]]:
        """Return a class decorator registering ``name`` (case-insensitive)."""

        def decorator(klass: Callable[..., "VLMBaseDataset"]):
            cls.registry[name.lower()] = klass
            return klass

        return decorator

    @classmethod
    def create(
        cls,
        dataset_name: Optional[str],
        *,
        data_paths: Union[List[str], str],
        config: "DictConfig",
        tokenizer: "AutoTokenizer",
    ) -> "VLMBaseDataset":
        """Instantiate the dataset class registered under ``dataset_name``.

        Args:
            dataset_name: Registered dataset name (case-insensitive).
            data_paths: File/directory path(s) forwarded to the dataset.
            config: Experiment configuration forwarded to the dataset.
            tokenizer: Tokenizer forwarded to the dataset.

        Returns:
            The constructed dataset instance.

        Raises:
            ValueError: If ``dataset_name`` is None or not registered
                (previously these crashed with AttributeError / TypeError).
        """
        if dataset_name is None:
            raise ValueError("dataset_name must be provided")
        key = dataset_name.lower()
        dataset_class = cls.registry.get(key)
        if dataset_class is None:
            raise ValueError(
                f"Unknown dataset {dataset_name!r}. "
                f"Registered datasets: {sorted(cls.registry)}"
            )
        return dataset_class(data_paths=data_paths, config=config, tokenizer=tokenizer)
config=config, tokenizer=tokenizer) + + +@VLMDatasetRegistry.register("robo2vlm") +class Robo2VLMDataset(VLMBaseDataset): + def __init__( + self, + data_paths: Union[List[str], str], + config: DictConfig, + tokenizer: AutoTokenizer, + ) -> None: + super().__init__(data_paths, config, tokenizer) + self.system_prompt = ( + "You are a helpful robotic vision assistant specialized in " + "answering questions about robotic manipulation tasks. " + "Use tags to show your reasoning process, " + "then provide your final answer in tags." + ) + + def get_image_list(self, dataitem: Dict[str, Any]) -> List[Union[bytes, str, None]]: + images: List[Any] = [] + if "images" in dataitem: + v = dataitem.get("images") + if isinstance(v, list): + images = list(v) + elif v is not None: + images = [v] + else: + images = [None] + elif "image" in dataitem: + v = dataitem.get("image") + if v is not None: + images = [v] + else: + images = [None] + else: + return super().get_image_list(dataitem) + + normed: List[Union[bytes, str, None]] = [] + for v in images: + if v is None: + continue + if isinstance(v, Image.Image): + normed.append(v) + elif isinstance(v, dict) and "bytes" in v: + normed.append(v["bytes"]) # raw bytes + else: + normed.append(v) # path/uri/string + if not normed: + normed = [None] + return normed + + def build_prompt_text(self, data_item: Dict[str, Any]) -> str: + # Use 'question' and 'choices' if present; else fallback to base using configured prompt/choice keys + question = data_item.get("question", None) + choices = data_item.get("choices", None) + if question is None: + return super().build_prompt_text(data_item) + # normalize choices + if isinstance(choices, str): + try: + import ast + + choices = ast.literal_eval(choices) + except Exception: + choices = [choices] + if not isinstance(choices, list): + choices = [choices] if choices is not None else [] + + text = f"{question}\n" + if choices: + text += "Choices:\n" + for i, c in enumerate(choices): + text += f"{chr(65 + 
i)}. {c}\n" + return text + + def postprocess_dataset_item( + self, item: DatasetItem, raw: Dict[str, Any] + ) -> DatasetItem: + answer_dict = { + "choices": raw.get("choices", None), + "correct_answer": raw.get("correct_answer", None), + } + item.answer = answer_dict + + return item diff --git a/rlinf/data/io_struct.py b/rlinf/data/io_struct.py index d34f3a32c..2fa0fea8a 100644 --- a/rlinf/data/io_struct.py +++ b/rlinf/data/io_struct.py @@ -13,19 +13,21 @@ # limitations under the License. from dataclasses import dataclass -from typing import Callable, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import torch from omegaconf import DictConfig -from vllm.outputs import CompletionOutput -from vllm.outputs import RequestOutput as VllmRequestOutput -from rlinf.data.datasets import batch_pad_to_fixed_len +if TYPE_CHECKING: + from vllm.outputs import CompletionOutput + from vllm.outputs import RequestOutput as VllmRequestOutput + +from rlinf.data.datasets.utils import batch_pad_to_fixed_len from rlinf.utils.data_iter_utils import ( get_iterator_k_split, split_list, ) - +import torch_npu def get_batch_size( batch: Dict[str, torch.Tensor], batch_tensor_key: str = "input_ids" @@ -47,26 +49,111 @@ class RolloutRequest: Attr input_ids: List of input token IDs for rollout n: Number of completions to generate for each input - idx: List of unique identifiers for the requests, used for tracking - input_lengths: List of lengths of the input sequences, corresponding to input_ids + image_data: list of image data (bytes or URLs) for multimodal inputs answers: Optional list of answers for the requests, if available + multi_modal_inputs: list of multi-modal inputs for the requests """ n: int input_ids: List[List[int]] + image_data: Union[List[List[bytes]], List[List[str]]] answers: List[str] + multi_modal_inputs: List[Dict] + + def repeat(self) -> "RolloutRequest": + """Repeat each input in the RolloutRequest a specified 
number of times. + + Args: + times (int): The number of times to repeat each input. + + Returns: + RolloutRequest: A new RolloutRequest with repeated inputs. + """ + assert self.n > 0, "n must be greater than 0" + + input_ids, answers, image_data, multi_modal_inputs = zip( + *[ + (input_id, answer, image_data, multi_modal_inputs) + for input_id, answer, image_data, multi_modal_inputs in zip( + self.input_ids, + self.answers, + self.image_data, + self.multi_modal_inputs, + ) + for _ in range(self.n) + ] + ) + return RolloutRequest( + n=self.n, + input_ids=list(input_ids), + answers=list(answers), + image_data=list(image_data), + multi_modal_inputs=list(multi_modal_inputs), + ) + + def split(self, num_splits: int) -> List["RolloutRequest"]: + """Split the RolloutRequest into multiple smaller requests. + + Args: + num_splits (int): The number of splits to create. + + Returns: + List[RolloutRequest]: A list of smaller RolloutRequest instances. + """ + assert num_splits > 0, "num_splits must be greater than 0" + assert len(self.input_ids) % num_splits == 0, ( + f"Input IDs length {len(self.input_ids)} is not divisible by num_splits {num_splits}" + ) + + input_ids_split_list = split_list(self.input_ids, num_splits) + answers_split_list = split_list(self.answers, num_splits) + image_data_split_list = split_list(self.image_data, num_splits) + multi_modal_inputs_split_list = split_list(self.multi_modal_inputs, num_splits) + + splitted_requests = [] + for ( + input_ids_batch, + answers_batch, + image_data_batch, + multi_modal_inputs_batch, + ) in zip( + input_ids_split_list, + answers_split_list, + image_data_split_list, + multi_modal_inputs_split_list, + ): + request = RolloutRequest( + n=self.n, + input_ids=input_ids_batch, + answers=answers_batch, + image_data=image_data_batch, + multi_modal_inputs=multi_modal_inputs_batch, + ) + splitted_requests.append(request) + + return splitted_requests def repeat_and_split( self, rollout_batch_size: Optional[int] = None ) -> 
List["RolloutRequest"]: - input_ids, answers = zip( + input_ids, answers, image_data, multi_modal_inputs = zip( *[ - (input_id, answer) - for input_id, answer in zip(self.input_ids, self.answers) + (input_id, answer, image_data, multi_modal_inputs) + for input_id, answer, image_data, multi_modal_inputs in zip( + self.input_ids, + self.answers, + self.image_data, + self.multi_modal_inputs, + ) for _ in range(self.n) ] ) - input_ids, answers = (list(input_ids), list(answers)) + input_ids, answers, image_data, multi_modal_inputs = ( + list(input_ids), + list(answers), + list(image_data), + list(multi_modal_inputs), + ) # Split input ids based on rollout_batch_size_per_gpu if rollout_batch_size is None: @@ -80,14 +167,26 @@ def repeat_and_split( splitted_requests = [] input_ids_split_list = split_list(input_ids, num_batches) answers_split_list = split_list(answers, num_batches) - - for input_ids_batch, answers_batch in zip( - input_ids_split_list, answers_split_list + image_data_split_list = split_list(image_data, num_batches) + multi_modal_inputs_split_list = split_list(multi_modal_inputs, num_batches) + + for ( + input_ids_batch, + answers_batch, + image_data_batch, + multi_modal_inputs_batch, + ) in zip( + input_ids_split_list, + answers_split_list, + image_data_split_list, + multi_modal_inputs_split_list, ): request = RolloutRequest( n=self.n, input_ids=input_ids_batch, answers=answers_batch, + image_data=image_data_batch, + multi_modal_inputs=multi_modal_inputs_batch, ) splitted_requests.append(request) @@ -202,8 +301,9 @@ class RolloutResult: advantages: Optional[List[float] | torch.Tensor] = None prompt_texts: Optional[List[str]] = None response_texts: Optional[List[str]] = None - answers: Optional[List[str]] = None - + answers: Optional[List[str | dict]] = None + image_data: Optional[Union[List[List[bytes]], List[List[str]]]] = None + multi_modal_inputs: Optional[List[dict]] = None # Inference # Only set when recompute_logprobs is False rollout_logprobs: 
Optional[List[List[float]]] = None @@ -254,12 +354,13 @@ def _get_attention_masks_and_position_ids( @staticmethod def from_vllm_results( group_size: int, - results: List[VllmRequestOutput], - answers: Optional[List[List[int]]] = None, + results: List["VllmRequestOutput"], + answers: Optional[List[str]] = None, + multi_modal_inputs: Optional[List[Dict]] = None, return_logprobs: bool = False, ) -> "RolloutResult": def get_logprobs( - response_ids: List[int], output: CompletionOutput + response_ids: List[int], output: "CompletionOutput" ) -> List[float]: logprobs = [] returned_logprobs = output.logprobs @@ -270,7 +371,14 @@ def get_logprobs( logprobs.append(logprob[response_ids[i]].logprob) return logprobs - num_sequences = len(results) + num_sequences = len(results) * group_size + + if multi_modal_inputs: + mm_inputs = [] + for mm_input in multi_modal_inputs: + mm_inputs.extend([mm_input] * group_size) + else: + mm_inputs = None prompt_lengths = [] prompt_ids = [] @@ -278,26 +386,43 @@ def get_logprobs( response_ids = [] logprobs = [] is_end = [] - for _, res in enumerate(results): - if res.prompt_token_ids is not None: - prompt_ids.append(res.prompt_token_ids) - prompt_lengths.append(len(res.prompt_token_ids)) + response_texts = [] + rollout_answers = ( + [answer for answer in answers for _ in range(group_size)] + if answers + else None + ) + for vllm_result in results: + if vllm_result.prompt_token_ids is not None: + prompt_ids.extend([vllm_result.prompt_token_ids] * group_size) + prompt_lengths.extend([len(vllm_result.prompt_token_ids)] * group_size) else: - return NotImplementedError("vllm should return tokenized prompt.") - response_id = list(res.outputs[0].token_ids) - response_ids.append(response_id) - response_lengths.append(len(response_id)) - is_end.append(res.finished) + raise NotImplementedError("vllm should return tokenized prompt.") + response_ids.extend( + [list(output.token_ids) for output in vllm_result.outputs] + ) + 
response_texts.extend([output.text for output in vllm_result.outputs]) + response_lengths.extend( + [len(output.token_ids) for output in vllm_result.outputs] + ) + is_end.extend([vllm_result.finished] * group_size) if return_logprobs: - logprobs.append(get_logprobs(response_id, res.outputs[0])) + logprobs.extend( + [ + get_logprobs(list(output.token_ids), output) + for output in vllm_result.outputs + ] + ) result: RolloutResult = RolloutResult( group_size=group_size, num_sequence=num_sequences, - answers=answers, + answers=rollout_answers, prompt_ids=prompt_ids, prompt_lengths=prompt_lengths, response_ids=response_ids, response_lengths=response_lengths, + response_texts=response_texts, + multi_modal_inputs=mm_inputs, is_end=is_end, ) if return_logprobs: @@ -310,6 +435,8 @@ def from_sglang_results( group_size: int, input_ids: List[List[int]], answers: Optional[List[List[int]]] = None, + image_data: Optional[Union[List[List[bytes]], List[List[str]]]] = None, + multi_modal_inputs: Optional[List[Dict]] = None, return_logprobs: bool = False, ) -> "RolloutResult": """Create a MathRolloutResult from the given results and input IDs. @@ -336,6 +463,8 @@ def from_sglang_results( response_lengths=[len(res["output_ids"]) for res in results], response_ids=[res["output_ids"] for res in results], answers=answers, + image_data=image_data, + multi_modal_inputs=multi_modal_inputs, is_end=[ res["meta_info"]["finish_reason"]["type"] == "stop" for res in results ], @@ -437,6 +566,161 @@ def merge_list(dst_list: List, src_list: List): return merged_result + @staticmethod + def split_result_list_by_group( + rollout_results: List["RolloutResult"], + ) -> List["RolloutResult"]: + """ + Split RolloutResult objects by group_size. + + If input has only one RolloutResult, split it into multiple RolloutResult objects by group_size. + If input has multiple RolloutResult objects, split each one and merge the results. 
+ + Args: + rollout_results: List of input RolloutResult objects + + Returns: + List of RolloutResult objects grouped by group_size + """ + assert len(rollout_results) > 0, "No rollout results to split." + + all_split_results = [] + + for rollout_result in rollout_results: + split_results = RolloutResult._split_single_result_by_group(rollout_result) + all_split_results.extend(split_results) + + return all_split_results + + @staticmethod + def _split_single_result_by_group( + rollout_result: "RolloutResult", + ) -> List["RolloutResult"]: + """ + Split a single RolloutResult into multiple RolloutResult objects by group_size. + + Args: + rollout_result: The RolloutResult to be split + + Returns: + List of split RolloutResult objects + """ + group_size = rollout_result.group_size + num_sequence = rollout_result.num_sequence + + assert num_sequence % group_size == 0, ( + f"num_sequence ({num_sequence}) must be divisible by group_size ({group_size})" + ) + + num_groups = num_sequence // group_size + split_results = [] + + # Split list fields + prompt_lengths_split = split_list(rollout_result.prompt_lengths, num_groups) + prompt_ids_split = split_list(rollout_result.prompt_ids, num_groups) + response_lengths_split = split_list(rollout_result.response_lengths, num_groups) + response_ids_split = split_list(rollout_result.response_ids, num_groups) + is_end_split = split_list(rollout_result.is_end, num_groups) + + # Handle optional fields + answers_split = None + if rollout_result.answers is not None: + answers_split = split_list(rollout_result.answers, num_groups) + + image_data_split = None + if rollout_result.image_data is not None: + image_data_split = split_list(rollout_result.image_data, num_groups) + + multi_modal_inputs_split = None + if rollout_result.multi_modal_inputs is not None: + multi_modal_inputs_split = split_list( + rollout_result.multi_modal_inputs, num_groups + ) + + prompt_texts_split = None + if rollout_result.prompt_texts is not None: + 
prompt_texts_split = split_list(rollout_result.prompt_texts, num_groups) + + response_texts_split = None + if rollout_result.response_texts is not None: + response_texts_split = split_list(rollout_result.response_texts, num_groups) + + rollout_logprobs_split = None + if rollout_result.rollout_logprobs is not None: + rollout_logprobs_split = split_list( + rollout_result.rollout_logprobs, num_groups + ) + + # Handle tensor fields + rewards_split = None + if rollout_result.rewards is not None: + if isinstance(rollout_result.rewards, torch.Tensor): + rewards_split = torch.chunk(rollout_result.rewards, num_groups, dim=0) + else: + rewards_split = split_list(rollout_result.rewards, num_groups) + + advantages_split = None + if rollout_result.advantages is not None: + if isinstance(rollout_result.advantages, torch.Tensor): + advantages_split = torch.chunk( + rollout_result.advantages, num_groups, dim=0 + ) + else: + advantages_split = split_list(rollout_result.advantages, num_groups) + + prev_logprobs_split = None + if rollout_result.prev_logprobs is not None: + prev_logprobs_split = torch.chunk( + rollout_result.prev_logprobs, num_groups, dim=0 + ) + + ref_logprobs_split = None + if rollout_result.ref_logprobs is not None: + ref_logprobs_split = torch.chunk( + rollout_result.ref_logprobs, num_groups, dim=0 + ) + + # Create split RolloutResult objects + for i in range(num_groups): + split_result = RolloutResult( + num_sequence=group_size, + group_size=group_size, + prompt_lengths=prompt_lengths_split[i], + prompt_ids=prompt_ids_split[i], + response_lengths=response_lengths_split[i], + response_ids=response_ids_split[i], + is_end=is_end_split[i], + answers=answers_split[i] if answers_split is not None else None, + image_data=image_data_split[i] + if image_data_split is not None + else None, + multi_modal_inputs=multi_modal_inputs_split[i] + if multi_modal_inputs_split is not None + else None, + prompt_texts=prompt_texts_split[i] + if prompt_texts_split is not None + else 
None, + response_texts=response_texts_split[i] + if response_texts_split is not None + else None, + rollout_logprobs=rollout_logprobs_split[i] + if rollout_logprobs_split is not None + else None, + rewards=rewards_split[i] if rewards_split is not None else None, + advantages=advantages_split[i] + if advantages_split is not None + else None, + prev_logprobs=prev_logprobs_split[i] + if prev_logprobs_split is not None + else None, + ref_logprobs=ref_logprobs_split[i] + if ref_logprobs_split is not None + else None, + ) + split_results.append(split_result) + + return split_results + def to_actor_batch( self, data_seq_length: int, @@ -525,17 +809,23 @@ def to_actor_batch( ) # [B, training_seq_length] batch = { - "input_ids": input_ids.cuda(), - "attention_mask": attention_mask.cuda(), - "is_end": is_end.cuda(), - "position_ids": position_ids.cuda(), - "prompt_lengths": prompt_lengths.cuda(), - "response_lengths": response_lengths.cuda(), + "input_ids": input_ids.npu(), + "attention_mask": attention_mask.npu(), + "is_end": is_end.npu(), + "position_ids": position_ids.npu(), + "prompt_lengths": prompt_lengths.npu(), + "response_lengths": response_lengths.npu(), } + if ( + self.multi_modal_inputs is not None + and self.multi_modal_inputs[0] is not None + ): + batch["multi_modal_inputs"] = self.multi_modal_inputs + if self.advantages is not None: if isinstance(self.advantages, torch.Tensor): - batch["advantages"] = self.advantages.cuda() + batch["advantages"] = self.advantages.npu() else: response_attention_mask = attention_mask[ :, -max_response_len: @@ -543,17 +833,17 @@ def to_actor_batch( advantages = torch.tensor(self.advantages, dtype=torch.float32).reshape( -1, 1 ) # [B, 1] - advantages = response_attention_mask.float().cuda() * advantages.cuda() - batch["advantages"] = advantages.cuda() + advantages = response_attention_mask.float().npu() * advantages.npu() + batch["advantages"] = advantages.npu() if self.prev_logprobs is not None: - batch["prev_logprobs"] = 
self.prev_logprobs.cuda() + batch["prev_logprobs"] = self.prev_logprobs.npu() if self.ref_logprobs is not None: - batch["ref_logprobs"] = self.ref_logprobs.cuda() + batch["ref_logprobs"] = self.ref_logprobs.npu() if self.rewards is not None: - batch["rewards"] = self.rewards.cuda() + batch["rewards"] = self.rewards.npu() if self.rollout_logprobs is not None: logprobs = batch_pad_to_fixed_len( @@ -564,7 +854,7 @@ def to_actor_batch( max_batch_len=max_response_len, pad_token=pad_token, ) - batch["prev_logprobs"] = logprobs.cuda() + batch["prev_logprobs"] = logprobs.npu() return batch @@ -578,14 +868,16 @@ def merge_batches( return merged_batch if len(batches) == 1: return batches[0] + for key in batches[0].keys(): - assert torch.is_tensor(batches[0][key]), ( - f"Expected tensor for key {key} in batches, got {type(batches[0][key])}" - ) - assert torch.is_tensor(batches[0][key]), ( - f"Expected tensor for key {key} in batches, got {type(batches[0][key])}" - ) - merged_batch[key] = torch.cat([batch[key] for batch in batches], dim=0) + if torch.is_tensor(batches[0][key]): + merged_batch[key] = torch.cat([batch[key] for batch in batches], dim=0) + elif isinstance(batches[0][key], list): + merged_batch[key] = [] + for batch in batches: + merged_batch[key].extend(batch[key]) + else: + raise ValueError(f"Unsupported batch key type: {type(batches[0][key])}") return merged_batch diff --git a/rlinf/envs/env_manager.py b/rlinf/envs/env_manager.py index f1cc38ffa..18d9bacbd 100644 --- a/rlinf/envs/env_manager.py +++ b/rlinf/envs/env_manager.py @@ -157,10 +157,13 @@ def recursive_to_own(obj): class EnvManager: - def __init__(self, cfg, rank, world_size, env_cls, enable_offload=False): + def __init__( + self, cfg, rank, seed_offset, total_num_processes, env_cls, enable_offload=False + ): self.cfg = cfg self.rank = rank - self.world_size = world_size + self.seed_offset = seed_offset + self.total_num_processes = total_num_processes self.process: Optional[mp.Process] = None 
self.command_queue: Optional[mp.Queue] = None self.result_queue: Optional[mp.Queue] = None @@ -181,7 +184,7 @@ def __init__(self, cfg, rank, world_size, env_cls, enable_offload=False): self.env = None else: self.env_cls = env_cls - self.env = self.env_cls(cfg, rank, world_size) + self.env = self.env_cls(cfg, seed_offset, total_num_processes) def start_simulator(self): """Start simulator process with shared memory queues""" @@ -202,7 +205,8 @@ def start_simulator(self): args=( self.cfg, self.rank, - self.world_size, + self.seed_offset, + self.total_num_processes, self.env_cls, self.command_queue, self.result_queue, @@ -213,7 +217,7 @@ def start_simulator(self): self.process.start() # Wait for initialization - result = self.result_queue.get(timeout=60) + result = self.result_queue.get() if result["status"] != "ready": raise RuntimeError(f"Simulator initialization failed: {result}") @@ -277,7 +281,8 @@ def __setattr__(self, name, value): if name in [ "cfg", "rank", - "world_size", + "seed_offset", + "total_num_processes", "process", "command_queue", "result_queue", @@ -321,7 +326,8 @@ def __setattr__(self, name, value): def _simulator_worker( cfg, rank, - world_size, + seed_offset, + total_num_processes, env_cls, command_queue, result_queue, @@ -340,7 +346,7 @@ def _simulator_worker( omegaconf_register() try: - simulator = env_cls(cfg, rank, world_size) + simulator = env_cls(cfg, seed_offset, total_num_processes) assert isinstance(simulator, EnvOffloadMixin), ( f"Environment class {env_cls.__name__} must inherit from EnvOffloadMixin" ) diff --git a/rlinf/envs/libero/libero_env.py b/rlinf/envs/libero/libero_env.py index 8f916c0f2..6471bf770 100644 --- a/rlinf/envs/libero/libero_env.py +++ b/rlinf/envs/libero/libero_env.py @@ -20,11 +20,12 @@ import numpy as np import torch from libero.libero import get_libero_path -from libero.libero.benchmark import Benchmark, get_benchmark +from libero.libero.benchmark import Benchmark from libero.libero.envs import 
OffScreenRenderEnv from omegaconf.omegaconf import OmegaConf from rlinf.envs.libero.utils import ( + get_benchmark_overridden, get_libero_image, get_libero_wrist_image, list_of_dict_to_dict_of_list, @@ -38,11 +39,11 @@ class LiberoEnv(gym.Env): - def __init__(self, cfg, rank, world_size): - self.rank = rank + def __init__(self, cfg, seed_offset, total_num_processes): + self.seed_offset = seed_offset self.cfg = cfg - self.world_size = world_size - self.seed = self.cfg.seed + rank + self.total_num_processes = total_num_processes + self.seed = self.cfg.seed + seed_offset self._is_start = True self.num_envs = self.cfg.num_envs self.group_size = self.cfg.group_size @@ -56,7 +57,7 @@ def __init__(self, cfg, rank, world_size): self._generator_ordered = np.random.default_rng(seed=0) self.start_idx = 0 - self.task_suite: Benchmark = get_benchmark(cfg.task_suite_name)() + self.task_suite: Benchmark = get_benchmark_overridden(cfg.task_suite_name)() self._compute_total_num_group_envs() self.reset_state_ids_all = self.get_reset_state_ids_all() @@ -147,20 +148,22 @@ def _get_random_reset_state_ids(self, num_reset_states): def get_reset_state_ids_all(self): reset_state_ids = np.arange(self.total_num_group_envs) - valid_size = len(reset_state_ids) - (len(reset_state_ids) % self.world_size) + valid_size = len(reset_state_ids) - ( + len(reset_state_ids) % self.total_num_processes + ) self._generator_ordered.shuffle(reset_state_ids) reset_state_ids = reset_state_ids[:valid_size] - reset_state_ids = reset_state_ids.reshape(self.world_size, -1) + reset_state_ids = reset_state_ids.reshape(self.total_num_processes, -1) return reset_state_ids def _get_ordered_reset_state_ids(self, num_reset_states): - reset_state_ids = self.reset_state_ids_all[self.rank][ + if self.start_idx + num_reset_states > len(self.reset_state_ids_all[0]): + self.reset_state_ids_all = self.get_reset_state_ids_all() + self.start_idx = 0 + reset_state_ids = self.reset_state_ids_all[self.seed_offset][ self.start_idx : 
self.start_idx + num_reset_states ] self.start_idx = self.start_idx + num_reset_states - if self.start_idx >= len(self.reset_state_ids_all[0]): - self.reset_state_ids_all = self.get_reset_state_ids_all() - self.start_idx = 0 return reset_state_ids def _get_task_and_trial_ids_from_reset_state_ids(self, reset_state_ids): @@ -473,7 +476,7 @@ def add_new_frames(self, raw_obs, plot_infos): self.render_images.append(full_image) def flush_video(self, video_sub_dir: Optional[str] = None): - output_dir = os.path.join(self.video_cfg.video_base_dir, f"rank_{self.rank}") + output_dir = os.path.join(self.video_cfg.video_base_dir, f"seed_{self.seed}") if video_sub_dir is not None: output_dir = os.path.join(output_dir, f"{video_sub_dir}") save_rollout_video( diff --git a/rlinf/envs/libero/utils.py b/rlinf/envs/libero/utils.py index e55c353e7..3c92f30d9 100644 --- a/rlinf/envs/libero/utils.py +++ b/rlinf/envs/libero/utils.py @@ -19,9 +19,11 @@ from typing import Any, Dict, List, Optional, Tuple, Union import imageio +import libero.libero.benchmark as benchmark import numpy as np import torch from libero.libero import get_libero_path +from libero.libero.benchmark import Benchmark from libero.libero.envs import OffScreenRenderEnv from PIL import Image, ImageDraw, ImageFont @@ -447,3 +449,47 @@ def save_rollout_video( for img in rollout_images: video_writer.append_data(img) video_writer.close() + + +def get_benchmark_overridden(benchmark_name) -> Benchmark: + """ + Return the Benchmark class for a given name. + For "libero_130": return a dynamically aggregated class from all suites. + For others: delegate to the original LIBERO get_benchmark. 
+ + Args: + benchmark_name: Name of the benchmark to get + + Returns: + Benchmark class + """ + name = str(benchmark_name).lower() + if name != "libero_130": + return benchmark.get_benchmark(benchmark_name) + + libreo_cls = benchmark.BENCHMARK_MAPPING.get("libero_130", None) + if libreo_cls is not None: + return libreo_cls + + # Build aggregated task map once, preserving order and de-duplicating by task name + aggregated_task_map: Dict[str, benchmark.Task] = {} + for suite_name in getattr(benchmark, "libero_suites", []): + suite_map = benchmark.task_maps.get(suite_name, {}) + for task_name, task in suite_map.items(): + if task_name not in aggregated_task_map: + aggregated_task_map[task_name] = task + + class LIBERO_ALL(Benchmark): + def __init__(self, task_order_index=0): + super().__init__(task_order_index=task_order_index) + self.name = "libero_130" + self._make_benchmark() + + def _make_benchmark(self): + tasks = list(aggregated_task_map.values()) + self.tasks = tasks + self.n_tasks = len(self.tasks) + + # Register for discoverability/help + benchmark.BENCHMARK_MAPPING["libero_130"] = LIBERO_ALL + return LIBERO_ALL diff --git a/rlinf/envs/maniskill/maniskill_env.py b/rlinf/envs/maniskill/maniskill_env.py index 715d490fc..8d79315e2 100644 --- a/rlinf/envs/maniskill/maniskill_env.py +++ b/rlinf/envs/maniskill/maniskill_env.py @@ -47,11 +47,10 @@ def extract_termination_from_info(info, num_envs, device): class ManiskillEnv(gym.Env): - def __init__(self, cfg, rank, world_size, record_metrics=True): + def __init__(self, cfg, seed_offset, total_num_processes, record_metrics=True): env_seed = cfg.seed - self.seed = env_seed + rank - self.rank = rank - self.world_size = world_size + self.seed = env_seed + seed_offset + self.total_num_processes = total_num_processes self.auto_reset = cfg.auto_reset self.use_rel_reward = cfg.use_rel_reward self.ignore_terminations = cfg.ignore_terminations @@ -111,15 +110,12 @@ def instruction(self): def _init_reset_state_ids(self): 
self._generator = torch.Generator() self._generator.manual_seed(self.seed) - self.all_reset_state_ids = torch.randperm( - self.total_num_group_envs, generator=self._generator - ).to(self.device) self.update_reset_state_ids() def update_reset_state_ids(self): reset_state_ids = torch.randint( low=0, - high=len(self.all_reset_state_ids), + high=self.total_num_group_envs, size=(self.num_group,), generator=self._generator, ) @@ -370,8 +366,14 @@ def add_new_frames(self, infos, rewards=None): image = self.render(infos, rewards) self.render_images.append(image) + def add_new_frames_from_obs(self, raw_obs): + """For debugging render""" + raw_imgs = common.to_numpy(raw_obs["images"].permute(0, 2, 3, 1)) + raw_full_img = tile_images(raw_imgs, nrows=int(np.sqrt(self.num_envs))) + self.render_images.append(raw_full_img) + def flush_video(self, video_sub_dir: Optional[str] = None): - output_dir = os.path.join(self.video_cfg.video_base_dir, f"rank_{self.rank}") + output_dir = os.path.join(self.video_cfg.video_base_dir, f"seed_{self.seed}") if video_sub_dir is not None: output_dir = os.path.join(output_dir, f"{video_sub_dir}") images_to_video( diff --git a/rlinf/envs/maniskill/tasks/put_on_in_scene_multi.py b/rlinf/envs/maniskill/tasks/put_on_in_scene_multi.py index 85e92a057..94de7ebc3 100644 --- a/rlinf/envs/maniskill/tasks/put_on_in_scene_multi.py +++ b/rlinf/envs/maniskill/tasks/put_on_in_scene_multi.py @@ -14,7 +14,6 @@ import os from pathlib import Path -from typing import Optional import cv2 import numpy as np @@ -745,11 +744,8 @@ def _green_sceen_rgb( return rgb_ret - def get_obs(self, info: Optional[dict] = None, unflattened=True): - assert unflattened - obs = super().get_obs(info) - - # "greenscreen" process + def _get_obs_sensor_data(self, apply_texture_transforms=True): + sensor_obs = super()._get_obs_sensor_data(apply_texture_transforms) if ( self.obs_mode_struct.visual.rgb and self.obs_mode_struct.visual.segmentation @@ -757,27 +753,23 @@ def get_obs(self, info: 
Optional[dict] = None, unflattened=True): ): # get the actor ids of objects to manipulate; note that objects here are not articulated camera_name = self.rgb_camera_name - assert "segmentation" in obs["sensor_data"][camera_name].keys() + assert "segmentation" in sensor_obs[camera_name].keys() - overlay_img = self.overlay_images.to( - obs["sensor_data"][camera_name]["rgb"].device - ) - overlay_texture = self.overlay_textures.to( - obs["sensor_data"][camera_name]["rgb"].device - ) - overlay_mix = self.overlay_mix.to( - obs["sensor_data"][camera_name]["rgb"].device - ) + raw_rgb_device = sensor_obs[camera_name]["rgb"].device + + overlay_img = self.overlay_images.to(raw_rgb_device) + overlay_texture = self.overlay_textures.to(raw_rgb_device) + overlay_mix = self.overlay_mix.to(raw_rgb_device) green_screened_rgb = self._green_sceen_rgb( - obs["sensor_data"][camera_name]["rgb"], - obs["sensor_data"][camera_name]["segmentation"], + sensor_obs[camera_name]["rgb"], + sensor_obs[camera_name]["segmentation"], overlay_img, overlay_texture, overlay_mix, ) - obs["sensor_data"][camera_name]["rgb"] = green_screened_rgb - return obs + sensor_obs[camera_name]["rgb"] = green_screened_rgb + return sensor_obs # widowx @property diff --git a/rlinf/envs/offload_wrapper/maniskill_wrapper.py b/rlinf/envs/offload_wrapper/maniskill_wrapper.py index 9317a6ea8..734cd6c50 100644 --- a/rlinf/envs/offload_wrapper/maniskill_wrapper.py +++ b/rlinf/envs/offload_wrapper/maniskill_wrapper.py @@ -65,7 +65,6 @@ def get_state(self) -> bytes: "action_space_state": action_space_state, "prev_step_reward": self.prev_step_reward.cpu(), "reset_state_ids": self.reset_state_ids.cpu(), - "all_reset_state_ids": self.all_reset_state_ids.cpu(), "generator_state": self._generator.get_state(), "is_start": self.is_start, "video_cnt": self.video_cnt, @@ -176,7 +175,6 @@ def load_state(self, state_buffer: bytes): # Restore simulator task state self.prev_step_reward = state["prev_step_reward"].to(self.device) 
self.reset_state_ids = state["reset_state_ids"].to(self.device) - self.all_reset_state_ids = state["all_reset_state_ids"].to(self.device) self._generator.set_state(state["generator_state"]) self.is_start = state["is_start"] diff --git a/rlinf/envs/robotwin/RoboTwin_env.py b/rlinf/envs/robotwin/RoboTwin_env.py index b43f98c9a..aac0be6aa 100644 --- a/rlinf/envs/robotwin/RoboTwin_env.py +++ b/rlinf/envs/robotwin/RoboTwin_env.py @@ -247,11 +247,11 @@ def update_obs(observation): class RoboTwin(gym.Env): - def __init__(self, cfg, rank, world_size, record_metrics=True): + def __init__(self, cfg, seed_offset, total_num_processes, record_metrics=True): # Get parameters from configuration self.cfg = cfg - self.rank = rank - self.world_size = world_size + self.seed_offset = seed_offset + self.total_num_processes = total_num_processes self.record_metrics = record_metrics self._is_start = True self.info_logging_keys = ["is_src_obj_grasped", "consecutive_grasp", "success"] diff --git a/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py b/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py index 5b2fba92d..690b3d4f5 100644 --- a/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py +++ b/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py @@ -17,17 +17,25 @@ import torch import torch.optim as optim from omegaconf import DictConfig +from torch.distributed.fsdp import ( + BackwardPrefetch, + MixedPrecision, + ShardingStrategy, + StateDictType, +) from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.distributed.fsdp import MixedPrecision, ShardingStrategy, StateDictType -from transformers import AutoModelForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForVision2Seq from rlinf.config import torch_dtype_from_precision +from rlinf.data.tokenizers import hf_tokenizer from rlinf.hybrid_engines.fsdp.utils import ( get_fsdp_wrap_policy, init_fn, ) +from rlinf.utils.logging import get_logger from rlinf.utils.utils import clear_memory +import 
torch_npu class FSDPModelManager: """ @@ -36,49 +44,70 @@ class FSDPModelManager: def __init__(self, cfg: DictConfig): self._cfg = cfg + self.logger = get_logger() self.torch_dtype = torch_dtype_from_precision(self._cfg.model.precision) - assert ( - self.torch_dtype == torch.float16 or self.torch_dtype == torch.bfloat16 - ), ( - f"Precision {self._cfg.model.precision} is not supported, only support bf16 and fp16." - ) + self.tokenizer = hf_tokenizer(cfg.tokenizer.tokenizer_model) def model_provider_func(self) -> torch.nn.Module: - if self._cfg.model.get("gptq_model", False): + cfg = self._cfg + use_gptq = cfg.model.get("gptq_model", False) + load_in_8bit = cfg.model.get("load_in_8bit", False) + + use_triton = cfg.get("use_triton", True) + + assert torch.npu.is_available(), "CUDA is not available." + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + device = torch.npu.device(f"npu:{local_rank}") + + model_config = AutoConfig.from_pretrained( + cfg.model.model_path, + trust_remote_code=True, + attn_implementation="flash_attention_2", + ) + + if use_gptq: from auto_gptq import AutoGPTQForCausalLM model_wrapper = AutoGPTQForCausalLM.from_quantized( - self._cfg.model.model_path, device="cuda:0", use_triton=True + cfg.model.model_path, + device=device, + use_triton=use_triton, ) model = model_wrapper.model - elif self._cfg.model.get("load_in_8bit", False): + elif load_in_8bit: model = AutoModelForCausalLM.from_pretrained( - self._cfg.model.model_path, - device_map=self._cfg.model.get("device_map", "auto"), + cfg.model.model_path, + config=model_config, load_in_8bit=True, ) else: - # default load in float16 - model = AutoModelForCausalLM.from_pretrained( - self._cfg.model.model_path, + if type(model_config) in AutoModelForVision2Seq._model_mapping.keys(): + auto_model_class = AutoModelForVision2Seq + else: + auto_model_class = AutoModelForCausalLM + + model = auto_model_class.from_pretrained( + cfg.model.model_path, torch_dtype=self.torch_dtype, - 
device_map=self._cfg.model.get("device_map", "auto"), + config=model_config, trust_remote_code=True, - use_safetensors=self._cfg.model.get("use_safetensors", False), ) - if torch.cuda.is_available(): - model = model.cuda() - if self.torch_dtype == torch.float16: - model = model.half() + if torch.distributed.is_initialized(): + torch.distributed.barrier() return model def setup_model_and_optimizer(self): """Setup model and optimizer.""" module = self.model_provider_func() - module.gradient_checkpointing_enable() + # Enable gradient checkpointing if configured + if self._cfg.model.get("gradient_checkpointing", False): + self.logger.info("[FSDP] Enabling gradient checkpointing") + module.gradient_checkpointing_enable() + else: + self.logger.info("[FSDP] Gradient checkpointing is disabled") mixed_precision = MixedPrecision( param_dtype=self.torch_dtype, @@ -101,12 +130,19 @@ def setup_model_and_optimizer(self): self.model = FSDP( module, param_init_fn=init_fn, - use_orig_params=True, auto_wrap_policy=auto_wrap_policy, device_id=int(os.environ["LOCAL_RANK"]), sharding_strategy=sharding_strategy, # zero3 mixed_precision=mixed_precision, sync_module_states=True, + forward_prefetch=self._cfg.fsdp.forward_prefetch, + backward_prefetch=( + BackwardPrefetch.BACKWARD_PRE + if self._cfg.fsdp.backward_prefetch + else None + ), + limit_all_gathers=self._cfg.fsdp.limit_all_gathers, + use_orig_params=self._cfg.fsdp.use_orig_params, ) # NOTE: Currently we assume that only the value head contains "value_head" in its name. 
@@ -123,7 +159,7 @@ def setup_model_and_optimizer(self): }, ] - if self._cfg.model.vh_mode in ["a", "a0", "a6"]: + if self._cfg.model.get("vh_mode", None) in ["a", "a0", "a6"]: param_groups.append( { "params": [ diff --git a/rlinf/hybrid_engines/fsdp/utils.py b/rlinf/hybrid_engines/fsdp/utils.py index 0e136bffb..0a9f0054d 100644 --- a/rlinf/hybrid_engines/fsdp/utils.py +++ b/rlinf/hybrid_engines/fsdp/utils.py @@ -30,7 +30,6 @@ import torch from accelerate import init_empty_weights -from prismatic.extern.hf.modeling_prismatic import PrismaticProjector from torch.distributed.fsdp.wrap import ( transformer_auto_wrap_policy, ) @@ -59,7 +58,7 @@ def cpu_init_weights(): return init_context -def get_fsdp_wrap_policy(module, config=None, is_lora=False): +def get_fsdp_wrap_policy(module, config=None, is_lora=False, is_vla_model=False): """ FSDP wrap policy that handles both standard transformer models and VLA models. @@ -77,11 +76,8 @@ def get_fsdp_wrap_policy(module, config=None, is_lora=False): if config.get("disable", False): return None - # Check if this is a VLA model by looking for language_model attribute - is_vla_model = hasattr(module, "language_model") - # Get transformer layer classes to wrap - if is_vla_model: + if hasattr(module, "language_model"): # For VLA models, get transformer classes from language_model submodule default_transformer_cls_names_to_wrap = getattr( module.language_model, "_no_split_modules", None @@ -111,6 +107,12 @@ def get_fsdp_wrap_policy(module, config=None, is_lora=False): policies.append(vit_wrap_policy) # Prismatic projector policy for VLA models + # The prismatic package initializes a DistributedOverwatch by default, + # which initializes accelerate.PartialState, which in turn + # initializes a torch.distributed process group in gloo. + # This results in default group being gloo, which does not support CUDA tensors and allreduce average. 
+ from prismatic.extern.hf.modeling_prismatic import PrismaticProjector + prismatic_fsdp_wrapping_policy = functools.partial( _module_wrap_policy, module_classes={PrismaticProjector}, diff --git a/rlinf/hybrid_engines/megatron/megatron_model_manager.py b/rlinf/hybrid_engines/megatron/megatron_model_manager.py index 57f34dbf5..6fe4fbfd6 100644 --- a/rlinf/hybrid_engines/megatron/megatron_model_manager.py +++ b/rlinf/hybrid_engines/megatron/megatron_model_manager.py @@ -184,6 +184,7 @@ def model_provider_func(self, pre_process, post_process): return model def optimizer_step(self, increment): + clear_memory() success, grad_norm, num_zeros_in_grad = self.optimizer.step() self.lr_scheduler.step(increment=increment) diff --git a/rlinf/hybrid_engines/sglang/sglang_0_4_4/sgl_scheduler.py b/rlinf/hybrid_engines/sglang/sglang_0_4_4/sgl_scheduler.py index 509c56bab..87f736798 100644 --- a/rlinf/hybrid_engines/sglang/sglang_0_4_4/sgl_scheduler.py +++ b/rlinf/hybrid_engines/sglang/sglang_0_4_4/sgl_scheduler.py @@ -108,10 +108,13 @@ def __init__( placement )[(self.get_parent_rank(), self._rank)] + use_presharded_weights = ( + False if self.cfg.actor.training_backend == "fsdp" else True + ) # it's important to use load_weight to load resharded weight from megatron for _, module in self.tp_worker.worker.model_runner.model.named_modules(): if hasattr(module, "use_presharded_weights"): - module.use_presharded_weights = True + module.use_presharded_weights = use_presharded_weights self._logger.info( f"Running Scheduler dp rank {self.get_parent_rank()}, tp rank {self.tp_rank}, corresponding actor weight rank = {self.actor_weight_rank}" diff --git a/rlinf/hybrid_engines/sglang/sglang_0_4_6/sgl_scheduler.py b/rlinf/hybrid_engines/sglang/sglang_0_4_6/sgl_scheduler.py index ef503527b..9a69b8548 100644 --- a/rlinf/hybrid_engines/sglang/sglang_0_4_6/sgl_scheduler.py +++ b/rlinf/hybrid_engines/sglang/sglang_0_4_6/sgl_scheduler.py @@ -110,10 +110,13 @@ def __init__( self.actor_weight_rank = 
RankMapper.get_rollout_rank_to_actor_rank_map( placement )[(self.get_parent_rank(), self._rank)] + use_presharded_weights = ( + False if self.cfg.actor.training_backend == "fsdp" else True + ) # it's important to use load_weight to load resharded weight from megatron for _, module in self.tp_worker.worker.model_runner.model.named_modules(): if hasattr(module, "use_presharded_weights"): - module.use_presharded_weights = True + module.use_presharded_weights = use_presharded_weights self._logger.info( f"Running Scheduler dp rank {self.get_parent_rank()}, tp rank {self.tp_rank}, corresponding actor weight rank = {self.actor_weight_rank}" diff --git a/rlinf/hybrid_engines/sglang/sglang_0_4_9/sgl_scheduler.py b/rlinf/hybrid_engines/sglang/sglang_0_4_9/sgl_scheduler.py index a7057a161..1f9beb409 100644 --- a/rlinf/hybrid_engines/sglang/sglang_0_4_9/sgl_scheduler.py +++ b/rlinf/hybrid_engines/sglang/sglang_0_4_9/sgl_scheduler.py @@ -112,10 +112,13 @@ def __init__( self.actor_weight_rank = RankMapper.get_rollout_rank_to_actor_rank_map( placement )[(self.get_parent_rank(), self._rank)] + use_presharded_weights = ( + False if self.cfg.actor.training_backend == "fsdp" else True + ) # it's important to use load_weight to load resharded weight from megatron for _, module in self.tp_worker.worker.model_runner.model.named_modules(): if hasattr(module, "use_presharded_weights"): - module.use_presharded_weights = True + module.use_presharded_weights = use_presharded_weights self._logger.info( f"Running Scheduler dp rank {self.get_parent_rank()}, tp rank {self.tp_rank}, corresponding actor weight rank = {self.actor_weight_rank}" diff --git a/rlinf/hybrid_engines/sglang/sglang_0_5_2/__init__.py b/rlinf/hybrid_engines/sglang/sglang_0_5_2/__init__.py new file mode 100644 index 000000000..5b365ea1e --- /dev/null +++ b/rlinf/hybrid_engines/sglang/sglang_0_5_2/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 The RLinf Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/rlinf/hybrid_engines/sglang/sglang_0_5_2/io_struct.py b/rlinf/hybrid_engines/sglang/sglang_0_5_2/io_struct.py new file mode 100644 index 000000000..960d40eb0 --- /dev/null +++ b/rlinf/hybrid_engines/sglang/sglang_0_5_2/io_struct.py @@ -0,0 +1,59 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional + + +@dataclass +class TaskMethodInput: + method_name: str + args: List[Any] = field(default_factory=list) + kwargs: Dict[str, Any] = field(default_factory=dict) + + +@dataclass +class TaskMethodOutput: + method_name: str + result: Optional[Any] = None + + +@dataclass +class OffloadReqInput: + pass + + +@dataclass +class OffloadReqOutput: + pass + + +@dataclass +class SyncWeightInput: + pass + + +@dataclass +class SyncWeightOutput: + pass + + +@dataclass +class SyncHFWeightInput: + pass + + +@dataclass +class SyncHFWeightOutput: + pass diff --git a/rlinf/hybrid_engines/sglang/sglang_0_5_2/sgl_engine.py b/rlinf/hybrid_engines/sglang/sglang_0_5_2/sgl_engine.py new file mode 100644 index 000000000..e8e05c88b --- /dev/null +++ b/rlinf/hybrid_engines/sglang/sglang_0_5_2/sgl_engine.py @@ -0,0 +1,363 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import asyncio +import atexit +import logging +import multiprocessing as mp +import os +import random +import signal +import threading +import time +from typing import Dict, Optional, Tuple + +import uvloop +import zmq +from omegaconf import DictConfig +from sglang.srt.entrypoints.engine import Engine as _Engine +from sglang.srt.managers.data_parallel_controller import ( + run_data_parallel_controller_process, +) +from sglang.srt.managers.detokenizer_manager import run_detokenizer_process +from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerRouter + +# from sglang.srt.managers.scheduler import run_scheduler_process +# from sglang.srt.managers.tokenizer_manager import TokenizerManager +from sglang.srt.managers.template_manager import TemplateManager +from sglang.srt.server_args import PortArgs, ServerArgs +from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter +from sglang.srt.utils import ( + assert_pkg_version, + configure_logger, + get_bool_env_var, + get_zmq_socket, + is_cuda, + kill_process_tree, + launch_dummy_health_check_server, + prepare_model_and_tokenizer, + set_prometheus_multiproc_dir, + set_ulimit, +) + +from rlinf.scheduler import WorkerAddress +from rlinf.utils.placement import ComponentPlacement + +from .io_struct import OffloadReqInput, SyncHFWeightInput, SyncWeightInput +from .sgl_scheduler import run_scheduler_process +from .tokenizer_manager import TokenizerManager + +# Fix a bug of Python threading +setattr(threading, "_register_atexit", lambda *args, **kwargs: None) + +logger = logging.getLogger(__name__) +asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) + +_is_cuda = is_cuda() + + +class Engine(_Engine): + def __init__( + self, + parent_address: WorkerAddress, + placement: ComponentPlacement, + config: DictConfig, + dp_rank: int, + **kwargs, + ): + """ + The arguments of this function is the same as `sglang/srt/server_args.py::ServerArgs`. + Please refer to `ServerArgs` for the documentation. 
+ """ + if "server_args" in kwargs: + # Directly load server_args + server_args = kwargs["server_args"] + else: + # Construct server_args from kwargs + if "log_level" not in kwargs: + # Do not print logs by default + kwargs["log_level"] = "error" + server_args = ServerArgs(**kwargs) + + # Shutdown the subprocesses automatically when the program exits + atexit.register(self.shutdown) + + # Allocate ports for inter-process communications + self.port_args = PortArgs.init_new(server_args) + + # Launch subprocesses + tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses( + parent_address=parent_address, + placement=placement, + config=config, + dp_rank=dp_rank, + server_args=server_args, + port_args=self.port_args, + ) + + self.server_args = server_args + self.tokenizer_manager = tokenizer_manager + self.scheduler_info = scheduler_info + self.template_manager = template_manager + + context = zmq.Context(2) + self.send_to_rpc = get_zmq_socket( + context, zmq.DEALER, self.port_args.rpc_ipc_name, True + ) + + def offload_model_weights(self): + """Offload model weights to meta.""" + obj = OffloadReqInput() + loop = asyncio.get_event_loop() + return loop.run_until_complete( + self.tokenizer_manager.offload_model_weights(obj, None) + ) + + def sync_hf_weight(self): + obj = SyncHFWeightInput() + loop = asyncio.get_event_loop() + return loop.run_until_complete(self.tokenizer_manager.sync_hf_weight(obj)) + + def sync_weight(self): + obj = SyncWeightInput() + loop = asyncio.get_event_loop() + return loop.run_until_complete(self.tokenizer_manager.sync_weight(obj)) + + +def _set_envs_and_config(server_args: ServerArgs): + # Set global environments + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" + os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem)) + if not server_args.enable_symm_mem: + os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls)) + os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4" + os.environ["CUDA_MODULE_LOADING"] = 
"AUTO" + # flashinfer uses this environment variable for various kernels from MoE to quant kernels + os.environ["TRTLLM_ENABLE_PDL"] = "1" + + # Can also be passed as argument + os.environ["SGLANG_RUN_ID"] = ( + f"sglang-run-{time.time()}-{random.randint(0, 100000000)}" + ) + + # Set prometheus env vars + if server_args.enable_metrics: + set_prometheus_multiproc_dir() + + # Set ulimit + set_ulimit() + + # Check flashinfer version + if server_args.attention_backend == "flashinfer": + assert_pkg_version( + "flashinfer_python", + "0.3.0", + "Please uninstall the old version and " + "reinstall the latest version by following the instructions " + "at https://docs.flashinfer.ai/installation.html.", + ) + if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"): + assert_pkg_version( + "sgl-kernel", + "0.3.8", + "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`", + ) + + if True: # Keep this check for internal code compatibility + # Register the signal handler. + # The child processes will send SIGQUIT to this process when any error happens + # This process then clean up the whole process tree + # Note: This sigquit handler is used in the launch phase, and may be replaced by + # the running_phase_sigquit_handler in the tokenizer manager after the grpc server is launched. + def launch_phase_sigquit_handler(signum, frame): + logger.error( + "Received sigquit from a child process. It usually means the child failed." 
+ ) + kill_process_tree(os.getpid()) + + signal.signal(signal.SIGQUIT, launch_phase_sigquit_handler) + + # Set mp start method + mp.set_start_method("spawn", force=True) + + +def _launch_subprocesses( + parent_address: WorkerAddress, + placement: ComponentPlacement, + config: DictConfig, + dp_rank: int, + server_args: ServerArgs, + port_args: Optional[PortArgs] = None, +) -> Tuple[TokenizerManager, TemplateManager, Dict]: + """ + Launch the TokenizerManager in the main process, the Scheduler in a subprocess, and the DetokenizerManager in another subprocess. + """ + + assert server_args.pp_size == 1, ( + "RLinf currently only supports and validates pp_size=1." + ) + + # Configure global environment + configure_logger(server_args) + server_args.check_server_args() + _set_envs_and_config(server_args) + + # Allocate ports for inter-process communications + if port_args is None: + port_args = PortArgs.init_new(server_args) + logger.info(f"{server_args=}") + + # If using model from www.modelscope.cn, first download the model. 
+ server_args.model_path, server_args.tokenizer_path = prepare_model_and_tokenizer( + server_args.model_path, server_args.tokenizer_path + ) + + scheduler_procs = [] + if server_args.dp_size == 1: + memory_saver_adapter = TorchMemorySaverAdapter.create( + enable=server_args.enable_memory_saver + ) + scheduler_pipe_readers = [] + + nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1) + tp_size_per_node = server_args.tp_size // nnodes_per_tp_group + tp_rank_range = range( + tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group), + tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1), + ) + + pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1) + pp_rank_range = range( + pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group), + pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1), + ) + + for pp_rank in pp_rank_range: + for tp_rank in tp_rank_range: + reader, writer = mp.Pipe(duplex=False) + gpu_id = ( + server_args.base_gpu_id + + ((pp_rank % pp_size_per_node) * tp_size_per_node) + + (tp_rank % tp_size_per_node) * server_args.gpu_id_step + ) + moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size) + proc = mp.Process( + target=run_scheduler_process, + args=( + parent_address, + placement, + config, + server_args.tp_size * server_args.pp_size, + tp_rank + pp_rank * server_args.pp_size, + server_args, + port_args, + gpu_id, + tp_rank, + moe_ep_rank, + pp_rank, + None, + writer, + None, + ), + ) + + with memory_saver_adapter.configure_subprocess(): + proc.start() + scheduler_procs.append(proc) + scheduler_pipe_readers.append(reader) + else: + # Launch the data parallel controller + reader, writer = mp.Pipe(duplex=False) + scheduler_pipe_readers = [reader] + proc = mp.Process( + target=run_data_parallel_controller_process, + args=(server_args, port_args, writer), + ) + proc.start() + scheduler_procs.append(proc) + + if server_args.node_rank >= 1: + # In multi-node 
cases, non-zero rank nodes do not need to run tokenizer or detokenizer, + # so they can just wait here. + + for reader in scheduler_pipe_readers: + data = reader.recv() + assert data["status"] == "ready" + + if os.getenv("SGLANG_BLOCK_NONZERO_RANK_CHILDREN") == "0": + # When using `Engine` as a Python API, we don't want to block here. + return None, None, None + + launch_dummy_health_check_server( + server_args.host, server_args.port, server_args.enable_metrics + ) + + for proc in scheduler_procs: + proc.join() + logger.error( + f"Scheduler or DataParallelController {proc.pid} terminated with {proc.exitcode}" + ) + return None, None, None + + # Launch detokenizer process + detoken_proc = mp.Process( + target=run_detokenizer_process, + args=( + server_args, + port_args, + ), + ) + detoken_proc.start() + if server_args.tokenizer_worker_num > 1: + # Launch multi-tokenizer router + tokenizer_manager = MultiTokenizerRouter(server_args, port_args) + + # Initialize templates + template_manager = None + else: + # Launch tokenizer process + tokenizer_manager = TokenizerManager(server_args, port_args) + + # Initialize templates + template_manager = TemplateManager() + template_manager.initialize_templates( + tokenizer_manager=tokenizer_manager, + model_path=server_args.model_path, + chat_template=server_args.chat_template, + completion_template=server_args.completion_template, + ) + + # Wait for the model to finish loading + scheduler_infos = [] + for i in range(len(scheduler_pipe_readers)): + try: + data = scheduler_pipe_readers[i].recv() + except EOFError: + logger.error( + f"Rank {i} scheduler is dead. Please check if there are relevant logs." + ) + scheduler_procs[i].join() + logger.error(f"Exit code: {scheduler_procs[i].exitcode}") + raise + + if data["status"] != "ready": + raise RuntimeError( + "Initialization failed. Please see the error messages above." 
+ ) + scheduler_infos.append(data) + + # Assume all schedulers have the same scheduler_info + scheduler_info = scheduler_infos[0] + tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"] + return tokenizer_manager, template_manager, scheduler_info diff --git a/rlinf/hybrid_engines/sglang/sglang_0_5_2/sgl_scheduler.py b/rlinf/hybrid_engines/sglang/sglang_0_5_2/sgl_scheduler.py new file mode 100644 index 000000000..2cfb69abc --- /dev/null +++ b/rlinf/hybrid_engines/sglang/sglang_0_5_2/sgl_scheduler.py @@ -0,0 +1,476 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import faulthandler +import logging +import os +import signal +from typing import Optional + +import psutil +import setproctitle +import torch +from omegaconf import DictConfig +from sglang.srt.disaggregation.utils import ( + DisaggregationMode, +) +from sglang.srt.managers.io_struct import ( + ReleaseMemoryOccupationReqInput, + ResumeMemoryOccupationReqInput, +) +from sglang.srt.managers.scheduler import Scheduler as _Scheduler +from sglang.srt.managers.scheduler import logger +from sglang.srt.managers.utils import DPBalanceMeta +from sglang.srt.server_args import PortArgs, ServerArgs +from sglang.srt.utils import ( + broadcast_pyobj, + configure_logger, + get_bool_env_var, + kill_itself_when_parent_died, + set_gpu_proc_affinity, + suppress_other_loggers, +) +from sglang.utils import get_exception_traceback + +from rlinf.scheduler import Worker, WorkerAddress +from rlinf.utils.placement import ModelParallelComponentPlacement, PlacementMode +from rlinf.workers.rollout.utils import ( + RankMapper, + get_module_from_name, + rebind_param_attr, + swap_tensor_pointer, +) + +from .io_struct import ( + OffloadReqInput, + OffloadReqOutput, + SyncHFWeightInput, + SyncHFWeightOutput, + SyncWeightInput, + SyncWeightOutput, + TaskMethodInput, + TaskMethodOutput, +) +import torch_npu +logger.setLevel(logging.INFO) + +def safe_load_weights(model, weights: list): + """ + 安全加载权重,自动兼容两种命名约定: + - 'visual.xxx' (Hugging Face Qwen-VL 格式) + - 'model.visual.xxx' (部分 SGLang 或自定义格式) + + Parameters: + model: PyTorch 模型实例(需有 named_parameters()) + weights: List of (name, torch.Tensor) + """ + params_dict = dict(model.named_parameters()) + + # 构建一个映射:标准化 key -> 实际参数名 + # 例如:'visual.patch_embed.proj.weight' 可能对应 params_dict 中的 'visual...' 或 'model.visual...' + normalized_to_actual = {} + for param_name in params_dict.keys(): + if param_name.startswith("model.visual."): + # 映射到无 model. 
的标准名 + normalized = param_name[len("model."):] # "visual.xxx" + normalized_to_actual[normalized] = param_name + normalized_to_actual[param_name] = param_name # 也保留原名 + elif param_name.startswith("visual."): + normalized = param_name + normalized_to_actual[normalized] = param_name + normalized_to_actual["model." + normalized] = param_name # 兼容带 model. 的输入 + else: + # 非 visual 参数,直接映射 + normalized_to_actual[param_name] = param_name + + # 加载每个权重 + for name, loaded_weight in weights: + if name in normalized_to_actual: + actual_name = normalized_to_actual[name] + param = params_dict[actual_name] + assert param.shape == loaded_weight.shape, ( + f"Shape mismatch for {name}: expected {param.shape}, got {loaded_weight.shape}" + ) + param.copy_(loaded_weight) + else: + # 可选:跳过不存在的参数(如优化器状态、非模型参数) + print(f"[Warning] Skipping weight not in model: {name}") + continue + +class Scheduler(_Scheduler, Worker): + """ + Overridden class of SGLang's TP worker class _Scheduler. + A Scheduler is a Task that manages the TP worker, and performs necessary weight synchronization with actor and weight offloading. + """ + + def __init__( + self, + parent_address: WorkerAddress, + placement: ModelParallelComponentPlacement, + config: DictConfig, + world_size: int, + rank: int, + server_args: ServerArgs, + port_args: PortArgs, + gpu_id: int, + tp_rank: int, + moe_ep_rank: int, + pp_rank: int, + dp_rank: Optional[int], + dp_balance_meta: Optional[DPBalanceMeta] = None, + ): + Worker.__init__( + self, parent_address=parent_address, world_size=world_size, rank=rank + ) + + # since 0.4.6.post2, pp_rank is added into Scheduler init's parameters + # but we don't use it in our implementation, so we set it to 0 + _Scheduler.__init__( + self, + server_args, + port_args, + gpu_id, + tp_rank, + moe_ep_rank, + pp_rank, + dp_rank, + dp_balance_meta, + ) + # `TpModelWorkerClient` is used when ServerArgs.enable_overlap=True, and it has 'worker' attribute. 
+ # But in early SGLang version, `TpModelWorker` doesn't have 'worker' attribute. + if not hasattr(self.tp_worker, "worker"): + self.tp_worker.worker = self.tp_worker + + self._request_dispatcher._mapping.extend( + [ + (TaskMethodInput, self.run_task_method), + (OffloadReqInput, self.offload_model_weights), + (SyncWeightInput, self.sync_weight), + (SyncHFWeightInput, self.sync_hf_weight), + ] + ) + self.cfg = config + self.binded_attr = {} + + self._actor_group_name = self.cfg.actor.group_name + self.placement_mode = placement.placement_mode + self.actor_weight_rank = RankMapper.get_rollout_rank_to_actor_rank_map( + placement + )[(self.get_parent_rank(), self._rank)] + # it's important to use load_weight to load resharded weight from megatron + for _, module in self.tp_worker.worker.model_runner.model.named_modules(): + if hasattr(module, "use_presharded_weights"): + module.use_presharded_weights = False + + self._logger.info( + f"Running Scheduler dp rank {self.get_parent_rank()}, tp rank {self.tp_rank}, corresponding actor weight rank = {self.actor_weight_rank}" + ) + + def sync_in_tp(self, fn: str = ""): + broadcast_pyobj( + [], self.tp_rank, self.tp_worker.worker.model_runner.tp_group.cpu_group + ) + # logger.info(f"{fn}: Sync in tp success!") + + def cuda_info(self, text: str = ""): + free_gpu_memory, total_gpu_memory = torch.npu.mem_get_info() + free_gpu_memory /= 2**30 + total_gpu_memory /= 2**30 + + memory_allocated = torch.npu.memory_allocated() / 2**30 + memory_reserved = torch.npu.memory_reserved() / 2**30 + + self._logger.info( + f"[dp {self.get_parent_rank()}-tp {self.tp_rank}] {text} " + f"{memory_allocated=:.2f} GiB, {memory_reserved=:.2f} GiB, " + f"{free_gpu_memory=:.2f} GiB, {total_gpu_memory=:.2f} GiB" + ) + + def offload_model_weights(self, recv_req: OffloadReqInput): + use_cudagraph = not self.cfg.rollout.enforce_eager + colocate = self.placement_mode == PlacementMode.COLLOCATED + if not colocate: + assert use_cudagraph, "If not colocate, 
use_cudagraph must be True now." + + if use_cudagraph or not colocate: + self.release_memory_occupation(ReleaseMemoryOccupationReqInput()) + # self.cuda_info("After offload Model weights and kv cache") + return OffloadReqOutput() + + # manually offload + self.named_buffers = { + n: buf.clone() + for n, buf in self.tp_worker.worker.model_runner.model.named_buffers() + } + + self.binded_attr = { + name: param.__dict__ + for name, param in self.tp_worker.worker.model_runner.model.named_parameters() + } + + # offload parameters + self.tp_worker.worker.model_runner.model.to("meta") + + # offload kv cache + self.tp_worker.worker.model_runner.token_to_kv_pool._clear_buffers() + + self.flush_cache() + self.sync_in_tp("offload_model_weights") + # self.cuda_info("After offload Model weights and kv cache") + return OffloadReqOutput() + + def sync_hf_weight(self, recv_req: SyncHFWeightInput): + use_cudagraph = not self.cfg.rollout.enforce_eager + colocate = self.placement_mode == PlacementMode.COLLOCATED + + assert use_cudagraph, "use_cudagraph must be True now." 
+ + state_dict = self.recv( + src_group_name=self._actor_group_name, + src_rank=self.actor_weight_rank, + ) + + model = self.tp_worker.worker.model_runner.model + + if colocate: + self.resume_memory_occupation(ResumeMemoryOccupationReqInput()) + for name, handle in state_dict.items(): + #func, args = handle + #list_args = list(args) + # NOTE: the key is to change device id to the current device id + # in case two processes have different CUDA_VISIBLE_DEVICES + #list_args[6] = torch.npu.current_device() + #new_weight = func(*list_args) + + #model.load_weights([(name, new_weight)]) + #self.tp_worker.worker.model_runner.update_weights_from_tensor( + # [(name, new_weight)], load_format="direct" + #) + #del new_weight + func, args = handle + import inspect + sig = inspect.signature(func) + param_names = list(sig.parameters.keys()) + + # 将 args 转为 kwargs + kwargs = {} + args = list(args) + for i, param_name in enumerate(param_names): + if i < len(args): + kwargs[param_name] = args[i] + else: + break + + # 修改设备参数(假设参数名是 'map_location' 或 'device') + if 'map_location' in kwargs: + kwargs['map_location'] = f"npu:{torch.npu.current_device()}" + elif 'device' in kwargs: + kwargs['device'] = torch.npu.current_device() + + new_weight = func(**kwargs) + model.load_weights([(name, new_weight)]) + #safe_load_weights(model, [(name, new_weight)]) + del new_weight + #fixed_weights = [] + #for name, weight in [(name, new_weight)]: + # if name.startswith("visual.") and not name.startswith("model.visual."): + # fixed_weights.append(("model." 
+ name, weight)) + # else: + # fixed_weights.append((name, weight)) + #model.load_weights(fixed_weights) + #del new_weight + else: + # disaggregate mode, recv tensor directly + for name, tensor in state_dict.items(): + model.load_weights([(name, tensor)]) + self.flush_cache() + self.sync_in_tp("sync_hf_weight") + return SyncHFWeightOutput() + + def sync_weight(self, recv_req: SyncWeightInput): + use_cudagraph = not self.cfg.rollout.enforce_eager + colocate = self.placement_mode == PlacementMode.COLLOCATED + if not colocate: + assert use_cudagraph, "If not colocate, use_cudagraph must be True now." + + state_dict = self.recv( + src_group_name=self._actor_group_name, + src_rank=self.actor_weight_rank, + ) + model = self.tp_worker.worker.model_runner.model + + if use_cudagraph and colocate: + self.resume_memory_occupation(ResumeMemoryOccupationReqInput()) + + if colocate: + if use_cudagraph: + for name, handle in state_dict.items(): + func, args = handle + list_args = list(args) + # NOTE: the key is to change device id to the current device id + # in case two processes have different CUDA_VISIBLE_DEVICES + list_args[6] = torch.npu.current_device() + new_weight = func(*list_args) + + self.tp_worker.worker.model_runner.update_weights_from_tensor( + [(name, new_weight)], load_format="direct" + ) + del new_weight + + else: + named_params = dict(model.named_parameters()) + for name, handle in state_dict.items(): + rebind_param_attr(model, name, self.binded_attr, materialize=False) + func, args = handle + list_args = list(args) + list_args[6] = torch.npu.current_device() + new_weight = func(*list_args) + vllm_weight = named_params[name] + assert vllm_weight.shape == new_weight.shape, ( + f"{name}: {vllm_weight.shape=}, {new_weight.shape=}" + ) + assert vllm_weight.dtype == new_weight.dtype, ( + f"{name}: {vllm_weight.dtype=}, {new_weight.dtype=}" + ) + + swap_tensor_pointer(vllm_weight, new_weight) + del new_weight + + for name, buffer in self.named_buffers.items(): + 
vllm_buffer = get_module_from_name(model, name) + assert vllm_buffer.shape == buffer.shape + assert vllm_buffer.dtype == buffer.dtype + swap_tensor_pointer(vllm_buffer, buffer) + + self.named_buffers = {} + + self.tp_worker.worker.model_runner.token_to_kv_pool._create_buffers() + else: + # disaggregate mode, recv tensor directly + named_tensors = [(n, p) for n, p in state_dict.items()] + self.tp_worker.worker.model_runner.update_weights_from_tensor( + named_tensors, load_format="direct" + ) + self.sync_in_tp("sync_weight") + + return SyncWeightOutput() + + def run_task_method(self, obj: TaskMethodInput): + """ + Run a CommTask method with the given name and arguments. + NOTE: will call wait() if async_op is True. + """ + result = getattr(self, obj.method_name)(*obj.args, **obj.kwargs) + if "async_op" in obj.kwargs and obj.kwargs["async_op"]: + result = result.wait() + return TaskMethodOutput(method_name=obj.method_name, result=result) + + +def run_scheduler_process( + parent_address: WorkerAddress, + placement: ModelParallelComponentPlacement, + config: DictConfig, + world_size: int, + rank: int, + server_args: ServerArgs, + port_args: PortArgs, + gpu_id: int, + tp_rank: int, + moe_ep_rank: int, + pp_rank: int, + dp_rank: Optional[int], + pipe_writer, + balance_meta: Optional[DPBalanceMeta] = None, +): + # Generate the prefix + prefix = "" + if dp_rank is not None: + prefix += f" DP{dp_rank}" + if server_args.tp_size > 1: + prefix += f" TP{tp_rank}" + if server_args.ep_size > 1: + prefix += f" EP{moe_ep_rank}" + if server_args.pp_size > 1: + prefix += f" PP{pp_rank}" + + # Config the process + setproctitle.setproctitle(f"sglang::scheduler{prefix.replace(' ', '_')}") + faulthandler.enable() + kill_itself_when_parent_died() + parent_process = psutil.Process().parent() + + # [For Router] if env var "SGLANG_DP_RANK" exist, set dp_rank to the value of the env var + if dp_rank is None and "SGLANG_DP_RANK" in os.environ: + dp_rank = int(os.environ["SGLANG_DP_RANK"]) + + # 
Configure the logger + configure_logger(server_args, prefix=prefix) + suppress_other_loggers() + + # Set cpu affinity to this gpu process + if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"): + set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id) + + # Create a scheduler and run the event loop + try: + scheduler = Scheduler( + parent_address, + placement, + config, + world_size, + rank, + server_args, + port_args, + gpu_id, + tp_rank, + moe_ep_rank, + pp_rank, + dp_rank, + dp_balance_meta=balance_meta, + ) + pipe_writer.send( + { + "status": "ready", + "max_total_num_tokens": scheduler.max_total_num_tokens, + "max_req_input_len": scheduler.max_req_input_len, + } + ) + + disaggregation_mode: DisaggregationMode = scheduler.disaggregation_mode + if disaggregation_mode == DisaggregationMode.NULL: + if server_args.pp_size > 1: + scheduler.event_loop_pp() + elif scheduler.enable_overlap: + scheduler.event_loop_overlap() + else: + scheduler.event_loop_normal() + elif disaggregation_mode == DisaggregationMode.PREFILL: + if scheduler.enable_overlap: + scheduler.event_loop_overlap_disagg_prefill() + else: + if server_args.pp_size > 1: + scheduler.event_loop_pp_disagg_prefill() + else: + scheduler.event_loop_normal_disagg_prefill() + + elif disaggregation_mode == DisaggregationMode.DECODE: + if scheduler.enable_overlap: + scheduler.event_loop_overlap_disagg_decode() + else: + scheduler.event_loop_normal_disagg_decode() + + except Exception: + traceback = get_exception_traceback() + logger.error(f"Scheduler hit an exception: {traceback}") + parent_process.send_signal(signal.SIGQUIT) diff --git a/rlinf/hybrid_engines/sglang/sglang_0_5_2/tokenizer_manager.py b/rlinf/hybrid_engines/sglang/sglang_0_5_2/tokenizer_manager.py new file mode 100644 index 000000000..07b77b200 --- /dev/null +++ b/rlinf/hybrid_engines/sglang/sglang_0_5_2/tokenizer_manager.py @@ -0,0 +1,129 @@ +# Copyright 2025 The RLinf Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional + +import fastapi +from sglang.srt.managers.io_struct import AbortReq +from sglang.srt.managers.tokenizer_manager import TokenizerManager as _TokenizerManager +#from sglang.srt.managers.tokenizer_manager import _Communicator +from sglang.srt.managers.tokenizer_communicator_mixin import _Communicator +from sglang.srt.server_args import PortArgs, ServerArgs +from .io_struct import ( + OffloadReqInput, + OffloadReqOutput, + SyncHFWeightInput, + SyncHFWeightOutput, + SyncWeightInput, + SyncWeightOutput, + TaskMethodInput, + TaskMethodOutput, +) + + +# Add two methods and their communicators, input/output structs. 
+class TokenizerManager(_TokenizerManager): + def __init__( + self, + server_args: ServerArgs, + port_args: PortArgs, + ): + super().__init__( + server_args=server_args, + port_args=port_args, + ) + + self.run_task_method_communicator = _Communicator( + self.send_to_scheduler, + fan_out=server_args.dp_size, + ) + self.offload_model_weights_communicator = _Communicator( + self.send_to_scheduler, server_args.dp_size + ) + self.sync_weight_communicator = _Communicator( + self.send_to_scheduler, server_args.dp_size + ) + self.sync_hf_weight_communicator = _Communicator( + self.send_to_scheduler, server_args.dp_size + ) + + self._result_dispatcher._mapping.extend( + [ + ( + TaskMethodOutput, + self.run_task_method_communicator.handle_recv, + ), + ( + OffloadReqOutput, + self.offload_model_weights_communicator.handle_recv, + ), + ( + SyncWeightOutput, + self.sync_weight_communicator.handle_recv, + ), + ( + SyncHFWeightOutput, + self.sync_hf_weight_communicator.handle_recv, + ), + ] + ) + + async def run_task_method( + self, + obj: TaskMethodInput = None, + request: Optional[fastapi.Request] = None, + ): + """ + Run a task method with the given name and arguments. 
+ """ + self.auto_create_handle_loop() + if isinstance(obj, str): + obj = TaskMethodInput(method_name=obj) + res: List[TaskMethodOutput] = await self.run_task_method_communicator(obj) + return res[0].result + + async def offload_model_weights( + self, + obj: OffloadReqInput = None, + request: Optional[fastapi.Request] = None, + ): + self.auto_create_handle_loop() + if obj is None: + obj = OffloadReqInput() + await self.offload_model_weights_communicator(obj) + + async def sync_hf_weight( + self, + obj: SyncHFWeightInput, + request: Optional[fastapi.Request] = None, + ): + self.auto_create_handle_loop() + await self.sync_hf_weight_communicator(obj) + + async def sync_weight( + self, + obj: SyncWeightInput, + request: Optional[fastapi.Request] = None, + ): + self.auto_create_handle_loop() + await self.sync_weight_communicator(obj) + + def abort_request(self, rid: str): + if rid != "" and rid not in self.rid_to_state: + return + req = AbortReq(rid) + self.send_to_scheduler.send_pyobj(req) + + async def pause_generation(self): + self.abort_request("") diff --git a/rlinf/hybrid_engines/vllm/vllm_0_8_5/vllm_engine.py b/rlinf/hybrid_engines/vllm/vllm_0_8_5/vllm_engine.py deleted file mode 100644 index 4bf155a55..000000000 --- a/rlinf/hybrid_engines/vllm/vllm_0_8_5/vllm_engine.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright 2025 The RLinf Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from functools import partial -from typing import List, Optional, Union - -from omegaconf import DictConfig -from vllm.config import VllmConfig -from vllm.inputs.data import TextPrompt, TokensPrompt -from vllm.outputs import RequestOutput -from vllm.pooling_params import PoolingParams -from vllm.sampling_params import SamplingParams -from vllm.utils import Counter -from vllm.v1.engine.llm_engine import LLMEngine as _LLMEngine - -from rlinf.scheduler.manager.worker_manager import WorkerAddress -from rlinf.utils.placement import ModelParallelComponentPlacement - - -class VLLMEngine: - def __init__( - self, - vllm_config: VllmConfig, - log_stats: bool, - dp_rank: int, - rlinf_config: DictConfig, - parent_address: WorkerAddress, - placement: ModelParallelComponentPlacement, - multiprocess_model: bool = False, - ): - # vllm_worker_cls = partial(VLLMWorker, rlinf_config=rlinf_config) - vllm_worker_cls = "rlinf.hybrid_engines.vllm.vllm_0_8_5.worker.VLLMWorker" - vllm_config.parallel_config.worker_cls = vllm_worker_cls - - from rlinf.hybrid_engines.vllm.vllm_0_8_5.executor import VLLMExecutor - - executor_factory = partial( - VLLMExecutor, - rlinf_config=rlinf_config, - parent_address=parent_address, - placement=placement, - dp_rank=dp_rank, - ) - - self._engine = _LLMEngine( - vllm_config=vllm_config, - executor_class=executor_factory, - log_stats=log_stats, - multiprocess_mode=multiprocess_model, - ) - self.request_counter = Counter() - - def generate( - self, - input_ids: Union[List[List[int]], List[int]], - sampling_params: Union[SamplingParams, PoolingParams], - prompt_texts: Optional[Union[List[str], str]] = None, - return_logprobs: bool = False, - ) -> List[RequestOutput]: - """ - Use the VLLM engine to generate text based on input token IDs or prompt text. - - Args: - input_ids: A list of lists of input token IDs, or a single list of input - token IDs. - sampling_params: Sampling parameters for generation. 
- prompt_text: Optional; A list of prompt strings or a single prompt string, - if provided, it will be used instead of input_ids. - return_logprobs: Whether to return log probabilities of the generated tokens. - - Returns: - A list of RequestOutput objects containing the results of the generation. - """ - sampling_params.logprobs = 0 if return_logprobs else None - self._add_requests( - input_ids=input_ids, - prompt_texts=prompt_texts, - sampling_params=sampling_params, - ) - results: List[RequestOutput] = self._run_engine() - return results - - def _add_requests( - self, - input_ids: Union[List[List[int]], List[int]], - sampling_params: Union[SamplingParams, PoolingParams], - prompt_texts: Optional[Union[List[str], str]] = None, - ) -> None: - """ - Add generation requests to the engine. - - Args: - input_ids: A list of lists of input token IDs, or a single list of input token IDs. - prompt_texts: Optional; A list of prompt strings or a single prompt string, if provided, - it will be used instead of input_ids. - sampling_params: Optional; Sampling parameters for generation. 
- """ - if prompt_texts is not None: - # if not None, we use prompt_text rather than input_ids - if isinstance(prompt_texts, str): - prompt_texts = [prompt_texts] - assert isinstance(prompt_texts, list), ( - f"Expected list for prompt_texts, got {type(prompt_texts)}" - ) - for prompt_text in prompt_texts: - request_id = str(next(self.request_counter)) - text_prompt = TextPrompt(prompt=prompt_text) - self._engine.add_request( - request_id=request_id, - prompt=text_prompt, - params=sampling_params, - ) - return - - assert isinstance(input_ids, list), ( - f"Expected list for input_ids, got {type(input_ids)}" - ) - if not isinstance(input_ids[0], list): - input_ids = [input_ids] - - for input_id in input_ids: - request_id = str(next(self.request_counter)) - tokens_prompt = TokensPrompt(prompt_token_ids=input_id) - self._engine.add_request( - request_id=request_id, - prompt=tokens_prompt, - params=sampling_params, - ) - - def _run_engine(self) -> List[RequestOutput]: - """ - Run the engine until all requests are finished. - - Returns: - A list of RequestOutput objects containing the results of the generation. - """ - outputs: List[RequestOutput] = [] - - while self._engine.has_unfinished_requests(): - step_outputs = self._engine.step() - for output in step_outputs: - if output.finished: - outputs.append(output) - return sorted(outputs, key=lambda x: int(x.request_id)) - - def offload_model_weights(self) -> None: - """ - Offload most graphic memory vllm used, including model's weights, buffers and kv cache. - """ - self._engine.collective_rpc("offload_model_weights") - - def sync_hf_weight(self) -> None: - """ - Sync model weights from actor to the vllm workers. 
- """ - self._engine.collective_rpc("sync_hf_weight") diff --git a/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py b/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py index fe82c7d9d..519895e49 100644 --- a/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py +++ b/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py @@ -48,6 +48,9 @@ def __init__( ) # rlinf specific self.rlinf_config = rlinf_config + self.using_sharded_weight = ( + False if self.rlinf_config.actor.training_backend == "fsdp" else True + ) self._rlinf_worker = _RLinfWorker( parent_address=parent_address, world_size=vllm_config.parallel_config.world_size, @@ -82,7 +85,10 @@ def sync_hf_weight(self) -> None: state_dict = self._rlinf_worker.recv( src_group_name=self._actor_group_name, src_rank=self.actor_weight_rank ) - super().wake_up() + if self.placement_mode == PlacementMode.COLLOCATED: + # in disaggregated mode, rollout backend will never offload weights + # so we don't need to wake up when placement is disaggregated + super().wake_up() model = self.model_runner.model if colocate: @@ -100,7 +106,7 @@ def sync_hf_weight(self) -> None: def use_sharded_weights(self) -> None: model = self.model_runner.model for _, param in model.named_parameters(): - setattr(param, "is_sharded_weight", True) + setattr(param, "is_sharded_weight", self.using_sharded_weight) def get_dp_rank(self) -> int: return self._rlinf_worker.get_parent_rank() diff --git a/rlinf/models/__init__.py b/rlinf/models/__init__.py index 08207900a..617ac7467 100644 --- a/rlinf/models/__init__.py +++ b/rlinf/models/__init__.py @@ -17,7 +17,6 @@ import torch from omegaconf import DictConfig -from peft import LoraConfig, PeftModel, get_peft_model from transformers import ( AutoConfig, AutoImageProcessor, @@ -172,6 +171,8 @@ def get_model(model_path, cfg: DictConfig, override_config_kwargs=None): model = model.cuda() if cfg.is_lora: + from peft import LoraConfig, PeftModel, get_peft_model + if not hasattr(cfg, "lora_path") or cfg.lora_path is None: lora_config 
= LoraConfig( r=cfg.lora_rank, diff --git a/rlinf/models/embodiment/model_utils.py b/rlinf/models/embodiment/model_utils.py index 04425cfdc..8e7aebeb1 100644 --- a/rlinf/models/embodiment/model_utils.py +++ b/rlinf/models/embodiment/model_utils.py @@ -15,9 +15,10 @@ from typing import Any, Optional import torch -import torch.nn.functional as F from transformers.generation import TopKLogitsWarper +from rlinf.utils.utils import compute_entropy_from_logits, compute_logprobs_from_logits + def default_logits_processor(logits, action_tokens, vocab_size, n_action_bins): logits = logits.permute(0, 2, 1) # [B, vocab-size, action-dim] @@ -34,28 +35,6 @@ def default_logits_processor(logits, action_tokens, vocab_size, n_action_bins): return ret -def compute_logprobs_from_logits(logits, target): - logprobs = -F.cross_entropy( - logits, target=target, reduction="none" - ) # [B, action-dim] - return logprobs - - -def compute_entropy_from_logits(logits, epsilon=1e-10): - """ - Compute entropy by logits. - - Args: - logits: [B, vocab-size, seq-len] - Returns: - entropy: [B, seq-len] - """ - all_probs = F.softmax(logits, dim=1) # [B, vocab-size, seq-len] - all_log_probs = torch.log(all_probs + epsilon) - entropy = -torch.sum(all_probs * all_log_probs, dim=1) # [B, seq-len] - return entropy - - def custom_forward( model, input_ids, diff --git a/rlinf/runners/coding_online_rl_runner.py b/rlinf/runners/coding_online_rl_runner.py new file mode 100644 index 000000000..377667523 --- /dev/null +++ b/rlinf/runners/coding_online_rl_runner.py @@ -0,0 +1,315 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +from typing import Optional + +import pandas as pd +from omegaconf.dictconfig import DictConfig +from tqdm import tqdm + +from rlinf.scheduler import Channel +from rlinf.scheduler import WorkerGroupFuncResult as Handle +from rlinf.utils.distributed import ScopedTimer +from rlinf.utils.metric_logger import MetricLogger +from rlinf.utils.placement import ModelParallelComponentPlacement +from rlinf.utils.runner_utils import check_progress +from rlinf.utils.timers import Timer +from rlinf.workers.actor.megatron_actor_worker import MegatronActor +from rlinf.workers.inference.megatron_inference_worker import MegatronInference +from rlinf.workers.rollout.server.online_router_worker import OnlineRouterWorker +from rlinf.workers.rollout.server.server_rollout_worker import ServerRolloutWorker +from rlinf.workers.rollout.sglang.sglang_worker import SGLangWorker + +logging.getLogger().setLevel(logging.INFO) + + +class CodingOnlineRLRunner: + """Runner for online coding model training.""" + + def __init__( + self, + cfg: DictConfig, + placement: ModelParallelComponentPlacement, + rollout: SGLangWorker, + inference: Optional[MegatronInference], + actor: MegatronActor, + online_router: OnlineRouterWorker, + server_rollout: ServerRolloutWorker, + ): + """""" + self.cfg = cfg + self.component_placement = placement + self.is_pipeline = self.component_placement.is_disaggregated + self.has_dedicated_inference = inference is not None + + # Workers + self.rollout = rollout + self.actor = actor + self.online_router = online_router + 
self.server_rollout = server_rollout + # Collocated mode uses actor as inference + self.inference = inference if self.has_dedicated_inference else self.actor + + # Data channels + self.dataloader_channel = Channel.create("DataLoader") + # Create a local channel (i.e., a channel that is different in every process) + # if inference is not a dedicated worker + self.inference_channel = Channel.create( + "Inference", local=not self.has_dedicated_inference + ) + self.actor_channel = Channel.create("Actor", local=True) + + # Configurations + self.compute_ref_logprobs = self.cfg.algorithm.kl_beta > 0 + self.recompute_logprobs = self.cfg.algorithm.recompute_logprobs + assert self.recompute_logprobs, "online rl must recompute logprobs" + self.consumed_samples = 0 + self.global_steps = 0 + + # Build dataloader and compute `max_steps` + self.max_steps = self.cfg.runner.get("max_steps", -self.global_steps) + + # Wandb table + self.train_df = pd.DataFrame(columns=["step", "prompt", "response", "reward"]) + self.val_df = pd.DataFrame(columns=["step", "prompt", "response", "reward"]) + + # Timers + self.timer = ScopedTimer(reduction="max", sync_cuda=False) + self.run_timer = Timer(None) # Timer that checks if we should stop training + + self.metric_logger = MetricLogger(cfg) + + def init_workers(self): + # Must be done before actor init + if self.cfg.runner.resume_dir is None: + logging.info("Training from scratch") + if ( + self.cfg.actor.training_backend == "megatron" + and self.cfg.actor.megatron.use_hf_ckpt + ): + from toolkits.ckpt_convertor.convert_hf_to_mg import convert_hf_to_mg + + convert_hf_to_mg( + self.cfg.actor.megatron.ckpt_convertor.hf_model_path, + self.cfg.actor.megatron.ckpt_convertor, + ) + + # Init workers + self.rollout.init_worker().wait() + self.actor.init_worker().wait() + self.online_router.init_worker(self.rollout).wait() + self.server_rollout.init_worker().wait() + if self.has_dedicated_inference: + self.inference.init_worker().wait() + + if 
self.cfg.runner.resume_dir is None: + return + + # Checkpoint loading + logging.info(f"Load from checkpoint folder: {self.cfg.runner.resume_dir}") + # set global step + self.global_steps = int(self.cfg.runner.resume_dir.split("global_step_")[-1]) + logging.info(f"Setting global step to {self.global_steps}") + print(f"Setting global step to {self.global_steps}") + + actor_checkpoint_path = os.path.join(self.cfg.runner.resume_dir, "actor") + self.actor.load_checkpoint(actor_checkpoint_path).wait() + + def _compute_flops_metrics(self, time_metrics, act_rollout_metrics) -> dict: + rollout_time = time_metrics.get("rollout") + inference_time = time_metrics.get("inference", -1) + training_time = time_metrics.get("training") + + num_gpus_actor = self.component_placement.actor_world_size + num_gpus_rollout = self.component_placement.rollout_world_size + + rollout_tflops = act_rollout_metrics["rollout_tflops"] + inference_tflops = act_rollout_metrics["inference_tflops"] + training_tflops = act_rollout_metrics["training_tflops"] + + flops_metrics = { + "rollout_tflops_per_gpu": 0.0, + "inference_tflops_per_gpu": 0.0, + "training_tflops_per_gpu": 0.0, + } + if rollout_time > 0 and rollout_tflops > 0: + flops_metrics["rollout_tflops_per_gpu"] = ( + rollout_tflops / rollout_time / num_gpus_rollout + ) + + if inference_time > 0 and inference_tflops > 0: + num_gpus_inference = self.component_placement.inference_world_size + if num_gpus_inference == 0: + num_gpus_inference = self.component_placement.actor_world_size + flops_metrics["inference_tflops_per_gpu"] = ( + inference_tflops / inference_time / num_gpus_inference + ) + + if training_time > 0 and training_tflops > 0: + flops_metrics["training_tflops_per_gpu"] = ( + training_tflops / training_time / num_gpus_actor + ) + + return flops_metrics + + def _save_checkpoint(self): + base_output_dir = os.path.join( + self.cfg.runner.output_dir, + self.cfg.runner.experiment_name, + f"checkpoints/global_step_{self.global_steps}", + ) + 
actor_save_path = os.path.join(base_output_dir, "actor") + + # actor + self.actor.save_checkpoint(actor_save_path, self.global_steps).wait() + + def _sync_weights(self): + self.online_router.sync_model_start() + self.actor.sync_model_to_rollout() + self.rollout.sync_model_from_actor().wait() + self.actor.del_reshard_state_dict().wait() + + if self.has_dedicated_inference: + self.actor.sync_model_to_inference() + self.inference.sync_model_from_actor().wait() + self.online_router.sync_model_end() + + def run(self): + global_pbar = tqdm( + initial=0, + total=self.cfg.runner.max_epochs, + desc="Global Step", + ncols=620, + ) + + self.online_router.server_start() + self.server_rollout.server_start() + self.run_timer.start_time() + for _ in range(self.cfg.runner.max_epochs): + with self.timer("step"): + with self.timer("sync_weights"): + self._sync_weights() + + rollout_handle: Handle = self.server_rollout.rollout( + output_channel=self.dataloader_channel, + ) + + if self.recompute_logprobs: + # Inference prev/ref logprobs + infer_handle: Handle = self.inference.run_inference( + input_channel=self.dataloader_channel, + output_channel=self.inference_channel, + rollout_channel=None, + compute_ref_logprobs=self.compute_ref_logprobs, + ) + inference_channel = self.inference_channel + else: + infer_handle = None + inference_channel = self.dataloader_channel + + # Advantages and returns + adv_handle: Handle = self.actor.compute_advantages_and_returns( + input_channel=inference_channel, + output_channel=self.actor_channel, + ) + + # Actor training + actor_input_channel = self.actor_channel + actor_handle: Handle = self.actor.run_training( + input_channel=actor_input_channel, + ) + + metrics = actor_handle.wait() + actor_rollout_metrics = metrics[0][0] + actor_training_metrics = metrics[0][1] + self.global_steps += 1 + + run_time_exceeded = self.run_timer.is_finished() + _, save_model, is_train_end = check_progress( + self.global_steps, + self.max_steps, + 
self.cfg.runner.val_check_interval, + self.cfg.runner.save_interval, + 1.0, + run_time_exceeded=run_time_exceeded, + ) + + if save_model: + self._save_checkpoint() + + if is_train_end: + logging.info( + f"Step limit given by max_steps={self.max_steps} reached. Stopping run" + ) + return + + if run_time_exceeded: + logging.info( + f"Time limit given by run_timer={self.run_timer} reached. Stopping run" + ) + return + + # To ensure the router server is paused (old requests are finished and new requests are paused). + # So it's safe to do weight sync on sglang. + rollout_handle.wait() + + time_metrics = self.timer.consume_durations() + time_metrics["training"] = actor_handle.consume_duration() + time_metrics["advantage"] = adv_handle.consume_duration() + if infer_handle is not None: + # Inference time should be the min time across ranks, because different DP receive the rollout results differently + # But at the beginning of the pp schedule, there is a timer barrier + # This makes all DP end at the same time, while they start at different times, and thus only the min time is correct + time_metrics["inference"] = infer_handle.consume_duration( + reduction_type="min" + ) + + logging_steps = (self.global_steps - 1) * self.cfg.algorithm.n_minibatches + # add prefix to the metrics + log_time_metrics = {f"time/{k}": v for k, v in time_metrics.items()} + rollout_metrics = { + f"rollout/{k}": v for k, v in actor_rollout_metrics.items() + } + + self.metric_logger.log(log_time_metrics, logging_steps) + self.metric_logger.log(rollout_metrics, logging_steps) + for i in range(self.cfg.algorithm.n_minibatches): + training_metrics = { + f"train/{k}": v for k, v in actor_training_metrics[i].items() + } + self.metric_logger.log(training_metrics, logging_steps + i) + + logging_metrics = time_metrics + + if self.cfg.actor.get("calculate_flops", False): + flops_metrics = self._compute_flops_metrics( + time_metrics, actor_rollout_metrics + ) + flops_metrics = {f"flops/{k}": v for k, v in
flops_metrics.items()} + self.metric_logger.log(flops_metrics, logging_steps) + logging_metrics.update(flops_metrics) + + logging_metrics.update(actor_rollout_metrics) + logging_metrics.update(actor_training_metrics[-1]) + + global_pbar.set_postfix(logging_metrics) + global_pbar.update(1) + + self.server_rollout.shutdown() + self.online_router.server_stop() + self.server_rollout.server_stop() + # No need to wait for rollout_handle since rollout service runs continuously + self.metric_logger.finish() diff --git a/rlinf/runners/math_runner.py b/rlinf/runners/reasoning_runner.py similarity index 89% rename from rlinf/runners/math_runner.py rename to rlinf/runners/reasoning_runner.py index 9dadbb78b..b53010e18 100644 --- a/rlinf/runners/math_runner.py +++ b/rlinf/runners/reasoning_runner.py @@ -25,7 +25,7 @@ from tqdm import tqdm from rlinf.data.io_struct import RolloutRequest -from rlinf.scheduler import Channel, Worker +from rlinf.scheduler import Channel from rlinf.scheduler import WorkerGroupFuncResult as Handle from rlinf.utils.data_iter_utils import split_list from rlinf.utils.distributed import ScopedTimer @@ -35,6 +35,7 @@ from rlinf.utils.timers import Timer from rlinf.workers.actor.megatron_actor_worker import MegatronActor from rlinf.workers.inference.megatron_inference_worker import MegatronInference +from rlinf.workers.reward.reward_worker import RewardWorker if typing.TYPE_CHECKING: from rlinf.workers.rollout.sglang.sglang_worker import SGLangWorker @@ -43,8 +44,8 @@ logging.getLogger().setLevel(logging.INFO) -class MathRunner: - """Runner for math model training.""" +class ReasoningRunner: + """Runner for reasoning task RL training.""" def __init__( self, @@ -55,33 +56,28 @@ def __init__( rollout: Union["SGLangWorker", "VLLMWorker"], inference: Optional[MegatronInference], actor: MegatronActor, - reward: Optional[Worker] = None, + reward: RewardWorker, ): """""" self.cfg = cfg self.component_placement = placement self.is_pipeline = 
self.component_placement.is_disaggregated self.has_dedicated_inference = inference is not None - self.has_dedicated_reward = reward is not None # Workers self.rollout = rollout self.actor = actor # Collocated mode uses actor as inference self.inference = inference if self.has_dedicated_inference else self.actor - self.reward = reward if self.has_dedicated_reward else self.actor + self.reward = reward # Data channels self.dataloader_channel = Channel.create("DataLoader") self.rollout_channel = Channel.create("Rollout") # Create a local channel (i.e., a channel that is different in every process) # if inference is not a dedicated worker - self.inference_channel = Channel.create( - "Inference", local=not self.has_dedicated_inference - ) - self.reward_channel = Channel.create( - "Reward", local=not self.has_dedicated_reward - ) + self.inference_channel = Channel.create("Inference") + self.reward_channel = Channel.create("Reward") self.actor_channel = Channel.create("Actor", local=True) # Configurations @@ -179,8 +175,7 @@ def init_workers(self): self.actor.init_worker().wait() if self.has_dedicated_inference: self.inference.init_worker().wait() - if self.has_dedicated_reward: - self.reward.init_worker().wait() + self.reward.init_worker().wait() if self.cfg.runner.resume_dir is None: return @@ -274,18 +269,26 @@ def epoch(self): def _put_batch(self, batch: Dict[str, torch.Tensor]): prompt_ids = batch["prompt"].tolist() lengths = batch["length"].tolist() - answers = batch["answer"].tolist() - prompts = [ids[-pmp_len:] for ids, pmp_len in zip(prompt_ids, lengths)] + answers = batch["answer"] + image_data = batch["image_data"] + multi_modal_inputs = batch["multi_modal_inputs"] + prompt_ids = [ids[-pmp_len:] for ids, pmp_len in zip(prompt_ids, lengths)] rollout_dp_size = self.component_placement.rollout_dp_size - for input_ids, answers in zip( - split_list(prompts, rollout_dp_size, enforce_divisible_batch=False), + for input_ids, answers, image_data, multi_modal_inputs in 
zip( + split_list(prompt_ids, rollout_dp_size, enforce_divisible_batch=False), split_list(answers, rollout_dp_size, enforce_divisible_batch=False), + split_list(image_data, rollout_dp_size, enforce_divisible_batch=False), + split_list( + multi_modal_inputs, rollout_dp_size, enforce_divisible_batch=False + ), ): request = RolloutRequest( n=self.cfg.algorithm.group_size, input_ids=input_ids, answers=answers, + image_data=image_data, + multi_modal_inputs=multi_modal_inputs, ) self.dataloader_channel.put(request, async_op=True) @@ -327,41 +330,39 @@ def run(self): output_channel=self.rollout_channel, ) + # Rewards + reward_handle: Handle = self.reward.compute_rewards( + input_channel=self.rollout_channel, + output_channel=self.reward_channel, + ) + if self.recompute_logprobs: # Inference prev/ref logprobs infer_handle: Handle = self.inference.run_inference( - input_channel=self.rollout_channel, + input_channel=self.reward_channel, output_channel=self.inference_channel, + rollout_channel=self.rollout_channel, compute_ref_logprobs=self.compute_ref_logprobs, ) inference_channel = self.inference_channel else: infer_handle = None - inference_channel = self.rollout_channel - - # Rewards - reward_handle: Handle = self.reward.compute_rewards( - input_channel=inference_channel, - output_channel=self.reward_channel, - ) + inference_channel = self.reward_channel # Advantages and returns adv_handle: Handle = self.actor.compute_advantages_and_returns( - input_channel=self.reward_channel, + input_channel=inference_channel, output_channel=self.actor_channel, ) # Actor training - actor_input_channel = self.actor_channel - if self.is_pipeline: - # In pipeline mode, the rollout already contains the advantages and returns - # So the above two steps are in fact no-ops, and we should directly use the inference channel as the input - actor_input_channel = inference_channel actor_handle: Handle = self.actor.run_training( - input_channel=actor_input_channel, + 
input_channel=self.actor_channel, ) metrics = actor_handle.wait() + actor_rollout_metrics = metrics[0][0] + actor_training_metrics = metrics[0][1] self.global_steps += 1 run_time_exceeded = self.run_timer.is_finished() @@ -407,28 +408,30 @@ def run(self): ) * self.cfg.algorithm.n_minibatches # add prefix to the metrics log_time_metrics = {f"time/{k}": v for k, v in time_metrics.items()} - rollout_metrics = {f"rollout/{k}": v for k, v in metrics[0][0].items()} + rollout_metrics = { + f"rollout/{k}": v for k, v in actor_rollout_metrics.items() + } self.metric_logger.log(log_time_metrics, logging_steps) self.metric_logger.log(rollout_metrics, logging_steps) for i in range(self.cfg.algorithm.n_minibatches): training_metrics = { - f"train/{k}": v for k, v in metrics[0][1][i].items() + f"train/{k}": v for k, v in actor_training_metrics[i].items() } self.metric_logger.log(training_metrics, logging_steps + i) - logging_metrics = time_metrics + logging_metrics = {f"{k}_time": v for k, v in time_metrics.items()} if self.cfg.actor.get("calculate_flops", False): flops_metrics = self._compute_flops_metrics( - time_metrics, metrics[0][0] + time_metrics, actor_rollout_metrics ) flops_metrics = {f"flops/{k}": v for k, v in flops_metrics.items()} self.metric_logger.log(flops_metrics, logging_steps) logging_metrics.update(flops_metrics) - logging_metrics.update(metrics[0][0]) - logging_metrics.update(metrics[0][1][-1]) + logging_metrics.update(actor_rollout_metrics) + logging_metrics.update(actor_training_metrics[-1]) global_pbar.set_postfix(logging_metrics) global_pbar.update(1) diff --git a/rlinf/scheduler/cluster.py b/rlinf/scheduler/cluster.py index 506d09455..8882d082a 100644 --- a/rlinf/scheduler/cluster.py +++ b/rlinf/scheduler/cluster.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import logging import os import signal import sys import time +import warnings from dataclasses import dataclass from importlib.metadata import version from typing import TYPE_CHECKING, Dict, List, Optional, Type @@ -106,6 +108,22 @@ def __init__(self, num_nodes: Optional[int] = None): self._has_initialized = True def _init_and_launch_managers(self, num_nodes: int): + assert num_nodes > 0, "num_nodes must be greater than 0." + + # Add logger + self._logger = logging.getLogger(Cluster.SYS_NAME) + self._logger.setLevel(Cluster.LOGGING_LEVEL) + self._logger.propagate = False + for handler in self._logger.handlers: + self._logger.removeHandler(handler) + handler = logging.StreamHandler() + formatter = logging.Formatter( + fmt="[%(levelname)s %(asctime)s %(name)s] %(message)s", + datefmt="%H:%M:%S", + ) + handler.setFormatter(formatter) + self._logger.addHandler(handler) + self._num_nodes = num_nodes self._set_default_env_vars() @@ -143,9 +161,8 @@ def _init_and_launch_managers(self, num_nodes: int): # Wait for the cluster to be ready while len(ray.nodes()) < self._num_nodes: - print( - f"Waiting for {self._num_nodes} nodes to be ready, currently {len(ray.nodes())} nodes available.", - flush=True, + self._logger.warning( + f"Waiting for {self._num_nodes} nodes to be ready, currently {len(ray.nodes())} nodes available." ) time.sleep(1) @@ -177,6 +194,17 @@ def _init_and_launch_managers(self, num_nodes: int): node for nodes in nodes_group_by_accel_type.values() for node in nodes ] + # Handle num_nodes configuration mismatch with actual node number + if len(self._nodes) > self._num_nodes: + warnings.warn( + f"The cluster is initialized with {self._num_nodes} nodes, but detected {len(self._nodes)} nodes have joined the ray cluster. So only the first {self._num_nodes} nodes are used." 
+ ) + self._nodes = self._nodes[: self._num_nodes] + + self._logger.info( + f"{Cluster.SYS_NAME} is running on a cluster with {len(self._nodes)} node{'s' if len(self._nodes) > 1 else ''} and {self.num_accelerators_in_cluster} accelerator{'s' if self.num_accelerators_in_cluster > 1 else ''}. The nodes' details are: {self._nodes}" + ) + # Launch managers from .manager import ( CollectiveManager, diff --git a/rlinf/scheduler/worker/worker.py b/rlinf/scheduler/worker/worker.py index 7e2c73191..98d485635 100644 --- a/rlinf/scheduler/worker/worker.py +++ b/rlinf/scheduler/worker/worker.py @@ -313,7 +313,9 @@ class Worker(metaclass=WorkerMeta): PID = None current_worker = None + logging.basicConfig() logger = logging.getLogger(Cluster.SYS_NAME) + logger.setLevel(logging.INFO) torch_platform = torch.cuda torch_device_type = "cuda" diff --git a/rlinf/scheduler/worker/worker_group.py b/rlinf/scheduler/worker/worker_group.py index 8dd483870..02e0cb4bb 100644 --- a/rlinf/scheduler/worker/worker_group.py +++ b/rlinf/scheduler/worker/worker_group.py @@ -154,6 +154,9 @@ def _create_workers(self): placements = self._placement_strategy.get_placement( self._cluster, self._isolate_gpu ) + master_addr = next( + self._cluster.get_node_ip(p.node_id) for p in placements if p.rank == 0 + ) self._world_size = len(placements) for placement in placements: worker_name = WorkerAddress.from_parent_name_rank( @@ -165,6 +168,7 @@ def _create_workers(self): env_vars = { "GROUP_NAME": self._worker_group_name, "WORKER_NAME": worker_name, + "MASTER_ADDR": master_addr, "WORLD_SIZE": str(self._world_size), "RANK": str(placement.rank), "NODE_RANK": str(placement.node_rank), diff --git a/rlinf/utils/convertor/__init__.py b/rlinf/utils/convertor/__init__.py new file mode 100644 index 000000000..5b365ea1e --- /dev/null +++ b/rlinf/utils/convertor/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 The RLinf Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/rlinf/utils/convertor/utils.py b/rlinf/utils/convertor/utils.py new file mode 100644 index 000000000..761218650 --- /dev/null +++ b/rlinf/utils/convertor/utils.py @@ -0,0 +1,467 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import re +from dataclasses import dataclass +from enum import Enum +from typing import Callable, Dict, List, Optional, Tuple + +import torch + + +class TransformType(Enum): + SPLIT_QKV = "split_qkv" + SPLIT_QKV_BIAS = "split_qkv_bias" + SPLIT_FC1 = "split_fc1" + SPLIT_NONE = "split_none" + + +class TransformFunc: + @staticmethod + def _split_gqa_tensor( + tensor: torch.Tensor, new_statedict: dict, weight_names: List[str], config + ) -> None: + hidden_size = config.model_config.hidden_size + num_attention_heads = config.model_config.num_attention_heads + num_query_groups = config.model_config.num_query_groups or num_attention_heads + head_dim = hidden_size // num_attention_heads + + target_tp = config.reshard_tp_size + assert num_query_groups % target_tp == 0, ( + "num_query_groups must be divisible by reshard_tp_size" + ) + local_num_query_groups = num_query_groups // target_tp + + # heads per query group + assert num_attention_heads % num_query_groups == 0, ( + "num_attention_heads must be divisible by num_query_groups" + ) + q_heads_per_group = num_attention_heads // num_query_groups + + num_channel_qkv = q_heads_per_group + 2 + + if tensor.ndim == 2: + # Weight: [out_features, in_features] + out_features, in_features = tensor.shape + expected_out = local_num_query_groups * num_channel_qkv * head_dim + assert out_features == expected_out, ( + f"Unexpected fused QKV weight shape {tensor.shape}, expect " + f"[{expected_out}, {in_features}] (local groups={local_num_query_groups})" + ) + + qkv = tensor.view( + local_num_query_groups, num_channel_qkv, head_dim, in_features + ) + q, k, v = torch.split( + qkv, [q_heads_per_group, 1, 1], dim=1 + ) # shapes: [G, qh, D, In], [G,1,D,In], [G,1,D,In] + q_full = q.reshape(-1, in_features).contiguous() + k_full = k.reshape(-1, in_features).contiguous() + v_full = v.reshape(-1, in_features).contiguous() + else: + # Bias: [out_features] + out_features = tensor.shape[0] + expected_out = local_num_query_groups * num_channel_qkv * 
head_dim + assert out_features == expected_out, ( + f"Unexpected fused QKV bias shape {tensor.shape}, expect " + f"[{expected_out}] (local groups={local_num_query_groups})" + ) + + qkv = tensor.view(local_num_query_groups, num_channel_qkv, head_dim) + q, k, v = torch.split(qkv, [q_heads_per_group, 1, 1], dim=1) + q_full = q.reshape(-1).contiguous() + k_full = k.reshape(-1).contiguous() + v_full = v.reshape(-1).contiguous() + + # Save to target names + new_statedict[weight_names[0]] = q_full.clone() + new_statedict[weight_names[1]] = k_full.clone() + new_statedict[weight_names[2]] = v_full.clone() + + @staticmethod + def split_fc1( + linear_fc1: torch.Tensor, new_statedict: dict, weight_names: List[str], config + ) -> None: + assert weight_names is not None and len(weight_names) == 2, ( + f"split_fc1 transform expects two weight names, got {weight_names}" + ) + + tp_size = config.model_config.tensor_model_parallel_size + target_tp = config.reshard_tp_size + split_size = linear_fc1.shape[0] // (tp_size // target_tp) + linear_fc1_slice = torch.split(linear_fc1, split_size, dim=0) + + gate_proj_shards = [] + up_proj_shards = [] + for weight in linear_fc1_slice: + assert weight.shape[0] % 2 == 0, ( + f"linear_fc1 weight shape {weight.shape} is not even along dim 0" + ) + weight_chunk = torch.chunk(weight, 2, dim=0) + gate_proj_shards.append(weight_chunk[0]) + up_proj_shards.append(weight_chunk[1]) + gate_proj = torch.cat(gate_proj_shards, dim=0) + up_proj = torch.cat(up_proj_shards, dim=0) + + new_statedict[weight_names[0]] = gate_proj.clone() + new_statedict[weight_names[1]] = up_proj.clone() + + @staticmethod + def split_none( + tensor: torch.Tensor, new_statedict: dict, weight_names: List[str] + ) -> None: + assert weight_names is not None and len(weight_names) == 1, ( + f"split_none transform expects one weight name, got {weight_names}" + ) + new_statedict[weight_names[0]] = tensor.clone() + + +@dataclass +class ConvertorRule: + pattern: re.Pattern + transform: 
TransformType + targets: List[str] + post: Optional[Callable] = None + + +class BaseConvertor: + def __init__(self, config, strict: bool = False): + self.cfg = config + self.strict = strict + self.rules = self.build_rules() + + def map_name(self, name: str) -> Optional[Tuple[TransformType, List[str]]]: + def _get_targets_from_match(templates: list[str], m: re.Match) -> list[str]: + gd = m.groupdict() + out = [] + for t in templates: + if "{" in t and "}" in t: + out.append(t.format(**gd)) + else: + out.append(m.expand(t)) + return out + + for r in self.rules: + m = r.pattern.fullmatch(name) + if not m: + continue + targets = r.targets + if r.post: + targets = r.post(targets, m) + full_names = _get_targets_from_match(targets, m) + return r.transform, full_names + return None + + def convert(self, state_dict: Dict) -> Dict: + converted = {} + for k, v in state_dict.items(): + mapped = self.map_name(k) + if mapped is None: + if self.strict: + raise KeyError(f"Unmapped key {k}") + continue + transform, targets = mapped + if transform in (TransformType.SPLIT_QKV, TransformType.SPLIT_QKV_BIAS): + TransformFunc._split_gqa_tensor(v, converted, targets, self.cfg) + elif transform == TransformType.SPLIT_FC1: + TransformFunc.split_fc1(v, converted, targets, self.cfg) + elif transform == TransformType.SPLIT_NONE: + TransformFunc.split_none(v, converted, targets) + else: + raise ValueError(f"Unknown transform type {transform}") + return converted + + def build_rules(self) -> List[ConvertorRule]: + """ + Should be implemented in subclass to build the conversion rules. 
+ """ + raise NotImplementedError + + +class Qwen2_5Convertor(BaseConvertor): + def build_rules(self) -> List[ConvertorRule]: + LID = r"(?P\d+)" + WB = r"(?Pweight|bias)" + + return [ + # embeddings + ConvertorRule( + re.compile(r"embedding\.word_embeddings\.weight$"), + TransformType.SPLIT_NONE, + [r"model.embed_tokens.weight"], + ), + # final_layernorm + ConvertorRule( + re.compile(r"decoder\.final_layernorm\.weight$"), + TransformType.SPLIT_NONE, + [r"model.norm.weight"], + ), + # lm_head + ConvertorRule( + re.compile(r"output_layer\.weight$"), + TransformType.SPLIT_NONE, + [r"lm_head.weight"], + ), + # attn qkv norm + ConvertorRule( + re.compile( + rf"decoder\.layers\.{LID}\.self_attention\.linear_qkv\.layer_norm_weight$" + ), + TransformType.SPLIT_NONE, + [r"model.layers.\g.input_layernorm.weight"], + ), + # attn qkv weights/bias + ConvertorRule( + re.compile( + rf"decoder\.layers\.{LID}\.self_attention\.linear_qkv\.{WB}$" + ), + TransformType.SPLIT_QKV, + [ + r"model.layers.\g.self_attn.q_proj.\g", + r"model.layers.\g.self_attn.k_proj.\g", + r"model.layers.\g.self_attn.v_proj.\g", + ], + ), + # attn o proj + ConvertorRule( + re.compile( + rf"decoder\.layers\.{LID}\.self_attention\.linear_proj\.{WB}$" + ), + TransformType.SPLIT_NONE, + [r"model.layers.\g.self_attn.o_proj.\g"], + ), + # mlp fc1 + ConvertorRule( + re.compile(rf"decoder\.layers\.{LID}\.mlp\.linear_fc1\.{WB}$"), + TransformType.SPLIT_FC1, + [ + r"model.layers.\g.mlp.gate_proj.\g", + r"model.layers.\g.mlp.up_proj.\g", + ], + ), + # mlp fc2 + ConvertorRule( + re.compile(rf"decoder\.layers\.{LID}\.mlp\.linear_fc2\.{WB}$"), + TransformType.SPLIT_NONE, + [r"model.layers.\g.mlp.down_proj.\g"], + ), + # mlp norms + ConvertorRule( + re.compile( + rf"decoder\.layers\.{LID}\.mlp\.linear_fc1\.layer_norm_weight$" + ), + TransformType.SPLIT_NONE, + [r"model.layers.\g.post_attention_layernorm.weight"], + ), + ] + + +class Qwen2_5VLConvertor(BaseConvertor): + def _build_vision_rules(self) -> List[ConvertorRule]: 
+ B = r"(?P\d+)" + WB = r"(?Pweight|bias)" + HF_V_PREFIX = "model.visual" + HF_V_DECODER_PREFIX = f"{HF_V_PREFIX}.blocks" + MG_V_PREFIX = "vision_model" + MG_V_DECODER_PREFIX = rf"{MG_V_PREFIX}\.decoder\.layers" + + vision_rules = [ + # vision patch embed + ConvertorRule( + re.compile(rf"^{MG_V_PREFIX}\.patch_embed\.proj\.weight$"), + TransformType.SPLIT_NONE, + [f"{HF_V_PREFIX}.patch_embed.proj.weight"], + ), + # final layer norm + ConvertorRule( + re.compile(rf"^{MG_V_PREFIX}\.decoder\.final_layernorm\.weight$"), + TransformType.SPLIT_NONE, + [f"{HF_V_PREFIX}.merger.ln_q.weight"], + ), + # attn norm + ConvertorRule( + re.compile( + rf"^{MG_V_DECODER_PREFIX}\.{B}\.self_attention\.layer_norm_weight$" + ), + TransformType.SPLIT_NONE, + [f"{HF_V_DECODER_PREFIX}" + r".\g.norm1.weight"], + ), + # attn qkv + ConvertorRule( + re.compile( + rf"^{MG_V_DECODER_PREFIX}\.{B}\.self_attention\.linear_qkv\.{WB}$" + ), + TransformType.SPLIT_NONE, + [f"{HF_V_DECODER_PREFIX}" + r".\g.attn.qkv.\g"], + ), + # attn proj + ConvertorRule( + re.compile( + rf"^{MG_V_DECODER_PREFIX}\.{B}\.self_attention\.linear_proj\.{WB}$" + ), + TransformType.SPLIT_NONE, + [f"{HF_V_DECODER_PREFIX}" + r".\g.attn.proj.\g"], + ), + # mlp fc1 + ConvertorRule( + re.compile(rf"^{MG_V_DECODER_PREFIX}\.{B}\.mlp\.linear_fc1\.{WB}$"), + TransformType.SPLIT_FC1, + [ + f"{HF_V_DECODER_PREFIX}" + r".\g.mlp.gate_proj.\g", + f"{HF_V_DECODER_PREFIX}" + r".\g.mlp.up_proj.\g", + ], + ), + # mlp fc2 + ConvertorRule( + re.compile(rf"^{MG_V_DECODER_PREFIX}\.{B}\.mlp\.linear_fc2\.{WB}$"), + TransformType.SPLIT_NONE, + [f"{HF_V_DECODER_PREFIX}" + r".\g.mlp.down_proj.\g"], + ), + # mlp norm + ConvertorRule( + re.compile( + rf"^{MG_V_DECODER_PREFIX}\.{B}\.mlp\.linear_fc1\.layer_norm_weight$" + ), + TransformType.SPLIT_NONE, + [f"{HF_V_DECODER_PREFIX}" + r".\g.norm2.weight"], + ), + ] + return vision_rules + + def _build_llm_rules(self) -> List[ConvertorRule]: + B = r"(?P\d+)" + WB = r"(?Pweight|bias)" + HF_LLM_PREFIX = 
"model.language_model" + MG_LLM_PREFIX = "language_model" + MG_LLM_DECODER_PREFIX = rf"{MG_LLM_PREFIX}\.decoder\.layers" + + llm_rules = [ + # embeddings + ConvertorRule( + re.compile(rf"^{MG_LLM_PREFIX}\.embed_tokens\.weight$"), + TransformType.SPLIT_NONE, + [f"{HF_LLM_PREFIX}.embedding.weight"], + ), + # final_layernorm + ConvertorRule( + re.compile(rf"^{MG_LLM_PREFIX}\.final_layernorm\.weight$"), + TransformType.SPLIT_NONE, + [f"{HF_LLM_PREFIX}.norm.weight"], + ), + # attn norm + ConvertorRule( + re.compile( + rf"^{MG_LLM_DECODER_PREFIX}\.{B}\.self_attention\.layer_norm_weight$" + ), + TransformType.SPLIT_NONE, + [f"{HF_LLM_PREFIX}" + r".decoder.layers.\g.input_layernorm.weight"], + ), + # attn qkv + ConvertorRule( + re.compile( + rf"^{MG_LLM_DECODER_PREFIX}\.{B}\.self_attention\.linear_qkv\.{WB}$" + ), + TransformType.SPLIT_QKV, + [ + f"{HF_LLM_PREFIX}" + + r".decoder.layers.\g.self_attn.q_proj.\g", + f"{HF_LLM_PREFIX}" + + r".decoder.layers.\g.self_attn.k_proj.\g", + f"{HF_LLM_PREFIX}" + + r".decoder.layers.\g.self_attn.v_proj.\g", + ], + ), + # attn proj + ConvertorRule( + re.compile( + rf"^{MG_LLM_DECODER_PREFIX}\.{B}\.self_attention\.linear_proj\.{WB}$" + ), + TransformType.SPLIT_NONE, + [f"{HF_LLM_PREFIX}" + r".decoder.layers.\g.self_attn.o_proj.\g"], + ), + # mlp fc1 + ConvertorRule( + re.compile(rf"^{MG_LLM_DECODER_PREFIX}\.{B}\.mlp\.linear_fc1\.{WB}$"), + TransformType.SPLIT_FC1, + [ + f"{HF_LLM_PREFIX}" + r".decoder.layers.\g.mlp.gate_proj.\g", + f"{HF_LLM_PREFIX}" + r".decoder.layers.\g.mlp.up_proj.\g", + ], + ), + # mlp fc2 + ConvertorRule( + re.compile(rf"^{MG_LLM_DECODER_PREFIX}\.{B}\.mlp\.linear_fc2\.{WB}$"), + TransformType.SPLIT_NONE, + [f"{HF_LLM_PREFIX}" + r".decoder.layers.\g.mlp.down_proj.\g"], + ), + # mlp norm + ConvertorRule( + re.compile( + rf"^{MG_LLM_DECODER_PREFIX}\.{B}\.mlp\.linear_fc1\.layer_norm_weight$" + ), + TransformType.SPLIT_NONE, + [ + f"{HF_LLM_PREFIX}" + + r".decoder.layers.\g.post_attention_layernorm.weight" + ], + ), + ] 
+ return llm_rules + + def _build_projector_rules(self) -> List[ConvertorRule]: + HF_PROJECTOR_PREFIX = "model.visual.merger" + MG_PROJECTOR_PREFIX = "vision_model.protection.encoder" + WB = r"(?Pweight|bias)" + + projector_rules = [ + # projector fc1 + ConvertorRule( + re.compile(rf"^{MG_PROJECTOR_PREFIX}\.linear_fc1\.{WB}$"), + TransformType.SPLIT_NONE, + [f"{HF_PROJECTOR_PREFIX}" + r".mlp.0.\g"], + ), + # projector fc2 + ConvertorRule( + re.compile(rf"^{MG_PROJECTOR_PREFIX}\.linear_fc2\.{WB}$"), + TransformType.SPLIT_NONE, + [f"{HF_PROJECTOR_PREFIX}" + r".mlp.2.\g"], + ), + ] + return projector_rules + + def build_rules(self) -> List[ConvertorRule]: + rules = [] + rules.extend(self._build_vision_rules()) + rules.extend(self._build_llm_rules()) + rules.extend(self._build_projector_rules()) + return rules + + +_MG2HF_CONVERTOR_REGISTRY = {} + + +def register_mg2hf_convertor(model_arch: str, convertor_cls: Callable) -> None: + if model_arch in _MG2HF_CONVERTOR_REGISTRY: + raise ValueError(f"Convertor for {model_arch} already registered") + _MG2HF_CONVERTOR_REGISTRY[model_arch] = convertor_cls + + +register_mg2hf_convertor("qwen2.5", Qwen2_5Convertor) +register_mg2hf_convertor("qwen2.5_vl", Qwen2_5VLConvertor) + + +def get_mg2hf_convertor(model_arch: str, config, strict: bool = False) -> BaseConvertor: + if model_arch not in _MG2HF_CONVERTOR_REGISTRY: + raise ValueError(f"No convertor registered for {model_arch}") + convertor_cls = _MG2HF_CONVERTOR_REGISTRY[model_arch] + return convertor_cls(config=config, strict=strict) diff --git a/rlinf/utils/data_iter_utils.py b/rlinf/utils/data_iter_utils.py index 27bbd7215..31e440201 100644 --- a/rlinf/utils/data_iter_utils.py +++ b/rlinf/utils/data_iter_utils.py @@ -60,13 +60,17 @@ def concat_dict_list(list_of_dicts: List[Dict[str, Any]]) -> Dict[str, Any]: return result -def split_list(inputs, num_chunks, enforce_divisible_batch: Optional[bool] = True): +def split_list( + inputs: List, num_chunks: int, 
enforce_divisible_batch: Optional[bool] = True +): """ Split a list into equal sized chunks """ if enforce_divisible_batch: chunk_size = len(inputs) // num_chunks - assert len(inputs) % chunk_size == 0, "Issue with batch size configuration!" + assert len(inputs) % chunk_size == 0, ( + f"Issue with batch size configuration! inputs len:{len(inputs)} num_chunks:{num_chunks}" + ) return [inputs[i : i + chunk_size] for i in range(0, len(inputs), chunk_size)] else: k, m = divmod(len(inputs), num_chunks) diff --git a/rlinf/utils/distributed.py b/rlinf/utils/distributed.py index e9da8d6da..1f5ddc9c6 100644 --- a/rlinf/utils/distributed.py +++ b/rlinf/utils/distributed.py @@ -29,11 +29,16 @@ from rlinf.utils.timers import NamedTimer - +import torch_npu def compute_rollout_metrics( - rollout_batch, max_prompt_len, response_len, use_critic=False + rollout_batch, + max_prompt_len, + response_len, + dp_world_size, + dp_group=None, + use_critic=False, ): - device = torch.device(f"cuda:{torch.cuda.current_device()}") + device = torch.device(f"npu:{torch.npu.current_device()}") advantages = rollout_batch["advantages"].to(device=device) mask = rollout_batch["attention_mask"][:, -response_len:].to(device=device) prompt_lengths = rollout_batch["prompt_lengths"].clone().to(device=device) @@ -41,8 +46,6 @@ def compute_rollout_metrics( reward_scores = rollout_batch["rewards"].clone().to(device=device) is_end = rollout_batch["is_end"].clone().float().to(device=device) - dp_world_size = parallel_state.get_data_parallel_world_size() - prompt_lengths_list = [ torch.empty_like(prompt_lengths) for _ in range(dp_world_size) ] @@ -52,12 +55,12 @@ def compute_rollout_metrics( torch.distributed.all_gather( prompt_lengths_list, prompt_lengths, - group=parallel_state.get_data_parallel_group(), + group=dp_group, ) torch.distributed.all_gather( decode_lengths_list, response_lengths, - group=parallel_state.get_data_parallel_group(), + group=dp_group, ) total_prompt_lengths = 
torch.cat(prompt_lengths_list, dim=0) @@ -66,22 +69,22 @@ def compute_rollout_metrics( torch.distributed.all_reduce( prompt_lengths, torch.distributed.ReduceOp.AVG, - group=parallel_state.get_data_parallel_group(), + group=dp_group, ) torch.distributed.all_reduce( response_lengths, torch.distributed.ReduceOp.AVG, - group=parallel_state.get_data_parallel_group(), + group=dp_group, ) torch.distributed.all_reduce( reward_scores, torch.distributed.ReduceOp.AVG, - group=parallel_state.get_data_parallel_group(), + group=dp_group, ) torch.distributed.all_reduce( is_end, torch.distributed.ReduceOp.AVG, - group=parallel_state.get_data_parallel_group(), + group=dp_group, ) valid_adv = torch.masked_select(advantages, mask) @@ -90,24 +93,24 @@ def compute_rollout_metrics( torch.distributed.all_reduce( n_valid_token, op=torch.distributed.ReduceOp.SUM, - group=parallel_state.get_data_parallel_group(), + group=dp_group, ) torch.distributed.all_reduce( adv_sum, op=torch.distributed.ReduceOp.SUM, - group=parallel_state.get_data_parallel_group(), + group=dp_group, ) adv_mean = adv_sum / n_valid_token adv_max = torch.max(valid_adv).detach().item() adv_min = torch.min(valid_adv).detach().item() reduce_tensor = torch.as_tensor( - [-adv_min, adv_max], device=torch.cuda.current_device(), dtype=torch.float32 + [-adv_min, adv_max], device=torch.npu.current_device(), dtype=torch.float32 ) torch.distributed.all_reduce( reduce_tensor, torch.distributed.ReduceOp.MAX, - group=parallel_state.get_data_parallel_group(), + group=dp_group, ) adv_min, adv_max = reduce_tensor.tolist() @@ -172,7 +175,7 @@ def from_rollout_batches( dp_group: Optional[ProcessGroup], partitioning_tool: Callable, ) -> Self: - current_device = torch.cuda.current_device() + current_device = torch.npu.current_device() attn_mask = rollout_batches.get("attention_mask") current_num_samples = attn_mask.size(0) @@ -403,12 +406,12 @@ def rebalance_nd_tensor(tensor, group): NOTE: assumes all other (i.e., non-zero) dimensions are 
equal. """ num_samples = torch.as_tensor( - tensor.size(0), dtype=torch.int64, device=torch.cuda.current_device() + tensor.size(0), dtype=torch.int64, device=torch.npu.current_device() ) batch_num_per_rank = torch.zeros( torch.distributed.get_world_size(group), dtype=torch.int64, - device=torch.cuda.current_device(), + device=torch.npu.current_device(), ) torch.distributed.all_gather_into_tensor( batch_num_per_rank, num_samples, group=group @@ -419,7 +422,7 @@ def rebalance_nd_tensor(tensor, group): indices = batch_num_per_rank.cumsum(dim=0) output_tensor = torch.zeros( - B, *other_dims, dtype=tensor.dtype, device=torch.cuda.current_device() + B, *other_dims, dtype=tensor.dtype, device=torch.npu.current_device() ) # tensor_split is a view we can copy into @@ -451,7 +454,7 @@ def broadcast_tensor( """ if torch.distributed.get_rank() == src: - tensor = tensor.cuda() + tensor = tensor.npu() if dtype: tensor = tensor.to(dtype) @@ -464,7 +467,7 @@ def broadcast_tensor( torch.distributed.broadcast_object_list(metadata, src, group) dtype, input_shape = metadata - tensor = torch.empty(input_shape, dtype=dtype, device="cuda") + tensor = torch.empty(input_shape, dtype=dtype, device="npu") torch.distributed.broadcast(tensor, src, group) return tensor @@ -516,7 +519,7 @@ def broadcast_tensor_within_dp(tensor: torch.Tensor, dtype: torch.dtype): def gather_tensor(tensor, dst, group, dtype=None): """Gather any tensor to the dst rank from every other rank in the given group. 
All the ranks that send or receive data must call this function.""" - tensor = tensor.to(device=torch.cuda.current_device(), dtype=dtype) + tensor = tensor.to(device=torch.npu.current_device(), dtype=dtype) if torch.distributed.get_rank() == dst: gather_list = [ torch.empty_like(tensor) @@ -546,8 +549,8 @@ def normalize_tensor(tensor, mask, group=None): """normalizes a tensor using global mean and std""" dtype = torch.float64 tensor = tensor.to(dtype) - tensor = tensor.to(device=torch.cuda.current_device()) - mask = mask.to(device=torch.cuda.current_device()) + tensor = tensor.to(device=torch.npu.current_device()) + mask = mask.to(device=torch.npu.current_device()) tensor_global_mean, tensor_global_var = masked_global_mean_var( tensor, mask, group=group @@ -586,7 +589,7 @@ def masked_normalization( Normalized x, with the same shape as x. """ dtype = torch.float64 if high_precision else torch.float32 - x = x.to(dtype=dtype).cuda() + x = x.to(dtype=dtype).npu() if not inplace: x = x.clone() if dim is None: @@ -596,7 +599,7 @@ def masked_normalization( np.prod([x.shape[d] for d in dim]), dtype=dtype, device=x.device ) else: - mask = mask.to(dtype=dtype).cuda() + mask = mask.to(dtype=dtype).npu() assert len(mask.shape) == len(x.shape), (mask.shape, x.shape, dim) for i in range(len(x.shape)): if i in dim: @@ -640,8 +643,8 @@ def masked_global_mean_var(values, mask, group=None): mask and values must have same shape, with mask being {0,1} with 1 being the values we want to keep """ assert values.shape == mask.shape, (values.shape, mask.shape) - values = values.to(device=torch.cuda.current_device()) - mask = mask.to(device=torch.cuda.current_device()) + values = values.to(device=torch.npu.current_device()) + mask = mask.to(device=torch.npu.current_device()) values = values * mask @@ -649,7 +652,7 @@ def masked_global_mean_var(values, mask, group=None): sum_and_count = torch.tensor( [values.sum(), mask.sum()], dtype=torch.float64, - device=torch.cuda.current_device(), + 
device=torch.npu.current_device(), ) torch.distributed.all_reduce(sum_and_count, group=group) global_sum, global_count = sum_and_count @@ -657,7 +660,7 @@ def masked_global_mean_var(values, mask, group=None): variance_summed = ( (((values - global_mean) ** 2) * mask) .sum() - .to(device=torch.cuda.current_device(), dtype=torch.float64) + .to(device=torch.npu.current_device(), dtype=torch.float64) ) torch.distributed.all_reduce(variance_summed, group=group) @@ -666,12 +669,12 @@ def masked_global_mean_var(values, mask, group=None): def report_device_info(info_str): - free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info() + free_gpu_memory, total_gpu_memory = torch.npu.mem_get_info() free_gpu_memory /= 2**30 total_gpu_memory /= 2**30 - memory_allocated = torch.cuda.memory_allocated() / 2**30 - memory_reserved = torch.cuda.memory_reserved() / 2**30 + memory_allocated = torch.npu.memory_allocated() / 2**30 + memory_reserved = torch.npu.memory_reserved() / 2**30 print( f"[Rank {torch.distributed.get_rank()}] {info_str}, {free_gpu_memory=:.2f} GiB, {total_gpu_memory=:.2f} GiB, {memory_allocated=:.2f} GiB, {memory_reserved=:.2f} GiB" @@ -722,7 +725,7 @@ def all_reduce_dict( ): keys = sorted(dictionary) tensor = torch.as_tensor( - [dictionary[k] for k in keys], dtype=dtype, device=torch.cuda.current_device() + [dictionary[k] for k in keys], dtype=dtype, device=torch.npu.current_device() ) torch.distributed.all_reduce(tensor, op=op, group=group) return dict(zip(keys, tensor.tolist())) diff --git a/rlinf/utils/logging.py b/rlinf/utils/logging.py new file mode 100644 index 000000000..e80b6cf4f --- /dev/null +++ b/rlinf/utils/logging.py @@ -0,0 +1,20 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def get_logger(): + """Get the logger instance of the current worker.""" + from rlinf.scheduler.worker import Worker + + return Worker.logger diff --git a/rlinf/utils/placement.py b/rlinf/utils/placement.py index 7b707894e..6ecea767f 100644 --- a/rlinf/utils/placement.py +++ b/rlinf/utils/placement.py @@ -202,6 +202,7 @@ def __init__(self, config: DictConfig, cluster: Cluster): self._actor_gpus = self._component_gpu_map.get("actor", None) self._inference_gpus = self._component_gpu_map.get("inference", None) self._rollout_gpus = self._component_gpu_map.get("rollout", None) + self._reward_gpus = self._component_gpu_map.get("reward", None) assert self._actor_gpus is not None, ( "Actor GPUs must be specified in the component_placement config." ) @@ -224,11 +225,9 @@ def __init__(self, config: DictConfig, cluster: Cluster): len(self._inference_gpus) if self._inference_gpus else 0 ) self._rollout_num_gpus = len(self._rollout_gpus) + self._reward_num_gpus = len(self._reward_gpus) if self._reward_gpus else 0 if self._is_collocated(): - assert self.actor_tp_size >= self.rollout_tp_size, ( - f"Actor TP size {self.actor_tp_size} must be greater or equal to Rollout TP size {self.rollout_tp_size}." - ) assert self._inference_gpus is None, ( "Inference GPUs must not be specified in collocated mode." 
) @@ -280,21 +279,24 @@ def _generate_placements(self): self._actor_gpus[0], self._actor_gpus[-1] ) - actor_tp_size = self._config.actor.model.tensor_model_parallel_size - rollout_tp_size = self._config.rollout.tensor_parallel_size - assert actor_tp_size >= rollout_tp_size, ( - f"Actor TP size ({actor_tp_size}) must be greater or equal to Rollout TP size ({rollout_tp_size})" - ) - assert actor_tp_size % rollout_tp_size == 0, ( - f"Actor TP size ({actor_tp_size}) must be divisible by Rollout TP size ({rollout_tp_size})" + if self.actor_tp_size > self.rollout_tp_size: + assert self.actor_tp_size % self.rollout_tp_size == 0, ( + f"Actor TP size ({self.actor_tp_size}) must be divisible by Rollout TP size ({self.rollout_tp_size})" + ) + stride = ( + self.actor_tp_size // self.rollout_tp_size + if self.actor_tp_size > self.rollout_tp_size + else 1 ) - stride = actor_tp_size // rollout_tp_size self._placements["rollout"] = PackedPlacementStrategy( self._rollout_gpus[0], self._rollout_gpus[-1], - num_accelerators_per_process=rollout_tp_size, + num_accelerators_per_process=self.rollout_tp_size, stride=stride, ) + self._placements["reward"] = PackedPlacementStrategy( + self._reward_gpus[0], self._reward_gpus[-1] + ) elif self._placement_mode == PlacementMode.DISAGGREGATED: # Generate continuous placement strategies for components in a cluster. 
num_gpus_per_rollout_dp = len(self._rollout_gpus) // self.rollout_dp_size @@ -310,6 +312,9 @@ def _generate_placements(self): self._placements["actor"] = PackedPlacementStrategy( self._actor_gpus[0], self._actor_gpus[-1] ) + self._placements["reward"] = PackedPlacementStrategy( + self._reward_gpus[0], self._reward_gpus[-1] + ) @property def is_disaggregated(self): @@ -325,18 +330,18 @@ def has_dedicated_inference(self): @property def actor_dp_size(self) -> int: return self._actor_num_gpus // ( - self._config.actor.model.tensor_model_parallel_size - * self._config.actor.model.context_parallel_size - * self._config.actor.model.pipeline_model_parallel_size + self._config.actor.model.get("tensor_model_parallel_size", 1) + * self._config.actor.model.get("context_parallel_size", 1) + * self._config.actor.model.get("pipeline_model_parallel_size", 1) ) @property def actor_tp_size(self) -> int: - return self._config.actor.model.tensor_model_parallel_size + return self._config.actor.model.get("tensor_model_parallel_size", 1) @property def actor_pp_size(self) -> int: - return self._config.actor.model.pipeline_model_parallel_size + return self._config.actor.model.get("pipeline_model_parallel_size", 1) @property def actor_world_size(self) -> int: @@ -349,7 +354,7 @@ def inference_tp_size(self) -> int: and hasattr(self._config.inference, "model") and hasattr(self._config.inference.model, "tensor_model_parallel_size") ): - return self._config.inference.model.tensor_model_parallel_size + return self._config.inference.model.get("tensor_model_parallel_size", 1) else: return self.actor_tp_size @@ -360,7 +365,7 @@ def inference_pp_size(self) -> int: and hasattr(self._config.inference, "model") and hasattr(self._config.inference.model, "pipeline_model_parallel_size") ): - return self._config.inference.model.pipeline_model_parallel_size + return self._config.inference.model.get("pipeline_model_parallel_size", 1) else: return self.actor_pp_size @@ -377,14 +382,18 @@ def 
inference_world_size(self) -> int: @property def rollout_dp_size(self) -> int: return self._rollout_num_gpus // ( - self._config.rollout.tensor_parallel_size - * self._config.rollout.pipeline_parallel_size + self._config.rollout.get("tensor_parallel_size", 1) + * self._config.rollout.get("pipeline_parallel_size", 1) ) @property def rollout_tp_size(self) -> int: - return self._config.rollout.tensor_parallel_size + return self._config.rollout.get("tensor_parallel_size", 1) @property def rollout_world_size(self) -> int: return self._rollout_num_gpus + + @property + def reward_world_size(self) -> int: + return self._reward_num_gpus diff --git a/rlinf/utils/resharding/mcore_weight_reshard.py b/rlinf/utils/resharding/mcore_weight_reshard.py index 9ec44eba0..90d8277fa 100644 --- a/rlinf/utils/resharding/mcore_weight_reshard.py +++ b/rlinf/utils/resharding/mcore_weight_reshard.py @@ -183,6 +183,6 @@ def get_layer_num(param_name): ) if self.config.convert_fn is not None: - model_state_dict = self.config.convert_fn(model_state_dict, self.config) + model_state_dict = self.config.convert_fn(model_state_dict) return model_state_dict diff --git a/rlinf/utils/resharding/reshard_config.py b/rlinf/utils/resharding/reshard_config.py index 2b493a839..79e089bcd 100644 --- a/rlinf/utils/resharding/reshard_config.py +++ b/rlinf/utils/resharding/reshard_config.py @@ -17,7 +17,9 @@ from megatron.core.transformer import TransformerConfig -from .utils import get_convert_fn, get_pp_reshard_fn, get_tp_reshard_fn +from rlinf.utils.convertor.utils import get_mg2hf_convertor + +from .utils import get_pp_reshard_fn, get_tp_reshard_fn @dataclass @@ -37,7 +39,7 @@ class ReshardConfig: """Resharding pp size.""" convert_fn: Callable = None - """Convert function to use for converting the model parameters' weight and name from training engine to rollout engine.""" + """Function to convert the model weights from megatron format to HuggingFace format.""" tp_reshard_fn: Callable = None """Resharding 
function to use for resharding the model parallelism from tensor_model_parallel_size to reshard_tp_size.""" @@ -59,7 +61,8 @@ def __post_init__(self): ) if self.convert_fn is None and self.reshard_weights_format != "mcore": - self.convert_fn = get_convert_fn(self.model_arch) + self._convertor = get_mg2hf_convertor(self.model_arch, self, strict=True) + self.convert_fn = self._convertor.convert if self.tp_reshard_fn is None: self.tp_reshard_fn = get_tp_reshard_fn(self.model_arch) diff --git a/rlinf/utils/resharding/utils.py b/rlinf/utils/resharding/utils.py index 1fae2b05a..d7a4af231 100644 --- a/rlinf/utils/resharding/utils.py +++ b/rlinf/utils/resharding/utils.py @@ -12,21 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re -from enum import Enum -from typing import List, Tuple import torch -from megatron.core import parallel_state - - -def get_convert_fn(model_arch: str): - if model_arch == "qwen2.5": - return TransformFunc.convert_mega_qwen2_5_to_hf - else: - raise NotImplementedError( - f"get_convert_fn for model_arch {model_arch} is not implemented" - ) def get_tp_reshard_fn(model_arch: str): @@ -47,212 +34,6 @@ def get_pp_reshard_fn(model_arch: str): ) -########################### -# convert fn implementation -########################### - - -class TransformType(Enum): - SPLIT_QKV = "split_qkv" - SPLIT_QKV_BIAS = "split_qkv_bias" - SPLIT_FC1 = "split_fc1" - SPLIT_NONE = "split_none" - - -class TransformFunc: - @staticmethod - def _split_gqa_tensor( - tensor: torch.Tensor, new_statedict: dict, weight_names: List[str], config - ) -> None: - hidden_size = config.model_config.hidden_size - num_attention_heads = config.model_config.num_attention_heads - num_query_groups = config.model_config.num_query_groups or num_attention_heads - head_dim = hidden_size // num_attention_heads - - target_tp = config.reshard_tp_size - assert num_query_groups % target_tp == 0, ( - "num_query_groups must be 
divisible by reshard_tp_size" - ) - local_num_query_groups = num_query_groups // target_tp - - # heads per query group - assert num_attention_heads % num_query_groups == 0, ( - "num_attention_heads must be divisible by num_query_groups" - ) - q_heads_per_group = num_attention_heads // num_query_groups - - num_channel_qkv = q_heads_per_group + 2 - - if tensor.ndim == 2: - # Weight: [out_features, in_features] - out_features, in_features = tensor.shape - expected_out = local_num_query_groups * num_channel_qkv * head_dim - assert out_features == expected_out, ( - f"Unexpected fused QKV weight shape {tensor.shape}, expect " - f"[{expected_out}, {in_features}] (local groups={local_num_query_groups})" - ) - - qkv = tensor.view( - local_num_query_groups, num_channel_qkv, head_dim, in_features - ) - q, k, v = torch.split( - qkv, [q_heads_per_group, 1, 1], dim=1 - ) # shapes: [G, qh, D, In], [G,1,D,In], [G,1,D,In] - q_full = q.reshape(-1, in_features).contiguous() - k_full = k.reshape(-1, in_features).contiguous() - v_full = v.reshape(-1, in_features).contiguous() - else: - # Bias: [out_features] - out_features = tensor.shape[0] - expected_out = local_num_query_groups * num_channel_qkv * head_dim - assert out_features == expected_out, ( - f"Unexpected fused QKV bias shape {tensor.shape}, expect " - f"[{expected_out}] (local groups={local_num_query_groups})" - ) - - qkv = tensor.view(local_num_query_groups, num_channel_qkv, head_dim) - q, k, v = torch.split(qkv, [q_heads_per_group, 1, 1], dim=1) - q_full = q.reshape(-1).contiguous() - k_full = k.reshape(-1).contiguous() - v_full = v.reshape(-1).contiguous() - - # Save to target names - new_statedict[weight_names[0]] = q_full.clone() - new_statedict[weight_names[1]] = k_full.clone() - new_statedict[weight_names[2]] = v_full.clone() - - @staticmethod - def split_fc1( - linear_fc1: torch.Tensor, new_statedict: dict, weight_names: List[str], config - ) -> None: - assert weight_names is not None and len(weight_names) == 2, ( - 
f"split_fc1 transform expects two weight names, got {weight_names}" - ) - - tp_size = config.model_config.tensor_model_parallel_size - target_tp = config.reshard_tp_size - split_size = linear_fc1.shape[0] // (tp_size // target_tp) - linear_fc1_slice = torch.split(linear_fc1, split_size, dim=0) - - gate_proj_shards = [] - up_proj_shards = [] - for weight in linear_fc1_slice: - assert weight.shape[0] % 2 == 0, ( - f"linear_fc1 weight shape {weight.shape} is not even along dim 0" - ) - weight_chunk = torch.chunk(weight, 2, dim=0) - gate_proj_shards.append(weight_chunk[0]) - up_proj_shards.append(weight_chunk[1]) - gate_proj = torch.cat(gate_proj_shards, dim=0) - up_proj = torch.cat(up_proj_shards, dim=0) - - new_statedict[weight_names[0]] = gate_proj.clone() - new_statedict[weight_names[1]] = up_proj.clone() - - @staticmethod - def split_none( - tensor: torch.Tensor, new_statedict: dict, weight_names: List[str] - ) -> None: - assert weight_names is not None and len(weight_names) == 1, ( - f"split_none transform expects one weight name, got {weight_names}" - ) - new_statedict[weight_names[0]] = tensor.clone() - - @staticmethod - def mega_name_qwen2_5_to_hf(name: str) -> Tuple[TransformType, List[str]]: - """ - Convert qwen2_5 model weight megatron name to hf name and do shape transform if needed. 
- - Args: - name (str): megatron model weight name - - Returns: - (TransformType, List[str]): transform type and the corresponding hf model weight name - """ - if "embedding.word_embeddings.weight" in name: - return (TransformType.SPLIT_NONE, ["model.embed_tokens.weight"]) - if "decoder.final_layernorm.weight" in name: - return (TransformType.SPLIT_NONE, ["model.norm.weight"]) - if "output_layer.weight" in name: - return (TransformType.SPLIT_NONE, ["lm_head.weight"]) - layer_id, suffix = TransformFunc.extract_layer_info(name) - assert layer_id is not None, f"Cannot extract layer info from {name}" - result_pattern = "model.layers.{}.{}" - nmap = { - "self_attention.linear_proj.weight": ( - TransformType.SPLIT_NONE, - ["self_attn.o_proj.weight"], - ), - "self_attention.linear_qkv.layer_norm_weight": ( - TransformType.SPLIT_NONE, - ["input_layernorm.weight"], - ), - "self_attention.linear_qkv.weight": ( - TransformType.SPLIT_QKV, - [ - "self_attn.q_proj.weight", - "self_attn.k_proj.weight", - "self_attn.v_proj.weight", - ], - ), - "self_attention.linear_qkv.bias": ( - TransformType.SPLIT_QKV_BIAS, - [ - "self_attn.q_proj.bias", - "self_attn.k_proj.bias", - "self_attn.v_proj.bias", - ], - ), - "mlp.linear_fc1.layer_norm_weight": ( - TransformType.SPLIT_NONE, - ["post_attention_layernorm.weight"], - ), - "mlp.linear_fc1.weight": ( - TransformType.SPLIT_FC1, - ["mlp.gate_proj.weight", "mlp.up_proj.weight"], - ), - "mlp.linear_fc2.weight": ( - TransformType.SPLIT_NONE, - ["mlp.down_proj.weight"], - ), - } - - assert suffix in nmap, f"Cannot find mapping for {suffix}" - - transform_type, suffixes = nmap[suffix] - return ( - transform_type, - [result_pattern.format(layer_id, suffix) for suffix in suffixes], - ) - - @staticmethod - def convert_mega_qwen2_5_to_hf(model_state_dict: dict, config) -> dict: - new_statedict = {} - for name, param in model_state_dict.items(): - transform_type, hf_names = TransformFunc.mega_name_qwen2_5_to_hf(name) - if transform_type == 
TransformType.SPLIT_QKV: - TransformFunc._split_gqa_tensor(param, new_statedict, hf_names, config) - elif transform_type == TransformType.SPLIT_QKV_BIAS: - TransformFunc._split_gqa_tensor(param, new_statedict, hf_names, config) - elif transform_type == TransformType.SPLIT_FC1: - TransformFunc.split_fc1(param, new_statedict, hf_names, config) - elif transform_type == TransformType.SPLIT_NONE: - TransformFunc.split_none(param, new_statedict, hf_names) - else: - raise NotImplementedError( - f"Transform type {transform_type} not implemented" - ) - return new_statedict - - @staticmethod - def extract_layer_info(s): - pattern = r"layers\.(\d+)\.(.+)" - match = re.search(pattern, s) - if match: - return match.group(1), match.group(2) - return None, None - - ############################## # tp reshard fn implementation ############################## @@ -314,6 +95,8 @@ def _gather_pp_group_tensor_and_reshard( def pp_reshard_fn_qwen2_5(model_state_dict, pp_group, dtype): + from megatron.core import parallel_state + pp_first_rank = parallel_state.get_pipeline_model_parallel_first_rank() pp_last_rank = parallel_state.get_pipeline_model_parallel_last_rank() diff --git a/rlinf/utils/utils.py b/rlinf/utils/utils.py index a2fc29cdd..b840538cc 100644 --- a/rlinf/utils/utils.py +++ b/rlinf/utils/utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import atexit import gc import os import sys @@ -19,13 +20,14 @@ from functools import partial, wraps import torch - +import torch.nn.functional as F +import torch_npu def clear_memory(sync=True): if sync: - torch.cuda.synchronize() + torch.npu.synchronize() gc.collect() - torch.cuda.empty_cache() + torch.npu.empty_cache() def apply_func_to_dict(func, dictionary): @@ -52,7 +54,7 @@ def retrieve_model_state_dict_in_cpu(model): cpu_dict[name] = item - torch.cuda.synchronize() + torch.npu.synchronize() return cpu_dict @@ -123,6 +125,65 @@ def seq_mean_token_mean(values, mask): return loss +def logprobs_from_logits_flash_attn(logits, labels, inplace_backward=True): + #from flash_attn.ops.triton.cross_entropy import cross_entropy_loss + + #output = cross_entropy_loss(logits, labels, inplace_backward=inplace_backward) + #assert isinstance(output, tuple), ( + # "please make sure flash-attn>=2.4.3 where cross_entropy_loss returns Tuple[losses, z_losses]." + #) + #return -output[0] + import torch.nn.functional as F + + # 数值稳定的 log_softmax + log_probs = F.log_softmax(logits, dim=-1) + + # 提取标签对应的 logprob + labels = labels.unsqueeze(-1) + return torch.gather(log_probs, -1, labels).squeeze(-1) + + +def compute_logprobs_from_logits(logits, target, task_type="embodied"): + if task_type == "embodied": + logprobs = -F.cross_entropy( + logits, target=target, reduction="none" + ) # [B, action-dim] + return logprobs + batch_dim = logits.shape[:-1] + last_dim = logits.shape[-1] + logits = logits.reshape(-1, last_dim) + labels = target.reshape(-1) + logprobs = logprobs_from_logits_flash_attn( + logits, labels=labels, inplace_backward=False + ) + logprobs = logprobs.view(*batch_dim) + return logprobs + + +def entropy_from_logits(logits: torch.Tensor): + """Calculate entropy from logits.""" + pd = torch.nn.functional.softmax(logits, dim=-1) + entropy = torch.logsumexp(logits, dim=-1) - torch.sum(pd * logits, dim=-1) + return entropy + + +def compute_entropy_from_logits(logits, 
epsilon=1e-10, task_type="embodied"): + """ + Compute entropy by logits. + + Args: + logits: [B, vocab-size, seq-len] + Returns: + entropy: [B, seq-len] + """ + if task_type == "embodied": + all_probs = F.softmax(logits, dim=1) # [B, vocab-size, seq-len] + all_log_probs = torch.log(all_probs + epsilon) + entropy = -torch.sum(all_probs * all_log_probs, dim=1) # [B, seq-len] + return entropy + return entropy_from_logits(logits=logits) + + class DualOutput: def __init__(self, file, terminal): self.file = file @@ -166,32 +227,37 @@ def wrapper(cfg, *args, **kwargs): ) os.makedirs(os.path.dirname(log_path), exist_ok=True) - with open(log_path, "w", encoding="utf-8", buffering=1) as f: - dual_out = DualOutput(f, sys.stdout) - dual_err = DualOutput(f, sys.stderr) - - old_stdout = sys.stdout - old_stderr = sys.stderr - try: - sys.stdout = dual_out - sys.stderr = dual_err - return func(cfg, *args, **kwargs) - - except Exception as e: - import traceback - - error_msg = f"\nException occurred: {e}\n{traceback.format_exc()}\n" - dual_err.write(error_msg) - dual_err.flush() - f.flush() - raise - - finally: - sys.stdout = old_stdout - sys.stderr = old_stderr - - dual_out.flush() - dual_err.flush() - f.flush() + f = open(log_path, "w", encoding="utf-8", buffering=1) + + def close(): + dual_out.flush() + dual_err.flush() + f.flush() + f.close() + + atexit.register(close) + + dual_out = DualOutput(f, sys.stdout) + dual_err = DualOutput(f, sys.stderr) + + old_stdout = sys.stdout + old_stderr = sys.stderr + try: + sys.stdout = dual_out + sys.stderr = dual_err + return func(cfg, *args, **kwargs) + + except Exception as e: + import traceback + + error_msg = f"\nException occurred: {e}\n{traceback.format_exc()}\n" + dual_err.write(error_msg) + dual_err.flush() + f.flush() + raise + + finally: + sys.stdout = old_stdout + sys.stderr = old_stderr return wrapper diff --git a/rlinf/workers/actor/__init__.py b/rlinf/workers/actor/__init__.py index 5b365ea1e..2d315469e 100644 --- 
a/rlinf/workers/actor/__init__.py +++ b/rlinf/workers/actor/__init__.py @@ -11,3 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from omegaconf import DictConfig + +from rlinf.scheduler.worker.worker import Worker + + +def get_actor_worker(cfg: DictConfig) -> Worker: + if cfg.actor.training_backend == "fsdp": + from .fsdp_actor_worker import FSDPActor + + return FSDPActor + elif cfg.actor.training_backend == "megatron": + from .megatron_actor_worker import MegatronActor + + return MegatronActor + else: + raise ValueError(f"Unsupported training backend: {cfg.actor.training_backend}") diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py index 51c0a9533..1b3b015bb 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -14,31 +14,518 @@ import gc import os +from contextlib import nullcontext +from typing import Dict, Tuple import numpy as np import torch from omegaconf import DictConfig from torch.distributed.device_mesh import init_device_mesh +from torch.multiprocessing.reductions import reduce_tensor from tqdm import tqdm import rlinf.algorithms # noqa: F401 from rlinf.algorithms.registry import actor_loss, calculate_adv_and_returns -from rlinf.algorithms.utils import preprocess_advantages_inputs, preprocess_loss_inputs +from rlinf.algorithms.utils import ( + kl_penalty, + preprocess_advantages_inputs, + preprocess_loss_inputs, +) +from rlinf.data.io_struct import RolloutResult from rlinf.hybrid_engines.fsdp.fsdp_model_manager import ( FSDPModelManager, ) from rlinf.models import get_model from rlinf.models.embodiment.model_utils import custom_forward -from rlinf.scheduler import Cluster, Worker +from rlinf.scheduler import Channel, Cluster, Worker from rlinf.utils.data_iter_utils import get_iterator_k_split from 
rlinf.utils.distributed import all_reduce_dict +from rlinf.utils.distributed import ( + compute_rollout_metrics as compute_math_rollout_metrics, +) from rlinf.utils.metric_utils import ( append_to_dict, compute_loss_mask, compute_rollout_metrics, compute_split_num, ) -from rlinf.utils.placement import HybridComponentPlacement +from rlinf.utils.placement import ( + HybridComponentPlacement, + ModelParallelComponentPlacement, +) +from rlinf.utils.utils import ( + compute_logprobs_from_logits, + cpu_weight_swap, + masked_mean, + retrieve_model_state_dict_in_cpu, + seq_mean_token_mean, + seq_mean_token_sum, +) +from rlinf.workers.rollout.utils import RankMapper +import torch_npu + +class FSDPActor(FSDPModelManager, Worker): + def __init__(self, cfg: DictConfig, placement: ModelParallelComponentPlacement): + Worker.__init__(self) + super().__init__(cfg.actor) + + self.cfg = cfg + + self.response_len = ( + cfg.actor.model.encoder_seq_length - cfg.data.max_prompt_length + ) + self.calculate_entropy = self.cfg.algorithm.calculate_entropy + self.calculate_entropy_loss = ( + self.cfg.algorithm.entropy_bonus > 0 and self.calculate_entropy + ) + self.kl_beta = self.cfg.algorithm.kl_beta + self.kl_penalty_type = self.cfg.algorithm.kl_penalty_type + + self.total_batch_size_per_dp = ( + self.cfg.data.rollout_batch_size + * self.cfg.algorithm.group_size + // self._world_size + ) + + torch.npu.set_device(int(os.environ["LOCAL_RANK"])) + self.device = torch.npu.current_device() + world_size = self._world_size + self.device_mesh = init_device_mesh( + "npu", mesh_shape=(world_size,), mesh_dim_names=["fsdp"] + ) + + self._rollout_group_name = cfg.rollout.group_name + self._component_placement = placement + self.is_data_io_rank = True + self.is_pipeline = self._component_placement.is_disaggregated + self.ref_policy_state_dict = None + + if self.cfg.algorithm.loss_agg_func == "token-mean": + self.loss_agg_func = masked_mean + elif self.cfg.algorithm.loss_agg_func == "seq-mean-token-sum": 
+ self.loss_agg_func = seq_mean_token_sum + elif self.cfg.algorithm.loss_agg_func == "seq-mean-token-mean": + self.loss_agg_func = seq_mean_token_mean + else: + raise NotImplementedError( + f"algorithm.loss_agg_func={self.cfg.algorithm.loss_agg_func} is not supported!" + ) + + def init_worker(self) -> None: + self.setup_model_and_optimizer() + if self.cfg.algorithm.kl_beta > 0 and self.cfg.actor.get( + "combine_reference_model", True + ): + self.ref_policy_state_dict = retrieve_model_state_dict_in_cpu(self.model) + + if self.cfg.actor.get("enable_offload", False): + self.offload_fsdp_param_and_grad() + self.offload_fsdp_optimizer() + self._setup_rollout_weight_dst_ranks() + + def _setup_rollout_weight_dst_ranks(self) -> None: + """Setup destination ranks for token and weight communication.""" + rank_map = RankMapper.get_actor_rank_to_rollout_rank_map( + self._component_placement + ) + self._weight_dst_rank_in_rollout = rank_map[self._rank] + self.log_info( + f"Actor rank {self._rank} will send weights to {self._weight_dst_rank_in_rollout}" + ) + + def del_reshard_state_dict(self) -> None: + if hasattr(self, "rollout_state_dict"): + del self.rollout_state_dict + + def sync_model_to_rollout(self) -> None: + if self.cfg.actor.get("enable_offload", False): + self.offload_fsdp_optimizer() + + if next(self.model.parameters()).is_cpu: + self.load_fsdp_param_and_grad(self.device) + self.rollout_state_dict = self.get_model_state_dict() + + has_visual = any("visual." in k for k in self.rollout_state_dict.keys()) + + state_dict = {} + + if self._weight_dst_rank_in_rollout is not None: + for k, v in self.rollout_state_dict.items(): + name = k + if has_visual: + if name.startswith("model.language_model."): + name = "model." 
+ name[21:] + # NOTE: + # if transformers version is 4.56.1 or older(not tested), + # the following line should be uncommented + + # elif name.startswith("model."): + # name = name[6:] + state_dict[name] = reduce_tensor(v) + + self.send( + state_dict, self._rollout_group_name, self._weight_dst_rank_in_rollout + ) + + if self.cfg.actor.get("enable_offload", False): + self.offload_fsdp_param_and_grad() + + def compute_logprobs(self) -> None: + self.model.eval() + self.rollout_batch["logprob"] = self.rollout_batch["prev_logprobs"] + + def get_batch( + self, channel: Channel + ) -> Tuple[Dict[str, torch.Tensor], RolloutResult]: + result: RolloutResult = channel.get() + + batch = result.to_actor_batch( + self.cfg.data.max_prompt_length, + self.cfg.actor.model.encoder_seq_length, + self.tokenizer.eos_token_id, + ) + return batch, result + + def put_result(self, result: RolloutResult, channel: Channel) -> None: + if channel.is_local: + # Local channel, every process will put its own data locally + # No need to broadcast + channel.put(result) + else: + if self.is_data_io_rank: + channel.put(result) + + def _load_weight_and_optimizer(self, channel: Channel) -> None: + # Acquire the GPUs to ensure that no one is using them before loading models + # Otherwise, it may lead to OOM + with channel.device_lock: + if self.cfg.actor.get("enable_offload", False): + self.load_fsdp_param_and_grad(self.device) + self.load_fsdp_optimizer(self.device) + + @torch.no_grad() + def inference_step(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor: + self.model.eval() + input_ids = batch["input_ids"] + attention_mask = batch["attention_mask"] + position_ids = batch["position_ids"] + + multi_modal_inputs = {} + if "multi_modal_inputs" in batch.keys(): + for key in batch["multi_modal_inputs"][0].keys(): + multi_modal_inputs[key] = torch.cat( + [inputs[key] for inputs in batch["multi_modal_inputs"]], + dim=0, + ).npu() + + outputs = self.model( + input_ids=input_ids, + 
attention_mask=attention_mask, + position_ids=position_ids, + use_cache=False, + **multi_modal_inputs, + ) + + logits = outputs.logits + logits = logits[:, -self.response_len - 1 : -1, :] + logits = logits / self.cfg.algorithm.sampling_params.temperature + + responses = input_ids[:, -self.response_len :] + logprobs = compute_logprobs_from_logits( + logits, responses, task_type=self.cfg.runner.task_type + ) + return logprobs + + def run_inference( + self, + input_channel: Channel, + output_channel: Channel, + rollout_channel: Channel, + compute_ref_logprobs: bool, + ) -> None: + """ + Compute prev/ref logprobs using the actor Model's forward. + + Args: + input_channel: The input channel to read from. + output_channel: The output channel to send results to. + rollout_channel: get the rollout channel's device lock in case of collision. + compute_ref_logprobs: Whether to compute reference logprobs. + """ + recv_batch_size = 0 + while recv_batch_size < self.total_batch_size_per_dp: + batch, rollout_result = self.get_batch(input_channel) + recv_batch_size += rollout_result.num_sequence + self._load_weight_and_optimizer( + input_channel if self.is_pipeline else rollout_channel + ) + num_splits = ( + rollout_result.num_sequence + // self.cfg.algorithm.logprob_forward_micro_batch_size + ) + micro_batches_iter = get_iterator_k_split( + batch, + num_splits=num_splits, + ) + micro_batches = list(micro_batches_iter) + + prev_logprobs = [] + with self.worker_timer(): + for micro_batch in micro_batches: + prev_logprobs.append(self.inference_step(micro_batch).cpu()) + rollout_result.prev_logprobs = torch.cat(prev_logprobs) + if compute_ref_logprobs: + assert self.ref_policy_state_dict is not None, ( + "Reference policy state dict is None but compute_ref_logprobs is True" + ) + ref_logprobs = [] + with cpu_weight_swap(self.model, self.ref_policy_state_dict): + for micro_batch in micro_batches: + ref_logprobs.append(self.inference_step(micro_batch).cpu()) + 
rollout_result.ref_logprobs = torch.cat(ref_logprobs) + self.put_result(rollout_result, output_channel) + + assert recv_batch_size == self.total_batch_size_per_dp, ( + f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" + ) + + def run_training(self, input_channel: Channel) -> Tuple[Dict, list]: + # Get all batches for this DP + batches = [] + recv_batch_size = 0 + while recv_batch_size < self.total_batch_size_per_dp: + batch, rollout_result = self.get_batch(input_channel) + batches.append(batch) + recv_batch_size += rollout_result.num_sequence + assert recv_batch_size == self.total_batch_size_per_dp, ( + f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" + ) + batch = RolloutResult.merge_batches(batches) + # Must be called after batch is retrieved, which is when rollout has stopped + # Otherwise, loading model might cause OOM + self._load_weight_and_optimizer(input_channel) + + global_batches = get_iterator_k_split( + batch, + num_splits=self.cfg.algorithm.n_minibatches, + shuffle=self.cfg.algorithm.get("shuffle_rollout", True), + shuffle_seed=self.cfg.actor.seed, + ) + + self.model.train() + assert ( + self.cfg.actor.global_batch_size + % (self.cfg.actor.micro_batch_size * self._world_size) + == 0 + ) + + training_metrics_list = [] + # Global batch iterations + with self.worker_timer(): + for global_batch in global_batches: + train_global_batch_size = global_batch["input_ids"].shape[0] + + assert train_global_batch_size % self.cfg.actor.micro_batch_size == 0, ( + f"{train_global_batch_size=}, {self.cfg.actor.micro_batch_size=}" + ) + + self.gradient_accumulation = ( + train_global_batch_size // self.cfg.actor.micro_batch_size + ) + # split batch into micro_batches + train_micro_batches = get_iterator_k_split( + global_batch, + train_global_batch_size // self.cfg.actor.micro_batch_size, + ) + + self.optimizer.zero_grad() + metrics = {} + for idx, m_batch in 
enumerate(train_micro_batches): + backward_ctx = ( + self.model.no_sync() + if idx < self.gradient_accumulation - 1 + else nullcontext() + ) + for k, v in m_batch.items(): + m_batch[k] = v.npu() if isinstance(v, torch.Tensor) else v + + multi_modal_inputs = {} + if "multi_modal_inputs" in m_batch.keys(): + for key in m_batch["multi_modal_inputs"][0].keys(): + multi_modal_inputs[key] = torch.cat( + [ + inputs[key] + for inputs in m_batch["multi_modal_inputs"] + ], + dim=0, + ).npu() + + input_ids = m_batch["input_ids"] + attention_mask = m_batch["attention_mask"] + position_ids = m_batch["position_ids"] + prev_logprobs = m_batch["prev_logprobs"] + advantages = m_batch["advantages"] + ref_logprobs = None + if "ref_logprobs" in m_batch: + ref_logprobs = m_batch["ref_logprobs"] + + loss_mask = m_batch["attention_mask"][:, -self.response_len :] + output = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + **multi_modal_inputs, + use_cache=False, + ) + + logits = output.logits + + logits.div_(self.cfg.algorithm.sampling_params.temperature) + + responses = input_ids[:, -self.response_len :] + logits = logits[ + :, -self.response_len - 1 : -1, : + ] # (bsz, response_length, vocab_size) + logprobs = compute_logprobs_from_logits( + logits, responses, task_type=self.cfg.runner.task_type + ) + + clip_ratio = self.cfg.algorithm.ratio_clip_eps + clip_ratio_low = ( + self.cfg.algorithm.clip_ratio_low + if self.cfg.algorithm.clip_ratio_low is not None + else clip_ratio + ) + clip_ratio_high = ( + self.cfg.algorithm.clip_ratio_high + if self.cfg.algorithm.clip_ratio_high is not None + else clip_ratio + ) + clip_ratio_c = self.cfg.algorithm.get("clip_ratio_c", 3.0) + + loss, mbs_metrics_data = actor_loss( + loss_type=self.cfg.algorithm.loss_type, + loss_agg_func=self.loss_agg_func, + logprobs=logprobs, + old_logprobs=prev_logprobs, + advantages=advantages, + clip_ratio_low=clip_ratio_low, + clip_ratio_high=clip_ratio_high, + 
clip_ratio_c=clip_ratio_c, + loss_mask=loss_mask, + ) + + entropy_loss = torch.tensor(0.0, device=torch.npu.current_device()) + if self.calculate_entropy: + entropy = output["entropy"][ + :, -self.response_len - 1 : -1 + ].contiguous() + entropy_loss = self.loss_agg_func(entropy, mask=loss_mask) + if self.calculate_entropy_loss: + loss = ( + loss - self.cfg.algorithm.entropy_bonus * entropy_loss + ) + + kl_loss = torch.tensor(0.0, device=torch.npu.current_device()) + if self.kl_beta > 0 and ref_logprobs is not None: + kld = kl_penalty(ref_logprobs, logprobs, self.kl_penalty_type) + kl_loss = self.loss_agg_func(kld, loss_mask) + loss = loss + kl_loss * self.kl_beta + + # add to log + # scale loss for gradient accumulation and backprop + loss = loss / self.gradient_accumulation + with backward_ctx: + loss.backward() + + mbs_metrics_data.update( + { + "final_loss": loss.detach(), + "entropy_loss": entropy_loss.detach(), + "kl_loss": kl_loss.detach(), + } + ) + + append_to_dict(metrics, mbs_metrics_data) + # apply gradient clipping and optimizer step at the end of a global batch + grad_norm = self.model.clip_grad_norm_( + max_norm=self.cfg.actor.optim.clip_grad + ) + if not torch.isfinite(grad_norm).all(): + self.log_warning( + "grad norm is not finite, skip this optimizer step." 
+ ) + else: + self.optimizer.step() + self.optimizer.zero_grad() + + # aggregate metrics across micro-batches + mean_metric_dict = { + key: torch.mean(torch.stack(value)) + for key, value in metrics.items() + } + mean_metric_dict = all_reduce_dict( + mean_metric_dict, op=torch.distributed.ReduceOp.AVG + ) + # add optimizer stats + if torch.is_tensor(grad_norm): + mean_metric_dict["actor/grad_norm"] = float( + grad_norm.detach().item() + ) + else: + mean_metric_dict["actor/grad_norm"] = float(grad_norm) + lr = self.optimizer.param_groups[0]["lr"] + mean_metric_dict["actor/lr"] = torch.as_tensor(lr).float().cpu() + training_metrics_list.append(mean_metric_dict) + + # Rollout metrics + rollout_metrics, _, _ = compute_math_rollout_metrics( + batch, self.cfg.data.max_prompt_length, self.response_len, self._world_size + ) + + return rollout_metrics, training_metrics_list + + def save_checkpoint(self, save_base_path: str, step: int) -> None: + torch.distributed.barrier() + model_state = self.get_model_state_dict() + optim_state = self.get_optimizer_state_dict() + if self._rank == 0: + os.makedirs(save_base_path, exist_ok=True) + torch.save(model_state, os.path.join(save_base_path, "model.pt")) + torch.save(optim_state, os.path.join(save_base_path, "optim.pt")) + torch.distributed.barrier() + + # Advantages and returns + def compute_advantages_and_returns( + self, input_channel: Channel, output_channel: Channel + ) -> None: + """Compute the advantages and returns. + + Args: + input_channel: The input channel to read from. + output_channel: The output channel to send results to. 
+ """ + recv_batch_size = 0 + while recv_batch_size < self.total_batch_size_per_dp: + batch, rollout_result = self.get_batch(input_channel) + recv_batch_size += rollout_result.num_sequence + + with self.worker_timer(): + if rollout_result.advantages is None: + mask = batch["attention_mask"][:, -self.response_len :] + advantages, returns = calculate_adv_and_returns( + adv_type=self.cfg.algorithm.adv_type, + reward_scores=batch["rewards"].npu(), + mask=mask.npu(), + num_responses=self.cfg.algorithm.group_size, + ) + rollout_result.advantages = advantages.cpu() + + self.put_result(rollout_result, output_channel) + + assert recv_batch_size == self.total_batch_size_per_dp, ( + f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" + ) class EmbodiedFSDPActor(FSDPModelManager, Worker): @@ -47,11 +534,11 @@ def __init__(self, cfg: DictConfig): super().__init__(cfg.actor) self.cfg = cfg - torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) - self.device = torch.cuda.current_device() + torch.npu.set_device(int(os.environ["LOCAL_RANK"])) + self.device = torch.npu.current_device() world_size = self._world_size self.device_mesh = init_device_mesh( - "cuda", mesh_shape=(world_size,), mesh_dim_names=["fsdp"] + "npu", mesh_shape=(world_size,), mesh_dim_names=["fsdp"] ) self._env_group_name = cfg.env.group_name @@ -79,9 +566,9 @@ def init_worker(self): if self.cfg.actor.get("enable_offload", False): self.offload_fsdp_param_and_grad() self.offload_fsdp_optimizer() - torch.cuda.synchronize() + torch.npu.synchronize() gc.collect() - torch.cuda.empty_cache() + torch.npu.empty_cache() def model_provider_func(self): model = get_model(self.cfg.actor.checkpoint_load_path, self.cfg.actor.model) @@ -102,10 +589,10 @@ def sync_model_to_rollout(self): if self.cfg.actor.get("enable_offload", False): self.offload_fsdp_param_and_grad() self.offload_fsdp_optimizer() - torch.cuda.synchronize() + torch.npu.synchronize() del state_dict gc.collect() - 
torch.cuda.empty_cache() + torch.npu.empty_cache() async def recv_rollout_batch(self): send_num = self._component_placement.get_world_size("rollout") * self.stage_num @@ -159,6 +646,10 @@ def _process_received_rollout_batch(self, rollout_batch): ] # [n_chunk_step, rollout_epoch x bsz, num_action_chunks] loss_mask, loss_mask_sum = compute_loss_mask(dones) + if self.cfg.algorithm.reward_type == "chunk_level": + loss_mask = loss_mask.any(dim=-1, keepdim=True) + loss_mask_sum = loss_mask_sum[..., -1:] + rollout_batch["loss_mask"] = loss_mask rollout_batch["loss_mask_sum"] = loss_mask_sum @@ -385,7 +876,7 @@ def run_training(self): metrics_data["loss"] = loss.detach().item() append_to_dict(metrics, metrics_data) - torch.cuda.empty_cache() + torch.npu.empty_cache() grad_norm = self.model.clip_grad_norm_( max_norm=self.cfg.actor.optim.clip_grad @@ -407,9 +898,9 @@ def run_training(self): ) self.optimizer.zero_grad() - torch.cuda.synchronize() + torch.npu.synchronize() torch.distributed.barrier() - torch.cuda.empty_cache() + torch.npu.empty_cache() return mean_metric_dict diff --git a/rlinf/workers/actor/fsdp_actor_worker_bak.py b/rlinf/workers/actor/fsdp_actor_worker_bak.py new file mode 100644 index 000000000..84d8e09c5 --- /dev/null +++ b/rlinf/workers/actor/fsdp_actor_worker_bak.py @@ -0,0 +1,903 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import os +from contextlib import nullcontext +from typing import Dict, Tuple + +import numpy as np +import torch +from omegaconf import DictConfig +from torch.distributed.device_mesh import init_device_mesh +from torch.multiprocessing.reductions import reduce_tensor +from tqdm import tqdm + +import rlinf.algorithms # noqa: F401 +from rlinf.algorithms.registry import actor_loss, calculate_adv_and_returns +from rlinf.algorithms.utils import ( + kl_penalty, + preprocess_advantages_inputs, + preprocess_loss_inputs, +) +from rlinf.data.io_struct import RolloutResult +from rlinf.hybrid_engines.fsdp.fsdp_model_manager import ( + FSDPModelManager, +) +from rlinf.models import get_model +from rlinf.models.embodiment.model_utils import custom_forward +from rlinf.scheduler import Channel, Cluster, Worker +from rlinf.utils.data_iter_utils import get_iterator_k_split +from rlinf.utils.distributed import all_reduce_dict +from rlinf.utils.distributed import ( + compute_rollout_metrics as compute_math_rollout_metrics, +) +from rlinf.utils.metric_utils import ( + append_to_dict, + compute_loss_mask, + compute_rollout_metrics, + compute_split_num, +) +from rlinf.utils.placement import ( + HybridComponentPlacement, + ModelParallelComponentPlacement, +) +from rlinf.utils.utils import ( + compute_logprobs_from_logits, + cpu_weight_swap, + masked_mean, + retrieve_model_state_dict_in_cpu, + seq_mean_token_mean, + seq_mean_token_sum, +) +from rlinf.workers.rollout.utils import RankMapper +import torch_npu + +class FSDPActor(FSDPModelManager, Worker): + def __init__(self, cfg: DictConfig, placement: ModelParallelComponentPlacement): + Worker.__init__(self) + super().__init__(cfg.actor) + + self.cfg = cfg + + self.response_len = ( + cfg.actor.model.encoder_seq_length - cfg.data.max_prompt_length + ) + self.calculate_entropy = self.cfg.algorithm.calculate_entropy + self.calculate_entropy_loss = ( + self.cfg.algorithm.entropy_bonus > 0 and self.calculate_entropy + ) + 
self.kl_beta = self.cfg.algorithm.kl_beta + self.kl_penalty_type = self.cfg.algorithm.kl_penalty_type + + self.total_batch_size_per_dp = ( + self.cfg.data.rollout_batch_size + * self.cfg.algorithm.group_size + // self._world_size + ) + + torch.npu.set_device(int(os.environ["LOCAL_RANK"])) + self.device = torch.npu.current_device() + world_size = self._world_size + self.device_mesh = init_device_mesh( + "npu", mesh_shape=(world_size,), mesh_dim_names=["fsdp"] + ) + + self._rollout_group_name = cfg.rollout.group_name + self._component_placement = placement + self.is_data_io_rank = True + self.is_pipeline = self._component_placement.is_disaggregated + self.ref_policy_state_dict = None + + if self.cfg.algorithm.loss_agg_func == "token-mean": + self.loss_agg_func = masked_mean + elif self.cfg.algorithm.loss_agg_func == "seq-mean-token-sum": + self.loss_agg_func = seq_mean_token_sum + elif self.cfg.algorithm.loss_agg_func == "seq-mean-token-mean": + self.loss_agg_func = seq_mean_token_mean + else: + raise NotImplementedError( + f"algorithm.loss_agg_func={self.cfg.algorithm.loss_agg_func} is not supported!" 
+ ) + + def init_worker(self) -> None: + self.setup_model_and_optimizer() + if self.cfg.algorithm.kl_beta > 0 and self.cfg.actor.get( + "combine_reference_model", True + ): + self.ref_policy_state_dict = retrieve_model_state_dict_in_cpu(self.model) + + if self.cfg.actor.get("enable_offload", False): + self.offload_fsdp_param_and_grad() + self.offload_fsdp_optimizer() + self._setup_rollout_weight_dst_ranks() + + def _setup_rollout_weight_dst_ranks(self) -> None: + """Setup destination ranks for token and weight communication.""" + rank_map = RankMapper.get_actor_rank_to_rollout_rank_map( + self._component_placement + ) + self._weight_dst_rank_in_rollout = rank_map[self._rank] + self.log_info( + f"Actor rank {self._rank} will send weights to {self._weight_dst_rank_in_rollout}" + ) + + def del_reshard_state_dict(self) -> None: + if hasattr(self, "rollout_state_dict"): + del self.rollout_state_dict + + def sync_model_to_rollout(self) -> None: + if self.cfg.actor.get("enable_offload", False): + self.offload_fsdp_optimizer() + + if next(self.model.parameters()).is_cpu: + self.load_fsdp_param_and_grad(self.device) + self.rollout_state_dict = self.get_model_state_dict() + + has_visual = any("visual." in k for k in self.rollout_state_dict.keys()) + + state_dict = {} + + if self._weight_dst_rank_in_rollout is not None: + for k, v in self.rollout_state_dict.items(): + name = k + if has_visual: + if name.startswith("model.language_model."): + name = "model." 
+ name[21:] + # NOTE: + # if transformers version is 4.56.1 or older(not tested), + # the following line should be uncommented + + # elif name.startswith("model."): + # name = name[6:] + state_dict[name] = reduce_tensor(v) + + self.send( + state_dict, self._rollout_group_name, self._weight_dst_rank_in_rollout + ) + + if self.cfg.actor.get("enable_offload", False): + self.offload_fsdp_param_and_grad() + + def compute_logprobs(self) -> None: + self.model.eval() + self.rollout_batch["logprob"] = self.rollout_batch["prev_logprobs"] + + def get_batch( + self, channel: Channel + ) -> Tuple[Dict[str, torch.Tensor], RolloutResult]: + result: RolloutResult = channel.get() + + batch = result.to_actor_batch( + self.cfg.data.max_prompt_length, + self.cfg.actor.model.encoder_seq_length, + self.tokenizer.eos_token_id, + ) + return batch, result + + def put_result(self, result: RolloutResult, channel: Channel) -> None: + if channel.is_local: + # Local channel, every process will put its own data locally + # No need to broadcast + channel.put(result) + else: + if self.is_data_io_rank: + channel.put(result) + + def _load_weight_and_optimizer(self, channel: Channel) -> None: + # Acquire the GPUs to ensure that no one is using them before loading models + # Otherwise, it may lead to OOM + with channel.device_lock: + if self.cfg.actor.get("enable_offload", False): + self.load_fsdp_param_and_grad(self.device) + self.load_fsdp_optimizer(self.device) + + @torch.no_grad() + def inference_step(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor: + self.model.eval() + input_ids = batch["input_ids"] + attention_mask = batch["attention_mask"] + position_ids = batch["position_ids"] + + multi_modal_inputs = {} + if "multi_modal_inputs" in batch.keys(): + for key in batch["multi_modal_inputs"][0].keys(): + multi_modal_inputs[key] = torch.cat( + [inputs[key] for inputs in batch["multi_modal_inputs"]], + dim=0, + ).npu() + + outputs = self.model( + input_ids=input_ids, + 
attention_mask=attention_mask, + position_ids=position_ids, + use_cache=False, + **multi_modal_inputs, + ) + + logits = outputs.logits + logits = logits[:, -self.response_len - 1 : -1, :] + logits = logits / self.cfg.algorithm.sampling_params.temperature + + responses = input_ids[:, -self.response_len :] + logprobs = compute_logprobs_from_logits( + logits, responses, task_type=self.cfg.runner.task_type + ) + return logprobs + + def run_inference( + self, + input_channel: Channel, + output_channel: Channel, + rollout_channel: Channel, + compute_ref_logprobs: bool, + ) -> None: + """ + Compute prev/ref logprobs using the actor Model's forward. + + Args: + input_channel: The input channel to read from. + output_channel: The output channel to send results to. + rollout_channel: get the rollout channel's device lock in case of collision. + compute_ref_logprobs: Whether to compute reference logprobs. + """ + recv_batch_size = 0 + while recv_batch_size < self.total_batch_size_per_dp: + batch, rollout_result = self.get_batch(input_channel) + recv_batch_size += rollout_result.num_sequence + self._load_weight_and_optimizer( + input_channel if self.is_pipeline else rollout_channel + ) + + with self.worker_timer(): + prev_logprobs = self.inference_step(batch) + rollout_result.prev_logprobs = prev_logprobs.cpu() + + if compute_ref_logprobs: + assert self.ref_policy_state_dict is not None, ( + "Reference policy state dict is None but compute_ref_logprobs is True" + ) + with cpu_weight_swap(self.model, self.ref_policy_state_dict): + ref_logprobs = self.inference_step(batch) + rollout_result.ref_logprobs = ref_logprobs.cpu() + self.put_result(rollout_result, output_channel) + + assert recv_batch_size == self.total_batch_size_per_dp, ( + f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" + ) + + def run_training(self, input_channel: Channel) -> Tuple[Dict, list]: + # Get all batches for this DP + batches = [] + recv_batch_size = 0 + while 
recv_batch_size < self.total_batch_size_per_dp: + batch, rollout_result = self.get_batch(input_channel) + batches.append(batch) + recv_batch_size += rollout_result.num_sequence + assert recv_batch_size == self.total_batch_size_per_dp, ( + f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" + ) + batch = RolloutResult.merge_batches(batches) + # Must be called after batch is retrieved, which is when rollout has stopped + # Otherwise, loading model might cause OOM + self._load_weight_and_optimizer(input_channel) + + global_batches = get_iterator_k_split( + batch, + num_splits=self.cfg.algorithm.n_minibatches, + shuffle=self.cfg.algorithm.get("shuffle_rollout", True), + shuffle_seed=self.cfg.actor.seed, + ) + + self.model.train() + assert ( + self.cfg.actor.global_batch_size + % (self.cfg.actor.micro_batch_size * self._world_size) + == 0 + ) + + training_metrics_list = [] + # Global batch iterations + with self.worker_timer(): + for global_batch in global_batches: + train_global_batch_size = global_batch["input_ids"].shape[0] + + assert train_global_batch_size % self.cfg.actor.micro_batch_size == 0, ( + f"{train_global_batch_size=}, {self.cfg.actor.micro_batch_size=}" + ) + + self.gradient_accumulation = ( + train_global_batch_size // self.cfg.actor.micro_batch_size + ) + # split batch into micro_batches + train_micro_batches = get_iterator_k_split( + global_batch, + train_global_batch_size // self.cfg.actor.micro_batch_size, + ) + + self.optimizer.zero_grad() + metrics = {} + for idx, m_batch in enumerate(train_micro_batches): + backward_ctx = ( + self.model.no_sync() + if idx < self.gradient_accumulation - 1 + else nullcontext() + ) + for k, v in m_batch.items(): + m_batch[k] = v.npu() if isinstance(v, torch.Tensor) else v + + multi_modal_inputs = {} + if "multi_modal_inputs" in m_batch.keys(): + for key in m_batch["multi_modal_inputs"][0].keys(): + multi_modal_inputs[key] = torch.cat( + [ + inputs[key] + for inputs in 
m_batch["multi_modal_inputs"] + ], + dim=0, + ).npu() + + input_ids = m_batch["input_ids"] + attention_mask = m_batch["attention_mask"] + position_ids = m_batch["position_ids"] + prev_logprobs = m_batch["prev_logprobs"] + advantages = m_batch["advantages"] + ref_logprobs = None + if "ref_logprobs" in m_batch: + ref_logprobs = m_batch["ref_logprobs"] + + loss_mask = m_batch["attention_mask"][:, -self.response_len :] + output = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + **multi_modal_inputs, + use_cache=False, + ) + + logits = output.logits + + logits.div_(self.cfg.algorithm.sampling_params.temperature) + + responses = input_ids[:, -self.response_len :] + logits = logits[ + :, -self.response_len - 1 : -1, : + ] # (bsz, response_length, vocab_size) + logprobs = compute_logprobs_from_logits( + logits, responses, task_type=self.cfg.runner.task_type + ) + + clip_ratio = self.cfg.algorithm.ratio_clip_eps + clip_ratio_low = ( + self.cfg.algorithm.clip_ratio_low + if self.cfg.algorithm.clip_ratio_low is not None + else clip_ratio + ) + clip_ratio_high = ( + self.cfg.algorithm.clip_ratio_high + if self.cfg.algorithm.clip_ratio_high is not None + else clip_ratio + ) + clip_ratio_c = self.cfg.algorithm.get("clip_ratio_c", 3.0) + + loss, mbs_metrics_data = actor_loss( + loss_type=self.cfg.algorithm.loss_type, + loss_agg_func=self.loss_agg_func, + logprobs=logprobs, + old_logprobs=prev_logprobs, + advantages=advantages, + clip_ratio_low=clip_ratio_low, + clip_ratio_high=clip_ratio_high, + clip_ratio_c=clip_ratio_c, + loss_mask=loss_mask, + ) + + entropy_loss = torch.tensor(0.0, device=torch.npu.current_device()) + if self.calculate_entropy: + entropy = output["entropy"][ + :, -self.response_len - 1 : -1 + ].contiguous() + entropy_loss = self.loss_agg_func(entropy, mask=loss_mask) + if self.calculate_entropy_loss: + loss = ( + loss - self.cfg.algorithm.entropy_bonus * entropy_loss + ) + + kl_loss = torch.tensor(0.0, 
device=torch.npu.current_device()) + if self.kl_beta > 0 and ref_logprobs is not None: + kld = kl_penalty(ref_logprobs, logprobs, self.kl_penalty_type) + kl_loss = self.loss_agg_func(kld, loss_mask) + loss = loss + kl_loss * self.kl_beta + + # add to log + # scale loss for gradient accumulation and backprop + loss = loss / self.gradient_accumulation + with backward_ctx: + loss.backward() + + mbs_metrics_data.update( + { + "final_loss": loss.detach(), + "entropy_loss": entropy_loss.detach(), + "kl_loss": kl_loss.detach(), + } + ) + + append_to_dict(metrics, mbs_metrics_data) + # apply gradient clipping and optimizer step at the end of a global batch + grad_norm = self.model.clip_grad_norm_( + max_norm=self.cfg.actor.optim.clip_grad + ) + if not torch.isfinite(grad_norm).all(): + self.log_warning( + "grad norm is not finite, skip this optimizer step." + ) + else: + self.optimizer.step() + self.optimizer.zero_grad() + + # aggregate metrics across micro-batches + mean_metric_dict = { + key: torch.mean(torch.stack(value)) + for key, value in metrics.items() + } + mean_metric_dict = all_reduce_dict( + mean_metric_dict, op=torch.distributed.ReduceOp.AVG + ) + # add optimizer stats + if torch.is_tensor(grad_norm): + mean_metric_dict["actor/grad_norm"] = float( + grad_norm.detach().item() + ) + else: + mean_metric_dict["actor/grad_norm"] = float(grad_norm) + lr = self.optimizer.param_groups[0]["lr"] + mean_metric_dict["actor/lr"] = torch.as_tensor(lr).float().cpu() + training_metrics_list.append(mean_metric_dict) + + # Rollout metrics + rollout_metrics, _, _ = compute_math_rollout_metrics( + batch, self.cfg.data.max_prompt_length, self.response_len, self._world_size + ) + + return rollout_metrics, training_metrics_list + + def save_checkpoint(self, save_base_path: str, step: int) -> None: + torch.distributed.barrier() + model_state = self.get_model_state_dict() + optim_state = self.get_optimizer_state_dict() + if self._rank == 0: + os.makedirs(save_base_path, exist_ok=True) 
+ torch.save(model_state, os.path.join(save_base_path, "model.pt")) + torch.save(optim_state, os.path.join(save_base_path, "optim.pt")) + torch.distributed.barrier() + + # Advantages and returns + def compute_advantages_and_returns( + self, input_channel: Channel, output_channel: Channel + ) -> None: + """Compute the advantages and returns. + + Args: + input_channel: The input channel to read from. + output_channel: The output channel to send results to. + """ + recv_batch_size = 0 + while recv_batch_size < self.total_batch_size_per_dp: + batch, rollout_result = self.get_batch(input_channel) + recv_batch_size += rollout_result.num_sequence + + with self.worker_timer(): + if rollout_result.advantages is None: + mask = batch["attention_mask"][:, -self.response_len :] + advantages, returns = calculate_adv_and_returns( + adv_type=self.cfg.algorithm.adv_type, + reward_scores=batch["rewards"].npu(), + mask=mask.npu(), + num_responses=self.cfg.algorithm.group_size, + ) + rollout_result.advantages = advantages.cpu() + + self.put_result(rollout_result, output_channel) + + assert recv_batch_size == self.total_batch_size_per_dp, ( + f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" + ) + + +class EmbodiedFSDPActor(FSDPModelManager, Worker): + def __init__(self, cfg: DictConfig): + Worker.__init__(self) + super().__init__(cfg.actor) + + self.cfg = cfg + torch.npu.set_device(int(os.environ["LOCAL_RANK"])) + self.device = torch.npu.current_device() + world_size = self._world_size + self.device_mesh = init_device_mesh( + "npu", mesh_shape=(world_size,), mesh_dim_names=["fsdp"] + ) + + self._env_group_name = cfg.env.group_name + self._rollout_group_name = cfg.rollout.group_name + self._component_placement = HybridComponentPlacement(cfg, Cluster()) + self._weight_dst_rank_in_rollout = self._rank + if self._weight_dst_rank_in_rollout >= self._component_placement.get_world_size( + "rollout" + ): + self._weight_dst_rank_in_rollout = None + + 
self._obs_queue_name = cfg.env.channel.queue_name + self._action_queue_name = cfg.rollout.channel.queue_name + self._replay_buffer_name = cfg.actor.channel.queue_name + # stage_num: default to 2, use for pipeline rollout process + self.stage_num = cfg.rollout.pipeline_stage_num + + self.channel = self.connect_channel(cfg.actor.channel.name) + self.channel.create_queue( + cfg.actor.channel.queue_name, maxsize=cfg.actor.channel.queue_size + ) + + def init_worker(self): + self.setup_model_and_optimizer() + if self.cfg.actor.get("enable_offload", False): + self.offload_fsdp_param_and_grad() + self.offload_fsdp_optimizer() + torch.npu.synchronize() + gc.collect() + torch.npu.empty_cache() + + def model_provider_func(self): + model = get_model(self.cfg.actor.checkpoint_load_path, self.cfg.actor.model) + if model is not None: + return model + return super().model_provider_func() + + def sync_model_to_rollout(self): + if next(self.model.parameters()).is_cpu: + self.load_fsdp_param_and_grad(self.device) + self.load_fsdp_optimizer(self.device) + + state_dict = self.get_model_state_dict() + if self._weight_dst_rank_in_rollout is not None: + self.send( + state_dict, self._rollout_group_name, self._weight_dst_rank_in_rollout + ) + if self.cfg.actor.get("enable_offload", False): + self.offload_fsdp_param_and_grad() + self.offload_fsdp_optimizer() + torch.npu.synchronize() + del state_dict + gc.collect() + torch.npu.empty_cache() + + async def recv_rollout_batch(self): + send_num = self._component_placement.get_world_size("rollout") * self.stage_num + recv_num = self._component_placement.get_world_size("actor") + split_num = compute_split_num(send_num, recv_num) + + self.rollout_batch = {} + recv_list = [] + for i in range(split_num): + recv_list.append( + await self.channel.get( + queue_name=self._replay_buffer_name, async_op=True + ).async_wait() + ) + + # shape [num_chunk, bsz, chunk_size], cat dim 1 + for key in recv_list[0].keys(): + if "env_info/" not in key: + 
self.rollout_batch[key] = torch.cat( + [recv_list[i][key] for i in range(split_num)], dim=1 + ) + else: + self.rollout_batch[key] = torch.cat( + [recv_list[i][key] for i in range(split_num)], dim=0 + ) + + self.rollout_batch = self._process_received_rollout_batch(self.rollout_batch) + + def _process_received_rollout_batch(self, rollout_batch): + """ + original shape: [rollout_epoch x n_chunk_steps, bsz, num_action_chunks, ...] + target shape: [n_chunk_steps, rollout_epoch x bsz, num_action_chunks, ...] + """ + rollout_epoch = self.cfg.algorithm.rollout_epoch + for key, value in rollout_batch.items(): + new_value = value.reshape( + rollout_epoch, -1, *value.shape[1:] + ) # [rollout_epoch, n_chunk_step, bsz, ...] + new_value = new_value.transpose( + 0, 1 + ) # [n_chunk_step, rollout_epoch, bsz, ...] + new_value = new_value.reshape(new_value.shape[0], -1, *new_value.shape[3:]) + rollout_batch[key] = new_value + + if ( + not self.cfg.env.train.auto_reset + and not self.cfg.env.train.ignore_terminations + ): + dones = rollout_batch[ + "dones" + ] # [n_chunk_step, rollout_epoch x bsz, num_action_chunks] + loss_mask, loss_mask_sum = compute_loss_mask(dones) + + if self.cfg.algorithm.reward_type == "chunk_level": + loss_mask = loss_mask.any(dim=-1, keepdim=True) + loss_mask_sum = loss_mask_sum[..., -1:] + + rollout_batch["loss_mask"] = loss_mask + rollout_batch["loss_mask_sum"] = loss_mask_sum + + # filter data by rewards + if self.cfg.algorithm.get("filter_rewards", False): + rewards = rollout_batch[ + "rewards" + ] # [n_chunk_step, batch, num_action_chunks] + if self.rollout_batch.get("loss_mask", None) is not None: + rewards = rewards * rollout_batch["loss_mask"] + n_chunk_step, batch_size, num_action_chunks = rewards.shape + + group_size = self.cfg.algorithm.group_size + assert batch_size % group_size == 0, ( + f"batch {batch_size} not divisible by group_size {group_size}" + ) + n_prompts = batch_size // group_size + + # calculate rewards by prompt + rewards = 
rewards.transpose( + 0, 1 + ) # [batch, n_chunk_step, num_action_chunks] + rewards = rewards.reshape(rewards.shape[0], -1) # [batch, n_step] + reward_matrix = rewards.reshape( + n_prompts, group_size, rewards.shape[-1] + ) # [n_prompts, group_size, n_step] + reward_matrix = reward_matrix.sum(dim=-1) # [n_prompts, group_size] + mean_reward_in_group = reward_matrix.mean(dim=1) # [n_prompts] + + # mask + reward_filter_mask = ( + mean_reward_in_group >= self.cfg.algorithm.rewards_lower_bound + ) & ( + mean_reward_in_group <= self.cfg.algorithm.rewards_upper_bound + ) # [n_prompts] + + # extend mask dimension + reward_filter_mask = reward_filter_mask.repeat_interleave( + group_size + ) # [batch] + reward_filter_mask = ( + reward_filter_mask.unsqueeze(0).expand(n_chunk_step, -1).unsqueeze(-1) + ) # [n_chunk_step, batch, 1] + + # update loss_mask + if self.rollout_batch.get("loss_mask", None) is not None: + rollout_batch["loss_mask"] = ( + reward_filter_mask & self.rollout_batch["loss_mask"] + ) + else: + rollout_batch["loss_mask"] = reward_filter_mask + + return rollout_batch + + def compute_logprobs(self): + self.model.eval() + self.rollout_batch["logprob"] = self.rollout_batch["prev_logprobs"] + + def compute_advantages_and_returns(self): + stage_num = self.cfg.rollout.pipeline_stage_num + env_world_size = self._component_placement.get_world_size("env") + actor_world_size = self._component_placement.get_world_size("actor") + num_group_envs_for_train = ( + self.cfg.algorithm.num_group_envs + * stage_num + * env_world_size + // actor_world_size + ) + + kwargs = { + "adv_type": self.cfg.algorithm.adv_type, + "rewards": self.rollout_batch["rewards"], + "dones": self.rollout_batch["dones"], + "normalize_advantages": self.cfg.algorithm.get( + "normalize_advantages", True + ), + "values": self.rollout_batch.get("prev_values", None), + "gamma": self.cfg.algorithm.get("gamma", 1), + "gae_lambda": self.cfg.algorithm.get("gae_lambda", 1), + "num_group_envs": 
num_group_envs_for_train, + "group_size": self.cfg.algorithm.get("group_size", 8), + "reward_type": self.cfg.algorithm.reward_type, + "loss_mask": self.rollout_batch.get("loss_mask", None), + "rollout_epoch": self.cfg.algorithm.get("rollout_epoch", 1), + } + kwargs = preprocess_advantages_inputs(**kwargs) + advantages, returns = calculate_adv_and_returns(**kwargs) + + self.rollout_batch.update({"advantages": advantages, "returns": returns}) + rollout_metrics = compute_rollout_metrics(self.rollout_batch) + return rollout_metrics + + def run_training(self): + if self.cfg.actor.get("enable_offload", False): + self.load_fsdp_param_and_grad(self.device) + self.load_fsdp_optimizer(self.device) + + self.model.train() + self.optimizer.zero_grad() + rollout_size = ( + self.rollout_batch["input_ids"].shape[0] + * self.rollout_batch["input_ids"].shape[1] + ) + shuffle_id = torch.randperm(rollout_size) + + for key, value in self.rollout_batch.items(): + self.log_on_first_rank(f"run training, {key}: {value.shape}") + + with torch.no_grad(): + for key, value in self.rollout_batch.items(): + if key in ["dones", "prev_values"]: + value = value[:-1] + if "env_info" in key: + continue + value = value.reshape(rollout_size, *value.shape[2:]) + self.rollout_batch[key] = value[shuffle_id] + + assert ( + self.cfg.actor.global_batch_size + % (self.cfg.actor.micro_batch_size * self._world_size) + == 0 + ) + + self.gradient_accumulation = ( + self.cfg.actor.global_batch_size + // self.cfg.actor.micro_batch_size + // self._world_size + ) + + # Split to make minibatch iterator for updating the actor + # See PPO paper for details. 
https://arxiv.org/abs/1707.06347 + rollout_size = self.rollout_batch["input_ids"].size(0) + batch_size_per_rank = self.cfg.actor.global_batch_size // self._world_size + assert rollout_size % batch_size_per_rank == 0, ( + f"{rollout_size} is not divisible by {batch_size_per_rank}" + ) + rollout_dataloader_iter = get_iterator_k_split( + self.rollout_batch, + rollout_size // batch_size_per_rank, + ) + + metrics = {} + for _, train_global_batch in tqdm( + enumerate(rollout_dataloader_iter), desc="get loss and metrics" + ): + # split batch into micro_batches + train_global_batch_size = train_global_batch["input_ids"].shape[0] + assert ( + train_global_batch_size + == self.cfg.actor.global_batch_size + // torch.distributed.get_world_size() + ) + assert train_global_batch_size % self.cfg.actor.micro_batch_size == 0, ( + f"{train_global_batch_size=}, {self.cfg.actor.micro_batch_size}" + ) + train_micro_batch = get_iterator_k_split( + train_global_batch, + train_global_batch_size // self.cfg.actor.micro_batch_size, + ) + + self.optimizer.zero_grad() + for data_idx, data in enumerate(train_micro_batch): + for k, v in data.items(): + data[k] = v.to(f"cuda:{int(os.environ['LOCAL_RANK'])}") + + data = self.model.preprocess_for_train(data) + input_ids = data["input_ids"] + action_tokens = data["action_tokens"] + attention_mask = data["attention_mask"] + pixel_values = data["pixel_values"] + + action_token_len = self.model.action_dim * self.model.num_action_chunks + + logits_processor_args = { + "action_tokens": action_tokens, + "vocab_size": self.model.vocab_size, + "n_action_bins": self.model.config.n_action_bins, + } + + output_dict = custom_forward( + self.model, + input_ids=input_ids, + attention_mask=attention_mask, + pixel_values=pixel_values, + action_token_len=action_token_len, + value_model=True + if self.cfg.algorithm.adv_type == "embodied_gae" + else False, + value_head_mode=self.cfg.actor.model.get("vh_mode", None), + 
temperature=self.cfg.algorithm.sampling_params.temperature_train, + top_k=self.cfg.algorithm.sampling_params.top_k, + logits_processor_args=logits_processor_args, + ) + + kwargs = { + "loss_type": self.cfg.algorithm.loss_type, + "logprob_type": self.cfg.algorithm.logprob_type, + "entropy_type": self.cfg.algorithm.entropy_type, + "single_action_dim": self.model.action_dim, + "logprobs": output_dict["logprobs"], + "entropy": output_dict["entropy"], + "values": output_dict.get("values", None), + "old_logprobs": data["prev_logprobs"], + "advantages": data["advantages"], + "returns": data["returns"], + "prev_values": data["prev_values"], + "clip_ratio_high": self.cfg.algorithm.clip_ratio_high, + "clip_ratio_low": self.cfg.algorithm.clip_ratio_low, + "value_clip": self.cfg.algorithm.get("value_clip", None), + "huber_delta": self.cfg.algorithm.get("huber_delta", None), + "entropy_bonus": self.cfg.algorithm.entropy_bonus, + "loss_mask": data.get("loss_mask", None), + "loss_mask_sum": data.get("loss_mask_sum", None), + "max_episode_steps": self.cfg.env.train.max_episode_steps, + } + + kwargs = preprocess_loss_inputs(**kwargs) + + loss, metrics_data = actor_loss(**kwargs) + + loss /= self.gradient_accumulation + loss.backward() + + metrics_data["loss"] = loss.detach().item() + append_to_dict(metrics, metrics_data) + + torch.npu.empty_cache() + + grad_norm = self.model.clip_grad_norm_( + max_norm=self.cfg.actor.optim.clip_grad + ) + self.optimizer.step() + + self.optimizer.zero_grad() + data = { + "actor/grad_norm": grad_norm.detach().item(), + "actor/lr": self.optimizer.param_groups[0]["lr"], + } + if self.cfg.algorithm.adv_type == "embodied_gae": + data["critic/lr"] = self.optimizer.param_groups[1]["lr"] + append_to_dict(metrics, data) + + mean_metric_dict = {key: np.mean(value) for key, value in metrics.items()} + mean_metric_dict = all_reduce_dict( + mean_metric_dict, op=torch.distributed.ReduceOp.AVG + ) + + self.optimizer.zero_grad() + torch.npu.synchronize() + 
torch.distributed.barrier() + torch.npu.empty_cache() + + return mean_metric_dict + + def save_checkpoint(self, save_base_path, step): + torch.distributed.barrier() + model_state = self.get_model_state_dict() + optim_state = self.get_optimizer_state_dict() + if self._rank == 0: + os.makedirs(save_base_path, exist_ok=True) + torch.save(model_state, os.path.join(save_base_path, "model.pt")) + torch.save(optim_state, os.path.join(save_base_path, "optim.pt")) + torch.distributed.barrier() diff --git a/rlinf/workers/actor/megatron_actor_worker.py b/rlinf/workers/actor/megatron_actor_worker.py index ae9beada5..40289ddf0 100644 --- a/rlinf/workers/actor/megatron_actor_worker.py +++ b/rlinf/workers/actor/megatron_actor_worker.py @@ -14,7 +14,7 @@ import copy from functools import partial -from typing import Dict, List, Optional, Tuple +from typing import Dict, Optional, Tuple import torch import torch.distributed @@ -52,7 +52,6 @@ ) from rlinf.utils.distributed import ( RolloutDataBalance, - broadcast_tensor_within_mp, broadcast_tensor_within_pp, compute_rollout_metrics, masked_normalization, @@ -79,7 +78,6 @@ seq_mean_token_sum, ) from rlinf.workers.rollout.utils import RankMapper -from toolkits.math_verifier.verify import math_verify_call class MegatronActor(MegatronModelManager, Worker): @@ -102,6 +100,10 @@ def __init__( self.cfg = cfg self.component_placement = placement + # check placement validity when actor backend is megatron + assert placement.rollout_tp_size <= placement.actor_tp_size, ( + f" rollout tensor parallel size {placement.rollout_tp_size} must be less than or equal to actor tensor parallel size {placement.actor_tp_size}." 
+ ) # Data configurations self.response_len = ( role_cfg.model.encoder_seq_length - cfg.data.max_prompt_length @@ -114,10 +116,21 @@ def __init__( self.calculate_entropy_loss = ( self.cfg.algorithm.entropy_bonus > 0 and self.calculate_entropy ) - self.ratio_eps = self.cfg.algorithm.ratio_clip_eps + clip_ratio = self.cfg.algorithm.ratio_clip_eps + self.clip_ratio_low = ( + self.cfg.algorithm.get("clip_ratio_low") + if self.cfg.algorithm.get("clip_ratio_low") is not None + else clip_ratio + ) + self.clip_ratio_high = ( + self.cfg.algorithm.get("clip_ratio_high") + if self.cfg.algorithm.get("clip_ratio_high") is not None + else clip_ratio + ) self.logprob_forward_micro_batch_size = ( self.cfg.algorithm.logprob_forward_micro_batch_size ) + self.kl_beta = self.cfg.algorithm.kl_beta self.kl_penalty_type = self.cfg.algorithm.kl_penalty_type self.clip_ratio_c = self.cfg.algorithm.clip_ratio_c @@ -143,11 +156,6 @@ def __init__( self.ref_policy_state_dict = None self.is_pipeline = self.component_placement.is_disaggregated - # Reward configurations - if not self.cfg.reward.use_reward_model: - assert self.cfg.reward.reward_type == "math", "only support math" - self.reward_fn = math_verify_call - # Rollout configurations self.rollout_group_name = self.cfg.rollout.group_name @@ -381,7 +389,8 @@ def loss_func(output): logprobs=curr_logprobs, old_logprobs=prev_logprobs, advantages=advantages, - eps_clip=self.ratio_eps, + clip_ratio_low=self.clip_ratio_low, + clip_ratio_high=self.clip_ratio_high, loss_mask=mask, ) @@ -394,7 +403,7 @@ def loss_func(output): kl_loss = torch.tensor(0.0, device=torch.cuda.current_device()) if self.kl_beta > 0 and ref_logprobs is not None: - kld = kl_penalty(ref_logprobs, curr_logprobs, self.kl_penalty_type) + kld = kl_penalty(curr_logprobs, ref_logprobs, self.kl_penalty_type) kl_loss = self.loss_agg_func(kld, mask) loss = loss + kl_loss * self.kl_beta @@ -829,23 +838,29 @@ def run_inference( self, input_channel: Channel, output_channel: Channel, + 
rollout_channel: Optional[Channel], compute_ref_logprobs: bool, ): - """Compute prev/ref logprobs using the actor Model's forward. + """ + Compute prev/ref logprobs using the actor Model's forward. Args: input_channel: The input channel to read from. output_channel: The output channel to send results to. + rollout_channel: get the rollout channel's device lock in case of collision. compute_ref_logprobs: Whether to compute reference logprobs. """ recv_batch_size = 0 while recv_batch_size < self.total_batch_size_per_dp: batch, rollout_result = self.get_batch(input_channel) recv_batch_size += rollout_result.num_sequence - # Must be called after batch is retrieved, suggesting that rollout has stopped # Otherwise, loading model might cause OOM in the collocated mode - self._load_weight_and_optimizer(input_channel) + self._load_weight_and_optimizer( + input_channel + if self.is_pipeline or rollout_channel is None + else rollout_channel + ) # Prev logprobs with self.worker_timer(): @@ -858,86 +873,12 @@ def run_inference( with cpu_weight_swap(self.model[0], self.ref_policy_state_dict): ref_logprobs = self.inference_step(batch) rollout_result.ref_logprobs = ref_logprobs.cpu() - self.put_result(rollout_result, output_channel) assert recv_batch_size == self.total_batch_size_per_dp, ( f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" ) - # Rewards - def compute_rewards(self, input_channel: Channel, output_channel: Channel): - """Compute rewards. - - Args: - input_channel: The input channel to read from. - output_channel: The output channel to send results to. 
- """ - if self.is_pipeline: - # In pipeline mode, rewards are computed in the rollout - with self.worker_timer(): - return - recv_batch_size = 0 - while recv_batch_size < self.total_batch_size_per_dp: - batch, rollout_result = self.get_batch(input_channel) - recv_batch_size += rollout_result.num_sequence - - # Compute rule-based reward - with self.worker_timer(): - if rollout_result.rewards is None: - rollout_result.rewards = self._compute_batch_rewards( - batch, rollout_result.answers - ).cpu() - - self.put_result(rollout_result, output_channel) - - assert recv_batch_size == self.total_batch_size_per_dp, ( - f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" - ) - - def _compute_batch_rewards( - self, batch: Dict[str, torch.Tensor], answers: List[str] - ): - """Reward computation using non-model based reward.""" - all_reward_scores = [] - texts = [] - for response, response_len in zip( - batch["input_ids"], - batch["response_lengths"], - ): - response = response[ - self.cfg.data.max_prompt_length : self.cfg.data.max_prompt_length - + response_len - ] - texts.append( - self.tokenizer.decode(response.tolist(), skip_special_tokens=True) - ) - - if torch.distributed.get_rank() == parallel_state.get_model_parallel_src_rank(): - rewards = self.reward_fn(texts, answers) - reward_scores = [ - self.cfg.reward.reward_scale - if reward == 1 - else -self.cfg.reward.reward_scale - for reward in rewards - ] - all_reward_scores.extend(reward_scores) - - if len(all_reward_scores) > 0: - new_all_rewards = [] - - for response in all_reward_scores: - if response is None: - response = 0.0 - new_all_rewards.append(response) - - all_reward_scores = torch.as_tensor( - new_all_rewards, - dtype=torch.float, - device=torch.cuda.current_device(), - ).view(-1, 1) - return broadcast_tensor_within_mp(all_reward_scores).flatten().to("cpu") - # Advantages and returns def compute_advantages_and_returns( self, input_channel: Channel, output_channel: 
Channel @@ -948,16 +889,11 @@ def compute_advantages_and_returns( input_channel: The input channel to read from. output_channel: The output channel to send results to. """ - if self.is_pipeline: - # In pipeline mode, advantages are computed in the rollout - with self.worker_timer(): - return clear_memory() recv_batch_size = 0 while recv_batch_size < self.total_batch_size_per_dp: batch, rollout_result = self.get_batch(input_channel) recv_batch_size += rollout_result.num_sequence - with self.worker_timer(): if rollout_result.advantages is None: mask = batch["attention_mask"][:, -self.response_len :] @@ -1027,7 +963,11 @@ def sync_model_to_rollout(self): def _compute_rollout_metrics(self, batch): rollout_metrics, total_prompt_lengths, total_decode_lengths = ( compute_rollout_metrics( - batch, self.cfg.data.max_prompt_length, self.response_len + batch, + self.cfg.data.max_prompt_length, + self.response_len, + self._world_size, + dp_group=parallel_state.get_data_parallel_group(), ) ) diff --git a/rlinf/workers/env/env_worker.py b/rlinf/workers/env/env_worker.py index 13d24d1e1..06375ed98 100644 --- a/rlinf/workers/env/env_worker.py +++ b/rlinf/workers/env/env_worker.py @@ -96,23 +96,25 @@ def init_worker(self): from rlinf.envs.maniskill.maniskill_env import ManiskillEnv if not only_eval: - for _ in range(self.stage_num): + for stage_id in range(self.stage_num): self.simulator_list.append( EnvManager( self.cfg.env.train, rank=self._rank, - world_size=self._world_size, + seed_offset=self._rank * self.stage_num + stage_id, + total_num_processes=self._world_size, env_cls=ManiskillEnv, enable_offload=enable_offload, ) ) if self.cfg.runner.val_check_interval > 0 or only_eval: - for _ in range(self.stage_num): + for stage_id in range(self.stage_num): self.eval_simulator_list.append( EnvManager( self.cfg.env.eval, rank=self._rank, - world_size=self._world_size, + seed_offset=self._rank * self.stage_num + stage_id, + total_num_processes=self._world_size, env_cls=ManiskillEnv, 
enable_offload=enable_offload, ) @@ -121,23 +123,25 @@ def init_worker(self): from rlinf.envs.libero.libero_env import LiberoEnv if not only_eval: - for _ in range(self.stage_num): + for stage_id in range(self.stage_num): self.simulator_list.append( EnvManager( self.cfg.env.train, rank=self._rank, - world_size=self._world_size, + seed_offset=self._rank * self.stage_num + stage_id, + total_num_processes=self._world_size * self.stage_num, env_cls=LiberoEnv, enable_offload=enable_offload, ) ) if self.cfg.runner.val_check_interval > 0 or only_eval: - for _ in range(self.stage_num): + for stage_id in range(self.stage_num): self.eval_simulator_list.append( EnvManager( self.cfg.env.eval, rank=self._rank, - world_size=self._world_size, + seed_offset=self._rank * self.stage_num + stage_id, + total_num_processes=self._world_size * self.stage_num, env_cls=LiberoEnv, enable_offload=enable_offload, ) @@ -146,24 +150,26 @@ def init_worker(self): from rlinf.envs.robotwin.RoboTwin_env import RoboTwin if not only_eval: - for _ in range(self.stage_num): + for stage_id in range(self.stage_num): self.simulator_list.append( EnvManager( self.cfg.env.train, rank=self._rank, - world_size=self._world_size, + seed_offset=self._rank * self.stage_num + stage_id, + total_num_processes=self._world_size * self.stage_num, env_cls=RoboTwin, enable_offload=enable_offload, ) - # RoboTwin(self.cfg.env.train, rank=self._rank, world_size=self._world_size) + # RoboTwin(self.cfg.env.train, rank=self._rank, total_num_processes=self._world_size) ) if self.cfg.runner.val_check_interval > 0 or only_eval: - for _ in range(self.stage_num): + for stage_id in range(self.stage_num): self.eval_simulator_list.append( EnvManager( self.cfg.env.eval, rank=self._rank, - world_size=self._world_size, + seed_offset=self._rank * self.stage_num + stage_id, + total_num_processes=self._world_size, env_cls=RoboTwin, enable_offload=enable_offload, ) @@ -283,13 +289,13 @@ def finish_rollout(self, mode="train"): if mode == 
"train": if self.cfg.env.train.video_cfg.save_video: for i in range(self.stage_num): - self.simulator_list[i].flush_video(video_sub_dir=f"stage_{i}") + self.simulator_list[i].flush_video() for i in range(self.stage_num): self.simulator_list[i].update_reset_state_ids() elif mode == "eval": if self.cfg.env.eval.video_cfg.save_video: for i in range(self.stage_num): - self.eval_simulator_list[i].flush_video(video_sub_dir=f"stage_{i}") + self.eval_simulator_list[i].flush_video() def split_env_batch(self, env_batch, gather_id, mode): env_batch_i = {} diff --git a/rlinf/workers/reward/reward_worker.py b/rlinf/workers/reward/reward_worker.py new file mode 100644 index 000000000..88be65ddc --- /dev/null +++ b/rlinf/workers/reward/reward_worker.py @@ -0,0 +1,104 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, Tuple + +import torch +from omegaconf import DictConfig + +from rlinf.algorithms.rewards import get_reward_class +from rlinf.data.io_struct import RolloutResult +from rlinf.data.tokenizers import hf_tokenizer +from rlinf.scheduler import Channel, Worker +from rlinf.utils.placement import ModelParallelComponentPlacement + + +class RewardWorker(Worker): + def __init__(self, cfg: DictConfig, placement: ModelParallelComponentPlacement): + Worker.__init__(self) + self.cfg = cfg + self.component_placement = placement + self.tokenizer = hf_tokenizer(cfg.reward.tokenizer.tokenizer_model) + self.total_batch_size_per_dp = ( + self.cfg.data.rollout_batch_size + * self.cfg.algorithm.get("group_size", 1) + // self._world_size + ) + + def init_worker(self): + if self.cfg.reward.use_reward_model: + raise NotImplementedError("Reward model is not implemented yet.") + else: + self.reward = get_reward_class(self.cfg.reward.reward_type)(self.cfg.reward) + + def get_batch( + self, channel: Channel + ) -> Tuple[Dict[str, torch.Tensor], RolloutResult]: + result: RolloutResult = channel.get() + batch = result.to_actor_batch( + self.cfg.data.max_prompt_length, + self.cfg.actor.model.encoder_seq_length, + self.tokenizer.eos_token_id, + ) + return batch, result + + def compute_rewards(self, input_channel: Channel, output_channel: Channel): + """Compute rewards. + + Args: + input_channel: The input channel to read from. + output_channel: The output channel to send results to. 
+ """ + recv_batch_size = 0 + while recv_batch_size < self.total_batch_size_per_dp: + rollout_result: RolloutResult = input_channel.get() + recv_batch_size += rollout_result.num_sequence + with self.worker_timer(): + if rollout_result.rewards is None: + if self.cfg.reward.use_reward_model: + with input_channel.device_lock: + batch = rollout_result.to_actor_batch( + self.cfg.data.max_prompt_length, + self.cfg.actor.model.encoder_seq_length, + self.tokenizer.eos_token_id, + ) + rollout_result.rewards = ( + self.compute_batch_rewards_with_model(batch) + ) + else: + rollout_result.rewards = self._compute_rule_based_rewards( + rollout_result + ) + + output_channel.put(rollout_result) + + assert recv_batch_size == self.total_batch_size_per_dp, ( + f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" + ) + + def _compute_rule_based_rewards(self, rollout_result: RolloutResult): + # Decode only the generated tokens; response_ids are already the post-prompt tokens + texts = self.tokenizer.batch_decode( + rollout_result.response_ids, skip_special_tokens=True + ) + + scores = self.reward.get_reward(texts, rollout_result.answers) + return ( + torch.as_tensor(scores, dtype=torch.float, device=torch.device("cpu")) + .view(-1, 1) + .flatten() + ) + + def compute_batch_rewards_with_model(self, batch: Dict[str, torch.Tensor]): + raise NotImplementedError("Reward model is not implemented yet.") diff --git a/rlinf/workers/rollout/hf/huggingface_worker.py b/rlinf/workers/rollout/hf/huggingface_worker.py index 911cbe54e..c45be2991 100644 --- a/rlinf/workers/rollout/hf/huggingface_worker.py +++ b/rlinf/workers/rollout/hf/huggingface_worker.py @@ -302,11 +302,13 @@ async def evaluate(self): for key, value in env_info_list.items(): eval_info[f"env_info/{key}"].append(value) - env_batch = await self.recv_env_batch() - if "meta" in env_batch: - env_info_list = env_batch["meta"] - for key, value in env_info_list.items(): - 
eval_info[f"env_info/{key}"].append(value) + for i in range(self.stage_num): + env_batch = await self.recv_env_batch() + if "meta" in env_batch: + env_info_list = env_batch["meta"] + for key, value in env_info_list.items(): + eval_info[f"env_info/{key}"].append(value) + eval_metrics = create_rollout_batch(eval_info) if self.cfg.rollout.get("enable_offload", False): self.offload_model() diff --git a/rlinf/workers/rollout/server/online_router_worker.py b/rlinf/workers/rollout/server/online_router_worker.py new file mode 100644 index 000000000..ea2959dc6 --- /dev/null +++ b/rlinf/workers/rollout/server/online_router_worker.py @@ -0,0 +1,250 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import asyncio +import json +import random +import time +import uuid +from typing import Any, Dict, List, Optional + +import uvicorn +from fastapi import FastAPI +from fastapi.responses import StreamingResponse +from omegaconf.dictconfig import DictConfig +from pydantic import BaseModel + +from rlinf.scheduler import Worker +from rlinf.utils.placement import ComponentPlacement +from rlinf.workers.rollout.sglang.sglang_worker import AsyncSGLangWorker + + +class CompleteRequest(BaseModel): + """Complete request model.""" + + prompt: str + model: Optional[str] = None + max_tokens: Optional[int] = 1024 + temperature: Optional[float] = 0.7 + top_p: Optional[float] = 0.9 + top_k: Optional[int] = 50 + repetition_penalty: Optional[float] = 1.0 + stop: Optional[List[str]] = None + stream: Optional[bool] = False + + +class CompleteResponse(BaseModel): + """Complete response model.""" + + id: str + choices: List[Dict[str, Any]] + model: str + created: int + object: str = "text_completion" + + +class OnlineRouterWorker(Worker): + """Online router worker with FastAPI server for handling complete and completionTrack requests.""" + + def __init__(self, cfg: DictConfig, placement: ComponentPlacement): + Worker.__init__(self) + + self._cfg = cfg + + # Configuration + self._server_host = cfg.server.online_router.get("host", "0.0.0.0") + self._server_port = cfg.server.online_router.get("port", 8081) + self._rollout_instance_num = placement.rollout_dp_size + + # Sync weight state management + self._sync_model_lock = asyncio.Lock() + self._sync_model_in_progress = False + self._pending_requests: List[asyncio.Future] = [] + + # Request synchronization state + self._sync_in_progress = False + self._old_requests_complete = asyncio.Event() + self._new_requests_blocked = asyncio.Event() + self._new_requests_blocked.set() # Initially allow new requests + self._blocked_requests: List[asyncio.Future] = [] + + # Request tracking + self._active_requests: Dict[str, asyncio.Future] = {} + + # 
Setup FastAPI routes + self._setup_routes() + self._server_task = None + + def _setup_routes(self): + """Setup FastAPI routes.""" + app = FastAPI(title="OnlineRouterWorker", version="1.0.0") + app.add_api_route("/v1/completions", self._handle_complete, methods=["POST"]) + + # Init the HTTP server + self._server = uvicorn.Server( + uvicorn.Config( + app, host=self._server_host, port=self._server_port, log_level="info" + ) + ) + + def server_start(self): + """Start service.""" + assert self._server_task is None + + # Start server in background task + self._server_task = asyncio.create_task(self._server.serve()) + + self.log_info(f"service started on {self._server_host}:{self._server_port}") + + async def server_stop(self): + """Stop service.""" + assert self._server_task is not None + + # Stop the HTTP server + self._server.should_exit = True + + # Wait the HTTP server to stop + await self._server_task + + self._server_task = None + self.log_info("service stopped") + + async def _handle_complete(self, request: CompleteRequest): + """Handle complete requests with synchronization support.""" + request_id = str(uuid.uuid4()) + start_time = time.time() + + # Check if sync is in progress + if self._sync_in_progress: + # Wait for old requests to complete + await self._old_requests_complete.wait() + # Block new requests during sync + await self._new_requests_blocked.wait() + + # Create future for this request + future = asyncio.Future() + self._active_requests[request_id] = future + + try: + # Forward request to rollout worker + sglang_instance_id = random.randint(0, self._rollout_instance_num - 1) + generate_result = ( + await self.rollout_worker.execute_on(sglang_instance_id) + .agenerate(request.prompt, stop=request.stop) + .async_wait() + ) + generated_text = generate_result[0]["text"] + + if not request.stream: + # Create response + response = CompleteResponse( + id=str(request_id), + choices=[ + { + "text": generated_text, + "index": 0, + "logprobs": None, + 
"finish_reason": generate_result[0]["meta_info"][ + "finish_reason" + ]["type"], + } + ], + created=int(start_time), + model="test-model", + object="text_completion", + ) + else: + + def generate_stream(): + # Send final chunk with finish_reason + final_data = { + "id": request_id, + "object": "text_completion.chunk", + "created": int(start_time), + "model": "test-model", + "choices": [ + { + "text": generated_text, + "index": 0, + "logprobs": None, + "finish_reason": "stop", + } + ], + } + yield f"data: {json.dumps(final_data)}\n\n" + yield "data: [DONE]\n\n" + + response = StreamingResponse( + generate_stream(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no", # Disable nginx buffering + }, + ) + + # Set future result + future.set_result(response) + return response + + finally: + # Clean up + if request_id in self._active_requests: + del self._active_requests[request_id] + + async def init_worker(self, rollout_worker: AsyncSGLangWorker): + """Initialize the worker.""" + self.rollout_worker = rollout_worker + + async def sync_model_start(self): + """Start model synchronization. Block new requests and wait for old ones to complete.""" + async with self._sync_model_lock: + assert not self._sync_in_progress + + self.log_info("Starting model synchronization...") + self._sync_in_progress = True + + # Clear the event to block new requests + self._new_requests_blocked.clear() + + # Wait for all existing requests to complete + if self._active_requests: + self.log_info( + f"Waiting for {len(self._active_requests)} active requests to complete..." 
+ ) + # Wait for all active requests to finish + await asyncio.gather( + *self._active_requests.values(), return_exceptions=True + ) + + # Set event to indicate old requests are complete + self._old_requests_complete.set() + self.log_info("All old requests completed, sync can proceed") + + async def sync_model_end(self): + """End model synchronization. Resume processing of blocked requests.""" + async with self._sync_model_lock: + assert self._sync_in_progress + + self.log_info("Ending model synchronization...") + + # Reset sync state + self._sync_in_progress = False + self._old_requests_complete.clear() + + # Allow new requests to proceed + self._new_requests_blocked.set() + + self.log_info("Model synchronization completed, new requests can proceed") diff --git a/rlinf/workers/rollout/server/server_rollout_worker.py b/rlinf/workers/rollout/server/server_rollout_worker.py new file mode 100644 index 000000000..d95135462 --- /dev/null +++ b/rlinf/workers/rollout/server/server_rollout_worker.py @@ -0,0 +1,377 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import asyncio +import json +import time +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, Optional + +import torch +import uvicorn +from fastapi import FastAPI, Request, Response +from omegaconf import DictConfig +from transformers import AutoTokenizer + +from rlinf.data.io_struct import ( + RolloutResult, +) +from rlinf.scheduler import Channel, Worker + + +class TrainingDataStorage: + """Storage manager for training data received via HTTP API.""" + + def __init__(self, storage_config: Optional[Dict[str, Any]] = None): + """ + Initialize storage manager. + + Args: + storage_config: Configuration dict with options: + - enabled: bool, whether to enable storage (default: True) + - storage_dir: str, directory to store files (default: "./training_data") + - max_files_per_dir: int, max files per directory (default: 1000) + - compress: bool, whether to compress files (default: False) + """ + if storage_config is None: + storage_config = {} + + self.enabled = storage_config.get("enabled", True) + self.storage_dir = Path(storage_config.get("storage_dir", "./training_data")) + self.max_files_per_dir = storage_config.get("max_files_per_dir", 1000) + self.compress = storage_config.get("compress", False) + + # Create storage directory if enabled + if self.enabled: + self.storage_dir.mkdir(parents=True, exist_ok=True) + + # Track current file and entry count + self._current_file_path = None + self._entries_in_current_file = 0 + + def store_training_data(self, training_data: Dict[str, Any]) -> Optional[str]: + """ + Store training data to file. 
+ + Args: + training_data: The training data dictionary to store + + Returns: + Path to the stored file, or None if storage is disabled + """ + if not self.enabled: + return None + + # Add metadata + storage_entry = { + "timestamp": datetime.utcnow().isoformat(), + "stored_at": time.time(), + "data": training_data, + } + + # Get or create file for writing + file_path = self._get_current_file_path() + + # Write data based on format + self._write_jsonl_entry(file_path, storage_entry) + + return str(file_path) + + def _get_current_file_path(self) -> Path: + """Get the current file path for writing, creating new file if needed.""" + # Check if we need a new file + if ( + self._current_file_path is None + or self._entries_in_current_file >= self.max_files_per_dir + ): + # Create new file path + timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")[ + :-3 + ] # microseconds to milliseconds + filename = f"training_data_{timestamp}.jsonl" + if self.compress: + filename += ".gz" + + self._current_file_path = self.storage_dir / filename + self._entries_in_current_file = 0 + + return self._current_file_path + + def _write_jsonl_entry(self, file_path: Path, entry: Dict[str, Any]): + """Write entry to JSONL file (one JSON per line).""" + # JSONL is more efficient for appending + with open(file_path, "a", encoding="utf-8") as f: + json.dump(entry, f, ensure_ascii=False) + f.write("\n") + + self._entries_in_current_file += 1 + + def get_storage_stats(self) -> Dict[str, Any]: + """Get storage statistics.""" + if not self.enabled: + return {"enabled": False} + + stats = { + "enabled": True, + "storage_dir": str(self.storage_dir), + "current_file": str(self._current_file_path) + if self._current_file_path + else None, + "entries_in_current_file": self._entries_in_current_file, + "total_files": 0, + "total_size_bytes": 0, + } + + # Count files and calculate total size + if self.storage_dir.exists(): + for file_path in self.storage_dir.iterdir(): + if file_path.is_file(): + 
stats["total_files"] += 1 + stats["total_size_bytes"] += file_path.stat().st_size + + return stats + + +class ServerRolloutWorker(Worker): + """ + ServerRolloutWorker that supports both HTTP API and Channel interfaces. + It can receive training data from router's feedback_worker via HTTP + and also work with CodingOnlineRLRunner via Channel interface. + + Key features: + - Unified data processing for both HTTP and Channel inputs + - Automatic rollout processing after server startup + - Compatible with CodingOnlineRLRunner interface + """ + + def __init__(self, cfg: DictConfig): + Worker.__init__(self) + + self._cfg = cfg + + # Initialize tokenizer for text processing + self._tokenizer = AutoTokenizer.from_pretrained(self._cfg.rollout.model_dir) + + # Configuration + self._server_host = cfg.server.tracking_rollout.get("host", "0.0.0.0") + self._server_port = cfg.server.tracking_rollout.get("port", 8082) + self._enable_dummy_data = cfg.server.tracking_rollout.get( + "enable_dummy_data", False + ) + + # Unified data source for both HTTP and Channel data + self._data_source = asyncio.Queue() + + # Initialize training data storage + # storage_config = getattr(self._cfg, 'storage', None) + storage_config = None + if storage_config is not None: + storage_config = dict(storage_config) + self._storage = TrainingDataStorage(storage_config) + + # Processing configuration + self._max_new_tokens = getattr( + self._cfg.algorithm.sampling_params, "max_new_tokens", 512 + ) + self._batch_size = cfg.data.rollout_batch_size * cfg.algorithm.group_size + + # Processing control + self._track_data_enable = False + + # Output channel for continuous processing + + # Setup FastAPI routes + self._setup_routes() + self._server_task = None + + def _setup_routes(self): + """Setup FastAPI routes.""" + app = FastAPI(title="OnlineRouterWorker", version="1.0.0") + app.add_route("/api/training/submit", self._handle_track, methods=["POST"]) + + # Init the HTTP server + self._server = uvicorn.Server( 
+ uvicorn.Config( + app, host=self._server_host, port=self._server_port, log_level="info" + ) + ) + + def server_start(self): + """Start service.""" + assert self._server_task is None + + # Start server in background task + self._server_task = asyncio.create_task(self._server.serve()) + + self.log_info(f"service started on {self._server_host}:{self._server_port}") + + async def server_stop(self): + """Stop service.""" + assert self._server_task is not None + + # Stop the HTTP server + self._server.should_exit = True + + # Wait the HTTP server to stop + await self._server_task + + self._server_task = None + self.log_info("service stopped") + + async def _handle_track(self, request: Request): + """Handle training data submission from router's feedback_worker.""" + # Parse incoming training data + training_data = await request.json() + + self.log_debug( + f"Received training data: {training_data.get('metadata', {}).get('request_id', 'unknown')}" + ) + + training_data["received_at"] = time.time() + + if self._track_data_enable: + # Store training data to file (async, non-blocking) + storage_path = self._storage.store_training_data(training_data) + if storage_path: + training_data["storage_path"] = storage_path + self.log_debug(f"Training data stored to: {storage_path}") + + # Put data into unified data source + await self._data_source.put(training_data) + + # Return response to router + response_data = { + "status": "submitted", + "message": "Training data submitted successfully", + "queue_position": self._data_source.qsize(), + } + + return Response( + content=json.dumps(response_data), + media_type="application/json", + ) + + def _convert_training_data_to_rollout_result( + self, training_data: Dict[str, Any] + ) -> RolloutResult: + """Convert training data from HTTP request into RolloutResult format.""" + # Extract text data + input_text = training_data.get("prompt", "") + output_text = training_data.get("completion", "") + reward_score = 
training_data.get("accepted", 0.0) + assert input_text is not None + assert output_text is not None + + # Tokenize texts + input_encoding = self._tokenizer( + input_text, + return_tensors="pt", + truncation=True, + max_length=self._cfg.runner.seq_length - self._max_new_tokens, + ) + input_ids = input_encoding["input_ids"][0].tolist() + + output_encoding = self._tokenizer( + text=output_text, + return_tensors="pt", + truncation=True, + max_length=self._max_new_tokens, + ) + output_ids = output_encoding["input_ids"][0].tolist() + + # Create RolloutResult with the feedback data + group_size = getattr(self._cfg.algorithm, "group_size", 1) + + rollout_result = RolloutResult( + num_sequence=1, + group_size=group_size, + prompt_lengths=[len(input_ids)], + prompt_ids=[input_ids], + response_lengths=[len(output_ids)], + response_ids=[output_ids], + is_end=[True], # Assume the response is complete + rewards=torch.tensor([reward_score], dtype=torch.float32).reshape(-1, 1), + advantages=[0.0], # Will be computed later in the training pipeline + prompt_texts=[input_text], + response_texts=[output_text], + answers=[output_text], + ) + + self.log_debug( + f"Created RolloutResult from HTTP data with reward {reward_score}" + ) + + return rollout_result + + async def _process_unified_data_continuously(self, output_channel: Channel): + """Continuously process data from the unified data source.""" + self.log_info("Starting continuous unified data processing") + + # clear existing data in self._data_source + while not self._data_source.empty(): + self._data_source.get_nowait() + + # start tracking new data + self._track_data_enable = True + if self._enable_dummy_data: + for i in range(self._batch_size): + data = { + "prompt": "Hello, world!", + "completion": "Hello, world!", + "accepted": 1.0, + } + await self._data_source.put(data) + + for i in range(self._batch_size): + # Get data from unified source (either HTTP or Channel) + data = await self._data_source.get() + + # Convert data 
to RolloutResult based on source type + rollout_result = self._convert_training_data_to_rollout_result(data) + + # Send result to output channel if available + await output_channel.put(item=rollout_result, async_op=True).async_wait() + # log the qsize of the output channel + self.log_debug(f"Output channel qsize: {output_channel.qsize()}") + + # Mark task as done + self._data_source.task_done() + self._track_data_enable = False + + self.log_info("Continuous unified data processing stopped") + + async def rollout(self, output_channel: Channel): + """Run HTTP server and start automatic data processing.""" + + # Start automatic processing + await self._process_unified_data_continuously(output_channel) + + self.log_info( + "ServerRolloutWorker is running with HTTP server and auto processing" + ) + + def init_worker(self): + """Initialize the worker (sync version).""" + + self.log_info("ServerRolloutWorker initialized") + + async def shutdown(self): + """Shutdown the server and cleanup resources.""" + self.log_info("Shutting down ServerRolloutWorker") + + while not self._data_source.empty(): + self._data_source.get_nowait() + + self.log_info("ServerRolloutWorker shutdown complete") diff --git a/rlinf/workers/rollout/sglang/__init__.py b/rlinf/workers/rollout/sglang/__init__.py index 0e903531c..cc78162b9 100644 --- a/rlinf/workers/rollout/sglang/__init__.py +++ b/rlinf/workers/rollout/sglang/__init__.py @@ -30,7 +30,7 @@ def get_version(pkg): sglang_version = None if package_version is None: - raise ValueError(f"vllm version {package_version} not supported") + raise ValueError(f"sglang version {package_version} not supported") elif package_version >= parse("0.4.4") and package_version < parse("0.4.6.post2"): sglang_version = package_version from rlinf.hybrid_engines.sglang.sglang_0_4_4 import io_struct @@ -49,6 +49,12 @@ def get_version(pkg): from rlinf.hybrid_engines.sglang.sglang_0_4_9.sgl_engine import ( Engine, ) +elif package_version >= parse("0.5.0") and 
package_version < parse("0.5.3"): + sglang_version = package_version + from rlinf.hybrid_engines.sglang.sglang_0_5_2 import io_struct + from rlinf.hybrid_engines.sglang.sglang_0_5_2.sgl_engine import ( + Engine, + ) else: raise ValueError(f"sglang version {package_version} not supported") diff --git a/rlinf/workers/rollout/sglang/sglang_worker.py b/rlinf/workers/rollout/sglang/sglang_worker.py index 96c723284..e58027ba5 100644 --- a/rlinf/workers/rollout/sglang/sglang_worker.py +++ b/rlinf/workers/rollout/sglang/sglang_worker.py @@ -13,11 +13,11 @@ # limitations under the License. import asyncio +import copy import dataclasses -from typing import Dict, List, Tuple +from typing import Dict, List, Optional, Tuple import numpy as np -import torch from omegaconf import DictConfig from sglang.srt.server_args import ServerArgs from transformers import AutoTokenizer @@ -34,7 +34,6 @@ from rlinf.workers.rollout.utils import ( print_sglang_outputs, ) -from toolkits.math_verifier.verify import MathRewardModel, math_verify_call class SGLangWorker(Worker): @@ -114,6 +113,8 @@ def _init_engine(self): log_level="info", max_running_requests=self._cfg.rollout.max_running_requests, dist_init_addr=f"127.0.0.1:{str(Cluster.find_free_port())}", + device="npu", + watchdog_timeout=3600, ) self.log_on_first_rank(f"{server_args=}") @@ -144,6 +145,8 @@ def _get_sampling_param_from_config(self) -> dict: "repetition_penalty": cfg_sampling_params.repetition_penalty, "max_new_tokens": cfg_sampling_params.max_new_tokens, } + if "stop" in cfg_sampling_params: + sampling_params["stop"] = cfg_sampling_params["stop"] return sampling_params def _stop(self): @@ -166,7 +169,6 @@ def sync_model_from_actor(self): def rollout(self, input_channel: Channel, output_channel: Channel): request: RolloutRequest = input_channel.get() - # Repeat prompts based on the group_size config requests = request.repeat_and_split(self._rollout_batch_size) @@ -176,18 +178,25 @@ def rollout(self, input_channel: Channel, 
output_channel: Channel): for request in requests: # Generate outputs using the SGLang engine. with self.worker_timer(): + self.log_info(f"Generating {len(request.input_ids)} samples...") results = self._engine.generate( input_ids=request.input_ids, + # 0.4.4 has modality bug,can't pass non-None image_data + image_data=request.image_data if any(request.image_data) else None, sampling_params=self._sampling_params, return_logprob=self._return_logprobs, ) + self.log_info(f"Generation for {len(request.input_ids)} samples done.") + # Create RolloutResult from the outputs. rollout_result = RolloutResult.from_sglang_results( results, request.n, request.input_ids, request.answers, + request.image_data, + request.multi_modal_inputs, self._return_logprobs, ) rollout_results.append(rollout_result) @@ -202,8 +211,9 @@ def rollout(self, input_channel: Channel, output_channel: Channel): self._stop() # Release the GPUs once the engine has offloaded output_channel.device_lock.release() - rollout_result = RolloutResult.merge_result_list(rollout_results) - output_channel.put(rollout_result) + rollout_result_list = RolloutResult.split_result_list_by_group(rollout_results) + for rollout_result in rollout_result_list: + output_channel.put(rollout_result) def all_floats_equal(float_list: list[float], epsilon: float = 1e-9) -> bool: @@ -226,7 +236,6 @@ def __init__(self, config: DictConfig, placement: ComponentPlacement): self._rollout_end_event = asyncio.Event() self._sync_weight_end_event = asyncio.Event() - self._reward_model = MathRewardModel(scale=self._cfg.reward.reward_scale) assert self._rollout_batch_size is None, ( "rollout_batch_size_per_gpu is not supported in AsyncSGLangWorker" ) @@ -255,29 +264,6 @@ async def init_worker(self): if self._cfg.rollout.validate_weight: await self._validate_weight_at_first() - async def _compute_reward_and_advantage( - self, engine_results: List[Dict], answer: str - ): - answers = [answer] * len(engine_results) - texts: List[str] = [] - for res 
in engine_results: - if hasattr(res, "text"): - texts.append(res["text"]) - else: - texts.append( - self._tokenizer.decode(res["output_ids"], skip_special_tokens=True) - ) - - results = math_verify_call(texts, answers) - rewards = [(1 if r else -1) * self._reward_model.scale for r in results] - rewards_tensor = torch.tensor(rewards, dtype=torch.float) - - mean = rewards_tensor.mean() - std = rewards_tensor.std() - advantages = (rewards_tensor - mean) / (std + 1e-6) - - return rewards, advantages.tolist() - async def _async_generate( self, raw_id: int, input_ids: List[int], sampling_params: dict ): @@ -316,7 +302,6 @@ async def rollout(self, input_channel: Channel, output_channel: Channel): total_reqs = len(rollout_tasks) required_reqs = total_reqs // self._cfg.algorithm.max_num_gen_batches - droped_reqs = 0 finished_reqs = 0 abort_flag = False @@ -327,32 +312,17 @@ async def rollout(self, input_channel: Channel, output_channel: Channel): if self._completion_info.is_completed(hash_id): results = self._completion_info.get_results(hash_id) - ( - rewards, - advantages, - ) = await self._compute_reward_and_advantage( - results, - self._current_request.answers[raw_id], - ) - if ( - all_floats_equal(rewards) - and self._cfg.algorithm.get("max_num_gen_batches", 1) > 1 - ): - if (total_reqs - droped_reqs) > required_reqs: - droped_reqs += rollout_request.n - continue input_ids = [input_ids] * len(results) + answers = [rollout_request.answers[raw_id]] * len(results) rollout_result = RolloutResult.from_sglang_results( results, rollout_request.n, input_ids, + answers=answers, return_logprobs=self._return_logprobs, ) - rollout_result.rewards = torch.tensor( - rewards, dtype=torch.float32 - ).reshape(-1, 1) - rollout_result.advantages = advantages + return_tasks.append( asyncio.create_task( self._put_result(rollout_result, output_channel) @@ -391,3 +361,16 @@ def shutdown(self): self.log_info(f"Shutting down SGLang worker {self._rank} ...") self._engine.shutdown() 
self.log_info(f"SGLang worker {self._rank} shutdown complete.") + + async def agenerate(self, prompt: str, stop: Optional[List[str]] = None): + sampling_params = self._sampling_params + if stop is not None: + sampling_params = copy.deepcopy(sampling_params) + sampling_params["stop"] = stop + + result = await self._engine.async_generate( + prompt=prompt, + sampling_params=sampling_params, + return_logprob=self._return_logprobs, + ) + return result diff --git a/rlinf/workers/rollout/utils.py b/rlinf/workers/rollout/utils.py index 0e1c534ee..f92e48caa 100644 --- a/rlinf/workers/rollout/utils.py +++ b/rlinf/workers/rollout/utils.py @@ -376,6 +376,12 @@ def get_actor_rank_to_rollout_rank_map( """ Get the global mapping from actor 1D rank to rollout 2D rank as dict. """ + # rank -> (dp, tp) + if actor_tp_size == 1: + return { + rank: (rank // rollout_tp_size, rank % rollout_tp_size) + for rank in range(actor_world_size) + } rank_map = {} for actor_rank in range(actor_world_size): rank_map[actor_rank] = cls._get_actor_rank_to_rollout_rank( @@ -550,12 +556,11 @@ def get_rollout_backend_worker( if rollout_backend == "vllm": from rlinf.workers.rollout.vllm.vllm_worker import VLLMWorker - if placement.placement_mode == PlacementMode.COLLOCATED: + if ( + placement.placement_mode == PlacementMode.COLLOCATED + or placement.placement_mode == PlacementMode.DISAGGREGATED + ): return VLLMWorker - elif placement.placement_mode == PlacementMode.DISAGGREGATED: - raise NotImplementedError( - "vLLM rollout backend does not support the pipeline mode." 
- ) else: raise ValueError(f"Unsupported placement mode: {placement.placement_mode}") elif rollout_backend == "sglang": diff --git a/rlinf/workers/rollout/vllm/__init__.py b/rlinf/workers/rollout/vllm/__init__.py index 1a43de500..7237a1038 100644 --- a/rlinf/workers/rollout/vllm/__init__.py +++ b/rlinf/workers/rollout/vllm/__init__.py @@ -34,8 +34,8 @@ def get_version(pkg): "vllm package is not installed or its version could not be determined." ) elif package_version >= parse("0.8.5") and package_version < parse("0.9.0"): - from rlinf.hybrid_engines.vllm.vllm_0_8_5.vllm_engine import VLLMEngine + from rlinf.hybrid_engines.vllm.vllm_0_8_5.executor import VLLMExecutor else: raise ValueError(f"vllm version {package_version} not supported") -__all__ = ["VLLMEngine"] +__all__ = ["VLLMExecutor"] diff --git a/rlinf/workers/rollout/vllm/vllm_worker.py b/rlinf/workers/rollout/vllm/vllm_worker.py index 6bfb001fb..7324fe841 100644 --- a/rlinf/workers/rollout/vllm/vllm_worker.py +++ b/rlinf/workers/rollout/vllm/vllm_worker.py @@ -12,14 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import asyncio +import io import os -from typing import List +from functools import partial +from typing import AsyncGenerator, List, Optional, Union +import requests from omegaconf import DictConfig +from PIL import Image from transformers import AutoTokenizer from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs +from vllm.inputs.data import PromptType, TextPrompt, TokensPrompt +from vllm.outputs import RequestOutput from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.utils import Counter +from vllm.v1.engine.async_llm import AsyncLLM as AsyncLLMEngine from rlinf.config import torch_dtype_from_precision from rlinf.data.io_struct import RolloutRequest, RolloutResult @@ -27,7 +36,7 @@ from rlinf.utils.placement import ComponentPlacement from rlinf.workers.rollout.utils import print_vllm_outputs -from . import VLLMEngine +from . import VLLMExecutor class VLLMWorker(Worker): @@ -58,6 +67,7 @@ def __init__(self, config: DictConfig, placement: ComponentPlacement): "The capital of France is", "The future of AI is", ] + self.request_counter = Counter() def _prepare_vllm_environment(self) -> None: """ @@ -70,9 +80,15 @@ def _prepare_vllm_environment(self) -> None: ) # use spawn to avoid fork issues with CUDA os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - # set False to use Inproclient, which uses sync calls. - os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" os.environ["VLLM_ATTENTION_BACKEND"] = self._cfg.rollout.vllm.attention_backend + # set True to use AsyncMPClient, which uses async calls. 
+ os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "1" + if self._cfg.rollout.vllm.torch_profiler_dir is not None: + os.environ["VLLM_TORCH_PROFILER_DIR"] = ( + self._cfg.rollout.vllm.torch_profiler_dir + ) + if not os.path.exists(self._cfg.rollout.vllm.torch_profiler_dir): + os.makedirs(self._cfg.rollout.vllm.torch_profiler_dir) def _get_sampling_params_from_config(self) -> SamplingParams: """ @@ -84,6 +100,8 @@ def _get_sampling_params_from_config(self) -> SamplingParams: temperature=0, max_tokens=cfg_sampling_params.max_new_tokens, output_kind=RequestOutputKind.FINAL_ONLY, + n=self._cfg.algorithm.group_size, + logprobs=0 if self._return_logprobs else None, ) else: sampling_params = SamplingParams( @@ -93,40 +111,217 @@ def _get_sampling_params_from_config(self) -> SamplingParams: repetition_penalty=cfg_sampling_params.repetition_penalty, max_tokens=cfg_sampling_params.max_new_tokens, output_kind=RequestOutputKind.FINAL_ONLY, + n=self._cfg.algorithm.group_size, + logprobs=0 if self._return_logprobs else None, ) return sampling_params - def _validate_weight_at_first(self) -> None: + def _process_image_data( + self, image_data: Optional[List[Union[bytes, str]]] + ) -> Optional[List[Image]]: + """ + Process the batch image data which can be bytes or image paths. + + Args: + batch_image_data (Optional[List[List[Union[bytes,str]]]]): A batch of + image data, each item can be bytes or image path (local or URL). + Returns: + Optional[List[List[Image]]]: A batch of list of PIL Image. If input + is None, return None. 
+ """ + if image_data is None: + return None + if not isinstance(image_data, list): + raise ValueError("image_data should be a list of list of image data.") + image_list = [] + for img in image_data: + if isinstance(img, bytes): + image = Image.open(io.BytesIO(img)) + elif isinstance(img, str): + if img.startswith("http://") or img.startswith("https://"): + response = requests.get(img) + image = Image.open(io.BytesIO(response.content)) + else: + image = Image.open(img) + else: + raise ValueError("Unsupported image data type.") + image_list.append(image) + return image_list + + async def _validate_weight_at_first(self) -> None: """ Validate the model weights before starting to rollout formally. """ if self._cfg.rollout.detokenize: - outputs = self._vllm_engine.generate( + vllm_outputs = await self.generate( input_ids=None, sampling_params=self._validate_sampling_params, - return_logprobs=False, prompt_texts=self._validate_prompts, ) else: prompt_ids = self._tokenizer(self._validate_prompts).input_ids - outputs = self._vllm_engine.generate( + vllm_outputs = await self.generate( input_ids=prompt_ids, sampling_params=self._validate_sampling_params, - return_logprobs=False, ) - print_vllm_outputs(outputs=outputs) - print("===============================", flush=True) + for request_output in vllm_outputs: + print_vllm_outputs(request_output, self._tokenizer) + + async def offload_model_weights(self) -> None: + """ + Use async_engine to offload model weights/kv cache. + """ + await self._async_engine.reset_prefix_cache() + await self._async_engine.collective_rpc("offload_model_weights") - def sync_model_from_actor(self) -> None: + async def sync_model_from_actor(self) -> None: """ - Use vllm_engine to sync model weights from the actor. + Sync model weights from actor to the vllm workers. 
""" - self._vllm_engine.sync_hf_weight() + await self._async_engine.collective_rpc("sync_hf_weight") + await self._async_engine.reset_prefix_cache() - def init_worker(self) -> None: + async def _get_output_from_async_generator( + self, async_generator: AsyncGenerator[RequestOutput, None] + ) -> RequestOutput: + """ + Helper function to get the final output from an async generator. """ - Use EngineArgs and VllmConfig to initialize the VLLM engine. - Then offload the model weights, ready to use weights sent from actor. + output: RequestOutput = None + async for out in async_generator: + output = out + assert output is not None, "Async generator returned no output." + return output + + def _pre_process_rollout_request( + self, + request: RolloutRequest, + ) -> List[List[RolloutRequest]]: + if self._rollout_batch_size is not None: + # NOTE: + # it's different from sglang, here a request's sample count + # instead of sample count x group_size should be divisible by rollout_batch_size + assert len(request.input_ids) % self._rollout_batch_size == 0, ( + f"rollout_batch_size {self._rollout_batch_size} must divide the total number of requests {len(request.input_ids)}" + ) + num_batch = len(request.input_ids) // self._rollout_batch_size + else: + num_batch = 1 + + split_requests = request.split(num_batch) + if self._placement.is_disaggregated: + num_prompts_per_request = len(split_requests[0].input_ids) + return [r.split(num_prompts_per_request) for r in split_requests] + else: + return [r.split(1) for r in split_requests] + + async def generate( + self, + input_ids: Union[List[List[int]], List[int]], + sampling_params: SamplingParams, + prompt_texts: Optional[Union[List[str], str]] = None, + image_data: Optional[ + Union[List[List[Union[bytes, str]]], List[Union[bytes, str]]] + ] = None, + ) -> List[RequestOutput]: + """ + Do Generate Task using the vllm async engine. + + Args: + input_ids: The input token ids to generate. 
It can be a list of list of int, + or a list of int (single prompt). + sampling_params: The sampling parameters to use for generation. + prompt_texts: The input prompt texts to generate. It can be a list of strings + or a single string. If provided, it will be used instead of input_ids. + image_data: The input multi-modal data to generate. It can be a list of list + of bytes or image paths (local or URL), or a list of bytes or image paths + (single prompt). + + Returns: + List[RequestOutput]: A list of RequestOutput from vllm engine. + """ + + def check_input_ids() -> List[List[int]]: + assert isinstance(input_ids, list), ( + "input_ids should be a list or list of list of int." + ) + assert len(input_ids) > 0, "input_ids should not be empty." + if isinstance(input_ids[0], int): + return [input_ids] + else: + return input_ids + + def check_prompt_text() -> Optional[List[str]]: + if prompt_texts is None: + return None + assert isinstance(prompt_texts, list) or isinstance(prompt_texts, str), ( + "prompt_text should be a string or list of strings." + ) + if isinstance(prompt_texts, str): + return [prompt_texts] + else: + assert len(prompt_texts) > 0, "prompt_text should not be empty." + return prompt_texts + + def check_image_data() -> Optional[List[List[Image.Image]]]: + if image_data is None or not any(image_data): + return None + assert isinstance(image_data, list), "image_data should be a list." 
+ if isinstance(image_data[0], list): + return image_data + else: + return [image_data] + + input_ids = check_input_ids() + prompt_texts = check_prompt_text() + image_list = check_image_data() + + inputs: List[PromptType] = [] + outputs: List[RequestOutput] = [] + if prompt_texts is not None: + for i, prompt_text in enumerate(prompt_texts): + if image_list is not None: + images = self._process_image_data(image_data=image_list[i]) + inputs.append( + TextPrompt( + prompt=prompt_text, multi_modal_data={"image": images} + ) + ) + else: + inputs.append(TextPrompt(prompt=prompt_text)) + else: + for i, input_id in enumerate(input_ids): + if image_list is not None: + images = self._process_image_data(image_data=image_list[i]) + inputs.append( + TokensPrompt( + prompt_token_ids=input_id, + multi_modal_data={"image": images}, + ) + ) + else: + inputs.append(TokensPrompt(prompt_token_ids=input_id)) + + outputs = await asyncio.gather( + *[ + self._get_output_from_async_generator( + self._async_engine.generate( + prompt=inp, + sampling_params=sampling_params, + request_id=str(next(self.request_counter)), + ) + ) + for inp in inputs + ] + ) + + return outputs + + async def init_worker(self) -> None: + """ + Use EngineArgs and VllmConfig to initialize VLLM async engine. + If mode is collocated, it will additionally offload model weights, + ready to use parameters sent from actor. 
""" engine_args: EngineArgs = EngineArgs( model=self._cfg.rollout.model_dir, @@ -136,28 +331,61 @@ def init_worker(self) -> None: enforce_eager=self._cfg.rollout.enforce_eager, enable_chunked_prefill=self._cfg.rollout.vllm.enable_chunked_prefill, enable_prefix_caching=self._cfg.rollout.vllm.enable_prefix_caching, + max_num_batched_tokens=self._cfg.rollout.vllm.max_num_batched_tokens, task="generate", + load_format="dummy" if not self._cfg.rollout.validate_weight else "auto", trust_remote_code=self._cfg.actor.tokenizer.trust_remote_code, max_model_len=self._cfg.runner.seq_length, max_num_seqs=self._cfg.rollout.max_running_requests, - enable_sleep_mode=True, # it enables offload weights + enable_sleep_mode=False, + device="npu", ) vllm_config: VllmConfig = engine_args.create_engine_config() + + # here to set the customed worker class for VLLM engine + vllm_worker_cls = "rlinf.hybrid_engines.vllm.vllm_0_8_5.worker.VLLMWorker" + vllm_config.parallel_config.worker_cls = vllm_worker_cls + self.log_info(f"vllm_config is {vllm_config}") - self.log_info(f"[LLM dp {self._rank}] start to initialize VLLM engine") - self._vllm_engine = VLLMEngine( + + executor_class = partial( + VLLMExecutor, rlinf_config=self._cfg, - vllm_config=vllm_config, - log_stats=not self._cfg.rollout.disable_log_stats, - multiprocess_model=False, # use Inproclient parent_address=self.worker_address, placement=self._placement, dp_rank=self._rank, ) + + self._async_engine = AsyncLLMEngine( + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=not self._cfg.rollout.disable_log_stats, + log_requests=False, # do not need to log each request + ) + self.log_info(f"[LLM dp {self._rank}] VLLM engine initialized.") - self._vllm_engine.offload_model_weights() - def _stop(self) -> None: + if not self._placement.is_disaggregated: + await self.offload_model_weights() + + async def _put_result(self, result: RolloutResult, output_channel: Channel) -> None: + """ + Helper function to put the result to 
output channel. + + Args: + result: The RolloutResult to put to the channel. + output_channel: The output channel to send results to. + """ + # NOTE: + # To fit reward worker and actor workers' expected input count, + # currently we can only split result into groups. + splited_results = RolloutResult.split_result_list_by_group([result]) + put_tasks = [ + output_channel.put(r, async_op=True).async_wait() for r in splited_results + ] + await asyncio.gather(*put_tasks) + + async def _stop(self) -> None: """ Helper function to stop the VLLM engine and offload model weights. This should only be called when vllm engine has no more requests to process. @@ -165,50 +393,62 @@ def _stop(self) -> None: self.log_debug( f"[LLM dp {self._rank}] Received None input tokens, rollout end." ) - self._vllm_engine.offload_model_weights() + if not self._placement.is_disaggregated: + await self.offload_model_weights() - def rollout(self, input_channel: Channel, output_channel: Channel) -> None: + async def rollout_and_return( + self, request: RolloutRequest, output_channel: Channel + ): """ - The main rollout function to interact with the VLLM engine. - It receives RolloutRequest from input_channel, and sends back RolloutResult - to output_channel. + Helper function to rollout for a single RolloutRequest and build RolloutResult then + put it to output channel. Args: - input_channel (Channel): The channel to receive RolloutRequest. - output_channel (Channel): The channel to send RolloutResult. + request: The RolloutRequest to process. + output_channel: The output channel to send results to. 
""" - request: RolloutRequest = input_channel.get() - - requests: List[RolloutRequest] = request.repeat_and_split( - self._rollout_batch_size + vllm_results: List[RequestOutput] = await self.generate( + input_ids=request.input_ids, + image_data=request.image_data, + sampling_params=self._sampling_params, + ) + rollout_result: RolloutResult = RolloutResult.from_vllm_results( + group_size=self._cfg.algorithm.group_size, + results=vllm_results, + answers=request.answers, + multi_modal_inputs=request.multi_modal_inputs, + return_logprobs=self._return_logprobs, ) + if self._cfg.rollout.print_outputs: + print_vllm_outputs(outputs=vllm_results) + await self._put_result(result=rollout_result, output_channel=output_channel) - # Acquire the GPUs to ensure no one is using them during rollout - output_channel.device_lock.acquire() + async def rollout(self, input_channel: Channel, output_channel: Channel) -> None: + """ + Perform rollout using vllm engine. + It will read `RolloutRequest` from input_channel and put `RolloutResult` to output_channel. + If the input request is None, it will stop the rollout. - rollout_results: List[RolloutResult] = [] - for request in requests: - with self.worker_timer(): - vllm_results = self._vllm_engine.generate( - input_ids=request.input_ids, - sampling_params=self._sampling_params, - return_logprobs=self._return_logprobs, - ) - # should be converted by _vllm_engine side. - results = RolloutResult.from_vllm_results( - group_size=self._cfg.algorithm.group_size, - results=vllm_results, - answers=request.answers, - return_logprobs=self._return_logprobs, - ) - rollout_results.append(results) - if self._cfg.rollout.print_outputs: - print_vllm_outputs(outputs=vllm_results) - - # Stop and offload SGLang first before putting into channel - # This avoids running SGLang and Megatron simultaneously - self._stop() - # Release the GPUs once the engine has offloaded + Args: + input_channel: The input channel to read from. 
+ output_channel: The output channel to send results to. + """ + rollout_request: RolloutRequest = await input_channel.get( + async_op=True + ).async_wait() + output_channel.device_lock.acquire() + batched_requests = self._pre_process_rollout_request(rollout_request) + with self.worker_timer(): + for requests in batched_requests: + rollout_tasks: List[asyncio.Task] = [] + for request in requests: + rollout_tasks.append( + asyncio.create_task( + self.rollout_and_return( + request=request, output_channel=output_channel + ) + ) + ) + await asyncio.gather(*rollout_tasks) + await self._stop() output_channel.device_lock.release() - rollout_result = RolloutResult.merge_result_list(rollout_results) - output_channel.put(rollout_result) diff --git a/test.py b/test.py new file mode 100644 index 000000000..485f51f99 --- /dev/null +++ b/test.py @@ -0,0 +1,3 @@ +from safetensors import safe_open +with safe_open("/home/weight/Qwen2.5-VL-3B-Instruct/model-00001-of-00002.safetensors", framework="pt") as f: + print(f.keys()) diff --git a/tests/e2e_tests/auto_placement/qwen2.5-1.5b-grpo.yaml b/tests/e2e_tests/auto_placement/qwen2.5-1.5b-grpo.yaml index 6555cb9bf..d1c65161b 100644 --- a/tests/e2e_tests/auto_placement/qwen2.5-1.5b-grpo.yaml +++ b/tests/e2e_tests/auto_placement/qwen2.5-1.5b-grpo.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout,reward: all runner: - task_type: math + task_type: reasoning logger: log_path: ${runner.output_dir}/${runner.experiment_name} project_name: rlinf @@ -119,6 +119,7 @@ rollout: data: type: math + dataset_name: boba max_prompt_length: 1024 filter_prompt_by_length: True rollout_batch_size: 128 @@ -256,13 +257,20 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + 
padding_side: ${actor.tokenizer.padding_side} critic: use_critic_model: false + profile_data: actor_cost: 95.7 inference_cost: 30.8 diff --git a/tests/e2e_tests/auto_placement/run_auto_placement.sh b/tests/e2e_tests/auto_placement/run.sh similarity index 100% rename from tests/e2e_tests/auto_placement/run_auto_placement.sh rename to tests/e2e_tests/auto_placement/run.sh diff --git a/tests/e2e_tests/coding_online_rl/qwen2.5-1.5b-ppo.yaml b/tests/e2e_tests/coding_online_rl/qwen2.5-1.5b-ppo.yaml new file mode 100644 index 000000000..f34ee4cae --- /dev/null +++ b/tests/e2e_tests/coding_online_rl/qwen2.5-1.5b-ppo.yaml @@ -0,0 +1,301 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + rollout: 0-3 + inference: 4-5 + actor: 6-7 + reward: 0-3 + +runner: + task_type: coding_online_rl + logger: + log_path: ${runner.output_dir}/${runner.experiment_name} + project_name: rlinf + experiment_name: ${runner.experiment_name} + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 10 + max_steps: -1 + + val_check_interval: 1 + save_interval: 10 + + seq_length: 2560 + + enable_dynamic_batch_size: False + max_tokens_per_mbs: 2560 + + resume_dir: null + experiment_name: online-ppo-1.5b-pipeline + output_dir: ../results + +algorithm: + group_size: 1 + + n_minibatches: 2 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. 
+ + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: True + shuffle_rollout: False + + max_num_gen_batches: 1 + + # PPO loss params (no critic model) + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + + # Control critic usage (similar to AReaL's disable_head) + use_critic: False # Disable critic model + use_value_loss: False # Disable value loss computation + + # PPO parameters for no-critic setup + gamma: 0.99 + gae_lambda: 0.95 + # value_clip and huber_delta not needed without critic + + # Use no-critic GAE advantage computation + adv_type: math_gae_no_critic + normalize_advantages: False + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 0.1 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + stop: [ + "<|endoftext|>", + "<|fim_prefix|>", + "<|fim_middle|>", + "<|fim_suffix|>", + "<|fim_pad|>", + "<|repo_name|>", + "<|file_sep|>", + "<|im_start|>", + "<|im_end|>", + ] + +inference: + model_arch: ${rollout.model_arch} + group_name: "InferenceGroup" + load_from_actor: True + model: + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + sequence_parallel: True + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/Qwen2.5-Coder-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. 
+ distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: True # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: sglang # online_rl now only support sglang + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + # not used, but reserved to pass config validate + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. 
+ +data: + max_prompt_length: 1024 + rollout_batch_size: 16 + seed: 1234 + +actor: + group_name: "ActorGroup" + training_backend: megatron + mcore_gpt: True + spec_name: decoder_gpt + + checkpoint_load_path: null + + offload_optimizer: True + offload_weight: True + offload_grad: True + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: fp16 + add_bias_linear: False + + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + + activation: swiglu + sequence_parallel: True + # recompute_method: block + # recompute_granularity: selective + + recompute_method: block + recompute_granularity: full + recompute_num_layers: 20 + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + + normalization: rmsnorm + + position_embedding_type: rope + + apply_rope_fusion: True + bias_dropout_fusion: False + persist_layer_norm: False + bias_activation_fusion: False + attention_softmax_in_fp32: True + batch_p2p_comm: False + variable_seq_lengths: True + gradient_accumulation_fusion: False + moe_token_dispatcher_type: alltoall + use_cpu_initialization: False + + optim: + optimizer: adam + bf16: True + fp16: False + lr: 2e-06 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-7 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: False + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-6 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: ${rollout.model_dir} + use_fast: False + trust_remote_code: True + padding_side: 'right' + + megatron: + ddp_bucket_size: null + distributed_backend: nccl # Support 'nccl' and 'gloo' + distributed_timeout_minutes: 30 + ckpt_format: torch + use_dist_ckpt: False + tp_comm_bootstrap_backend: nccl 
+ tp_comm_overlap_cfg: null # tp_comm_overlap_cfg.yaml + use_hf_ckpt: True # if true, will transfer hf model to generate megatron checkpoint and use it for training. + use_profiler: False # if true, will enable torch profiler when training, pay attention it has influence on performance + + ckpt_convertor: # config for ckpt convertor + model: Qwen2.5-Coder-1.5B + model_type: null # will be set by hf model's config if null + hf_model_path: ${rollout.model_dir} # path to the hf model + save_path: ${runner.output_dir}/${runner.experiment_name}/converted_ckpts/actor + use_gpu_num : 0 + use_gpu_index: null + process_num: 16 # number of processes to use for checkpointing + tensor_model_parallel_size: ${actor.model.tensor_model_parallel_size} + pipeline_model_parallel_size: ${actor.model.pipeline_model_parallel_size} + + profiler: # profile megatron when inference and traning + output_dir: ${runner.output_dir}/${runner.experiment_name}/profiler + activities: ["cpu", "cuda"] + record_shapes: False + profile_memory: False + with_stack: False + with_flops: False + with_modules: True + export_tensorboard: True + export_chrome_trace: False + chrome_filename_prefix: "chrome_trace" + schedule_warmup: 2 + schedule_active: 1 + schedule_repeat: 1 # inference and training will repeat such times + # schedule_wait: it will be set at runtime + +reward: + use_reward_model: False + reward_type: code + reward_scale: 5.0 + +critic: + use_critic_model: False + +server: + # online serving and user reward track + online_router: + host: 0.0.0.0 + port: 8081 + + tracking_rollout: + host: 0.0.0.0 + port: 8082 + enable_dummy_data: True diff --git a/tests/e2e_tests/coding_online_rl/run.sh b/tests/e2e_tests/coding_online_rl/run.sh new file mode 100644 index 000000000..9bf7b8df7 --- /dev/null +++ b/tests/e2e_tests/coding_online_rl/run.sh @@ -0,0 +1,12 @@ +#! 
/bin/bash +set -x + +tabs 4 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TOKENIZERS_PARALLELISM=false +export RAY_DEDUP_LOGS=0 + +export PYTHONPATH=${REPO_PATH}:$PYTHONPATH + +python ${REPO_PATH}/examples/coding_online_rl/main_coding_online_rl.py --config-path ${REPO_PATH}/tests/e2e_tests/coding_online_rl --config-name qwen2.5-1.5b-ppo + diff --git a/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml b/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml new file mode 100644 index 000000000..2672dcd01 --- /dev/null +++ b/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml @@ -0,0 +1,170 @@ +defaults: + - env/train: libero_130 + - env/eval: libero_130 + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + searchpath: + - file://${oc.env:REPO_PATH}/examples/embodiment/config + +cluster: + num_nodes: 1 + component_placement: + actor,env,rollout: 0-3 + +runner: + task_type: embodied + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1 + max_steps: -1 + + only_eval: False + val_check_interval: -1 + save_interval: -1 + seq_length: 4096 + max_prompt_length: 128 + +algorithm: + auto_reset: False + ignore_terminations: False + use_fixed_reset_state_ids: True + require_values: False + shuffle_samples: True + normalize_advantages: True + kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty + group_size: 8 + n_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} + n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} + num_group_envs: 8 + rollout_epoch: 1 + reward_type: step_level # step_level or chunk_level + logprob_type: token_level + entropy_type: token_level + + adv_type: embodied_grpo + loss_type: embodied_grpo + loss_agg_func: "token-mean" + kl_beta: 0.0 + entropy_bonus: 0 + clip_ratio_high: 0.28 + clip_ratio_low: 0.2 + 
clip_ratio_c: 3.0 + value_clip: 0.2 + huber_delta: 10.0 + + gamma: 0.99 + gae_lambda: 0.95 + + sampling_params: + use_greedy: False + temperature_train: 1.6 + temperature_eval: 1.6 + top_k: -1 + top_p: 1.0 + repetition_penalty: 1.0 + + # length argument for autoregressive sampling + # max length means max amount of tokens to generate + length_params: + max_new_token: null + max_length: 1024 + min_length: 1 + + filter_rewards: True + rewards_lower_bound: 0.5 + rewards_upper_bound: 4.5 + +env: + group_name: "EnvGroup" + channel: + name: "env_buffer_list" + queue_name: "obs_buffer" + queue_size: 0 + enable_offload: False + +rollout: + group_name: "RolloutGroup" + channel: + name: ${env.channel.name} + queue_name: "action_buffer" + queue_size: 0 + mode: "colocate" + generation_backend: "huggingface" + model_dir: "/workspace/dataset/RLinf-OpenVLAOFT-LIBERO-130-Base-Lora" + + enable_offload: True + pipeline_stage_num: 1 + +actor: + group_name: "ActorGroup" + channel: + name: ${env.channel.name} + queue_name: "replay_buffer" + queue_size: 0 + training_backend: "fsdp" + checkpoint_load_path: "/workspace/dataset/RLinf-OpenVLAOFT-LIBERO-130-Base-Lora" + checkpoint_save_path: "/workspace/results" + micro_batch_size: 2 + global_batch_size: 256 + seed: 1234 + enable_offload: True + + model: + model_name: "openvla_oft" + value_type: ${algorithm.reward_type} # 'action' or 'token' + action_dim: 7 + num_action_chunks: 8 + use_proprio: False + unnorm_key: libero_130_no_noops_trajall + center_crop: True + + precision: "bf16" + add_bias_linear: False + add_qkv_bias: True + vocab_size: 32000 + hidden_size: 4096 + policy_setup: "widowx_bridge" + vh_mode: "a0" + image_size: [224, 224] + is_lora: False + lora_rank: 32 + num_images_in_input: 1 + attn_implementation: "flash_attention_2" + low_cpu_mem_usage: True + trust_remote_code: True + gradient_checkpointing: False + + optim: + lr: 2.0e-5 + value_lr: 3.0e-3 + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1.0e-05 + clip_grad: 1.0 + + 
tokenizer: + tokenizer_type: "HuggingFaceTokenizer" + tokenizer_model: "/workspace/dataset/RLinf-OpenVLAOFT-LIBERO-130-Base-Lora" + use_fast: False + trust_remote_code: True + padding_side: "right" + + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + +reward: + use_reward_model: False + +critic: + use_critic_model: False \ No newline at end of file diff --git a/tests/e2e_tests/embodied/libero_goal_grpo_openvlaoft.yaml b/tests/e2e_tests/embodied/libero_goal_grpo_openvlaoft.yaml new file mode 100644 index 000000000..6dc7893d3 --- /dev/null +++ b/tests/e2e_tests/embodied/libero_goal_grpo_openvlaoft.yaml @@ -0,0 +1,169 @@ +defaults: + - env/train: libero_goal + - env/eval: libero_goal + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + searchpath: + - file://${oc.env:REPO_PATH}/examples/embodiment/config + +cluster: + num_nodes: 1 + component_placement: + actor,env,rollout: 0-3 + +runner: + task_type: embodied + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1 + max_steps: -1 + + only_eval: False + val_check_interval: -1 + save_interval: -1 + seq_length: 4096 + max_prompt_length: 128 + +algorithm: + auto_reset: False + ignore_terminations: False + use_fixed_reset_state_ids: True + require_values: False + shuffle_samples: True + normalize_advantages: True + kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty + group_size: 8 + n_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} + n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} + num_group_envs: 8 + rollout_epoch: 1 + reward_type: action_level # action_level or chunk_level + logprob_type: token_level + entropy_type: token_level + + adv_type: embodied_grpo + loss_type: embodied_grpo + loss_agg_func: "token-mean" + 
kl_beta: 0.0 + entropy_bonus: 0 + clip_ratio_high: 0.28 + clip_ratio_low: 0.2 + clip_ratio_c: 3.0 + value_clip: 0.2 + huber_delta: 10.0 + + gamma: 0.99 + gae_lambda: 0.95 + + sampling_params: + use_greedy: False + temperature_train: 1.6 + temperature_eval: 1.6 + top_k: -1 + top_p: 1.0 + repetition_penalty: 1.0 + + # length argument for autoregressive sampling + # max length means max amount of tokens to generate + length_params: + max_new_token: null + max_length: 1024 + min_length: 1 + + filter_rewards: True + rewards_lower_bound: 0.5 + rewards_upper_bound: 4.5 + +env: + group_name: "EnvGroup" + channel: + name: "env_buffer_list" + queue_name: "obs_buffer" + queue_size: 0 + enable_offload: False + +rollout: + group_name: "RolloutGroup" + channel: + name: ${env.channel.name} + queue_name: "action_buffer" + queue_size: 0 + mode: "colocate" + generation_backend: "huggingface" + model_dir: "/workspace/dataset/Openvla-oft-SFT-libero-goal-traj1/" + enable_offload: False + pipeline_stage_num: 1 + +actor: + group_name: "ActorGroup" + channel: + name: ${env.channel.name} + queue_name: "replay_buffer" + queue_size: 0 + training_backend: "fsdp" + checkpoint_load_path: "/workspace/dataset/Openvla-oft-SFT-libero-goal-traj1/" + checkpoint_save_path: "/workspace/results" + micro_batch_size: 2 + global_batch_size: 256 + seed: 1234 + enable_offload: False + + model: + model_name: "openvla_oft" + value_type: ${algorithm.reward_type} # 'action' or 'token' + action_dim: 7 + num_action_chunks: 8 + use_proprio: False + unnorm_key: libero_goal_no_noops + center_crop: True + + precision: "bf16" + add_bias_linear: False + add_qkv_bias: True + vocab_size: 32000 + hidden_size: 4096 + policy_setup: "widowx_bridge" + vh_mode: "a0" + image_size: [224, 224] + is_lora: False + lora_rank: 32 + num_images_in_input: 1 + attn_implementation: "flash_attention_2" + low_cpu_mem_usage: True + trust_remote_code: True + gradient_checkpointing: False + + optim: + lr: 2.0e-5 + value_lr: 3.0e-3 + adam_beta1: 
0.9 + adam_beta2: 0.999 + adam_eps: 1.0e-05 + clip_grad: 1.0 + + tokenizer: + tokenizer_type: "HuggingFaceTokenizer" + tokenizer_model: "/workspace/dataset/Openvla-oft-SFT-libero-goal-traj1/" + use_fast: False + trust_remote_code: True + padding_side: "right" + + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + +reward: + use_reward_model: False + +critic: + use_critic_model: False \ No newline at end of file diff --git a/tests/e2e_tests/embodied/maniskill_grpo_openvlaoft.yaml b/tests/e2e_tests/embodied/maniskill_grpo_openvlaoft.yaml new file mode 100644 index 000000000..f26ce7c3d --- /dev/null +++ b/tests/e2e_tests/embodied/maniskill_grpo_openvlaoft.yaml @@ -0,0 +1,168 @@ +defaults: + - env/train: PutOnPlateInScene25Main + - env/eval: maniskill_ood_template + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + searchpath: + - file://${oc.env:REPO_PATH}/examples/embodiment/config + +cluster: + num_nodes: 1 + component_placement: + actor,env,rollout: 0-3 + +runner: + task_type: embodied + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 2 + max_steps: -1 + + val_check_interval: -1 + save_interval: -1 + seq_length: 1024 + max_prompt_length: 30 + +algorithm: + auto_reset: False + ignore_terminations: False + use_fixed_reset_state_ids: True + require_values: False + normalize_advantages: True + kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty + group_size: 2 + + n_chunk_steps: 10 + n_eval_chunk_steps: 10 + num_group_envs: 8 + rollout_epoch: 1 + reward_type: action_level # action_level or chunk_level + logprob_type: token_level + entropy_type: token_level + + adv_type: embodied_grpo + loss_type: embodied_grpo + loss_agg_func: "token-mean" + kl_beta: 0.0 + entropy_bonus: 0 + clip_ratio_high: 0.28 + clip_ratio_low: 0.2 + clip_ratio_c: 3.0 + 
value_clip: 0.2 + huber_delta: 10.0 + + gamma: 0.99 + gae_lambda: 0.95 + + # params for rollout + sampling_params: + use_greedy: False + temperature_train: 1.0 + temperature_eval: 0.6 + top_k: 0 + top_p: 1.0 + repetition_penalty: 1.0 + + # length argument for autoregressive sampling + # max length means max amount of tokens to generate + length_params: + max_new_token: null + max_length: 1024 + min_length: 1 + +env: + group_name: "EnvGroup" + channel: + name: "env_buffer_list" + queue_name: "obs_buffer" + queue_size: 0 + enable_offload: False + +rollout: + group_name: "RolloutGroup" + channel: + name: ${env.channel.name} + queue_name: "action_buffer" + queue_size: 0 + mode: "colocate" + generation_backend: "huggingface" + model_dir: "/workspace/dataset/Openvla-oft-SFT-libero-goal-traj1/" + enable_offload: True + pipeline_stage_num: 1 + +actor: + group_name: "ActorGroup" + channel: + name: ${env.channel.name} + queue_name: "replay_buffer" + queue_size: 0 + training_backend: "fsdp" + checkpoint_load_path: "/workspace/dataset/Openvla-oft-SFT-libero-goal-traj1/" + checkpoint_save_path: "/workspace/results" + + micro_batch_size: 10 + global_batch_size: 40 + seed: 1234 + enable_offload: True + + model: + model_name: "openvla_oft" + value_type: ${algorithm.reward_type} # 'action' or 'token' + action_dim: 7 + num_action_chunks: 8 + use_proprio: False + unnorm_key: bridge_orig + center_crop: True + + precision: "bf16" + add_bias_linear: False + add_qkv_bias: True + vocab_size: 32000 + hidden_size: 4096 + policy_setup: "widowx_bridge" + vh_mode: "a0" + image_size: [224, 224] + is_lora: True + lora_rank: 32 + lora_path: /workspace/dataset/RLinf-OpenVLAOFT-LIBERO-130-Base-Lora/lora_adapter + ckpt_path: null + num_images_in_input: 1 + attn_implementation: "flash_attention_2" + low_cpu_mem_usage: True + trust_remote_code: True + gradient_checkpointing: False + + tokenizer: + tokenizer_type: "HuggingFaceTokenizer" + tokenizer_model: 
"/workspace/dataset/Openvla-oft-SFT-libero-goal-traj1/" + use_fast: False + trust_remote_code: True + padding_side: "right" + + optim: + lr: 1.0e-4 + value_lr: 3.0e-3 + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1.0e-05 + clip_grad: 10.0 + + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + +reward: + use_reward_model: False + +critic: + use_critic_model: False \ No newline at end of file diff --git a/tests/e2e_tests/embodied/ppo_openvla.yaml b/tests/e2e_tests/embodied/maniskill_ppo_openvla.yaml similarity index 91% rename from tests/e2e_tests/embodied/ppo_openvla.yaml rename to tests/e2e_tests/embodied/maniskill_ppo_openvla.yaml index a5c3c9487..5d9258743 100644 --- a/tests/e2e_tests/embodied/ppo_openvla.yaml +++ b/tests/e2e_tests/embodied/maniskill_ppo_openvla.yaml @@ -24,12 +24,12 @@ runner: experiment_name: "ci-test" logger_backends: ["tensorboard"] # wandb, swanlab - max_epochs: 3 + max_epochs: 2 max_steps: -1 val_check_interval: -1 save_interval: -1 - seq_length: 4096 + seq_length: 1024 max_prompt_length: 30 algorithm: @@ -43,16 +43,9 @@ algorithm: n_chunk_steps: 10 n_eval_chunk_steps: 10 - # training rollout mbs - rollout_micro_batch_size: 64 num_group_envs: 8 rollout_epoch: 1 - # mbs to do log prob inference, can be set to - # lower than rollout_micro_batch_size to reduce - # memory usage - logprob_forward_micro_batch_size: 16 # ${.rollout_micro_batch_size} - adv_type: embodied_gae loss_type: embodied_ppo reward_type: chunk_level @@ -122,8 +115,8 @@ actor: checkpoint_load_path: "/workspace/dataset/openvla-7b" checkpoint_save_path: "/workspace/results" - micro_batch_size: 20 - global_batch_size: 80 + micro_batch_size: 10 + global_batch_size: 40 seed: 1234 enable_offload: True @@ -172,6 +165,12 @@ actor: adam_eps: 1.0e-05 clip_grad: 1.0 + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + reward: use_reward_model: False diff --git 
a/tests/e2e_tests/embodied/run_openvla.sh b/tests/e2e_tests/embodied/run.sh similarity index 65% rename from tests/e2e_tests/embodied/run_openvla.sh rename to tests/e2e_tests/embodied/run.sh index 34a9b7ac9..09d44d7cc 100644 --- a/tests/e2e_tests/embodied/run_openvla.sh +++ b/tests/e2e_tests/embodied/run.sh @@ -3,7 +3,11 @@ set -x tabs 4 +CONFIG=$1 +BACKEND=${2:-"egl"} + +export MUJOCO_GL=${BACKEND} +export PYOPENGL_PLATFORM=${BACKEND} export PYTHONPATH=${REPO_PATH}:$PYTHONPATH -unset HOME # GitHub action sets HOME to a wrong path (/github/home), breaking simulator -python ${REPO_PATH}/examples/embodiment/train_embodied_agent.py --config-path ${REPO_PATH}/tests/e2e_tests/embodied --config-name ppo_openvla \ No newline at end of file +python ${REPO_PATH}/examples/embodiment/train_embodied_agent.py --config-path ${REPO_PATH}/tests/e2e_tests/embodied --config-name ${CONFIG} \ No newline at end of file diff --git a/tests/e2e_tests/math/sglang/run_collocated.sh b/tests/e2e_tests/math/sglang/run_collocated.sh deleted file mode 100644 index 1610f8fac..000000000 --- a/tests/e2e_tests/math/sglang/run_collocated.sh +++ /dev/null @@ -1,17 +0,0 @@ -#! /bin/bash -set -x - -tabs 4 -export VLLM_ATTENTION_BACKEND=XFORMERS -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export TOKENIZERS_PARALLELISM=false - -export PYTHONPATH=${REPO_PATH}:$PYTHONPATH - -if [ -z "$1" ]; then - CONFIG_NAME="qwen2.5-1.5b-grpo-collocated" -else - CONFIG_NAME=$1 -fi - -python ${REPO_PATH}/examples/math/main_math.py --config-path $REPO_PATH/tests/e2e_tests/math/sglang --config-name $CONFIG_NAME \ No newline at end of file diff --git a/tests/e2e_tests/math/sglang/run_pipeline.sh b/tests/e2e_tests/math/sglang/run_pipeline.sh deleted file mode 100644 index 85e2e5c2d..000000000 --- a/tests/e2e_tests/math/sglang/run_pipeline.sh +++ /dev/null @@ -1,17 +0,0 @@ -#! 
/bin/bash -set -x - -tabs 4 -export VLLM_ATTENTION_BACKEND=XFORMERS -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export TOKENIZERS_PARALLELISM=false - -export PYTHONPATH=${REPO_PATH}:$PYTHONPATH - -if [ -z "$1" ]; then - CONFIG_NAME="qwen2.5-1.5b-grpo-pipeline" -else - CONFIG_NAME=$1 -fi - -python ${REPO_PATH}/examples/math/main_math.py --config-path $REPO_PATH/tests/e2e_tests/math/sglang --config-name $CONFIG_NAME \ No newline at end of file diff --git a/tests/e2e_tests/math/vllm/run_collocated.sh b/tests/e2e_tests/math/vllm/run_collocated.sh deleted file mode 100644 index b4e924b1d..000000000 --- a/tests/e2e_tests/math/vllm/run_collocated.sh +++ /dev/null @@ -1,17 +0,0 @@ -#! /bin/bash -set -x - -tabs 4 -export VLLM_ATTENTION_BACKEND=XFORMERS -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export TOKENIZERS_PARALLELISM=false - -export PYTHONPATH=${REPO_PATH}:$PYTHONPATH - -if [ -z "$1" ]; then - CONFIG_NAME="qwen2.5-1.5b-grpo-collocated" -else - CONFIG_NAME=$1 -fi - -python ${REPO_PATH}/examples/math/main_math.py --config-path $REPO_PATH/tests/e2e_tests/math/vllm --config-name $CONFIG_NAME \ No newline at end of file diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs.yaml new file mode 100644 index 000000000..86c6f362c --- /dev/null +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs.yaml @@ -0,0 +1,219 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . 
+ output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1 + max_steps: 3 + + val_check_interval: 1 + save_interval: -1 + + seq_length: 1024 + + enable_dynamic_batch_size: True + max_tokens_per_mbs: 1024 + + resume_dir: null + experiment_name: grpo-1.5b + output_dir: /workspace/results +algorithm: + group_size: 2 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. + + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: False + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, rollout engine 
will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: sglang # [sglang, vllm] + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. 
+ +data: + type: math + dataset_name: boba + max_prompt_length: 256 + filter_prompt_by_length: True + rollout_batch_size: 16 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + shuffle: True + validation_shuffle: True + seed: 1 + train_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + val_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + +actor: + group_name: "ActorGroup" + training_backend: fsdp + mcore_gpt: True + spec_name: decoder_gpt + + enable_offload: True + checkpoint_load_path: null + + global_batch_size: 8 + micro_batch_size: 1 + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: bf16 + sharding_strategy: full_shard + is_lora: False + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + model_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + + optim: + optimizer: adam + bf16: True + fp16: False + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + use_fast: False + trust_remote_code: True + padding_side: 'right' + + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + +reward: + group_name: "RewardGroup" + use_reward_model: false + reward_type: 'math' + reward_scale: 5.0 + reward_weights: + qa_accuracy: 1.0 + think_format: 0.0 + answer_format: 0.0 + + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + 
trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + +critic: + use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml new file mode 100644 index 000000000..dbf7925f2 --- /dev/null +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml @@ -0,0 +1,219 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1 + max_steps: 3 + + val_check_interval: 1 + save_interval: -1 + + seq_length: 1024 + + enable_dynamic_batch_size: True + max_tokens_per_mbs: 1024 + + resume_dir: null + experiment_name: grpo-1.5b + output_dir: /workspace/results +algorithm: + group_size: 2 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. 
+ + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: True + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: sglang # [sglang, vllm] + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. 
If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. 
+ +data: + type: math + dataset_name: boba + max_prompt_length: 256 + filter_prompt_by_length: True + rollout_batch_size: 16 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + shuffle: True + validation_shuffle: True + seed: 1 + train_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + val_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + +actor: + group_name: "ActorGroup" + training_backend: fsdp + mcore_gpt: True + spec_name: decoder_gpt + + enable_offload: True + checkpoint_load_path: null + + global_batch_size: 8 + micro_batch_size: 1 + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: bf16 + sharding_strategy: full_shard + is_lora: False + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + model_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + + optim: + optimizer: adam + bf16: True + fp16: False + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + use_fast: False + trust_remote_code: True + padding_side: 'right' + + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + +reward: + group_name: "RewardGroup" + use_reward_model: false + reward_type: 'math' + reward_scale: 5.0 + reward_weights: + qa_accuracy: 1.0 + think_format: 0.0 + answer_format: 0.0 + + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + 
trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + +critic: + use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs.yaml new file mode 100644 index 000000000..e85fd4146 --- /dev/null +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs.yaml @@ -0,0 +1,219 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1 + max_steps: 3 + + val_check_interval: 1 + save_interval: -1 + + seq_length: 1024 + + enable_dynamic_batch_size: True + max_tokens_per_mbs: 1024 + + resume_dir: null + experiment_name: grpo-1.5b + output_dir: /workspace/results +algorithm: + group_size: 2 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. 
+ + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: False + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: vllm # [sglang, vllm] + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. 
If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. 
+ +data: + type: math + dataset_name: boba + max_prompt_length: 256 + filter_prompt_by_length: True + rollout_batch_size: 16 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + shuffle: True + validation_shuffle: True + seed: 1 + train_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + val_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + +actor: + group_name: "ActorGroup" + training_backend: fsdp + mcore_gpt: True + spec_name: decoder_gpt + + enable_offload: True + checkpoint_load_path: null + + global_batch_size: 8 + micro_batch_size: 1 + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: bf16 + sharding_strategy: full_shard + is_lora: False + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + model_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + + optim: + optimizer: adam + bf16: True + fp16: False + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + use_fast: False + trust_remote_code: True + padding_side: 'right' + + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + +reward: + group_name: "RewardGroup" + use_reward_model: false + reward_type: 'math' + reward_scale: 5.0 + reward_weights: + qa_accuracy: 1.0 + think_format: 0.0 + answer_format: 0.0 + + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + 
trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + +critic: + use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml new file mode 100644 index 000000000..1aab4cb7b --- /dev/null +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml @@ -0,0 +1,219 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1 + max_steps: 3 + + val_check_interval: 1 + save_interval: -1 + + seq_length: 1024 + + enable_dynamic_batch_size: True + max_tokens_per_mbs: 1024 + + resume_dir: null + experiment_name: grpo-1.5b + output_dir: /workspace/results +algorithm: + group_size: 2 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. 
+ + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: True + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: vllm # [sglang, vllm] + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. 
If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. 
+ +data: + type: math + dataset_name: boba + max_prompt_length: 256 + filter_prompt_by_length: True + rollout_batch_size: 16 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + shuffle: True + validation_shuffle: True + seed: 1 + train_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + val_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + +actor: + group_name: "ActorGroup" + training_backend: fsdp + mcore_gpt: True + spec_name: decoder_gpt + + enable_offload: True + checkpoint_load_path: null + + global_batch_size: 8 + micro_batch_size: 1 + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: bf16 + sharding_strategy: full_shard + is_lora: False + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + model_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + + optim: + optimizer: adam + bf16: True + fp16: False + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + use_fast: False + trust_remote_code: True + padding_side: 'right' + + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + +reward: + group_name: "RewardGroup" + use_reward_model: false + reward_type: 'math' + reward_scale: 5.0 + reward_weights: + qa_accuracy: 1.0 + think_format: 0.0 + answer_format: 0.0 + + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + 
trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + +critic: + use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-sgl-rollout-logprobs.yaml similarity index 92% rename from tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml rename to tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-sgl-rollout-logprobs.yaml index 33835a003..4edc14979 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-sgl-rollout-logprobs.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout,reward: all runner: - task_type: math + task_type: reasoning logger: log_path: /workspace/results/ project_name: rlinf @@ -20,7 +20,7 @@ runner: logger_backends: ["tensorboard"] # wandb, swanlab max_epochs: 1 - max_steps: 3 + max_steps: 2 val_check_interval: 1 save_interval: -1 @@ -49,7 +49,7 @@ algorithm: # val rollout mbs val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} - recompute_logprobs: True + recompute_logprobs: False shuffle_rollout: False # GRPO loss params @@ -104,7 +104,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. - + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. return_logprobs: ${not:${algorithm.recompute_logprobs}} @@ -115,15 +116,12 @@ rollout: validate_save_dir: null # the directory to save the weights for comparison. 
If validate_weight is True, this will be used to save the weights for comparison. print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. - sglang_decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. max_running_requests: 64 # the maximum number of running requests in the rollout engine. cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. - use_torch_compile: False # enable torch_compile in SGLang for rollout. - torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. - data: type: math + dataset_name: boba max_prompt_length: 256 filter_prompt_by_length: True rollout_batch_size: 8 @@ -148,7 +146,7 @@ actor: offload_weight: True offload_grad: True - enable_dp_load_balance: True + enable_dp_load_balance: False calculate_flops: True @@ -262,9 +260,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-sgl.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-sgl.yaml new file mode 100644 index 000000000..1854fd8c3 --- /dev/null +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-sgl.yaml @@ -0,0 +1,275 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . 
+ output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1 + max_steps: 2 + + val_check_interval: 1 + save_interval: -1 + + seq_length: 1024 + + enable_dynamic_batch_size: True + max_tokens_per_mbs: 1024 + + resume_dir: null + experiment_name: grpo-1.5b + output_dir: /workspace/results + +algorithm: + group_size: 2 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. + + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: True + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more 
time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: sglang # [sglang, vllm] + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in the rollout engine. 
+ cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. + +data: + type: math + dataset_name: boba + max_prompt_length: 256 + filter_prompt_by_length: True + rollout_batch_size: 8 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + shuffle: True + validation_shuffle: True + seed: 1 + train_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + val_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + +actor: + group_name: "ActorGroup" + training_backend: megatron + mcore_gpt: True + spec_name: decoder_gpt + + checkpoint_load_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B-tp2-pp1 + + offload_optimizer: True + offload_weight: True + offload_grad: True + + enable_dp_load_balance: True + + calculate_flops: True + + seed: 1234 + + model: + precision: fp16 + add_bias_linear: False + + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + context_parallel_size: 2 + + activation: swiglu + sequence_parallel: True + # recompute_method: block + # recompute_granularity: selective + + recompute_method: block + recompute_granularity: full + recompute_num_layers: 20 + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + + normalization: rmsnorm + + position_embedding_type: rope + + apply_rope_fusion: True + bias_dropout_fusion: False + persist_layer_norm: False + bias_activation_fusion: False + attention_softmax_in_fp32: True + batch_p2p_comm: False + variable_seq_lengths: True + gradient_accumulation_fusion: False + moe_token_dispatcher_type: alltoall + use_cpu_initialization: False + + optim: + optimizer: adam + bf16: False + fp16: True + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: True + overlap_param_gather: True + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: 
False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + use_fast: False + trust_remote_code: True + padding_side: 'right' + + megatron: + ddp_bucket_size: null + distributed_backend: nccl # Support 'nccl' and 'gloo' + distributed_timeout_minutes: 30 + ckpt_format: torch + use_dist_ckpt: False + tp_comm_bootstrap_backend: nccl + tp_comm_overlap_cfg: null # tp_comm_overlap_cfg.yaml + use_hf_ckpt: False # if true, will transfer hf model to generate megatron checkpoint and use it for training. + use_profiler: False # if true, will enable torch profiler when training, pay attention it has influence on performance + + ckpt_convertor: # config for ckpt convertor + model: DeepSeek-R1-Distill-Qwen-1.5B + model_type: null # will be set by hf model's config if null + hf_model_path: ${rollout.model_dir} # path to the hf model + save_path: ${runner.output_dir}/${runner.experiment_name}/converted_ckpts/actor + use_gpu_num : 0 + use_gpu_index: null + process_num: 16 # number of processes to use for checkpointing + tensor_model_parallel_size: ${actor.model.tensor_model_parallel_size} + pipeline_model_parallel_size: ${actor.model.pipeline_model_parallel_size} + + profiler: # profile megatron when inference and traning + output_dir: ${runner.output_dir}/${runner.experiment_name}/profiler + activities: ["cpu", "cuda"] + record_shapes: False + profile_memory: False + with_stack: False + with_flops: False + with_modules: True + export_tensorboard: True + export_chrome_trace: False + chrome_filename_prefix: "chrome_trace" + schedule_warmup: 2 + schedule_active: 1 + schedule_repeat: 1 # inference and training will repeat such times + # schedule_wait: it will be set at runtime + + +reward: + group_name: "RewardGroup" + use_reward_model: false + reward_type: 
'math' + reward_scale: 5.0 + + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + +critic: + use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-vllm-rollout-logprobs.yaml similarity index 90% rename from tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml rename to tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-vllm-rollout-logprobs.yaml index 07f30cc8c..edeaee9c3 100644 --- a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-vllm-rollout-logprobs.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout,reward: all runner: - task_type: math + task_type: reasoning logger: log_path: /workspace/results/ project_name: rlinf @@ -20,7 +20,7 @@ runner: logger_backends: ["tensorboard"] # wandb, swanlab max_epochs: 1 - max_steps: 3 + max_steps: 2 val_check_interval: 1 save_interval: -1 @@ -104,6 +104,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. 
return_logprobs: ${not:${algorithm.recompute_logprobs}} @@ -119,6 +121,7 @@ rollout: data: type: math + dataset_name: boba max_prompt_length: 256 filter_prompt_by_length: True rollout_batch_size: 8 @@ -137,7 +140,7 @@ actor: mcore_gpt: True spec_name: decoder_gpt - checkpoint_load_path: null + checkpoint_load_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B-tp2-pp1 offload_optimizer: True offload_weight: True @@ -225,7 +228,7 @@ actor: use_dist_ckpt: False tp_comm_bootstrap_backend: nccl tp_comm_overlap_cfg: null # tp_comm_overlap_cfg.yaml - use_hf_ckpt: True # if true, will transfer hf model to generate megatron checkpoint and use it for training. + use_hf_ckpt: False # if true, will transfer hf model to generate megatron checkpoint and use it for training. use_profiler: False # if true, will enable torch profiler when training, pay attention it has influence on performance ckpt_convertor: # config for ckpt convertor @@ -257,9 +260,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-vllm.yaml similarity index 90% rename from tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml rename to tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-vllm.yaml index 03e024c2e..09df84ca8 100644 --- a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-vllm.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout,reward: all runner: - task_type: math + task_type: reasoning 
logger: log_path: /workspace/results/ project_name: rlinf @@ -20,7 +20,7 @@ runner: logger_backends: ["tensorboard"] # wandb, swanlab max_epochs: 1 - max_steps: 3 + max_steps: 2 val_check_interval: 1 save_interval: -1 @@ -104,6 +104,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. return_logprobs: ${not:${algorithm.recompute_logprobs}} @@ -120,6 +122,7 @@ rollout: data: type: math + dataset_name: boba max_prompt_length: 256 filter_prompt_by_length: True rollout_batch_size: 8 @@ -138,7 +141,7 @@ actor: mcore_gpt: True spec_name: decoder_gpt - checkpoint_load_path: null + checkpoint_load_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B-tp2-pp1 offload_optimizer: True offload_weight: True @@ -226,7 +229,7 @@ actor: use_dist_ckpt: False tp_comm_bootstrap_backend: nccl tp_comm_overlap_cfg: null # tp_comm_overlap_cfg.yaml - use_hf_ckpt: True # if true, will transfer hf model to generate megatron checkpoint and use it for training. + use_hf_ckpt: False # if true, will transfer hf model to generate megatron checkpoint and use it for training. 
use_profiler: False # if true, will enable torch profiler when training, pay attention it has influence on performance ckpt_convertor: # config for ckpt convertor @@ -258,9 +261,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-sgl-rollout-logprobs.yaml similarity index 92% rename from tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml rename to tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-sgl-rollout-logprobs.yaml index 8054bc911..34bcff492 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-sgl-rollout-logprobs.yaml @@ -11,9 +11,10 @@ cluster: component_placement: rollout: 0-3 actor: 4-7 + reward: 0-3 runner: - task_type: math + task_type: reasoning logger: log_path: /workspace/results/ project_name: rlinf @@ -21,7 +22,7 @@ runner: logger_backends: ["tensorboard"] # wandb, swanlab max_epochs: 1 - max_steps: 3 + max_steps: 2 val_check_interval: 1 save_interval: -1 @@ -107,6 +108,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. 
+ torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. return_logprobs: ${not:${algorithm.recompute_logprobs}} @@ -122,6 +125,7 @@ rollout: data: type: math + dataset_name: boba max_prompt_length: 256 rollout_batch_size: 8 val_rollout_batch_size: null @@ -139,7 +143,7 @@ actor: mcore_gpt: True spec_name: decoder_gpt - checkpoint_load_path: null + checkpoint_load_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B-tp2-pp1 offload_optimizer: True offload_weight: True @@ -255,11 +259,17 @@ actor: schedule_repeat: 1 # inference and training will repeat such times # schedule_wait: it will be set at runtime - reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-sgl.yaml similarity index 92% rename from tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml rename to tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-sgl.yaml index 03d24a2ac..b48eb6057 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-sgl.yaml @@ -12,9 +12,10 @@ cluster: rollout: 0-3 inference: 4-5 actor: 6-7 + reward: 0-3 runner: - task_type: math + task_type: reasoning logger: log_path: /workspace/results/ project_name: rlinf @@ -22,7 +23,7 @@ runner: logger_backends: ["tensorboard"] # wandb, swanlab max_epochs: 1 - max_steps: 3 + max_steps: 2 val_check_interval: 1 save_interval: -1 @@ -117,6 +118,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. 
enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. return_logprobs: ${not:${algorithm.recompute_logprobs}} @@ -136,6 +139,7 @@ rollout: data: type: math + dataset_name: boba max_prompt_length: 256 rollout_batch_size: 8 val_rollout_batch_size: null @@ -153,7 +157,7 @@ actor: mcore_gpt: True spec_name: decoder_gpt - checkpoint_load_path: null + checkpoint_load_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B-tp2-pp1 offload_optimizer: True offload_weight: True @@ -271,9 +275,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-vllm-rollout-logprobs.yaml similarity index 88% rename from tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml rename to tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-vllm-rollout-logprobs.yaml index 72744a4ed..111969ff4 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-vllm-rollout-logprobs.yaml @@ -8,11 +8,14 @@ hydra: cluster: num_nodes: 1 + num_gpus_per_node: 8 component_placement: - actor,rollout: all + rollout: 0-3 + actor: 4-7 + reward: 0-3 runner: - task_type: 
math + task_type: reasoning logger: log_path: /workspace/results/ project_name: rlinf @@ -20,14 +23,14 @@ runner: logger_backends: ["tensorboard"] # wandb, swanlab max_epochs: 1 - max_steps: 3 + max_steps: 2 val_check_interval: 1 save_interval: -1 seq_length: 1024 - enable_dynamic_batch_size: True + enable_dynamic_batch_size: False max_tokens_per_mbs: 1024 resume_dir: null @@ -52,6 +55,8 @@ algorithm: recompute_logprobs: False shuffle_rollout: False + max_num_gen_batches: 1 + # GRPO loss params loss_type: math_ppo_actor loss_agg_func: "token-mean" @@ -59,13 +64,13 @@ algorithm: kl_penalty_type: low_var_kl ratio_clip_eps: 0.2 entropy_bonus: 0.0 - calculate_entropy: False + calculate_entropy: True clip_ratio_c: null # 3.0 adv_type: math_grpo - normalize_advantages: True + normalize_advantages: False early_stop_imp_ratio: 5.0 - use_valid_token_scale: False + use_valid_token_scale: True # params for rollout sampling_params: @@ -91,7 +96,7 @@ rollout: padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for vllm rollout eos: null # will be tokenizer.eos_token_id if null. - rollout_backend: sglang # [sglang, vllm] + rollout_backend: vllm # [sglang, vllm] sglang: attention_backend: triton # [flashinfer, triton] for more, see sglang's doc @@ -104,6 +109,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. 
return_logprobs: ${not:${algorithm.recompute_logprobs}} @@ -120,7 +127,6 @@ rollout: data: type: math max_prompt_length: 256 - filter_prompt_by_length: True rollout_batch_size: 8 val_rollout_batch_size: null num_workers: 2 @@ -137,7 +143,7 @@ actor: mcore_gpt: True spec_name: decoder_gpt - checkpoint_load_path: null + checkpoint_load_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B-tp2-pp1 offload_optimizer: True offload_weight: True @@ -155,7 +161,6 @@ actor: tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 - context_parallel_size: 2 activation: swiglu sequence_parallel: True @@ -201,7 +206,6 @@ actor: overlap_param_gather_with_optimizer_step: False clip_grad: 0.8 loss_scale: 65536 - lr_sched: lr_warmup_fraction: 0.01 lr_warmup_init: 0.0 @@ -225,9 +229,9 @@ actor: use_dist_ckpt: False tp_comm_bootstrap_backend: nccl tp_comm_overlap_cfg: null # tp_comm_overlap_cfg.yaml - use_hf_ckpt: True # if true, will transfer hf model to generate megatron checkpoint and use it for training. + use_hf_ckpt: False # if true, will transfer hf model to generate megatron checkpoint and use it for training. 
use_profiler: False # if true, will enable torch profiler when training, pay attention it has influence on performance - + ckpt_convertor: # config for ckpt convertor model: DeepSeek-R1-Distill-Qwen-1.5B model_type: null # will be set by hf model's config if null @@ -257,9 +261,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-vllm.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-vllm.yaml new file mode 100644 index 000000000..705757c31 --- /dev/null +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-vllm.yaml @@ -0,0 +1,290 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + num_gpus_per_node: 8 + component_placement: + rollout: 0-3 + inference: 4-5 + actor: 6-7 + reward: 0-3 + +runner: + task_type: reasoning + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1 + max_steps: 2 + + val_check_interval: 1 + save_interval: -1 + + seq_length: 1024 + + enable_dynamic_batch_size: False + max_tokens_per_mbs: 1024 + + resume_dir: null + experiment_name: grpo-1.5b + output_dir: /workspace/results + +algorithm: + group_size: 2 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. 
+ + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: True + shuffle_rollout: False + + max_num_gen_batches: 1 + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: True + clip_ratio_c: null # 3.0 + + adv_type: math_grpo + normalize_advantages: False + early_stop_imp_ratio: 5.0 + use_valid_token_scale: True + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +inference: + model_arch: ${rollout.model_arch} + group_name: "InferenceGroup" + load_from_actor: True + model: + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + sequence_parallel: True + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. 
+ + rollout_backend: vllm # [sglang, vllm] + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + sglang_decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. + + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. 
If the batch size is larger than this, torch compile will not be used. + +data: + type: math + max_prompt_length: 256 + rollout_batch_size: 8 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + shuffle: True + validation_shuffle: True + seed: 1 + train_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + val_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + +actor: + group_name: "ActorGroup" + training_backend: megatron + mcore_gpt: True + spec_name: decoder_gpt + + checkpoint_load_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B-tp2-pp1 + + offload_optimizer: True + offload_weight: True + offload_grad: True + + enable_dp_load_balance: False + + calculate_flops: True + + seed: 1234 + + model: + precision: fp16 + add_bias_linear: False + + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + + activation: swiglu + sequence_parallel: True + # recompute_method: block + # recompute_granularity: selective + + recompute_method: block + recompute_granularity: full + recompute_num_layers: 20 + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + + normalization: rmsnorm + + position_embedding_type: rope + + apply_rope_fusion: True + bias_dropout_fusion: False + persist_layer_norm: False + bias_activation_fusion: False + attention_softmax_in_fp32: True + batch_p2p_comm: False + variable_seq_lengths: True + gradient_accumulation_fusion: False + moe_token_dispatcher_type: alltoall + use_cpu_initialization: False + + optim: + optimizer: adam + bf16: False + fp16: True + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: True + overlap_param_gather: True + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + 
min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + use_fast: False + trust_remote_code: True + padding_side: 'right' + + megatron: + ddp_bucket_size: null + distributed_backend: nccl # Support 'nccl' and 'gloo' + distributed_timeout_minutes: 30 + ckpt_format: torch + use_dist_ckpt: False + tp_comm_bootstrap_backend: nccl + tp_comm_overlap_cfg: null # tp_comm_overlap_cfg.yaml + use_hf_ckpt: False # if true, will transfer hf model to generate megatron checkpoint and use it for training. + use_profiler: False # if true, will enable torch profiler when training, pay attention it has influence on performance + + ckpt_convertor: # config for ckpt convertor + model: DeepSeek-R1-Distill-Qwen-1.5B + model_type: null # will be set by hf model's config if null + hf_model_path: ${rollout.model_dir} # path to the hf model + save_path: ${runner.output_dir}/${runner.experiment_name}/converted_ckpts/actor + use_gpu_num : 0 + use_gpu_index: null + process_num: 16 # number of processes to use for checkpointing + tensor_model_parallel_size: ${actor.model.tensor_model_parallel_size} + pipeline_model_parallel_size: ${actor.model.pipeline_model_parallel_size} + + profiler: # profile megatron when inference and traning + output_dir: ${runner.output_dir}/${runner.experiment_name}/profiler + activities: ["cpu", "cuda"] + record_shapes: False + profile_memory: False + with_stack: False + with_flops: False + with_modules: True + export_tensorboard: True + export_chrome_trace: False + chrome_filename_prefix: "chrome_trace" + schedule_warmup: 2 + schedule_active: 1 + schedule_repeat: 1 # inference and training will repeat such times + # schedule_wait: it will be set at runtime + + +reward: + group_name: "RewardGroup" + use_reward_model: false + reward_type: 'math' + reward_scale: 5.0 + + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + 
trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + +critic: + use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml new file mode 100644 index 000000000..ddc087f99 --- /dev/null +++ b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml @@ -0,0 +1,228 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: ${runner.output_dir}/${runner.experiment_name} + project_name: rlinf + experiment_name: ${runner.experiment_name} + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1 + max_steps: 2 + + val_check_interval: 1 + save_interval: -1 + + seq_length: 2048 + + enable_dynamic_batch_size: False + max_tokens_per_mbs: 28672 + + resume_dir: null + experiment_name: grpo-qwen2.5-vl-3b + output_dir: /workspace/results + +algorithm: + group_size: 8 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. 
+ + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: False + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/Qwen2.5-VL-3B-Instruct + model_arch: qwen2.5_vl #qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: sglang # here choose which backend to rollout,support [sglang, vllm] + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. 
+ torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. 
+ +data: + type: vision_language + dataset_name: robo2vlm + max_prompt_length: 1024 + filter_prompt_by_length: True + rollout_batch_size: 16 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + image_keys: ["image"] # some vlm datasets may have multiple image columns + choice_key: "choices" + answer_key: "answer" + solution_key: "solution" + use_chat_template: True + lazy_loading: True + shuffle: True + validation_shuffle: True + seed: 1234 + train_data_paths: ["/workspace/dataset/robo2vlm-1/data/train-00000-of-00262.parquet"] + val_data_paths: ["/workspace/dataset/robo2vlm-1/data/test-00000-of-00003.parquet"] + +actor: + group_name: "ActorGroup" + training_backend: fsdp + mcore_gpt: True + spec_name: decoder_gpt + + enable_offload: True + checkpoint_load_path: null + + global_batch_size: 8 + micro_batch_size: 1 + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: bf16 + sharding_strategy: full_shard + is_lora: False + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + model_path: /workspace/dataset/Qwen2.5-VL-3B-Instruct + + model_arch: ${rollout.model_arch} + + optim: + optimizer: adam + bf16: True #False + fp16: False #True + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /workspace/dataset/Qwen2.5-VL-3B-Instruct + use_fast: False + trust_remote_code: True + padding_side: 'right' + + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + +reward: + group_name: 
"RewardGroup" + use_reward_model: false + reward_type: 'vqa' + reward_scale: 1.0 + reward_weights: + qa_accuracy: 1.0 + think_format: 0.0 + answer_format: 0.0 + + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + +critic: + use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml new file mode 100644 index 000000000..86c64ebe0 --- /dev/null +++ b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml @@ -0,0 +1,228 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: ${runner.output_dir}/${runner.experiment_name} + project_name: rlinf + experiment_name: ${runner.experiment_name} + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1 + max_steps: 2 + + val_check_interval: 1 + save_interval: -1 + + seq_length: 2048 + + enable_dynamic_batch_size: False + max_tokens_per_mbs: 28672 + + resume_dir: null + experiment_name: grpo-qwen2.5-vl-3b + output_dir: /workspace/results + +algorithm: + group_size: 8 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. 
+ + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: False + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/Qwen2.5-VL-3B-Instruct + model_arch: qwen2.5_vl #qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: vllm # here choose which backend to rollout,support [sglang, vllm] + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. 
+ torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. 
+ +data: + type: vision_language + dataset_name: robo2vlm + max_prompt_length: 1024 + filter_prompt_by_length: True + rollout_batch_size: 16 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + image_keys: ["image"] # some vlm datasets may have multiple image columns + choice_key: "choices" + answer_key: "answer" + solution_key: "solution" + use_chat_template: True + lazy_loading: True + shuffle: True + validation_shuffle: True + seed: 1234 + train_data_paths: ["/workspace/dataset/robo2vlm-1/data/train-00000-of-00262.parquet"] + val_data_paths: ["/workspace/dataset/robo2vlm-1/data/test-00000-of-00003.parquet"] + +actor: + group_name: "ActorGroup" + training_backend: fsdp + mcore_gpt: True + spec_name: decoder_gpt + + enable_offload: True + checkpoint_load_path: null + + global_batch_size: 8 + micro_batch_size: 1 + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: bf16 + sharding_strategy: full_shard + is_lora: False + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + model_path: /workspace/dataset/Qwen2.5-VL-3B-Instruct + + model_arch: ${rollout.model_arch} + + optim: + optimizer: adam + bf16: True #False + fp16: False #True + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /workspace/dataset/Qwen2.5-VL-3B-Instruct + use_fast: False + trust_remote_code: True + padding_side: 'right' + + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + +reward: + group_name: 
"RewardGroup" + use_reward_model: false + reward_type: 'vqa' + reward_scale: 1.0 + reward_weights: + qa_accuracy: 1.0 + think_format: 0.0 + answer_format: 0.0 + + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + +critic: + use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/reasoning/run.sh b/tests/e2e_tests/reasoning/run.sh new file mode 100644 index 000000000..92e43866f --- /dev/null +++ b/tests/e2e_tests/reasoning/run.sh @@ -0,0 +1,16 @@ +#! /bin/bash +set -x + +tabs 4 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TOKENIZERS_PARALLELISM=false + +export PYTHONPATH=${REPO_PATH}:$PYTHONPATH + +if [ -z "$1" ]; then + echo "Please provide a config name as the first argument." + exit 1 +else + CONFIG_NAME=$1 +fi +python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/reasoning/ --config-name $CONFIG_NAME \ No newline at end of file diff --git a/tests/unit_tests/test_auto_placement.py b/tests/unit_tests/test_auto_placement.py index a70b01c9d..559229763 100644 --- a/tests/unit_tests/test_auto_placement.py +++ b/tests/unit_tests/test_auto_placement.py @@ -598,7 +598,7 @@ def test_scheduler_task_initialization(self, mock_validate): """Test SchedulerTask initialization.""" # Create a mock config mock_cfg = MagicMock() - mock_cfg.runner.task_type = "math" + mock_cfg.runner.task_type = "reasoning" mock_cfg.actor.model.tensor_model_parallel_size = 2 mock_cfg.actor.model.pipeline_model_parallel_size = 1 mock_cfg.rollout.tensor_parallel_size = 1 @@ -620,7 +620,7 @@ def test_scheduler_task_initialization(self, mock_validate): scheduler_task = SchedulerTask(mock_cfg, mock_cluster) - assert scheduler_task.is_math is True + assert scheduler_task.is_reasoning is True assert scheduler_task.total_gpus == 8 assert scheduler_task.group_size == 4 assert "actor" 
in scheduler_task.components_config diff --git a/tests/unit_tests/test_placement.py b/tests/unit_tests/test_placement.py index c16ff7fb9..22c75ab68 100644 --- a/tests/unit_tests/test_placement.py +++ b/tests/unit_tests/test_placement.py @@ -1087,34 +1087,6 @@ def test_model_parallel_component_placement_init_missing_rollout_gpus(self): cluster = mock_cluster(num_nodes=1, num_accelerators_per_node=4) ModelParallelComponentPlacement(config, cluster) - def test_model_parallel_component_placement_init_collocated_mode_invalid_tp_sizes( - self, - ): - """Test ModelParallelComponentPlacement raises error when actor TP size < rollout TP size in collocated mode.""" - config = DictConfig( - { - "cluster": { - "num_nodes": 1, - "component_placement": {"actor,rollout": "0-3"}, - }, - "actor": { - "model": { - "tensor_model_parallel_size": 2, - "context_parallel_size": 1, - "pipeline_model_parallel_size": 1, - } - }, - "rollout": {"tensor_parallel_size": 4, "pipeline_parallel_size": 1}, - } - ) - - with pytest.raises( - AssertionError, - match="Actor TP size 2 must be greater or equal to Rollout TP size 4", - ): - cluster = mock_cluster(num_nodes=1, num_accelerators_per_node=4) - ModelParallelComponentPlacement(config, cluster) - def test_model_parallel_component_placement_init_collocated_mode_with_inference_gpus( self, ): diff --git a/toolkits/auto_placement/scheduler_task.py b/toolkits/auto_placement/scheduler_task.py index 00d8ea8aa..b3be46012 100644 --- a/toolkits/auto_placement/scheduler_task.py +++ b/toolkits/auto_placement/scheduler_task.py @@ -31,8 +31,10 @@ def __init__( workflow_graph: Optional[Dict[ComponentNode, List[ComponentNode]]] = None, ): self.cfg = cfg - self.is_math = cfg.runner.task_type == "math" - assert self.is_math, "Only math task is supported" + self.is_reasoning = cfg.runner.task_type == "reasoning" + assert self.is_reasoning, ( + f"Only reasoning task is supported, current task type: {cfg.runner.task_type}" + ) self.components_config = { "actor": { 
@@ -71,7 +73,7 @@ def __init__( self.global_step_batch_size = self.rollout_batch_size * self.group_size if workflow_graph is None: - if self.is_math: + if self.is_reasoning: actor = ComponentNode("actor") inference = ComponentNode("inference") rollout = ComponentNode("rollout") @@ -179,7 +181,7 @@ def parse_partition_allocation_to_cfg( def time_division_multiplexing(self) -> List[Dict[str, Workflow]]: partitions: List[Dict[str, Workflow]] = get_workflow_partition(self.workflow) - if self.is_math: + if self.is_reasoning: valid_partitions = [ i for i in partitions if len(i) in [1, len(self.components_config)] ] diff --git a/toolkits/ckpt_convertor/convert_hf_to_middle_file.py b/toolkits/ckpt_convertor/convert_hf_to_middle_file.py index 92983ffbb..40e3d615a 100644 --- a/toolkits/ckpt_convertor/convert_hf_to_middle_file.py +++ b/toolkits/ckpt_convertor/convert_hf_to_middle_file.py @@ -116,27 +116,15 @@ def convert_layer( should_load_prefix.update( (k for k in hfst_loader.keys() if k.startswith("model.embed_tokens.")) ) - if not convert_config.tie_word_embeddings: - model_strategy_map.update( - { - "embedding.word_embeddings.weight": ( - "copy", - linear_trans, - "model.embed_tokens.weight", - ), - } - ) - else: - model_strategy_map.update( - { - "embedding.word_embeddings.weight": ( - "copy_equal", - linear_trans, - "model.embed_tokens.weight", - "lm_head.weight", - ), - } - ) + model_strategy_map.update( + { + "embedding.word_embeddings.weight": ( + "copy", + linear_trans, + "model.embed_tokens.weight", + ), + } + ) elif layer_idx == num_layers + 1: should_load_prefix.update( ( diff --git a/toolkits/ckpt_convertor/convert_middle_file_to_hf.py b/toolkits/ckpt_convertor/convert_middle_file_to_hf.py index 01208f18a..c48246a5f 100644 --- a/toolkits/ckpt_convertor/convert_middle_file_to_hf.py +++ b/toolkits/ckpt_convertor/convert_middle_file_to_hf.py @@ -293,7 +293,6 @@ def convert_layer(args, mfst_loader: STLoaderLazy, saver: HFSTSaver, layer_idx): layernorm_trans, 
"decoder.final_layernorm.weight", ), - "lm_head.weight": ("copy", linear_trans, "output_layer.weight"), } ) if not args.tie_word_embeddings: @@ -302,16 +301,6 @@ def convert_layer(args, mfst_loader: STLoaderLazy, saver: HFSTSaver, layer_idx): "lm_head.weight": ("copy", linear_trans, "output_layer.weight"), } ) - else: - model_strategy_map.update( - { - "lm_head.weight": ( - "copy", - linear_trans, - "embedding.word_embeddings.weight", - ), - } - ) else: should_load_prefix.update( ( diff --git a/toolkits/ckpt_convertor/default_args.yaml b/toolkits/ckpt_convertor/default_args.yaml index 4065992e2..1c89ee0e4 100644 --- a/toolkits/ckpt_convertor/default_args.yaml +++ b/toolkits/ckpt_convertor/default_args.yaml @@ -65,6 +65,16 @@ explict_model: head_dim: 128 num_layers: 28 tie_word_embeddings: true + 'Qwen2.5-Coder-1.5B': + model_type: qwen_2 + num_attention_heads: 12 + num_query_groups: 2 + head_dim: 128 + num_layers: 28 + te_ln_linear_qkv: True + te_ln_linear_mlp_fc1: True + te_ln_add_extra_state: True # pay attention if precision is fp8 mixture + tie_word_embeddings: true model_type: deepseek: diff --git a/toolkits/code_verifier/__init__.py b/toolkits/code_verifier/__init__.py new file mode 100644 index 000000000..5b365ea1e --- /dev/null +++ b/toolkits/code_verifier/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/toolkits/code_verifier/verify.py b/toolkits/code_verifier/verify.py new file mode 100644 index 000000000..5017b9e54 --- /dev/null +++ b/toolkits/code_verifier/verify.py @@ -0,0 +1,40 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +try: + from fuzzywuzzy import fuzz + + FUZZY_AVAILABLE = True +except ImportError: + fuzz = None + FUZZY_AVAILABLE = False + + +def fim_verify_call( + responses: List[str], + references: List[str], +) -> List: + assert FUZZY_AVAILABLE, "fuzzywuzzy is not installed" + assert len(responses) == len(references), ( + len(responses), + len(references), + ) + + rewards = [] + for resp, ref in zip(responses, references): + fuzzy_sim = fuzz.ratio(resp.strip(), ref.strip()) / 100 + rewards.append(fuzzy_sim) + return rewards diff --git a/toolkits/math_verifier/verify.py b/toolkits/math_verifier/verify.py index 89cd9ef1e..8d0cbdb11 100644 --- a/toolkits/math_verifier/verify.py +++ b/toolkits/math_verifier/verify.py @@ -14,7 +14,13 @@ import multiprocessing import re -from concurrent.futures import ProcessPoolExecutor, as_completed +from concurrent.futures import ( + ProcessPoolExecutor, + as_completed, +) +from concurrent.futures import ( + TimeoutError as FuturesTimeoutError, +) from typing import List, Union import regex @@ -347,22 +353,22 @@ def process_results(answer, solution): extracted_solution = extract_answer(solution, "math", use_last_number=True) if 
extracted_answer is None or extracted_answer.strip() in ["None", "none", ""]: - retval = 0 + retval = -1 elif extracted_solution is None or extracted_solution.strip() in [ "None", "none", "", ]: - retval = 0 + retval = -1 elif math_equal(extracted_answer, extracted_solution, timeout=False): # elif call_with_timeout(math_equal, extracted_answer, extracted_solution): retval = 1 else: - retval = 0 + retval = -1 return retval, (extracted_answer, extracted_solution) except Exception: - return 0, ("None", "None") + return -1, ("None", "None") def process_results_process(a, b, output_queue): @@ -401,7 +407,7 @@ def math_verify_call( jobs.append(job) all_jobs.append(jobs) - labels = [] + labels: List[int] = [] has_timeout = False for jobs in all_jobs: label = 0 @@ -409,40 +415,18 @@ def math_verify_call( for job in as_completed(jobs, timeout=timeout): x = job.result() label = label or x - except TimeoutError: + except FuturesTimeoutError: has_timeout = True - labels.append(label) + for job in jobs: + job.cancel() + finally: + labels.append(label) if has_timeout: reset_global_process_pool() return labels -class MathRewardModel: - def __init__(self, scale: float): - self.scale = scale - - def get_reward( - self, response: List[str], reference: List[List[str]] - ) -> List[float]: - """ - Calculates reward scores for a list of responses compared to corresponding lists of reference answers. - For each response, the function checks if it matches any of the provided references using the `process_results` function. - The reward for each response is computed as the first element of the result (converted to float) multiplied by `self.scale`. - Args: - response (List[str]): A list of response strings to be evaluated. - reference (List[List[str]]): A list where each element is a list of reference strings corresponding to each response. - Returns: - List[float]: A list of reward scores, one for each response. 
- """ - - results = [] - for resp, refs in zip(response, reference): - result = any(process_results(resp, ref)[0] for ref in refs) - results.append((1 if result else -1) * self.scale) - return results - - if __name__ == "__main__": sample = { "answers": ["\\boxed{-\\frac{2}{3}}"],