diff --git a/graph_net/tools/generate_subgraph_dataset.sh b/graph_net/tools/generate_subgraph_dataset.sh
index b381ea711..3456067ef 100755
--- a/graph_net/tools/generate_subgraph_dataset.sh
+++ b/graph_net/tools/generate_subgraph_dataset.sh
@@ -29,18 +29,20 @@ DIM_GENERALIZED_FUSIBLE_SUBGRAPH_DIR=$DECOMPOSE_WORKSPACE/11_dimension_generaliz
 RENAMED_DIM_GENERALIZED_FUSIBLE_SUBGRAPH_DIR=$DECOMPOSE_WORKSPACE/12_renamed_dimension_generalized_fusible_subgraphs
 DEDUP_DIM_GENERALIZED_FUSIBLE_SUBGRAPH_DIR=$DECOMPOSE_WORKSPACE/13_deduplicated_dimension_generalized_fusible_subgraphs
 DTYPE_GENERALIZED_FUSIBLE_SUBGRAPH_DIR=$DECOMPOSE_WORKSPACE/14_dtype_generalized_fusible_subgraphs
-FUSIBLE_SUBGRAPH_UNITTEST_DIR=$DECOMPOSE_WORKSPACE/15_fusible_subgraphs_unittests
+BACKWARD_GRAPH_OUTPUT_DIR=$DECOMPOSE_WORKSPACE/15_backward_graph_extracted
+FUSIBLE_SUBGRAPH_UNITTEST_DIR=$DECOMPOSE_WORKSPACE/16_fusible_subgraphs_unittests
 
 # typical_subgraphs
 DIM_GENERALIZED_TYPICAL_SUBGRAPH_DIR=$DECOMPOSE_WORKSPACE/2-08_dimension_generalized_typical_subgraphs
 RENAMED_DIM_GENERALIZED_TYPICAL_SUBGRAPH_DIR=$DECOMPOSE_WORKSPACE/2-09_renamed_dimension_generalized_typical_subgraphs
 DEDUP_DIM_GENERALIZED_TYPICAL_SUBGRAPH_DIR=$DECOMPOSE_WORKSPACE/2-10_deduplicated_dimension_generalized_typical_subgraphs
 DTYPE_GENERALIZED_TYPICAL_SUBGRAPH_DIR=$DECOMPOSE_WORKSPACE/2-11_dtype_generalized_typical_subgraphs
-TYPICAL_SUBGRAPH_UNITTEST_DIR=$DECOMPOSE_WORKSPACE/2-12_typical_kernelbench_unittests
+BACKWARD_GRAPH_TYPICAL_OUTPUT_DIR=$DECOMPOSE_WORKSPACE/2-13_backward_graph_extracted
+TYPICAL_SUBGRAPH_UNITTEST_DIR=$DECOMPOSE_WORKSPACE/2-14_typical_kernelbench_unittests
 
 mkdir -p "$DECOMPOSE_WORKSPACE"
 
-model_list="$GRAPH_NET_ROOT/graph_net/config/torch_samples_list.txt"
+model_list="$GRAPH_NET_ROOT/graph_net/config/small10_torch_samples_list.txt"
 DB_PATH=$DECOMPOSE_WORKSPACE/small100_torch_samples.db
 
 device_rewrited_sample_list=${DECOMPOSE_WORKSPACE}/device_rewrited_sample_list.txt
@@ -401,8 +403,28 @@ EOF
 )
 }
 
+function backward_graph_extractor() {
+    echo ">>> [13] Extract backward graphs for samples under ${DTYPE_GENERALIZED_FUSIBLE_SUBGRAPH_DIR}."
+    echo ">>>"
+    python3 -m graph_net.model_path_handler \
+        --model-path-list ${dtype_generalized_subgraphs_list} \
+        --handler-config=$(base64 -w 0 <<EOF
+{
+    "handler_path": "$GRAPH_NET_ROOT/graph_net/torch/sample_pass/backward_graph_extractor.py",
+    "handler_class_name": "BackwardGraphExtractorPass",
+    "handler_config": {
+        "model_path_prefix": "$DTYPE_GENERALIZED_FUSIBLE_SUBGRAPH_DIR",
+        "output_dir": "$BACKWARD_GRAPH_OUTPUT_DIR",
+        "device": "cuda",
+        "resume": ${RESUME}
+    }
+}
+EOF
+)
+}
+
 function generate_unittests() {
-    echo ">>> [13] Generate unittests for subgraph samples under ${DTYPE_GENERALIZED_FUSIBLE_SUBGRAPH_DIR}. "
+    echo ">>> [14] Generate unittests for subgraph samples under ${BACKWARD_GRAPH_OUTPUT_DIR}. "
     echo ">>>"
     python3 -m graph_net.model_path_handler \
         --model-path-list ${dtype_generalized_subgraphs_list} \
@@ -412,7 +434,7 @@ function generate_unittests() {
     "handler_class_name": "AgentUnittestGeneratorPass",
     "handler_config": {
         "framework": "torch",
-        "model_path_prefix": "${DTYPE_GENERALIZED_FUSIBLE_SUBGRAPH_DIR}",
+        "model_path_prefix": "${BACKWARD_GRAPH_OUTPUT_DIR}",
         "output_dir": "${FUSIBLE_SUBGRAPH_UNITTEST_DIR}",
         "device": "cuda",
         "generate_main": false,
@@ -516,8 +538,28 @@ EOF
 )
 }
 
+function backward_graph_extractor_typical() {
+    echo ">>> [2-13] Extract backward graphs for typical subgraph samples under ${DTYPE_GENERALIZED_TYPICAL_SUBGRAPH_DIR}."
+    echo ">>>"
+    python3 -m graph_net.model_path_handler \
+        --model-path-list ${dtype_generalized_typical_subgraph_list} \
+        --handler-config=$(base64 -w 0 <<EOF
+{
+    "handler_path": "$GRAPH_NET_ROOT/graph_net/torch/sample_pass/backward_graph_extractor.py",
+    "handler_class_name": "BackwardGraphExtractorPass",
+    "handler_config": {
+        "model_path_prefix": "$DTYPE_GENERALIZED_TYPICAL_SUBGRAPH_DIR",
+        "output_dir": "$BACKWARD_GRAPH_TYPICAL_OUTPUT_DIR",
+        "device": "cuda",
+        "resume": ${RESUME}
+    }
+}
+EOF
+)
+}
+
 function generate_unittest_for_typical_subgraphs() {
-    echo ">>> [2-13] Generate unittests for subgraph samples under ${DTYPE_GENERALIZED_TYPICAL_SUBGRAPH_DIR}. "
+    echo ">>> [2-14] Generate unittests for subgraph samples under ${BACKWARD_GRAPH_TYPICAL_OUTPUT_DIR}. "
     echo ">>>"
     python3 -m graph_net.model_path_handler \
         --model-path-list ${dtype_generalized_typical_subgraph_list} \
@@ -527,7 +569,7 @@ function generate_unittest_for_typical_subgraphs() {
     "handler_class_name": "AgentUnittestGeneratorPass",
     "handler_config": {
         "framework": "torch",
-        "model_path_prefix": "${DTYPE_GENERALIZED_TYPICAL_SUBGRAPH_DIR}",
+        "model_path_prefix": "${BACKWARD_GRAPH_TYPICAL_OUTPUT_DIR}",
         "output_dir": "${TYPICAL_SUBGRAPH_UNITTEST_DIR}",
         "device": "cuda",
         "generate_main": false,
@@ -582,6 +624,9 @@ function generate_fusible_subgraphs() {
     dtype_generalizer 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_dtype_generalizer_${suffix}.txt
     generate_generalized_subgraph_list ${DTYPE_GENERALIZED_FUSIBLE_SUBGRAPH_DIR} ${dtype_generalized_subgraphs_list}
 
+    # extract backward graphs (train_forward, train_backward, eval_forward)
+    backward_graph_extractor 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_backward_graph_extractor_${suffix}.txt
+
     # generate kernelbench format unittest
     generate_unittests 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_unittests_${suffix}.txt
 }
@@ -602,6 +647,9 @@ function generate_typical_subgraphs() {
     dtype_generalizer_for_typical_subgraphs 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_dtype_generalizer_typical_subgraphs_${suffix}.txt
     generate_generalized_subgraph_list ${DTYPE_GENERALIZED_TYPICAL_SUBGRAPH_DIR} ${dtype_generalized_typical_subgraph_list}
 
+    # extract backward graphs (train_forward, train_backward, eval_forward)
+    backward_graph_extractor_typical 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_backward_graph_extractor_typical_${suffix}.txt
+
     # generate kernelbench format unittest
     generate_unittest_for_typical_subgraphs 2>&1 | tee ${DECOMPOSE_WORKSPACE}/log_unittests_typical_subgraphs_${suffix}.txt
 }
@@ -613,11 +661,14 @@ function main() {
     python ${GRAPH_NET_ROOT}/sqlite/init_db.py --db_path ${DB_PATH} 2>&1 | tee sqlite/logs/init_db_${timestamp}.log
     insert_graph_sample ${GRAPH_NET_ROOT} "github_torch_samples" "full_graph" ${model_list}
 
+    # Generate common decomposition (must run first)
+    do_common_generalzation_and_decompose
+
     generate_fusible_subgraphs
     insert_graph_sample ${DEDUP_DIM_GENERALIZED_FUSIBLE_SUBGRAPH_DIR} "github_torch_samples" "fusible_graph" ${deduplicated_fusible_subgraphs_list}
 
     generate_typical_subgraphs
-    insert_graph_sample ${DEDUP_TYPICAL_SUBGRAPH_DIR} "github_torch_samples" "typical_graph" ${deduplicated_typical_subgraph_list}    
+    insert_graph_sample ${DEDUP_TYPICAL_SUBGRAPH_DIR} "github_torch_samples" "typical_graph" ${deduplicated_typical_subgraph_list}
 }
 
 summary() {
@@ -700,9 +751,18 @@ summary() {
     done
     echo ""
 
+    num_backward_graph_subgraphs=`find ${BACKWARD_GRAPH_OUTPUT_DIR} -name "model.py" | wc -l`
+    echo "- [Fusible - 6] backward graph extraction: successed=${num_backward_graph_subgraphs}"
+    for mode in eval_forward train_forward train_backward
+    do
+        num_backward_graph_subgraphs_mode=`find ${BACKWARD_GRAPH_OUTPUT_DIR}/${mode} -name "model.py" | wc -l`
+        echo "    ${mode}, successed=${num_backward_graph_subgraphs_mode}"
+    done
+    echo ""
+
     num_successed_unittests=`find ${FUSIBLE_SUBGRAPH_UNITTEST_DIR} -name "*_test.py" | wc -l`
-    unittest_successed_percent=$((num_successed_unittests * 100 / num_dtype_generalized_subgraphs))
-    echo "- [Fusible - 6] generate unittest: successed=${num_successed_unittests}, percent=${unittest_successed_percent}%"
+    unittest_successed_percent=$((num_successed_unittests * 100 / num_backward_graph_subgraphs))
+    echo "- [Fusible - 7] generate unittest: successed=${num_successed_unittests}, percent=${unittest_successed_percent}%"
     for dtype in float32 float16 bfloat16
     do
         num_successed_unittests=`find ${FUSIBLE_SUBGRAPH_UNITTEST_DIR}/${dtype} -name "*_test.py" | wc -l`
@@ -734,6 +794,15 @@ summary() {
         echo "    ${index}, successed=${num_deduplicated_typical_subgraphs_index}"
     done
     echo ""
+
+    num_backward_graph_typical_subgraphs=`find ${BACKWARD_GRAPH_TYPICAL_OUTPUT_DIR} -name "model.py" | wc -l`
+    echo "- [Typical - 4] backward graph extraction: successed=${num_backward_graph_typical_subgraphs}"
+    for mode in eval_forward train_forward train_backward
+    do
+        num_backward_graph_typical_mode=`find ${BACKWARD_GRAPH_TYPICAL_OUTPUT_DIR}/${mode} -name "model.py" | wc -l`
+        echo "    ${mode}, successed=${num_backward_graph_typical_mode}"
+    done
+    echo ""
 }
 
 main
diff --git a/graph_net/torch/sample_pass/backward_graph_extractor.py b/graph_net/torch/sample_pass/backward_graph_extractor.py
index f27b1da9b..e50d3b7e9 100644
--- a/graph_net/torch/sample_pass/backward_graph_extractor.py
+++ b/graph_net/torch/sample_pass/backward_graph_extractor.py
@@ -16,11 +16,12 @@
 
 
 class BackwardGraphExtractor:
-    def __init__(self, model_name, model_path, output_dir, device):
+    def __init__(self, model_name, model_path, output_dir, device, rel_model_path):
         self.model_name = model_name
         self.model_path = model_path
         self.output_dir = output_dir
         self.device = device
+        self.rel_model_path = rel_model_path
 
     def __call__(self):
         module, forward_inputs = get_torch_module_and_inputs(
@@ -28,24 +29,27 @@ def __call__(self):
         )
         module.train()
 
-        original_name_dir = os.path.join(self.output_dir, self.model_name)
-        if not os.path.exists(original_name_dir):
-            shutil.copytree(self.model_path, original_name_dir)
+        eval_forward_dir = os.path.join(
+            self.output_dir, "eval_forward", self.rel_model_path
+        )
+        if not os.path.exists(eval_forward_dir):
+            shutil.copytree(self.model_path, eval_forward_dir)
 
         forward_inputs = self.set_requires_grad_for_forward_inputs(
             self.model_path, module, forward_inputs
         )
         gm_holder, backward_inputs = self.capture_graph(module, forward_inputs)
-        self.get_extractor("forward")(gm_holder["forward_gm"], forward_inputs)
-        self.get_extractor("backward")(gm_holder["backward_gm"], backward_inputs)
+        self.get_extractor("train_forward")(gm_holder["forward_gm"], forward_inputs)
+        self.get_extractor("train_backward")(gm_holder["backward_gm"], backward_inputs)
 
     def get_extractor(self, suffix):
+        workspace_path = os.path.join(self.output_dir, suffix, self.rel_model_path)
         return BuiltinGraphExtractor(
             name=f"{self.model_name}_{suffix}",
             dynamic=False,
             mut_graph_codes=[],
             placeholder_auto_rename=False,
-            workspace_path=self.output_dir,
+            workspace_path=workspace_path,
         )
 
     def capture_graph(self, module, forward_inputs):
@@ -180,13 +184,11 @@ def resume(self, rel_model_path: str):
         model_path_prefix = Path(self.config["model_path_prefix"])
         model_name = f"{os.path.basename(rel_model_path)}"
         model_path = model_path_prefix / rel_model_path
-        output_dir = (
-            Path(self.config["output_dir"])
-            / os.path.dirname(rel_model_path)
-            / model_name
-        )
+        output_dir = Path(self.config["output_dir"])
         device = self._choose_device(self.config["device"])
-        extractor = BackwardGraphExtractor(model_name, model_path, output_dir, device)
+        extractor = BackwardGraphExtractor(
+            model_name, model_path, output_dir, device, rel_model_path
+        )
         extractor()
 
     def _choose_device(self, device) -> str: