From f00e1f769d8adad5d00b98ed951443fb12eceeae Mon Sep 17 00:00:00 2001 From: zhangtao Date: Tue, 27 Jan 2026 22:31:38 +0800 Subject: [PATCH 1/2] Feat. MindIE-SD compilation support for NPU. --- docs/user_guide/ASCEND_NPU.md | 42 ++++++++++++++++++++++--- src/cache_dit/__init__.py | 3 ++ src/cache_dit/compile/__init__.py | 1 + src/cache_dit/compile/dispatch.py | 52 +++++++++++++++++++++++++++++++ 4 files changed, 94 insertions(+), 4 deletions(-) create mode 100644 src/cache_dit/compile/dispatch.py diff --git a/docs/user_guide/ASCEND_NPU.md b/docs/user_guide/ASCEND_NPU.md index be84d9def..8e429ceb6 100644 --- a/docs/user_guide/ASCEND_NPU.md +++ b/docs/user_guide/ASCEND_NPU.md @@ -14,10 +14,11 @@ Please refer to **[Ascend NPU Supported Matrix](../supported_matrix/ASCEND_NPU.m ## Features Support -|Device|Hybrid Cache|Context Parallel|Tensor Parallel|Text Encoder Parallel|Auto Encoder(VAE) Parallel| -|:---|:---:|:---:|:---:|:---:|:---:| -|Atlas 800T A2|✅|✅|✅|✅|✅| -|Atlas 800I A2|✅|✅|✅|✅|✅| +|Device|Hybrid Cache|Context Parallel|Tensor Parallel|Text Encoder Parallel|Auto Encoder(VAE) Parallel|Compilation| +|:---|:---:|:---:|:---:|:---:|:---:|:---:| +|Atlas 800T A2|✅|✅|✅|✅|✅|🟡| +|Atlas 800I A2|✅|✅|✅|✅|✅|🟡| +> 🟡: Experimental Feature ## Attention backend @@ -200,3 +201,36 @@ torchrun --nproc_per_node=4 -m cache_dit.generate flux --parallel ulysses --cach torchrun --nproc_per_node=4 -m cache_dit.generate zimage --parallel ulysses --cache --attn _native_npu torchrun --nproc_per_node=4 -m cache_dit.generate qwen_image --parallel ulysses --cache --attn _native_npu ``` + +## [Experimental] Speedup with MindIE-SD Compilation + +MindIE-SD is an open-source acceleration framework for diffusion models on Ascend NPU. By providing a custom `MindieSDBackend` for `torch.compile`, it enables automatic operator fusion and optimization for enhanced performance on Ascend hardware. For detailed documentation and examples, visit [MindIE-SD](https://gitcode.com/Ascend/MindIE-SD). 
+ + +### Install MindIE-SD + +```bash +git clone --branch master --single-branch https://gitcode.com/Ascend/MindIE-SD.git +cd MindIE-SD +pip install wheel +python3 setup.py bdist_wheel +pip install ./dist/*.whl --force-reinstall +``` + +### Enable MindIE-SD Compilation + +```bash +python3 generate.py flux --attn _native_npu --compile +``` + +### Performance + +The performance of MindIE-SD Compilation is as follows (device 910B3): + +|Model|Batch Size|Resolution|Compile|E2E Time(s)| +|:---|:---:|:---:|:---:|:---:| +|flux.1-dev|1|1024x1024|❌|14.09| +|flux.1-dev|1|1024x1024|✅|12.85| + +⚠️ **Experimental Feature**: MindIE-SD Compilation is currently in the experimental stage. +For bug reports, feature requests, or detailed information, please visit the [MindIE-SD Compilation Documentation](https://gitcode.com/Ascend/MindIE-SD/blob/master/docs/features/compilation.md). diff --git a/src/cache_dit/__init__.py b/src/cache_dit/__init__.py index a70ac1672..a9fdd203b 100644 --- a/src/cache_dit/__init__.py +++ b/src/cache_dit/__init__.py @@ -34,6 +34,7 @@ from .parallelism import ParallelismBackend from .parallelism import ParallelismConfig from .compile import set_compile_configs +from .compile import maybe_wrap_torch_compile from .summary import supported_matrix from .summary import summary from .summary import strify @@ -54,3 +55,5 @@ Pattern_3 = ForwardPattern.Pattern_3 Pattern_4 = ForwardPattern.Pattern_4 Pattern_5 = ForwardPattern.Pattern_5 + +maybe_wrap_torch_compile() diff --git a/src/cache_dit/compile/__init__.py b/src/cache_dit/compile/__init__.py index 5d8f9058d..28baee72c 100644 --- a/src/cache_dit/compile/__init__.py +++ b/src/cache_dit/compile/__init__.py @@ -1 +1,2 @@ from .utils import set_compile_configs +from .dispatch import maybe_wrap_torch_compile diff --git a/src/cache_dit/compile/dispatch.py b/src/cache_dit/compile/dispatch.py new file mode 100644 index 000000000..95c38c887 --- /dev/null +++ b/src/cache_dit/compile/dispatch.py @@ -0,0 +1,52 @@ +import 
import torch
import functools
from typing import Optional, Callable

from cache_dit.platforms import current_platform
from cache_dit.logger import init_logger

logger = init_logger(__name__)

# Saved original torch.compile. Also serves as the "already patched"
# sentinel, making maybe_wrap_torch_compile idempotent.
_original_torch_compile: Optional[Callable] = None


def _get_mindiesd_backend():
    """Return a ``MindieSDBackend`` instance, or ``None`` if MindIE-SD
    is not installed (the import is attempted lazily, at patch time)."""
    try:
        from mindiesd.compilation import MindieSDBackend

        return MindieSDBackend()
    except ImportError:
        return None


def maybe_wrap_torch_compile() -> bool:
    """Monkey-patch ``torch.compile`` to default to MindIE-SD on Ascend NPU.

    After patching, any ``torch.compile(...)`` call that does not pass an
    explicit ``backend=`` argument is routed to ``MindieSDBackend`` when
    (a) the current platform reports an NPU device type and (b) MindIE-SD
    is importable. Callers can always override by passing ``backend=...``.
    Calls with an explicit backend, or on non-NPU platforms, are forwarded
    to the original ``torch.compile`` unchanged.

    Returns:
        bool: ``True`` if ``torch.compile`` was patched by this call,
        ``False`` if it had already been patched earlier (the patch is
        applied at most once per process).
    """
    global _original_torch_compile

    # Avoid duplicate patch: keep the first saved original so repeated
    # calls never wrap the wrapper.
    if _original_torch_compile is not None:
        return False

    _original_torch_compile = torch.compile

    # Resolve the MindIE-SD backend once, at patch time.
    mindiesd_backend = _get_mindiesd_backend()

    @functools.wraps(_original_torch_compile)
    def patched_compile(*args, **kwargs):
        # Only redirect when the caller did not choose a backend and we
        # are on an NPU platform; never silently override an explicit one.
        if "backend" not in kwargs and "npu" in current_platform.device_type:
            if mindiesd_backend is not None:
                # NOTE: message names the real class, MindieSDBackend.
                logger.warning(
                    "NPU platform detected with MindIE-SD available. "
                    "torch.compile will default to MindieSDBackend. "
                    "Override it with torch.compile(backend=...) if needed."
                )
                kwargs["backend"] = mindiesd_backend
            else:
                logger.warning(
                    "NPU platform detected but MindIE-SD not found. "
                    "Run `pip install mindiesd` for better NPU performance on Compilation."
                )

        return _original_torch_compile(*args, **kwargs)

    # Patch Torch Compile
    torch.compile = patched_compile
    return True