From f00e1f769d8adad5d00b98ed951443fb12eceeae Mon Sep 17 00:00:00 2001 From: zhangtao Date: Tue, 27 Jan 2026 22:31:38 +0800 Subject: [PATCH 1/2] Feat. MindIE-SD compilation support for NPU. --- docs/user_guide/ASCEND_NPU.md | 42 ++++++++++++++++++++++--- src/cache_dit/__init__.py | 3 ++ src/cache_dit/compile/__init__.py | 1 + src/cache_dit/compile/dispatch.py | 52 +++++++++++++++++++++++++++++++ 4 files changed, 94 insertions(+), 4 deletions(-) create mode 100644 src/cache_dit/compile/dispatch.py diff --git a/docs/user_guide/ASCEND_NPU.md b/docs/user_guide/ASCEND_NPU.md index be84d9def..8e429ceb6 100644 --- a/docs/user_guide/ASCEND_NPU.md +++ b/docs/user_guide/ASCEND_NPU.md @@ -14,10 +14,11 @@ Please refer to **[Ascend NPU Supported Matrix](../supported_matrix/ASCEND_NPU.m ## Features Support -|Device|Hybrid Cache|Context Parallel|Tensor Parallel|Text Encoder Parallel|Auto Encoder(VAE) Parallel| -|:---|:---:|:---:|:---:|:---:|:---:| -|Atlas 800T A2|✅|✅|✅|✅|✅| -|Atlas 800I A2|✅|✅|✅|✅|✅| +|Device|Hybrid Cache|Context Parallel|Tensor Parallel|Text Encoder Parallel|Auto Encoder(VAE) Parallel|Compilation| +|:---|:---:|:---:|:---:|:---:|:---:|:---:| +|Atlas 800T A2|✅|✅|✅|✅|✅|🟡| +|Atlas 800I A2|✅|✅|✅|✅|✅|🟡| +> 🟡: Experimental Feature ## Attention backend @@ -200,3 +201,36 @@ torchrun --nproc_per_node=4 -m cache_dit.generate flux --parallel ulysses --cach torchrun --nproc_per_node=4 -m cache_dit.generate zimage --parallel ulysses --cache --attn _native_npu torchrun --nproc_per_node=4 -m cache_dit.generate qwen_image --parallel ulysses --cache --attn _native_npu ``` + +## [Experimental] Speedup with MindIE-SD Compilation + +MindIE-SD is an open-source acceleration framework for diffusion models on Ascend NPU. By providing a custom `MindieSDBackend` for `torch.compile`, it enables automatic operator fusion and optimization for enhanced performance on Ascend hardware. For detailed documentation and examples, visit [MindIE-SD](https://gitcode.com/Ascend/MindIE-SD). 
+ + +### Install MindIE-SD + +```bash +git clone --branch master --single-branch https://gitcode.com/Ascend/MindIE-SD.git +cd MindIE-SD +pip install wheel +python3 setup.py bdist_wheel +pip install ./dist/*.whl --force-reinstall +``` + +### Enable MindIE-SD Compilation + +```bash +python3 generate.py flux --attn _native_npu --compile +``` + +### Performance + +The performance of MindIE-SD Compilation is as follows (device 910B3): + +|Model|Batch Size|Resolution|Compile|E2E Time(s)| +|:---|:---:|:---:|:---:|:---:| +|flux.1-dev|1|1024x1024|❌|14.09| +|flux.1-dev|1|1024x1024|✅|12.85| + +⚠️ **Experimental Feature**: MindIE-SD Compilation is currently in the experimental stage. +For bug reports, feature requests, or detailed information, please visit the [MindIE-SD Compilation Documentation](https://gitcode.com/Ascend/MindIE-SD/blob/master/docs/features/compilation.md). diff --git a/src/cache_dit/__init__.py b/src/cache_dit/__init__.py index a70ac1672..a9fdd203b 100644 --- a/src/cache_dit/__init__.py +++ b/src/cache_dit/__init__.py @@ -34,6 +34,7 @@ from .parallelism import ParallelismBackend from .parallelism import ParallelismConfig from .compile import set_compile_configs +from .compile import maybe_wrap_torch_compile from .summary import supported_matrix from .summary import summary from .summary import strify @@ -54,3 +55,5 @@ Pattern_3 = ForwardPattern.Pattern_3 Pattern_4 = ForwardPattern.Pattern_4 Pattern_5 = ForwardPattern.Pattern_5 + +maybe_wrap_torch_compile() diff --git a/src/cache_dit/compile/__init__.py b/src/cache_dit/compile/__init__.py index 5d8f9058d..28baee72c 100644 --- a/src/cache_dit/compile/__init__.py +++ b/src/cache_dit/compile/__init__.py @@ -1 +1,2 @@ from .utils import set_compile_configs +from .dispatch import maybe_wrap_torch_compile diff --git a/src/cache_dit/compile/dispatch.py b/src/cache_dit/compile/dispatch.py new file mode 100644 index 000000000..95c38c887 --- /dev/null +++ b/src/cache_dit/compile/dispatch.py @@ -0,0 +1,52 @@ +import 
import torch
import functools
from typing import Optional, Callable

from cache_dit.platforms import current_platform
from cache_dit.logger import init_logger

logger = init_logger(__name__)

# Saved original torch.compile. Also serves as the "already patched"
# sentinel, making maybe_wrap_torch_compile idempotent.
_original_torch_compile: Optional[Callable] = None


def _get_mindiesd_backend():
    """Return a ``MindieSDBackend`` instance, or ``None`` if MindIE-SD
    is not installed (the import is attempted lazily, at patch time)."""
    try:
        from mindiesd.compilation import MindieSDBackend

        return MindieSDBackend()
    except ImportError:
        return None


def maybe_wrap_torch_compile() -> bool:
    """Monkey-patch ``torch.compile`` to default to MindIE-SD on Ascend NPU.

    After patching, any ``torch.compile(...)`` call that does not pass an
    explicit ``backend=`` argument is routed to ``MindieSDBackend`` when
    (a) the current platform reports an NPU device type and (b) MindIE-SD
    is importable. Callers can always override by passing ``backend=...``.
    Calls with an explicit backend, or on non-NPU platforms, are forwarded
    to the original ``torch.compile`` unchanged.

    Returns:
        bool: ``True`` if ``torch.compile`` was patched by this call,
        ``False`` if it had already been patched earlier (the patch is
        applied at most once per process).
    """
    global _original_torch_compile

    # Avoid duplicate patch: keep the first saved original so repeated
    # calls never wrap the wrapper.
    if _original_torch_compile is not None:
        return False

    _original_torch_compile = torch.compile

    # Resolve the MindIE-SD backend once, at patch time.
    mindiesd_backend = _get_mindiesd_backend()

    @functools.wraps(_original_torch_compile)
    def patched_compile(*args, **kwargs):
        # Only redirect when the caller did not choose a backend and we
        # are on an NPU platform; never silently override an explicit one.
        if "backend" not in kwargs and "npu" in current_platform.device_type:
            if mindiesd_backend is not None:
                # NOTE: message names the real class, MindieSDBackend.
                logger.warning(
                    "NPU platform detected with MindIE-SD available. "
                    "torch.compile will default to MindieSDBackend. "
                    "Override it with torch.compile(backend=...) if needed."
                )
                kwargs["backend"] = mindiesd_backend
            else:
                logger.warning(
                    "NPU platform detected but MindIE-SD not found. "
                    "Run `pip install mindiesd` for better NPU performance on Compilation."
                )

        return _original_torch_compile(*args, **kwargs)

    # Patch Torch Compile
    torch.compile = patched_compile
    return True