ruoqianguo · KyeeHuang · Jun 16, 2025 · Jun 23, 2025 · Jun 23, 2025 · Jul 2, 2025
diff --git a/.gitmodules b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "third-party/cutlass"]
 	path = third-party/cutlass
 	url = https://github.com/NVIDIA/cutlass.git
+[submodule "third-party/fmt"]
+	path = third-party/fmt
+	url = https://github.com/fmtlib/fmt.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,44 +1,33 @@
 # NOTES: current just for CMake-based IDE (e.g. CLion) indexing, the real compilation is done via JIT
-# TODO: add CUDA utils' library via CMake
 cmake_minimum_required(VERSION 3.10)
 project(deep_gemm LANGUAGES CXX CUDA)
-
-set(CMAKE_CXX_STANDARD 20)
-set(CMAKE_CUDA_STANDARD 20)
 set(CMAKE_VERBOSE_MAKEFILE ON)
 
-find_package(CUDAToolkit REQUIRED)
-find_package(pybind11 REQUIRED)
-
-file(WRITE ${CMAKE_BINARY_DIR}/test_cuda.cu "extern \"C\" __global__ void testKernel() { }")
-execute_process(
-        COMMAND ${CUDA_NVCC_EXECUTABLE} ${CMAKE_CUDA_FLAGS} -gencode arch=compute_90a,code=sm_90a -o ${CMAKE_BINARY_DIR}/test_cuda.o -c ${CMAKE_BINARY_DIR}/test_cuda.cu
-        RESULT_VARIABLE NVCC_RESULT
-        OUTPUT_VARIABLE NVCC_OUTPUT
-        ERROR_VARIABLE NVCC_ERROR_OUTPUT
-        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
-)
+string(APPEND CMAKE_C_FLAGS " -O3 -fPIC -Wno-psabi")  # host C flags
+string(APPEND CMAKE_CXX_FLAGS " -O3 -fPIC -Wno-psabi")  # host C++ flags, same additions as for C
+set(CUDA_SEPARABLE_COMPILATION ON)
+list(APPEND CUDA_NVCC_FLAGS  # extra NVCC options, appended in this order
+    "-DENABLE_FAST_DEBUG" "-O3"
+    "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage")
 
-if (NVCC_RESULT EQUAL "0")
-    set(NVCC_SUPPORTS_SM90 TRUE)
-    message(STATUS "NVCC supports SM90")
-else()
-    message(STATUS "NVCC does not support SM90")
-endif()
+set(USE_SYSTEM_NVTX on)  # presumably read by the Torch CMake package found below — TODO confirm
+set(CUDA_ARCH_LIST "9.0" CACHE STRING "List of CUDA architectures to compile")  # cache entry, defaults to 9.0
+set(TORCH_CUDA_ARCH_LIST "${CUDA_ARCH_LIST}")  # mirrors the selection under the TORCH_-prefixed name
 
-if (NVCC_SUPPORTS_SM90)
-    set(TORCH_CUDA_ARCH_LIST "8.6" CACHE STRING "Add arch tag 90a to NVCC" FORCE)
-    list(APPEND CUDA_NVCC_FLAGS "-gencode;arch=compute_90a,code=sm_90a")
-endif()
+find_package(CUDAToolkit REQUIRED)  # NOTE(review): FindCUDAToolkit needs CMake >= 3.17, but this file declares 3.10
+find_package(pybind11 REQUIRED)  # provides pybind11_add_module, used for the Python extension target
 find_package(Torch REQUIRED)
 
-include_directories(deep_gemm/include third-party/cutlass/include third-party/cutlass/tools/util/include)
-include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include ${TORCH_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS})
-link_directories(${TORCH_INSTALL_PREFIX}/lib ${CUDA_TOOLKIT_ROOT_DIR}/lib)
+set(CMAKE_CXX_STANDARD 17)  # host code is compiled as C++17
+set(CMAKE_CUDA_STANDARD 17)  # CUDA code uses the same standard
+# CUDA header/library paths come from find_package(CUDAToolkit) rather than a hard-coded x86_64-linux layout
+include_directories(deep_gemm/include third-party/cutlass/include third-party/cutlass/tools/util/include third-party/fmt/include)
+include_directories(${CUDAToolkit_INCLUDE_DIRS} ${TORCH_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS})
+link_directories(${TORCH_INSTALL_PREFIX}/lib ${CUDAToolkit_LIBRARY_DIR} ${CUDAToolkit_LIBRARY_DIR}/stubs)
 
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -fPIC")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC")
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 -fPIC -DNDEBUG")
-set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3 -std=c++17 -DNDEBUG --ptxas-options=--register-usage-level=10")
+# The main Python API entry point: builds the `deep_gemm_cpp` extension module from csrc/python_api.cpp
+pybind11_add_module(deep_gemm_cpp csrc/python_api.cpp)
+target_link_libraries(deep_gemm_cpp PRIVATE ${TORCH_LIBRARIES} torch_python cuda)  # `cuda` is linked by bare library name
 
-cuda_add_library(example_gemm STATIC indexing/main.cu)
+# Enable kernel code indexing with CMake-based IDEs: a static library built from the single indexing source file
+cuda_add_library(deep_gemm_indexing_cuda STATIC csrc/indexing/main.cu)
diff --git a/README.md b/README.md
diff --git a/build.sh b/build.sh
@@ -0,0 +1,12 @@
+# Change current directory into project root
+original_dir=$(pwd)
+script_dir=$(realpath "$(dirname "$0")")
+cd "$script_dir"
+
+# Remove old dist file, build files, and install
+rm -rf build dist
+rm -rf *.egg-info
+python setup.py bdist_wheel
+
+# Open users' original directory
+cd "$original_dir"
diff --git a/csrc/apis/gemm.hpp b/csrc/apis/gemm.hpp
diff --git a/csrc/apis/layout.hpp b/csrc/apis/layout.hpp
@@ -0,0 +1,93 @@
+#pragma once
+
+#include "../utils/layout.hpp"
+#include "../jit_kernels/impls/smxx_layout.hpp"
+
+namespace deep_gemm::layout {
+
+static torch::Tensor transform_sf_into_required_layout(const torch::Tensor& sf,
+                                                       const int& mn, const int& k,
+                                                       const std::tuple<int, int, int>& recipe,
+                                                       const std::optional<int>& num_groups,
+                                                       const bool& is_sfa,
+                                                       const bool& disable_ue8m0_cast,
+                                                       const bool& is_per_tensor) {
+    // MN granularity is `recipe[0]` when `is_sfa` and `recipe[1]` otherwise; `recipe[2]` is the K granularity
+    const auto& gran_k = std::get<2>(recipe);
+    const auto& gran_mn = is_sfa ? std::get<0>(recipe) : std::get<1>(recipe);
+    const auto& arch_major = device_runtime->get_arch_major();
+    // Pre-transform checks (the last argument forwards the per-tensor flag)
+    check_sf_layout(sf, mn, k, gran_mn, gran_k, num_groups, false, false, std::nullopt, is_per_tensor);
+
+    const bool is_fp32 = sf.scalar_type() == torch::kFloat;
+    const bool is_1x128 = gran_mn == 1 and gran_k == 128;
+    const bool is_128x128 = gran_mn == 128 and gran_k == 128;
+    const bool keeps_fp32 = arch_major == 9 or disable_ue8m0_cast;
+
+    // (FP32, 1, 128) on SM90 or with the UE8M0 cast disabled: transform to TMA-aligned and MN-major
+    if (is_fp32 and is_1x128 and keeps_fp32)
+        return get_mn_major_tma_aligned_tensor(sf);
+
+    // (FP32, 1, 128) on SM100: transform to (INT, 1, 128), TMA-aligned and MN-major
+    if (is_fp32 and is_1x128 and arch_major == 10) {
+        DG_HOST_ASSERT(not disable_ue8m0_cast);
+        return get_mn_major_tma_aligned_packed_ue8m0_tensor(sf);
+    }
+
+    // (FP32, 128, 128) on SM90 or with the cast disabled: no transform, only a layout check (per-tensor variant second)
+    if (is_fp32 and is_128x128 and keeps_fp32 and is_per_tensor == false)
+        return check_sf_layout(sf, mn, k, gran_mn, gran_k, num_groups, false, true, torch::kFloat);
+    if (is_fp32 and is_128x128 and keeps_fp32 and is_per_tensor)
+        return check_sf_layout(sf, mn, k, gran_mn, gran_k, num_groups, false, true, torch::kFloat, true);
+
+    // (FP32, 128, 128) on SM100: select one SF row per MN index, then transform to (INT, 1, 128), TMA-aligned and MN-major
+    if (is_fp32 and is_128x128 and arch_major == 10) {
+        DG_HOST_ASSERT(not disable_ue8m0_cast);
+        const auto& sf_per_row = sf.index_select(-2, torch::arange(mn, at::TensorOptions().device(sf.device())).floor_divide_(128));
+        return get_mn_major_tma_aligned_packed_ue8m0_tensor(sf_per_row);
+    }
+
+    // (INT, 1, 128) on SM100: already in the target type, only the layout is checked
+    if (sf.scalar_type() == torch::kInt and is_1x128 and arch_major == 10)
+        return check_sf_layout(sf, mn, k, gran_mn, gran_k, num_groups, true, false, torch::kInt);
+
+    DG_HOST_UNREACHABLE("Unknown SF transformation");
+}
+
+static torch::Tensor transform_k_grouped_sf_into_required_layout(const torch::Tensor& sf,
+                                                                 const std::vector<int>& ks,
+                                                                 const torch::Tensor& ks_tensor,
+                                                                 const std::tuple<int, int, int>& recipe) {
+    // K-grouped variant: only 2D `sf` with recipe (1, 1, 128) is accepted
+    DG_HOST_ASSERT(sf.dim() == 2);
+    DG_HOST_ASSERT(recipe == std::make_tuple(1, 1, 128));
+    const auto& arch_major = device_runtime->get_arch_major();
+    // FP32 on SM90: not implemented
+    if (sf.scalar_type() == torch::kFloat and arch_major == 9)
+        DG_HOST_UNREACHABLE("Unimplemented");
+
+    // FP32 on SM100: convert through the K-grouped packed-UE8M0 helper
+    if (sf.scalar_type() == torch::kFloat and arch_major == 10)
+        return get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor(sf, ks_tensor, ks);
+
+    // INT on SM100: not implemented (tests `kInt`; repeating the `kFloat` test would make this branch dead)
+    if (sf.scalar_type() == torch::kInt and arch_major == 10)
+        DG_HOST_UNREACHABLE("Unimplemented");
+
+    DG_HOST_UNREACHABLE("Unknown cases");
+}
+
+static void register_apis(pybind11::module_& m) {  // Registers this namespace's layout helpers on module `m`
+    m.def("transform_sf_into_required_layout", &transform_sf_into_required_layout,
+      py::arg("sf"), py::arg("mn"), py::arg("k"), py::arg("recipe"),
+      py::arg("num_groups") = std::nullopt, py::arg("is_sfa") = false,  // arguments from here on carry defaults
+      py::arg("disable_ue8m0_cast") = false, py::arg("is_per_tensor") = false);
+    // The helpers below are bound without argument names or default values
+    m.def("get_tma_aligned_size", &get_tma_aligned_size);
+    m.def("get_mk_alignment_for_contiguous_layout", &get_mk_alignment_for_contiguous_layout);
+    m.def("get_mn_major_tma_aligned_tensor", &get_mn_major_tma_aligned_tensor);
+    m.def("get_mn_major_tma_aligned_packed_ue8m0_tensor", &get_mn_major_tma_aligned_packed_ue8m0_tensor);
+    m.def("get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor", &get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor);
+}
+
+} // namespace deep_gemm::layout
diff --git a/csrc/apis/runtime.hpp b/csrc/apis/runtime.hpp
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "../jit/compiler.hpp"
+#include "../jit/device_runtime.hpp"
+
+namespace deep_gemm::runtime {
+
+static void register_apis(pybind11::module_& m) {
+    // Binds the device-runtime accessors and the `init` hook onto module `m`.
+    // The callbacks outlive this call, so they take an empty capture list instead of `[&]`;
+    // nothing local to this function is used inside them.
+
+    // `num_sms` setter/getter, forwarded to `device_runtime`
+    m.def("set_num_sms", [](const int& new_num_sms) { device_runtime->set_num_sms(new_num_sms); });
+    m.def("get_num_sms", []() { return device_runtime->get_num_sms(); });
+
+    // `tc_util` setter/getter, forwarded to `device_runtime`
+    m.def("set_tc_util", [](const int& new_tc_util) { device_runtime->set_tc_util(new_tc_util); });
+    m.def("get_tc_util", []() { return device_runtime->get_tc_util(); });
+
+    // Passes the library root and the CUDA home supplied by Python to the compiler and kernel-runtime setup
+    m.def("init", [](const std::string& library_root_path, const std::string& cuda_home_path_by_python) {
+        Compiler::prepare_init(library_root_path, cuda_home_path_by_python);
+        KernelRuntime::prepare_init(cuda_home_path_by_python);
+    });
+}
+
+} // namespace deep_gemm::runtime
diff --git a/csrc/indexing/main.cu b/csrc/indexing/main.cu
@@ -0,0 +1,13 @@
+#include <deep_gemm/impls/sm90_bf16_gemm.cuh>
+#include <deep_gemm/impls/sm100_bf16_gemm.cuh>
+#include <deep_gemm/impls/sm90_fp8_gemm_1d1d.cuh>
+#include <deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh>
+#include <deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh>
+#include <deep_gemm/impls/sm100_fp8_gemm_1d2d.cuh>
+#include <deep_gemm/impls/smxx_layout.cuh>
+
+using namespace deep_gemm;
+
+int main() {  // Intentionally empty; presumably this file exists only so the headers included above get parsed for IDE indexing
+    return 0;
+}
diff --git a/csrc/jit/cache.hpp b/csrc/jit/cache.hpp
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <filesystem>
+#include <memory>
+#include <unordered_map>
+
+#include "kernel_runtime.hpp"
+
+namespace deep_gemm {
+
+class KernelRuntimeCache {
+    std::unordered_map<std::string, std::shared_ptr<KernelRuntime>> cache;
+
+public:
+    // TODO: consider cache capacity
+    KernelRuntimeCache() = default;
+
+    // Cached runtime for `dir_path`; built and stored on a miss if the directory is valid, else `nullptr`
+    std::shared_ptr<KernelRuntime> get(const std::filesystem::path& dir_path) {
+        const std::string key = dir_path.string();
+        if (const auto cached = cache.find(key); cached != cache.end())
+            return cached->second;
+        if (not KernelRuntime::check_validity(dir_path))
+            return nullptr;
+        return cache.emplace(key, std::make_shared<KernelRuntime>(dir_path)).first->second;
+    }
+};
+
+static auto kernel_runtime_cache = std::make_shared<KernelRuntimeCache>();  // NOTE(review): `static` in a header gives each including translation unit its own cache instance
+
+} // namespace deep_gemm