Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 3 additions & 141 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
# Source directory of the enclosing project(); every path below is derived
# from it.
set(NVFUSER_ROOT ${PROJECT_SOURCE_DIR})

# Top-level source directories.
set(NVFUSER_SRCS_DIR "${NVFUSER_ROOT}/csrc")
set(NVFUSER_CUTLASS "${NVFUSER_ROOT}/cutlass")

# Python tree and the binding directories that live inside it.
set(NVFUSER_PYTHON_DIR "${NVFUSER_ROOT}/python")
set(NVFUSER_PYTHON_BINDINGS "${NVFUSER_PYTHON_DIR}/python_frontend")
set(NVFUSER_PYTHON_COMMON "${NVFUSER_PYTHON_DIR}/python_common")
set(NVFUSER_PYTHON_DIRECT_BINDINGS "${NVFUSER_PYTHON_DIR}/python_direct")
Expand Down Expand Up @@ -381,21 +380,6 @@ if(NOT MSVC)
)
endif()

# Extra sources that are added to NVFUSER_SRCS only for Python-enabled builds.
# The three appends keep the files in the same relative order as a single
# combined append would.
if(BUILD_PYTHON)
  # Sources from the python_frontend bindings directory.
  list(APPEND NVFUSER_SRCS
    ${NVFUSER_PYTHON_BINDINGS}/fusion_cache.cpp
    ${NVFUSER_PYTHON_BINDINGS}/fusion_definition.cpp
    ${NVFUSER_PYTHON_BINDINGS}/fusion_state.cpp
    ${NVFUSER_PYTHON_BINDINGS}/segmentation.cpp
    ${NVFUSER_PYTHON_BINDINGS}/translation.cpp
    ${NVFUSER_PYTHON_BINDINGS}/translation_utils.cpp
  )

  # Serde source from the C++ source directory.
  list(APPEND NVFUSER_SRCS
    ${NVFUSER_SRCS_DIR}/serde/fusion_record.cpp
  )

  # Sources from the python_common directory.
  list(APPEND NVFUSER_SRCS
    ${NVFUSER_PYTHON_COMMON}/distributed_tensor.cpp
    ${NVFUSER_PYTHON_COMMON}/python_utils.cpp
    ${NVFUSER_PYTHON_COMMON}/translation_names.cpp
  )
endif()

# We create both static and shared libraries.
#
# Shared libraries are what ships, but a large advantage of static libraries is
Expand Down Expand Up @@ -606,128 +590,6 @@ install(DIRECTORY "${NVFUSER_ROOT}/lib/dynamic_type/src/dynamic_type"
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvfuser")

if(BUILD_PYTHON)
# -----------------------------
# build nvfuser python library
# -----------------------------
# Translation units that make up the nvfuser Python API bindings.
set(NVFUSER_PYTHON_SRCS
  ${NVFUSER_PYTHON_BINDINGS}/multidevice_bindings.cpp
  ${NVFUSER_PYTHON_BINDINGS}/python_bindings.cpp
  ${NVFUSER_PYTHON_BINDINGS}/python_bindings_extension.cpp
  ${NVFUSER_PYTHON_BINDINGS}/schedule_bindings.cpp
)

# Object library holding the compiled Python binding sources.
add_library(nvf_py_internal OBJECT ${NVFUSER_PYTHON_SRCS})

# Header search paths used by the bindings and propagated to consumers.
target_include_directories(nvf_py_internal
  PUBLIC
    ${NVFUSER_PYTHON_DIR}
    ${NVFUSER_PYTHON_COMMON}
    ${NVFUSER_CUTLASS}
)

# Flatbuffers headers are exposed to consumers only, as a SYSTEM include path.
target_include_directories(nvf_py_internal
  SYSTEM INTERFACE
    ${CMAKE_SOURCE_DIR}/third_party/flatbuffers/include
)

# setup python API version
#
# Rule that produces ${NVFUSER_PYTHON_DIR}/nvfuser/version.py. It runs two
# commands with ${NVFUSER_PYTHON_DIR}/tools/ as the working directory:
#   1. an inline Python snippet that touches tools/gen_nvfuser_version.py;
#   2. tools/gen_nvfuser_version.py itself, with the argument `nvfuser`.
# The rule is re-evaluated when gen_nvfuser_version.py or version.txt changes.
#
# NOTE(review): the inline snippet is written as unquoted arguments wrapped in
# escaped quotes and the command has no VERBATIM, so the final command line
# depends on how the generator/shell re-joins the pieces. Keep the escaping
# exactly as is unless the whole command is reworked.
# NOTE(review): step 1 updates the timestamp of a file that is also listed in
# DEPENDS; presumably this is meant to influence when the rule reruns --
# confirm the intent before changing it.
add_custom_command(
OUTPUT ${NVFUSER_PYTHON_DIR}/nvfuser/version.py
COMMAND
"${Python_EXECUTABLE}" -c \"from pathlib import Path\; Path('${NVFUSER_PYTHON_DIR}/tools/gen_nvfuser_version.py') .touch() \"
COMMAND
"${Python_EXECUTABLE}" ${NVFUSER_PYTHON_DIR}/tools/gen_nvfuser_version.py nvfuser
DEPENDS ${NVFUSER_PYTHON_DIR}/tools/gen_nvfuser_version.py
DEPENDS ${NVFUSER_PYTHON_DIR}/version.txt
WORKING_DIRECTORY ${NVFUSER_PYTHON_DIR}/tools/
)
# Target included in the default build (ALL) that requests version.py and so
# drives the rule above.
add_custom_target(
gen_nvfuser_version ALL
DEPENDS ${NVFUSER_PYTHON_DIR}/nvfuser/version.py
)
# Build-order dependency only: gen_nvfuser_version is brought up to date
# before nvf_py_internal is built.
add_dependencies(nvf_py_internal gen_nvfuser_version)

# Python extension module assembled from the nvf_py_internal object files.
add_library(nvfuser MODULE $<TARGET_OBJECTS:nvf_py_internal>)

# Both the object library and the module receive the same private
# definitions. A leading -D on an item is stripped by
# target_compile_definitions, so the bare names below are equivalent to the
# -D-prefixed spelling.
foreach(nvf_py_target IN ITEMS nvf_py_internal nvfuser)
  target_compile_definitions(${nvf_py_target} PRIVATE
    TORCH_CUDA_BUILD_MAIN_LIB
    C10_BUILD_MAIN_LIB=1
    EXTENSION_NAME=_C
  )
endforeach()

# Pick the module file suffix and, for non-MSVC toolchains, the warning and
# section flags used when compiling the bindings.
if(MSVC)
  set(NVF_LIB_SUFFIX ".pyd")
else()
  target_compile_options(nvf_py_internal PRIVATE
    # Warnings, promoted to errors.
    -Wall
    -Wno-unused-function
    -Werror
    # Separate function/data sections so dead code can be eliminated.
    "-ffunction-sections"
    "-fdata-sections"
  )
  set(NVF_LIB_SUFFIX ".so")
endif()

# Language standards, visibility and PIC settings shared by the module and
# the object library it is built from.
set_target_properties(nvfuser nvf_py_internal PROPERTIES
  C_STANDARD ${NVFUSER_C_STANDARD}
  CUDA_STANDARD ${NVFUSER_CUDA_STANDARD}
  CXX_STANDARD ${NVFUSER_CPP_STANDARD}
  CXX_STANDARD_REQUIRED ON
  CXX_VISIBILITY_PRESET hidden
  VISIBILITY_INLINES_HIDDEN Yes
  POSITION_INDEPENDENT_CODE Yes
)

# Module-specific settings: file suffix and its own install rpath.
set_target_properties(nvfuser PROPERTIES
  SUFFIX ${NVF_LIB_SUFFIX}
  INSTALL_RPATH
    "$ORIGIN/lib:$ORIGIN/../nvfuser_common/lib:$ORIGIN/../nvidia/cuda_runtime/lib:$ORIGIN/../nvidia/cuda_nvrtc/lib:$ORIGIN/../../nvidia/cuda_cupti/lib:$ORIGIN/../torch/lib"
)

# The object library's rpath has no nvfuser_common entry.
set_target_properties(nvf_py_internal PROPERTIES
  INSTALL_RPATH
    "$ORIGIN/lib:$ORIGIN/../nvidia/cuda_runtime/lib:$ORIGIN/../nvidia/cuda_nvrtc/lib:$ORIGIN/../../nvidia/cuda_cupti/lib:$ORIGIN/../torch/lib"
)

# Link dependencies of the nvf_py_internal object library. The calls are kept
# separate and in this order; link order can matter to the linker.

# nvf_cutlass is linked only when NVFUSER_USE_CUTLASS is enabled.
if (NVFUSER_USE_CUTLASS)
target_link_libraries(nvf_py_internal PRIVATE nvf_cutlass)
endif()

# CUPTI is linked on every toolchain except MSVC.
if (NOT MSVC)
target_link_libraries(nvf_py_internal PRIVATE CUDA::cupti)
endif()

# Unconditional dependencies. libtorch_python is referenced by its full path
# under TORCH_INSTALL_PREFIX (with a hard-coded .so name) rather than through
# an imported target.
target_link_libraries(nvf_py_internal PRIVATE
nvfuser_codegen
"${TORCH_INSTALL_PREFIX}/lib/libtorch_python.so"
pybind11::pybind11 pybind11::headers
)

# The nvfuser module links the object library and Python::Module.
target_link_libraries(nvfuser PRIVATE
nvf_py_internal
Python::Module
)

# Shrink the module on non-MSVC toolchains: drop unreferenced sections and
# unneeded shared-library dependencies, and strip symbols in Release builds.
if(NOT MSVC)
  target_link_options(nvfuser
    PRIVATE
      "-Wl,--gc-sections"
      "-Wl,--as-needed"
      $<$<CONFIG:Release>:-s>
  )
endif()

# Final install rpath for the module; this replaces any INSTALL_RPATH value
# assigned to nvfuser earlier.
set_property(TARGET nvfuser PROPERTY
  INSTALL_RPATH "$ORIGIN:$ORIGIN/lib:$ORIGIN/../build:$ORIGIN/../nvfuser_common/lib"
)

# Install the module into <prefix>/lib.
install(TARGETS nvfuser DESTINATION lib)

# ------------------------------------------------
# build nvfuser direct python library
# ------------------------------------------------
Expand All @@ -750,6 +612,9 @@ if(BUILD_PYTHON)
${NVFUSER_PYTHON_DIRECT_BINDINGS}/profile.cpp
${NVFUSER_PYTHON_DIRECT_BINDINGS}/direct_utils.cpp
${NVFUSER_PYTHON_DIRECT_BINDINGS}/python_translate.cpp
${NVFUSER_PYTHON_COMMON}/distributed_tensor.cpp
${NVFUSER_PYTHON_COMMON}/python_utils.cpp
${NVFUSER_PYTHON_COMMON}/translation_names.cpp
)
add_library(nvf_py_direct_internal OBJECT ${NVFUSER_PYTHON_DIRECT_SRCS})

Expand Down Expand Up @@ -1435,9 +1300,6 @@ target_include_directories(codegen_internal PRIVATE "${CMAKE_BINARY_DIR}/include
# Install the exported target set as NvfuserConfig.cmake.
install(EXPORT NvfuserTargets
  FILE NvfuserConfig.cmake
  DESTINATION share/cmake/nvfuser
)

# Symlink <root>/bin to the build directory.
file(CREATE_LINK "${CMAKE_BINARY_DIR}" "${NVFUSER_ROOT}/bin" SYMBOLIC)

# Symlink the Python packages into the repository root. These links help IDEs
# like Cursor resolve symbols in nvfuser and nvfuser_direct.
foreach(nvf_py_pkg IN ITEMS nvfuser nvfuser_direct)
  file(CREATE_LINK
    "${NVFUSER_ROOT}/python/${nvf_py_pkg}"
    "${NVFUSER_ROOT}/${nvf_py_pkg}"
    SYMBOLIC
  )
endforeach()

message(STATUS "******** Nvfuser configuration summary ********")
Expand Down
2 changes: 1 addition & 1 deletion csrc/fusion_segmenter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4535,7 +4535,7 @@ bool SegmentCandidateFinder::privatizeUpCastOrSqueezeOp() {
// More details of the issue regarding merging horizontal groups can be
// found in issue 3829 -- https://github.com/NVIDIA/Fuser/issues/3829.
// Even with a squeeze op with 2 uses, this test case:
// https://github.com/NVIDIA/Fuser/blob/70ab277c7d91bcc24cd50dd75cedd79863a24f96/tests/python/test_python_frontend.py#L3666C1-L3666C30
// https://github.com/NVIDIA/Fuser/blob/69da2d1972eb19bf7a04cef0c4debe9f55d8e11c/tests/python/direct/test_repro.py#L801
// demonstrates that privatizing the squeeze op leads to horizontal groups
// that can't be merged back.
if (maybe_upcast_squeeze_out_tv->definition()->isA<SqueezeOp>() &&
Expand Down
3 changes: 2 additions & 1 deletion csrc/multidevice/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ std::unordered_map<ParallelType, IterDomain*> mapDeviceAndStreamParallelTypeToId
// `tv->getLogicalDomain()` map one-to-one modulo reduction. However, a size in
// `at::Tensor::sizes` is a factor of the corresponding logical IterDomain's
// extent if that IterDomain is sharded.
int64_t getShardedLogicalAxis(const TensorView* tv, ParallelType parallel_type);
NVF_API int64_t
getShardedLogicalAxis(const TensorView* tv, ParallelType parallel_type);

// Returns the IterDomain that's parallelized on `parallel_type` in the domain
// of type `domain_type`.
Expand Down
2 changes: 0 additions & 2 deletions csrc/options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,6 @@ std::unordered_map<DebugDumpOption, std::vector<std::string>> Options<
{"ptx", DebugDumpOption::Ptx},
{"ptxas_verbose", DebugDumpOption::PrintPtxasLog},
{"python_definition", DebugDumpOption::PythonDefinition},
{"python_definition_segments", DebugDumpOption::PythonDefinitionSegments},
{"python_frontend_debug", DebugDumpOption::PythonFrontendDebug},
{"sass", DebugDumpOption::Sass},
{"sass_to_file", DebugDumpOption::SassToFile},
{"segmented_fusion", DebugDumpOption::FusionSegments},
Expand Down
2 changes: 0 additions & 2 deletions csrc/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,6 @@ enum class DebugDumpOption {
PreSegmenterLogging,
HostIrLowering, //! Dump the Host IR after each lowering pass
PythonDefinition, //! Python Frontend Fusion Definition.
PythonDefinitionSegments, //! Python Frontend Fusion Definition of segments.
PythonFrontendDebug, //! Python Frontend debug information.
TransformPropagator, //! When running TransformPropagator, print propagation
//! path and replay result
Cubin, //! Dump compiled CUBIN
Expand Down
12 changes: 0 additions & 12 deletions csrc/runtime/fusion_kernel_runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
#include "instrumentation.h"
#include "ir/base_nodes.h"
#include "preseg_passes/pre_segmenter.h"
#include "python_frontend/fusion_definition.h"
#include "python_frontend/translation.h"
#include "runtime/executor.h"
#include "runtime/executor_dispatch.h"
#include "runtime/fusion_cache_utils.h"
Expand Down Expand Up @@ -430,16 +428,6 @@ void FusionKernelRuntime::compileFusionParallel(KernelArgumentHolder args) {
FusionProfiler::startCompile();
}

if (isDebugDumpEnabled(DebugDumpOption::PythonDefinitionSegments)) {
for (SegmentedGroup* group_to_run : runtime_workspace_.group_run_order) {
debug() << "Python definition for segmented group "
<< group_to_run->groupId() << ":" << std::endl;
python_frontend::FusionDefinition fd(/*id=*/std::nullopt);
python_frontend::translate(group_to_run->getFusion(), &fd);
fd.print(debug());
}
}

const std::vector<KernelArgumentHolder> all_runtime_inputs =
prepareInputs(args);

Expand Down
4 changes: 3 additions & 1 deletion csrc/serde/Serde.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

# NvFuser Serialization

## Serialization is disabled because legacy bindings are removed.

Serde is an acronym of serialization and deserialization.

# Overview
Expand Down Expand Up @@ -66,7 +68,7 @@ References:

# Serde Testing

In test_python_frontend.py, the `exec_nvfuser` function is decorated with the `serde_check` function. Every unit test should automatically test serialization.
The `exec_nvfuser` function is decorated with the `serde_check` function. Every unit test should automatically test serialization.

```python
def serde_check(test_fn: Callable):
Expand Down
Loading
Loading