diff --git a/CMakeLists.txt b/CMakeLists.txt index 0fe46d0a5ef..a3cf59b59e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,8 @@ option(USE_CLANG_TIDY "" OFF) option(BUILD_PYTHON "" ON) option(BUILD_CPP_API "Option to build OneFlow C++ API (beta)" OFF) option(BUILD_RDMA "" OFF) -option(BUILD_CUDA "" ON) +option(BUILD_CUDA "" OFF) +option(BUILD_NPU "Huawei Ascend NPU" ON) option(BUILD_TESTING "" OFF) option(BUILD_GIT_VERSION "" ON) option(BUILD_PROFILER "" OFF) @@ -40,6 +41,10 @@ option(OF_FORCE_COLORED_DIAGNOSTICS "Always produce ANSI-colored diagnostics (GN set(ONEFLOW_CURRENT_VERSION 0.8.1.dev CACHE STRING "") +if(BUILD_NPU) + add_definitions(-DWITH_NPU) +endif() + if(BUILD_FOR_CI) set(ONEFLOW_CURRENT_VERSION ci) endif() diff --git a/cmake/ascend_npu.cmake b/cmake/ascend_npu.cmake new file mode 100644 index 00000000000..9108b80a30a --- /dev/null +++ b/cmake/ascend_npu.cmake @@ -0,0 +1,79 @@ +# The following are set after configuration is done: +# ASCEND_INCLUDE_DIRS +# ASCEND_LIBRARIES + + +if(NOT DEFINED ENV{ASCEND_TOOLKIT_HOME}) + message(WARNING "ASCEND_TOOLKIT_HOME env is not found. Setting default value: /usr/local/Ascend/ascend-toolkit/latest") + set(ASCEND_TOOLKIT_HOME "/usr/local/Ascend/ascend-toolkit/latest" CACHE PATH "Folder contains Ascend toolkit") +else() + # get ASCEND_TOOLKIT_HOME from environment + message(STATUS "ASCEND_TOOLKIT_HOME found: $ENV{ASCEND_TOOLKIT_HOME}") + set(ASCEND_TOOLKIT_HOME $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH "Folder contains Ascend toolkit") +endif() + + +find_path( + ASCEND_INCLUDE_DIRS + NAMES acl hccl + PATHS $ENV{ASCEND_TOOLKIT_HOME}/include $ENV{CPLUS_INCLUDE_PATH} + PATH_SUFFIXES include) + +if(ASCEND_INCLUDE_DIRS) + message(STATUS "ASCEND_INCLUDE_DIRS found: ${ASCEND_INCLUDE_DIRS}") + execute_process(COMMAND source ${ASCEND_HOME_DIR}/bin/setenv.bash) +else() + message( + FATAL_ERROR + "Huawei Ascend header files are not found. Please set ASCEND_TOOLKIT_HOME to specify the search path." + ) +endif() + +find_library( + ASCEND_LD_LIBRARIES + NAMES ascendcl + PATHS ${ASCEND_TOOLKIT_HOME} $ENV{ASCEND_TOOLKIT_HOME}/lib64 + $ENV{LD_LIBRARY_PATH}) + + +if(ASCEND_LD_LIBRARIES) + message(STATUS "ASCEND_LD_LIBRARIES found: ${ASCEND_LD_LIBRARIES}") +else() + message( + FATAL_ERROR + "ASCEND_LD_LIBRARIES Ascend lib(ascendcl) is not found. Please set ASCEND_TOOLKIT_HOME to specify the search path." + ) +endif() + +find_library( + ASCEND_OP_COMPILER_LD_LIBRARIE + NAMES acl_op_compiler + PATHS ${ASCEND_TOOLKIT_HOME} $ENV{ASCEND_TOOLKIT_HOME}/lib64 + $ENV{LD_LIBRARY_PATH}) + +if(NOT ASCEND_OP_COMPILER_LD_LIBRARIE) + message( + FATAL_ERROR + "ASCEND_OP_COMPILER_LD_LIBRARIE Ascend lib(acl_op_compiler) is not found. Please set ASCEND_TOOLKIT_HOME to specify the search path." + ) +endif() + +find_library( + ASCEND_HCCL_LD_LIBRARIE + NAMES hccl + PATHS ${ASCEND_TOOLKIT_HOME} $ENV{ASCEND_TOOLKIT_HOME}/lib64 + $ENV{LD_LIBRARY_PATH}) + +if(NOT ASCEND_HCCL_LD_LIBRARIE) + message( + FATAL_ERROR + "ASCEND_HCCL_LD_LIBRARIE Ascend lib(hccl) is not found. Please set ASCEND_TOOLKIT_HOME to specify the search path." + ) +endif() + +set(ASCEND_INCLUDE_DIRS ${ASCEND_INCLUDE_DIRS}) +set(ASCEND_LIBRARIES ${ASCEND_LD_LIBRARIES} ${ASCEND_HCCL_LD_LIBRARIE} + ${ASCEND_OP_COMPILER_LD_LIBRARIE}) + +message(STATUS "Ascend: ASCEND_INCLUDE_DIRS = ${ASCEND_INCLUDE_DIRS}") +message(STATUS "Ascend: ASCEND_LIBRARIES = ${ASCEND_LIBRARIES}") diff --git a/cmake/oneflow.cmake b/cmake/oneflow.cmake index b37535367e1..3521d2b4425 100644 --- a/cmake/oneflow.cmake +++ b/cmake/oneflow.cmake @@ -317,6 +317,12 @@ add_definitions(-DONEFLOW_BINARY_DIR="${PROJECT_BINARY_DIR}") include(op_schema) +if(BUILD_NPU) + include(${PROJECT_SOURCE_DIR}/cmake/ascend_npu.cmake) + target_include_directories(oneflow PRIVATE ${ASCEND_INCLUDE_DIRS}) + target_link_libraries(oneflow ${ASCEND_LIBRARIES}) +endif() + get_property(EXTERNAL_TARGETS GLOBAL PROPERTY EXTERNAL_TARGETS) if(APPLE) diff --git a/oneflow/core/profiler/acl_profiler.cpp b/oneflow/core/profiler/acl_profiler.cpp new file mode 100644 index 00000000000..2ca0c8770df --- /dev/null +++ b/oneflow/core/profiler/acl_profiler.cpp @@ -0,0 +1,150 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#if defined(WITH_NPU) +#include +#include +#include +#include "oneflow/core/profiler/acl_profiler.h" + +namespace oneflow { +namespace profiler { + +std::map npu_metrics_map_ = { + {"ACL_AICORE_PIPE_UTILIZATION", ACL_AICORE_PIPE_UTILIZATION}, + {"ACL_AICORE_ARITHMETIC_UTILIZATION", ACL_AICORE_ARITHMETIC_UTILIZATION}, + {"ACL_AICORE_MEMORY_BANDWIDTH", ACL_AICORE_MEMORY_BANDWIDTH}, + {"ACL_AICORE_L0B_AND_WIDTH", ACL_AICORE_L0B_AND_WIDTH}, + {"ACL_AICORE_RESOURCE_CONFLICT_RATIO", ACL_AICORE_RESOURCE_CONFLICT_RATIO}, + {"ACL_AICORE_MEMORY_UB", ACL_AICORE_MEMORY_UB}, + {"ACL_AICORE_L2_CACHE", ACL_AICORE_L2_CACHE}, + {"ACL_AICORE_NONE", ACL_AICORE_NONE}, +}; + +std::map trace_level_map_ = { + {"Level0", Level0}, + {"Level1", Level1}, + {"Level2", Level2}, + {"Level_none", Level_none}, +}; + +aclError AclProfilingInit(const char* profilerResultPath, size_t length) { + return aclprofInit(profilerResultPath, length); +} + +aclError AclProfilingStart(const aclprofConfig* profilerConfig) { + return aclprofStart(profilerConfig); +} + +aclError AclProfilingStop(const aclprofConfig* profilerConfig) { + return aclprofStop(profilerConfig); +} + +aclError AclProfilingFinalize() { return aclprofFinalize(); } + +aclprofConfig* AclProfilingCreateConfig(uint32_t* deviceIdList, uint32_t deviceNums, + aclprofAicoreMetrics aicoreMetrics, + aclprofAicoreEvents* aicoreEvents, + uint64_t dataTypeConfig) { + return aclprofCreateConfig(deviceIdList, deviceNums, aicoreMetrics, aicoreEvents, dataTypeConfig); +} + +aclError AclprofSetConfig(aclprofConfigType configType, const char* config, size_t configLength) { + return aclprofSetConfig(configType, config, configLength); +} + +aclError AclProfilingDestroyConfig(const aclprofConfig* profilerConfig) { + return aclprofDestroyConfig(profilerConfig); +} + +aclprofConfig* AclPrepareTrace() { + // ref: torch_npu/csrc/profiler/profiler_mgr.cpp + char* profiler_log_dir_env_var = getenv("ASCEND_PROFILER_LOG_DIR"); + if (profiler_log_dir_env_var == nullptr) { + char* env_var = getenv("ASCEND_TOOLKIT_HOME"); + std::string ascend_home_path(env_var); + AclProfilingInit(ascend_home_path.c_str(), ascend_home_path.size()); + } else { + std::string profiler_log_dir(profiler_log_dir_env_var); + AclProfilingInit(profiler_log_dir.c_str(), profiler_log_dir.size()); + } + + // torch_npu/profiler/profiler.py + // torch_npu/profiler/experimental_config.py + NpuTraceConfig npu_config = { + /*trace_level*/ "Level2", /*metrics*/ "ACL_AICORE_PIPE_UTILIZATION", + /*npu_memory*/ true, /*l2_cache*/ false, + /*record_op_args*/ true, + /*msprof_tx*/ true, /*op_attr*/ false}; + aclprofAicoreMetrics aic_metrics = ACL_AICORE_NONE; + auto level_iter = trace_level_map_.find(npu_config.trace_level); + uint64_t datatype_config = + (level_iter == trace_level_map_.end()) ? Level0 : trace_level_map_[npu_config.trace_level]; + auto metrics_iter = npu_metrics_map_.find(npu_config.metrics); + if (metrics_iter != npu_metrics_map_.end() + && npu_config.metrics.compare("ACL_AICORE_NONE") != 0) { + datatype_config |= ACL_PROF_AICORE_METRICS; + aic_metrics = npu_metrics_map_[npu_config.metrics]; + } + if (npu_config.l2_cache) { datatype_config |= ACL_PROF_L2CACHE; } + if (npu_config.msprof_tx) { datatype_config |= ACL_PROF_MSPROFTX; } + if (npu_config.npu_memory) { + datatype_config |= ACL_PROF_TASK_MEMORY; + const std::string freq = "50"; + auto prof_ret = AclprofSetConfig(ACL_PROF_SYS_HARDWARE_MEM_FREQ, freq.c_str(), freq.size()); + if (prof_ret == ACL_ERROR_PROF_MODULES_UNSUPPORTED) { + LOG(WARNING) << "ProfileManager npu AclprofSetConfig() failed: " + << "not support to set config for sys-hardware-mem."; + } + } + // op_attr=true has bug + if (npu_config.op_attr) { datatype_config |= ACL_PROF_OP_ATTR; } + + uint32_t deviceId = 0; + // TODO: get current local device + // auto ret = c10_npu::GetDevice(&deviceId); + // if (ret != ACL_ERROR_NONE) { + // LOG(WARNING) <<"ProfileManager npu AclprofSetConfig() failed: " << "Get Device ID failed."; + // return; + // } + const uint32_t deviceNum = 1; + uint32_t deviceIdList[deviceNum] = {deviceId}; + aclprofConfig* profConfig = + AclProfilingCreateConfig(deviceIdList, deviceNum, aic_metrics, nullptr, datatype_config); + return profConfig; +} + +aclError AclStartTrace(aclprofConfig* profConfig) { return AclProfilingStart(profConfig); } + +void AclReleaseTrace(aclprofConfig* profConfig) { + aclrtSynchronizeDevice(); + // stop + AclProfilingStop(profConfig); + auto ret = AclProfilingDestroyConfig(profConfig); + if (ret != ACL_SUCCESS) { + LOG(WARNING) << "ProfileManager npu AclReleaseTrace() failed: " + << "AclProfDestoryConfig fail, error code: " << ret; + return; + } + profConfig = nullptr; + + // finalize + AclProfilingFinalize(); +} + +} // namespace profiler +} // namespace oneflow + +#endif // WITH_NPU diff --git a/oneflow/core/profiler/acl_profiler.h b/oneflow/core/profiler/acl_profiler.h new file mode 100644 index 00000000000..5a49c556c62 --- /dev/null +++ b/oneflow/core/profiler/acl_profiler.h @@ -0,0 +1,62 @@ +/* +Copyright 2020 The OneFlow Authors. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#if defined(WITH_NPU) +#include +#include +#include +#include "acl/acl.h" +#include "acl/acl_prof.h" + +namespace oneflow { +namespace profiler { + +// trace_level +constexpr uint64_t Level_none = 0; +constexpr uint64_t Level0 = ACL_PROF_TASK_TIME_L0 | ACL_PROF_ACL_API; +constexpr uint64_t Level1 = + ACL_PROF_TASK_TIME | ACL_PROF_ACL_API | ACL_PROF_HCCL_TRACE | ACL_PROF_AICORE_METRICS; +constexpr uint64_t Level2 = Level1 | ACL_PROF_RUNTIME_API | ACL_PROF_AICPU; + +struct NpuTraceConfig { + std::string trace_level; + std::string metrics; + bool npu_memory; + bool l2_cache; + bool record_op_args; + bool msprof_tx; + bool op_attr; +}; + +#define ACL_PROF_OP_ATTR 0x00004000ULL + +aclError AclProfilingInit(const char* profilerResultPath, size_t length); +aclError AclProfilingStart(const aclprofConfig* profilerConfig); +aclError AclProfilingStop(const aclprofConfig* profilerConfig); +aclError AclProfilingFinalize(); +aclprofConfig* AclProfilingCreateConfig(uint32_t* deviceIdList, uint32_t deviceNums, + aclprofAicoreMetrics aicoreMetrics, + aclprofAicoreEvents* aicoreEvents, uint64_t dataTypeConfig); +aclError AclprofSetConfig(aclprofConfigType configType, const char* config, size_t configLength); +aclError AclProfilingDestroyConfig(const aclprofConfig* profilerConfig); + +aclprofConfig* AclPrepareTrace(); +aclError AclStartTrace(aclprofConfig* profConfig); +void AclReleaseTrace(aclprofConfig* profConfig); + +} // namespace profiler +} // namespace oneflow + +#endif // WITH_NPU diff --git a/oneflow/core/profiler/event.cpp b/oneflow/core/profiler/event.cpp index dd14f454374..135e0ca0e1e 100644 --- a/oneflow/core/profiler/event.cpp +++ b/oneflow/core/profiler/event.cpp @@ -30,15 +30,36 @@ void IEvent::SetStartedAt(double t) { started_at_ = t; } void IEvent::SetFinishedAt(double t) { finished_at_ = t; } -void IEvent::Start() { SetStartedAt(GetTimeNow()); } +void IEvent::Start() { + SetStartedAt(GetTimeNow( +#ifdef WITH_NPU + true +#else + false +#endif + )); +} -void IEvent::Finish() { SetFinishedAt(GetTimeNow()); } +void IEvent::Finish() { + SetFinishedAt(GetTimeNow( +#ifdef WITH_NPU + true +#else + false +#endif + )); +} bool IEvent::IsChildOf(const IEvent* e) { if (!e) { return false; } if (this == e) { return false; } - return GetStartedAt() >= e->GetStartedAt() - && GetFinishedAt() <= e->GetFinishedAt(); +#ifdef WITH_NPU + const auto time_unit = EventTimeUnit::kNS; +#else + const auto time_unit = EventTimeUnit::kUS; +#endif + return GetStartedAt(time_unit) >= e->GetStartedAt(time_unit) + && GetFinishedAt(time_unit) <= e->GetFinishedAt(time_unit); } const std::string& IEvent::GetName() const { return name_; } @@ -60,10 +81,12 @@ nlohmann::json KernelEvent::ToJson() { for (const auto& desc : description_) { j["description"][desc.first] = {desc.second.first, desc.second.second}; } -#if defined(WITH_CUDA) +#if defined(WITH_CUDA) || defined(WITH_NPU) +#ifdef WITH_CUDA j["memory_size"] = memory_size_; - if (!children_.empty()) { j["children"] = children_; } #endif // WITH_CUDA + if (!children_.empty()) { j["children"] = children_; } +#endif // WITH_CUDA || WITH_NPU return j; } diff --git a/oneflow/core/profiler/event.h b/oneflow/core/profiler/event.h index bfbabff3ef2..68561300e89 100644 --- a/oneflow/core/profiler/event.h +++ b/oneflow/core/profiler/event.h @@ -34,9 +34,10 @@ enum class EventType { kOneflowKernel // OneFlow cpu/cuda kernel }; enum class CustomEventType { - kDefault, // for record_function - kCudaKernel, // cuda kernel - kCudaRuntime // something like cudaLaunchKernel + kDefault, // for record_function + kCudaKernel, // cuda kernel + kCudaRuntime, // something like cudaLaunchKernel + kNpuKernel // huawei ascend npu kernel }; enum class EventTimeUnit { kNS, kUS }; @@ -125,7 +126,12 @@ class CustomEvent final : public IEvent { CustomEventType type_; CustomEvent(const std::string& custom_name, CustomEventType type) : IEvent(custom_name, - type == CustomEventType::kDefault ? EventTimeUnit::kNS : EventTimeUnit::kUS), + [type]() { + if (type == CustomEventType::kDefault || type == CustomEventType::kNpuKernel) { + return EventTimeUnit::kNS; + } + return EventTimeUnit::kUS; + }()), type_(type) {} }; @@ -138,8 +144,10 @@ class KernelEvent final : public IEvent { static std::shared_ptr Create(const std::string& name, const Description& description); -#if defined(WITH_CUDA) +#if defined(WITH_CUDA) || defined(WITH_NPU) +#ifdef WITH_CUDA void SetMemorySize(int64_t memory_size) { memory_size_ = memory_size; } +#endif // WITH_CUDA void AddChildEvent(const std::shared_ptr& e) { children_.emplace(e); } bool AddChildEventIfSo(const std::shared_ptr& e) { if (e->IsChildOf(dynamic_cast(this))) { @@ -152,17 +160,18 @@ class KernelEvent final : public IEvent { void WalkAmongChildren(const std::function& e)>& f) const { for (const auto& x : children_) { f(x); } } -#endif // WITH_CUDA +#endif // WITH_CUDA || WITH_NPU private: KernelEvent(const std::string& kernel_name, const Description& description) : IEvent(kernel_name, EventTimeUnit::kNS), description_(description) {} -#if defined(WITH_CUDA) +#if defined(WITH_CUDA) || defined(WITH_NPU) +#ifdef WITH_CUDA int64_t memory_size_ = -1; - std::set> children_; #endif // WITH_CUDA - + std::set> children_; +#endif // WITH_CUDA || WITH_NPU const Description description_; }; diff --git a/oneflow/core/profiler/profile_manager.cpp b/oneflow/core/profiler/profile_manager.cpp index 271b84890bb..157c2258aba 100644 --- a/oneflow/core/profiler/profile_manager.cpp +++ b/oneflow/core/profiler/profile_manager.cpp @@ -43,11 +43,17 @@ void ProfileManager::UnregisterEventRecorder(const std::string& event_recorder_k } std::string ProfileManager::DumpResultsJson() { - const json j = ExportEvents(); +#ifdef WITH_NPU + AclReleaseTrace(profConfig_); +#else + ProcessRawEvents(); +#endif + const json j = events_result_; + decltype(events_result_)().swap(events_result_); return j.dump(); } -std::vector> ProfileManager::ExportEvents() { +void ProfileManager::ProcessRawEvents() { #if defined(WITH_CUDA) auto trace = StopTrace(); const auto& kineto_events = *(trace.get()->activities()); @@ -73,7 +79,6 @@ std::vector> ProfileManager::ExportEvents() { } } #endif // WITH_CUDA - std::vector> events; while (!events_.empty()) { auto evt = events_.front(); events_.pop(); @@ -95,9 +100,8 @@ std::vector> ProfileManager::ExportEvents() { } } #endif // WITH_CUDA - events.emplace_back(evt); + events_result_.emplace_back(evt); } - return events; } std::string ProfileManager::GetNextEventRecorderKey(const std::string& name) { diff --git a/oneflow/core/profiler/profile_manager.h b/oneflow/core/profiler/profile_manager.h index d6246e9aed7..a4bf54356ed 100644 --- a/oneflow/core/profiler/profile_manager.h +++ b/oneflow/core/profiler/profile_manager.h @@ -20,8 +20,13 @@ limitations under the License. #include #include #include +#include #include "oneflow/core/profiler/kineto_shim.h" +#if defined(WITH_NPU) +#include "oneflow/core/profiler/acl_profiler.h" +#endif + namespace oneflow { namespace profiler { @@ -32,10 +37,11 @@ class ProfileManager { public: friend class EventRecorder; - ProfileManager(bool use_cpu, bool use_cuda, bool record_shapes, bool record_attrs, + ProfileManager(bool use_cpu, bool use_cuda, bool use_npu, bool record_shapes, bool record_attrs, bool record_bandwidth) : use_cpu_(use_cpu), use_cuda_(use_cuda), + use_npu_(use_npu), record_shapes_(record_shapes), record_attrs_(record_attrs), record_bandwidth_(record_bandwidth) { @@ -46,6 +52,19 @@ class ProfileManager { PrepareTrace(/*cpuOnly*/ false, activities); StartTrace(); #endif // WITH_CUDA +#if defined(WITH_NPU) + profConfig_ = AclPrepareTrace(); + if (profConfig_ == nullptr) { + LOG(ERROR) << "ProfileManager npu AclProfilingCreateConfig() failed: " + << "Create Prof Config failed."; + } + auto ret = AclStartTrace(profConfig_); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "ProfileManager npu AclProfilingStart() failed: " + << "Profiling start failed, error code:" << ret; + } + +#endif // WITH_NPU } std::string RegisterEventRecorder(const std::shared_ptr& event_recorder, @@ -56,17 +75,23 @@ class ProfileManager { private: bool use_cpu_; bool use_cuda_; + bool use_npu_; bool record_shapes_; bool record_attrs_; bool record_bandwidth_; std::queue> events_; + std::vector> events_result_; std::unordered_map> event_recorders_; // To prevent releasing EventRecorders of the same name. std::unordered_map event_recorders_last_id_; std::string GetNextEventRecorderKey(const std::string& name); - std::vector> ExportEvents(); + void ProcessRawEvents(); + +#if defined(WITH_NPU) + aclprofConfig* profConfig_; +#endif }; } // namespace profiler diff --git a/oneflow/core/profiler/profiler.cpp b/oneflow/core/profiler/profiler.cpp index 658c37baf7e..32277bea7e2 100644 --- a/oneflow/core/profiler/profiler.cpp +++ b/oneflow/core/profiler/profiler.cpp @@ -92,11 +92,11 @@ void ProfilerStop() { #endif // OF_ENABLE_PROFILER } -void EnableProfiler(bool use_cpu, bool use_cuda, bool record_shapes, bool record_attrs, - bool record_bandwidth) { +void EnableProfiler(bool use_cpu, bool use_cuda, bool use_npu, bool record_shapes, + bool record_attrs, bool record_bandwidth) { CHECK_JUST(vm::ClusterSync()); if (Singleton::Get() == nullptr) { - Singleton::New(use_cpu, use_cuda, record_shapes, record_attrs, + Singleton::New(use_cpu, use_cuda, use_npu, record_shapes, record_attrs, record_bandwidth); } } @@ -107,6 +107,9 @@ Maybe DisableProfilerAndReturnResult() { #if defined(WITH_CUDA) OF_CUDA_CHECK(cudaDeviceSynchronize()); #endif // WITH_CUDA +#if defined(WITH_NPU) + aclrtSynchronizeDevice(); +#endif // WITH_NPU auto* pmgr = JUST(SingletonMaybe()); std::string results = pmgr->DumpResultsJson(); Singleton::Delete(); diff --git a/oneflow/core/profiler/profiler.h b/oneflow/core/profiler/profiler.h index 51d7ed3e373..30b5d3a092d 100644 --- a/oneflow/core/profiler/profiler.h +++ b/oneflow/core/profiler/profiler.h @@ -17,6 +17,9 @@ limitations under the License. #define ONEFLOW_CORE_PROFILER_PROFILER_H_ #include "oneflow/core/common/util.h" +#if defined(WITH_NPU) +#include "acl/acl.h" +#endif namespace oneflow { @@ -63,8 +66,8 @@ class RangeGuard final { #define OF_PROFILER_LOG_HOST_MEMORY_USAGE(name) #endif -void EnableProfiler(bool use_cpu, bool use_cuda, bool record_shapes, bool record_attrs, - bool record_bandwidth); +void EnableProfiler(bool use_cpu, bool use_cuda, bool use_npu, bool record_shapes, + bool record_attrs, bool record_bandwidth); // DisableProfilerAndReturnResult will return a json of profile results. Maybe DisableProfilerAndReturnResult(); diff --git a/python/oneflow/profiler/events.py b/python/oneflow/profiler/events.py index 9585202e4c6..2fc16c66e5b 100644 --- a/python/oneflow/profiler/events.py +++ b/python/oneflow/profiler/events.py @@ -33,6 +33,7 @@ class CustomEventType(Enum): Default = 0 CudaKernel = 1 CudaRuntime = 2 + NpuKernel = 3 class EventBase: @@ -84,10 +85,15 @@ def cuda_time(self): def has_cuda_time(self) -> bool: return self.cuda_time_total is not None + @property + def key(self): + return (self.name, self.has_cuda_time()) + def __eq__(self, __o: object) -> bool: + if self.key != __o.key: + return False return ( - self.name == __o.name - and self.count == __o.count + self.count == __o.count and self.cpu_time_total == __o.cpu_time_total and self.cuda_time_total == __o.cuda_time_total ) @@ -106,11 +112,14 @@ def from_dict(cls, d: dict): @property def key(self): - return self.name, self.custom_event_type + return super().key + (self.custom_event_type,) @property def cuda_time_total(self): - if self.custom_event_type == CustomEventType.CudaKernel: + if ( + self.custom_event_type == CustomEventType.CudaKernel + or self.custom_event_type == CustomEventType.NpuKernel + ): return self._time_total return None @@ -177,9 +186,9 @@ def get_extra_keys(): return tuple(extra_keys) if len(self.children) == 0: - return (self.name,) + get_extra_keys() + return super().key + get_extra_keys() return ( - self.name, + *(super().key), *get_extra_keys(), ",".join([x.name for x in self.children]), ) diff --git a/python/oneflow/profiler/profiler.py b/python/oneflow/profiler/profiler.py index d8962c22872..93a3bd177bc 100644 --- a/python/oneflow/profiler/profiler.py +++ b/python/oneflow/profiler/profiler.py @@ -22,6 +22,7 @@ class ProfilerActivity(Enum): CPU = 1 CUDA = 2 + NPU = 3 class ProfilerAction(Enum): @@ -43,6 +44,7 @@ def supported_activities() -> Set[ProfilerActivity]: activities = set([ProfilerActivity.CPU]) if oneflow.cuda.is_available(): activities.add(ProfilerActivity.CUDA) + activities.add(ProfilerActivity.NPU) return activities @@ -75,6 +77,7 @@ def __enter__(self): oneflow._oneflow_internal.profiler.EnableProfiler( ProfilerActivity.CPU in self.activities, ProfilerActivity.CUDA in self.activities, + ProfilerActivity.NPU in self.activities, self.record_shapes, self.record_attrs, self.record_bandwidth_for_cuda,