diff --git a/VERSION_NUMBER b/VERSION_NUMBER index b1e80bb2480..845639eef26 100644 --- a/VERSION_NUMBER +++ b/VERSION_NUMBER @@ -1 +1 @@ -0.1.3 +0.1.4 diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index dc8899b46ef..4fb4b14c795 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -232,6 +232,7 @@ option(GGML_WEBGPU "ggml: use WebGPU" option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF) option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)" OFF) option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF) +option(GGML_BUILD_PROFILE "ggml: enable ggml trace profiler (GGML_TRACE=1)" OFF) option(GGML_WEBGPU_JSPI "ggml: use JSPI for WebGPU" ON) option(GGML_ZDNN "ggml: use zDNN" OFF) option(GGML_VIRTGPU "ggml: use the VirtGPU/Virglrenderer API Remoting frontend" OFF) diff --git a/ggml/include/ggml-profile.h b/ggml/include/ggml-profile.h index 8b0a7be4373..227ea8fc2e0 100644 --- a/ggml/include/ggml-profile.h +++ b/ggml/include/ggml-profile.h @@ -20,8 +20,8 @@ void ggml_set_current_token_idx(int idx); void ggml_trace_log_begin(const char * name, const char * cat, const char * args); void ggml_trace_log_end(const char * name, const char * cat, const char * args); -void ggml_profile_log_op_begin(struct ggml_tensor * tensor); -void ggml_profile_log_op_end(struct ggml_tensor * tensor); +void ggml_profile_log_op_begin(struct ggml_tensor * tensor, int ith, int nth); +void ggml_profile_log_op_end(struct ggml_tensor * tensor, int ith, int nth); void ggml_profile_flush_tls(void); #else @@ -31,8 +31,8 @@ void ggml_profile_flush_tls(void); #define ggml_set_current_token_idx(idx) ; #define ggml_trace_log_begin(name, cat, args) ; #define ggml_trace_log_end(name, cat, args) ; -#define ggml_profile_log_op_begin(tensor) ; -#define ggml_profile_log_op_end(tensor) ; +#define ggml_profile_log_op_begin(tensor, ith, nth) ; +#define ggml_profile_log_op_end(tensor, ith, nth) ; #define ggml_profile_flush_tls() ; #endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index c26c3f1470d..90f5f99d5f1 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -206,6 +206,8 @@ add_library(ggml-base ggml-threading.h ggml-quants.c ggml-quants.h + ggml-profile.c + ../include/ggml-profile.h gguf.cpp) set_target_properties(ggml-base PROPERTIES @@ -218,6 +220,10 @@ if (GGML_BACKEND_DL) target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL) endif() +if (GGML_BUILD_PROFILE) + target_compile_definitions(ggml-base PUBLIC GGML_BUILD_PROFILE) +endif() + if (GGML_SCHED_NO_REALLOC) target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC) endif() diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 8c735a045b3..651b5024802 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -57,6 +57,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name) target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17) target_include_directories(${GGML_CPU_NAME} PRIVATE . ggml-cpu) + if (GGML_BUILD_PROFILE) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_BUILD_PROFILE) + endif() + if (APPLE AND GGML_ACCELERATE) find_library(ACCELERATE_FRAMEWORK Accelerate) if (ACCELERATE_FRAMEWORK) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 84399ac34de..6dfdf1269a9 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -6,6 +6,7 @@ #include "traits.h" #include "ggml-cpu-impl.h" #include "ggml-impl.h" +#include "ggml-profile.h" #include "quants.h" #include "ggml-threading.h" #include "unary-ops.h" @@ -3061,6 +3062,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // TODO: move fused-op detection into ggml_graph_plan so fusion decisions are made once at planning time // Try fused ops, fall back to normal compute + ggml_profile_log_op_begin(node, state->ith, params.nth); + const int n_fused = ggml_cpu_try_fuse_ops(cgraph, node_n, ¶ms, cplan); if (n_fused > 0) { node_n += n_fused; @@ -3068,6 +3071,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); } + ggml_profile_log_op_end(node, state->ith, params.nth); + if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed); @@ -3077,8 +3082,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (node_n + 1 < cgraph->n_nodes) { ggml_barrier(state->threadpool); } + } + ggml_profile_flush_tls(); + #ifdef GGML_USE_OPENMP GGML_PRINT_DEBUG("thread #%d compute-done cplan %p\n", state->ith, (const void *)cplan); #else diff --git a/ggml/src/ggml-profile.c b/ggml/src/ggml-profile.c index d8410292ec0..76a1ef44cff 100644 --- a/ggml/src/ggml-profile.c +++ b/ggml/src/ggml-profile.c @@ -27,12 +27,16 @@ void ggml_trace_log_end(const char * name, const char * cat, const char * args) GGML_UNUSED(args); } -void ggml_profile_log_op_begin(struct ggml_tensor * t) { +void ggml_profile_log_op_begin(struct ggml_tensor * t, int ith, int nth) { GGML_UNUSED(t); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } -void ggml_profile_log_op_end(struct ggml_tensor * t) { +void ggml_profile_log_op_end(struct ggml_tensor * t, int ith, int nth) { GGML_UNUSED(t); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } void ggml_profile_flush_tls(void) { @@ -51,10 +55,14 @@ void ggml_profile_flush_trace(void) { static int g_trace_enabled = 0; static pthread_once_t g_trace_once = PTHREAD_ONCE_INIT; +void ggml_profile_flush_trace(void); + static void ggml_trace_parse_env(void) { const char * env = getenv("GGML_TRACE"); if (env && (env[0] == '1' || env[0] == 'y' || env[0] == 'Y')) { g_trace_enabled = 1; + // ensure the trace file gets its closing "]" and is flushed/closed on exit + atexit(ggml_profile_flush_trace); } else { g_trace_enabled = 0; } @@ -83,6 +91,7 @@ static FILE * g_trace_file = NULL; __thread char g_trace_tls_buffer[TRACE_BUFFER_SIZE]; __thread size_t g_trace_tls_offset = 0; +__thread int g_trace_worker_named = 0; int g_current_token_idx = -1; @@ -223,33 +232,85 @@ void ggml_trace_log_end(const char * name, const char * cat, const char * args) ggml_trace_write(buf); } -void ggml_profile_log_op_begin(struct ggml_tensor * t) { +static void ggml_trace_log_begin_on_lane(const char * name, const char * cat, const char * args, int pid, int tid) { + if (!TRACE_ENABLED()) { + return; + } + + char buf[1024]; + + snprintf(buf, sizeof(buf), + "{\"name\":\"%s\",\"cat\":\"%s\",\"ph\":\"B\"," + "\"ts\":%ld,\"pid\":%d,\"tid\":%d,\"args\":{%s}},\n", + name, cat, ggml_trace_now_us(), pid, tid, args ? args : ""); + + ggml_trace_write(buf); +} + +static void ggml_trace_log_end_on_lane(const char * name, const char * cat, const char * args, int pid, int tid) { + if (!TRACE_ENABLED()) { + return; + } + + char buf[1024]; + + snprintf(buf, sizeof(buf), + "{\"name\":\"%s\",\"cat\":\"%s\",\"ph\":\"E\"," + "\"ts\":%ld,\"pid\":%d,\"tid\":%d,\"args\":{%s}},\n", + name, cat, ggml_trace_now_us(), pid, tid, args ? args : ""); + + ggml_trace_write(buf); +} + +static void ggml_trace_log_worker_name(int ith) { + if (!TRACE_ENABLED() || g_trace_worker_named) { + return; + } + + char buf[256]; + snprintf(buf, sizeof(buf), + "{\"name\":\"thread_name\",\"ph\":\"M\",\"pid\":2,\"tid\":%d," + "\"args\":{\"name\":\"ggml-worker-%d\"}},\n", + ith, ith); + + ggml_trace_write(buf); + g_trace_worker_named = 1; +} + +void ggml_profile_log_op_begin(struct ggml_tensor * t, int ith, int nth) { if (!TRACE_ENABLED()) { return; } + ggml_trace_log_worker_name(ith); + char args[1024]; ggml_format_op_args(t, NULL, args, sizeof(args)); + size_t args_len = strlen(args); + snprintf(args + args_len, sizeof(args) - args_len, ",\"ith\":%d,\"nth\":%d", ith, nth); + char name[128], tname[64]; ggml_json_escape(ggml_tensor_name_safe(t), tname, sizeof(tname)); snprintf(name, sizeof(name), "%s (%s)", ggml_op_name(t->op), tname); - ggml_trace_log_begin(name, "Operator", args); + ggml_trace_log_begin_on_lane(name, "Operator", args, 2, ith); } -void ggml_profile_log_op_end(struct ggml_tensor * t) { +void ggml_profile_log_op_end(struct ggml_tensor * t, int ith, int nth) { if (!TRACE_ENABLED()) { return; } + GGML_UNUSED(nth); + char name[128], tname[64]; ggml_json_escape(ggml_tensor_name_safe(t), tname, sizeof(tname)); snprintf(name, sizeof(name), "%s (%s)", ggml_op_name(t->op), tname); - ggml_trace_log_end(name, "Operator", NULL); + ggml_trace_log_end_on_lane(name, "Operator", NULL, 2, ith); } void ggml_profile_flush_tls(void) {