diff --git a/VERSION_NUMBER b/VERSION_NUMBER
index b1e80bb2480..845639eef26 100644
--- a/VERSION_NUMBER
+++ b/VERSION_NUMBER
@@ -1 +1 @@
-0.1.3
+0.1.4
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index dc8899b46ef..4fb4b14c795 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -232,6 +232,7 @@ option(GGML_WEBGPU                          "ggml: use WebGPU"
 option(GGML_WEBGPU_DEBUG                    "ggml: enable WebGPU debug output"                OFF)
 option(GGML_WEBGPU_CPU_PROFILE              "ggml: enable WebGPU profiling (CPU)"             OFF)
 option(GGML_WEBGPU_GPU_PROFILE              "ggml: enable WebGPU profiling (GPU)"             OFF)
+option(GGML_BUILD_PROFILE                   "ggml: enable ggml trace profiler (GGML_TRACE=1)" OFF)
 option(GGML_WEBGPU_JSPI                     "ggml: use JSPI for WebGPU"                       ON)
 option(GGML_ZDNN                            "ggml: use zDNN"                                  OFF)
 option(GGML_VIRTGPU                         "ggml: use the VirtGPU/Virglrenderer API Remoting frontend"     OFF)
diff --git a/ggml/include/ggml-profile.h b/ggml/include/ggml-profile.h
index 8b0a7be4373..227ea8fc2e0 100644
--- a/ggml/include/ggml-profile.h
+++ b/ggml/include/ggml-profile.h
@@ -20,8 +20,8 @@ void ggml_set_current_token_idx(int idx);
 void ggml_trace_log_begin(const char * name, const char * cat, const char * args);
 void ggml_trace_log_end(const char * name, const char * cat, const char * args);
 
-void ggml_profile_log_op_begin(struct ggml_tensor * tensor);
-void ggml_profile_log_op_end(struct ggml_tensor * tensor);
+void ggml_profile_log_op_begin(struct ggml_tensor * tensor, int ith, int nth);
+void ggml_profile_log_op_end(struct ggml_tensor * tensor, int ith, int nth);
 
 void ggml_profile_flush_tls(void);
 #else
@@ -31,8 +31,8 @@ void ggml_profile_flush_tls(void);
 #define ggml_set_current_token_idx(idx) ;
 #define ggml_trace_log_begin(name, cat, args) ;
 #define ggml_trace_log_end(name, cat, args) ;
-#define ggml_profile_log_op_begin(tensor) ;
-#define ggml_profile_log_op_end(tensor) ;
+#define ggml_profile_log_op_begin(tensor, ith, nth) ;
+#define ggml_profile_log_op_end(tensor, ith, nth) ;
 #define ggml_profile_flush_tls() ;
 #endif
 
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index c26c3f1470d..90f5f99d5f1 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -206,6 +206,8 @@ add_library(ggml-base
             ggml-threading.h
             ggml-quants.c
             ggml-quants.h
+            ggml-profile.c
+            ../include/ggml-profile.h
             gguf.cpp)
 
 set_target_properties(ggml-base PROPERTIES
@@ -218,6 +220,10 @@ if (GGML_BACKEND_DL)
     target_compile_definitions(ggml-base PUBLIC GGML_BACKEND_DL)
 endif()
 
+if (GGML_BUILD_PROFILE)
+    target_compile_definitions(ggml-base PUBLIC GGML_BUILD_PROFILE)
+endif()
+
 if (GGML_SCHED_NO_REALLOC)
     target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
 endif()
diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 8c735a045b3..651b5024802 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -57,6 +57,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
     target_include_directories(${GGML_CPU_NAME} PRIVATE . ggml-cpu)
 
+    if (GGML_BUILD_PROFILE)
+        target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_BUILD_PROFILE)
+    endif()
+
     if (APPLE AND GGML_ACCELERATE)
         find_library(ACCELERATE_FRAMEWORK Accelerate)
         if (ACCELERATE_FRAMEWORK)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 84399ac34de..6dfdf1269a9 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -6,6 +6,7 @@
 #include "traits.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-impl.h"
+#include "ggml-profile.h"
 #include "quants.h"
 #include "ggml-threading.h"
 #include "unary-ops.h"
@@ -3061,6 +3062,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
         // TODO: move fused-op detection into ggml_graph_plan so fusion decisions are made once at planning time
         // Try fused ops, fall back to normal compute
+        ggml_profile_log_op_begin(node, state->ith, params.nth);
+
         const int n_fused = ggml_cpu_try_fuse_ops(cgraph, node_n, &params, cplan);
         if (n_fused > 0) {
             node_n += n_fused;
@@ -3068,6 +3071,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             ggml_compute_forward(&params, node);
         }
 
+        ggml_profile_log_op_end(node, state->ith, params.nth);
+
         if (state->ith == 0 && cplan->abort_callback &&
                 cplan->abort_callback(cplan->abort_callback_data)) {
             atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
@@ -3077,8 +3082,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         if (node_n + 1 < cgraph->n_nodes) {
             ggml_barrier(state->threadpool);
         }
+
     }
 
+    ggml_profile_flush_tls();
+
 #ifdef GGML_USE_OPENMP
     GGML_PRINT_DEBUG("thread #%d compute-done cplan %p\n", state->ith, (const void *)cplan);
 #else
diff --git a/ggml/src/ggml-profile.c b/ggml/src/ggml-profile.c
index d8410292ec0..76a1ef44cff 100644
--- a/ggml/src/ggml-profile.c
+++ b/ggml/src/ggml-profile.c
@@ -27,12 +27,16 @@ void ggml_trace_log_end(const char * name, const char * cat, const char * args)
     GGML_UNUSED(args);
 }
 
-void ggml_profile_log_op_begin(struct ggml_tensor * t) {
+void ggml_profile_log_op_begin(struct ggml_tensor * t, int ith, int nth) {
     GGML_UNUSED(t);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 
-void ggml_profile_log_op_end(struct ggml_tensor * t) {
+void ggml_profile_log_op_end(struct ggml_tensor * t, int ith, int nth) {
     GGML_UNUSED(t);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
 }
 
 void ggml_profile_flush_tls(void) {
@@ -51,10 +55,14 @@ void ggml_profile_flush_trace(void) {
 static int            g_trace_enabled = 0;
 static pthread_once_t g_trace_once    = PTHREAD_ONCE_INIT;
 
+void ggml_profile_flush_trace(void);
+
 static void ggml_trace_parse_env(void) {
     const char * env = getenv("GGML_TRACE");
     if (env && (env[0] == '1' || env[0] == 'y' || env[0] == 'Y')) {
         g_trace_enabled = 1;
+        // ensure the trace file gets its closing "]" and is flushed/closed on exit
+        atexit(ggml_profile_flush_trace);
     } else {
         g_trace_enabled = 0;
     }
@@ -83,6 +91,7 @@ static FILE *          g_trace_file  = NULL;
 
 __thread char   g_trace_tls_buffer[TRACE_BUFFER_SIZE];
 __thread size_t g_trace_tls_offset = 0;
+__thread int    g_trace_worker_named = 0;
 
 int g_current_token_idx = -1;
 
@@ -223,33 +232,85 @@ void ggml_trace_log_end(const char * name, const char * cat, const char * args)
     ggml_trace_write(buf);
 }
 
-void ggml_profile_log_op_begin(struct ggml_tensor * t) {
+static void ggml_trace_log_begin_on_lane(const char * name, const char * cat, const char * args, int pid, int tid) {
+    if (!TRACE_ENABLED()) {
+        return;
+    }
+
+    char buf[1024];
+
+    snprintf(buf, sizeof(buf),
+             "{\"name\":\"%s\",\"cat\":\"%s\",\"ph\":\"B\","
+             "\"ts\":%ld,\"pid\":%d,\"tid\":%d,\"args\":{%s}},\n",
+             name, cat, ggml_trace_now_us(), pid, tid, args ? args : "");
+
+    ggml_trace_write(buf);
+}
+
+static void ggml_trace_log_end_on_lane(const char * name, const char * cat, const char * args, int pid, int tid) {
+    if (!TRACE_ENABLED()) {
+        return;
+    }
+
+    char buf[1024];
+
+    snprintf(buf, sizeof(buf),
+             "{\"name\":\"%s\",\"cat\":\"%s\",\"ph\":\"E\","
+             "\"ts\":%ld,\"pid\":%d,\"tid\":%d,\"args\":{%s}},\n",
+             name, cat, ggml_trace_now_us(), pid, tid, args ? args : "");
+
+    ggml_trace_write(buf);
+}
+
+static void ggml_trace_log_worker_name(int ith) {
+    if (!TRACE_ENABLED() || g_trace_worker_named) {
+        return;
+    }
+
+    char buf[256];
+    snprintf(buf, sizeof(buf),
+             "{\"name\":\"thread_name\",\"ph\":\"M\",\"pid\":2,\"tid\":%d,"
+             "\"args\":{\"name\":\"ggml-worker-%d\"}},\n",
+             ith, ith);
+
+    ggml_trace_write(buf);
+    g_trace_worker_named = 1;
+}
+
+void ggml_profile_log_op_begin(struct ggml_tensor * t, int ith, int nth) {
     if (!TRACE_ENABLED()) {
         return;
     }
 
+    ggml_trace_log_worker_name(ith);
+
     char args[1024];
     ggml_format_op_args(t, NULL, args, sizeof(args));
 
+    size_t args_len = strlen(args);
+    snprintf(args + args_len, sizeof(args) - args_len, ",\"ith\":%d,\"nth\":%d", ith, nth);
+
     char name[128], tname[64];
     ggml_json_escape(ggml_tensor_name_safe(t), tname, sizeof(tname));
 
     snprintf(name, sizeof(name), "%s (%s)", ggml_op_name(t->op), tname);
 
-    ggml_trace_log_begin(name, "Operator", args);
+    ggml_trace_log_begin_on_lane(name, "Operator", args, 2, ith);
 }
 
-void ggml_profile_log_op_end(struct ggml_tensor * t) {
+void ggml_profile_log_op_end(struct ggml_tensor * t, int ith, int nth) {
     if (!TRACE_ENABLED()) {
         return;
     }
 
+    GGML_UNUSED(nth);
+
     char name[128], tname[64];
     ggml_json_escape(ggml_tensor_name_safe(t), tname, sizeof(tname));
 
     snprintf(name, sizeof(name), "%s (%s)", ggml_op_name(t->op), tname);
 
-    ggml_trace_log_end(name, "Operator", NULL);
+    ggml_trace_log_end_on_lane(name, "Operator", NULL, 2, ith);
 }
 
 void ggml_profile_flush_tls(void) {