From c3187a20dd6de1a36a4357160d734459ff744e5f Mon Sep 17 00:00:00 2001
From: ramic-k <kramic12@gmail.com>
Date: Sat, 27 Jun 2026 16:18:04 -0400
Subject: [PATCH] VirtAPI: reuse a per-process cache in the Uncached calls

The Type1_v1 Uncached entry points (crossSectionUncached, sampleScatterUncached)
allocated a fresh CachePtr on every call ("Fully MT safe, fully inefficient"). That
per-call allocation/rebuild dominates the NCrystal-attributable cost when NCrystal is
embedded in a host Monte Carlo code through the VirtAPI.

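Reuse a persistent cache per (thread, scatter process): a thread_local map keyed by
the ScatterProcess pointer, so the internal cache is allocated once per scatter
process per thread instead of once per call. Keying by that pointer is required
because accessCache does not validate process identity and m_nHistory is not unique
per process, so a single shared cache could be applied to the wrong process. The
cross-section and scatter paths use separate maps, matching the two original CachePtr
objects. thread_local keeps it thread-safe without locks. No public API change, no
physics change.

Known limitation: map entries are never erased when a scatter process is deallocated.
Each thread's maps therefore grow with the number of distinct processes it has seen,
and a new process allocated at the address of a deleted one would be handed the
stale cache left behind by its predecessor.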

Measured with OpenMC and NCrystal 4.4.4 on a graphite-moderated benchmark: +11-12%
transport throughput, with bit-identical k.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01CRiGki7G3D8QyixmYeMGiY
---
 .../src/virtualapi/NCVirtAPI_Type1_v1_impl.hh | 20 +++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)
diff --git a/ncrystal_core/src/virtualapi/NCVirtAPI_Type1_v1_impl.hh b/ncrystal_core/src/virtualapi/NCVirtAPI_Type1_v1_impl.hh
index b20286729..8adfcb1bf 100644
--- a/ncrystal_core/src/virtualapi/NCVirtAPI_Type1_v1_impl.hh
+++ b/ncrystal_core/src/virtualapi/NCVirtAPI_Type1_v1_impl.hh
@@ -21,6 +21,7 @@
 #include "NCrystal/virtualapi/NCVirtAPI_Type1_v1.hh"
 #include "NCrystal/factories/NCFactImpl.hh"
 #include "NCVirtAPIUtils.hh"
+#include <unordered_map>
 
 namespace NCRYSTAL_NAMESPACE {
 
@@ -56,13 +57,21 @@ namespace NCRYSTAL_NAMESPACE {
         delete reinterpret_cast<const ScatterProcess*>(sp);
       }
 
+      // Reuse a persistent cache per (thread, ScatterProcess) instead of allocating
+      // a fresh one on every call. The cache is keyed by the ScatterProcess pointer
+      // because accessCache does not check process identity, and m_nHistory (the
+      // only reset trigger) is not unique per process, so a single shared cache
+      // could be applied to the wrong process. The cross-section and scatter paths
+      // use separate maps, matching the two original CachePtr objects; thread_local
+      // keeps them thread-safe without locks and replaces the per-call allocation.
+      // NOTE(review): entries are never erased, so a ScatterProcess allocated at the
+      // address of a deleted one would be handed the old cache -- confirm lifetime.
       double crossSectionUncached( const PubScatterProcess& pub_sp,
                                    const double* n ) const override
       {
         auto sp = reinterpret_cast<const ScatterProcess*>(&pub_sp);
-        CachePtr dummycache;//<--- Fully MT safe, fully inefficient. To be
-                            //revisited in a future api version!
-        return sp->procptr->crossSection( dummycache,
+        thread_local std::unordered_map<const ScatterProcess*, CachePtr> xs_caches;
+        return sp->procptr->crossSection( xs_caches[sp],
                                           NeutronEnergy{ n[0] },
                                           NeutronDirection( n[1], n[2], n[3] )
                                           ).dbl();
@@ -75,10 +84,9 @@ namespace NCRYSTAL_NAMESPACE {
                                   double* n ) const override
       {
         auto sp = reinterpret_cast<const ScatterProcess*>(&pub_sp);
-        CachePtr dummycache;//<--- Fully MT safe, fully inefficient. To be
-                            //revisited in a future api version!
+        thread_local std::unordered_map<const ScatterProcess*, CachePtr> scat_caches;
         VirtAPIUtils::RNGWrapper rng( &rng_fct );
-        auto out = sp->procptr->sampleScatter( dummycache, rng,
+        auto out = sp->procptr->sampleScatter( scat_caches[sp], rng,
                                                NeutronEnergy{ n[0] },
                                                NeutronDirection( n[1],
                                                                  n[2],