From c3187a20dd6de1a36a4357160d734459ff744e5f Mon Sep 17 00:00:00 2001 From: ramic-k Date: Sat, 27 Jun 2026 16:18:04 -0400 Subject: [PATCH] VirtAPI: reuse a per-process cache in the Uncached calls The Type1_v1 Uncached entry points (crossSectionUncached, sampleScatterUncached) allocated a fresh CachePtr on every call ("Fully MT safe, fully inefficient"). That per-call allocation/rebuild dominates the NCrystal-attributable cost when NCrystal is embedded in a host Monte Carlo code through the VirtAPI. Reuse a persistent cache per (thread, process): a thread_local map keyed by the process pointer, so the internal cache is allocated once per process per thread instead of once per call. Keying by process pointer is required because accessCache does not validate process identity and m_nHistory is not unique per process, so a single shared cache could be applied to the wrong process. The cross-section and scatter paths use separate maps, matching the two original CachePtr objects. thread_local keeps it thread-safe without locks. Caveat: this patch does not remove map entries when a process is deallocated, so each thread's maps grow for the lifetime of that thread, and a process later allocated at a recycled address would pick up the previous process's cache entry (the same wrong-process hazard described above); hosts that create and destroy many processes should take this into account. No public API change, no physics change. Measured with OpenMC and NCrystal 4.4.4 on a graphite-moderated benchmark: +11-12% transport throughput, with bit-identical k. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01CRiGki7G3D8QyixmYeMGiY --- .../src/virtualapi/NCVirtAPI_Type1_v1_impl.hh | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/ncrystal_core/src/virtualapi/NCVirtAPI_Type1_v1_impl.hh b/ncrystal_core/src/virtualapi/NCVirtAPI_Type1_v1_impl.hh index b20286729..8adfcb1bf 100644 --- a/ncrystal_core/src/virtualapi/NCVirtAPI_Type1_v1_impl.hh +++ b/ncrystal_core/src/virtualapi/NCVirtAPI_Type1_v1_impl.hh @@ -21,6 +21,7 @@ #include "NCrystal/virtualapi/NCVirtAPI_Type1_v1.hh" #include "NCrystal/factories/NCFactImpl.hh" #include "NCVirtAPIUtils.hh" +#include namespace NCRYSTAL_NAMESPACE { @@ -56,13 +57,21 @@ namespace NCRYSTAL_NAMESPACE { delete reinterpret_cast(sp); } + // Reuse a persistent cache per (thread, process) instead of allocating a + // fresh one on every call. The cache is keyed by process pointer so it is + // never shared between processes: accessCache does not check process + // identity, and m_nHistory (the only reset trigger) is not unique per + // process, so a single shared cache could be applied to the wrong process. + // The cross-section and scatter paths use separate maps, matching the two + // original CachePtr objects. thread_local keeps it thread-safe without + // locks, and replaces the per-call allocation the original flagged as + // "Fully MT safe, fully inefficient". double crossSectionUncached( const PubScatterProcess& pub_sp, const double* n ) const override { auto sp = reinterpret_cast(&pub_sp); - CachePtr dummycache;//<--- Fully MT safe, fully inefficient. To be - //revisited in a future api version! 
- return sp->procptr->crossSection( dummycache, + thread_local std::unordered_map xs_caches; + return sp->procptr->crossSection( xs_caches[sp], NeutronEnergy{ n[0] }, NeutronDirection( n[1], n[2], n[3] ) ).dbl(); @@ -75,10 +84,9 @@ namespace NCRYSTAL_NAMESPACE { double* n ) const override { auto sp = reinterpret_cast(&pub_sp); - CachePtr dummycache;//<--- Fully MT safe, fully inefficient. To be - //revisited in a future api version! + thread_local std::unordered_map scat_caches; VirtAPIUtils::RNGWrapper rng( &rng_fct ); - auto out = sp->procptr->sampleScatter( dummycache, rng, + auto out = sp->procptr->sampleScatter( scat_caches[sp], rng, NeutronEnergy{ n[0] }, NeutronDirection( n[1], n[2],