diff --git a/arcshift/Cargo.toml b/arcshift/Cargo.toml
index c7bbcf7..d9969da 100644
--- a/arcshift/Cargo.toml
+++ b/arcshift/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "arcshift"
-version = "0.3.0"
+version = "0.3.1"
 documentation = "https://docs.rs/arcshift/"
 homepage = "https://github.com/avl/arcshift/"
 repository = "https://github.com/avl/arcshift/"
diff --git a/arcshift/examples/weak.rs b/arcshift/examples/weak.rs
new file mode 100644
index 0000000..ed96fc5
--- /dev/null
+++ b/arcshift/examples/weak.rs
@@ -0,0 +1,24 @@
+use arcshift::{ArcShift, ArcShiftWeak};
+use std::cell::RefCell;
+
+fn main() {
+    #[allow(dead_code)]
+    struct Node {
+        parent: Option<ArcShiftWeak<Node>>,
+        child: RefCell<Option<ArcShift<Node>>>,
+    }
+
+    let mut root = ArcShift::new(Node {
+        parent: None,
+        child: RefCell::new(None),
+    });
+
+    let child = ArcShift::new(Node {
+        parent: Some(ArcShift::downgrade(&root)),
+        child: RefCell::new(None),
+    });
+
+    root.get().child.borrow_mut().replace(child.clone());
+
+    // Both root and child will be dropped here; there will be no memory leak.
+}
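The example works because the weak back-pointer does not keep the parent's value alive, which is what breaks the reference cycle. The sketch below shows the expected upgrade behaviour of such a weak reference; it assumes an `ArcShiftWeak::upgrade` method returning `Option<ArcShift<T>>` (by analogy with `std::sync::Weak::upgrade`), which is not itself shown in this diff.

```rust
use arcshift::ArcShift;

fn main() {
    let strong = ArcShift::new(42u32);
    // `downgrade` appears in the example above; `upgrade` is assumed here.
    let weak = ArcShift::downgrade(&strong);

    // Upgrading succeeds while a strong instance is still alive...
    assert!(weak.upgrade().is_some());

    drop(strong);
    // ...and fails once the last strong instance is gone.
    assert!(weak.upgrade().is_none());
}
```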
diff --git a/arcshift/run_loom_slow.sh b/arcshift/run_loom_slow.sh
new file mode 100755
index 0000000..c2366e5
--- /dev/null
+++ b/arcshift/run_loom_slow.sh
@@ -0,0 +1 @@
+LOOM_MAX_BRANCHES=4000 LOOM_MAX_PREEMPTIONS=3 RUSTFLAGS="--cfg loom" cargo nextest run --features=loom,validate --release $@
diff --git a/arcshift/src/cell.rs b/arcshift/src/cell.rs
index 47c9330..a8ecf64 100644
--- a/arcshift/src/cell.rs
+++ b/arcshift/src/cell.rs
@@ -67,15 +67,16 @@ impl<T: ?Sized> Deref for ArcShiftCellHandle<'_, T> {
 }
 
 /// ArcShiftCell cannot be Sync, but there's nothing stopping it from being Send.
-/// SAFETY:
-/// As long as the contents of the cell are not !Send, it is safe to
-/// send the cell. The object must be uniquely owned to be sent, and
-/// this is only possible if we're not in a recursive call to
-/// 'get'. And in this case, the properties of ArcShiftCell are the same
-/// as ArcShift, and ArcShift is Send (if T is Send + Sync).
 ///
 /// Note that ArcShiftCell *cannot* be Sync, because then multiple threads
 /// could call 'get' simultaneously, corrupting the (non-atomic) refcount.
+// SAFETY:
+// As long as the contents of the cell are not !Send, it is safe to
+// send the cell. The object must be uniquely owned to be sent, and
+// this is only possible if we're not in a recursive call to
+// 'get'. And in this case, the properties of ArcShiftCell are the same
+// as ArcShift, and ArcShift is Send (if T is Send + Sync).
+//
 unsafe impl<T: ?Sized> Send for ArcShiftCell<T> where T: Send + Sync {}
 
 impl<T: ?Sized> Clone for ArcShiftCell<T> {
@@ -129,6 +130,7 @@ impl<T: ?Sized> ArcShiftCell<T> {
     /// Make sure not to leak this handle: See further documentation on
     /// [`ArcShiftCellHandle`]. Leaking the handle will leak resources, but
     /// not cause undefined behaviour.
+    #[inline]
     pub fn borrow(&self) -> ArcShiftCellHandle<T> {
         self.recursion.set(self.recursion.get() + 1);
         ArcShiftCellHandle {
@@ -146,7 +148,8 @@ impl<T: ?Sized> ArcShiftCell<T> {
     ///
     /// This method is reentrant - you are allowed to call it from within the closure 'f'.
     /// However, only the outermost invocation will cause a reload.
-    pub fn get(&self, f: impl FnOnce(&T)) {
+    #[inline]
+    pub fn get<R>(&self, f: impl FnOnce(&T) -> R) -> R {
         self.recursion.set(self.recursion.get() + 1);
         let val = if self.recursion.get() == 1 {
             // SAFETY:
@@ -157,8 +160,9 @@ impl<T: ?Sized> ArcShiftCell<T> {
             // Getting the inner value is safe, no other thread can be accessing it now
             unsafe { &*self.inner.get() }.shared_non_reloading_get()
         };
-        f(val);
+        let t = f(val);
         self.recursion.set(self.recursion.get() - 1);
+        t
     }
 
     /// Assign the given ArcShift to this instance.
@@ -179,7 +183,7 @@
         }
     }
     /// Reload this ArcShiftCell-instance.
-    /// This allows dropping heap blocks kept alive by this instance of
+    /// This allows heap blocks kept alive by this instance of
     /// ArcShiftCell to be dropped.
     /// Note, this method only works when not called from within a closure
     /// supplied to the 'get' method. If such recursion occurs, this method
@@ -198,7 +202,7 @@
     /// Create an ArcShift-instance pointing to the same data
     pub fn make_arcshift(&self) -> ArcShift<T> {
         // SAFETY:
-        // ArcShiftCell is not Sync, and 'reload' does not recursively call into user
+        // ArcShiftCell is not Sync, and 'make_arcshift' does not recursively call into user
         // code, so we know no other operation can be ongoing.
         unsafe { &mut *self.inner.get() }.clone()
     }
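With the new signature, `get` passes the closure's result through to the caller, so values can be extracted from the cell directly. A minimal usage sketch, built only on items that appear in this diff (`ArcShiftCell::from_arcshift` is exercised by the benchmark changes below):

```rust
use arcshift::cell::ArcShiftCell;
use arcshift::ArcShift;

fn main() {
    let cell = ArcShiftCell::from_arcshift(ArcShift::new("hello".to_string()));

    // The closure's return value is passed through to the caller.
    let len = cell.get(|s| s.len());
    assert_eq!(len, 5);

    // `get` is reentrant; only the outermost call causes a reload.
    let doubled = cell.get(|a| cell.get(|b| a.len() + b.len()));
    assert_eq!(doubled, 10);
}
```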
diff --git a/arcshift/src/deferred_panics_helper.rs b/arcshift/src/deferred_panics_helper.rs
index 535e321..68e8063 100644
--- a/arcshift/src/deferred_panics_helper.rs
+++ b/arcshift/src/deferred_panics_helper.rs
@@ -1,9 +1,10 @@
-//! This module contains routines to help defer panics to outside of the critical sections
-//! handling the lock free data structure. Instead, unwinding is deferred to after
-//! the lock free structures have been updated. This avoids potential memory leaks, when
-//! multiple objects need to be dropped simultaneously, and the first drop impl
-//! panics. In this case we still wish to call other drop handlers and not resume unwind
-//! until all drops have occurred.
+//! This module contains routines to help ensure that panicking drop-implementations
+//! do not cause corruption in the heap data-structures. The strategy to achieve
+//! this differs depending on whether we are running under `no_std` or not.
+//! Under `no_std`, dropping is deferred until after all lock-free memory
+//! structures have been updated, at some extra cost.
+//! When not using `no_std`, `catch_unwind` is used to catch panics and resume them
+//! when it is safe.
 
 use crate::{IMetadata, ItemHolder};
 
 pub(crate) trait IDropHandler {
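For the non-`no_std` case, the strategy described in the new module doc amounts to catching each panic, finishing the remaining drops, and only then resuming the unwind. A minimal free-standing sketch of that idea (not the crate's actual `IDropHandler` machinery):

```rust
use std::panic::{catch_unwind, resume_unwind, AssertUnwindSafe};

/// Run all drop-like actions, deferring the first panic until every
/// action has executed, so shared structures stay consistent.
fn drop_all(actions: Vec<Box<dyn FnOnce()>>) {
    let mut first_panic = None;
    for action in actions {
        if let Err(payload) = catch_unwind(AssertUnwindSafe(move || action())) {
            // Remember the first panic, but keep running the remaining drops.
            if first_panic.is_none() {
                first_panic = Some(payload);
            }
        }
    }
    if let Some(payload) = first_panic {
        // All drops have run; now it is safe to resume unwinding.
        resume_unwind(payload);
    }
}

fn main() {
    let actions: Vec<Box<dyn FnOnce()>> = vec![
        Box::new(|| println!("drop 1")),
        Box::new(|| panic!("drop 2 panics")),
        Box::new(|| println!("drop 3 still runs")),
    ];
    // All three actions run; the program then resumes the deferred panic.
    drop_all(actions);
}
```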
diff --git a/arcshift/src/lib.rs b/arcshift/src/lib.rs
index 2ef3fad..d3822ff 100644
--- a/arcshift/src/lib.rs
+++ b/arcshift/src/lib.rs
@@ -7,11 +7,11 @@
 //! # Introduction to ArcShift
 //!
 //! [`ArcShift`] is a data type similar to `std::sync::Arc`, except that it allows updating
-//! the value pointed to. It can be used as a faster replacement for
-//! `std::sync::Arc<RwLock<T>>`.
+//! the value pointed to. It can be used as a replacement for
+//! `std::sync::Arc<RwLock<T>>`, giving much faster read access.
 //!
-//! Writing to ArcShift is significantly more expensive than for `std::sync::RwLock`, so
-//! ArcShift is most suited to use cases where updates are infrequent.
+//! Updating the value in ArcShift is significantly more expensive than writing to a
+//! `std::sync::RwLock`, so ArcShift is most suited to cases where updates are infrequent.
 //!
 //! ## Example
 //! ```
@@ -46,22 +46,32 @@
 //!
 //! # Strong points
 //! * Easy to use (similar to Arc)
-//! * All functions are lock free (see <https://en.wikipedia.org/wiki/Non-blocking_algorithm>)
-//! * For use cases where no modification of values occurs, performance is very good (much
+//! * Extensively tested
+//! * All functions are lock free (<https://en.wikipedia.org/wiki/Non-blocking_algorithm>)
+//! * For use cases where no updates occur, performance is very good (much
 //!   better than RwLock or Mutex).
-//! * Modifying values is reasonably fast (think, 50-150 nanoseconds), but much slower than Mutex or
-//!   RwLock.
+//! * Updates are reasonably fast (think 15-100 nanoseconds), but much slower than Mutex- or
+//!   RwLock-writes.
 //! * The function [`ArcShift::shared_non_reloading_get`] allows access without any overhead
 //!   compared to regular Arc (benchmarks show identical performance to Arc).
 //! * ArcShift does not rely on thread-local variables.
+//! * Supports unsized types (i.e., you can use `ArcShift<[u8]>`)
 //! * ArcShift is no_std compatible (though 'alloc' is required, since ArcShift is a heap
-//! allocating data structure). Compile with "default-features=false" to enable no_std
+//! data structure). Compile with "default-features=false" to enable no_std
 //!   compatibility.
 //!
 //! # Limitations
 //!
 //! ArcShift achieves its performance at the expense of the following disadvantages:
 //!
+//! * ArcShift's performance relies on being able to update its pointer when
+//!   new values are detected. This means that ArcShift is most efficient when each
+//!   thread has a mutable ArcShift instance. This can often be achieved by cloning the ArcShift,
+//!   and distributing one owned copy to every thread (these clones all point to the same
+//!   inner value). ArcShift can still be used with only shared access ([`ArcShift::shared_get`]),
+//!   and performance is still very good as long as the pointer is current. However, if
+//!   the ArcShift instance is stale (needs reloading, because an update has occurred), reads will
+//!   be approximately twice as costly as for RwLock.
 //! * When modifying the value, the old version of the value lingers in memory until
 //!   the last ArcShift that uses it has reloaded. Such a reload only happens when the ArcShift
 //!   is accessed using a unique (`&mut`) access (like [`ArcShift::get`] or [`ArcShift::reload`]).
@@ -75,26 +85,56 @@
 //! * ArcShift is its own datatype. It is in no way compatible with `Arc`.
 //! * At most usize::MAX/8 instances of ArcShift or ArcShiftWeak can be created for each value.
 //!   (this is because it uses some bits of its weak refcount to store metadata).
-//! * ArcShift instances should ideally be owned (or be mutably accessible). This is because
-//! reloading ArcShift requires mutable access to the ArcShift object itself.
 //!
-//! The last limitation might seem unacceptable, but for many applications it is not
-//! hard to make sure each thread/scope has its own instance of ArcShift pointing to
-//! the resource. Cloning ArcShift instances is reasonably fast.
+//! # Detailed performance characteristics
 //!
-//! # Implementation
 //!
-//! When ArcShift values are updated, a linked list of all updates is formed. Whenever
-//! an ArcShift-instance is reloaded (using [`ArcShift::reload`], [`ArcShift::get`],
-//! that instance advances along the linked list to the last
-//! node in the list. When no instance exists pointing at a node in the list, it is dropped.
-//! It is thus important to periodically call [`ArcShift::reload`] or [`ArcShift::get`] to
-//! avoid retaining unneeded values.
+//! * [`ArcShift::get`] - Very good average performance.
+//!   Checking for new values requires a single atomic operation, of the least expensive kind
+//!   (Ordering::Relaxed). On x86_64, this is the exact same machine operation as a regular
+//!   memory access, and on ARM it is also not an expensive operation. The cost of such access
+//!   is much smaller than a mutex access, even an uncontended one. In the case where a reload
+//!   is actually necessary, there is a significant performance impact (but still typically
+//!   below 150ns for modern machines (2025)).
+//!
+//!   If other instances have made updates, subsequent accesses will have a penalty. This
+//!   penalty can be significant, because previous values may have to be dropped. However,
+//!   once the updates have been processed, subsequent accesses will be fast again. It is
+//!   guaranteed that any update that completed before the execution of [`ArcShift::get`]
+//!   started will be visible.
+//!
+//! * [`ArcShift::shared_get`] - Good performance as long as the value is not stale.
+//!   If self points to a previous value, each call to `shared_get` will traverse the
+//!   memory structures to find the most recent value.
+//!
+//!   There are three cases:
+//!   * The value is up-to-date. In this case, execution is very fast.
+//!   * The value is stale, but no write is in progress. Expect a penalty of approximately
+//!     twice the cost of an RwLock write.
+//!   * The value is stale, and there is a write in progress. This is a rare race condition.
+//!     Expect a severe performance penalty (~10-20x the cost of an RwLock write).
+//!
+//!   `shared_get` also guarantees that any updates that completed before it was called
+//!   will be visible.
+//!
+//! * [`ArcShift::shared_non_reloading_get`] - No overhead compared to plain Arc.
+//!   Will not reload, even if the ArcShift instance is stale. May thus return an old
+//!   value. If shared_get has been used previously, this method may return an older
+//!   value than what shared_get returned.
+//!
+//! * [`ArcShift::reload`] - Similar cost to [`ArcShift::get`].
+//!
+//! * [`ArcShift::clone`] - Fast. Requires a single atomic increment and an atomic read.
+//!   If the current instance is stale, the cloned value will be reloaded, with identical
+//!   cost to [`ArcShift::get`].
+//!
+//! * Drop - Can be slow. The last remaining owner of a value will drop said value.
+//!
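The relative behaviour of the three read paths above can be pinned down with a small example. This sketch uses only methods named in this documentation; the assertions follow directly from the reload rules described above:

```rust
use arcshift::ArcShift;

fn main() {
    let mut a = ArcShift::new(1u32);
    let mut b = a.clone();

    b.update(2);
    // `a` is now stale: it still points at the old node.

    // `shared_get` takes &self: it finds the new value but cannot
    // re-point `a`, so `a` itself remains stale.
    assert_eq!(*a.shared_get(), 2);

    // `shared_non_reloading_get` never reloads, so it can return an
    // older value than the `shared_get` call just did.
    assert_eq!(*a.shared_non_reloading_get(), 1);

    // `get` takes &mut self and reloads the instance itself.
    assert_eq!(*a.get(), 2);
}
```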
 //!
 //! # Motivation
 //!
 //! The primary raison d'ĂȘtre for [`ArcShift`] is to be a version of Arc which allows
-//! modifying the stored value, with very little overhead over regular Arc for read heavy
+//! updating the stored value, with very little overhead over regular Arc for read heavy
 //! loads.
 //!
 //! One such motivating use-case for ArcShift is hot-reloadable assets in computer games.
@@ -109,16 +149,6 @@
 //!
 //! ArcShift can, of course, be useful in other domains than computer games.
 //!
-//! # Performance properties
-//!
-//! Accessing the value stored in an ArcShift instance only requires a regular memory access,
-//! not any form of atomic operation. Checking for new values requires a single
-//! atomic operation, of the least expensive kind (Ordering::Relaxed). On x86_64,
-//! this is the exact same machine operation as a regular memory access, and also
-//! on arm it is not an expensive operation.
-//! The cost of such access is much smaller than a mutex access, even an uncontended one.
-//! In the case where a reload is actually necessary, there is a significant performance impact
-//! (but still typically below 150ns for modern machines (2025)).
 //!
 //! # Panicking drop methods
 //!
 //! If a drop implementation panics, ArcShift will make sure that the internal data structures
@@ -151,7 +181,7 @@
 //! The 'next'-pointer starts out as null, but when the value in an ArcShift is updated, the
 //! 'next'-pointer is set to point to the updated value.
 //!
-//! This means that each ArcShift-instance always points at valid value of type T. No locking
+//! This means that each ArcShift-instance always points at a valid value of type T. No locking
 //! or synchronization is required to get at this value. This is why ArcShift instances are fast
 //! to use. There is the drawback that as long as an ArcShift-instance exists, whatever value
 //! it points to must be kept alive. Each time an ArcShift instance is accessed mutably, we have
@@ -160,11 +190,17 @@
 //!
 //! When the last ArcShift-instance releases a particular value, it will be dropped.
 //!
-//! ArcShiftWeak-instances also keep pointers to the heap blocks mentioned above, but value T
+//! ArcShiftWeak-instances also keep pointers to the heap blocks mentioned above, but the value T
 //! in the block can be dropped while being held by an ArcShiftWeak. This means that an ArcShiftWeak-
 //! instance only consumes `std::mem::size_of::<T>()` bytes plus 5 words of memory, when the value
-//! it points to has been dropped. When the ArcShiftWeak-instance is reloaded, or dropped, that memory
-//! is also released.
+//! it points to has been dropped. When the ArcShiftWeak-instance is reloaded, or dropped, that
+//! memory is also released.
+//!
+//! # Prior Art
+//!
+//! ArcShift is very much inspired by arc-swap. The two crates can be used for similar problems.
+//! They have slightly different APIs; one or the other may be a more natural fit depending on
+//! the problem. ArcShift may be faster for some problems, slower for others.
 //!
 //!
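The 'next'-pointer chain described in the documentation above can be pictured with a deliberately simplified model. The sketch below is illustrative only: the real ItemHolder also carries strong/weak refcounts, decoration bits on the pointer, and metadata, none of which are modeled here.

```rust
use std::sync::atomic::{AtomicPtr, Ordering};

// Simplified stand-in for ItemHolder: a payload plus a 'next' pointer
// that stays null until the value is superseded by an update.
struct Holder<T> {
    payload: T,
    next: AtomicPtr<Holder<T>>,
}

// Walk to the newest version, as a reload conceptually does.
fn newest<T>(mut cur: *mut Holder<T>) -> *mut Holder<T> {
    loop {
        // SAFETY (sketch only): assumes every node in the chain stays
        // alive while we walk it; the real crate guarantees this via
        // reference counting.
        let next = unsafe { (*cur).next.load(Ordering::Acquire) };
        if next.is_null() {
            return cur;
        }
        cur = next;
    }
}

fn main() {
    let mut first = Holder { payload: 1, next: AtomicPtr::new(std::ptr::null_mut()) };
    let mut second = Holder { payload: 2, next: AtomicPtr::new(std::ptr::null_mut()) };
    // Publishing an update links the old node to the new one.
    first.next.store(&mut second, Ordering::Release);
    let newest_ptr = newest(&mut first);
    assert_eq!(unsafe { (*newest_ptr).payload }, 2);
}
```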
 //! # Pitfall #1 - lingering memory usage
@@ -280,7 +316,7 @@ use core::sync::atomic::Ordering;
 // assigning null to a pointer, that could cause UB in unsafe code in this crate.
 // This crate is inherently dependent on all the code in it being correct. Therefore,
 // marking more functions unsafe buys us very little.
-// Note! The API of this crate is 100% safe and UB should be impossible to trigger by using it.
+// Note! The API of this crate is 100% safe and UB should be impossible to trigger through the API.
 // All public methods are 100% sound, this argument only concerns private methods.
 
 // All atomic primitives are reexported from a
@@ -417,10 +453,11 @@ pub struct ArcShiftWeak<T: ?Sized> {
     item: NonNull<ItemHolderDummy<T>>,
 }
 
-/// A handle that allows reloading an ArcShift instance without having 'mut' access.
-/// However, it does not implement `Sync`.
+/// Module with a convenient cell-like data structure for reloading ArcShift instances
+/// despite only having shared access.
 pub mod cell;
 
+#[inline(always)]
 const fn is_sized<T: ?Sized>() -> bool {
     size_of::<&T>() == size_of::<&()>()
 }
@@ -558,9 +595,12 @@ fn get_holder_layout<T: ?Sized>(ptr: *const T) -> Layout {
     }
 }
 
+#[inline(always)]
 fn to_dummy<T: ?Sized, M: IMetadata>(ptr: *const ItemHolder<T, M>) -> *const ItemHolderDummy<T> {
     ptr as *mut ItemHolderDummy<T>
 }
+
+#[inline(always)]
 fn from_dummy<T: ?Sized, M: IMetadata>(ptr: *const ItemHolderDummy<T>) -> *const ItemHolder<T, M> {
     get_full_ptr_raw::<T, M>(ptr)
 }
@@ -888,8 +928,9 @@ trait IMetadata {}
 impl IMetadata for SizedMetadata {}
 impl<T: ?Sized> IMetadata for UnsizedMetadata<T> {}
 
+#[doc(hidden)]
 #[repr(transparent)]
-struct ItemHolderDummy<T: ?Sized> {
+pub struct ItemHolderDummy<T: ?Sized> {
     // void pointers should point to u8
     _dummy: u8,
     _phantom_data: PhantomData<T>,
@@ -1410,6 +1451,7 @@ fn assert_is_undecorated<T: ?Sized>(_ptr: *const ItemHolderDummy<T>) {
+#[inline(always)]
 fn undecorate<T: ?Sized>(cand: *const ItemHolderDummy<T>) -> *const ItemHolderDummy<T> {
     let raw = cand as usize & 7;
     (cand as *const u8).wrapping_offset(-(raw as isize)) as *const ItemHolderDummy<T>
@@ -2906,20 +2948,137 @@ pub enum SharedGetGuard<'a, T: ?Sized> {
     /// The pointer already referenced the most recent value
     Raw(&'a T),
     /// We had to advance, and to do this we unfortunately had to clone
-    Cloned(ArcShift<T>),
+    LightCloned {
+        #[doc(hidden)]
+        next: *const ItemHolderDummy<T>,
+    },
+    /// Most expensive case, only used in some rare race-cases
+    FullClone {
+        #[doc(hidden)]
+        cloned: ArcShift<T>,
+    },
+}
+
+impl<T: ?Sized> Drop for SharedGetGuard<'_, T> {
+    #[inline(always)]
+    fn drop(&mut self) {
+        match self {
+            SharedGetGuard::Raw(_) => {}
+            SharedGetGuard::FullClone { .. } => {}
+            SharedGetGuard::LightCloned { next } => {
+                if is_sized::<T>() {
+                    let item_ptr = from_dummy::<_, SizedMetadata>(*next);
+                    // SAFETY:
+                    // The 'next' pointer of a SharedGetGuard::LightCloned is always valid
+                    let item = unsafe { &*item_ptr };
+                    let mut dropq = DropHandler::default();
+                    do_drop_strong(item, &mut dropq);
+                    dropq.resume_any_panics();
+                } else {
+                    let item_ptr = from_dummy::<_, UnsizedMetadata<T>>(*next);
+                    // SAFETY:
+                    // The 'next' pointer of a SharedGetGuard::LightCloned is always valid
+                    let item = unsafe { &*item_ptr };
+                    let mut dropq = DropHandler::default();
+                    do_drop_strong(item, &mut dropq);
+                    dropq.resume_any_panics();
+                }
+            }
+        }
+    }
 }
 
-impl<T> core::ops::Deref for SharedGetGuard<'_, T> {
+impl<T: ?Sized> core::ops::Deref for SharedGetGuard<'_, T> {
     type Target = T;
+    #[inline(always)]
     fn deref(&self) -> &Self::Target {
         match self {
             SharedGetGuard::Raw(r) => r,
-            SharedGetGuard::Cloned(c) => c.shared_non_reloading_get(),
+            SharedGetGuard::LightCloned { next } => {
+                if is_sized::<T>() {
+                    // SAFETY:
+                    // The 'next' pointer of a SharedGetGuard::LightCloned is always valid
+                    unsafe { (*from_dummy::<_, SizedMetadata>(*next)).payload() }
+                } else {
+                    // SAFETY:
+                    // The 'next' pointer of a SharedGetGuard::LightCloned is always valid
+                    unsafe { (*from_dummy::<_, UnsizedMetadata<T>>(*next)).payload() }
+                }
+            }
+            SharedGetGuard::FullClone { cloned } => cloned.shared_non_reloading_get(),
         }
     }
 }
+fn slow_shared_get<T: ?Sized, M: IMetadata>(
+    item: &ItemHolder<T, M>,
+) -> Option<SharedGetGuard<'_, T>> {
+    debug_println!("slow_shared_get: {:?} (advancing count)", item as *const _);
+    item.advance_count.fetch_add(1, Ordering::SeqCst);
+
+    #[cfg(loom)]
+    atomic::fence(Ordering::SeqCst);
+
+    debug_println!("advanced count");
+    let next = item.next.load(Ordering::SeqCst);
+    assert!(!undecorate(next).is_null());
+    debug_println!("slow_shared_get: {:?}, next = {:?}", item as *const _, next);
+
+    let next_val = from_dummy::<_, SizedMetadata>(undecorate(next));
+    // SAFETY:
+    // The 'item.next' pointer is not null as per method precondition, and since we have
+    // incremented advance_count, 'item.next' is a valid pointer.
+    let sc = unsafe { (*next_val).strong_count.load(Ordering::Relaxed) };
+    if sc == 0 {
+        debug_println!("slow_shared sc == 0");
+        item.advance_count.fetch_sub(1, Ordering::SeqCst);
+        return None;
+    }
+    debug_println!("slow_shared sc #1 for {:?}", next_val);
+    // SAFETY:
+    // The 'item.next' pointer is not null as per method precondition, and since we have
+    // incremented advance_count, 'item.next' is a valid pointer.
+    let exc = unsafe {
+        (*next_val)
+            .strong_count
+            .compare_exchange(sc, sc + 1, Ordering::SeqCst, Ordering::SeqCst)
+    };
+    debug_println!("slow_shared sc #1.5");
+    if exc.is_err() {
+        debug_println!("slow_shared sc race");
+        debug_println!("slow_shared_get: {:?}, sc == 0", item as *const _);
+        item.advance_count.fetch_sub(1, Ordering::SeqCst);
+        return None;
+    }
+    debug_println!("slow_shared sc #2");
+
+    // SAFETY:
+    // The 'item.next' pointer is not null as per method precondition, and since we have
+    // incremented advance_count, 'item.next' is a valid pointer.
+    let next_next = unsafe { (*next_val).next.load(Ordering::SeqCst) };
+    if !undecorate(next_next).is_null() {
+        debug_println!("slow_shared_get: {:?}, was dropped", item as *const _);
+        let mut dropq = DropHandler::default();
+        do_drop_strong(next_val, &mut dropq);
+        item.advance_count.fetch_sub(1, Ordering::SeqCst);
+        dropq.resume_any_panics();
+        return None;
+    }
+    debug_println!("slow_shared sc #3");
+    item.advance_count.fetch_sub(1, Ordering::SeqCst);
+
+    debug_println!(
+        "slow_shared_get: {:?}, advanced to {:?}",
+        item as *const _,
+        next
+    );
+
+    Some(SharedGetGuard::LightCloned {
+        next: undecorate(next),
+    })
+}
+
 impl<T: ?Sized> ArcShift<T> {
     #[doc(hidden)]
     #[cfg_attr(test, mutants::skip)]
@@ -3038,7 +3197,7 @@ impl<T: ?Sized> ArcShift<T> {
     }
 
     /// Reload this ArcShift instance, and return the latest value.
-    #[inline(always)]
+    #[inline]
     pub fn get(&mut self) -> &T {
         if is_sized::<T>() {
             let ptr = from_dummy::<T, SizedMetadata>(self.item.as_ptr());
@@ -3046,24 +3205,24 @@
             // SAFETY:
             // self.item is always a valid pointer
             let item = unsafe { &*ptr };
             let next = item.next.load(Ordering::Relaxed);
-            if next.is_null() {
-                // SAFETY:
-                // self.item is always a valid pointer
-                return unsafe { item.payload() };
+            if !undecorate(next).is_null() {
+                return Self::advance_strong_helper2::<SizedMetadata>(&mut self.item, true);
             }
-            Self::advance_strong_helper2::<SizedMetadata>(&mut self.item, true)
+            // SAFETY:
+            // self.item is always a valid pointer
+            unsafe { &*(item.payload.get() as *const T) }
         } else {
             let ptr = from_dummy::<T, UnsizedMetadata<T>>(self.item.as_ptr());
             // SAFETY:
             // self.item is always a valid pointer
             let item = unsafe { &*ptr };
             let next = item.next.load(Ordering::Relaxed);
-            if next.is_null() {
-                // SAFETY:
-                // self.item is always a valid pointer
-                return unsafe { item.payload() };
+            if !undecorate(next).is_null() {
+                return Self::advance_strong_helper2::<UnsizedMetadata<T>>(&mut self.item, true);
             }
-            Self::advance_strong_helper2::<UnsizedMetadata<T>>(&mut self.item, true)
+            // SAFETY:
+            // self.item is always a valid pointer
+            unsafe { item.payload() }
         }
     }
@@ -3086,26 +3245,30 @@
             // self.item is always a valid pointer
             let item = unsafe { &*ptr };
             let next = item.next.load(Ordering::Relaxed);
-            if next.is_null() {
+            if undecorate(next).is_null() {
                 // SAFETY:
                 // self.item is always a valid pointer
                 return SharedGetGuard::Raw(unsafe { item.payload() });
             }
-            let cloned = self.clone();
-            SharedGetGuard::Cloned(cloned)
+
+            slow_shared_get(item).unwrap_or_else(|| SharedGetGuard::FullClone {
+                cloned: self.clone(),
+            })
         } else {
             let ptr = from_dummy::<T, UnsizedMetadata<T>>(self.item.as_ptr());
             // SAFETY:
             // self.item is always a valid pointer
             let item = unsafe { &*ptr };
             let next = item.next.load(Ordering::Relaxed);
-            if next.is_null() {
+            if undecorate(next).is_null() {
                 // SAFETY:
                 // self.item is always a valid pointer
                 return SharedGetGuard::Raw(unsafe { item.payload() });
             }
-            let cloned = self.clone();
-            SharedGetGuard::Cloned(cloned)
+
+            slow_shared_get(item).unwrap_or_else(|| SharedGetGuard::FullClone {
+                cloned: self.clone(),
+            })
         }
     }
@@ -3159,8 +3322,7 @@
             unsafe { (*item).strong_count.load(Ordering::SeqCst) }
         })
     }
-    #[inline(never)]
-    #[cold]
+
     fn advance_strong_helper2<M: IMetadata>(
         old_item_ptr: &mut NonNull<ItemHolderDummy<T>>,
         gc: bool,
diff --git a/arcshift/src/tests.rs b/arcshift/src/tests.rs
index 66c4c2d..287a85c 100644
--- a/arcshift/src/tests.rs
+++ b/arcshift/src/tests.rs
@@ -2398,6 +2398,130 @@ fn simple_threading_shared_get_update() {
         assert_eq!(_seen_values2.load(Ordering::Relaxed), 3);
     }
 }
+#[test]
+fn simple_threading_shared_get_twice_update() {
+    let seen_values = alloc::sync::Arc::new(core::sync::atomic::AtomicU8::new(0));
+    let _seen_values2 = seen_values.clone();
+    model(move || {
+        debug_println!("-------- loom -------------");
+        let shift = ArcShift::new(0u32);
+        let shift1 = shift.clone();
+        let mut shift2 = shift.clone();
+        drop(shift);
+        // SAFETY:
+        // No threading involved
+        unsafe { ArcShift::debug_validate(&[&shift1, &shift2], &[]) };
+        let t1 = atomic::thread::Builder::new()
+            .name("t1".to_string())
+            .stack_size(1_000_000)
+            .spawn(move || *shift1.shared_get())
+            .unwrap();
+
+        let t2 = atomic::thread::Builder::new()
+            .name("t2".to_string())
+            .stack_size(1_000_000)
+            .spawn(move || {
+                shift2.update(1);
+                shift2.update(2);
+            })
+            .unwrap();
+        let r1 = t1.join().unwrap();
+        t2.join().unwrap();
+        debug_println!("--> Main dropping");
+
+        assert!(r1 == 0 || r1 == 1 || r1 == 2);
+        seen_values.fetch_or(1 << r1, Ordering::Relaxed);
+    });
+    #[cfg(loom)]
+    {
+        assert_eq!(_seen_values2.load(Ordering::Relaxed), 7);
+    }
+}
+
+#[test]
+fn simple_threading_shared_get_thrice_update() {
+    let seen_values = alloc::sync::Arc::new(core::sync::atomic::AtomicU8::new(0));
+    let _seen_values2 = seen_values.clone();
+    model(move || {
+        let owner = alloc::sync::Arc::new(SpyOwner2::new());
+        let owner2 = owner.clone();
+
+        debug_println!("-------- loom -------------");
+        let shift1 = ArcShift::new(owner.create("0"));
+
+        let mut shift2 = shift1.clone();
+
+        // SAFETY:
+        // No threading involved
+        unsafe { ArcShift::debug_validate(&[&shift1, &shift2], &[]) };
+        let t1 = atomic::thread::Builder::new()
+            .name("t1".to_string())
+            .stack_size(1_000_000)
+            .spawn(move || shift1.shared_get().str().parse::<u32>().unwrap())
+            .unwrap();
+
+        let t2 = atomic::thread::Builder::new()
+            .name("t2".to_string())
+            .stack_size(1_000_000)
+            .spawn(move || {
+                shift2.update(owner2.create("1"));
+                shift2.update(owner2.create("2"));
+                shift2.update(owner2.create("3"));
+            })
+            .unwrap();
+        let r1 = t1.join().unwrap();
+        t2.join().unwrap();
+        debug_println!("--> Main dropping");
+
+        assert!((0..=3).contains(&r1));
+        seen_values.fetch_or(1 << r1, Ordering::Relaxed);
+        owner.validate();
+    });
+    #[cfg(loom)]
+    {
+        assert_eq!(_seen_values2.load(Ordering::Relaxed), 15);
+    }
+}
+
+#[test]
+fn simple_threading_drop_four_times_update() {
+    model(move || {
+        let owner = alloc::sync::Arc::new(SpyOwner2::new());
+        let owner2 = owner.clone();
+
+        debug_println!("-------- loom -------------");
+        let shift1 = ArcShift::new(owner.create("0"));
+
+        let mut shift2 = shift1.clone();
+
+        // SAFETY:
+        // No threading involved
+        unsafe { ArcShift::debug_validate(&[&shift1, &shift2], &[]) };
+        let t1 = atomic::thread::Builder::new()
+            .name("t1".to_string())
+            .stack_size(1_000_000)
+            .spawn(move || {
+                let _t = shift1;
+            })
+            .unwrap();
+
+        let t2 = atomic::thread::Builder::new()
+            .name("t2".to_string())
+            .stack_size(1_000_000)
+            .spawn(move || {
+                shift2.update(owner2.create("1"));
+                shift2.update(owner2.create("2"));
+                shift2.update(owner2.create("3"));
+                shift2.update(owner2.create("4"));
+            })
+            .unwrap();
+        t1.join().unwrap();
+        t2.join().unwrap();
+        debug_println!("--> Main dropping");
+        owner.validate();
+    });
+}
+
 #[test]
 fn simple_threading_shared_get_drop() {
     model(move || {
@@ -2642,50 +2766,3 @@ fn simple_threading_update_downgrade_shared_get() {
         assert_eq!(*shift.get(), 43);
     });
 }
-
-#[test]
-fn simple_threading_shared_get_thrice_update() {
-    let seen_values = alloc::sync::Arc::new(core::sync::atomic::AtomicU8::new(0));
-    let _seen_values2 = seen_values.clone();
-    model(move || {
-        let owner = alloc::sync::Arc::new(SpyOwner2::new());
-        let owner2 = owner.clone();
-
-        debug_println!("-------- loom -------------");
-        let shift = ArcShift::new(owner.create("0"));
-        let shift1 = shift.clone();
-        let mut shift2 = shift.clone();
-        drop(shift);
-        // SAFETY:
-        // No threading involved
-        unsafe { ArcShift::debug_validate(&[&shift1, &shift2], &[]) };
-        let t1 = atomic::thread::Builder::new()
-            .name("t1".to_string())
-            .stack_size(1_000_000)
-            .spawn(move || {
-                shift1.shared_get().str().parse::<u32>().unwrap()
-            })
-            .unwrap();
-
-        let t2 = atomic::thread::Builder::new()
-            .name("t2".to_string())
-            .stack_size(1_000_000)
-            .spawn(move || {
-                shift2.update(owner2.create("1"));
-                shift2.update(owner2.create("2"));
-                shift2.update(owner2.create("3"));
-            })
-            .unwrap();
-        let r1 = t1.join().unwrap();
-        t2.join().unwrap();
debug_println!("--> Main dropping"); - - assert!((0..=3).contains(&r1)); - seen_values.fetch_or(1 << r1, Ordering::Relaxed); - owner.validate(); - }); - #[cfg(loom)] - { - assert_eq!(_seen_values2.load(Ordering::Relaxed), 15); - } -} diff --git a/arcshift_bench/benches/my_benchmark.rs b/arcshift_bench/benches/my_benchmark.rs index 38faca9..7bd6fcd 100644 --- a/arcshift_bench/benches/my_benchmark.rs +++ b/arcshift_bench/benches/my_benchmark.rs @@ -1,4 +1,5 @@ use arc_swap::{ArcSwap, Cache}; +use arcshift::cell::ArcShiftCell; use arcshift::ArcShift; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use std::ops::Deref; @@ -52,6 +53,18 @@ fn arcshift_bench(c: &mut Criterion) { let mut ac = ArcShift::new(42u32); c.bench_function("arcshift_get", |b| b.iter(|| *ac.get())); } +fn arcshift_cell_bench(c: &mut Criterion) { + let ac = ArcShift::new(42u32); + let cell = ArcShiftCell::from_arcshift(ac); + c.bench_function("arcshift_cell_get", |b| { + b.iter(|| black_box(cell.get(|x| *x))) + }); +} +fn arcshift_cell_borrow_bench(c: &mut Criterion) { + let ac = ArcShift::new(42u32); + let cell = ArcShiftCell::from_arcshift(ac); + c.bench_function("arcshift_cell_borrow", |b| b.iter(|| *cell.borrow())); +} fn arcshift_contended_bench(c: &mut Criterion) { let mut ac = ArcShift::new(42u32); let mut ac_clone = ac.clone(); @@ -88,7 +101,7 @@ fn arcshift_shared_non_reloading_bench(c: &mut Criterion) { } fn arcswap_bench(c: &mut Criterion) { - let ac = ArcSwap::from_pointee(42); + let ac = Arc::new(ArcSwap::from_pointee(42)); c.bench_function("arc_swap", |b| { b.iter(|| { let loaded = ac.load(); @@ -98,6 +111,24 @@ fn arcswap_bench(c: &mut Criterion) { }) }); } +fn arcswap_stale_bench(c: &mut Criterion) { + let shared = Arc::new(ArcSwap::from_pointee(42)); + let shared2 = shared.clone(); + + let jh = std::thread::spawn(move || { + shared2.store(Arc::new(43)); + }); + jh.join().unwrap(); + + c.bench_function("arc_swap_stale", |b| { + b.iter(|| { + let arc = shared.load(); + let x: i32 = *(*arc).deref(); + black_box(x) + }) + }); +} + fn arcswap_cached_bench(c: &mut Criterion) { let shared = Arc::new(ArcSwap::from_pointee(42)); let mut cache = Cache::new(Arc::clone(&shared)); @@ -117,7 +148,9 @@ fn arcswap_update(c: &mut Criterion) { } criterion_group!( benches, + arcshift_cell_borrow_bench, arcshift_shared_bench, + arcshift_cell_bench, arcshift_shared_stale_bench, arcshift_shared_non_reloading_bench, std_arc_bench, @@ -126,6 +159,7 @@ criterion_group!( arcshift_contended_bench, arcshift_update_bench, arcswap_bench, + arcswap_stale_bench, arcswap_cached_bench, mutex_bench, rwlock_read_bench,