diff --git a/arbor/backends/gpu/shared_state.cpp b/arbor/backends/gpu/shared_state.cpp
index a15a3f9545..74fe7aed15 100644
--- a/arbor/backends/gpu/shared_state.cpp
+++ b/arbor/backends/gpu/shared_state.cpp
@@ -22,8 +22,6 @@
 #include "util/range.hpp"
 #include "util/strprintf.hpp"
 
-#include <iostream>
-
 using arb::memory::make_const_view;
 
 namespace arb {
@@ -37,6 +35,8 @@ void take_samples_impl(
 
 void add_scalar(std::size_t n, arb_value_type* data, arb_value_type v);
 
+void apply_diffusive_concentration_delta_impl(std::size_t, arb_value_type*, arb_value_type* const *, arb_index_type const *);
+
 // GPU-side minmax: consider CUDA kernel replacement.
 std::pair<arb_value_type, arb_value_type> minmax_value_impl(arb_size_type n, const arb_value_type* v) {
     auto v_copy = memory::on_host(memory::const_device_view<arb_value_type>(v, n));
@@ -201,6 +201,12 @@ shared_state::shared_state(task_system_handle tp,
     add_scalar(temperature_degC.size(), temperature_degC.data(), -273.15);
 }
 
+void shared_state::apply_diffusive_concentration_delta() { // for every ion, hand the registered Xd delta slots to the device kernel
+    for (auto& [name, state] : ion_data) {
+        if (const auto n = state.Xd_contribution.size()) apply_diffusive_concentration_delta_impl(n, state.Xd_.data(), state.Xd_contribution_d.data(), state.Xd_index_d.data()); // skip ions without registered slots: no empty launch
+    }
+}
+
 void shared_state::update_prng_state(mechanism& m) {
     if (!m.mech_.n_random_variables) return;
     auto const mech_id = m.mechanism_id();
@@ -250,20 +256,23 @@ void shared_state::instantiate(mechanism& m,
     store.globals_    = std::vector<arb_value_type>(m.mech_.n_globals);
 
     // Set ion views
+    arb_size_type n_ions_with_written_xd = 0;
     for (auto idx: make_span(m.mech_.n_ions)) {
         auto ion = m.mech_.ions[idx].name;
         auto ion_binding = value_by_key(overrides.ion_rebind, ion).value_or(ion);
         ion_state* oion = ptr_by_key(ion_data, ion_binding);
         if (!oion) throw arbor_internal_error("gpu/mechanism: mechanism holds ion with no corresponding shared state");
-        auto& ion_state = store.ion_states_[idx];
-        ion_state = {0};
-        ion_state.current_density         = oion->iX_.data();
-        ion_state.reversal_potential      = oion->eX_.data();
-        ion_state.internal_concentration  = oion->Xi_.data();
-        ion_state.external_concentration  = oion->Xo_.data();
-        ion_state.diffusive_concentration = oion->Xd_.data();
-        ion_state.ionic_charge            = oion->charge.data();
-        ion_state.conductivity            = oion->gX_.data();
+        auto& ion_state_ = store.ion_states_[idx]; // arb_ion_state
+        ion_state_ = {0};
+        ion_state_.current_density         = oion->iX_.data();
+        ion_state_.reversal_potential      = oion->eX_.data();
+        ion_state_.internal_concentration  = oion->Xi_.data();
+        ion_state_.external_concentration  = oion->Xo_.data();
+        ion_state_.diffusive_concentration = oion->Xd_.data();
+        ion_state_.ionic_charge            = oion->charge.data();
+        ion_state_.conductivity            = oion->gX_.data();
+
+        n_ions_with_written_xd += m.mech_.ions[idx].write_diff_concentration;
     }
 
     // If there are no sites (is this ever meaningful?) there is nothing more to do.
@@ -272,7 +281,7 @@ void shared_state::instantiate(mechanism& m,
     // Allocate and initialize state and parameter vectors with default values.
     {
         // Allocate bulk storage
-        std::size_t count = (m.mech_.n_state_vars + m.mech_.n_parameters + 1)*width_padded + m.mech_.n_globals;
+        std::size_t count = (m.mech_.n_state_vars + m.mech_.n_parameters + 1 + n_ions_with_written_xd)*width_padded + m.mech_.n_globals;
         store.data_ = array(count, NAN);
         chunk_writer writer(store.data_.data(), width_padded);
 
@@ -295,6 +304,11 @@ void shared_state::instantiate(mechanism& m,
         for (auto idx: make_span(m.mech_.n_state_vars)) {
             store.state_vars_[idx] = writer.fill(m.mech_.state_vars[idx].default_value);
         }
+        // Set diffusive concentration deltas if needed
+        for (auto idx: make_span(m.mech_.n_ions)) {
+            store.ion_states_[idx].diffusive_concentration_delta =
+                m.mech_.ions[idx].write_diff_concentration ? writer.fill(0) : nullptr;
+        }
         // Assign global scalar parameters. NB: Last chunk, since it breaks the width striding.
         for (auto idx: make_span(m.mech_.n_globals)) store.globals_[idx] = m.mech_.globals[idx].default_value;
         for (auto& [k, v]: overrides.globals) {
@@ -332,6 +346,31 @@ void shared_state::instantiate(mechanism& m,
             auto indices = util::index_into(pos_data.cv, ni);
             std::vector<arb_index_type> mech_ion_index(indices.begin(), indices.end());
             store.ion_states_[idx].index = writer.append_with_padding(mech_ion_index, 0);
+
+            if (m.mech_.ions[idx].write_diff_concentration) { // this mechanism writes Xd of this ion: register its delta slots
+                auto& Xd_contribution_map = oion->Xd_contribution_map;
+                auto& Xd_contribution     = oion->Xd_contribution;
+                auto& Xd_index            = oion->Xd_index;
+                auto& Xd_contribution_d   = oion->Xd_contribution_d;
+                auto& Xd_index_d          = oion->Xd_index_d;
+                util::append( // adds one (delta slot pointer, ion index) pair per mechanism instance
+                    Xd_contribution_map,
+                    util::transform_view(
+                        util::make_span(width),
+                        [&mech_ion_index, d = store.ion_states_[idx].diffusive_concentration_delta](const auto& i) { // by reference: the vector outlives this call, so no copy per closure copy
+                            return std::make_pair(d+i, mech_ion_index[i]);
+                        }));
+                // sort contribution map according to index and transpose from AoS to SoA
+                util::stable_sort_by(Xd_contribution_map, [](const auto& p) { return p.second; });
+                Xd_contribution.clear(); Xd_contribution.reserve(Xd_contribution_map.size()); // rebuilt from the whole map, which also holds earlier mechanisms' entries
+                Xd_index.clear(); Xd_index.reserve(Xd_contribution_map.size());
+                for (auto [ptr, idx_] : Xd_contribution_map) {
+                    Xd_contribution.push_back(ptr);
+                    Xd_index.push_back(idx_);
+                }
+                Xd_contribution_d = memory::on_gpu(Xd_contribution); // refresh the device copies used by apply_diffusive_concentration_delta
+                Xd_index_d = memory::on_gpu(Xd_index);
+            }
         }
 
         m.ppack_.multiplicity = mult_in_place? writer.append_with_padding(pos_data.multiplicity, 0): nullptr;
diff --git a/arbor/backends/gpu/shared_state.cu b/arbor/backends/gpu/shared_state.cu
index 403067d70f..e87866b1d3 100644
--- a/arbor/backends/gpu/shared_state.cu
+++ b/arbor/backends/gpu/shared_state.cu
@@ -7,6 +7,7 @@
 
 #include <arbor/gpu/gpu_api.hpp>
 #include <arbor/gpu/gpu_common.hpp>
+#include <arbor/gpu/reduce_by_key.hpp>
 
 namespace arb {
 namespace gpu {
@@ -40,6 +41,20 @@ __global__ void take_samples_impl(
     }
 }
 
+__global__ void apply_diffusive_concentration_delta_impl( // kernel: passes each entry's delta to reduce_by_key for Xd[Xd_index[i]], then zeroes the delta
+    std::size_t n, // number of (delta pointer, index) entries
+    arb_value_type         * __restrict__ const Xd, // array receiving the reduced contributions
+    arb_value_type * const * __restrict__ const Xd_contribution, // per entry: pointer to the delta value
+    arb_index_type   const * __restrict__ const Xd_index) // per entry: destination index into Xd
+{
+    const unsigned i = threadIdx.x+blockIdx.x*blockDim.x; // one thread per entry
+    const unsigned mask = gpu::ballot(0xffffffff, i<n); // evaluated by every thread, before the bounds check below
+    if (i < n) {
+        reduce_by_key(*Xd_contribution[i], Xd, Xd_index[i], mask); // NOTE(review): presumably combines lanes sharing an index before updating Xd; confirm in reduce_by_key.hpp
+        *Xd_contribution[i] = 0; // clear the delta once it has been handed over
+    }
+}
+
 } // namespace kernel
 
 void add_scalar(std::size_t n, arb_value_type* data, arb_value_type v) {
@@ -53,5 +68,13 @@ void take_samples_impl(
     launch_1d(s.size(), 128, kernel::take_samples_impl, s.begin_marked, s.end_marked, time, sample_time, sample_value);
 }
 
+void apply_diffusive_concentration_delta_impl( // host-side entry point: launches the kernel of the same name over n entries
+    std::size_t n, // number of entries in Xd_contribution / Xd_index
+    arb_value_type         * Xd,
+    arb_value_type * const * Xd_contribution,
+    arb_index_type   const * Xd_index) {
+    launch_1d(n, 128, kernel::apply_diffusive_concentration_delta_impl, n, Xd, Xd_contribution, Xd_index); // NOTE(review): 128 is presumably the block width, as in take_samples_impl; confirm against launch_1d
+}
+
 } // namespace gpu
 } // namespace arb
diff --git a/arbor/backends/gpu/shared_state.hpp b/arbor/backends/gpu/shared_state.hpp
index 2f9c89cfde..4cad821d3f 100644
--- a/arbor/backends/gpu/shared_state.hpp
+++ b/arbor/backends/gpu/shared_state.hpp
@@ -60,6 +60,12 @@ struct ARB_ARBOR_API ion_state {
 
     array charge;       // charge of ionic species (global, length 1)
 
+    std::vector<std::pair<arb_value_type*,arb_index_type>> Xd_contribution_map;
+    std::vector<arb_value_type*> Xd_contribution;
+    std::vector<arb_index_type> Xd_index;
+    memory::device_vector<arb_value_type*> Xd_contribution_d;
+    memory::device_vector<arb_index_type> Xd_index_d;
+
     solver_ptr solver = nullptr;
 
     ion_state() = default;
@@ -224,6 +230,8 @@ struct ARB_ARBOR_API shared_state: shared_state_base<shared_state, array, ion_st
                      const mechanism_layout&,
                      const std::vector<std::pair<std::string, std::vector<arb_value_type>>>&);
 
+    void apply_diffusive_concentration_delta();
+
     void update_prng_state(mechanism&);
 
     void zero_currents();
diff --git a/arbor/backends/multicore/shared_state.cpp b/arbor/backends/multicore/shared_state.cpp
index a32ea19b0f..dff975f589 100644
--- a/arbor/backends/multicore/shared_state.cpp
+++ b/arbor/backends/multicore/shared_state.cpp
@@ -329,6 +329,17 @@ std::size_t extend_width(const arb::mechanism& mech, std::size_t width) {
 }
 } // anonymous namespace
 
+void shared_state::apply_diffusive_concentration_delta() { // per ion: add every registered delta to its Xd entry and clear the delta
+    for (auto& [name, state] : ion_data) {
+        auto* const xd      = state.Xd_.data();
+        const auto& deltas  = state.Xd_contribution;
+        const auto& targets = state.Xd_index;
+        for (std::size_t k = 0; k < deltas.size(); ++k) {
+            xd[targets[k]] += std::exchange(*deltas[k], 0); // read the delta and reset it to zero in one step
+        }
+    }
+}
+
 void shared_state::update_prng_state(mechanism& m) {
     if (!m.mech_.n_random_variables) return;
     const auto mech_id = m.mechanism_id();
@@ -418,6 +429,7 @@ void shared_state::instantiate(arb::mechanism& m,
     store.ion_states_.resize(m.mech_.n_ions);       m.ppack_.ion_states = store.ion_states_.data();
 
     // Set ion views
+    arb_size_type n_ions_with_written_xd = 0;
     for (auto idx: make_span(m.mech_.n_ions)) {
         auto ion = m.mech_.ions[idx].name;
         auto ion_binding = value_by_key(overrides.ion_rebind, ion).value_or(ion);
@@ -433,6 +445,8 @@ void shared_state::instantiate(arb::mechanism& m,
         ion_state.diffusive_concentration = oion->Xd_.data();
         ion_state.ionic_charge            = oion->charge.data();
         ion_state.conductivity            = oion->gX_.data();
+
+        n_ions_with_written_xd += m.mech_.ions[idx].write_diff_concentration;
     }
 
     // Initialize state and parameter vectors with default values.
@@ -446,7 +460,7 @@ void shared_state::instantiate(arb::mechanism& m,
         // Allocate bulk storage
         std::size_t value_width_padded = extend_width<arb_value_type>(m, pos_data.cv.size());
         store.value_width_padded = value_width_padded;
-        std::size_t count = (m.mech_.n_state_vars + m.mech_.n_parameters + 1 +
+        std::size_t count = (m.mech_.n_state_vars + m.mech_.n_parameters + 1 + n_ions_with_written_xd +
             random_number_storage)*value_width_padded + m.mech_.n_globals;
         store.data_ = array(count, NAN, pad);
         chunk_writer writer(store.data_.data(), value_width_padded);
@@ -470,6 +484,11 @@ void shared_state::instantiate(arb::mechanism& m,
         for (auto idx: make_span(m.mech_.n_state_vars)) {
             m.ppack_.state_vars[idx] = writer.fill(m.mech_.state_vars[idx].default_value);
         }
+        // Set diffusive concentration deltas if needed
+        for (auto idx: make_span(m.mech_.n_ions)) {
+            m.ppack_.ion_states[idx].diffusive_concentration_delta =
+                m.mech_.ions[idx].write_diff_concentration ? writer.fill(0) : nullptr;
+        }
         // Set random numbers
         for (auto idx_v: make_span(num_random_numbers_per_cv))
             for (auto idx_c: make_span(cbprng::cache_size()))
@@ -530,7 +549,30 @@ void shared_state::instantiate(arb::mechanism& m,
             m.ppack_.ion_states[idx].index = writer.append(indices, util::back(indices));
             // Check SIMD constraints
             arb_assert(compatible_index_constraints(node_index, util::range_n(m.ppack_.ion_states[idx].index, index_width_padded), m.iface_.partition_width));
+
+            if (m.mech_.ions[idx].write_diff_concentration) { // this mechanism writes Xd of this ion: register its delta slots
+                auto mech_ion_index       = m.ppack_.ion_states[idx].index;
+                auto& Xd_contribution_map = oion->Xd_contribution_map;
+                auto& Xd_contribution     = oion->Xd_contribution;
+                auto& Xd_index            = oion->Xd_index;
+                util::append( // adds one (delta slot pointer, ion index) pair per mechanism instance
+                    Xd_contribution_map,
+                    util::transform_view(
+                        util::make_span(width),
+                        [mech_ion_index, d = store.ion_states_[idx].diffusive_concentration_delta](const auto& i) {
+                            return std::make_pair(d+i, mech_ion_index[i]);
+                        }));
+                // sort contribution map according to index and transpose from AoS to SoA
+                util::stable_sort_by(Xd_contribution_map, [](const auto& p) { return p.second; });
+                Xd_contribution.clear(); Xd_contribution.reserve(Xd_contribution_map.size()); // rebuilt from the whole map, which also holds earlier mechanisms' entries
+                Xd_index.clear(); Xd_index.reserve(Xd_contribution_map.size());
+                for (auto [ptr, idx_] : Xd_contribution_map) { // idx_ rather than idx, so the outer idx used above is not shadowed
+                    Xd_contribution.push_back(ptr);
+                    Xd_index.push_back(idx_);
+                }
+            }
         }
+
         if (mult_in_place) m.ppack_.multiplicity = writer.append(pos_data.multiplicity, 0);
         // `peer_index` holds the peer CV of each CV in node_index.
         // Peer CVs are only filled for gap junction mechanisms. They are used
diff --git a/arbor/backends/multicore/shared_state.hpp b/arbor/backends/multicore/shared_state.hpp
index 6d0d49c804..d132a1e012 100644
--- a/arbor/backends/multicore/shared_state.hpp
+++ b/arbor/backends/multicore/shared_state.hpp
@@ -73,6 +73,10 @@ struct ARB_ARBOR_API ion_state {
 
     array charge;           // charge of ionic species (global value, length 1)
 
+    std::vector<std::pair<arb_value_type*,arb_index_type>> Xd_contribution_map;
+    std::vector<arb_value_type*> Xd_contribution;
+    std::vector<arb_index_type> Xd_index;
+
     solver_ptr solver = nullptr;
 
     ion_state() = default;
@@ -231,6 +235,8 @@ struct ARB_ARBOR_API shared_state:
                      const mechanism_layout&,
                      const std::vector<std::pair<std::string, std::vector<arb_value_type>>>&);
 
+    void apply_diffusive_concentration_delta();
+
     void update_prng_state(mechanism&);
 
     void zero_currents();
diff --git a/arbor/fvm_lowered_cell_impl.hpp b/arbor/fvm_lowered_cell_impl.hpp
index 94716b21c5..0e68cfcf48 100644
--- a/arbor/fvm_lowered_cell_impl.hpp
+++ b/arbor/fvm_lowered_cell_impl.hpp
@@ -213,6 +213,8 @@ fvm_integration_result fvm_lowered_cell_impl<Backend>::integrate(
             m->update_current();
         }
 
+        state_->apply_diffusive_concentration_delta();
+
         // Add stimulus current contributions.
         // NOTE: performed after dt, time_to calculation, in case we want to
         // use mean current contributions as opposed to point sample.
@@ -236,11 +238,15 @@ fvm_integration_result fvm_lowered_cell_impl<Backend>::integrate(
             m->update_state();
         }
 
+        state_->apply_diffusive_concentration_delta();
+
         // Update ion concentrations.
         PE(advance:integrate:ionupdate);
         update_ion_state();
         PL();
 
+        // TODO: can `write_ions` affect xd?
+
         // voltage mechs run now; after the cable_solver, but before the
         // threshold test
         for (auto& m: voltage_mechanisms_) {
diff --git a/arbor/include/arbor/gpu/cuda_api.hpp b/arbor/include/arbor/gpu/cuda_api.hpp
index 48bc96f262..532b9ec8f9 100644
--- a/arbor/include/arbor/gpu/cuda_api.hpp
+++ b/arbor/include/arbor/gpu/cuda_api.hpp
@@ -139,17 +139,6 @@ inline float gpu_atomic_sub(float* address, float val) {
 
 /// Warp-Level Primitives
 
-__device__ __inline__ double shfl(unsigned mask, double x, int lane)
-{
-    auto tmp = static_cast<uint64_t>(x);
-    auto lo = static_cast<unsigned>(tmp);
-    auto hi = static_cast<unsigned>(tmp >> 32);
-    hi = __shfl_sync(mask, static_cast<int>(hi), lane, warpSize);
-    lo = __shfl_sync(mask, static_cast<int>(lo), lane, warpSize);
-    return static_cast<double>(static_cast<uint64_t>(hi) << 32 |
-                               static_cast<uint64_t>(lo));
-}
-
 __device__ __inline__ unsigned ballot(unsigned mask, unsigned is_root) {
     return __ballot_sync(mask, is_root);
 }
@@ -158,24 +147,15 @@ __device__ __inline__ unsigned any(unsigned mask, unsigned width) {
     return __any_sync(mask, width);
 }
 
-#ifdef __NVCC__
-__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
-    return __shfl_up_sync(mask, idx, shift);
-}
-
-__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
-    return __shfl_down_sync(mask, idx, shift);
+template<typename T> // generic over the shuffled value type
+__device__ __inline__ T shfl_up(unsigned mask, T var, unsigned lane_id, unsigned shift) { // lane_id is not used by this implementation
+    return __shfl_up_sync(mask, var, shift); // forwards to the CUDA intrinsic
+}
 
-#else
-__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
-    return shfl(mask, idx, lane_id - shift);
+template<typename T> // generic over the shuffled value type
+__device__ __inline__ T shfl_down(unsigned mask, T var, unsigned lane_id, unsigned shift) { // lane_id is not used by this implementation
+    return __shfl_down_sync(mask, var, shift); // forwards to the CUDA intrinsic
 }
-
-__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
-    return shfl(mask, idx, lane_id + shift);
-}
-#endif
 #endif
 
 } // namespace gpu
diff --git a/arbor/include/arbor/gpu/hip_api.hpp b/arbor/include/arbor/gpu/hip_api.hpp
index fbdba3151f..019f235a8e 100644
--- a/arbor/include/arbor/gpu/hip_api.hpp
+++ b/arbor/include/arbor/gpu/hip_api.hpp
@@ -1,5 +1,6 @@
 #include <utility>
 #include <string>
+#include <type_traits>
 
 #include <hip/hip_runtime.h>
 #include <hip/hip_runtime_api.h>
@@ -118,6 +119,14 @@ inline float gpu_atomic_sub(float* address, float val) {
 
 /// Warp-level Primitives
 
+template<typename T> // generic shuffle for every value type except double
+__device__ __inline__
+std::enable_if_t< !std::is_same_v<std::decay_t<T>, double>, std::decay_t<T>> // removed from overload resolution for double, which has a dedicated overload
+shfl(T x, int lane)
+{
+    return __shfl(x, lane); // forwards to the HIP intrinsic
+}
+
 __device__ __inline__ double shfl(double x, int lane)
 {
     auto tmp = static_cast<uint64_t>(x);
@@ -137,12 +146,14 @@ __device__ __inline__ unsigned any(unsigned mask, unsigned width) {
     return __any(width);
 }
 
-__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
-    return shfl(idx, lane_id - shift);
+template<typename T> // generic over the shuffled value type; mask is not used by this implementation
+__device__ __inline__ T shfl_up(unsigned mask, T var, unsigned lane_id, unsigned shift) {
+    return shfl(var, static_cast<int>(lane_id) - static_cast<int>(shift)); // signed arithmetic: no unsigned wrap-around when shift > lane_id
 }
 
-__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
-    return shfl(idx, lane_id + shift);
+template<typename T> // generic over the shuffled value type; mask is not used by this implementation
+__device__ __inline__ T shfl_down(unsigned mask, T var, unsigned lane_id, unsigned shift) {
+    return shfl(var, static_cast<int>(lane_id) + static_cast<int>(shift)); // source lane computed in signed arithmetic, matching shfl's int parameter
 }
 
 } // namespace gpu
diff --git a/arbor/include/arbor/mechanism_abi.h b/arbor/include/arbor/mechanism_abi.h
index bf0a9b30ba..76e9a46917 100644
--- a/arbor/include/arbor/mechanism_abi.h
+++ b/arbor/include/arbor/mechanism_abi.h
@@ -20,7 +20,7 @@ extern "C" {
 
 // Version
 #define ARB_MECH_ABI_VERSION_MAJOR 0
-#define ARB_MECH_ABI_VERSION_MINOR 6
+#define ARB_MECH_ABI_VERSION_MINOR 7
 #define ARB_MECH_ABI_VERSION_PATCH 0
 #define ARB_MECH_ABI_VERSION ((ARB_MECH_ABI_VERSION_MAJOR * 10000L * 10000L) + (ARB_MECH_ABI_VERSION_MAJOR * 10000L) + ARB_MECH_ABI_VERSION_PATCH)
 
@@ -59,6 +59,7 @@ typedef struct arb_ion_state {
     arb_value_type* internal_concentration;
     arb_value_type* external_concentration;
     arb_value_type* diffusive_concentration;
+    arb_value_type* diffusive_concentration_delta;
     arb_value_type* ionic_charge;
     arb_index_type* index;
 } arb_ion_state;
@@ -193,6 +194,7 @@ typedef struct arb_ion_info {
     const char* name;
     bool write_int_concentration;
     bool write_ext_concentration;
+    bool write_diff_concentration;
     bool use_diff_concentration;
     bool write_rev_potential;
     bool read_rev_potential;
diff --git a/modcc/blocks.hpp b/modcc/blocks.hpp
index 1e12b54287..ef2a001cf5 100644
--- a/modcc/blocks.hpp
+++ b/modcc/blocks.hpp
@@ -45,6 +45,9 @@ struct IonDep {
     bool writes_concentration_int() const {
         return writes_variable(name + "i");
     };
+    bool writes_concentration_diff() const { // true if the variable named <ion name> + "d" is written
+        return writes_variable(name + "d");
+    };
     bool writes_concentration_ext() const {
         return writes_variable(name + "o");
     };
diff --git a/modcc/printer/cprinter.cpp b/modcc/printer/cprinter.cpp
index 36ec914e6d..59fd8eb562 100644
--- a/modcc/printer/cprinter.cpp
+++ b/modcc/printer/cprinter.cpp
@@ -447,6 +447,7 @@ static std::string index_i_name(const std::string& index_var) {
 namespace {
     // Access through ppack
     std::string data_via_ppack(const indexed_variable_info& i) { return pp_var_pfx + i.data_var; }
+    std::string delta_data_via_ppack(const indexed_variable_info& i) { return pp_var_pfx + i.data_var + "_delta"; } // like data_via_ppack, with "_delta" appended to the variable name
     std::string node_index_i_name(const indexed_variable_info& i) { return i.node_index_var + "i_"; }
     std::string source_index_i_name(const index_prop& i) { return i.source_var + "i_"; }
     std::string source_var(const index_prop& i) { return pp_var_pfx + i.source_var; }
@@ -464,6 +465,15 @@ namespace {
             return o << data_via_ppack(wrap.d) << '[' << (wrap.d.scalar() ? "0": i_name) << ']';
         }
     };
+
+    struct deref_delta { // stream helper: prints the element expression "<ppack delta var>[<index>]" for d
+        indexed_variable_info d;
+        deref_delta(indexed_variable_info d): d(d) {}
+
+        friend std::ostream& operator<<(std::ostream& o, const deref_delta& wrap) {
+            return o << delta_data_via_ppack(wrap.d) << '[' << (wrap.d.scalar() ? "0": "i_") << ']'; // scalars use element 0, everything else the literal index name i_
+        }
+    };
 }
 
 // Return the indices that need to be read at the beginning
@@ -494,7 +504,7 @@ std::list<index_prop> gather_indexed_vars(const std::vector<LocalVariable*>& ind
                     indices.push_back(outer_index_prop);
                 }
             }
-            else {
+            else if (!d.outer_index_var().empty()) {
                 // Need to read 1 index: outer[index]
                 index_prop outer_index_prop = {d.outer_index_var(), index, d.index_var_kind};
                 auto it = std::find(indices.begin(), indices.end(), outer_index_prop);
@@ -562,9 +572,13 @@ void emit_api_body(std::ostream& out, APIMethod* method, const ApiFlags& flags)
                 std::stringstream v; v << deref(d); var = v.str();
             }
             if (d.additive && flags.use_additive) {
+                std::string delta_var;
+                {
+                    std::stringstream v; v << deref_delta(d); delta_var = v.str();
+                }
                 out << fmt::format("{3} -= {0};\n"
-                                   "{0} = fma({1}{2}, {3}, {0});\n",
-                                   var, scale, weight, name);
+                                   "{4} = {1}{2}*{3};\n",
+                                   var, scale, weight, name, delta_var);
             }
             else if (write_voltage) {
                 // SAFETY: we only ever allow *one* V-PROCESS per CV, so this is OK.
@@ -783,6 +797,7 @@ void emit_simd_state_update(std::ostream& out,
     auto ext = external->name();
     auto name = from->name();
     auto data = data_via_ppack(d);
+    auto delta_data = delta_data_via_ppack(d);
     auto node = node_index_i_name(d);
     auto index = index_i_name(d.outer_index_var());
 
@@ -801,26 +816,31 @@ void emit_simd_state_update(std::ostream& out,
     if (d.additive && flags.use_additive) {
         if (d.index_var_kind == index_kind::node) {
             if (constraint == simd_expr_constraint::contiguous) {
-                out << fmt::format("indirect({} + {}, simd_width_) = S::mul({}, {});\n",
-                                   data, node, weight, scaled);
+                // We need this instead of simple assignment!
+                out << fmt::format("{{\n"
+                                   "  simd_value t_{0}0_ = {0};\n"
+                                   "  assign(t_{0}0_, indirect({1} + {2}, simd_width_));\n"
+                                   "  {0} = S::sub({0}, t_{0}0_);\n"
+                                   "  indirect({5} + index_, simd_width_) = S::mul({3}, {4});\n"
+                                   "}}\n",
+                                   name, data, node, weight, scaled, delta_data);
             }
             else {
-                    // We need this instead of simple assignment!
-                    out << fmt::format("{{\n"
-                                       "  simd_value t_{}0_ = simd_cast<simd_value>(0.0);\n"
-                                       "  assign(t_{}0_, indirect({}, simd_cast<simd_index>({}), simd_width_, constraint_category_));\n"
-                                       "  {} = S::sub({}, t_{}0_);\n"
-                                       "  indirect({}, simd_cast<simd_index>({}), simd_width_, constraint_category_) += S::mul({}, {});\n"
-                                       "}}\n",
-                                       name,
-                                       name, data, node,
-                                       scaled, scaled, name,
-                                       data, node, weight, scaled);
+                // We need this instead of simple assignment!
+                out << fmt::format("{{\n"
+                                   "  simd_value t_{0}0_ = {0};\n"
+                                   "  assign(t_{0}0_, indirect({1}, simd_cast<simd_index>({2}), simd_width_, constraint_category_));\n"
+                                   "  {0} = S::sub({0}, t_{0}0_);\n"
+                                   "  indirect({5} + index_, simd_width_) = S::mul({3}, {4});\n"
+                                   "}}\n",
+                                   name, data, node, weight, scaled, delta_data);
             }
         }
         else {
-            out << fmt::format("indirect({}, {}, simd_width_, index_constraint::none) = S::mul({}, {});\n",
-                               data, index, weight, scaled);
+            //out << fmt::format("indirect({}, {}, simd_width_, index_constraint::none) = S::mul({}, {});\n",
+            //                   data, index, weight, scaled);
+            //only additive node variables allowed
+            throw compiler_exception("Cannot write to additive non-node variables: "+external->to_string());
         }
     }
     else if (write_voltage) {
diff --git a/modcc/printer/gpuprinter.cpp b/modcc/printer/gpuprinter.cpp
index 04857c0aec..10a0f4cdd2 100644
--- a/modcc/printer/gpuprinter.cpp
+++ b/modcc/printer/gpuprinter.cpp
@@ -217,7 +217,7 @@ ARB_LIBMODCC_API std::string emit_gpu_cu_source(const Module& module_, const pri
                                        "}}\n\n"),
                            pp_var_pfx);
     }
-    emit_api_kernel(state_api);
+    emit_api_kernel(state_api, true);
     emit_api_kernel(current_api, true);
     emit_api_kernel(write_ions_api);
 
@@ -232,11 +232,12 @@ ARB_LIBMODCC_API std::string emit_gpu_cu_source(const Module& module_, const pri
                                        "        const auto tid_ = (begin_ + ii_)->mech_index;\n"
                                        "        if ((ii_ > 0) && ((begin_ + (ii_ - 1))->mech_index == tid_)) return;\n"
                                        "        for (auto i_ = begin_ + ii_; i_ < end_; ++i_) {{\n"
+                                       "            int n_ = 0;\n"
                                        "            if (i_->mech_index != tid_) break;\n"
                                        "            [[maybe_unused]] auto {0} = i_->weight;\n"),
                            net_receive_api->args().empty() ? "weight" : net_receive_api->args().front()->is_argument()->name());
         out << indent << indent << indent;
-        emit_api_body_cu(out, net_receive_api, ApiFlags{}.point(is_point_proc).loop(false).iface(false));
+        emit_api_body_cu(out, net_receive_api, ApiFlags{}.point(is_point_proc).additive(true).loop(false).iface(false));
         out << popindent << "}\n" << popindent << "}\n" << popindent << "}\n\n";
     }
 
@@ -433,6 +434,16 @@ namespace {
                      << (wrap.v.scalar()? "0": index_i_name(index_var)) << ']';
         }
     };
+
+    struct deref_delta { // stream helper: prints the element expression "<ppack var>_delta[<index>]" for v
+        indexed_variable_info v;
+        deref_delta(indexed_variable_info v): v(v) {}
+
+        friend std::ostream& operator<<(std::ostream& o, const deref_delta& wrap) {
+            return o << pp_var_pfx + wrap.v.data_var + "_delta" << '[' // variable name with the "_delta" suffix
+                     << (wrap.v.scalar()? "0": "tid_") << ']'; // scalars use element 0, everything else the literal index name tid_
+        }
+    };
 }
 
 void emit_state_read_cu(std::ostream& out, LocalVariable* local, const ApiFlags& flags) {
@@ -475,12 +486,7 @@ void emit_state_update_cu(std::ostream& out,
 
     if (d.additive && flags.use_additive) {
         out << name << " -= " << var << ";\n";
-        if (flags.is_point) {
-            out << fmt::format("::arb::gpu::reduce_by_key({}*{}, {}, {}, lane_mask_);\n", weight, name, data, index);
-        }
-        else {
-            out << var << " = fma(" << weight << ", " << name << ", " << var << ");\n";
-        }
+        out << deref_delta(d) << " = " << weight << "*" << name << ";\n";
     }
     else if (write_voltage) {
         /* SAFETY:
diff --git a/modcc/printer/infoprinter.cpp b/modcc/printer/infoprinter.cpp
index 596b7b80d0..97e7637aa3 100644
--- a/modcc/printer/infoprinter.cpp
+++ b/modcc/printer/infoprinter.cpp
@@ -52,10 +52,10 @@ ARB_LIBMODCC_API std::string build_info_header(const Module& m, const printer_op
                 id.unit_string(), val, lo, hi);
     };
     auto fmt_ion = [&](const auto& ion) {
-        return fmt::format(FMT_COMPILE("{{ \"{}\", {}, {}, {}, {}, {}, {}, {}, {} }}"),
+        return fmt::format(FMT_COMPILE("{{ \"{}\", {}, {}, {}, {}, {}, {}, {}, {}, {} }}"),
            ion.name,
            ion.writes_concentration_int(), ion.writes_concentration_ext(),
-           ion.uses_concentration_diff(),
+           ion.writes_concentration_diff(), ion.uses_concentration_diff(),
            ion.writes_rev_potential(), ion.uses_rev_potential(),
            ion.uses_valence(), ion.verifies_valence(), ion.expected_valence);
     };
diff --git a/python/test/fixtures.py b/python/test/fixtures.py
index 11b9011750..5e80b41d59 100644
--- a/python/test/fixtures.py
+++ b/python/test/fixtures.py
@@ -173,6 +173,19 @@ def dummy_catalogue(repo_path):
     return arbor.load_catalogue(str(cat_path))
 
 
+@_singleton_fixture
+@repo_path()
+def diffusion_catalogue(repo_path):
+    """
+    Fixture that builds the "diffusion" catalogue from `test/unit/diffusion`
+    and returns it as an `arbor.catalogue`; it is expected to provide the
+    mechanisms `neuron_with_diffusion` and `synapse_with_diffusion`.
+    """
+    path = repo_path / "test" / "unit" / "diffusion"
+    cat_path = _build_cat("diffusion", path)
+    return arbor.load_catalogue(str(cat_path))
+
+
 @_fixture
 class empty_recipe(arbor.recipe):
     """
diff --git a/python/test/readme.md b/python/test/readme.md
index 515366df87..ac4cc673e0 100644
--- a/python/test/readme.md
+++ b/python/test/readme.md
@@ -39,6 +39,6 @@ In subfolders `unit`/`unit_distributed`:
 
 ## Naming convention
 
-- modules: `test_xxxs` (ending with `s` since module can consist of multiple classes)
-- class(es): `TestXxxs` (ending with `s` since class can consist of multiple test functions)
+- modules: `test_xxxs` (where applicable, use the plural ending in `s`, since a module can comprise multiple classes)
+- class(es): `TestXxxs` (where applicable, use the plural ending in `s`, since a class can comprise multiple test functions)
 - functions: `test_yyy`
diff --git a/python/test/unit/test_diffusion.py b/python/test/unit/test_diffusion.py
new file mode 100644
index 0000000000..476a50f4c8
--- /dev/null
+++ b/python/test/unit/test_diffusion.py
@@ -0,0 +1,476 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+import arbor as A
+import numpy as np
+from .. import fixtures
+
+"""
+Tests for the concentration and amount of diffusive particles across time and morphology.
+Three different morphological structures are considered: 1 segment ("soma only"), 2 segments
+("soma with dendrite"), and 3 segments ("soma with two dendrites").
+
+NOTE: Internally, Arbor only knows concentrations. Thus, particle amounts have to be computed
+      from concentrations by integrating over the volume of the morphology. The total amount
+      of particles should be conserved unless there is deliberate injection or removal of
+      particles.
+"""
+
+
+# ---------------------------------------------------------------------------------------
+# recipe class
+class recipe(A.recipe):
+    # Constructor: stores the cell, the probes, and the injection/removal events, and sets up the cable properties
+    # - cat: catalogue of custom mechanisms
+    # - cell: cell description
+    # - probes: list of probes
+    # - inject_remove: list of dictionaries of the form [ {"time" : <time>, "synapse" : <synapse>, "change" : <change>} ],
+    #                  where <time> is the time of the event in milliseconds, <synapse> is a label, and <change> is the
+    #                  change in total particle amount in 1e-18 mol (negative values remove particles)
+    def __init__(self, cat, cell, probes, inject_remove):
+        A.recipe.__init__(self)
+        self.the_cell = cell
+        self.the_probes = probes
+        self.the_props = A.neuron_cable_properties()
+        self.the_props.catalogue = (
+            cat  # use the provided catalogue of diffusion mechanisms
+        )
+        self.the_props.set_ion("s", 1, 0, 0, 0)  # use diffusive particles "s"
+        self.inject_remove = inject_remove
+
+    # num_cells
+    # Returns the total number of cells (always 1)
+    def num_cells(self):
+        return 1
+
+    # cell_kind
+    # Returns the kind of the specified cell (always a cable cell)
+    # - gid: the identifier of the cell (ignored)
+    def cell_kind(self, gid):
+        return A.cell_kind.cable
+
+    # cell_description
+    # Returns the cell description passed to the constructor
+    # - gid: the identifier of the cell (ignored)
+    def cell_description(self, gid):
+        return self.the_cell
+
+    # probes
+    # Returns the list of probes passed to the constructor
+    # - gid: the identifier of the cell (ignored)
+    def probes(self, gid):
+        return self.the_probes
+
+    # global_properties
+    # Returns the cable properties that were set up in the constructor
+    # - kind: the kind of the specified cell (ignored)
+    def global_properties(self, kind):
+        return self.the_props
+
+    # event_generators
+    # Returns one event generator per entry of `inject_remove`: target "synapse", value "change", single event at "time"
+    # - gid: the identifier of the cell (ignored)
+    def event_generators(self, gid):
+        event_gens = []
+        for event in self.inject_remove:
+            event_gens.append(
+                A.event_generator(
+                    event["synapse"],
+                    event["change"],
+                    A.explicit_schedule([event["time"]]),
+                )
+            )
+        return event_gens
+
+
+# ---------------------------------------------------------------------------------------
+# test class
+class TestDiffusion(unittest.TestCase):
+    # Constructor (overridden); additionally sets the simulation parameters shared by all tests
+    # - args: arguments that are passed to the super class
+    def __init__(self, args):
+        super(TestDiffusion, self).__init__(args)
+
+        self.runtime = 5.00  # runtime of the whole simulation in ms
+        self.dt = 0.01  # duration of one timestep in ms (also used as sampling interval)
+        self.dev = 0.01  # accepted relative deviation (scales the `delta` passed to `assertAlmostEqual`)
+
+    # get_morph_and_decor_1_seg
+    # Method that sets up and returns a morphology, decoration, and label dictionary for one segment with the given parameters
+    # (one segment => there'll be one branch)
+    # - num_cvs_per_seg: number of CVs per segment (= number of CVs of the single branch)
+    # - length_1: axial length of the first segment in µm
+    # - radius_1: radius of the first segment in µm
+    def get_morph_and_decor_1_seg(self, num_cvs_per_seg, length_1, radius_1):
+        # ---------------------------------------------------------------------------------------
+        # set up the morphology
+        tree = A.segment_tree()
+        _ = tree.append(
+            A.mnpos,
+            A.mpoint(-length_1, 0, 0, radius_1),
+            A.mpoint(0, 0, 0, radius_1),
+            tag=0,
+        )
+        labels = A.label_dict(
+            {
+                "soma-region": "(tag 0)",
+                "soma-start": '(on-components 0.0 (region "soma-region"))',
+                "soma-center": '(on-components 0.5 (region "soma-region"))',
+                "soma-end": '(on-components 1.0 (region "soma-region"))',
+            }
+        )
+        morph = A.morphology(tree)
+
+        # ---------------------------------------------------------------------------------------
+        # decorate the morphology with mechanisms (all three synapses sit at "soma-end" here)
+        dec = A.decor()
+        dec.discretization(
+            A.cv_policy(f"(fixed-per-branch {num_cvs_per_seg})")
+        )  # use 'fixed-per-branch' policy to obtain exact number of CVs; there's one branch here
+        dec.place('"soma-end"', A.synapse("synapse_with_diffusion"), "syn_exc_A")
+        dec.place('"soma-end"', A.synapse("synapse_with_diffusion"), "syn_exc_B")
+        dec.place('"soma-end"', A.synapse("synapse_with_diffusion"), "syn_inh")
+        dec.paint("(all)", A.density("neuron_with_diffusion"))
+
+        return morph, dec, labels
+
+    # get_morph_and_decor_2_seg
+    # Method that sets up and returns a morphology, decoration, and label dictionary for two segments with the given parameters
+    # (two segments => there'll be one branch)
+    # - num_cvs_per_seg: number of CVs per segment (the single branch gets 2*num_cvs_per_seg CVs)
+    # - length_1: axial length of the first segment in µm
+    # - length_2: axial length of the second segment in µm
+    # - radius_1: radius of the first segment in µm
+    # - radius_2: radius of the second segment in µm
+    def get_morph_and_decor_2_seg(
+        self, num_cvs_per_seg, length_1, length_2, radius_1, radius_2
+    ):
+        # ---------------------------------------------------------------------------------------
+        # set up the morphology
+        tree = A.segment_tree()
+        s = tree.append(
+            A.mnpos,
+            A.mpoint(-length_1, 0, 0, radius_1),
+            A.mpoint(0, 0, 0, radius_1),
+            tag=0,
+        )
+        _ = tree.append(
+            s,
+            A.mpoint(0, 0, 0, radius_2),
+            A.mpoint(+length_2, 0, 0, radius_2),
+            tag=1,
+        )
+        labels = A.label_dict(
+            {
+                "soma-region": "(tag 0)",
+                "dendriteA-region": "(tag 1)",
+                "soma-start": '(on-components 0.0 (region "soma-region"))',
+                "soma-center": '(on-components 0.5 (region "soma-region"))',
+                "soma-end": '(on-components 1.0 (region "soma-region"))',
+                "dendriteA-center": '(on-components 0.5 (region "dendriteA-region"))',
+            }
+        )
+        morph = A.morphology(tree)
+
+        # ---------------------------------------------------------------------------------------
+        # decorate the morphology with mechanisms (one synapse on the dendrite, two at "soma-end")
+        dec = A.decor()
+        dec.discretization(
+            A.cv_policy(f"(fixed-per-branch {2*num_cvs_per_seg})")
+        )  # use 'fixed-per-branch' policy to obtain exact number of CVs; there's one branch here
+        dec.place(
+            '"dendriteA-center"', A.synapse("synapse_with_diffusion"), "syn_exc_A"
+        )
+        dec.place('"soma-end"', A.synapse("synapse_with_diffusion"), "syn_exc_B")
+        dec.place('"soma-end"', A.synapse("synapse_with_diffusion"), "syn_inh")
+        dec.paint("(all)", A.density("neuron_with_diffusion"))
+
+        return morph, dec, labels
+
+    # get_morph_and_decor_3_seg
+    # Method that sets up and returns a morphology, decoration, and label dictionary for three segments with the given parameters
+    # (three segments => there'll be three branches)
+    # - num_cvs_per_seg: number of CVs per segment (= number of CVs of each of the three branches)
+    # - length_1: axial length of the first segment in µm
+    # - length_2: axial length of the second segment in µm
+    # - length_3: axial length of the third segment in µm
+    # - radius_1: radius of the first segment in µm
+    # - radius_2: radius of the second segment in µm
+    # - radius_3: radius of the third segment in µm
+    def get_morph_and_decor_3_seg(
+        self,
+        num_cvs_per_seg,
+        length_1,
+        length_2,
+        length_3,
+        radius_1,
+        radius_2,
+        radius_3,
+    ):
+        # ---------------------------------------------------------------------------------------
+        # set up the morphology (both dendrites are appended to the end of the first segment)
+        tree = A.segment_tree()
+        s = tree.append(
+            A.mnpos,
+            A.mpoint(-length_1, 0, 0, radius_1),
+            A.mpoint(0, 0, 0, radius_1),
+            tag=0,
+        )
+        _ = tree.append(
+            s,
+            A.mpoint(0, 0, 0, radius_2),
+            A.mpoint(+length_2, 0, 0, radius_2),
+            tag=1,
+        )
+        _ = tree.append(
+            s,
+            A.mpoint(0, 0, 0, radius_3),
+            A.mpoint(+length_3, 0, 0, radius_3),
+            tag=2,
+        )
+        labels = A.label_dict(
+            {
+                "soma-region": "(tag 0)",
+                "dendriteA-region": "(tag 1)",
+                "dendriteB-region": "(tag 2)",
+                "soma-start": '(on-components 0.0 (region "soma-region"))',
+                "soma-center": '(on-components 0.5 (region "soma-region"))',
+                "soma-end": '(on-components 1.0 (region "soma-region"))',
+                "dendriteA-center": '(on-components 0.5 (region "dendriteA-region"))',
+                "dendriteB-center": '(on-components 0.5 (region "dendriteB-region"))',
+            }
+        )
+        morph = A.morphology(tree)
+
+        # ---------------------------------------------------------------------------------------
+        # decorate the morphology with mechanisms (one synapse per dendrite, one at "soma-end")
+        dec = A.decor()
+        dec.discretization(
+            A.cv_policy(f"(fixed-per-branch {num_cvs_per_seg})")
+        )  # use 'fixed-per-branch' policy to obtain exact number of CVs; there are three branches here
+        dec.place(
+            '"dendriteA-center"', A.synapse("synapse_with_diffusion"), "syn_exc_A"
+        )
+        dec.place(
+            '"dendriteB-center"', A.synapse("synapse_with_diffusion"), "syn_exc_B"
+        )
+        dec.place('"soma-end"', A.synapse("synapse_with_diffusion"), "syn_inh")
+        dec.paint("(all)", A.density("neuron_with_diffusion"))
+
+        return morph, dec, labels
+
+    # simulate_and_test_diffusion
+    # Method that runs an Arbor simulation with diffusion across different segments and subsequently
+    # performs tests on the results
+    # - cat: catalogue of custom mechanisms
+    # - num_segs: number of segments (1, 2, or 3; any other value raises a ValueError)
+    # - num_cvs_per_seg: number of CVs per segment
+    # - l_1 [optional]: axial length of the first segment in µm
+    # - l_2 [optional]: axial length of the second segment in µm
+    # - l_3 [optional]: axial length of the third segment in µm
+    # - r_1 [optional]: radius of the first segment in µm
+    # - r_2 [optional]: radius of the second segment in µm
+    # - r_3 [optional]: radius of the third segment in µm
+    def simulate_and_test_diffusion(
+        self,
+        cat,
+        num_segs,
+        num_cvs_per_seg,
+        l_1=5.0,
+        l_2=5.0,
+        l_3=5.0,
+        r_1=4.0,
+        r_2=4.0,
+        r_3=4.0,
+    ):
+        # ---------------------------------------------------------------------------------------
+        # set parameters
+        inject_remove = [
+            {"time": 0.1, "synapse": "syn_exc_A", "change": 600},
+            {"time": 0.5, "synapse": "syn_exc_B", "change": 1200},
+            {"time": 1.5, "synapse": "syn_inh", "change": -1400},
+        ]  # changes in particle amount (in 1e-18 mol)
+        diffusivity = 1  # diffusivity (in m^2/s)
+
+        # ---------------------------------------------------------------------------------------
+        # get morphology, decoration, and labels, and calculate geometrical measures
+        if num_segs == 1:
+            r_2 = l_2 = 0  # set radius and length of second segment to zero
+            r_3 = l_3 = 0  # set radius and length of third segment to zero
+            morph, dec, labels = self.get_morph_and_decor_1_seg(
+                num_cvs_per_seg, l_1, r_1
+            )  # get morphology, decoration, and labels
+            length_soma_cv = (
+                l_1 / num_cvs_per_seg
+            )  # consider 'fixed-per-branch' policy for one segment, which forms one branch
+        elif num_segs == 2:
+            r_3 = l_3 = 0  # set radius and length of third segment to zero
+            morph, dec, labels = self.get_morph_and_decor_2_seg(
+                num_cvs_per_seg, l_1, l_2, r_1, r_2
+            )  # get morphology, decoration, and labels
+            length_soma_cv = (l_1 + l_2) / (
+                2 * num_cvs_per_seg
+            )  # consider 'fixed-per-branch' policy for two segments, which only form one branch
+        elif num_segs == 3:
+            morph, dec, labels = self.get_morph_and_decor_3_seg(
+                num_cvs_per_seg, l_1, l_2, l_3, r_1, r_2, r_3
+            )  # get morphology, decoration, and labels
+            length_soma_cv = (
+                l_1 / num_cvs_per_seg
+            )  # consider 'fixed-per-branch' policy for three segments, which form three branches
+        else:
+            raise ValueError(
+                f"Specified number of segments ({num_segs}) not supported."
+            )
+        volume_soma_cv = np.pi * (
+            r_1**2 * length_soma_cv
+        )  # volume of one cylindrical CV of the first segment in µm^3
+        volume_tot = np.pi * (
+            r_1**2 * l_1 + r_2**2 * l_2 + r_3**2 * l_3
+        )  # volume of the whole setup in µm^3
+
+        # ---------------------------------------------------------------------------------------
+        # add the diffusive particle species 's'
+        dec.set_ion("s", int_con=0.0, diff=diffusivity)
+
+        # ---------------------------------------------------------------------------------------
+        # set probes
+        prb = [
+            A.cable_probe_ion_diff_concentration('"soma-start"', "s", "s_pt"),
+            A.cable_probe_density_state(
+                '"soma-start"', "neuron_with_diffusion", "sV", "sV_pt"
+            ),
+            A.cable_probe_density_state_cell(
+                "neuron_with_diffusion", "sV", "sV_cell_pt"
+            ),
+        ]
+
+        # ---------------------------------------------------------------------------------------
+        # prepare the simulation
+        cel = A.cable_cell(morph, dec, labels)
+        rec = recipe(cat, cel, prb, inject_remove)
+        sim = A.simulation(rec)
+
+        # ---------------------------------------------------------------------------------------
+        # set handles
+        hdl_s = sim.sample(
+            (0, "s_pt"), A.regular_schedule(self.dt)
+        )  # s at "soma-start"
+        hdl_sV = sim.sample(
+            (0, "sV_pt"), A.regular_schedule(self.dt)
+        )  # sV at "soma-start"
+        hdl_sV_all = sim.sample(
+            (0, "sV_cell_pt"), A.regular_schedule(self.dt)
+        )  # sV (cell-wide array)
+
+        # ---------------------------------------------------------------------------------------
+        # run the simulation
+        sim.run(dt=self.dt, tfinal=self.runtime)
+
+        # ---------------------------------------------------------------------------------------
+        # retrieve data and do the testing
+        data_s = sim.samples(hdl_s)[0][0]
+        data_sV = sim.samples(hdl_sV)[0][0]
+        tmp_data = sim.samples(hdl_sV_all)[0][0]
+        data_sV_total = np.zeros_like(tmp_data[:, 0])
+        num_cvs = len(tmp_data[0, :]) - 1  # column 0 is not a CV value (presumably the sample time - TODO confirm)
+        for i in range(
+            len(tmp_data[0, :]) - 1
+        ):  # compute the total amount of particles by summing over all CVs of the whole neuron (columns 1..num_cvs)
+            data_sV_total += tmp_data[:, i + 1]
+
+        # final value of the total particle amount of s (net sum of all injections and removals)
+        sV_tot_lim_expected = 0
+        for event in inject_remove:
+            sV_tot_lim_expected += event["change"]
+
+        # final value of the concentration of s (total particle amount divided by total volume)
+        s_lim_expected = sV_tot_lim_expected / volume_tot
+
+        # maximum value of the total particle amount of s (sum of all injections; valid as they all precede the removal)
+        sV_tot_max_expected = 0
+        for event in inject_remove:
+            if event["change"] > 0:
+                sV_tot_max_expected += event["change"]
+
+        # maximum value of the concentration of s (total particle amount divided by total volume)
+        s_max_expected = sV_tot_max_expected / volume_tot
+
+        # tests
+        if num_segs < 3:
+            self.assertEqual(morph.num_branches, 1)  # number of branches (1 expected)
+        else:
+            self.assertEqual(
+                morph.num_branches, 3
+            )  # number of branches (3 expected, see https://docs.arbor-sim.org/en/latest/concepts/morphology.html)
+        self.assertEqual(num_cvs, num_segs * num_cvs_per_seg)  # total number of CVs
+        self.assertAlmostEqual(
+            data_s[-1, 1], s_lim_expected, delta=self.dev * s_lim_expected
+        )  # equilibrium concentration lim_{t->inf}(s) [direct]
+        self.assertAlmostEqual(
+            data_sV[-1, 1] / volume_soma_cv,
+            s_lim_expected,
+            delta=self.dev * s_lim_expected,
+        )  # equilibrium concentration lim_{t->inf}(s) [estimated]
+        self.assertAlmostEqual(
+            np.max(data_s[:, 1]), s_max_expected, delta=self.dev * s_max_expected
+        )  # maximum concentration max_{t}(s) [direct]
+        self.assertAlmostEqual(
+            data_sV_total[-1],
+            sV_tot_lim_expected,
+            delta=self.dev * sV_tot_lim_expected,
+        )  # equilibrium particle amount lim_{t->inf}(s⋅V) [direct]
+        self.assertAlmostEqual(
+            data_sV[-1, 1] / volume_soma_cv * volume_tot,
+            sV_tot_lim_expected,
+            delta=self.dev * sV_tot_lim_expected,
+        )  # equilibrium particle amount lim_{t->inf}(s⋅V) [estimated]
+        self.assertAlmostEqual(
+            np.max(data_sV_total),
+            sV_tot_max_expected,
+            delta=self.dev * sV_tot_max_expected,
+        )  # maximum particle amount max_{t}(s⋅V) [direct]
+
+    # test_diffusion_equal_radii
+    # Test: simulations with 1, 2, and 3 segments of equal length and equal radius (150 CVs in total each)
+    # - diffusion_catalogue: catalogue of diffusion mechanisms
+    @fixtures.diffusion_catalogue()
+    def test_diffusion_equal_radii(self, diffusion_catalogue):
+        self.simulate_and_test_diffusion(
+            diffusion_catalogue, 1, 150, l_1=5, r_1=4
+        )  # 1 segment with length 5 µm and radius 4 µm
+        self.simulate_and_test_diffusion(
+            diffusion_catalogue, 2, 75, l_1=5, l_2=5, r_1=4, r_2=4
+        )  # 2 segments with length 5 µm and radius 4 µm
+        self.simulate_and_test_diffusion(
+            diffusion_catalogue, 3, 50, l_1=5, l_2=5, l_3=5, r_1=4, r_2=4, r_3=4
+        )  # 3 segments with length 5 µm and radius 4 µm
+
+    # test_diffusion_different_length
+    # Test: simulations with segments of different length but equal radius (150 CVs in total each)
+    # - diffusion_catalogue: catalogue of diffusion mechanisms
+    @fixtures.diffusion_catalogue()
+    def test_diffusion_different_length(self, diffusion_catalogue):
+        self.simulate_and_test_diffusion(
+            diffusion_catalogue, 1, 150, l_1=5, r_1=4
+        )  # 1 segment with length 5 µm and radius 4 µm
+        self.simulate_and_test_diffusion(
+            diffusion_catalogue, 2, 75, l_1=5, l_2=3, r_1=4, r_2=4
+        )  # 2 segments with lengths 5 µm and 3 µm, and radius 4 µm
+        self.simulate_and_test_diffusion(
+            diffusion_catalogue, 3, 50, l_1=5, l_2=3, l_3=3, r_1=4, r_2=4, r_3=4
+        )  # 3 segments with lengths 5 µm, 3 µm, and 3 µm, and radius 4 µm
+
+    """ TODO: not succeeding as of Arbor v0.9.0:
+    # test_diffusion_different_radii
+    # Test: simulations with segments of equal length but different radius
+    # - diffusion_catalogue: catalogue of diffusion mechanisms
+    @fixtures.diffusion_catalogue()
+    def test_diffusion_different_radii(self, diffusion_catalogue):
+        self.simulate_and_test_diffusion(
+            diffusion_catalogue, 2, 75, l_1=5, l_2=5, r_1=4, r_2=6
+        )  # 2 segments with radius 4 µm and 6 µm
+        self.simulate_and_test_diffusion(
+            diffusion_catalogue, 3, 50, l_1=5, l_2=5, l_3=5, r_1=4, r_2=6, r_3=6
+        )  # 3 segments with radius 4 µm and 6 µm
+    """
diff --git a/test/unit/diffusion/neuron_with_diffusion.mod b/test/unit/diffusion/neuron_with_diffusion.mod
new file mode 100644
index 0000000000..2feb697b2e
--- /dev/null
+++ b/test/unit/diffusion/neuron_with_diffusion.mod
@@ -0,0 +1,23 @@
+NEURON {
+	SUFFIX neuron_with_diffusion
+	USEION s READ sd
+}
+
+PARAMETER {
+	area : surface area of the CV (in µm^2, internal variable)
+	diam : CV diameter (in µm, internal variable)
+}
+
+ASSIGNED {
+	volume : volume of the CV (conversion factor between concentration and particle amount, in µm^3)
+	sV : particle amount in the CV (in 1e-18 mol)
+}
+
+INITIAL {
+	volume = area*diam/4 : cylinder volume: area*diam/4 = area*r/2 = 2*pi*r*h*r/2 = pi*r^2*h
+	sV = sd * volume : initial particle amount from the initial diffusive concentration
+}
+
+BREAKPOINT {
+	sV = sd * volume : convert the diffusive concentration sd into the particle amount of this CV
+}
diff --git a/test/unit/diffusion/synapse_with_diffusion.mod b/test/unit/diffusion/synapse_with_diffusion.mod
new file mode 100644
index 0000000000..4bb89a847b
--- /dev/null
+++ b/test/unit/diffusion/synapse_with_diffusion.mod
@@ -0,0 +1,24 @@
+NEURON {
+	POINT_PROCESS synapse_with_diffusion
+	USEION s WRITE sd
+}
+
+PARAMETER {
+	area : surface area of the CV (in µm^2, internal variable)
+	diam : CV diameter (in µm, internal variable)
+}
+
+ASSIGNED {
+	volume : volume of the CV (conversion factor between concentration and particle amount, in µm^3)
+}
+
+INITIAL {
+	volume = area*diam/4 : cylinder volume: area*diam/4 = area*r/2 = 2*pi*r*h*r/2 = pi*r^2*h
+}
+
+BREAKPOINT {
+}
+
+NET_RECEIVE(weight) {
+	sd = sd + weight * area / volume / 1000 : change sd by the event weight, scaled by area/volume/1000 (NOTE(review): origin of the area/1000 factor is not shown here - confirm)
+}
diff --git a/test/unit/test_reduce_by_key.cu b/test/unit/test_reduce_by_key.cu
index 44dcab912a..5b5763c95e 100644
--- a/test/unit/test_reduce_by_key.cu
+++ b/test/unit/test_reduce_by_key.cu
@@ -89,8 +89,8 @@ TEST(reduce_by_key, scatter)
     // onto an array of length 12.
     std::size_t n = 12;
     std::vector<int> index = {0,0,0,1,2,2,2,2,3,3,7,7,7,7,7,11};
-    std::vector<double> in(index.size(), 1);
-    std::vector<double> expected = {3., 1., 4., 2., 0., 0., 0., 5., 0., 0., 0., 1.};
+    std::vector<double> in(index.size(), 0.5);
+    std::vector<double> expected = {1.5, 0.5, 2., 1., 0., 0., 0., 2.5, 0., 0., 0., 0.5};
 
     EXPECT_EQ(n, expected.size());