Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tests for particle diffusion II #2245

Draft
wants to merge 27 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
85486cf
Tests for particle diffusion added (for different morphologies; also …
jlubo Aug 15, 2023
6062daf
Black style formatting applied
jlubo Aug 15, 2023
890f315
Black style formatting (v23.7.0) applied
jlubo Aug 16, 2023
1bd43cc
Cleanup
jlubo Aug 16, 2023
72b0d61
Black reformatting
jlubo Aug 16, 2023
5b9ecd3
Revision: list of dictionaries for stimulation added; small improvements
jlubo Aug 24, 2023
e6b48b9
Split function to reduce complexity (following flake8 complaint)
jlubo Aug 24, 2023
2c1e802
Black style applied
jlubo Aug 24, 2023
7f77d60
Revision: discretization policy simplified
jlubo Aug 25, 2023
0bcc339
tests for different segment length added (also fail as of v0.9.0); co…
jlubo Sep 27, 2023
b78d1cb
Merge branch 'master' into tests_for_particle_diffusion
jlubo Sep 27, 2023
8fe2207
fix: apply 'estimated' tests only if radius is fixed
jlubo Oct 7, 2023
51d3990
disabled 'test_diffusion_different_length' and 'test_diffusion_differ…
jlubo Oct 10, 2023
c3b5bf7
generalized tests for 'estimated' equilibrium values; annotations added
jlubo Oct 12, 2023
b334f29
Merge branch 'master' into tests_for_particle_diffusion
jlubo Oct 12, 2023
da87dea
fixed estimated values in two segment-case
jlubo Oct 14, 2023
e72bace
split function 'get_morph_and_decor()' into special cases for 1,2,3 s…
jlubo Oct 14, 2023
455b242
scratch field for diffusion
boeschf Jan 9, 2024
48ea3c8
Merge remote-tracking branch 'upstream/master' into fix/diffusive_con…
boeschf Jan 9, 2024
0c52db8
gpu fix: backend, shfl_down signature, gpu printer
boeschf Jan 10, 2024
fcf48f4
Merge remote-tracking branch 'jlubo/tests_for_particle_diffusion' int…
boeschf Jan 10, 2024
979605f
hack to remove wrong ballot
boeschf Jan 10, 2024
da48a69
Merge branch 'fix/diffusive_concentrations' into fix/diffusive_concen…
boeschf Jan 10, 2024
5fe327a
probe labels
boeschf Jan 10, 2024
6d284bc
test diffusion different length
boeschf Jan 10, 2024
28c642f
formatting
boeschf Jan 10, 2024
b73ea1a
Merge branch 'fix/diffusive_concentrations_jlubo' of github.com:boesc…
boeschf Jan 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 51 additions & 12 deletions arbor/backends/gpu/shared_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@
#include "util/range.hpp"
#include "util/strprintf.hpp"

#include <iostream>

using arb::memory::make_const_view;

namespace arb {
Expand All @@ -37,6 +35,8 @@ void take_samples_impl(

void add_scalar(std::size_t n, arb_value_type* data, arb_value_type v);

void apply_diffusive_concentration_delta_impl(std::size_t, arb_value_type*, arb_value_type* const *, arb_index_type const *);

// GPU-side minmax: consider CUDA kernel replacement.
std::pair<arb_value_type, arb_value_type> minmax_value_impl(arb_size_type n, const arb_value_type* v) {
auto v_copy = memory::on_host(memory::const_device_view<arb_value_type>(v, n));
Expand Down Expand Up @@ -201,6 +201,12 @@ shared_state::shared_state(task_system_handle tp,
add_scalar(temperature_degC.size(), temperature_degC.data(), -273.15);
}

// Fold the per-mechanism diffusive-concentration deltas accumulated on the
// device back into each ion's Xd field (the kernel also resets the deltas
// to zero for the next integration step).
void shared_state::apply_diffusive_concentration_delta() {
    for (auto& [ion_name, ion]: ion_data) {
        const auto n_contrib = ion.Xd_contribution.size();
        apply_diffusive_concentration_delta_impl(n_contrib,
                                                 ion.Xd_.data(),
                                                 ion.Xd_contribution_d.data(),
                                                 ion.Xd_index_d.data());
    }
}

void shared_state::update_prng_state(mechanism& m) {
if (!m.mech_.n_random_variables) return;
auto const mech_id = m.mechanism_id();
Expand Down Expand Up @@ -250,20 +256,23 @@ void shared_state::instantiate(mechanism& m,
store.globals_ = std::vector<arb_value_type>(m.mech_.n_globals);

// Set ion views
arb_size_type n_ions_with_written_xd = 0;
for (auto idx: make_span(m.mech_.n_ions)) {
auto ion = m.mech_.ions[idx].name;
auto ion_binding = value_by_key(overrides.ion_rebind, ion).value_or(ion);
ion_state* oion = ptr_by_key(ion_data, ion_binding);
if (!oion) throw arbor_internal_error("gpu/mechanism: mechanism holds ion with no corresponding shared state");
auto& ion_state = store.ion_states_[idx];
ion_state = {0};
ion_state.current_density = oion->iX_.data();
ion_state.reversal_potential = oion->eX_.data();
ion_state.internal_concentration = oion->Xi_.data();
ion_state.external_concentration = oion->Xo_.data();
ion_state.diffusive_concentration = oion->Xd_.data();
ion_state.ionic_charge = oion->charge.data();
ion_state.conductivity = oion->gX_.data();
auto& ion_state_ = store.ion_states_[idx]; // arb_ion_state
ion_state_ = {0};
ion_state_.current_density = oion->iX_.data();
ion_state_.reversal_potential = oion->eX_.data();
ion_state_.internal_concentration = oion->Xi_.data();
ion_state_.external_concentration = oion->Xo_.data();
ion_state_.diffusive_concentration = oion->Xd_.data();
ion_state_.ionic_charge = oion->charge.data();
ion_state_.conductivity = oion->gX_.data();

n_ions_with_written_xd += m.mech_.ions[idx].write_diff_concentration;
}

// If there are no sites (is this ever meaningful?) there is nothing more to do.
Expand All @@ -272,7 +281,7 @@ void shared_state::instantiate(mechanism& m,
// Allocate and initialize state and parameter vectors with default values.
{
// Allocate bulk storage
std::size_t count = (m.mech_.n_state_vars + m.mech_.n_parameters + 1)*width_padded + m.mech_.n_globals;
std::size_t count = (m.mech_.n_state_vars + m.mech_.n_parameters + 1 + n_ions_with_written_xd)*width_padded + m.mech_.n_globals;
store.data_ = array(count, NAN);
chunk_writer writer(store.data_.data(), width_padded);

Expand All @@ -295,6 +304,11 @@ void shared_state::instantiate(mechanism& m,
for (auto idx: make_span(m.mech_.n_state_vars)) {
store.state_vars_[idx] = writer.fill(m.mech_.state_vars[idx].default_value);
}
// Set diffusive concentration deltas if needed
for (auto idx: make_span(m.mech_.n_ions)) {
store.ion_states_[idx].diffusive_concentration_delta =
m.mech_.ions[idx].write_diff_concentration ? writer.fill(0) : nullptr;
}
// Assign global scalar parameters. NB: Last chunk, since it breaks the width striding.
for (auto idx: make_span(m.mech_.n_globals)) store.globals_[idx] = m.mech_.globals[idx].default_value;
for (auto& [k, v]: overrides.globals) {
Expand Down Expand Up @@ -332,6 +346,31 @@ void shared_state::instantiate(mechanism& m,
auto indices = util::index_into(pos_data.cv, ni);
std::vector<arb_index_type> mech_ion_index(indices.begin(), indices.end());
store.ion_states_[idx].index = writer.append_with_padding(mech_ion_index, 0);

if (m.mech_.ions[idx].write_diff_concentration) {
auto& Xd_contribution_map = oion->Xd_contribution_map;
auto& Xd_contribution = oion->Xd_contribution;
auto& Xd_index = oion->Xd_index;
auto& Xd_contribution_d = oion->Xd_contribution_d;
auto& Xd_index_d = oion->Xd_index_d;
util::append(
Xd_contribution_map,
util::transform_view(
util::make_span(width),
[mech_ion_index, d = store.ion_states_[idx].diffusive_concentration_delta](const auto& i) {
return std::make_pair(d+i, mech_ion_index[i]);
}));
// sort contribution map according to index and transpose from AoS to SoA
util::stable_sort_by(Xd_contribution_map, [](const auto& p) { return p.second; });
Xd_contribution.clear(); Xd_contribution.reserve(Xd_contribution_map.size());
Xd_index.clear(); Xd_index.reserve(Xd_contribution_map.size());
for (auto [ptr, idx_] : Xd_contribution_map) {
Xd_contribution.push_back(ptr);
Xd_index.push_back(idx_);
}
Xd_contribution_d = memory::on_gpu(Xd_contribution);
Xd_index_d = memory::on_gpu(Xd_index);
}
}

m.ppack_.multiplicity = mult_in_place? writer.append_with_padding(pos_data.multiplicity, 0): nullptr;
Expand Down
23 changes: 23 additions & 0 deletions arbor/backends/gpu/shared_state.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <arbor/gpu/gpu_api.hpp>
#include <arbor/gpu/gpu_common.hpp>
#include <arbor/gpu/reduce_by_key.hpp>

namespace arb {
namespace gpu {
Expand Down Expand Up @@ -40,6 +41,20 @@ __global__ void take_samples_impl(
}
}

// Scatter-add the per-mechanism diffusive concentration deltas into the
// shared per-CV field Xd, then reset each delta to zero.
//
// n               number of (delta pointer, CV index) pairs to process
// Xd              per-CV diffusive concentration values, updated in place
// Xd_contribution pointers to the accumulated delta values (read, then zeroed)
// Xd_index        target index in Xd for each contribution; multiple entries
//                 may target the same index — TODO confirm host side keeps
//                 them sorted so the keyed reduction can coalesce them
__global__ void apply_diffusive_concentration_delta_impl(
    std::size_t n,
    arb_value_type * __restrict__ const Xd,
    arb_value_type * const * __restrict__ const Xd_contribution,
    arb_index_type const * __restrict__ const Xd_index)
{
    const unsigned i = threadIdx.x+blockIdx.x*blockDim.x;
    // Active-lane mask for this warp: lanes with i >= n do not participate.
    // reduce_by_key needs the mask to safely combine contributions from
    // lanes that share the same Xd_index without conflicting writes.
    const unsigned mask = gpu::ballot(0xffffffff, i<n);
    if (i < n) {
        reduce_by_key(*Xd_contribution[i], Xd, Xd_index[i], mask);
        // Consume the delta so the next step starts from a clean slate.
        *Xd_contribution[i] = 0;
    }
}

} // namespace kernel

void add_scalar(std::size_t n, arb_value_type* data, arb_value_type v) {
Expand All @@ -53,5 +68,13 @@ void take_samples_impl(
launch_1d(s.size(), 128, kernel::take_samples_impl, s.begin_marked, s.end_marked, time, sample_time, sample_value);
}

// Host-side entry point: launch the kernel that folds the accumulated
// diffusive-concentration deltas into Xd and clears them afterwards.
void apply_diffusive_concentration_delta_impl(
    std::size_t n,
    arb_value_type* Xd,
    arb_value_type* const* Xd_contribution,
    arb_index_type const* Xd_index) {
    // Same block size as the other shared-state kernels in this file.
    constexpr unsigned block_dim = 128;
    launch_1d(n, block_dim, kernel::apply_diffusive_concentration_delta_impl,
              n, Xd, Xd_contribution, Xd_index);
}

} // namespace gpu
} // namespace arb
8 changes: 8 additions & 0 deletions arbor/backends/gpu/shared_state.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ struct ARB_ARBOR_API ion_state {

array charge; // charge of ionic species (global, length 1)

std::vector<std::pair<arb_value_type*,arb_index_type>> Xd_contribution_map;
std::vector<arb_value_type*> Xd_contribution;
std::vector<arb_index_type> Xd_index;
memory::device_vector<arb_value_type*> Xd_contribution_d;
memory::device_vector<arb_index_type> Xd_index_d;

solver_ptr solver = nullptr;

ion_state() = default;
Expand Down Expand Up @@ -224,6 +230,8 @@ struct ARB_ARBOR_API shared_state: shared_state_base<shared_state, array, ion_st
const mechanism_layout&,
const std::vector<std::pair<std::string, std::vector<arb_value_type>>>&);

void apply_diffusive_concentration_delta();

void update_prng_state(mechanism&);

void zero_currents();
Expand Down
44 changes: 43 additions & 1 deletion arbor/backends/multicore/shared_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,17 @@ std::size_t extend_width(const arb::mechanism& mech, std::size_t width) {
}
} // anonymous namespace

// Fold the per-mechanism diffusive-concentration deltas back into each ion's
// Xd field, zeroing every delta as it is consumed so the next integration
// step starts from a clean accumulator.
void shared_state::apply_diffusive_concentration_delta() {
    for (auto& [ion_name, ion]: ion_data) {
        auto* xd = ion.Xd_.data();
        const auto& contrib = ion.Xd_contribution;
        const auto& index = ion.Xd_index;
        for (std::size_t k = 0; k < contrib.size(); ++k) {
            // std::exchange reads the delta and resets it to 0 in one step.
            xd[index[k]] += std::exchange(*contrib[k], 0);
        }
    }
}

void shared_state::update_prng_state(mechanism& m) {
if (!m.mech_.n_random_variables) return;
const auto mech_id = m.mechanism_id();
Expand Down Expand Up @@ -418,6 +429,7 @@ void shared_state::instantiate(arb::mechanism& m,
store.ion_states_.resize(m.mech_.n_ions); m.ppack_.ion_states = store.ion_states_.data();

// Set ion views
arb_size_type n_ions_with_written_xd = 0;
for (auto idx: make_span(m.mech_.n_ions)) {
auto ion = m.mech_.ions[idx].name;
auto ion_binding = value_by_key(overrides.ion_rebind, ion).value_or(ion);
Expand All @@ -433,6 +445,8 @@ void shared_state::instantiate(arb::mechanism& m,
ion_state.diffusive_concentration = oion->Xd_.data();
ion_state.ionic_charge = oion->charge.data();
ion_state.conductivity = oion->gX_.data();

n_ions_with_written_xd += m.mech_.ions[idx].write_diff_concentration;
}

// Initialize state and parameter vectors with default values.
Expand All @@ -446,7 +460,7 @@ void shared_state::instantiate(arb::mechanism& m,
// Allocate bulk storage
std::size_t value_width_padded = extend_width<arb_value_type>(m, pos_data.cv.size());
store.value_width_padded = value_width_padded;
std::size_t count = (m.mech_.n_state_vars + m.mech_.n_parameters + 1 +
std::size_t count = (m.mech_.n_state_vars + m.mech_.n_parameters + 1 + n_ions_with_written_xd +
random_number_storage)*value_width_padded + m.mech_.n_globals;
store.data_ = array(count, NAN, pad);
chunk_writer writer(store.data_.data(), value_width_padded);
Expand All @@ -470,6 +484,11 @@ void shared_state::instantiate(arb::mechanism& m,
for (auto idx: make_span(m.mech_.n_state_vars)) {
m.ppack_.state_vars[idx] = writer.fill(m.mech_.state_vars[idx].default_value);
}
// Set diffusive concentration deltas if needed
for (auto idx: make_span(m.mech_.n_ions)) {
m.ppack_.ion_states[idx].diffusive_concentration_delta =
m.mech_.ions[idx].write_diff_concentration ? writer.fill(0) : nullptr;
}
// Set random numbers
for (auto idx_v: make_span(num_random_numbers_per_cv))
for (auto idx_c: make_span(cbprng::cache_size()))
Expand Down Expand Up @@ -530,7 +549,30 @@ void shared_state::instantiate(arb::mechanism& m,
m.ppack_.ion_states[idx].index = writer.append(indices, util::back(indices));
// Check SIMD constraints
arb_assert(compatible_index_constraints(node_index, util::range_n(m.ppack_.ion_states[idx].index, index_width_padded), m.iface_.partition_width));

if (m.mech_.ions[idx].write_diff_concentration) {
auto mech_ion_index = m.ppack_.ion_states[idx].index;
auto& Xd_contribution_map = oion->Xd_contribution_map;
auto& Xd_contribution = oion->Xd_contribution;
auto& Xd_index = oion->Xd_index;
util::append(
Xd_contribution_map,
util::transform_view(
util::make_span(width),
[mech_ion_index, d = store.ion_states_[idx].diffusive_concentration_delta](const auto& i) {
return std::make_pair(d+i, mech_ion_index[i]);
}));
// sort contribution map according to index and transpose from AoS to SoA
util::stable_sort_by(Xd_contribution_map, [](const auto& p) { return p.second; });
Xd_contribution.clear(); Xd_contribution.reserve(Xd_contribution_map.size());
Xd_index.clear(); Xd_index.reserve(Xd_contribution_map.size());
for (auto [ptr, idx] : Xd_contribution_map) {
Xd_contribution.push_back(ptr);
Xd_index.push_back(idx);
}
}
}

if (mult_in_place) m.ppack_.multiplicity = writer.append(pos_data.multiplicity, 0);
// `peer_index` holds the peer CV of each CV in node_index.
// Peer CVs are only filled for gap junction mechanisms. They are used
Expand Down
6 changes: 6 additions & 0 deletions arbor/backends/multicore/shared_state.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ struct ARB_ARBOR_API ion_state {

array charge; // charge of ionic species (global value, length 1)

std::vector<std::pair<arb_value_type*,arb_index_type>> Xd_contribution_map;
std::vector<arb_value_type*> Xd_contribution;
std::vector<arb_index_type> Xd_index;

solver_ptr solver = nullptr;

ion_state() = default;
Expand Down Expand Up @@ -231,6 +235,8 @@ struct ARB_ARBOR_API shared_state:
const mechanism_layout&,
const std::vector<std::pair<std::string, std::vector<arb_value_type>>>&);

void apply_diffusive_concentration_delta();

void update_prng_state(mechanism&);

void zero_currents();
Expand Down
6 changes: 6 additions & 0 deletions arbor/fvm_lowered_cell_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ fvm_integration_result fvm_lowered_cell_impl<Backend>::integrate(
m->update_current();
}

state_->apply_diffusive_concentration_delta();

// Add stimulus current contributions.
// NOTE: performed after dt, time_to calculation, in case we want to
// use mean current contributions as opposed to point sample.
Expand All @@ -236,11 +238,15 @@ fvm_integration_result fvm_lowered_cell_impl<Backend>::integrate(
m->update_state();
}

state_->apply_diffusive_concentration_delta();

// Update ion concentrations.
PE(advance:integrate:ionupdate);
update_ion_state();
PL();

// TODO: can `write_ions` affect xd?

// voltage mechs run now; after the cable_solver, but before the
// threshold test
for (auto& m: voltage_mechanisms_) {
Expand Down
32 changes: 6 additions & 26 deletions arbor/include/arbor/gpu/cuda_api.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,17 +139,6 @@ inline float gpu_atomic_sub(float* address, float val) {

/// Warp-Level Primitives

__device__ __inline__ double shfl(unsigned mask, double x, int lane)
{
auto tmp = static_cast<uint64_t>(x);
auto lo = static_cast<unsigned>(tmp);
auto hi = static_cast<unsigned>(tmp >> 32);
hi = __shfl_sync(mask, static_cast<int>(hi), lane, warpSize);
lo = __shfl_sync(mask, static_cast<int>(lo), lane, warpSize);
return static_cast<double>(static_cast<uint64_t>(hi) << 32 |
static_cast<uint64_t>(lo));
}

__device__ __inline__ unsigned ballot(unsigned mask, unsigned is_root) {
return __ballot_sync(mask, is_root);
}
Expand All @@ -158,24 +147,15 @@ __device__ __inline__ unsigned any(unsigned mask, unsigned width) {
return __any_sync(mask, width);
}

#ifdef __NVCC__
__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
return __shfl_up_sync(mask, idx, shift);
}

__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
return __shfl_down_sync(mask, idx, shift);
template<typename T>
__device__ __inline__ T shfl_up(unsigned mask, T var, unsigned lane_id, unsigned shift) {
return __shfl_up_sync(mask, var, shift);
}

#else
__device__ __inline__ double shfl_up(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
return shfl(mask, idx, lane_id - shift);
template<typename T>
__device__ __inline__ T shfl_down(unsigned mask, T var, unsigned lane_id, unsigned shift) {
return __shfl_down_sync(mask, var, shift);
}

__device__ __inline__ double shfl_down(unsigned mask, int idx, unsigned lane_id, unsigned shift) {
return shfl(mask, idx, lane_id + shift);
}
#endif
#endif

} // namespace gpu
Expand Down
Loading
Loading