diff --git a/CompasToolkit.jl/src/CompasToolkit.jl b/CompasToolkit.jl/src/CompasToolkit.jl index fd9e8c5..1714608 100644 --- a/CompasToolkit.jl/src/CompasToolkit.jl +++ b/CompasToolkit.jl/src/CompasToolkit.jl @@ -57,6 +57,14 @@ function get_context()::Context end end +""" +Waits until all asynchronous operations of Compas have finished. +""" +function synchronize() + context = get_context() + @ccall LIBRARY.compas_synchronize(pointer(context)::Ptr{Cvoid})::Cvoid +end + """ Object representing an array of size `N` and type `T`. """ diff --git a/include/compas/core/context.h b/include/compas/core/context.h index fc2d558..7c4cebb 100644 --- a/include/compas/core/context.h +++ b/include/compas/core/context.h @@ -40,6 +40,27 @@ struct CompasContext { return m_runtime.allocate(content.data(), content.size()); } + template + void + parallel_device(kmm::NDRange index_space, kmm::NDSize chunk_size, F fun, Args... args) const { + m_runtime + .parallel_submit(index_space, kmm::TaskPartitioner(chunk_size), kmm::GPU(fun), args...); + } + + template + void parallel_kernel( + kmm::NDRange index_space, + kmm::NDSize chunk_size, + dim3 block_dim, + F kernel, + Args... args) const { + m_runtime.parallel_submit( + index_space, + kmm::TaskPartitioner(chunk_size), + kmm::GPUKernel(kernel, block_dim), + args...); + } + template void submit_device(kmm::NDRange index_space, F fun, Args... args) const { m_runtime.submit(index_space, m_device, kmm::GPU(fun), args...); @@ -58,6 +79,10 @@ struct CompasContext { m_runtime.synchronize(); } + const kmm::Runtime& runtime() const { + return m_runtime; + } + private: kmm::Runtime m_runtime; kmm::DeviceId m_device; diff --git a/include/compas/core/view.h b/include/compas/core/view.h index d2d036e..f4ebecd 100644 --- a/include/compas/core/view.h +++ b/include/compas/core/view.h @@ -5,11 +5,23 @@ namespace compas { using index_t = int; -template -using view = kmm::view; - -template -using view_mut = kmm::view_mut; +using kmm::strided_subview; +using kmm::strided_subview_mut; +using kmm::strided_view; +using kmm::strided_view_mut; +using kmm::subview; +using kmm::subview_mut; +using kmm::view; +using kmm::view_mut; + +using kmm::gpu_strided_subview; +using kmm::gpu_strided_subview_mut; +using kmm::gpu_strided_view; +using kmm::gpu_strided_view_mut; +using kmm::gpu_subview; +using kmm::gpu_subview_mut; +using kmm::gpu_view; +using kmm::gpu_view_mut; template using host_view = kmm::view; @@ -17,19 +29,4 @@ using host_view = kmm::view; template using host_view_mut = kmm::view_mut; -template -using gpu_view = kmm::gpu_view; - -template -using gpu_view_mut = kmm::gpu_view_mut; - -template -using gpu_subview_mut = kmm::gpu_subview_mut; - -template -using gpu_strided_view = kmm::gpu_strided_view; - -template -using gpu_strided_view_mut = kmm::gpu_strided_view_mut; - } // namespace compas \ No newline at end of file diff --git a/include/compas/parameters/tissue.h b/include/compas/parameters/tissue.h index 995cf19..61ddced 100644 --- a/include/compas/parameters/tissue.h +++ b/include/compas/parameters/tissue.h @@ -13,20 +13,17 @@ namespace compas { * column is a voxel. */ struct TissueParameters: public Object { - Array parameters; // Size: [TissueParameterField::NUM_FIELDS, nvoxels] + Array data; // Size: [TissueParameterField::NUM_FIELDS, nvoxels] int nvoxels; + int chunk_size; bool has_z = true; bool has_b0 = true; bool has_b1 = true; - TissueParameters( - Array parameters, - int nvoxels, - bool has_z, - bool has_b0, - bool has_b1) : - parameters(parameters), - nvoxels(nvoxels), + TissueParameters(Array parameters, bool has_z, bool has_b0, bool has_b1) : + data(parameters), + nvoxels(kmm::checked_cast(parameters.size(1))), + chunk_size(kmm::checked_cast(parameters.chunk_size(1))), has_z(has_z), has_b0(has_b0), has_b1(has_b1) {} @@ -35,6 +32,7 @@ struct TissueParameters: public Object { TissueParameters make_tissue_parameters( const CompasContext& ctx, int num_voxels, + int chunk_size, view T1, view T2, view B1, @@ -43,7 +41,7 @@ TissueParameters make_tissue_parameters( view rho_y, view x, view y, - view z); + view z = {}); } // namespace compas @@ -53,13 +51,12 @@ struct Argument { using type = compas::TissueParametersView; static Argument pack(TaskBuilder& builder, compas::TissueParameters p) { - return { - {.parameters = {}, // - .nvoxels = p.nvoxels, - .has_z = p.has_z, - .has_b0 = p.has_b0, - .has_b1 = p.has_b1}, - pack_argument(builder, p.parameters)}; + compas::TissueParametersView view; + view.has_z = p.has_z; + view.has_b0 = p.has_b0; + view.has_b1 = p.has_b1; + + return {view, pack_argument(builder, p.data)}; } template diff --git a/include/compas/parameters/tissue_view.cuh b/include/compas/parameters/tissue_view.cuh index eb66070..b57c3fe 100644 --- a/include/compas/parameters/tissue_view.cuh +++ b/include/compas/parameters/tissue_view.cuh @@ -39,11 +39,12 @@ struct TissueVoxel { * Device-side representation of `TissueParameters` */ struct TissueParametersView { - gpu_view parameters; - int nvoxels; - bool has_z; - bool has_b0; - bool has_b1; + TissueParametersView(gpu_subview parameters = {}) : parameters(parameters) {} + + gpu_subview parameters; + bool has_z = true; + bool has_b0 = true; + bool has_b1 = true; /** * Returns the parameters for the voxel at location `i`. diff --git a/julia-bindings/src/bindings.cpp b/julia-bindings/src/bindings.cpp index afcf099..576ecb0 100644 --- a/julia-bindings/src/bindings.cpp +++ b/julia-bindings/src/bindings.cpp @@ -54,6 +54,10 @@ extern "C" void compas_destroy_context(const compas::CompasContext* ctx) { return catch_exceptions([&] { delete ctx; }); } +extern "C" void compas_synchronize(const compas::CompasContext* ctx) { + return catch_exceptions([&] { ctx->synchronize(); }); +} + extern "C" const kmm::ArrayBase* compas_make_array_float( const compas::CompasContext* context, const float* data_ptr, @@ -130,10 +134,14 @@ extern "C" const compas::TissueParameters* compas_make_tissue_parameters( const float* x, const float* y, const float* z) { + int num_devices = int(context->runtime().info().num_devices()); + int chunk_size = kmm::round_up_to_multiple(kmm::div_ceil(nvoxels, num_devices), 32); + return catch_exceptions([&] { auto params = compas::make_tissue_parameters( *context, nvoxels, + chunk_size, make_view(T1, nvoxels), make_view(T2, nvoxels), make_view(B1, nvoxels), diff --git a/src/jacobian/hermitian.cu b/src/jacobian/hermitian.cu index 5981151..6f3283b 100644 --- a/src/jacobian/hermitian.cu +++ b/src/jacobian/hermitian.cu @@ -74,8 +74,8 @@ Array compute_jacobian_hermitian( echos, \ delta_echos_T1, \ delta_echos_T2, \ - parameters.parameters.size(1), \ - parameters.parameters, \ + parameters.data.size(1), \ + parameters.data, \ coil_sensitivities, \ vector, \ E, \ diff --git a/src/jacobian/product.cu b/src/jacobian/product.cu index 58a8957..b1a6221 100644 --- a/src/jacobian/product.cu +++ b/src/jacobian/product.cu @@ -68,8 +68,8 @@ Array compute_jacobian( echos, \ delta_echos_T1, \ delta_echos_T2, \ - parameters.parameters.size(1), \ - parameters.parameters, \ + parameters.data.size(1), \ + parameters.data, \ coil_sensitivities, \ E, \ dEdT2, \ diff --git a/src/parameters/tissue.cpp b/src/parameters/tissue.cpp index 250f8ec..6f1ce36 100644 --- a/src/parameters/tissue.cpp +++ b/src/parameters/tissue.cpp @@ -6,6 +6,7 @@ namespace compas { TissueParameters make_tissue_parameters( const CompasContext& ctx, int num_voxels, + int voxels_per_chunk, host_view T1, host_view T2, host_view B1, @@ -15,70 +16,73 @@ TissueParameters make_tissue_parameters( host_view x, host_view y, host_view z) { - auto stride = round_up_to_multiple_of(num_voxels, 32); - auto params = kmm::Array {{TissueParameterField::NUM_FIELDS, stride}}; + using namespace kmm::placeholders; + auto params = kmm::Array {{TissueParameterField::NUM_FIELDS, num_voxels}}; bool has_z = !z.is_empty(); bool has_b0 = !B0.is_empty(); bool has_b1 = !B1.is_empty(); - ctx.submit_device( + ctx.parallel_device( num_voxels, - [&](kmm::DeviceContext& device, kmm::NDRange, gpu_view_mut params) { - device.fill(params, 0.0F); + voxels_per_chunk, + [&](kmm::DeviceContext& device, kmm::NDRange range, gpu_subview_mut params) { + KMM_ASSERT(params.is_contiguous()); + device.fill(params.data(), params.size(), 0.0F); + auto offset = range.begin(); + auto length = range.size(); device.copy( - T1.data(), - params.drop_axis<0>(TissueParameterField::T1).data(), - num_voxels); + T1.data_at(offset), + params.data_at(TissueParameterField::T1, offset), + length); device.copy( - T2.data(), - params.drop_axis<0>(TissueParameterField::T2).data(), - num_voxels); + T2.data_at(offset), + params.data_at(TissueParameterField::T2, offset), + length); device.copy( - rho_x.data(), - params.drop_axis<0>(TissueParameterField::RHO_X).data(), - num_voxels); + rho_x.data_at(offset), + params.data_at(TissueParameterField::RHO_X, offset), + length); device.copy( - rho_y.data(), - params.drop_axis<0>(TissueParameterField::RHO_Y).data(), - num_voxels); + rho_y.data_at(offset), + params.data_at(TissueParameterField::RHO_Y, offset), + length); - device.copy(x.data(), params.drop_axis<0>(TissueParameterField::X).data(), num_voxels); - device.copy(y.data(), params.drop_axis<0>(TissueParameterField::Y).data(), num_voxels); + device.copy(x.data_at(offset), params.data_at(TissueParameterField::X, offset), length); + device.copy(y.data_at(offset), params.data_at(TissueParameterField::Y, offset), length); if (has_z) { device.copy( - z.data(), - params.drop_axis<0>(TissueParameterField::Z).data(), - num_voxels); + z.data_at(offset), + params.data_at(TissueParameterField::Z, offset), + length); } if (has_b0) { device.copy( - B0.data(), - params.drop_axis<0>(TissueParameterField::B0).data(), - num_voxels); + B0.data_at(offset), + params.data_at(TissueParameterField::B0, offset), + length); } if (has_b1) { device.copy( - B1.data(), - params.drop_axis<0>(TissueParameterField::B1).data(), - num_voxels); + B1.data_at(offset), + params.data_at(TissueParameterField::B1, offset), + length); } else { // The default value for B1 is 1 - device.fill(params.drop_axis<0>(TissueParameterField::B1), 1.0f); + device.fill(params.data_at(TissueParameterField::B1, offset), length, 1.0F); } }, - write(params)); + write(params, access(_, _x))); params.synchronize(); return { params, - num_voxels, has_z, has_b0, has_b1, diff --git a/src/simulate/derivative.cu b/src/simulate/derivative.cu index 64dd232..7415b3c 100644 --- a/src/simulate/derivative.cu +++ b/src/simulate/derivative.cu @@ -12,6 +12,7 @@ template void simulate_magnetization_derivative_impl( const kmm::DeviceContext& context, kmm::NDRange, + int nvoxels, int field, gpu_view_mut new_parameters, gpu_view_mut delta_echos, @@ -19,7 +20,6 @@ void simulate_magnetization_derivative_impl( TissueParametersView tissue, SequenceView sequence, float delta) { - auto nvoxels = tissue.nvoxels; auto nreadouts = kmm::checked_cast(sequence.RF_train.size()); COMPAS_ASSERT(echos.size(0) == nreadouts); @@ -35,13 +35,8 @@ void simulate_magnetization_derivative_impl( field, delta); - auto new_tissue = TissueParametersView { - .parameters = new_parameters, - .nvoxels = nvoxels, - .has_z = tissue.has_z, - .has_b0 = tissue.has_b0, - .has_b1 = tissue.has_b1, - }; + auto new_tissue = TissueParametersView {tissue}; + new_tissue.parameters = new_parameters; simulate_magnetization_kernel(context, kmm::NDRange {}, delta_echos, new_tissue, sequence); @@ -68,12 +63,13 @@ Array simulate_magnetization_derivative( COMPAS_ASSERT(echos.size(0) == nreadouts); COMPAS_ASSERT(echos.size(1) == nvoxels); - auto new_parameters = Array {parameters.parameters.sizes()}; + auto new_parameters = Array {parameters.data.sizes()}; auto delta_echos = Array {echos.sizes()}; context.submit_device( {nreadouts, nvoxels}, simulate_magnetization_derivative_impl, + nvoxels, field, write(new_parameters), write(delta_echos), @@ -98,12 +94,13 @@ Array simulate_magnetization_derivative( COMPAS_ASSERT(echos.size(0) == nreadouts); COMPAS_ASSERT(echos.size(1) == nvoxels); - auto new_parameters = Array {parameters.parameters.sizes()}; + auto new_parameters = Array {parameters.data.sizes()}; auto delta_echos = Array {echos.sizes()}; context.submit_device( {nreadouts, nvoxels}, simulate_magnetization_derivative_impl, + nvoxels, field, write(new_parameters), write(delta_echos), diff --git a/src/simulate/derivative_kernels.cuh b/src/simulate/derivative_kernels.cuh index 1d41744..2347a37 100644 --- a/src/simulate/derivative_kernels.cuh +++ b/src/simulate/derivative_kernels.cuh @@ -5,8 +5,8 @@ namespace compas { namespace kernels { __global__ void add_difference_to_parameters( int nvoxels, - gpu_view_mut new_parameters, - gpu_view old_parameters, + gpu_subview_mut new_parameters, + gpu_subview old_parameters, int target_field, float delta) { auto v = index_t(blockIdx.x * blockDim.x + threadIdx.x); @@ -23,8 +23,8 @@ __global__ void add_difference_to_parameters( __global__ void calculate_finite_difference( int nreadouts, int nvoxels, - gpu_view_mut delta_echos, - gpu_view echos, + gpu_subview_mut delta_echos, + gpu_subview echos, float inv_delta) { auto v = index_t(blockIdx.x * blockDim.x + threadIdx.x); auto r = index_t(blockIdx.y * blockDim.y + threadIdx.y); diff --git a/src/simulate/fisp.cu b/src/simulate/fisp.cu index bdeaf68..28c70e4 100644 --- a/src/simulate/fisp.cu +++ b/src/simulate/fisp.cu @@ -7,27 +7,29 @@ namespace compas { template void simulate_fisp_sequence_for_size( const kmm::DeviceContext& context, - gpu_view_mut echos, + kmm::Range<1, index_t> range, + gpu_subview_mut echos, TissueParametersView parameters, FISPSequenceView sequence) { COMPAS_ASSERT(sequence.max_state <= max_N); COMPAS_ASSERT(is_power_of_two(warp_size) && warp_size <= 32); - int nvoxels = parameters.nvoxels; int nreadouts = int(sequence.RF_train.size()); COMPAS_ASSERT(echos.size(0) == nreadouts); - COMPAS_ASSERT(echos.size(1) == nvoxels); + COMPAS_ASSERT(echos.begin(1) == range.begin()); + COMPAS_ASSERT(echos.end(1) == range.end()); - context.fill(echos, cfloat(0)); + context.fill(echos.data(), echos.size(), cfloat(0)); for (index_t i = 0; i < sequence.sliceprofiles.size(0); i++) { dim3 block_size = 256; - dim3 grid_size = div_ceil(uint(nvoxels * warp_size), block_size.x); + dim3 grid_size = div_ceil(uint(range.size() * warp_size), block_size.x); kernels::simulate_fisp<<>>( + range, echos, - sequence.sliceprofiles.drop_axis<0>(i), + sequence.sliceprofiles.drop_axis(i), parameters, sequence); } @@ -35,24 +37,27 @@ void simulate_fisp_sequence_for_size( void simulate_magnetization_kernel( const kmm::DeviceContext& context, - kmm::NDRange, - gpu_view_mut echos, - TissueParametersView parameters, + kmm::NDRange nd_range, + gpu_subview_mut echos, + gpu_subview data, FISPSequenceView sequence) { + auto parameters = TissueParametersView {data}; + auto range = kmm::Range<1, index_t> {index_t(nd_range.begin()), index_t(nd_range.size())}; + if (sequence.max_state <= 4) { - simulate_fisp_sequence_for_size<4, 2>(context, echos, parameters, sequence); + simulate_fisp_sequence_for_size<4, 2>(context, range, echos, parameters, sequence); } else if (sequence.max_state <= 8) { - simulate_fisp_sequence_for_size<8, 4>(context, echos, parameters, sequence); + simulate_fisp_sequence_for_size<8, 4>(context, range, echos, parameters, sequence); } else if (sequence.max_state <= 16) { - simulate_fisp_sequence_for_size<16, 8>(context, echos, parameters, sequence); + simulate_fisp_sequence_for_size<16, 8>(context, range, echos, parameters, sequence); } else if (sequence.max_state <= 32) { - simulate_fisp_sequence_for_size<32, 16>(context, echos, parameters, sequence); + simulate_fisp_sequence_for_size<32, 16>(context, range, echos, parameters, sequence); } else if (sequence.max_state <= 64) { - simulate_fisp_sequence_for_size<64, 32>(context, echos, parameters, sequence); + simulate_fisp_sequence_for_size<64, 32>(context, range, echos, parameters, sequence); } else if (sequence.max_state <= 96) { - simulate_fisp_sequence_for_size<96, 32>(context, echos, parameters, sequence); + simulate_fisp_sequence_for_size<96, 32>(context, range, echos, parameters, sequence); } else if (sequence.max_state <= 128) { - simulate_fisp_sequence_for_size<128, 32>(context, echos, parameters, sequence); + simulate_fisp_sequence_for_size<128, 32>(context, range, echos, parameters, sequence); } else { COMPAS_PANIC("max_state cannot exceed 128"); } @@ -62,18 +67,26 @@ Array simulate_magnetization( const CompasContext& context, TissueParameters parameters, FISPSequence sequence) { + using namespace kmm::placeholders; int nreadouts = int(sequence.RF_train.size()); int nvoxels = parameters.nvoxels; + int chunk_size = parameters.chunk_size; auto echos = Array {{nreadouts, nvoxels}}; void (*fun)( const kmm::DeviceContext&, kmm::NDRange, - gpu_view_mut, - TissueParametersView, + gpu_subview_mut, + gpu_subview, FISPSequenceView) = simulate_magnetization_kernel; - context.submit_device(nvoxels, fun, write(echos), parameters, sequence); + context.parallel_device( + nvoxels, + chunk_size, + fun, + write(echos, access(_, _x)), + read(parameters.data, access(_, _x)), + sequence); return echos; } diff --git a/src/simulate/fisp_kernels.cuh b/src/simulate/fisp_kernels.cuh index ad51b82..f81c31b 100644 --- a/src/simulate/fisp_kernels.cuh +++ b/src/simulate/fisp_kernels.cuh @@ -10,7 +10,7 @@ template COMPAS_DEVICE void simulate_fisp_for_voxel( const FISPSequenceView& sequence, gpu_view slice_profile, - gpu_strided_view_mut echos, + gpu_strided_subview_mut echos, TissueVoxel p) { auto off_resonance_rotation = [](float delta_t, float B0 = 0.0f) -> cfloat { if (B0 == 0.0f) { @@ -69,14 +69,14 @@ COMPAS_DEVICE void simulate_fisp_for_voxel( template __global__ void simulate_fisp( - gpu_view_mut echos, + kmm::Range<1, index_t> range, + gpu_subview_mut echos, gpu_view slice_profile, TissueParametersView parameters, FISPSequenceView sequence) { - index_t voxel = index_t(blockDim.x * blockIdx.x + threadIdx.x) / warp_size; - index_t nvoxels = parameters.nvoxels; + index_t voxel = index_t(blockDim.x * blockIdx.x + threadIdx.x) / warp_size + range.begin(); - if (voxel >= nvoxels) { + if (voxel >= range.end()) { return; } diff --git a/src/simulate/pssfp.cu b/src/simulate/pssfp.cu index 4463d16..20b4c5f 100644 --- a/src/simulate/pssfp.cu +++ b/src/simulate/pssfp.cu @@ -7,12 +7,12 @@ namespace compas { template int simulate_pssfp_sequence_batch( const kmm::DeviceContext& context, + int nvoxels, int iz, - const gpu_view_mut& echos, + const gpu_subview_mut& echos, const TissueParametersView& parameters, const pSSFPSequenceView& sequence) { int nreadouts = sequence.nTR; - int nvoxels = parameters.nvoxels; int nz = kmm::checked_cast(sequence.z.size()); COMPAS_ASSERT(echos.size(0) == nreadouts); @@ -25,6 +25,7 @@ int simulate_pssfp_sequence_batch( auto z_subslices = gpu_view {sequence.z.data() + iz, {{batch_size}}}; kernels::simulate_pssfp<<>>( // + nvoxels, echos, z_subslices, parameters, @@ -39,23 +40,30 @@ int simulate_pssfp_sequence_batch( void simulate_magnetization_kernel( const kmm::DeviceContext& context, kmm::NDRange, - gpu_view_mut echos, + int nvoxels, + gpu_subview_mut echos, TissueParametersView parameters, pSSFPSequenceView sequence) { // Initialize echos to zero - context.fill(echos, cfloat(0)); + context.fill(echos.data(), echos.size(), cfloat(0)); // We process the z-slices in batches. The first batches process 32 slices at once, the next batches process 16 // slices at once, the next 8 slices, etc. Since the reduction is performed using warp-shuffles, these batch sizes // must powers of two and cannot exceed 32. The offset keeps track of how many slices have already been processed // and is incremented by each call to `simulate_pssfp_sequence_batch`. int offset = 0; - offset = simulate_pssfp_sequence_batch<32>(context, offset, echos, parameters, sequence); - offset = simulate_pssfp_sequence_batch<16>(context, offset, echos, parameters, sequence); - offset = simulate_pssfp_sequence_batch<8>(context, offset, echos, parameters, sequence); - offset = simulate_pssfp_sequence_batch<4>(context, offset, echos, parameters, sequence); - offset = simulate_pssfp_sequence_batch<2>(context, offset, echos, parameters, sequence); - offset = simulate_pssfp_sequence_batch<1>(context, offset, echos, parameters, sequence); + offset = + simulate_pssfp_sequence_batch<32>(context, nvoxels, offset, echos, parameters, sequence); + offset = + simulate_pssfp_sequence_batch<16>(context, nvoxels, offset, echos, parameters, sequence); + offset = + simulate_pssfp_sequence_batch<8>(context, nvoxels, offset, echos, parameters, sequence); + offset = + simulate_pssfp_sequence_batch<4>(context, nvoxels, offset, echos, parameters, sequence); + offset = + simulate_pssfp_sequence_batch<2>(context, nvoxels, offset, echos, parameters, sequence); + offset = + simulate_pssfp_sequence_batch<1>(context, nvoxels, offset, echos, parameters, sequence); COMPAS_ASSERT(offset == sequence.z.size()); } @@ -71,11 +79,12 @@ Array simulate_magnetization( void (*fun)( const kmm::DeviceContext&, kmm::NDRange, - gpu_view_mut, + int, + gpu_subview_mut, TissueParametersView, pSSFPSequenceView) = simulate_magnetization_kernel; - context.submit_device(nvoxels, fun, write(echos), parameters, sequence); + context.submit_device(nvoxels, fun, nvoxels, write(echos), parameters, sequence); return echos; } } // namespace compas diff --git a/src/simulate/pssfp_kernels.cuh b/src/simulate/pssfp_kernels.cuh index 5eb24fe..7822c5b 100644 --- a/src/simulate/pssfp_kernels.cuh +++ b/src/simulate/pssfp_kernels.cuh @@ -14,7 +14,7 @@ template COMPAS_DEVICE void simulate_pssfp_for_voxel( const pSSFPSequenceView& sequence, float z, - gpu_strided_view_mut echos, + gpu_strided_subview_mut echos, TissueVoxel p) { auto group = cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); @@ -108,13 +108,13 @@ COMPAS_DEVICE void simulate_pssfp_for_voxel( template __global__ void simulate_pssfp( - gpu_view_mut echos, + index_t nvoxels, + gpu_subview_mut echos, gpu_view z, TissueParametersView parameters, pSSFPSequenceView sequence) { index_t lane = threadIdx.x % warp_size; index_t voxel = index_t(blockDim.x * blockIdx.x + threadIdx.x) / warp_size; - index_t nvoxels = parameters.nvoxels; if (voxel >= nvoxels) { return; diff --git a/src/trajectories/phase_encoding.cu b/src/trajectories/phase_encoding.cu index 61428b5..ecea411 100644 --- a/src/trajectories/phase_encoding.cu +++ b/src/trajectories/phase_encoding.cu @@ -9,20 +9,23 @@ Array phase_encoding( const Array& echos, const TissueParameters& parameters, const CartesianTrajectory& trajectory) { + using namespace kmm::placeholders; + int nreadouts = trajectory.nreadouts; int nvoxels = parameters.nvoxels; + int chunk_size = parameters.chunk_size; auto ph_en_echos = Array {{nreadouts, nvoxels}}; - dim3 block_dim = {32, 4}; - dim3 grid_dim = {div_ceil(uint(nvoxels), block_dim.x), div_ceil(uint(nreadouts), block_dim.y)}; - ctx.submit_kernel( - grid_dim, + + ctx.parallel_kernel( + {nvoxels, nreadouts}, + {chunk_size, nreadouts}, block_dim, kernels::phase_encoding, - write(ph_en_echos), - echos, - parameters, + write(ph_en_echos, access(_y, _x)), + read(echos, access(_y, _x)), + read(parameters.data, access(_y, _x)), trajectory); return ph_en_echos; diff --git a/src/trajectories/phase_encoding_kernels.cuh b/src/trajectories/phase_encoding_kernels.cuh index a68007d..730d0f3 100644 --- a/src/trajectories/phase_encoding_kernels.cuh +++ b/src/trajectories/phase_encoding_kernels.cuh @@ -8,19 +8,34 @@ namespace compas { namespace kernels { __global__ void phase_encoding( - kmm::NDRange, - gpu_view_mut ph_en_echos, - gpu_view echos, + kmm::NDRange range, + gpu_subview_mut ph_en_echos, + gpu_subview echos, TissueParametersView parameters, CartesianTrajectoryView trajectory) { - auto readout = index_t(blockIdx.y * blockDim.y + threadIdx.y); - auto voxel = index_t(blockIdx.x * blockDim.x + threadIdx.x); + auto voxel = index_t(blockIdx.x * blockDim.x + threadIdx.x + range.begin(0)); + auto readout = index_t(blockIdx.y * blockDim.y + threadIdx.y + range.begin(1)); - if (readout < echos.size(0) && voxel < echos.size(1)) { + if (voxel < range.end(0) && readout < range.end(1)) { auto k_y = trajectory.k_start[readout].imag(); auto y = parameters.get(voxel).y; - ph_en_echos[readout][voxel] = echos[readout][voxel] * exp(cfloat(0.0f, y * k_y)); + ph_en_echos[readout][voxel] = echos[readout][voxel] * exp(cfloat(0.0F, y * k_y)); + + if (voxel+1 == 25414 && readout+1 == 782) { + printf("%d %d] %p %p | %f %f -> %f %f | %f %f\n", + voxel, + readout, + echos.data(), + ph_en_echos.data(), + echos[readout][voxel].real(), + echos[readout][voxel].imag(), + ph_en_echos[readout][voxel].real(), + ph_en_echos[readout][voxel].imag(), + k_y, + y + ); + } } } diff --git a/src/trajectories/signal.cu b/src/trajectories/signal.cu index 28c55e0..7df11ab 100644 --- a/src/trajectories/signal.cu +++ b/src/trajectories/signal.cu @@ -12,6 +12,7 @@ namespace compas { void magnetization_to_signal_cartesian_direct( const kmm::DeviceContext& context, kmm::NDRange subrange, + int nvoxels, gpu_subview_mut signal, gpu_view echos, TissueParametersView parameters, @@ -20,7 +21,6 @@ void magnetization_to_signal_cartesian_direct( gpu_view_mut exponents, gpu_view_mut factors) { int ncoils = kmm::checked_cast(coil_sensitivities.size(0)); - int nvoxels = parameters.nvoxels; int nreadouts = trajectory.nreadouts; int samples_per_readout = trajectory.samples_per_readout; @@ -85,6 +85,7 @@ void magnetization_to_signal_cartesian_direct( void magnetization_to_signal_cartesian_gemm( const kmm::DeviceContext& context, kmm::NDRange subrange, + int nvoxels, gpu_view_mut signal, gpu_view echos, TissueParametersView parameters, @@ -94,7 +95,6 @@ void magnetization_to_signal_cartesian_gemm( gpu_view_mut factors, cublasComputeType_t compute_type) { int ncoils = kmm::checked_cast(coil_sensitivities.size(0)); - int nvoxels = parameters.nvoxels; int nreadouts = trajectory.nreadouts; int samples_per_readout = trajectory.samples_per_readout; @@ -164,6 +164,7 @@ void magnetization_to_signal_cartesian_gemm( void magnetization_to_signal_spiral( const kmm::DeviceContext& context, kmm::NDRange subrange, + int nvoxels, gpu_view_mut signal, gpu_view echos, TissueParametersView parameters, @@ -172,7 +173,6 @@ void magnetization_to_signal_spiral( gpu_view_mut exponents, gpu_view_mut factors) { int ncoils = kmm::checked_cast(coil_sensitivities.size(0)); - int nvoxels = parameters.nvoxels; int nreadouts = trajectory.nreadouts; int samples_per_readout = trajectory.samples_per_readout; @@ -270,6 +270,7 @@ Array magnetization_to_signal( context.submit_device( {nvoxels, nreadouts}, magnetization_to_signal_cartesian_direct, + nvoxels, write(signal), echos, parameters, @@ -281,6 +282,7 @@ Array magnetization_to_signal( context.submit_device( {nvoxels, nreadouts}, magnetization_to_signal_cartesian_gemm, + nvoxels, write(signal), echos, parameters, @@ -297,6 +299,7 @@ Array magnetization_to_signal( context.submit_device( {nvoxels, nreadouts}, magnetization_to_signal_spiral, + nvoxels, write(signal), echos, parameters,