#ifndef _GPLEX_H_
#define _GPLEX_H_
#include <cuda_runtime.h>
#include <stdio.h>
#include "gpu_utils.h"
namespace GPlexBase
{
  typedef int idx_t;

  //------------------------------------------------------------------------------

  template<typename T, idx_t D1, idx_t D2, idx_t N>
  class GPlexBase
  {
  public:
    typedef T value_type;

    enum
    {
      /// return no. of matrix rows
      kRows = D1,
      /// return no. of matrix columns
      kCols = D2,
      /// return no. of elements: rows*columns
      kSize = D1 * D2,
      /// size of the whole GPlexBase
      kTotSize = N * kSize
    };

    T fArray[kTotSize] __attribute__((aligned(64)));

    const T& ConstAt(idx_t n, idx_t i, idx_t j) const { return fArray[(i * D2 + j) * N + n]; }
    T&            At(idx_t n, idx_t i, idx_t j)       { return fArray[(i * D2 + j) * N + n]; }

    T&       operator()(idx_t n, idx_t i, idx_t j)       { return fArray[(i * D2 + j) * N + n]; }
    const T& operator()(idx_t n, idx_t i, idx_t j) const { return fArray[(i * D2 + j) * N + n]; }
  };

  template<typename T, idx_t D1, idx_t D2, idx_t N> using GPlexB = GPlexBase<T, D1, D2, N>;
}
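// Illustrative sketch (not part of the interface): with D1 = D2 = 2 and
// N = 4, element (i, j) of matrix n lives at fArray[(i*2 + j)*4 + n], so the
// same (i, j) entry of the four matrices is contiguous in memory:
//
//   GPlexBase::GPlexBase<float, 2, 2, 4> p;
//   p.At(/*n=*/1, /*i=*/0, /*j=*/1) = 3.f;  // writes fArray[(0*2 + 1)*4 + 1] = fArray[5]
//   float x = p.ConstAt(1, 0, 1);           // reads the same slot: x == 3.f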
//#include "gpu_constants.h"
// Maps the full (row-major) index i*D + j of a DxD symmetric matrix to its
// offset in packed symmetric storage; rows are indexed by the dimension D.
__device__ __constant__ static int gplexSymOffsets[7][36] =
{
  {},
  {},
  { 0, 1,
    1, 2 },                  // D = 2
  { 0, 1, 3,
    1, 2, 4,
    3, 4, 5 },               // D = 3
  {},
  {},
  { 0,  1,  3,  6, 10, 15,
    1,  2,  4,  7, 11, 16,
    3,  4,  5,  8, 12, 17,
    6,  7,  8,  9, 13, 18,
   10, 11, 12, 13, 14, 19,
   15, 16, 17, 18, 19, 20 }  // D = 6
};
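// Worked example: for D = 3 only the 6 unique elements of a symmetric matrix
// are stored. The D = 3 row above maps the full row-major index i*3 + j to
// the packed offset, so (i, j) = (2, 1) -> full index 7 -> packed offset 4,
// the same slot as (i, j) = (1, 2) (full index 5).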
constexpr GPlexBase::idx_t NN = 8; // "Length" of GPlexB.
constexpr GPlexBase::idx_t LL = 6; // Dimension of large/long GPlexB entities
constexpr GPlexBase::idx_t HH = 3; // Dimension of small/short GPlexB entities
typedef GPlexBase::GPlexBase<float, LL, LL, NN> GPlexBLL;
// The number of tracks is the fast dimension and is padded in order to have
// consecutive and aligned memory accesses. For cached reads, this results in a
// single memory transaction for the 32 threads of a warp to access 32 floats.
// See:
// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-3-0
// In practice, the number of tracks (ntracks) is set to be MPT_SIZE.
template <typename M>
struct GPlex {
  using T = typename M::value_type;
  using value_type = T;

  static const size_t kRows = M::kRows;
  static const size_t kCols = M::kCols;
  static const size_t kSize = M::kSize;

  T* ptr;
  size_t pitch, stride, N;

  __device__ T  operator[](int xx) const { return ptr[xx]; }
  __device__ T& operator[](int xx)       { return ptr[xx]; }

  // Element (n, i, j): the track index n is the fast dimension.
  __device__ T& operator()(int n, int i, int j)       { return ptr[n + (i*kCols + j)*stride]; }
  __device__ T  operator()(int n, int i, int j) const { return ptr[n + (i*kCols + j)*stride]; }

  void allocate(size_t ntracks) {
    N = ntracks;
    cudaMallocPitch((void**)&ptr, &pitch, N*sizeof(T), kSize);
    cudaCheckError();
    stride = pitch/sizeof(T);  // Stride counted in elements, not bytes.
  }

  void free() {
    cudaFree(ptr);
    N = 0; pitch = 0; stride = 0;
  }

  // e.g. cudaMemcpy2D(d_msErr.ptr, d_msErr.pitch, msErr.fArray, N*sizeof(T),
  //                   N*sizeof(T), HS, cudaMemcpyHostToDevice);
  void copyFromHost(const M& gplex) {
    cudaMemcpy2D(ptr, pitch, gplex.fArray, N*sizeof(T),
                 N*sizeof(T), kSize, cudaMemcpyHostToDevice);
    cudaCheckError();
  }

  void copyToHost(M& gplex) {
    cudaMemcpy2D(gplex.fArray, N*sizeof(T), ptr, pitch,
                 N*sizeof(T), kSize, cudaMemcpyDeviceToHost);
    cudaCheckError();
  }
};
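// Usage sketch (host side; names are illustrative, not part of this header):
//
//   GPlexBLL err;               // dense host plex, fast dimension NN
//   GPlex<GPlexBLL> d_err;
//   d_err.allocate(NN);         // pitched device buffer, stride >= NN elements
//   d_err.copyFromHost(err);
//   // ... launch kernels that read/write d_err(n, i, j); consecutive n in a
//   // warp touch consecutive addresses, so the accesses coalesce ...
//   d_err.copyToHost(err);
//   d_err.free();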
template <typename M>
struct GPlexSym : GPlex<M> {
  using T = typename GPlex<M>::T;
  using GPlex<M>::kRows;
  using GPlex<M>::kCols;
  using GPlex<M>::stride;
  using GPlex<M>::ptr;

  __device__ size_t Off(size_t i) const { return gplexSymOffsets[kRows][i]; }

  // Note: convenient, but noticeably slower due to the indirection.
  __device__ T& operator()(int n, int i, int j)       { return ptr[n + Off(i*kCols + j)*stride]; }
  __device__ T  operator()(int n, int i, int j) const { return ptr[n + Off(i*kCols + j)*stride]; }
  //__device__ T& operator()(int n, int i, int j)       { return ptr[n + i*stride]; }
  //__device__ T  operator()(int n, int i, int j) const { return ptr[n + i*stride]; }
};
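// Sketch of the symmetric indexing (assuming d_sym is a GPlexSym set up over
// packed symmetric storage elsewhere): symmetric pairs resolve to the same
// packed element; for a 6x6 plex, d_sym(n, 1, 2) and d_sym(n, 2, 1) both
// index ptr[n + 4*stride], since gplexSymOffsets[6][8] == gplexSymOffsets[6][13] == 4.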
using GPlexLL = GPlex<GPlexBLL>;
template <typename M>
struct GPlexReg {
  using T = typename M::value_type;
  using value_type = T;

  static constexpr size_t kRows = M::kRows;
  static constexpr size_t kCols = M::kCols;
  static constexpr size_t kSize = M::kSize;

  __device__ T  operator[](int xx) const { return arr[xx]; }
  __device__ T& operator[](int xx)       { return arr[xx]; }

  // A GPlexReg holds a single matrix in thread-local (register) storage, so
  // the track index n is ignored; it is kept for interface symmetry with
  // GPlex::operator().
  __device__ T& operator()(int n, int i, int j)       { return arr[i*kCols + j]; }
  __device__ T  operator()(int n, int i, int j) const { return arr[i*kCols + j]; }

  __device__ void SetVal(T v)
  {
    for (size_t i = 0; i < kSize; ++i)
    {
      arr[i] = v;
    }
  }

  T arr[M::kSize];
};
using GPlexRegLL = GPlexReg<GPlexBLL>;
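// Per-thread usage sketch (hypothetical kernel): stage one track's matrix
// from the strided global plex into registers, operate on it, write it back.
//
//   __global__ void scale_plex(GPlexLL d, float s)
//   {
//     int n = threadIdx.x + blockIdx.x * blockDim.x;
//     if (n >= (int)d.N) return;
//     GPlexRegLL r;
//     for (int i = 0; i < LL; ++i)
//       for (int j = 0; j < LL; ++j)
//         r(n, i, j) = d(n, i, j);   // n is ignored on the register side
//     for (size_t k = 0; k < GPlexRegLL::kSize; ++k)
//       r[k] *= s;
//     for (int i = 0; i < LL; ++i)
//       for (int j = 0; j < LL; ++j)
//         d(n, i, j) = r(n, i, j);
//   }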
#endif // _GPLEX_H_