Basic copy
4 threads copy 4 elements from global memory to shared memory.
#include <iostream>
#include <vector>
#include "cute/tensor.hpp"

__global__ void copy(int* data) {
    int idx = threadIdx.x;
    __shared__ int temp[4];
    //printf("data[%d]=%d\n", idx, data[idx]);

    // Global- and shared-memory views of the 4 elements
    cute::Tensor g_input = cute::make_tensor(
        cute::make_gmem_ptr(data),
        cute::Shape<cute::_4>{}
    );
    cute::Tensor s_input = cute::make_tensor(
        cute::make_smem_ptr(temp),
        cute::Shape<cute::_4>{}
    );

    // Tell each thread which part of the data it is responsible for
    cute::Layout t_layout = cute::make_layout(cute::Shape<cute::_4>{});
    cute::Tensor tg_frag = cute::local_partition(g_input, t_layout, idx);
    cute::Tensor ts_frag = cute::local_partition(s_input, t_layout, idx);

    cute::copy(tg_frag, ts_frag);
    __syncthreads();

    if (idx == 0) {
        for (int i = 0; i < 4; i++) {
            printf("temp[%d]=%d\n", i, temp[i]);
        }
    }
}

int main() {
    std::vector<int> h_data(4);
    for (int i = 0; i < 4; i++) {
        h_data[i] = i;
    }
    int *d_data;
    cudaMalloc(&d_data, 4 * sizeof(int));
    cudaMemcpy(d_data, h_data.data(), 4 * sizeof(int), cudaMemcpyHostToDevice);

    copy<<<1, 4>>>(d_data);
    cudaDeviceSynchronize();

    cudaFree(d_data);
    return 0;
}
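Assuming CUTLASS (which ships CuTe as header-only code) is checked out next to the source, something along the lines of nvcc -std=c++17 -I cutlass/include copy.cu -o copy should build it. With 4 threads and 4 elements, local_partition hands each thread exactly one element, so thread idx copies data[idx] into temp[idx] and thread 0 prints temp[0]=0 through temp[3]=3.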
Judging from this simplest implementation of copy_if, src and dst are really just the starting addresses of the region each thread is responsible for, which is exactly why every thread gets its own tensor view.
template <class PrdTensor,
          class SrcEngine, class SrcLayout,
          class DstEngine, class DstLayout>
CUTE_HOST_DEVICE
void
copy_if(PrdTensor const& pred,
        Tensor<SrcEngine, SrcLayout> const& src,
        Tensor<DstEngine, DstLayout>      & dst)
{
  using SrcType = typename SrcEngine::value_type;
  using DstType = typename DstEngine::value_type;

  CUTE_UNROLL
  for (int i = 0; i < size(dst); ++i) {
    if (pred(i)) {
      dst(i) = static_cast<DstType>(static_cast<SrcType>(src(i)));
    }
  }
}
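Conceptually, an unpredicated copy is this same loop with a predicate that always returns true. Below is a minimal sketch of that relationship, using made-up names (AlwaysTrue, copy_sketch); the real cute::copy additionally dispatches through copy atoms and vectorization, so this is only the conceptual fallback path:

// Illustrative only: an always-true predicate turns copy_if into a plain element-wise copy.
struct AlwaysTrue {
  CUTE_HOST_DEVICE constexpr bool operator()(int) const { return true; }
};

template <class SrcEngine, class SrcLayout,
          class DstEngine, class DstLayout>
CUTE_HOST_DEVICE
void
copy_sketch(cute::Tensor<SrcEngine, SrcLayout> const& src,
            cute::Tensor<DstEngine, DstLayout>      & dst)
{
  cute::copy_if(AlwaysTrue{}, src, dst);  // every element passes the predicate
}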
A slightly more complex copy
Still 4 threads, but now 8 elements need to be copied.
Since each thread is no longer responsible for just one element, we try changing the per-thread layout to (4, 2): selecting along mode 0, each thread should pick up 2 elements. That is only a naive reading, though; trying it as in the code below results in a compile error.
#include <iostream>
#include <vector>
#include "cute/tensor.hpp"

__global__ void copy(int* data) {
    int idx = threadIdx.x;
    __shared__ int temp[8];
    //printf("data[%d]=%d\n", idx, data[idx]);

    // 1-D views over the 8 elements
    cute::Tensor g_input = cute::make_tensor(
        cute::make_gmem_ptr(data),
        cute::Shape<cute::_8>{}
    );
    cute::Tensor s_input = cute::make_tensor(
        cute::make_smem_ptr(temp),
        cute::Shape<cute::_8>{}
    );

    // Tell each thread which part of the data it is responsible for:
    // a (4, 2) thread layout, hoping each thread picks up 2 elements
    cute::Layout t_layout = cute::make_layout(
        cute::make_shape(cute::_4{}, cute::_2{})
    );
    // Compilation fails here: the rank-2 thread layout cannot tile the rank-1 tensor
    cute::Tensor tg_frag = cute::local_partition(g_input, t_layout, idx);
    cute::Tensor ts_frag = cute::local_partition(s_input, t_layout, idx);

    cute::copy(tg_frag, ts_frag);
    __syncthreads();

    if (idx == 0) {
        for (int i = 0; i < 8; i++) {
            printf("temp[%d]=%d\n", i, temp[i]);
        }
    }
}

int main() {
    std::vector<int> h_data(8);
    for (int i = 0; i < 8; i++) {
        h_data[i] = i;
    }
    int *d_data;
    cudaMalloc(&d_data, 8 * sizeof(int));
    cudaMemcpy(d_data, h_data.data(), 8 * sizeof(int), cudaMemcpyHostToDevice);

    copy<<<1, 4>>>(d_data);
    cudaDeviceSynchronize();

    cudaFree(d_data);
    return 0;
}
The reason is that local_partition cannot directly apply the given 2-D layout to a 1-D global memory data space.
tiled_copy handles this internally, using raked_product to implicitly convert between the mismatched ranks. The data itself is a single buffer, but shape and stride let us view it in different ways.
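As a minimal host-side sketch of that "one buffer, many views" idea (only make_layout is used here; the shapes mirror the ones in this post):

#include <cstdio>
#include "cute/tensor.hpp"

int main() {
  // The same 8 contiguous elements, viewed two ways purely through shape and stride.
  // 1-D view: shape (8), stride (1) -> linear offsets 0..7
  auto flat  = cute::make_layout(cute::make_shape(cute::_8{}));
  // 2-D view: shape (4,2), default column-major stride (1,4)
  //           coordinate (i,j) -> offset i + 4*j, covering the same 8 offsets
  auto tiled = cute::make_layout(cute::make_shape(cute::_4{}, cute::_2{}));

  printf("flat(5)    = %d\n", int(flat(5)));     // 5
  printf("tiled(1,1) = %d\n", int(tiled(1,1)));  // 1 + 4*1 = 5: same element, no data moved
  return 0;
}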
No other API is needed: simply change the global memory view to 2-D while each thread still uses a 1-D layout to locate its part, and every thread ends up mapped to two elements. How the coordinate mapping behind this actually works is still not entirely clear, though (the host-side sketch after the listing below makes the resulting mapping visible).
#include <iostream>
#include <vector>
#include "cute/tensor.hpp"

__global__ void copy(int* data) {
    int idx = threadIdx.x;
    __shared__ int temp[8];
    //printf("data[%d]=%d\n", idx, data[idx]);

    // View the 8 elements as a (4, 2) column-major tensor instead of changing the thread layout
    cute::Layout t_layout = cute::make_layout(
        cute::make_shape(cute::_4{}, cute::_2{})
    );
    cute::Tensor g_input = cute::make_tensor(
        cute::make_gmem_ptr(data),
        t_layout
    );
    cute::Tensor s_input = cute::make_tensor(
        cute::make_smem_ptr(temp),
        t_layout
    );

    // Tell each thread which part of the data it is responsible for,
    // partitioning with a 1-D thread layout of size 4
    cute::Tensor tg_frag = cute::local_partition(
        g_input,
        cute::make_layout(cute::Shape<cute::_4>{}),
        idx
    );
    cute::Tensor ts_frag = cute::local_partition(
        s_input,
        cute::make_layout(cute::Shape<cute::_4>{}),
        idx
    );

    cute::copy(tg_frag, ts_frag);
    __syncthreads();

    if (idx == 0) {
        for (int i = 0; i < 8; i++) {
            printf("temp[%d]=%d\n", i, temp[i]);
        }
    }
}

int main() {
    std::vector<int> h_data(8);
    for (int i = 0; i < 8; i++) {
        h_data[i] = i;
    }
    int *d_data;
    cudaMalloc(&d_data, 8 * sizeof(int));
    cudaMemcpy(d_data, h_data.data(), 8 * sizeof(int), cudaMemcpyHostToDevice);

    copy<<<1, 4>>>(d_data);
    cudaDeviceSynchronize();

    cudaFree(d_data);
    return 0;
}
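To see what the coordinate mapping actually does, local_partition can be run on the host with the same layouts, printing which offsets each "thread" index receives. A small sketch of that (my own host-only test, not part of the kernels above), assuming the same (4, 2) column-major data view and size-4 thread layout:

#include <cstdio>
#include "cute/tensor.hpp"

int main() {
  int buf[8];
  for (int i = 0; i < 8; ++i) buf[i] = i;

  // Same (4, 2) data view and size-4 thread layout as in the kernel above.
  auto data = cute::make_tensor(buf, cute::make_shape(cute::_4{}, cute::_2{}));
  auto thr  = cute::make_layout(cute::Shape<cute::_4>{});

  for (int idx = 0; idx < 4; ++idx) {
    auto frag = cute::local_partition(data, thr, idx);
    // Expected: "thread" idx owns offsets idx and idx + 4.
    printf("thread %d -> %d %d\n", idx, frag(0), frag(1));
  }
  return 0;
}

The partition is therefore strided rather than contiguous: thread 0 copies data[0] and data[4], thread 1 copies data[1] and data[5], and so on, which is why the kernel above still prints temp[i]=i for all eight entries.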