CUTLASS

Copy

Basic copy

4 threads copy 4 elements from global memory to shared memory.

#include <iostream>
#include <vector>
#include "cute/tensor.hpp"

__global__ void copy(int* data) {
    int idx = threadIdx.x;
    __shared__ int temp[4];

    //printf("data[%d]=%d\n", idx, data[idx]);

    cute::Tensor g_input = cute::make_tensor(
        cute::make_gmem_ptr(data),
        cute::Shape<cute::_4>{}
    );
    cute::Tensor s_input = cute::make_tensor(
        cute::make_smem_ptr(temp),
        cute::Shape<cute::_4>{}
    );

    // Let each thread know which part of the data it is responsible for
    cute::Layout t_layout = cute::make_layout(cute::Shape<cute::_4>{});
    cute::Tensor tg_frag = cute::local_partition(
        g_input, 
        t_layout,
        idx
    );
    cute::Tensor ts_frag = cute::local_partition(
        s_input, 
        t_layout,
        idx
    );

    cute::copy(tg_frag, ts_frag);

    __syncthreads();

    if (idx == 0) {
        for (int i = 0; i < 4; i++) {
            printf("temp[%d]=%d\n", i, temp[i]);
        }
    }
}

int main() {
    std::vector<int> h_data(4);
    for (int i = 0; i < 4; i++) {
        h_data[i] = i;
    }

    int *d_data;
    cudaMalloc(&d_data, 4 * sizeof(int));    
    cudaMemcpy(d_data, h_data.data(), 4 * sizeof(int), cudaMemcpyHostToDevice);
    
    copy<<<1, 4>>>(d_data);

    cudaDeviceSynchronize();

    return 0;
}
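
To make explicit what the kernel above does, here is the same copy written as a minimal plain-CUDA sketch without CuTe (the kernel name copy_plain is just for illustration): each of the 4 threads moves the single element it owns.

// Illustrative plain-CUDA equivalent of the CuTe kernel above:
// thread idx copies exactly one element, which is what
// cute::copy(tg_frag, ts_frag) boils down to in this case.
__global__ void copy_plain(int* data) {
    int idx = threadIdx.x;
    __shared__ int temp[4];

    temp[idx] = data[idx];

    __syncthreads();

    if (idx == 0) {
        for (int i = 0; i < 4; i++) {
            printf("temp[%d]=%d\n", i, temp[i]);
        }
    }
}

Both versions should print temp[0]=0 through temp[3]=3.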

Looking at this simplest implementation of copy_if, src and dst are essentially views starting at each thread's own region of the data, which is exactly why every thread gets its own tensor view.

template <class PrdTensor,
          class SrcEngine, class SrcLayout,
          class DstEngine, class DstLayout>
CUTE_HOST_DEVICE
void
copy_if(PrdTensor                    const& pred,
        Tensor<SrcEngine, SrcLayout> const& src,
        Tensor<DstEngine, DstLayout>      & dst)
{
  using SrcType = typename SrcEngine::value_type;
  using DstType = typename DstEngine::value_type;

  CUTE_UNROLL
  for (int i = 0; i < size(dst); ++i) {
    if (pred(i)) {
      dst(i) = static_cast<DstType>(static_cast<SrcType>(src(i)));
    }
  }
}
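
As a sanity check of that reading, the plain cute::copy call in the kernel behaves like copy_if with a predicate that is always true. A minimal sketch, using a hand-written AlwaysTrue functor for illustration (CuTe dispatches to its own internal equivalent; the exact type it uses is not shown here):

// Hand-rolled always-true predicate, for illustration only.
struct AlwaysTrue {
    CUTE_HOST_DEVICE constexpr bool operator()(int) const { return true; }
};

// Inside the kernel, this is equivalent to cute::copy(tg_frag, ts_frag):
cute::copy_if(AlwaysTrue{}, tg_frag, ts_frag);

Every pred(i) evaluates to true, so the loop in copy_if simply copies all size(dst) elements of the per-thread fragments.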

A slightly more complex copy

Still 4 threads, but now 8 elements need to be copied.

Since each thread is no longer responsible for just one element, we try changing the layout used to partition the data among threads to (4, 2): selecting along mode 0, each thread would then pick up 2 elements. That is only the naive reading, though; trying the code below produces a compile error.

#include <iostream>
#include <vector>
#include "cute/tensor.hpp"

__global__ void copy(int* data) {
    int idx = threadIdx.x;
    __shared__ int temp[8];

    //printf("data[%d]=%d\n", idx, data[idx]);

    cute::Tensor g_input = cute::make_tensor(
        cute::make_gmem_ptr(data),
        cute::Shape<cute::_8>{}
    );
    cute::Tensor s_input = cute::make_tensor(
        cute::make_smem_ptr(temp),
        cute::Shape<cute::_8>{}
    );

    // Let each thread know which part of the data it is responsible for
    cute::Layout t_layout = cute::make_layout(
        cute::make_shape(cute::_4{}, cute::_2{})
    );
    cute::Tensor tg_frag = cute::local_partition(
        g_input, 
        t_layout,
        idx
    );
    cute::Tensor ts_frag = cute::local_partition(
        s_input, 
        t_layout,
        idx
    );

    cute::copy(tg_frag, ts_frag);

    __syncthreads();

    if (idx == 0) {
        for (int i = 0; i < 8; i++) {
            printf("temp[%d]=%d\n", i, temp[i]);
        }
    }
}

int main() {
    std::vector<int> h_data(8);
    for (int i = 0; i < 8; i++) {
        h_data[i] = i;
    }

    int *d_data;
    cudaMalloc(&d_data, 8 * sizeof(int));    
    cudaMemcpy(d_data, h_data.data(), 8 * sizeof(int), cudaMemcpyHostToDevice);
    
    copy<<<1, 4>>>(d_data);

    cudaDeviceSynchronize();

    return 0;
}

The reason is that local_partition cannot directly apply the given 2-D layout to the 1-D global memory data space:

  1. tiled_copy internally uses raked_product to implicitly handle the conversion between mismatched ranks
  2. The data itself is a single buffer, but through shape and stride it can be viewed in different ways (a small sketch follows this list)
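
A minimal host-side sketch of that second point, reinterpreting the same 8-element buffer through two different layouts (buffer and variable names here are just for illustration; built the same way as the kernels above):

#include <cstdio>
#include "cute/tensor.hpp"

int main() {
    int buf[8];
    for (int i = 0; i < 8; i++) buf[i] = i;

    // 1-D view: shape (8), stride (1)
    cute::Tensor v1 = cute::make_tensor(&buf[0],
        cute::make_layout(cute::Shape<cute::_8>{}));
    // 2-D view of the same memory: shape (4,2), default column-major strides (1,4)
    cute::Tensor v2 = cute::make_tensor(&buf[0],
        cute::make_layout(cute::make_shape(cute::_4{}, cute::_2{})));

    printf("v1(5)   = %d\n", v1(5));     // linear offset 5
    printf("v2(1,1) = %d\n", v2(1, 1));  // coordinate (1,1) -> offset 1*1 + 1*4 = 5, same element
    return 0;
}

Both prints should show 5: the coordinates differ, but shape and stride map them to the same address.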

Without using any other API, simply making the global and shared memory views 2-D while keeping a 1-D layout for locating each thread is enough to map every thread to two elements. I still do not quite understand how the coordinate mapping behind this actually works, though; the host-side sketch after the code below checks what each thread ends up with.

#include <iostream>
#include <vector>
#include "cute/tensor.hpp"

__global__ void copy(int* data) {
    int idx = threadIdx.x;
    __shared__ int temp[8];

    //printf("data[%d]=%d\n", idx, data[idx]);

    cute::Layout t_layout = cute::make_layout(
        cute::make_shape(cute::_4{}, cute::_2{})
    );

    cute::Tensor g_input = cute::make_tensor(
        cute::make_gmem_ptr(data),
        t_layout
    );
    cute::Tensor s_input = cute::make_tensor(
        cute::make_smem_ptr(temp),
        t_layout
    );

    // Let each thread know which part of the data it is responsible for
    cute::Tensor tg_frag = cute::local_partition(
        g_input, 
        cute::make_layout(cute::Shape<cute::_4>{}),
        idx
    );
    cute::Tensor ts_frag = cute::local_partition(
        s_input, 
        cute::make_layout(cute::Shape<cute::_4>{}),
        idx
    );

    cute::copy(tg_frag, ts_frag);

    __syncthreads();

    if (idx == 0) {
        for (int i = 0; i < 8; i++) {
            printf("temp[%d]=%d\n", i, temp[i]);
        }
    }
}

int main() {
    std::vector<int> h_data(8);
    for (int i = 0; i < 8; i++) {
        h_data[i] = i;
    }

    int *d_data;
    cudaMalloc(&d_data, 8 * sizeof(int));    
    cudaMemcpy(d_data, h_data.data(), 8 * sizeof(int), cudaMemcpyHostToDevice);
    
    copy<<<1, 4>>>(d_data);

    cudaDeviceSynchronize();

    return 0;
}
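
To see the coordinate mapping concretely, the same partition can be inspected on the host. A minimal sketch (my own check, not from the CuTe docs; names are illustrative): with the data viewed as (4,2) with column-major strides (1,4) and a (4) thread layout, local_partition for thread idx should yield a fragment covering data[idx] and data[idx + 4], so 4 threads cover all 8 elements.

#include <cstdio>
#include "cute/tensor.hpp"

int main() {
    int buf[8];
    for (int i = 0; i < 8; i++) buf[i] = i;

    // Same 2-D view as in the kernel: shape (4,2), strides (1,4)
    cute::Tensor t = cute::make_tensor(&buf[0],
        cute::make_layout(cute::make_shape(cute::_4{}, cute::_2{})));

    // Emulate "thread idx" on the host and inspect its fragment
    for (int idx = 0; idx < 4; idx++) {
        cute::Tensor frag = cute::local_partition(
            t,
            cute::make_layout(cute::Shape<cute::_4>{}),
            idx
        );
        const int n = cute::size(frag);
        printf("thread %d:", idx);
        for (int i = 0; i < n; i++) {
            printf(" %d", frag(i));   // expected: buf[idx] and buf[idx + 4]
        }
        printf("\n");
    }
    return 0;
}

If that expectation holds, thread 0 gets {0, 4}, thread 1 gets {1, 5}, and so on, which is consistent with the kernel above printing temp[0]=0 through temp[7]=7.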