How to correctly use cute::copy to transfer 4bit data ? #1867

luliyucoordinate · 2024-10-13T02:06:44Z

luliyucoordinate
Oct 13, 2024

I found that in cute, the underlying storage of int4b_t is int8_t. If my data is stored as 4 bits, I can't use the following program to print it.

#include <cute/tensor.hpp>
using namespace cute;

template <typename Config>
__global__ void test_int4b_kernel(int4b_t *a_ptr, int m, int n) {
  using SmemLayoutA = typename Config::SmemLayoutA;
  using G2SCopyA = typename Config::G2SCopyA;
  constexpr int kTileM = Config::kTileM;
  constexpr int kTileN = Config::kTileN;

  extern __shared__ int8_t shm_data[];
  int4b_t *Ashm = (int4b_t *)shm_data;
  auto A = make_tensor(make_gmem_ptr(a_ptr), make_shape(m, n),
                       make_stride(n, Int<1>{}));
  int idx = threadIdx.x;
  int ix = blockIdx.x;
  auto gA =
      local_tile(A, make_tile(Int<kTileM>{}, Int<kTileN>{}), make_coord(ix, _));
  auto sA = make_tensor(make_smem_ptr(Ashm), SmemLayoutA{});
  G2SCopyA g2s_tiled_copy_a;
  auto g2s_thr_copy_a = g2s_tiled_copy_a.get_slice(idx);
  auto tAgA_copy = g2s_thr_copy_a.partition_S(gA);  // (CPY, CPY_M, CPY_K, k)
  auto tAsA_copy = g2s_thr_copy_a.partition_D(sA);  // (CPY, CPY_M, CPY_K)
  clear(tAsA_copy);
  cute::copy(g2s_tiled_copy_a, tAgA_copy(_, _, _, 0), tAsA_copy);
  cp_async_fence();
  cp_async_wait<0>();
  __syncthreads();
  if (thread0()) {
    print("gA: ");
    print_tensor(gA);
    print("\nsA: ");
    print_tensor(sA);
  }
}

template <int kTileM_ = 128, int kTileN_ = 128>
struct TestConfig {
  // tile configuration
  static constexpr int kThreadNum = 32;
  static constexpr int kTileM = kTileM_;
  static constexpr int kTileN = kTileN_;

  using SmemLayoutAtom =
      decltype(make_layout(make_shape(Int<16>{}, Int<kTileN>{}),
                           make_stride(Int<kTileN>{}, Int<1>{})));

  using SmemLayoutA = decltype(tile_to_shape(
      SmemLayoutAtom{}, make_shape(Int<kTileM>{}, Int<kTileN>{})));
  using g2s_copy_op = SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>;
  using g2s_copy_traits = Copy_Traits<g2s_copy_op>;
  using g2s_copy_atom = Copy_Atom<g2s_copy_traits, int4b_t>;

  using G2SCopyA =
      decltype(make_tiled_copy(g2s_copy_atom{},
                               make_layout(make_shape(Int<16>{}, Int<2>{}),
                                           make_stride(Int<2>{}, Int<1>{})),
                               make_layout(make_shape(Int<1>{}, Int<32>{}))));

  static constexpr int kShmSize = cute::cosize(SmemLayoutA{}) / 2;
};

void test_int4b() {
  using namespace cute;
  constexpr int M = 128;
  constexpr int N = 64;
  TestConfig<M, N> test_config;
  std::vector<int8_t> v(M * M / 2);
  for (int i = 0; i < M * M / 2; ++i) {
    int t = i % 8;
    v[i] = (t & 0x0f) | ((t & 0x0f) << 4);
  }
  int4b_t *v_d;
  cudaMalloc(&v_d, M * M / 2);
  cudaMemcpy(v_d, v.data(), M * M / 2, cudaMemcpyHostToDevice);

  dim3 block = test_config.kThreadNum;
  dim3 grid((M + test_config.kTileM - 1) / test_config.kTileM);
  int shm_size = test_config.kShmSize;
  auto partition_kernel = test_int4b_kernel<decltype(test_config)>;
  cudaFuncSetAttribute(partition_kernel,
                       cudaFuncAttributeMaxDynamicSharedMemorySize, shm_size);
  partition_kernel<<<grid, block, shm_size>>>(v_d, M, M);
  cudaFree(v_d);
}

However, after I modified kShmSize to cute::cosize(SmemLayoutA{}), I was able to print it, but the result doesn't seem quite right.

gA: gmem_ptr[4b](0x80ae00000) o (_128,_64,2):(_128,_1,_64)
  0  17  34  51  68  85  102  119  0  17  34  51  68  85  102  119  0  17  34  51  68  85  102  119  0  17  34  51  68  85  102  119  0  17  34  51  68  85  102  119  0  17  34  51  68  85  102  119  0  17  34  51  68  85  102  119  0  17  34  51  68  85  102  119
....
sA: smem_ptr[4b](0x7f853d000000) o (_128,_64):(_64,_1)
  0  17  34  51  68  85  102  119  0  17  34  51  68  85  102  119  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  17  34  51  68  85  102  119  0  17  34  51  68  85  102  119  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
....

Why are the print results of sA and gA different?

Answered by ccecka

Oct 13, 2024

This is a C/C++ thing, not really a CuTe thing. In C/C++, int4b_t* is a pointer to int4b_ts with underlying storage of int8_t. CuTe cannot assume that this pointer means "packed" (2x int4b_t within each int8_t) safely. This is the reason why array_subbyte.data() has been removed from CuTe's array_subbyte container (But apparently not CUTLASS's) -- it is dangerous and error-prone to use these naked pointers that don't mean what you think they mean.

You can create a packed CuTe tensor by specifying the logical data type you're working with:

Tensor mA = make_tensor(make_gmem_ptr<uint4b_t>(my_ptr), my_layout_of_4b);

which creates a "packed" pointer from my_ptr. Similarly with rmem or smem:

Te…

View full answer

ccecka · 2024-10-13T20:04:16Z

ccecka
Oct 13, 2024

This is a C/C++ thing, not really a CuTe thing. In C/C++, int4b_t* is a pointer to int4b_ts with underlying storage of int8_t. CuTe cannot assume that this pointer means "packed" (2x int4b_t within each int8_t) safely. This is the reason why array_subbyte.data() has been removed from CuTe's array_subbyte container (But apparently not CUTLASS's) -- it is dangerous and error-prone to use these naked pointers that don't mean what you think they mean.

You can create a packed CuTe tensor by specifying the logical data type you're working with:

Tensor mA = make_tensor(make_gmem_ptr<uint4b_t>(my_ptr), my_layout_of_4b);

which creates a "packed" pointer from my_ptr. Similarly with rmem or smem:

Tensor sA = make_tensor(make_smem_ptr<uint4b_t>(my_s_ptr), my_layout_of_4b);
Tensor rA = make_tensor<uint4b_t>(my_layout_of_4b);

1 reply

luliyucoordinate Oct 14, 2024
Author

thx, I found that cute doesn't support printing of uint{x}b_t type, so I added the relevant code. #1871

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

How to correctly use cute::copy to transfer 4bit data ? #1867

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Replies: 1 comment 1 reply

{{title}}

{{title}}

Select a reply

How to correctly use cute::copy to transfer 4bit data ? #1867

luliyucoordinate Oct 13, 2024

Replies: 1 comment · 1 reply

ccecka Oct 13, 2024

luliyucoordinate Oct 14, 2024 Author

luliyucoordinate
Oct 13, 2024

Replies: 1 comment 1 reply

ccecka
Oct 13, 2024

luliyucoordinate Oct 14, 2024
Author