将 cuBLAS 与 Thrust 的复数结合使用

2024-01-07

在我的代码中,我使用推力库中的复数数组,我想使用 cublasZgeam() 来转置数组。

使用 cuComplex.h 中的复数并不是一个更好的选择,因为我对数组进行了大量算术运算,并且 cuComplex 没有定义的运算符,例如 * +=。

这就是我定义要转置的数组的方式

thrust::complex<float> u[xmax][xmax];

我找到了这个https://github.com/jtravs/cuda_complex https://github.com/jtravs/cuda_complex,但这样使用它:

#include "cuComplex.hpp"

使用 nvcc 编译时不允许我使用提到的运算符

error: no operator "+=" matches these operands
        operand types are: cuComplex += cuComplex

有什么解决办法吗? github 上的代码很旧,可能存在问题,或者可能是我使用错误

编辑:这是有效的代码,与talonmies代码的唯一区别是添加简单的内核和指向相同数据的指针,但推力::复杂

#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>

using namespace std;

__global__ void test(thrust::complex<double>* u) {

  u[0] += thrust::complex<double>(3.3,3.3);
}

int main()
{
  int xmax = 100;
  thrust::complex<double>  u[xmax][xmax];
  double arrSize = sizeof(thrust::complex<double>) * xmax * xmax;

  thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0));
  u[49][51] += thrust::complex<double>(665.0,665.0);
  u[51][49] *= 2.0;

  cout << "Before:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;
  cout << u[0][0] << endl;

  thrust::complex<double> alpha(1.0, 0.0);
  thrust::complex<double> beta(0.0, 0.0);
  cublasHandle_t handle;
  cublasCreate(&handle);

  cuDoubleComplex* d_u;
  cuDoubleComplex* d_v;
  cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
  cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
  cudaMalloc(&d_u, arrSize);
  cudaMalloc(&d_v, arrSize);
  cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
  thrust::complex<double>* d_vTest = reinterpret_cast<thrust::complex<double>* >(d_v);
  cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                  _alpha, d_u, xmax,
                  _beta,  d_u, xmax,
                  d_v, xmax);
  test<<<1,1>>>(d_vTest);
  cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
  cout << "After:" << endl;
  cout << u[0][0] << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  return 0;
}

尽管您提出相反的抗议,C++ 标准库complex (or thrust::complex)肯定可以与 CUBLAS 配合使用。这cuComplex and cuDoubleComplex设计为与标准主机复杂类型二进制兼容,以便数据在传递到在设备上使用复杂数据的 CUBLAS 函数时不会被转换。

对您在评论中发布的代码进行简单修改,其效果与您想象的完全一样:

#include <algorithm>
#include <iostream>
#include <complex>
#include <cublas_v2.h>

using namespace std;

int main()
{
  int xmax = 100;
  complex<double>  u[xmax][xmax];
  size_t arrSize = sizeof(complex<double>) * xmax * xmax;

  fill(&u[0][0], &u[0][0] + (xmax * xmax), complex<double>(1.0,1.0));
  u[49][51] += complex<double>(665.0,665.0);
  u[51][49] *= 2.0;

  cout << "Before:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  complex<double> alpha(1.0, 0.0);
  complex<double> beta(0.0, 0.0);
  cublasHandle_t handle;
  cublasCreate(&handle);

  cuDoubleComplex* d_u;
  cuDoubleComplex* d_v;
  cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
  cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
  cudaMalloc(&d_u, arrSize);
  cudaMalloc(&d_v, arrSize);
  cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
  cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                  _alpha, d_u, xmax,
                  _beta,  d_u, xmax,
                  d_v, xmax);

  cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
  
  cout << "After:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  return 0;
}

像这样构建和运行:

~/SO$ nvcc -std=c++11 -arch=sm_52 -o complex_transpose complex_transpose.cu -lcublas
~/SO$ ./complex_transpose 
Before:
(666,666)
(2,2)
After:
(2,2)
(666,666)

唯一需要的修改是显式转换std::complex<double>类型为cuDoubleComplex。这样做,一切都会按预期进行。

使用推力,代码看起来几乎相同:

#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>

using namespace std;

int main()
{
  int xmax = 100;
  thrust::complex<double>  u[xmax][xmax];
  size_t arrSize = sizeof(thrust::complex<double>) * xmax * xmax;

  thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0));
  u[49][51] += thrust::complex<double>(665.0,665.0);
  u[51][49] *= 2.0;

  cout << "Before:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  thrust::complex<double> alpha(1.0, 0.0);
  thrust::complex<double> beta(0.0, 0.0);
  cublasHandle_t handle;
  cublasCreate(&handle);

  cuDoubleComplex* d_u;
  cuDoubleComplex* d_v;
  cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
  cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
  cudaMalloc(&d_u, arrSize);
  cudaMalloc(&d_v, arrSize);
  cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
  cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                  _alpha, d_u, xmax,
                  _beta,  d_u, xmax,
                  d_v, xmax);

  cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
  
  cout << "After:" << endl;
  cout << u[49][51] << endl;
  cout << u[51][49] << endl;

  return 0;
}

也许更接近您的用例,使用推力设备容器,内核在 CUBLAS 调用之前执行一些初始化:

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/complex.h>
#include <thrust/execution_policy.h>
#include <thrust/copy.h>
#include <cublas_v2.h>

__global__ void setup_kernel(thrust::complex<double>* u, int xmax)
{
  u[51 + 49*xmax] += thrust::complex<double>(665.0,665.0);
  u[49 + 51*xmax] *= 2.0;
}

int main()
{
  int xmax = 100;

  thrust::complex<double> alpha(1.0, 0.0);
  thrust::complex<double> beta(0.0, 0.0);
  cublasHandle_t handle;
  cublasCreate(&handle);

  thrust::device_vector<thrust::complex<double>> d_u(xmax * xmax, thrust::complex<double>(1.0,1.0));
  thrust::device_vector<thrust::complex<double>> d_v(xmax * xmax, thrust::complex<double>(0.,0.));
  setup_kernel<<<1,1>>>(thrust::raw_pointer_cast(d_u.data()), xmax);

  cuDoubleComplex* _d_u = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_u.data()));
  cuDoubleComplex* _d_v = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_v.data()));
  cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
  cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);

  cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                  _alpha, _d_u, xmax,
                  _beta, _d_u, xmax,
                  _d_v, xmax);

  thrust::complex<double>  u[xmax][xmax];

  thrust::copy(d_u.begin(), d_u.end(), &u[0][0]); 
  std::cout << "Before:" << std::endl;
  std::cout << u[49][51] << std::endl;
  std::cout << u[51][49] << std::endl;

  thrust::copy(d_v.begin(), d_v.end(), &u[0][0]); 
  std::cout << "After:" << std::endl;
  std::cout << u[49][51] << std::endl;
  std::cout << u[51][49] << std::endl;

  return 0;

}
本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)

将 cuBLAS 与 Thrust 的复数结合使用 的相关文章

随机推荐