Let's start with a simple, intuitive way to compute a vector dot product with CUDA. Since the dot product is the sum of the element-wise products of two vectors, we first parallelize the element-wise multiplication and then consider how to perform the summation.
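For reference, a minimal serial CPU version makes the two stages explicit (multiply element-wise, then accumulate); the CUDA kernels below parallelize exactly this computation. The function name dot_product_cpu is only illustrative and does not appear in the code that follows.

// Serial reference: multiply element-wise, then accumulate.
float dot_product_cpu(const float *a, const float *b, int n)
{
    float sum = 0.0f;
    for (int i = 0; i < n; i++)
    {
        sum += a[i] * b[i];   // product of the i-th pair, added to the running sum
    }
    return sum;
}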
【Method 1】
#include<stdio.h>
#include<iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "error.cuh"
#define VECTOR_LENGTH 10000
// Kernel for Method 1: a single block computes the whole dot product.
// temp[] is dynamic shared memory holding all n element-wise products.
__global__ void vec_product(float *out, float *vec_a, float *vec_b, int n)
{
    extern __shared__ float temp[];
    int idx = threadIdx.x;
    int stride = blockDim.x;
    // Each thread multiplies the elements at positions idx, idx+stride, idx+2*stride, ...
    for (int i = idx; i < n; i += stride)
    {
        temp[i] = vec_a[i] * vec_b[i];
    }
    __syncthreads();
    // Thread 0 sums all products serially.
    if (threadIdx.x == 0)
    {
        float sum = 0;
        for (int i = 0; i < n; i++)
        {
            sum += temp[i];
        }
        *out = sum;
    }
}
int main()
{
    //cuda_info();
    float *a, *b, *out;
    a = (float*)malloc(sizeof(float) * VECTOR_LENGTH);
    b = (float*)malloc(sizeof(float) * VECTOR_LENGTH);
    out = (float*)malloc(sizeof(float));
    *out = 0;
    for (int i = 0; i < VECTOR_LENGTH; i++)
    {
        a[i] = 2.0f;
        b[i] = 2.0f;
    }
    float *d_a, *d_b, *d_out;
    cudaMalloc((void**)&d_a, sizeof(float)*VECTOR_LENGTH);
    cudaMalloc((void**)&d_b, sizeof(float)*VECTOR_LENGTH);
    cudaMalloc((void**)&d_out, sizeof(float));
    cudaMemcpy(d_a, a, sizeof(float)*VECTOR_LENGTH, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float)*VECTOR_LENGTH, cudaMemcpyHostToDevice);
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start));
    cudaEventQuery(start);
    int minGridSize, blockSize;
    // Ask the runtime for the block size that maximizes occupancy,
    // given the kernel's dynamic shared-memory requirement.
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, vec_product, VECTOR_LENGTH*sizeof(float), 0);
    printf("minGridSize:%d,blockSize:%d\n", minGridSize, blockSize);
    // Launch a single block with VECTOR_LENGTH floats of dynamic shared memory.
    vec_product<<<1, blockSize, sizeof(float)*VECTOR_LENGTH>>>(d_out, d_a, d_b, VECTOR_LENGTH);
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("time cost: %.3f\n", elapsed_time);
    cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    printf("vector product: %f\n", *out);
    free(a);
    free(b);
    free(out);
    cudaFree(d_a);   // free the device buffers, not the already-freed host pointers
    cudaFree(d_b);
    cudaFree(d_out);
    return 0;
}
In Method 1 the grid size is 1 and the block size is obtained from cudaOccupancyMaxPotentialBlockSize; for this simple example you could also pick a block size yourself. The threads within the block execute in parallel, and in the code above only a single block ever runs. For a vector of length N, thread 0 handles the elements at positions 0, blockSize, 2*blockSize, ..., thread 1 handles the elements at positions 1, 1+blockSize, 1+2*blockSize, ..., and so on. The element-wise products are stored in a shared-memory array of length N. Once all threads in the block have finished the multiplications, thread 0 adds up the products and writes the result.
【nvprof test】
==5245== NVPROF is profiling process 5245, command: ./sample_v1
minGridSize:16,blockSize:1024
time cost: 0.116
vector product: 40000.000000
==5245== Profiling application: ./sample_v1
==5245== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 88.81% 53.821us 1 53.821us 53.821us 53.821us vec_product(float*, float*, float*, int)
9.08% 5.5030us 2 2.7510us 2.6550us 2.8480us [CUDA memcpy HtoD]
2.11% 1.2800us 1 1.2800us 1.2800us 1.2800us [CUDA memcpy DtoH]
API calls: 99.74% 273.58ms 3 91.195ms 4.9430us 273.57ms cudaMalloc
0.10% 262.02us 97 2.7010us 252ns 104.97us cuDeviceGetAttribute
0.07% 195.46us 1 195.46us 195.46us 195.46us cuDeviceTotalMem
0.02% 58.494us 3 19.498us 13.723us 29.006us cudaMemcpy
0.02% 50.682us 1 50.682us 50.682us 50.682us cudaEventSynchronize
0.02% 45.974us 1 45.974us 45.974us 45.974us cuDeviceGetName
0.01% 28.094us 1 28.094us 28.094us 28.094us cudaLaunchKernel
0.01% 17.564us 1 17.564us 17.564us 17.564us cuDeviceGetPCIBusId
0.00% 9.0360us 2 4.5180us 815ns 8.2210us cudaEventCreate
0.00% 7.9480us 2 3.9740us 3.8540us 4.0940us cudaEventRecord
0.00% 4.2820us 1 4.2820us 4.2820us 4.2820us cudaFuncGetAttributes
0.00% 3.3040us 1 3.3040us 3.3040us 3.3040us cudaDeviceSynchronize
0.00% 3.2010us 3 1.0670us 609ns 1.9100us cudaFree
0.00% 3.1560us 4 789ns 418ns 1.6510us cudaDeviceGetAttribute
0.00% 2.5620us 3 854ns 353ns 1.4010us cuDeviceGetCount
0.00% 1.8870us 1 1.8870us 1.8870us 1.8870us cudaEventElapsedTime
0.00% 1.6540us 1 1.6540us 1.6540us 1.6540us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0.00% 1.6410us 1 1.6410us 1.6410us 1.6410us cudaEventQuery
0.00% 1.5970us 2 798ns 430ns 1.1670us cuDeviceGet
0.00% 1.2450us 1 1.2450us 1.2450us 1.2450us cudaGetDevice
0.00% 506ns 1 506ns 506ns 506ns cuDeviceGetUuid
This approach has serious limitations. First, CUDA limits a block to at most 1024 threads, so at most 1024 element-wise products can be computed in parallel. Second, the shared memory available to a block is also limited: this method needs at least sizeof(float)*N bytes of shared memory per block, so for longer vectors the resources run out. How, then, do we extend it to multiple blocks? We can partition the array by blockSize, e.g. [0, ..., blockSize-1], [blockSize, ..., 2*blockSize-1], ..., and let multiple blocks process the segments in parallel. Method 2 gives the core code.
【Method 2】
#include<stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "error.cuh"
#define VECTOR_LENGTH 10000
// Kernel for Method 2: each block handles one blockDim.x-sized segment of the vectors.
__global__ void vector_product(float *out, float *vec_a, float *vec_b, int N)
{
    const int stride = blockDim.x;
    const int tid = threadIdx.x;
    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
    extern __shared__ float s_y[];
    // Each thread writes one product into shared memory (0 for out-of-range indices).
    s_y[tid] = (idx < N) ? vec_a[idx] * vec_b[idx] : 0.0f;
    __syncthreads();   // make all products visible before thread 0 reads them
    if (tid == 0)
    {
        float sum = 0;
        for (int i = 0; i < stride; i++)
        {
            sum += s_y[i];
        }
        // Accumulate this block's partial sum into the global result atomically.
        atomicAdd(out, sum);
    }
}
int main()
{
    GetInfo();
    float *a, *b, *out;
    a = (float*)malloc(sizeof(float) * VECTOR_LENGTH);
    b = (float*)malloc(sizeof(float) * VECTOR_LENGTH);
    out = (float*)malloc(sizeof(float));
    *out = 0;
    for (int i = 0; i < VECTOR_LENGTH; i++)
    {
        a[i] = 2.0f;
        b[i] = 2.0f;
    }
    int blockSize;
    int minGridSize;
    float *d_a, *d_b, *d_out;
    cudaMalloc((void**)&d_a, sizeof(float)*VECTOR_LENGTH);
    cudaMalloc((void**)&d_b, sizeof(float)*VECTOR_LENGTH);
    cudaMalloc((void**)&d_out, sizeof(float));
    cudaMemcpy(d_a, a, sizeof(float)*VECTOR_LENGTH, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float)*VECTOR_LENGTH, cudaMemcpyHostToDevice);
    cudaMemset(d_out, 0, sizeof(float));   // atomicAdd accumulates, so d_out must start at 0
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, vector_product, 0, 0);
    printf("minGridSize:%d,blockSize:%d\n", minGridSize, blockSize);
    // One block per blockSize-sized segment of the vector.
    minGridSize = (VECTOR_LENGTH + blockSize - 1) / blockSize;
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start));
    cudaEventQuery(start);
    vector_product<<<minGridSize, blockSize, sizeof(float)*blockSize>>>(d_out, d_a, d_b, VECTOR_LENGTH);
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("time cost: %.3f\n", elapsed_time);
    cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    printf("vector product: %f\n", *out);
    free(a);
    free(b);
    free(out);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    return 0;
}
【nvprof test】
==26432== NVPROF is profiling process 26432, command: ./sample_v1_1
GPU has cuda devices: 1
----device id: 0 info----
GPU : GeForce GTX 1650
Capbility: 7.5
Global memory: 3911MB
Const memory: 64KB
Shared memory in a block: 48KB
warp size: 32
threads in a block: 1024
block dim: (1024,1024,64)
grid dim: (2147483647,65535,65535)
minGridSize:16,blockSize:1024
time cost: 0.036
vector product: 40000.000000
.....
This code uses two techniques that are common in CUDA programming: shared memory and atomic operations. Because the blocks execute independently, thread 0 of each block must add its block's partial dot product to the output variable; an atomic operation guarantees the blocks do not interfere with each other. Now consider one more issue. The multiplications are parallelized, but the summation is not parallel enough: it still runs as a serial loop in thread 0 of each block. How can we optimize the addition? We use the idea of reduction. For the segment of the array that a block processes, each element of the second half is added to the corresponding element of the first half. Repeating this process, the first element ends up holding the sum of all the original elements; this is the so-called halving (binary) reduction. Method 3 implements it.
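To make the halving idea concrete, here is a minimal host-side sketch of the same reduction on an ordinary array, assuming the length is a power of two (as it is for a 1024-thread block). The kernel in Method 3 performs exactly these passes, with one thread per element and a __syncthreads() between passes.

// Halving reduction on a length-n array (n assumed to be a power of two).
// After the loop, data[0] holds the sum of all original elements.
void reduce_sum(float *data, int n)
{
    for (int offset = n >> 1; offset > 0; offset >>= 1)
    {
        for (int i = 0; i < offset; i++)
        {
            data[i] += data[i + offset];   // fold the second half onto the first half
        }
    }
}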
【Method 3】
#include <stdio.h>
#include <algorithm>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "error.cuh"
#define VECTOR_LENGTH 10000
// Kernel for Method 3: the per-block summation is done as a halving reduction in shared memory.
__global__ void vector_product(float *out, float *vec_a, float *vec_b, int N)
{
    const int tid = threadIdx.x;
    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
    extern __shared__ float s_y[];
    s_y[tid] = (idx < N) ? vec_a[idx] * vec_b[idx] : 0.0f;
    __syncthreads();
    // Halving reduction: in each pass, the first `offset` threads fold the upper half onto the lower half.
    for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }
    // s_y[0] now holds this block's partial sum.
    if (tid == 0)
    {
        atomicAdd(out, s_y[0]);
    }
}
int main()
{
    //GetInfo();
    float *a, *b, *out;
    a = (float*)malloc(sizeof(float) * VECTOR_LENGTH);
    b = (float*)malloc(sizeof(float) * VECTOR_LENGTH);
    out = (float*)malloc(sizeof(float));
    *out = 0;
    for (int i = 0; i < VECTOR_LENGTH; i++)
    {
        a[i] = 2.0f;
        b[i] = 2.0f;
    }
    int blockSize;
    int minGridSize;
    float *d_a, *d_b, *d_out;
    cudaMalloc((void**)&d_a, sizeof(float)*VECTOR_LENGTH);
    cudaMalloc((void**)&d_b, sizeof(float)*VECTOR_LENGTH);
    cudaMalloc((void**)&d_out, sizeof(float));
    cudaMemcpy(d_a, a, sizeof(float)*VECTOR_LENGTH, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float)*VECTOR_LENGTH, cudaMemcpyHostToDevice);
    cudaMemset(d_out, 0, sizeof(float));
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, vector_product, 0, 0);
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start));
    cudaEventQuery(start);
    // Take the smaller of the occupancy suggestion and the block count needed to cover the vector.
    minGridSize = std::min(minGridSize, (VECTOR_LENGTH + blockSize - 1) / blockSize);
    printf("minGridSize:%d,blockSize:%d\n", minGridSize, blockSize);
    vector_product<<<minGridSize, blockSize, sizeof(float)*blockSize>>>(d_out, d_a, d_b, VECTOR_LENGTH);
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));
    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("time cost: %.3f\n", elapsed_time);
    cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    printf("vector product: %f\n", *out);
    free(a);
    free(b);
    free(out);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    return 0;
}
【nvprof test】
==30436== NVPROF is profiling process 30436, command: ./sample_v3
minGridSize:10,blockSize:1024
time cost: 0.036
vector product: 40000.000000
【Appendix】
GeForce GTX 1650: https://www.nvidia.cn/geforce/graphics-cards/gtx-1650/
Device0: "GeForce GTX 1650"
Total amount of global memory: 4101898240 bytes
Number of multiprocessors: 16
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per block: 1024
Maximum sizes of each dimension of a block: 1024 x 1024 x 64
Maximum size of each dimension of a grid: 2147483647 x 65535 x 65535
Maximum memory pitch: 2147483647 bytes
Texture pitch alignment: 32 bytes
Clock rate: 1.56 GHz
The code first calls cudaGetDeviceCount to get the number of NVIDIA GPUs in the system,
then calls cudaGetDeviceProperties to query each GPU's properties:
deviceProp.name is the GPU name; if there is no GPU, "Device Emulation" is reported.
deviceProp.totalGlobalMem is the size of global memory. For large data sets or large models, device memory must be larger than the data; here it is about 4 GB.
deviceProp.multiProcessorCount is the number of streaming multiprocessors (SMs); the number of streaming processors (SPs) is the SM count times the SPs per SM (64 per SM on Pascal, 128 on Maxwell, 192 on Kepler, 32 on Fermi).
deviceProp.totalConstMem is the size of constant memory, 64 KB here.
deviceProp.sharedMemPerBlock is the size of shared memory per block; shared memory is much faster than global memory.
deviceProp.regsPerBlock is the number of registers available per block.
deviceProp.warpSize is the number of threads in a warp.
deviceProp.maxThreadsPerBlock is the maximum number of threads in a block.
deviceProp.maxThreadsDim[] gives the maximum size of each of the three block dimensions.
deviceProp.maxGridSize[] gives the maximum size of each of the three grid dimensions.
deviceProp.memPitch is the maximum pitch allowed for pitched memory allocations.
deviceProp.texturePitchAlignment is the pitch alignment requirement for textures bound to pitched memory.
deviceProp.clockRate is the GPU core clock rate.
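The GetInfo() call used in Method 2 prints exactly this kind of information, but its implementation is not shown in this post. As a rough sketch (the function name and the exact print format are assumptions), a device-query helper built on these two API calls could look like this:

#include <stdio.h>
#include "cuda_runtime.h"

// Hypothetical helper in the spirit of GetInfo(): list every CUDA device and a few key properties.
void GetInfo()
{
    int count = 0;
    cudaGetDeviceCount(&count);                 // number of CUDA-capable GPUs
    printf("GPU has cuda devices: %d\n", count);
    for (int i = 0; i < count; i++)
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);      // fill the property struct for device i
        printf("----device id: %d info----\n", i);
        printf("  GPU : %s\n", prop.name);
        printf("  Capability: %d.%d\n", prop.major, prop.minor);
        printf("  Global memory: %luMB\n", (unsigned long)(prop.totalGlobalMem >> 20));
        printf("  Shared memory in a block: %luKB\n", (unsigned long)(prop.sharedMemPerBlock >> 10));
        printf("  warp size: %d\n", prop.warpSize);
        printf("  threads in a block: %d\n", prop.maxThreadsPerBlock);
    }
}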
Computing the optimal grid/block pair
cudaOccupancyMaxPotentialBlockSize(
    int    *minGridSize,
    int    *blockSize,
    T       func,
    size_t  dynamicSMemSize,
    int     blockSizeLimit)
Purpose:
Returns the grid and block size that achieve maximum potential occupancy on the device.
Parameters:
minGridSize : returns the minimum grid size needed to achieve the best potential occupancy.
blockSize : returns the suggested block size.
func : the kernel function.
dynamicSMemSize : the amount of dynamic shared memory each block intends to use, in bytes.
blockSizeLimit : the maximum block size the kernel was designed for; 0 means no limit.
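As a minimal usage sketch (the kernel my_kernel and its body are only illustrative), the typical pattern is to take the suggested blockSize and then derive the grid size from the problem size rather than from minGridSize, just as the three methods above do:

#include "cuda_runtime.h"

// Hypothetical kernel used only to illustrate the occupancy API.
__global__ void my_kernel(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2.0f;   // placeholder work
}

void launch(float *d_data, int n)
{
    int minGridSize = 0, blockSize = 0;
    // Suggested block size for maximum occupancy, with no dynamic shared memory and no block-size limit.
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, my_kernel, 0, 0);
    // Grid size chosen to cover all n elements with the suggested block size.
    int gridSize = (n + blockSize - 1) / blockSize;
    my_kernel<<<gridSize, blockSize>>>(d_data, n);
}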