GPU与CPU版本的矩阵乘法对比

2023-11-04

转载自：http://www.cnblogs.com/stormhan/p/5467187.html

由于刚刚开始学习Cuda，还没有整理出一个完整的Cuda类，只是在Nvidia提供的kernel模板代码上做修改。

　　但用于初体验GPU给我们带来的好处也绰绰有余了。

　　直接贴代码：

/*
    矩阵乘法，CPU版本和GPU版本的对比


*/

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <malloc.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <Windows.h>
#include <string>

// Selects which GPU implementation of the matrix product is launched.
enum Type
{
    Mode1 = 1,   // Mode 1: every element C[i][j] is assigned its own thread.
    Mode2 = 2     // Mode 2: instead of treating each C[i][j] as one monolithic sum, use
                 //           C(i,j) = sum { A(i,k)*B(k,j) } to subdivide the work further:
                 //           Csub(i,j) = sum{A(i,ksub+offsetA)*B(ksub+offsetB,j)}  0<=ksub < blockSize
                 //           C(i, j) = sum{ Csub(i, j) }
                 //           i.e. the matrices are split into n*n large sub-blocks, each thread block
                 //           computes the partial products of the sub-blocks it is responsible for and
                 //           the partial results are summed. Shared memory is the main optimization here.
};

// Forward declaration of the host wrapper defined after main(): it copies a (HA x WA) and
// b (HB x WB) to the device, launches the kernel selected by `mode`, and copies the
// HA x WB result back into c. Returns the status of the last CUDA call it made.
cudaError_t addWithCuda(float *c, const float *a, const float *b, unsigned int WA, unsigned int HA, unsigned int WB, unsigned int HB, Type mode);

// Naive matrix product C = A * B (row-major): one thread computes one element of C.
//
//   c  : output, WB columns wide
//   a  : left operand, WA columns wide
//   b  : right operand, WB columns wide (its row count must equal WA)
//   WA : width of A == shared (inner) dimension of the product
//   WB : width of B == width of C
//
// Launch layout: 2D grid of 2D blocks; x indexes the column, y indexes the row of C.
// The row count of C is not passed in, so the grid's y extent must cover the rows of C
// exactly (no row guard is possible here). Columns past WB are skipped.
__global__ void MatrixMulGPU_1(float *c, const float *a, const float *b, unsigned int WA, unsigned int WB)
{
    // Locate the element of C this thread is responsible for.
    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads in a partially filled block column would otherwise write into
    // the neighbouring row of C.
    if (col >= WB)
    {
        return;
    }

    const size_t rowOffsetA = (size_t)row * WA;

    // The dot product runs over the shared dimension, i.e. the width of A
    // (the original looped to WB, which is only correct for square inputs).
    float sum = 0.0f;
    for (unsigned int k = 0; k < WA; ++k)
    {
        sum += a[rowOffsetA + k] * b[(size_t)k * WB + col];
    }

    c[(size_t)row * WB + col] = sum;
}

// Tiled matrix product C = A * B (row-major) that stages BLOCK_SIZE x BLOCK_SIZE
// tiles of A and B in shared memory before multiplying them.
//
//   c  : output, WB columns wide
//   a  : left operand, WA columns wide
//   b  : right operand, WB columns wide
//
// Each thread accumulates one element of the block's output tile.
// Assumes blockDim == (BLOCK_SIZE, BLOCK_SIZE) and that all matrix dimensions are
// multiples of BLOCK_SIZE -- there are no bounds checks in this kernel.
template<int BLOCK_SIZE> __global__ void MatrixMulGPU_2(float *c, const float *a, const float *b, unsigned int WA, unsigned int WB)
{
    // Shared staging tiles for the current sub-matrices of A and B.
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Position of this block in the grid and of this thread in the block.
    const int blockCol = blockIdx.x;
    const int blockRow = blockIdx.y;
    const int localCol = threadIdx.x;
    const int localRow = threadIdx.y;

    // Offsets of the first and last element of the row band of A owned by this block.
    const int aFirst = WA * BLOCK_SIZE * blockRow;
    const int aLast = aFirst + WA - 1;

    // Running offsets of the current tile: A advances to the right, B advances downwards.
    int aTile = aFirst;
    int bTile = BLOCK_SIZE * blockCol;

    // Partial result for the single output element owned by this thread.
    float acc = 0;

    while (aTile <= aLast)
    {
        // Every thread copies one element of each tile from device memory.
        As[localRow][localCol] = a[aTile + WA * localRow + localCol];
        Bs[localRow][localCol] = b[bTile + WB * localRow + localCol];

        // Both tiles must be fully loaded before anyone reads them.
        __syncthreads();

#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            acc += As[localRow][k] * Bs[k][localCol];
        }

        // Nobody may overwrite the tiles until every thread is done reading them.
        __syncthreads();

        aTile += BLOCK_SIZE;
        bTile += BLOCK_SIZE * WB;
    }

    // Store the finished element into this block's tile of C.
    const int cFirst = WB * BLOCK_SIZE * blockRow + BLOCK_SIZE * blockCol;
    c[cFirst + WB * localRow + localCol] = acc;
}

//CPU version (host-side reference implementation)
//
// Accumulates the row-major product A * B into _C:  _C(i,j) += sum_k A(i,k) * B(k,j).
//
//   _C     : output, HA rows x WB columns; the caller must zero it beforehand to obtain
//            the plain product, because results are added to the existing contents
//   _A     : left operand,  HA rows x WA columns
//   _B     : right operand, HB rows x WB columns
//   WA, HA : width / height of A
//   WB, HB : width / height of B
//
// Prints a message and terminates the process with a failure status when the inner
// dimensions do not match (WA != HB).
void MatrixMulCPU(float *_C, const float* _A, const float* _B, int WA, int HA, int WB, int HB)
{
    if (WA != HB)
    {
        printf("the matrix A and B cannot be multipled!");
        // An incompatible shape is an error, so do not report success to the shell.
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < HA; ++i)
    {
        const float *rowA = _A + (size_t)i * WA;
        // C is WB columns wide, so its row stride is WB (the original used WA,
        // which is only correct for square matrices and overruns C otherwise).
        float *rowC = _C + (size_t)i * WB;

        for (int j = 0; j < WB; ++j)
        {
            // Start from the existing value to keep the accumulate-into-C contract.
            float sum = rowC[j];
            for (int k = 0; k < WA; ++k)
            {
                sum += rowA[k] * _B[(size_t)k * WB + j];
            }
            rowC[j] = sum;
        }
    }
}

// Fills the first _size entries of _data with pseudo-random values in [0, 100],
// drawn from the C library rand() generator (seed it with srand() beforehand).
void randomInit(float* _data, int _size)
{
    int idx = 0;
    while (idx < _size)
    {
        // Scale rand() into [0, 1] first, then stretch to [0, 100].
        const float unit = rand() / (float)RAND_MAX;
        _data[idx] = unit * 100;
        ++idx;
    }
}

// Prints a row-major matrix to stdout, one matrix row per output line, followed by
// a blank line.
//
//   m_Matrix : W * H values stored row by row
//   W        : number of columns
//   H        : number of rows
//
// Nothing but the trailing blank line is printed when W or H is not positive.
void printMatrix(float* m_Matrix, int W, int H)
{
    for (int r = 0; r < H; ++r)
    {
        const float *rowStart = m_Matrix + (size_t)r * W;
        for (int col = 0; col < W; ++col)
        {
            printf("%2.1f ", rowStart[col]);
        }
        // Break the line at the end of each row (the original broke after the
        // first element of every row but the first, and divided by W == 0).
        printf("\n");
    }
    printf("\n");
}

// Compares two float arrays element by element and returns true when every pair
// agrees within a mixed absolute/relative tolerance:
//     |_C[i] - _D[i]| <= absTol + relTol * |_C[i]|
//
//   _C   : reference values
//   _D   : values under test
//   size : number of elements to compare (0 compares nothing and yields true)
//
// Bit-exact equality is not a meaningful test between a CPU and a GPU result, since
// the two may round intermediate products differently; NaN never matches.
bool CheckAnswer(const float* _C, const float* _D, unsigned int size)
{
    const float absTol = 1e-5f;
    const float relTol = 1e-4f;

    for (unsigned int i = 0; i < size; ++i)
    {
        // Identical values (including equal infinities) always match.
        if (_C[i] == _D[i])
        {
            continue;
        }

        const float diff = fabsf(_C[i] - _D[i]);
        const float limit = absTol + relTol * fabsf(_C[i]);

        // Written as a negated <= so that a NaN difference counts as a mismatch.
        if (!(diff <= limit))
        {
            return false;
        }
    }

    return true;
}

// Benchmark driver: multiplies two random 1024 x 1024 matrices on the CPU and with
// both GPU modes, prints the elapsed wall-clock time of each, and reports whether the
// GPU results agree with the CPU reference.
//
// Returns 0 when all three computations ran, 1 when a host allocation or a CUDA call failed.
int main()
{
    const int width_A = 1024;
    const int height_A = 1024;
    const int width_B = 1024;
    const int height_B = 1024;

    const size_t countA = (size_t)height_A * width_A;
    const size_t countB = (size_t)height_B * width_B;
    const size_t countC = (size_t)height_A * width_B;

    // Everything is declared before the first goto so that no jump skips an initialization.
    int exitCode = 1;
    unsigned int tickStart = 0;
    cudaError_t cudaStatus = cudaSuccess;
    bool mode1Matches = false;
    bool mode2Matches = false;

    // A and B are completely overwritten by randomInit; C, D and E must start at
    // zero because MatrixMulCPU accumulates into its output.
    float *A = (float *)malloc(sizeof(float) * countA);
    float *B = (float *)malloc(sizeof(float) * countB);
    float *C = (float *)calloc(countC, sizeof(float));   // CPU result
    float *D = (float *)calloc(countC, sizeof(float));   // GPU Mode1 result
    float *E = (float *)calloc(countC, sizeof(float));   // GPU Mode2 result

    if (A == NULL || B == NULL || C == NULL || D == NULL || E == NULL)
    {
        fprintf(stderr, "host memory allocation failed!\n");
        goto Cleanup;
    }

    // Seed the random number generator.
    srand((unsigned)time(0));

    randomInit(B, height_B * width_B);
    randomInit(A, height_A * width_A);

    //printMatrix(B, width_B, height_B);
    //printMatrix(A, width_A, height_A);

    // CPU reference
    tickStart = GetTickCount();
    MatrixMulCPU(C, A, B, width_A, height_A, width_B, height_B);
    printf("CPU use time : %ums\n", (unsigned int)(GetTickCount() - tickStart));

    // GPU, mode 1. The measured time covers the whole addWithCuda call, i.e. device
    // allocation and host<->device copies as well as the kernel.
    // NOTE(review): this first call presumably also pays the one-time CUDA
    // initialization cost, which would bias the Mode1 figure -- confirm with a warm-up run.
    tickStart = GetTickCount();
    cudaStatus = addWithCuda(D, A, B, width_A, height_A, width_B, height_B, Mode1);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!\n");
        goto Cleanup;
    }
    printf("GPU mode1 use time : %ums\n", (unsigned int)(GetTickCount() - tickStart));

    // GPU, mode 2
    tickStart = GetTickCount();
    cudaStatus = addWithCuda(E, A, B, width_A, height_A, width_B, height_B, Mode2);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!\n");
        goto Cleanup;
    }
    printf("GPU mode2 use time : %ums\n", (unsigned int)(GetTickCount() - tickStart));

    // The answer is right only if BOTH GPU results match the CPU reference
    // (the original reported "wrong" only when both of them were wrong).
    mode1Matches = CheckAnswer(C, D, height_A * width_B);
    mode2Matches = CheckAnswer(C, E, height_A * width_B);
    if (!mode1Matches || !mode2Matches)
        printf("The answer is wrong!");
    else
        printf("The answer is right!");

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!");
        goto Cleanup;
    }

    exitCode = 0;

Cleanup:
    // free(NULL) is a no-op, so this is safe on every path.
    free(A);
    free(B);
    free(C);
    free(D);
    free(E);

    return exitCode;
}

// Host wrapper that computes the row-major matrix product C = A * B on the GPU.
//
//   c      : host output buffer, HA rows x WB columns
//   a      : host input A, HA rows x WA columns
//   b      : host input B, HB rows x WB columns
//   WA, HA : width / height of A
//   WB, HB : width / height of B
//   mode   : Mode1 launches MatrixMulGPU_1, Mode2 launches MatrixMulGPU_2<16>
//
// Returns cudaSuccess on success, otherwise the error of the first failing CUDA call.
// Returns cudaErrorInvalidValue without touching the device when
//   - mode is neither Mode1 nor Mode2,
//   - any dimension is zero or WA != HB,
//   - HA or WB (and, for Mode2, WA) is not a multiple of the 16 x 16 block size;
//     the grid is built with an exact division and the kernels cannot check row bounds.
// Device buffers are released on every path.
cudaError_t addWithCuda(float *c, const float *a, const float *b, unsigned int WA, unsigned int HA, unsigned int WB, unsigned int HB, Type mode)
{
    // Thread-block edge length; also the tile size of MatrixMulGPU_2.
    const int kBlockSize = 16;

    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaError_t cudaStatus;

    // Byte counts are computed in size_t so that large matrices do not wrap around
    // in 32-bit unsigned arithmetic. All locals are declared before the first goto
    // so that no jump to Error skips an initialization.
    const size_t bytesA = (size_t)HA * WA * sizeof(float);
    const size_t bytesB = (size_t)HB * WB * sizeof(float);
    const size_t bytesC = (size_t)HA * WB * sizeof(float);

    if (mode != Mode1 && mode != Mode2) {
        fprintf(stderr, "addWithCuda: unknown mode %d\n", (int)mode);
        return cudaErrorInvalidValue;
    }

    if (WA == 0 || HA == 0 || WB == 0 || HB == 0 || WA != HB) {
        fprintf(stderr, "addWithCuda: incompatible matrix dimensions (A is %ux%u, B is %ux%u)\n", HA, WA, HB, WB);
        return cudaErrorInvalidValue;
    }

    if (HA % kBlockSize != 0 || WB % kBlockSize != 0 || (mode == Mode2 && WA % kBlockSize != 0)) {
        fprintf(stderr, "addWithCuda: matrix dimensions must be multiples of %d\n", kBlockSize);
        return cudaErrorInvalidValue;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for the three matrices (two inputs, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, bytesC);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytesA);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytesB);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input matrices from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytesA, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytesB, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // One thread per element of C. The launch configuration lives in its own scope
    // so the goto statements above never cross the dim3 initializations.
    {
        dim3 Threads(kBlockSize, kBlockSize);
        dim3 Blocks(WB / kBlockSize, HA / kBlockSize);

        if (mode == Mode1)
        {
            MatrixMulGPU_1<<<Blocks, Threads>>>(dev_c, dev_a, dev_b, WA, WB);
        }
        else
        {
            MatrixMulGPU_2<kBlockSize><<<Blocks, Threads>>>(dev_c, dev_a, dev_b, WA, WB);
        }
    }

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the result matrix from the GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytesC, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Reached on success as well; the pointers are still 0 if never allocated.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

代码中，使用了CPU的计算和两种GPU的运算，最终的运行结果如下：

可以明显地看出，GPU的运行速度比CPU快很多，并且任务划分得越细，运行速度也越快。

后续我还想通过更多的方式（比如texture binding）来继续进行优化。

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

GPU

GPU与CPU版本的矩阵乘法对比的相关文章

Tensorflow 相同的代码，但从 CPU 设备到 GPU 设备得到不同的结果

我正在尝试实现一个程序来测试 GPU 设备上的 Tensorflow 性能数据测试是MNIST数据使用多层感知器神经网络进行监督训练我跟着这个简单的例子 http gist github com nishidy 8176548ec
在具有 4.14 内核的 IMX6 上使用 vivante GPU

我正在使用 yocto rocko Linux 4 14 24 开发 IMX6QP 并尝试使用 GPU 我的 yocto 配置文件 MACHINE imx6qp tx6 emmc DL DIR BSPDIR downloads SSTATE
Keras 不在具有 python 3.5 和 Tensorflow 1.4 的 Pycharm 上使用 GPU [重复]

这个问题在这里已经有答案了 from tensorflow python client import device lib def get available gpus local device protos device lib list
如何检查 PyTorch 是否正在使用 GPU？

如何检查 PyTorch 是否正在使用 GPU 这nvidia smi命令可以检测 GPU 活动但我想直接从 Python 脚本内部检查它这些功能应该有助于 gt gt gt import torch gt gt gt torch cu
在 Tensorflow 中训练简单模型 GPU 比 CPU 慢

我在 Tensorflow 中设置了一个简单的线性回归问题并在 1 13 1 中使用 Tensorflow CPU 和 GPU 创建了简单的 conda 环境在 NVIDIA Quadro P600 的后端使用 CUDA 10 0 然而
使用“boot”包进行引导的 GPU 计算

我想使用引导程序进行大型分析我发现使用并行计算提高了引导速度如以下代码所示并行计算 detect number of cpu library parallel detectCores library boot boot functio
GPU编程简介[关闭]

Closed 这个问题是基于意见的 help closed questions 目前不接受答案每个人的桌面上都有一台以显卡 GPU 形式存在的巨大的大规模并行超级计算机 GPU 社区的 hello world 相当于什么我该做什么去哪
某些子网格未使用 CUDA 动态并行执行

我正在尝试 CUDA 5 0 GTK 110 中的新动态并行功能我遇到了一个奇怪的行为即我的程序没有返回某些配置的预期结果不仅是意外的而且每次启动都会出现不同的结果现在我想我找到了问题的根源似乎当生成太多子网格时某些子网格由
为什么 PyTorch nn.Module.cuda() 不将模块张量移动到 GPU，而仅将参数和缓冲区移动到 GPU？

nn Module cuda 将所有模型参数和缓冲区移动到 GPU 但为什么不是模型成员张量呢 class ToyModule torch nn Module def init self gt None super ToyModule se
如何在C++中的cudaDeviceReset()之后重用tensorflow？

我正在使用 C 开发一个大型 CUDA 应用程序该应用程序运行各种模型需要完全释放所有 GPU 内存否则其他操作将失败我能够在关闭所有 tf 会话并运行 cudaDeviceReset 后释放所有内存但之后我无法运行任何新的张量流
使 CUDA 内存不足

我正在尝试训练网络但我明白了我将批量大小设置为 300 并收到此错误但即使我将其减少到 100 我仍然收到此错误更令人沮丧的是在 1200 个图像上运行 10 epoch 大约需要 40 分钟有什么建议吗错了我怎样才能加快这
C# - 获取 GPU 的总使用百分比

我正在向我的程序添加一些新功能这些功能当前通过串行连接将 CPU 使用情况和 RAM 使用情况发送到 Arduino 请参阅this https create arduino cc projecthub thesahilsaluja cp
NV_path_rendering替代方案[关闭]

Closed 这个问题不符合堆栈溢出指南 help closed questions 目前不接受答案我刚刚观看了 Siggraph 2012 的一个非常令人印象深刻的演示 http nvidia fullviewmedia com sig
CUDA - 将 CPU 变量传输到 GPU __constant__ 变量

与 CUDA 的任何事情一样最基本的事情有时也是最难的所以我只想将变量从 CPU 复制到 GPUconstant变量我很难过这就是我所拥有的 constant int contadorlinhasx d int main int
错误：NVIDIA-SMI 失败，因为无法与 NVIDIA 驱动程序通信

NVIDIA SMI 抛出此错误 NVIDIA SMI 失败因为无法与 NVIDIA 通信司机确保安装了最新的 NVIDIA 驱动程序并且跑步我清除了 NVIDIA 并按照提到的步骤重新安装了它here https askubun
如何在GPU支持下运行python代码

我创建了一个 Flask 服务用于接受以相机 URL 作为参数的请求用于在相机框架中查找对象桌子椅子等我已经在 Flask 中编写了用于接受 POST 请求的代码 app route rest detectObjects meth
CUDA Thrust 的多 GPU 使用

我想使用我的两张显卡通过 CUDA Thrust 进行计算我有两张显卡在单卡上运行对于两张卡都适用即使我在 std vector 中存储两个 device vector 也是如此如果我同时使用两张卡循环中的第一个周期将起作用并且不
CUDA 添加矩阵的行

我试图将 4800x9600 矩阵的行加在一起得到一个 1x9600 的矩阵我所做的是将 4800x9600 分成 9 600 个矩阵每个矩阵长度为 4800 然后我对 4800 个元素进行缩减问题是这真的很慢有人有什么建议吗
了解流式多处理器 (SM) 和流式处理器 (SP)

我正在尝试了解 GPU 的基本架构我已经阅读了很多材料包括这个非常好的答案 https stackoverflow com a 2213744 2386113 但我仍然很困惑无法得到一个好的图片我的理解 GPU 包含两个或多个流式多
需要 TensorFlow 依赖项。如何在 Windows 上运行 TensorFlow

我有兴趣让 TensorFlow 在 Windows 上运行但目前我意识到这是不可能的因为某些依赖项无法在 Windows 上使用例如巴泽尔之所以出现这种需求是因为据我目前了解从 TensorFlow 访问 GPU 的唯一方法是

随机推荐

面试题（2）

1 J2EE是什么 2 介绍JAVA中的Collection FrameWork 包括如何写自己的数据结构如COLLECTION中遗留类 HASHTABLE VECTOR 和现有类的区别同步 3 Java中异常处理机制事件机制 4 E
实验7-3-8 统计字符出现次数（20 分）

实验7 3 8 统计字符出现次数 20 分本题要求编写程序统计并输出某给定字符在给定字符串中出现的次数输入格式输入第一行给出一个以回车结束的字符串少于80个字符第二行输入一个字符输出格式在一行中输出给定字符在给定字符串中出现
三位水仙花数的两种做法

编写程序计算如下数列的值 1 2 3 4 966 其中所有数字为整数从1开始递增奇数为正偶数为负 s 0 for i in range 967
mybatis plus 写sql语句

一 api public Map
app常见专项测试点

常见的apk专项测试主要有几类主要指项目中经常用到的 1 稳定性 2 安全性 3 兼容性 4 版本升级 5 流量测试 6 实际测试总结对整个项目团队是如何进行测试以及测试的内容一稳定性测试稳定性测试是指对应用进行长时间的操作检
jq+ajax提交合并表单,JQuery通过Ajax提交表单并返回结果

如下 1 非Ajax 对应后台 2 JQuery之Ajax 在介绍使用JQuery提交表单前我们需要首先引用jquery form js 它来自于http www malsup com jquery form 至此我们需要引用的JS文件
Nice! I just got a Minecraft gift code for FREE! :D https://freeminecraftgiftcode.net

Nice I just got a Minecraft gift code for FREE D https freeminecraftgiftcode net Nice I just got a Minecraft gift code f
Nodejs学习笔记 - 版本号

npm outdated 查看过期了的包也不能说过期反正版本号不是最新的就那么个意思 QWQ npm update 更新那个包语言组织能力有点差理解就行了 QWQ 进入正题了哇版本号 eg 13 1 4 上边的版本号共分为3块
wind10 idea中 go 开发环境搭建

下载安装包国内的不用翻墙 https studygolang com 官网需翻墙 https golang org 这里使用的的是第一种https studygolang com 下载后解压到本地目录结构配置环境变量 path中配
while语句中的break和continue

1 break介绍 include
怎么彻底删除电脑上的软件_彻底删除流氓软件的两个工具！

流氓软件弹出广告我想这些应该让很多同学都苦不苦不堪言卸载不干净无法卸载卸载残留这些顽固的软件就如同牛皮癣一样一旦粘着就很难摆脱本文就来推荐2款软件这两款都是经过多年使用对比保留下来的从此就可以对流氓软件说再见了前言
android studio构建的AAB是什么

AAB 是 Android App Bundle 的缩写是一种由 Google 推出的 Android 应用程序发布格式 AAB 格式的应用程序包含了应用程序的所有代码和资源但是与传统的 APK 包不同 AAB 包含了多个分割的模块每
JAVA、MySql实现登录注册(网页)

初学JAVA EE 老师留下一小作业用JAVA实现与服务器端交互实现登录和注册功能初学一种专业课很多老师都会留下一种让学生实现登录和注册的作业下面是记录的实现步骤 1 首先是账号密码输入框和按钮登录
Sa-Token的Token有效期和临时有效期的区别

各位不要再卷了周六我在家打着游戏群消息就一直叮叮叮进去看了看周六还加班干活哎真卷 ps 在卷就没了吐槽一下进入正题就周六群友提问做一下总结群友问题为什么不能续期先说一下这位群友的测试方法 token有效期 10秒在
关于checkpoint机制

关于checkpoint机制一简介思考一下这个场景如果重做日志可以无限地增大同时缓冲池也足够大那么是不需要将缓冲池中页的新版本刷新回磁盘因为当发生宕机时完全可以通过重做日志来恢复整个数据库系统中的数据到宕机发生的时刻但是这
idea忽略.iml文件和.idea目录（避坑和填坑）

当操作git下的项目时 iml文件和 idea目录下的有些文件会显示已被改动会出现在git待提交目录下这里为了防止提交这些并不需要的东西需要在idea中设置忽略这些东西注输入完 iml和 idea后点回车再点OK 不然不会保存这些
hiberate4整合Spring3.1时出现 java.lang.NoClassDefFoundError: Lorg/hibernate/cache/CacheProvider

hibernate4整合spring3 1的过程中发现了java lang NoClassDefFoundError Lorg hibernate cache CacheProvider异常查了一下相关资料原来发现hibernate4
关于大数据技术原理与应用的学习（5）

学习目标大数据技术原理与应用学习内容 5 NoSQL数据库例如 5 1NoSQL数据库 5 2与关系数据库的比较 5 3四大类型 5 4理论基石 5 5从NoSQL到NewSQL 5 6文档数据库MongoDB 学习时间 2022 0
Android 验证码输入框连续性粘贴使用第三方包

第一步 android compileOptions sourceCompatibility JavaVersion VERSION 1 8 targetCompatibility JavaVersion VERSION 1 8 depen
GPU与CPU版本的矩阵乘法对比

转载自 http www cnblogs com stormhan p 5467187 html 由于刚刚开始学习Cuda 还没有整理出一个完整的Cuda类只是在Nvidia提供的kenerl架构上做修改但用于初体验GPU给我们带来的好

GPU与CPU版本的矩阵乘法对比

GPU与CPU版本的矩阵乘法对比 的相关文章

随机推荐

热门标签

GPU与CPU版本的矩阵乘法对比的相关文章