CUDA编程一天入门

2023-11-06

0 环境准备

1 套路

CUDA 编程模型是一个异构模型，其中同时使用 CPU 和 GPU。在 CUDA 中，主机是指 CPU 及其内存，而设备是指 GPU 及其内存。在主机上运行的代码可以管理主机和设备上的内存，还可以启动内核，内核是在设备上执行的功能。这些内核由许多 GPU 线程并行执行。

鉴于 CUDA 编程模型的异构性质，CUDA C 程序的典型操作顺序是：

声明并分配主机和设备内存。

初始化主机数据。

将数据从主机传输到设备。

执行一个或多个内核。

将结果从设备传输到主机。

2 并行执行内核设置

三重 V 形之间的信息是执行配置，它指示并行执行内核的设备线程数。在 CUDA 中，软件中有一个线程层次结构，它模仿线程处理器在 GPU 上的分组方式。在 CUDA 编程模型中，我们谈到启动带有线程块网格的内核。执行配置中的第一个参数指定网格中的线程块数，第二个参数指定线程块中的线程数。

int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
add<<<numBlocks, blockSize>>>(N, x, y);

3 示例代码simpleTexture3D

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
  3D texture sample

  This sample loads a 3D volume from disk and displays slices through it
  using 3D texture lookups.
*/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <helper_gl.h>

#if defined(__APPLE__) || defined(MACOSX)
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#include <GLUT/glut.h>
#ifndef glutCloseFunc
#define glutCloseFunc glutWMCloseFunc
#endif
#else
#include <GL/freeglut.h>
#endif

// includes, cuda
#include <vector_types.h>
#include <cuda_runtime.h>
#include <cuda_gl_interop.h>

// CUDA utilities and system includes
#include <helper_cuda.h>
#include <helper_functions.h>
#include <vector_types.h>

typedef unsigned int uint;
typedef unsigned char uchar;

#define MAX_EPSILON_ERROR 5.0f
#define THRESHOLD 0.15f

const char *sSDKsample = "simpleTexture3D";

const char *volumeFilename = "Bucky.raw";
const cudaExtent volumeSize = make_cudaExtent(32, 32, 32);

const uint width = 512, height = 512;
const dim3 blockSize(16, 16, 1);
const dim3 gridSize(width / blockSize.x, height / blockSize.y);

float w = 0.5;  // texture coordinate in z

GLuint pbo;  // OpenGL pixel buffer object
struct cudaGraphicsResource
    *cuda_pbo_resource;  // CUDA Graphics Resource (to transfer PBO)

bool linearFiltering = true;
bool animate = true;

StopWatchInterface *timer = NULL;

uint *d_output = NULL;

// Auto-Verification Code
const int frameCheckNumber = 4;
int fpsCount = 0;  // FPS count for averaging
int fpsLimit = 1;  // FPS limit for sampling
int g_Index = 0;
unsigned int frameCount = 0;
unsigned int g_TotalErrors = 0;
volatile int g_GraphicsMapFlag = 0;

int *pArgc = NULL;
char **pArgv = NULL;

#ifndef MAX
#define MAX(a, b) ((a > b) ? a : b)
#endif

extern "C" void cleanup();
extern "C" void setTextureFilterMode(bool bLinearFilter);
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize);
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output,
                              uint imageW, uint imageH, float w);
extern void cleanupCuda();

void loadVolumeData(char *exec_path);

void computeFPS() {
  frameCount++;
  fpsCount++;

  if (fpsCount == fpsLimit) {
    char fps[256];
    float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
    sprintf(fps, "%s: %3.1f fps", sSDKsample, ifps);

    glutSetWindowTitle(fps);
    fpsCount = 0;

    fpsLimit = ftoi(MAX(1.0f, ifps));
    sdkResetTimer(&timer);
  }
}

// render image using CUDA
void render() {
  // map PBO to get CUDA device pointer
  g_GraphicsMapFlag++;
  checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
  size_t num_bytes;
  checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
      (void **)&d_output, &num_bytes, cuda_pbo_resource));
  // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes);

  // call CUDA kernel, writing results to PBO
  render_kernel(gridSize, blockSize, d_output, width, height, w);

  getLastCudaError("render_kernel failed");

  if (g_GraphicsMapFlag) {
    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
    g_GraphicsMapFlag--;
  }
}

// display results using OpenGL (called by GLUT)
void display() {
  sdkStartTimer(&timer);

  render();

  // display results
  glClear(GL_COLOR_BUFFER_BIT);

  // draw image from PBO
  glDisable(GL_DEPTH_TEST);
  glRasterPos2i(0, 0);
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
  glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

  glutSwapBuffers();
  glutReportErrors();

  sdkStopTimer(&timer);
  computeFPS();
}

void idle() {
  if (animate) {
    w += 0.01f;
    glutPostRedisplay();
  }
}

void keyboard(unsigned char key, int x, int y) {
  switch (key) {
    case 27:
#if defined(__APPLE__) || defined(MACOSX)
      exit(EXIT_SUCCESS);
      glutDestroyWindow(glutGetWindow());
      return;
#else
      glutDestroyWindow(glutGetWindow());
      return;
#endif

    case '=':
    case '+':
      w += 0.01f;
      break;

    case '-':
      w -= 0.01f;
      break;

    case 'f':
      linearFiltering = !linearFiltering;
      setTextureFilterMode(linearFiltering);
      break;

    case ' ':
      animate = !animate;
      break;

    default:
      break;
  }

  glutPostRedisplay();
}

void reshape(int x, int y) {
  glViewport(0, 0, x, y);

  glMatrixMode(GL_MODELVIEW);
  glLoadIdentity();

  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();
  glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
}

void cleanup() {
  sdkDeleteTimer(&timer);

  // add extra check to unmap the resource before unregistering it
  if (g_GraphicsMapFlag) {
    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
    g_GraphicsMapFlag--;
  }

  // unregister this buffer object from CUDA C
  checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource));
  glDeleteBuffers(1, &pbo);
  cleanupCuda();
}

void initGLBuffers() {
  // create pixel buffer object
  glGenBuffers(1, &pbo);
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
  glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4,
               0, GL_STREAM_DRAW_ARB);
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

  // register this buffer object with CUDA
  checkCudaErrors(cudaGraphicsGLRegisterBuffer(
      &cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard));
}

// Load raw data from disk
uchar *loadRawFile(const char *filename, size_t size) {
  FILE *fp = fopen(filename, "rb");

  if (!fp) {
    fprintf(stderr, "Error opening file '%s'\n", filename);
    return 0;
  }

  uchar *data = (uchar *)malloc(size);
  size_t read = fread(data, 1, size, fp);
  fclose(fp);

  printf("Read '%s', %zu bytes\n", filename, read);

  return data;
}

void initGL(int *argc, char **argv) {
  // initialize GLUT callback functions
  glutInit(argc, argv);
  glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
  glutInitWindowSize(width, height);
  glutCreateWindow("CUDA 3D texture");
  glutDisplayFunc(display);
  glutKeyboardFunc(keyboard);
  glutReshapeFunc(reshape);
  glutIdleFunc(idle);

  if (!isGLVersionSupported(2, 0) ||
      !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) {
    fprintf(stderr, "Required OpenGL extensions are missing.");
    exit(EXIT_FAILURE);
  }
}

void runAutoTest(const char *ref_file, char *exec_path) {
  checkCudaErrors(
      cudaMalloc((void **)&d_output, width * height * sizeof(GLubyte) * 4));

  // render the volumeData
  render_kernel(gridSize, blockSize, d_output, width, height, w);

  checkCudaErrors(cudaDeviceSynchronize());
  getLastCudaError("render_kernel failed");

  void *h_output = malloc(width * height * sizeof(GLubyte) * 4);
  checkCudaErrors(cudaMemcpy(h_output, d_output,
                             width * height * sizeof(GLubyte) * 4,
                             cudaMemcpyDeviceToHost));
  sdkDumpBin(h_output, width * height * sizeof(GLubyte) * 4,
             "simpleTexture3D.bin");

  bool bTestResult = sdkCompareBin2BinFloat(
      "simpleTexture3D.bin", sdkFindFilePath(ref_file, exec_path),
      width * height, MAX_EPSILON_ERROR, THRESHOLD, exec_path);

  checkCudaErrors(cudaFree(d_output));
  free(h_output);

  sdkStopTimer(&timer);
  sdkDeleteTimer(&timer);

  exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

void loadVolumeData(char *exec_path) {
  // load volume data
  const char *path = sdkFindFilePath(volumeFilename, exec_path);

  if (path == NULL) {
    fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n",
            volumeFilename);
    exit(EXIT_FAILURE);
  }

  size_t size = volumeSize.width * volumeSize.height * volumeSize.depth;
  uchar *h_volume = loadRawFile(path, size);

  initCuda(h_volume, volumeSize);
  sdkCreateTimer(&timer);

  free(h_volume);
}


// Program main

int main(int argc, char **argv) {
  pArgc = &argc;
  pArgv = argv;

  char *ref_file = NULL;

#if defined(__linux__)
  setenv("DISPLAY", ":0", 0);
#endif

  printf("%s Starting...\n\n", sSDKsample);

  if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
    fpsLimit = frameCheckNumber;
    getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
  }

  // use command-line specified CUDA device, otherwise use device with highest
  // Gflops/s
  findCudaDevice(argc, (const char **)argv);

  if (ref_file) {
    loadVolumeData(argv[0]);
    runAutoTest(ref_file, argv[0]);
  } else {
    initGL(&argc, argv);

    // OpenGL buffers
    initGLBuffers();

    loadVolumeData(argv[0]);
  }

  printf(
      "Press space to toggle animation\n"
      "Press '+' and '-' to change displayed slice\n");

#if defined(__APPLE__) || defined(MACOSX)
  atexit(cleanup);
#else
  glutCloseFunc(cleanup);
#endif

  glutMainLoop();

  exit(EXIT_SUCCESS);
}

/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _SIMPLETEXTURE3D_KERNEL_CU_
#define _SIMPLETEXTURE3D_KERNEL_CU_

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include <helper_cuda.h>
#include <helper_math.h>

typedef unsigned int uint;
typedef unsigned char uchar;

cudaArray *d_volumeArray = 0;
cudaTextureObject_t tex;  // 3D texture

__global__ void d_render(uint *d_output, uint imageW, uint imageH, float w,
                         cudaTextureObject_t texObj) {
  uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
  uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

  float u = x / (float)imageW;
  float v = y / (float)imageH;
  // read from 3D texture
  float voxel = tex3D<float>(texObj, u, v, w);

  if ((x < imageW) && (y < imageH)) {
    // write output color
    uint i = __umul24(y, imageW) + x;
    d_output[i] = voxel * 255;
  }
}

extern "C" void setTextureFilterMode(bool bLinearFilter) {
  if (tex) {
    checkCudaErrors(cudaDestroyTextureObject(tex));
  }
  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = d_volumeArray;

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));

  texDescr.normalizedCoords = true;
  texDescr.filterMode =
      bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint;
  ;
  texDescr.addressMode[0] = cudaAddressModeWrap;
  texDescr.addressMode[1] = cudaAddressModeWrap;
  texDescr.addressMode[2] = cudaAddressModeWrap;
  texDescr.readMode = cudaReadModeNormalizedFloat;

  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
}

extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize) {
  // create 3D array
  cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
  checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));

  // copy data to 3D array
  cudaMemcpy3DParms copyParams = {0};
  copyParams.srcPtr =
      make_cudaPitchedPtr((void *)h_volume, volumeSize.width * sizeof(uchar),
                          volumeSize.width, volumeSize.height);
  copyParams.dstArray = d_volumeArray;
  copyParams.extent = volumeSize;
  copyParams.kind = cudaMemcpyHostToDevice;
  checkCudaErrors(cudaMemcpy3D(&copyParams));

  cudaResourceDesc texRes;
  memset(&texRes, 0, sizeof(cudaResourceDesc));

  texRes.resType = cudaResourceTypeArray;
  texRes.res.array.array = d_volumeArray;

  cudaTextureDesc texDescr;
  memset(&texDescr, 0, sizeof(cudaTextureDesc));

  // access with normalized texture coordinates
  texDescr.normalizedCoords = true;
  // linear interpolation
  texDescr.filterMode = cudaFilterModeLinear;
  // wrap texture coordinates
  texDescr.addressMode[0] = cudaAddressModeWrap;
  texDescr.addressMode[1] = cudaAddressModeWrap;
  texDescr.addressMode[2] = cudaAddressModeWrap;
  texDescr.readMode = cudaReadModeNormalizedFloat;

  checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
}

extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output,
                              uint imageW, uint imageH, float w) {
  d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, w, tex);
}

void cleanupCuda() {
  if (tex) {
    checkCudaErrors(cudaDestroyTextureObject(tex));
  }
  if (d_volumeArray) {
    checkCudaErrors(cudaFreeArray(d_volumeArray));
  }
}

#endif  // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_

4 参考链接

GitHub - NVIDIA/cuda-samples: Samples for CUDA Developers which demonstrates features in CUDA Toolkit

CUDA routines are functions that can be executed on the GPU using many threads in parallel 1. There are many CUDA code samples included as part of the CUDA Toolkit to help you get started on writing CUDA C/C++ applications 2. You can also find some easy introductions to CUDA C and C++ on the NVIDIA Technical Blog 3 1.

GitHub - LitLeo/OpenCUDA

CUDA开发环境搭建 - 知乎 (zhihu.com)

(191条消息) VS+CUDA 新建项目里没有CUDA选项（附详细图文步骤）_cuda vs_Xav Zewen的博客-CSDN博客

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

CUDA编程

CUDA

CUDA编程一天入门的相关文章

CUDA 常量内存是否应该被均匀地访问？

我的 CUDA 应用程序的恒定内存小于 8KB 既然它都会被缓存我是否需要担心每个线程访问相同的地址以进行优化如果是如何确保所有线程同时访问同一地址既然它都会被缓存我是否需要担心每个线程访问相同的地址以进行优化是的这缓存本身每
Cuda 6.5 找不到 - libGLU。（在 ubuntu 14.04 64 位上）

我已经在我的ubuntu上安装了cuda 6 5 我的显卡是 GTX titan 当我想要制作 cuda 样本之一时模拟粒子我收到这条消息 gt gt gt WARNING libGLU so not found refer to C
从 CUDA 设备写入输出文件

我是 CUDA 编程的新手正在将 C 代码重写为并行 CUDA 新代码有没有一种方法可以直接从设备写入输出数据文件而无需将数组从设备复制到主机我假设如果cuPrintf存在一定有地方可以写一个cuFprintf 抱歉如果答案已经
CUDA、NPP 滤波器

CUDA NPP 库支持使用 nppiFilter 8u C1R 命令过滤图像但不断出现错误我可以毫无问题地启动并运行 boxFilterNPP 示例代码 eStatusNPP nppiFilterBox 8u C1R oDeviceS
cudaMemcpy() 与 cudaMemcpyFromSymbol()

我试图找出原因cudaMemcpyFromSymbol 存在似乎 symbol func 可以做的所有事情 nonSymbol cmd 也可以做 symbol func 似乎可以轻松移动数组或索引的一部分但这也可以使用 nonSymbo
将 nvidia 运行时添加到 docker 运行时

我正在运行虚拟机GCP配备特斯拉 GPU 并尝试部署一个PyTorch基于应用程序使用 GPU 加速我想让 docker 使用这个 GPU 可以从容器访问它我设法在主机上安装了所有驱动程序并且该应用程序在那里运行良好但是当我尝试在
大型跨平台软件项目的技巧/资源

我将开始一个大型软件项目涉及跨平台 GUI 和大量的数字运算我计划用 C 和 CUDA 编写大部分应用程序后端并用 Qt4 编写 GUI 我计划使用 Make 作为我的构建系统这将是一个只有两名开发人员的项目一旦我相对深入地了解它
cudaDeviceScheduleBlockingSync 和 cudaDeviceScheduleYield 之间有什么区别？

正如这里所说如何减少 CUDA 同步延迟延迟 https stackoverflow com questions 11953722 how to reduce cuda synchronize latency delay 等待设备结果有
CUDA Thrust 的多 GPU 使用

我想使用我的两张显卡通过 CUDA Thrust 进行计算我有两张显卡在单卡上运行对于两张卡都适用即使我在 std vector 中存储两个 device vector 也是如此如果我同时使用两张卡循环中的第一个周期将起作用并且不
CUDA 8 编译错误 -std=gnu++11

我正在尝试转换一些代码以使用 CUDA 并且我认为我遇到了兼容性问题我们使用CMake 这些是我使用的 gcc 和 CUDA 版本 gcc version gcc Ubuntu 5 4 0 6ubuntu1 16 04 5 5 4 0 2
NVCC 警告级别

我希望 NVCC 将以下警告视为错误 warning calling a host function foo from a host device function bar NVCC 文档 NVIDIA CUDA 编译器驱动程序 NVCC
GPU上动态分配内存

是否可以在内核内的 GPU 全局内存上动态分配内存我不知道我的答案有多大因此我需要一种方法为答案的每个部分分配内存 CUDA 4 0 允许我们使用 RAM 这是一个好主意还是会降低速度可以在内核中使用 malloc 检查以下内容摘自
使用 GPU 进行 Matlab 卷积

我用gpuArray尝试了matlab的卷积函数conv2 convn 例如 convn gpuArray rand 100 100 10 single gpuArray rand 5 single 并将其与 cpu 版本 convn ra
无法在 CUDA 中找到 1 到 100 数字的简单和？

我正在研究使用 CUDA 的图像处理算法在我的算法中我想使用 CUDA 内核找到图像所有像素的总和所以我在cuda中制作了内核方法来测量16位灰度图像的所有像素的总和但我得到了错误的答案所以我在cuda中编写了一个简单的程序来查
CUDA 和 Eigen 的成员“已声明”错误

我只是 CUDA 和 Nsight 的初学者希望利用出色的 GPU 性能进行线性代数运算例如 CUBLAS 我在以下人员的帮助下编写了很多自定义代码Eigen http eigen tuxfamily org index php tit
使用推力来处理 CUDA 类中的向量？

我对 C 类的推力的适用性有疑问我正在尝试实现一个类对象该对象接收顶点的 x y z 坐标作为 ver1 ver2 和 ver3 然后分配给一个三角形并计算面积和法向量然而我不太明白如何创建一类推力向量这是我从文件中读取的顶点坐
无法编译cuda_ndarray.cu：libcublas.so.7.5：无法打开共享对象文件

我正在尝试在 aws 实例中导入 theano 库以使用 GPU 我已经使用 boto 编写了一个 python 脚本来自动执行 aws 设置该脚本本质上会从我的本地计算机对实例执行 ssh 然后启动一个 bash 脚本其中我执行 py
CUDA 中的 JPEG 库

我正在尝试在 CUDA 中压缩和解压缩图像到目前为止我已经找到了这个库 http sourceforge net projects cuj2k source navbar http sourceforge net projects cuj
如何从尖点库矩阵格式获取原始指针

我需要从尖点库矩阵格式获取原始指针例如 cusp coo matrix
CUDA cutil.h 在哪里？

有谁知道包含 cutil h 的 SDK 工具包在哪里我尝试了 CUDA toolkits3 2 和 toolkits5 0 我知道这个版本已经不支持 cutil h 我还注意到一些提到的如何在 Linux 中包含 cutil h htt

随机推荐

计网实验A3：简单的web服务器

文章目录计网实验A3 简单的web服务器实验介绍相关背景介绍 Socket编程接口 HTTP传输协议实验功能要求总体设计详细设计数据结构设计函数分析调试设计运行结果实验总结困难与解决心得与思考计网实验A3 简单的
Andrew Ng机器学习算法入门((六):多变量线性回归方程求解

多变量线性回归之前讨论的都是单变量的情况例如房价与房屋面积之前的关系但是实际上房价除了房屋面积之外还要房间数楼层等因素相关那么此时就变成了一个多变量线性回归的问题在实际问题中多变量的线性回归问题是更加常见的下面这个例子就
Tomcat 相关配置参数说明，性能调优

Tomcat 相关配置参数说明 1 server xml connect中相关参数说明
爬虫简单爬取网页图片

仅供学习请遵守法律法规和robots协议请在爬取时设置爬取延时防止给网站造成不必要的麻烦和损失也避免给自己送进去爬取图片一般需要导入的库有 import requests import re 正则表达式 import os os用
多线程提高spark streaming数据写入到数据库

多线程提高spark streaming数据写入到数据库需求集群环境资源有限需要跑多个spark streaming任务每个任务必须占据1核 cpu利用率很低需要对数据进行实时统计更新到数据库mysql给业务实时展示数据聚合程度
java-打印项目相对路径的根目录

IDEA里 System out println System getProperty user dir
Java中栈Stack的bug(继承) 以及自己实现一个栈支持范型动态扩展

问题解决一封装Stack 解决二自己实现 Array java ArrayStack java 问题 import java util Stack public class Main public static void main S
【论文笔记】对比学习综述

跟李沐学AI的b站视频视频论文精读笔记第五期 https www bilibili com s video BV19S4y1M7hm 最后有总结请添加图片描述
Java 中通过 key 获取锁的正确方式

一概览本文我们将了解如何通过特定键获取锁以保证该键上的操作的线程安全并且不妨碍其他键一般来说我们需要实现两个方法 void lock String key void unlock String key 本文以字符串作为键为例大
hive报错FAILED: SemanticException org.apache.hadoop.hive.ql.metadata.HiveException

问题描述创建表时没有问题简单插入数据出现这个报错 hive报错FAILED SemanticException org apache hadoop hive ql metadata HiveException Failed to ex
ffmpeg windows编译及调试完整版

目录编译基础环境准备依赖环境安装依赖库安装 X264 fdk aac X265 ffmpeg 4 3 6 调试基础项目环境搭建 VS2019项目创建 VS2019项目代码 vs2019配置 VS2019调试编译基础环境准备 1
多行字符串无法匹配问题处理

关于多行字符串无法匹配的问题处理当前问题需要加上下面的东西 Pattern MULTILINE Pattern DOTALL 这样才能处理多行问题防止拿不到匹配符中的效果这里简单的处理方式入如下所示 Pattern patternC
《MySQL实战45讲》读后感 21

在上一篇文章中我和你介绍了间隙锁和 next key lock 的概念但是并没有说明加锁规则间隙锁的概念理解起来确实有点儿难尤其在配合上行锁以后很容易在判断是否会出现锁等待的问题上犯错所以今天我们就先从这个加锁规则开始吧首先
aspose java api_NetBeans中如何下载并使用Aspose Java API和示例

为了提升文件格式应用程序界面 API 的用户体验 Java NetBeans是Java开发人员在管理Java项目开发组件或应用程序时使用最多的集成开发环境之一为了让他们在NetBeans项目中通过一个简单的点击就能下载和使用多个Aspo
NITIRE 2023官方的PSNR及SSIM计算代码

NITIRE 2023官方的PSNR及SSIM计算代码问题描述做图像复原任务时总避免不了计算PSNR和SSIM等图像质量评估指标但是网上实在是太多计算这类指标的代码了不同代码计算的结果还可能存在差异有使用matlab计算SSIM
提高C++性能的编程技术笔记：多线程内存池+测试代码

为了使多个线程并发地分配和释放内存必须在分配器方法中添加互斥锁全局内存管理器通过new 和delete 实现是通用的因此它的开销也非常大因为单线程内存管理器要比多线程内存管理器快的多所以如果要分配的大多数内存块限于单线程中使用
MySQL多表操作：建表与多表查询

目录多表间的建表原则一对多多对多一对一实例演示多表查询数据准备多表查询笛卡尔积内连接隐式内连接显式内连接外连接左外连接右外连接子查询子查询的多种情况多表间的建表原则一对多如分类与商品是一对多的关系一个
花了 44 大洋，我实现 Java 发送短信功能！

本篇为从 0 到 1 的一个用 Java 对接腾讯短信发送功能的一篇流程文章而标题中的 44 大洋则是用来购买短信套餐当然如果你是首次使用腾讯云平台则可以享受免费使用这一功能我是老用户那废话不多说我们往下看对接步骤 1 短
java设计模式之代理模式

什么是代理模式为什么要使用代理模式代理模式属于结构型通过目标对象类完成其实现静态代理静态代理要求目标对象和代理对象实现同一个业务接口由目标类去创建和实现代理类负责实现增强静态代理的缺点代理类和目标类实现了相同的接口每个代
CUDA编程一天入门

目录 0 环境准备 1 套路 2 并行执行内核设置 3 示例代码simpleTexture3D 4 参考链接 0 环境准备 1 套路 CUDA 编程模型是一个异构模型其中同时使用 CPU 和 GPU 在 CUDA 中主机是指 CPU 及

CUDA编程一天入门

0 环境准备

1 套路

2 并行执行内核设置

3 示例代码simpleTexture3D

4 参考链接

CUDA编程一天入门 的相关文章

随机推荐

热门标签

CUDA编程一天入门的相关文章