基于nvJpeg编解码图片

2023-10-27

NvJpeg介绍

nvJPEG库为深度学习和超大规模多媒体应用程序中常用的图像格式提供了高性能、GPU加速的JPEG解码功能。该库提供单张和批量JPEG解码功能，可有效利用可用的GPU资源以实现最佳性能，并允许用户灵活地管理解码所需的内存分配。

NvJpeg编码

在这里插入图片描述

#include "nvjpeg_coder.h"

#include "logger.h"
#include <cstdint>
#include <string>


/**
 * @brief Construct a coder for images of a fixed size.
 *
 * Only the dimensions are recorded here; the constructor body is empty and
 * acquires no CUDA or nvJPEG resources.
 *
 * @param width  : image width, stored in image_width_
 * @param height : image height, stored in image_height_
 */
NvJpegCoder::NvJpegCoder(size_t width, size_t height)
    : image_width_(width), image_height_(height)
{
}

/**
 * @brief Destructor with an empty body: nothing is released here.
 */
NvJpegCoder::~NvJpegCoder() {}

/**
 * @brief Device allocation callback; used as the first member of the
 *        nvjpegDevAllocator_t built in NvJpegCoder::initialize().
 *
 * @param p : receives the allocated device pointer
 * @param s : number of bytes to allocate
 * @return int : the cudaMalloc status converted to int
 */
int device_malloc(void **p, size_t s)
{
    const auto status = cudaMalloc(p, s);
    return static_cast<int>(status);
}

/**
 * @brief Device deallocation callback; used as the second member of the
 *        nvjpegDevAllocator_t built in NvJpegCoder::initialize().
 *
 * @param p : device pointer to release
 * @return int : the cudaFree status converted to int
 */
int device_free(void *p)
{
    const auto status = cudaFree(p);
    return static_cast<int>(status);
}

/**
 * @brief Allocate the device staging buffer and create/configure the nvJPEG
 *        encoder objects.
 *
 * The encoder is configured for progressive-DCT Huffman encoding with
 * optimized Huffman tables, quality 100 and 4:4:4 chroma sampling.
 *
 * @param format : input pixel layout; only RGBI is accepted
 * @return int   :
 *           0: success
 *          -1: unsupported pixel format
 *          CUDA / nvJPEG failures are reported through CHECK_CUDA /
 *          CHECK_NVJPEG (defined outside this file section).
 */
int NvJpegCoder::initialize(const input_pixel_format format)
{
    input_pixfmt_ = format;

    // Validate the format before acquiring anything, so that the failure
    // path does not leave timing events or device memory behind.
    if (format != RGBI)
    {
        LOG_ERROR("invalid input pixel forat:%d\n", format);
        return -1;
    }

    // cuda event create (only needed when timing is compiled in)
#ifdef ELAPSEDTIME
    CHECK_CUDA(cudaEventCreate(&start_event_));
    CHECK_CUDA(cudaEventCreate(&stop_event_));
#endif

    // cuda buffer malloc: interleaved RGB needs 3 bytes per pixel
    cuda_ebcoder_buffer_size_ = image_width_ * image_height_ * 3;
    CHECK_CUDA(cudaMalloc((void**)&cuda_encoder_buffer_ptr_, cuda_ebcoder_buffer_size_));

    // nvjpeg create and initialize
    nvjpegDevAllocator_t dev_allocator = {&device_malloc, &device_free};
    CHECK_NVJPEG(nvjpegCreate(NVJPEG_BACKEND_DEFAULT, &dev_allocator, &nvjpeg_handle_));
    CHECK_NVJPEG(nvjpegJpegStateCreate(nvjpeg_handle_, &jpeg_state_));
    CHECK_NVJPEG(nvjpegEncoderStateCreate(nvjpeg_handle_, &encoder_state_, NULL));
    CHECK_NVJPEG(nvjpegEncoderParamsCreate(nvjpeg_handle_, &encoder_params_, NULL));

    // config input parameters; every setter returns a status, so check them
    // instead of silently continuing with a half-configured encoder
    CHECK_NVJPEG(nvjpegEncoderParamsSetEncoding(encoder_params_,
        nvjpegJpegEncoding_t::NVJPEG_ENCODING_PROGRESSIVE_DCT_HUFFMAN, NULL));
    CHECK_NVJPEG(nvjpegEncoderParamsSetOptimizedHuffman(encoder_params_, 1, NULL));
    CHECK_NVJPEG(nvjpegEncoderParamsSetQuality(encoder_params_, 100, NULL));
    CHECK_NVJPEG(nvjpegEncoderParamsSetSamplingFactors(encoder_params_,
        nvjpegChromaSubsampling_t::NVJPEG_CSS_444, NULL));

    return 0;
}

/**
 * @brief encode raw image to jpeg format
 * 
 * @param input        : raw image data
 * @param input_len    : raw image data len
 * @param width        : raw image width
 * @param height       : raw image height
 * @param output       : jpeg data(notice, this point must free after no used)
 * @param output_len   : jpeg data len
 * @return int         : 
            0: success
            other: failed
 */
int NvJpegCoder::encode(uint8_t *input, size_t input_len, size_t width, 
    size_t height, uint8_t **output, size_t &output_len)
{
    nvjpegImage_t imgdesc;
    nvjpegInputFormat_t input_format;
#ifdef ELAPSEDTIME
    CHECK_CUDA(cudaEventRecord(start_event_));
#endif
    // copy host data buffer to deivce data buffer
    CHECK_CUDA(cudaMemcpy(cuda_encoder_buffer_ptr_, input, input_len, cudaMemcpyHostToDevice));

    if (input_pixfmt_ == RGBI)
    {
        imgdesc = 
        {
            {
                cuda_encoder_buffer_ptr_
            },
            {
                (unsigned int)width*3
            }
        };

        input_format = NVJPEG_INPUT_RGBI;

        CHECK_NVJPEG(nvjpegEncodeImage(nvjpeg_handle_,
                    encoder_state_,
                    encoder_params_,
                    &imgdesc,
                    input_format,
                    width,
                    height,
                    NULL));
    } else {
        LOG_ERROR("invalid input pixel format:%d", input_pixfmt_);
        return -1;
    }

    // std::vector<unsigned char> obuffer;
    size_t length = 0;
    CHECK_NVJPEG(nvjpegEncodeRetrieveBitstream(
                nvjpeg_handle_,
                encoder_state_,
                NULL,
                &length,
                NULL));

    // obuffer.resize(length);
    *output = new uint8_t[length];
    output_len = length;
    CHECK_NVJPEG(nvjpegEncodeRetrieveBitstream(
                nvjpeg_handle_,
                encoder_state_,
                *output,
                &length,
                NULL));
    // static int count = 0;
    // std::cout << "Writing JPEG file: sample.jpg"  << std::endl;
    // std::ofstream outputFile("sample" + std::to_string(count++) +  ".jpg", std::ios::out | std::ios::binary);
    // outputFile.write(reinterpret_cast<const char *>(obuffer.data()), static_cast<int>(length));

#ifdef ELAPSEDTIME
    float cost_time_ms = 0.0;
    CHECK_CUDA(cudaEventRecord(stop_event_));
    CHECK_CUDA(cudaEventSynchronize(stop_event_));
    /*CHECK_CUDA*/(cudaEventElapsedTime(&cost_time_ms, start_event_, stop_event_));
    LOG_INFO("encode image cost time:%f ms", cost_time_ms);
#endif

    return 0;
}

/**
 * @brief Release the device buffer, the timing events and the nvJPEG objects
 *        created by initialize().
 *
 * @return int : 0 on success; nvJPEG failures are reported through
 *               CHECK_NVJPEG (defined outside this file section).
 */
int NvJpegCoder::deinitialize()
{
    cudaFree(cuda_encoder_buffer_ptr_);
    // Drop the stale pointer so a later cudaFree on it is a harmless no-op.
    cuda_encoder_buffer_ptr_ = nullptr;

    // The events are only created when ELAPSEDTIME is defined; destroying
    // handles that were never created is invalid.
#ifdef ELAPSEDTIME
    cudaEventDestroy(start_event_);
    cudaEventDestroy(stop_event_);
#endif

    CHECK_NVJPEG(nvjpegEncoderParamsDestroy(encoder_params_));
    CHECK_NVJPEG(nvjpegEncoderStateDestroy(encoder_state_));
    CHECK_NVJPEG(nvjpegJpegStateDestroy(jpeg_state_));
    CHECK_NVJPEG(nvjpegDestroy(nvjpeg_handle_));

    // The function is declared to return int; falling off the end was
    // undefined behaviour.
    return 0;
}

NvJpeg解码


int decode_images(const FileData &img_data, const std::vector<size_t> &img_len,
                  std::vector<nvjpegImage_t> &out, decode_params_t &params,
                  double &time) {
  CHECK_CUDA(cudaStreamSynchronize(params.stream));
  cudaEvent_t startEvent = NULL, stopEvent = NULL;
  float loopTime = 0; 
  
  CHECK_CUDA(cudaEventCreate(&startEvent, cudaEventBlockingSync));
  CHECK_CUDA(cudaEventCreate(&stopEvent, cudaEventBlockingSync));


  std::vector<const unsigned char*> batched_bitstreams;
  std::vector<size_t> batched_bitstreams_size;
  std::vector<nvjpegImage_t>  batched_output;

  // bit-streams that batched decode cannot handle
  std::vector<const unsigned char*> otherdecode_bitstreams;
  std::vector<size_t> otherdecode_bitstreams_size;
  std::vector<nvjpegImage_t> otherdecode_output;

  if(params.hw_decode_available){
    for(int i = 0; i < params.batch_size; i++){
      // extract bitstream meta data to figure out whether a bit-stream can be decoded
      nvjpegJpegStreamParseHeader(params.nvjpeg_handle, (const unsigned char *)img_data[i].data(), img_len[i], params.jpeg_streams[0]);
      int isSupported = -1;
      nvjpegDecodeBatchedSupported(params.nvjpeg_handle, params.jpeg_streams[0], &isSupported);

      if(isSupported == 0){
        batched_bitstreams.push_back((const unsigned char *)img_data[i].data());
        batched_bitstreams_size.push_back(img_len[i]);
        batched_output.push_back(out[i]);
      } else {
        otherdecode_bitstreams.push_back((const unsigned char *)img_data[i].data());
        otherdecode_bitstreams_size.push_back(img_len[i]);
        otherdecode_output.push_back(out[i]);
      }
    }
  } else {
    for(int i = 0; i < params.batch_size; i++) {
      otherdecode_bitstreams.push_back((const unsigned char *)img_data[i].data());
      otherdecode_bitstreams_size.push_back(img_len[i]);
      otherdecode_output.push_back(out[i]);
    }
  }

  CHECK_CUDA(cudaEventRecord(startEvent, params.stream));

    if(batched_bitstreams.size() > 0)
     {
          CHECK_NVJPEG(
               nvjpegDecodeBatchedInitialize(params.nvjpeg_handle, params.nvjpeg_state,
                                            batched_bitstreams.size(), 1, params.fmt));

         CHECK_NVJPEG(nvjpegDecodeBatched(
             params.nvjpeg_handle, params.nvjpeg_state, batched_bitstreams.data(),
             batched_bitstreams_size.data(), batched_output.data(), params.stream));
     }

    if(otherdecode_bitstreams.size() > 0)
    {
          CHECK_NVJPEG(nvjpegStateAttachDeviceBuffer(params.nvjpeg_decoupled_state, params.device_buffer));
          int buffer_index = 0;
          CHECK_NVJPEG(nvjpegDecodeParamsSetOutputFormat(params.nvjpeg_decode_params, params.fmt));
          for (int i = 0; i < params.batch_size; i++) {
              CHECK_NVJPEG(
                  nvjpegJpegStreamParse(params.nvjpeg_handle, otherdecode_bitstreams[i], otherdecode_bitstreams_size[i],
                  0, 0, params.jpeg_streams[buffer_index]));

              CHECK_NVJPEG(nvjpegStateAttachPinnedBuffer(params.nvjpeg_decoupled_state,
                  params.pinned_buffers[buffer_index]));

              CHECK_NVJPEG(nvjpegDecodeJpegHost(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
                  params.nvjpeg_decode_params, params.jpeg_streams[buffer_index]));

              CHECK_CUDA(cudaStreamSynchronize(params.stream));

              CHECK_NVJPEG(nvjpegDecodeJpegTransferToDevice(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
                  params.jpeg_streams[buffer_index], params.stream));

              buffer_index = 1 - buffer_index; // switch pinned buffer in pipeline mode to avoid an extra sync

              CHECK_NVJPEG(nvjpegDecodeJpegDevice(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
                  &otherdecode_output[i], params.stream));

          }
    }
  CHECK_CUDA(cudaEventRecord(stopEvent, params.stream));

  CHECK_CUDA(cudaEventSynchronize(stopEvent));
  CHECK_CUDA(cudaEventElapsedTime(&loopTime, startEvent, stopEvent));
  time = 0.001 * static_cast<double>(loopTime); // cudaEventElapsedTime returns milliseconds

  return EXIT_SUCCESS;
}

// Write every decoded image of the current batch as a BMP file named
// "<params.output_dir>/<input base name>.bmp".
//
//   iout      : decoded images; planar formats use channel/pitch 0..2,
//               interleaved formats use channel/pitch 0 only
//   widths    : width of each image
//   heights   : height of each image
//   params    : supplies batch_size, output_dir and the output format
//   filenames : input file names for the batch, used to derive output names
//
// Returns EXIT_SUCCESS when all files were written, EXIT_FAILURE on the first
// image that could not be written (including an output format that has no BMP
// writer here).
int write_images(std::vector<nvjpegImage_t> &iout, std::vector<int> &widths,
                 std::vector<int> &heights, decode_params_t &params,
                 FileNames &filenames) {
  for (int i = 0; i < params.batch_size; i++) {
    // Get the file name, without extension.
    // This will be used to rename the output file.
    size_t position = filenames[i].rfind("/");
    std::string sFileName =
        (std::string::npos == position)
            ? filenames[i]
            : filenames[i].substr(position + 1, filenames[i].size());
    position = sFileName.rfind(".");
    sFileName = (std::string::npos == position) ? sFileName
                                                : sFileName.substr(0, position);
    std::string fname(params.output_dir + "/" + sFileName + ".bmp");

    // Start from "failed": if params.fmt matches neither branch below, no
    // file is written and err must not be read uninitialized.
    int err = EXIT_FAILURE;
    if (params.fmt == NVJPEG_OUTPUT_RGB || params.fmt == NVJPEG_OUTPUT_BGR) {
      // Write BMP from three separate planes
      err = writeBMP(fname.c_str(), iout[i].channel[0], iout[i].pitch[0],
                     iout[i].channel[1], iout[i].pitch[1], iout[i].channel[2],
                     iout[i].pitch[2], widths[i], heights[i]);
    } else if (params.fmt == NVJPEG_OUTPUT_RGBI ||
               params.fmt == NVJPEG_OUTPUT_BGRI) {
      // Write BMP from interleaved data
      err = writeBMPi(fname.c_str(), iout[i].channel[0], iout[i].pitch[0],
                      widths[i], heights[i]);
    }
    if (err) {
      std::cout << "Cannot write output file: " << fname << std::endl;
      return EXIT_FAILURE;
    }
    std::cout << "Done writing decoded image to file: " << fname << std::endl;
  }
  return EXIT_SUCCESS;
}

// Decode params.total_images images in batches of params.batch_size and
// accumulate the measured decode time.
//
// Each iteration reads the next batch, prepares the output buffers, decodes,
// and optionally writes the results. The first params.warmup batches are
// decoded but neither counted nor timed.
//
//   image_names : list of input files to cycle through
//   params      : decode configuration; params.stream is created on entry and
//                 destroyed before the normal return
//   total       : receives the accumulated decode time of the counted batches
//
// Returns EXIT_SUCCESS, or EXIT_FAILURE as soon as reading, buffer
// preparation or decoding of a batch fails.
double process_images(FileNames &image_names, decode_params_t &params,
                      double &total) {
  // per-batch storage: raw file contents, their lengths, names and dimensions
  FileData batch_data(params.batch_size);
  std::vector<size_t> batch_lengths(params.batch_size);
  FileNames batch_names(params.batch_size);
  std::vector<int> batch_widths(params.batch_size);
  std::vector<int> batch_heights(params.batch_size);

  // cursor into image_names, handed to read_next_batch on every iteration
  FileNames::iterator next_file = image_names.begin();

  // stream for decoding
  CHECK_CUDA(
      cudaStreamCreateWithFlags(&params.stream, cudaStreamNonBlocking));

  // output buffers and, for convenience, their sizes
  std::vector<nvjpegImage_t> decoded(params.batch_size);
  std::vector<nvjpegImage_t> decoded_sizes(params.batch_size);

  for (size_t img = 0; img < decoded.size(); ++img) {
    for (int comp = 0; comp < NVJPEG_MAX_COMPONENT; ++comp) {
      decoded[img].channel[comp] = NULL;
      decoded[img].pitch[comp] = 0;
      decoded_sizes[img].pitch[comp] = 0;
    }
  }

  int images_done = 0;
  int warmup_done = 0;
  double measured_time = 0;

  while (images_done < params.total_images) {
    if (read_next_batch(image_names, params.batch_size, next_file, batch_data,
                        batch_lengths, batch_names)) {
      return EXIT_FAILURE;
    }

    if (prepare_buffers(batch_data, batch_lengths, batch_widths, batch_heights,
                        decoded, decoded_sizes, batch_names, params)) {
      return EXIT_FAILURE;
    }

    double batch_time;
    if (decode_images(batch_data, batch_lengths, decoded, params, batch_time)) {
      return EXIT_FAILURE;
    }

    if (warmup_done >= params.warmup) {
      // past the warm-up phase: this batch counts towards the result
      images_done += params.batch_size;
      measured_time += batch_time;
    } else {
      ++warmup_done;
    }

    if (params.write_decoded) {
      write_images(decoded, batch_widths, batch_heights, params, batch_names);
    }
  }
  total = measured_time;

  release_buffers(decoded);

  CHECK_CUDA(cudaStreamDestroy(params.stream));

  return EXIT_SUCCESS;
}

// Command-line entry point of the decoder sample.
//
// Parses the options listed in the usage text, creates the nvJPEG handle
// (trying the hardware backend first and falling back to the default backend
// on NVJPEG_STATUS_ARCH_MISMATCH), decodes the requested number of images via
// process_images() and prints timing statistics.
//
// Returns EXIT_SUCCESS on success or after printing the help text, and
// EXIT_FAILURE on invalid options or when processing fails.
int main(int argc, const char *argv[]) {
  int pidx;

  if ((pidx = findParamIndex(argv, argc, "-h")) != -1 ||
      (pidx = findParamIndex(argv, argc, "--help")) != -1) {
    std::cout << "Usage: " << argv[0]
              << " -i images_dir [-b batch_size] [-t total_images] "
                 "[-w warmup_iterations] [-o output_dir] "
                 "[-pipelined] [-batched] [-fmt output_format]\n";
    std::cout << "Parameters: " << std::endl;
    std::cout << "\timages_dir\t:\tPath to single image or directory of images"
              << std::endl;
    std::cout << "\tbatch_size\t:\tDecode images from input by batches of "
                 "specified size"
              << std::endl;
    std::cout << "\ttotal_images\t:\tDecode this much images, if there are "
                 "less images \n"
              << "\t\t\t\t\tin the input than total images, decoder will loop "
                 "over the input"
              << std::endl;
    std::cout << "\twarmup_iterations\t:\tRun this amount of batches first "
                 "without measuring performance"
              << std::endl;
    std::cout
        << "\toutput_dir\t:\tWrite decoded images as BMPs to this directory"
        << std::endl;
    std::cout << "\tpipelined\t:\tUse decoding in phases" << std::endl;
    std::cout << "\tbatched\t\t:\tUse batched interface" << std::endl;
    std::cout << "\toutput_format\t:\tnvJPEG output format for decoding. One "
                 "of [rgb, rgbi, bgr, bgri, yuv, y, unchanged]"
              << std::endl;
    return EXIT_SUCCESS;
  }

  decode_params_t params;

  params.input_dir = "./";
  if ((pidx = findParamIndex(argv, argc, "-i")) != -1) {
    params.input_dir = argv[pidx + 1];
  } else {
    // Search in default paths for input images.
    int found = getInputDir(params.input_dir, argv[0]);
    if (!found)
    {
      std::cout << "Please specify input directory with encoded images"<< std::endl;
      return EXIT_FAILURE;
    }
  }

  params.batch_size = 1;
  if ((pidx = findParamIndex(argv, argc, "-b")) != -1) {
    params.batch_size = std::atoi(argv[pidx + 1]);
  }
  // std::atoi yields 0 for non-numeric input. A batch size of 0 would divide
  // by zero in the total_images adjustment and the per-batch average below,
  // and the processing loop would never advance.
  if (params.batch_size <= 0) {
    std::cout << "batch_size must be a positive integer" << std::endl;
    return EXIT_FAILURE;
  }

  // -1 means "decode every image found in the input directory once"
  params.total_images = -1;
  if ((pidx = findParamIndex(argv, argc, "-t")) != -1) {
    params.total_images = std::atoi(argv[pidx + 1]);
  }

  params.warmup = 0;
  if ((pidx = findParamIndex(argv, argc, "-w")) != -1) {
    params.warmup = std::atoi(argv[pidx + 1]);
  }

  params.fmt = NVJPEG_OUTPUT_RGB;
  if ((pidx = findParamIndex(argv, argc, "-fmt")) != -1) {
    std::string sfmt = argv[pidx + 1];
    if (sfmt == "rgb")
      params.fmt = NVJPEG_OUTPUT_RGB;
    else if (sfmt == "bgr")
      params.fmt = NVJPEG_OUTPUT_BGR;
    else if (sfmt == "rgbi")
      params.fmt = NVJPEG_OUTPUT_RGBI;
    else if (sfmt == "bgri")
      params.fmt = NVJPEG_OUTPUT_BGRI;
    else if (sfmt == "yuv")
      params.fmt = NVJPEG_OUTPUT_YUV;
    else if (sfmt == "y")
      params.fmt = NVJPEG_OUTPUT_Y;
    else if (sfmt == "unchanged")
      params.fmt = NVJPEG_OUTPUT_UNCHANGED;
    else {
      std::cout << "Unknown format: " << sfmt << std::endl;
      return EXIT_FAILURE;
    }
  }

  params.write_decoded = false;
  if ((pidx = findParamIndex(argv, argc, "-o")) != -1) {
    params.output_dir = argv[pidx + 1];
    // Only the RGB/BGR planar and interleaved formats have a BMP writer.
    if (params.fmt != NVJPEG_OUTPUT_RGB && params.fmt != NVJPEG_OUTPUT_BGR &&
        params.fmt != NVJPEG_OUTPUT_RGBI && params.fmt != NVJPEG_OUTPUT_BGRI) {
      std::cout << "We can write ony BMPs, which require output format be "
                   "either RGB/BGR or RGBi/BGRi"
                << std::endl;
      return EXIT_FAILURE;
    }
    params.write_decoded = true;
  }

  nvjpegDevAllocator_t dev_allocator = {&dev_malloc, &dev_free};
  nvjpegPinnedAllocator_t pinned_allocator ={&host_malloc, &host_free};

  // Try the hardware backend first; fall back to the default backend when the
  // GPU architecture does not provide it.
  nvjpegStatus_t status = nvjpegCreateEx(NVJPEG_BACKEND_HARDWARE, &dev_allocator,
                                &pinned_allocator,NVJPEG_FLAGS_DEFAULT,  &params.nvjpeg_handle);
  params.hw_decode_available = true;
  if( status == NVJPEG_STATUS_ARCH_MISMATCH) {
    std::cout<<"Hardware Decoder not supported. Falling back to default backend"<<std::endl;
    CHECK_NVJPEG(nvjpegCreateEx(NVJPEG_BACKEND_DEFAULT, &dev_allocator,
                              &pinned_allocator, NVJPEG_FLAGS_DEFAULT, &params.nvjpeg_handle));
    params.hw_decode_available = false;
  } else {
    CHECK_NVJPEG(status);
  }

  CHECK_NVJPEG(
      nvjpegJpegStateCreate(params.nvjpeg_handle, &params.nvjpeg_state));

  create_decoupled_api_handles(params);

  // read source images
  FileNames image_names;
  readInput(params.input_dir, image_names);

  if (params.total_images == -1) {
    params.total_images = image_names.size();
  } else if (params.total_images % params.batch_size) {
    // round down so that only whole batches are decoded
    params.total_images =
        ((params.total_images) / params.batch_size) * params.batch_size;
    std::cout << "Changing total_images number to " << params.total_images
              << " to be multiple of batch_size - " << params.batch_size
              << std::endl;
  }

  std::cout << "Decoding images in directory: " << params.input_dir
            << ", total " << params.total_images << ", batchsize "
            << params.batch_size << std::endl;

  double total;
  if (process_images(image_names, params, total)) return EXIT_FAILURE;
  std::cout << "Total decoding time: " << total << " (s)" << std::endl;
  std::cout << "Avg decoding time per image: " << total / params.total_images 
            << " (s)" << std::endl;
  std::cout << "Avg images per sec: " << params.total_images / total
            << std::endl;
  std::cout << "Avg decoding time per batch: "
            << total / ((params.total_images + params.batch_size - 1) /
                        params.batch_size) 
            << " (s)" << std::endl;

  destroy_decoupled_api_handles(params);

  CHECK_NVJPEG(nvjpegJpegStateDestroy(params.nvjpeg_state));
  CHECK_NVJPEG(nvjpegDestroy(params.nvjpeg_handle));

  return EXIT_SUCCESS;
}

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

CUDA

人工智能

深度学习

计算机视觉

基于nvJpeg编解码图片的相关文章

在 __device/global__ CUDA 内核中动态分配内存

根据CUDA 编程指南 http developer download nvidia com compute cuda 3 2 prod toolkit docs CUDA C Programming Guide pdf 第 122 页可
CUDA 矩阵加法时序，按行与按行比较按栏目

我目前正在学习 CUDA 并正在做一些练习其中之一是实现以 3 种不同方式添加矩阵的内核每个元素 1 个线程每行 1 个线程和每列 1 个线程矩阵是方阵并被实现为一维向量我只需用以下命令对其进行索引 A N row col 直觉
在 cudaFree() 之前需要 cudaDeviceSynchronize() 吗？

CUDA 版本 10 1 帕斯卡 GPU 所有命令都发送到默认流 void ptr cudaMalloc ptr launch kernel lt lt lt gt gt gt ptr cudaDeviceSynchronize Is th
CUDA - 将 CPU 变量传输到 GPU __constant__ 变量

与 CUDA 的任何事情一样最基本的事情有时也是最难的所以我只想将变量从 CPU 复制到 GPUconstant变量我很难过这就是我所拥有的 constant int contadorlinhasx d int main int
最小化 MC 模拟期间存储的 cuRAND 状态数量

我目前正在 CUDA 中编写蒙特卡罗模拟因此我需要生成lots使用随机数cuRAND图书馆每个线程处理一个巨大的元素floatarray 示例中省略并在每次内核调用时生成 1 或 2 个随机数通常的方法参见下面的示例似乎是为每
具有 Cuda Thrust 的多个 GPU？

如何将 Thrust 与多个 GPU 一起使用这只是使用 cudaSetDevice deviceId 的问题吗然后运行相关的 Thrust 代码使用 CUDA 4 0 或更高版本 cudaSetDevice deviceId 接下来
PyInstaller 是否包含 CUDA

我正在开发一个Python脚本我使用Python 3 7 3 它使用tensorflow gpu 1 14 0 并使用PyInstaller 3 5将此脚本转换为可执行文件我使用的是 CUDA 10 0 和 cuDNN 7 6 1 我的
CUDA NSight 未随 Windows 8 上的 CUDA 5.0 安装文件一起安装？ [关闭]

Closed 这个问题是无关 help closed questions 目前不接受答案据我所知 Nvidia 网站上没有 Nsight Eclipse 的下载链接它说它将由 CUDA 5 安装本机安装但并没有随CUDA安装一起安装
如何在CUDA应用程序中正确应用线程同步？

一般来说我在应用程序中偶尔会使用线程同步因为我并不经常需要此功能我并不是真正的高级 C C 程序员但我也不是初学者我开始学习 CUDA C 对当今 GPU 与 CPU 的能力相比感到兴奋我意识到 CUDA 编程主要是关于并行线程
CUDA 添加矩阵的行

我试图将 4800x9600 矩阵的行加在一起得到一个 1x9600 的矩阵我所做的是将 4800x9600 分成 9 600 个矩阵每个矩阵长度为 4800 然后我对 4800 个元素进行缩减问题是这真的很慢有人有什么建议吗
CUDA 模型 - 什么是扭曲尺寸？

最大工作组大小和扭曲大小之间有什么关系假设我的设备有 240 个 CUDA 流处理器 SP 并返回以下信息 CL DEVICE MAX COMPUTE UNITS 30 CL DEVICE MAX WORK ITEM SIZES 512
GPU上动态分配内存

是否可以在内核内的 GPU 全局内存上动态分配内存我不知道我的答案有多大因此我需要一种方法为答案的每个部分分配内存 CUDA 4 0 允许我们使用 RAM 这是一个好主意还是会降低速度可以在内核中使用 malloc 检查以下内容摘自
使用 GPU 进行 Matlab 卷积

我用gpuArray尝试了matlab的卷积函数conv2 convn 例如 convn gpuArray rand 100 100 10 single gpuArray rand 5 single 并将其与 cpu 版本 convn ra
我们如何在每次运行时使用不同的种子在 CUDA C 中生成随机数？

我正在研究一个随机过程我想在每次运行程序时在 CUDA 内核中生成不同的系列随机数这类似于我们在 C 中声明种子时间空接下来是 srand 种子和兰特我可以通过内核将种子从主机传递到设备但是这样做的问题是我必须将整个种子
直接在主机上访问设备向量元素的最快方法

我请您参考以下页面http code google com p thrust wiki QuickStartGuide Vectors http code google com p thrust wiki QuickStartGuide V
cudaMalloc使用向量>进行管理 > C++ - NVIDIA CUDA

我正在通过 NVIDIA GeForce GT 650M GPU 为我创建的模拟实现多线程为了确保一切正常工作我创建了一些辅助代码来测试一切是否正常在某一时刻我需要更新变量向量它们都可以单独更新这是它的要点 device int
一维纹理内存访问比一维全局内存访问更快吗？

我正在测量标准纹理和 1Dtexture 内存访问之间的差异为此我创建了两个内核 global void texture1D float doarray int size int index calculate each thread
CUDA 中的 JPEG 库

我正在尝试在 CUDA 中压缩和解压缩图像到目前为止我已经找到了这个库 http sourceforge net projects cuj2k source navbar http sourceforge net projects cuj
为什么使用 boost::none 无法通过 nvcc 编译？

我正在尝试编译以下代码 include
如何从尖点库矩阵格式获取原始指针

我需要从尖点库矩阵格式获取原始指针例如 cusp coo matrix

随机推荐

关于nmealib库的学习

调用流程前部分的调用流程如下 nmea parse gt nmea parser push parser buff buff sz gt nparsed nmea parser real push parser buff nparse g
[HLS]HTTP Live Streaming流与TS流比较

转自 http www cuplayer com player PlayerCode FMS 2013 0725 905 4 html 你说的应该是 HTTP Live Streaming 1 吧这个是 Apple 为了提高流播效率开发的
面向对象编程及其三大特征（JAVA）

面向对象编程及其三大特征 JAVA 一面向对象编程介绍面向过程与面向对象面向对象的本质二面向对象编程的三大特征封装封装的概念封装的优点封装举例 Person类继承继承的概念继承的优点继承类型继承注意事项继承举例
【2023B题】人工智能对大学生学习影响的评价(思路、代码）

目录 1 概述 2 Matlab代码实现 3 参考文献 4 运行结果 1 概述人工智能简称AI 最初由麦卡锡明斯基等科学家于1956年在美国达特茅斯学院开会研讨时提出 2016年人工智能AlphaGo 4 1战胜韩国围棋高手李世石期
MATLAB人脸检测算法

MATLAB人脸检测算法人脸检测是将人脸从复杂的背景图像中检测出来它是实现人脸识别的基础和前提肤色重建使用人脸肤色模板对灰度图像进行肤色重建使其变成彩色图像效果如下在函数function R gray2rgb1 img1 im
Qt 程序获取各种文件路径方法

Qt 程序获取程序所在路径用户目录路径临时文件夹等特殊路径的方法经常我们的程序中需要访问一些特殊的路径比如程序所在的路径用户目录路径临时文件夹等在 Qt 中实现这几个功能所用的方法虽然都不难但是各不相同每次用到时还要现去查
20张原版Python科学速查表，数据分析不再是难题

1 Python基础 2 Pandas基础 3 Numpy基础 4 PySpark基础 5 数组排序算法 6 scikit learn算法小抄内容就不给大家一一自行领取哦
编译UNP源码

读UNP需要用到官方的源码记录一下折腾的记录源码下载地址 http www unpbook com 下载源码后解压出文件夹unpv13e cd进入后依次进行下面操作 configure cd lib make 这两步没有问题出现w
程序内执行shell命令自动获取局域网内所有在线IP和开放端口（Linux）

前言题目说的有点长其实就是以下几件事第一是让Shell命令可以在程序内执行然后将执行结果传递给变量从而可以进行后续的事务处理第二是使用nmap命令获取当前局域网内所有在线IP和开放的端口最后就是对nmap获得的结果进行处理得
Nexus因异常重启导致OrientDB数据库变为只读的问题修复

文章目录环境问题描述问题排查解决步骤环境 CentOS 7 6 Nexus 3 21 2 问题描述装有Nexus的服务机在异常故障重启后 Nexus可以正常读取和拉取jar包但尝试上传时报错打开Nexus服务器上的日志文件发
机器学习PCA——实验报告

机器学习实验报告实验报告pdf可在该网址下载一实验目的与要求二实验内容与方法 2 0 PCA算法学习与回顾 2 0 1 PCA推导的优化问题 2 0 2 优化问题的解 2 0 3 算法流程 2 1 人脸数据集 2 2 实验流程图
Qt—QPainter基本图形绘制详解

QPainter描述 1 QPainter 类在小部件和其他绘制设备上执行低级绘制 2 QPainter 提供了高度优化的功能来完成大多数图形GUI程序所需的工作它可以画从简单的线条到复杂的形状它还可以绘制对齐的文本和像素图 QPain
sideeffects没配置css也还在,webpack4踩坑配置之sideEffects, 打包文件中没有css文件

最开始先来讲一下sideEffects sideEffects sideEffects意为副作用无副作用类似纯函数的概念接受参数并返回值不产生其他影响这里的副作用针对模块如果一个模块单纯的导入导出变量那就是无副作用的如果还修改其
docker从安装到部署一个项目

一 centos安装docker 参考博客 https blog csdn net m0 47010003 article details 127775185 1 设置一下下载Docker的镜像源设置下载的镜像源为国内的阿里云如果不设置
sqlilabs第五关

文章目录前言一二次注入二别的方法三 sqlmap 前言本来这篇博客前几天就准备写奈何因为各种原因拖到了现在然后参加ctf又白给不说了都是泪流眼泪 jpg 如有错误希望各位师傅指正拜托 jpg 一二次注入本次采用
iOS一键生成所有图标

记录一个iOS开发一键生成图标的网站 https icon wuruihong com
CUDA性能优化系列——Kmeans算法调优（二）

本篇介绍Kmeans算法中计算新的聚类中心部分这部分主要逻辑根据计算出的新的分类信息对全部数据点依次对每个类别求出所属当前类别的数据点个数与坐标和本质上就是进行规约运算 V1 Atomic实现全局规约由于最终生成16个聚类中心因
k-均值聚类Python代码实现

k 均值聚类的简介可以参考 http blog csdn net fengbingchun article details 79276668 这里给出两种方式的k 均值实现 code主要来自于网络 1 以下code来自于 https mub
TensorFlow、PyTorch分布式训练

要在两台主机之间使用分布式训练您可以使用一些深度学习框架提供的工具和库来实现这里以TensorFlow为例介绍一下如何在两台主机之间使用分布式训练首先您需要安装TensorFlow和CUDA等相关软件并确保两台主机都可以访问彼此
基于nvJpeg编解码图片

NvJpeg介绍 nvJPEG库为深度学习和超大规模多媒体应用程序中常用的图像格式提供了高性能 GPU加速的JPEG解码功能该库提供单个和批处理JPEG解码功能可有效利用可用GPU资源实现最佳性能以及用户管理内存分配的灵活性需要解码