CUDA学习（二十三）

2023-11-16

Direct3D互操作性：
Direct3D 9Ex，Direct3D 10和Direct3D 11支持Direct3D互操作性。
CUDA上下文只能与满足以下条件的Direct3D设备互操作：必须使用设置为D3DDEVTYPE_HAL的DeviceType和具有D3DCREATE_HARDWARE_VERTEXPROCESSING标志的BehaviorFlags创建Direct3D 9Ex设备; 必须使用设置为D3D_DRIVER_TYPE_HARDWARE的DriverType创建Direct3D 10和Direct3D 11设备。
可以映射到CUDA地址空间的Direct3D资源是Direct3D缓冲区，纹理和曲面。这些资源是使用cudaGraphicsD3D9RegisterResource（），cudaGraphicsD3D10RegisterResource（）和cudaGraphicsD3D11RegisterResource（）注册的。
以下代码示例使用内核动态修改存储在顶点缓冲区对象中的顶点的2D宽度x高度网格：
Direct3D 9 版本：

// Direct3D 9 interface object; assigned in main().
IDirect3D9* D3D;
// Direct3D 9 device; filled in by CreateDeviceEx() in main().
IDirect3DDevice9* device;
// Per-vertex layout of the vertex buffer: three FLOAT position components
// followed by a DWORD color.
struct CUSTOMVERTEX {
    FLOAT x, y, z;
    DWORD color;
};
// Vertex buffer created in main() and released in releaseVB().
IDirect3DVertexBuffer9* positionsVB;
// CUDA graphics-resource handle obtained when positionsVB is registered.
struct cudaGraphicsResource* positionsVB_CUDA;
// Sample entry point (Direct3D 9Ex): picks the first adapter that maps to a
// CUDA device, creates the Direct3D device on it, selects the same device for
// CUDA, registers a vertex buffer with CUDA and runs the render loop.
// Lines consisting of "..." are elisions kept from the original listing;
// hWnd, params, width and height are declared outside this listing.
// Returns 1 when no adapter maps to a CUDA device.
int main()
{
    int dev;
    // Initialize Direct3D
    // NOTE(review): in the Windows SDK, Direct3DCreate9Ex returns an HRESULT
    // and hands the interface back through an IDirect3D9Ex** out-parameter,
    // and CreateDeviceEx belongs to IDirect3D9Ex - confirm against the
    // declaration of D3D before building this.
    D3D = Direct3DCreate9Ex(D3D_SDK_VERSION);
    // Get a CUDA-enabled adapter
    unsigned int adapter = 0;
    for (; adapter < D3D->GetAdapterCount(); adapter++) {
        D3DADAPTER_IDENTIFIER9 adapterId;
        D3D->GetAdapterIdentifier(adapter, 0, &adapterId);
        if (cudaD3D9GetDevice(&dev, adapterId.DeviceName)
            == cudaSuccess)
            break;
    }
    // The loop ran off the end: no adapter maps to a CUDA device, so dev is
    // unset and adapter is out of range.
    if (adapter == D3D->GetAdapterCount())
        return 1;
    // Create device
    ...
    D3D->CreateDeviceEx(adapter, D3DDEVTYPE_HAL, hWnd,
        D3DCREATE_HARDWARE_VERTEXPROCESSING,
        &params, NULL, &device);
    // Use the same device
    cudaSetDevice(dev);
    // Create vertex buffer and register it with CUDA
    unsigned int size = width * height * sizeof(CUSTOMVERTEX);
    device->CreateVertexBuffer(size, 0, D3DFVF_CUSTOMVERTEX,
        D3DPOOL_DEFAULT, &positionsVB, 0);
    cudaGraphicsD3D9RegisterResource(&positionsVB_CUDA,
        positionsVB,
        cudaGraphicsRegisterFlagsNone);
    // CUDA only writes this buffer and does not need its previous contents.
    cudaGraphicsResourceSetMapFlags(positionsVB_CUDA,
        cudaGraphicsMapFlagsWriteDiscard);
    // Launch rendering loop
    while (...) {
        ...
        Render();
        ...
    }
    ...
}
// Renders one frame: maps the registered vertex buffer into CUDA, launches
// createVertices to rewrite the vertices, then unmaps the buffer before the
// draw/present step.
// Uses positionsVB_CUDA plus width, height and time, which are declared
// outside this listing.
void Render()
{
    // Map vertex buffer for writing from CUDA
    float4* positions;
    cudaGraphicsMapResources(1, &positionsVB_CUDA, 0);
    size_t num_bytes;
    cudaGraphicsResourceGetMappedPointer((void**)&positions,
        &num_bytes,
        positionsVB_CUDA);
    // Execute kernel
    // The integer division drops any partial tile, so this launch covers the
    // whole grid only when width and height are multiples of the block size.
    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
    createVertices<<<dimGrid, dimBlock>>>(positions, time,
        width, height);
    // Unmap vertex buffer
    cudaGraphicsUnmapResources(1, &positionsVB_CUDA, 0);
    // Draw and present
    ...
}
// Tears down the shared vertex buffer: unregisters it from CUDA first, then
// releases the Direct3D reference held in positionsVB.
void releaseVB()
{
    cudaGraphicsUnregisterResource(positionsVB_CUDA);
    positionsVB->Release();
}
// Kernel: writes one vertex of a width x height grid per thread.
//   positions - output array indexed as y * width + x; must hold at least
//               width * height float4 elements
//   time      - phase offset added to both wave arguments
//   width     - number of vertices along x
//   height    - number of vertices along y
// Expects a 2D launch; thread (x, y) is derived from blockIdx/blockDim/
// threadIdx in the x and y dimensions. Threads that fall outside the grid
// return without writing, so the launch may overshoot width/height.
__global__ void createVertices(float4* positions, float time,
    unsigned int width, unsigned int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Guard against launches that do not match the grid size exactly.
    if (x >= width || y >= height)
        return;
    // Calculate uv coordinates, remapped from [0, 1) to [-1, 1)
    float u = x / (float)width;
    float v = y / (float)height;
    u = u * 2.0f - 1.0f;
    v = v * 2.0f - 1.0f;
    // Calculate simple sine wave pattern
    float freq = 4.0f;
    float w = sinf(u * freq + time)
        * cosf(v * freq + time) * 0.5f;
    // Write positions; the fourth component carries the bit pattern
    // 0xff00ff00 (presumably consumed as CUSTOMVERTEX::color - confirm).
    positions[y * width + x] =
        make_float4(u, w, v, __int_as_float(0xff00ff00));
}

Direct3D 10 版本：

// Direct3D 10 device; filled in by D3D10CreateDeviceAndSwapChain() in main().
ID3D10Device* device;
// Per-vertex layout of the vertex buffer: three FLOAT position components
// followed by a DWORD color.
struct CUSTOMVERTEX {
    FLOAT x, y, z;
    DWORD color;
};
// Vertex buffer created in main() and released in releaseVB().
ID3D10Buffer* positionsVB;
// CUDA graphics-resource handle obtained when positionsVB is registered.
struct cudaGraphicsResource* positionsVB_CUDA;
// Sample entry point (Direct3D 10): enumerates DXGI adapters until one maps
// to a CUDA device, creates the device and swap chain on it, selects the
// same device for CUDA, registers a vertex buffer with CUDA and runs the
// render loop.
// Lines consisting of "..." are elisions kept from the original listing;
// swapChainDesc, swapChain, width and height are declared outside it.
// Returns 1 when no adapter maps to a CUDA device.
int main()
{
    int dev;
    // Get a CUDA-enabled adapter
    IDXGIFactory* factory;
    CreateDXGIFactory(__uuidof(IDXGIFactory), (void**)&factory);
    IDXGIAdapter* adapter = 0;
    for (unsigned int i = 0; !adapter; ++i) {
        if (FAILED(factory->EnumAdapters(i, &adapter))) {
            // Ran out of adapters; make sure no stale pointer survives.
            adapter = 0;
            break;
        }
        if (cudaD3D10GetDevice(&dev, adapter) == cudaSuccess)
            break;
        adapter->Release();
        // Clear the released pointer so the loop condition lets the search
        // continue with the next adapter instead of stopping here.
        adapter = 0;
    }
    factory->Release();
    // Nothing usable was found: dev is unset and adapter is null.
    if (!adapter)
        return 1;
    // Create swap chain and device
    ...
    D3D10CreateDeviceAndSwapChain(adapter,
        D3D10_DRIVER_TYPE_HARDWARE, 0,
        D3D10_CREATE_DEVICE_DEBUG,
        D3D10_SDK_VERSION,
        &swapChainDesc, &swapChain,
        &device);
    adapter->Release();
    // Use the same device
    cudaSetDevice(dev);
    // Create vertex buffer and register it with CUDA
    unsigned int size = width * height * sizeof(CUSTOMVERTEX);
    D3D10_BUFFER_DESC bufferDesc;
    bufferDesc.Usage = D3D10_USAGE_DEFAULT;
    bufferDesc.ByteWidth = size;
    bufferDesc.BindFlags = D3D10_BIND_VERTEX_BUFFER;
    bufferDesc.CPUAccessFlags = 0;
    bufferDesc.MiscFlags = 0;
    device->CreateBuffer(&bufferDesc, 0, &positionsVB);
    cudaGraphicsD3D10RegisterResource(&positionsVB_CUDA,
        positionsVB,
        cudaGraphicsRegisterFlagsNone);
    // CUDA only writes this buffer and does not need its previous contents.
    cudaGraphicsResourceSetMapFlags(positionsVB_CUDA,
        cudaGraphicsMapFlagsWriteDiscard);
    // Launch rendering loop
    while (...) {
        ...
        Render();
        ...
    }
    ...
}
// Renders one frame: maps the registered vertex buffer into CUDA, launches
// createVertices to rewrite the vertices, then unmaps the buffer before the
// draw/present step.
// Uses positionsVB_CUDA plus width, height and time, which are declared
// outside this listing.
void Render()
{
    // Map vertex buffer for writing from CUDA
    float4* positions;
    cudaGraphicsMapResources(1, &positionsVB_CUDA, 0);
    size_t num_bytes;
    cudaGraphicsResourceGetMappedPointer((void**)&positions,
        &num_bytes,
        positionsVB_CUDA);
    // Execute kernel
    // The integer division drops any partial tile, so this launch covers the
    // whole grid only when width and height are multiples of the block size.
    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
    createVertices<<<dimGrid, dimBlock>>>(positions, time,
        width, height);
    // Unmap vertex buffer
    cudaGraphicsUnmapResources(1, &positionsVB_CUDA, 0);
    // Draw and present
    ...
}
// Tears down the shared vertex buffer: unregisters it from CUDA first, then
// releases the Direct3D reference held in positionsVB.
void releaseVB()
{
    cudaGraphicsUnregisterResource(positionsVB_CUDA);
    positionsVB->Release();
}
// Kernel: writes one vertex of a width x height grid per thread.
//   positions - output array indexed as y * width + x; must hold at least
//               width * height float4 elements
//   time      - phase offset added to both wave arguments
//   width     - number of vertices along x
//   height    - number of vertices along y
// Expects a 2D launch; thread (x, y) is derived from blockIdx/blockDim/
// threadIdx in the x and y dimensions. Threads that fall outside the grid
// return without writing, so the launch may overshoot width/height.
__global__ void createVertices(float4* positions, float time,
    unsigned int width, unsigned int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Guard against launches that do not match the grid size exactly.
    if (x >= width || y >= height)
        return;
    // Calculate uv coordinates, remapped from [0, 1) to [-1, 1)
    float u = x / (float)width;
    float v = y / (float)height;
    u = u * 2.0f - 1.0f;
    v = v * 2.0f - 1.0f;
    // Calculate simple sine wave pattern
    float freq = 4.0f;
    float w = sinf(u * freq + time)
        * cosf(v * freq + time) * 0.5f;
    // Write positions; the fourth component carries the bit pattern
    // 0xff00ff00 (presumably consumed as CUSTOMVERTEX::color - confirm).
    positions[y * width + x] =
        make_float4(u, w, v, __int_as_float(0xff00ff00));
}

Direct3D 11 版本：

// Direct3D 11 device; filled in by the device/swap-chain creation call in main().
ID3D11Device* device;
// Per-vertex layout of the vertex buffer: three FLOAT position components
// followed by a DWORD color.
struct CUSTOMVERTEX {
    FLOAT x, y, z;
    DWORD color;
};
// Vertex buffer created in main() and released in releaseVB().
ID3D11Buffer* positionsVB;
// CUDA graphics-resource handle obtained when positionsVB is registered.
struct cudaGraphicsResource* positionsVB_CUDA;
// Sample entry point (Direct3D 11): enumerates DXGI adapters until one maps
// to a CUDA device, creates the device and swap chain on it, selects the
// same device for CUDA, registers a vertex buffer with CUDA and runs the
// render loop.
// Lines consisting of "..." are elisions kept from the original listing;
// sFnPtr_D3D11CreateDeviceAndSwapChain, featureLevels, featureLevel,
// swapChainDesc, swapChain, deviceContext, width and height are declared
// outside it.
// Returns 1 when no adapter maps to a CUDA device.
int main()
{
    int dev;
    // Get a CUDA-enabled adapter
    IDXGIFactory* factory;
    CreateDXGIFactory(__uuidof(IDXGIFactory), (void**)&factory);
    IDXGIAdapter* adapter = 0;
    for (unsigned int i = 0; !adapter; ++i) {
        if (FAILED(factory->EnumAdapters(i, &adapter))) {
            // Ran out of adapters; make sure no stale pointer survives.
            adapter = 0;
            break;
        }
        if (cudaD3D11GetDevice(&dev, adapter) == cudaSuccess)
            break;
        adapter->Release();
        // Clear the released pointer so the loop condition lets the search
        // continue with the next adapter instead of stopping here.
        adapter = 0;
    }
    factory->Release();
    // Nothing usable was found: dev is unset and adapter is null.
    if (!adapter)
        return 1;
    // Create swap chain and device
    ...
    // Direct3D 11 shares the D3D_DRIVER_TYPE enum; there is no
    // D3D11_DRIVER_TYPE_* constant.
    // NOTE(review): Microsoft's documentation says DriverType must be
    // D3D_DRIVER_TYPE_UNKNOWN when a non-NULL adapter is passed - confirm
    // which value this call needs.
    sFnPtr_D3D11CreateDeviceAndSwapChain(adapter,
        D3D_DRIVER_TYPE_HARDWARE,
        0,
        D3D11_CREATE_DEVICE_DEBUG,
        featureLevels, 3,
        D3D11_SDK_VERSION,
        &swapChainDesc, &swapChain,
        &device,
        &featureLevel,
        &deviceContext);
    adapter->Release();
    // Use the same device
    cudaSetDevice(dev);
    // Create vertex buffer and register it with CUDA
    unsigned int size = width * height * sizeof(CUSTOMVERTEX);
    D3D11_BUFFER_DESC bufferDesc;
    bufferDesc.Usage = D3D11_USAGE_DEFAULT;
    bufferDesc.ByteWidth = size;
    bufferDesc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
    bufferDesc.CPUAccessFlags = 0;
    bufferDesc.MiscFlags = 0;
    // D3D11_BUFFER_DESC has this extra member; leaving it unset would pass
    // an indeterminate value to CreateBuffer.
    bufferDesc.StructureByteStride = 0;
    device->CreateBuffer(&bufferDesc, 0, &positionsVB);
    cudaGraphicsD3D11RegisterResource(&positionsVB_CUDA,
        positionsVB,
        cudaGraphicsRegisterFlagsNone);
    // CUDA only writes this buffer and does not need its previous contents.
    cudaGraphicsResourceSetMapFlags(positionsVB_CUDA,
        cudaGraphicsMapFlagsWriteDiscard);
    // Launch rendering loop
    while (...) {
        ...
        Render();
        ...
    }
    ...
}
// Renders one frame: maps the registered vertex buffer into CUDA, launches
// createVertices to rewrite the vertices, then unmaps the buffer before the
// draw/present step.
// Uses positionsVB_CUDA plus width, height and time, which are declared
// outside this listing.
void Render()
{
    // Map vertex buffer for writing from CUDA
    float4* positions;
    cudaGraphicsMapResources(1, &positionsVB_CUDA, 0);
    size_t num_bytes;
    cudaGraphicsResourceGetMappedPointer((void**)&positions,
        &num_bytes,
        positionsVB_CUDA);
    // Execute kernel
    // The integer division drops any partial tile, so this launch covers the
    // whole grid only when width and height are multiples of the block size.
    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
    createVertices<<<dimGrid, dimBlock>>>(positions, time,
        width, height);
    // Unmap vertex buffer
    cudaGraphicsUnmapResources(1, &positionsVB_CUDA, 0);
    // Draw and present
    ...
}
// Tears down the shared vertex buffer: unregisters it from CUDA first, then
// releases the Direct3D reference held in positionsVB.
void releaseVB()
{
    cudaGraphicsUnregisterResource(positionsVB_CUDA);
    positionsVB->Release();
}
// Kernel: writes one vertex of a width x height grid per thread.
//   positions - output array indexed as y * width + x; must hold at least
//               width * height float4 elements
//   time      - phase offset added to both wave arguments
//   width     - number of vertices along x
//   height    - number of vertices along y
// Expects a 2D launch; thread (x, y) is derived from blockIdx/blockDim/
// threadIdx in the x and y dimensions. Threads that fall outside the grid
// return without writing, so the launch may overshoot width/height.
__global__ void createVertices(float4* positions, float time,
    unsigned int width, unsigned int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Guard against launches that do not match the grid size exactly.
    if (x >= width || y >= height)
        return;
    // Calculate uv coordinates, remapped from [0, 1) to [-1, 1)
    float u = x / (float)width;
    float v = y / (float)height;
    u = u * 2.0f - 1.0f;
    v = v * 2.0f - 1.0f;
    // Calculate simple sine wave pattern
    float freq = 4.0f;
    float w = sinf(u * freq + time)
        * cosf(v * freq + time) * 0.5f;
    // Write positions; the fourth component carries the bit pattern
    // 0xff00ff00 (presumably consumed as CUSTOMVERTEX::color - confirm).
    positions[y * width + x] =
        make_float4(u, w, v, __int_as_float(0xff00ff00));
}

SLI互操作性：
在具有多个GPU的系统中，所有支持CUDA的GPU都可以通过CUDA驱动程序和运行时作为单独的设备访问。当系统处于SLI模式时，有以下特殊的考虑:
首先，在某个GPU上的CUDA设备中进行的内存分配，会同时消耗属于该Direct3D或OpenGL设备SLI配置的其他GPU上的内存。因此，分配可能会比预期更早失败；
其次，应用程序应该创建多个CUDA上下文，SLI配置中的每个GPU各一个。虽然这不是一个严格的要求，但它避免了设备之间不必要的数据传输。应用程序可以使用Direct3D的cudaD3D[9|10|11]GetDevices（）和OpenGL的cudaGLGetDevices（）函数，来获取在当前帧和下一帧中执行渲染的设备所对应的CUDA设备句柄。有了这些信息，应用程序通常会选择合适的设备，并把Direct3D或OpenGL资源映射到该CUDA设备上，即在deviceList参数设置为cudaD3D[9|10|11]DeviceListCurrentFrame或cudaGLDeviceListCurrentFrame时，由cudaD3D[9|10|11]GetDevices（）或cudaGLGetDevices（）返回的CUDA设备。
请注意，从cudaGraphicsD3D[9|10|11]RegisterResource和cudaGraphicsGLRegister[Buffer|Image]返回的资源只能在执行注册的那个设备上使用。因此，在SLI配置中，当不同帧的数据在不同的CUDA设备上计算时，需要为每个设备分别注册资源。
有关CUDA运行时如何分别与Direct3D和OpenGL互操作的详细信息，请参见Direct3D互操作性和OpenGL互操作性。
timg

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

python

CUDA学习（二十三）的相关文章

如何生成给定范围内的回文数列表？

假设范围是 1 X 120 这是我尝试过的 gt gt gt def isPalindrome s check if a number is a Palindrome s str s return s s 1 gt gt gt def ge
如何在android上的python kivy中关闭应用程序后使服务继续工作

我希望我的服务在关闭应用程序后继续工作但我做不到我听说我应该使用startForeground 但如何在Python中做到这一点呢应用程序代码 from kivy app import App from kivy uix floatl
如何在Windows上模拟socket.socketpair

标准Python函数套接字套接字对 https docs python org 3 library socket html socket socketpair不幸的是它在 Windows 上不可用从 Python 3 4 1 开始我
SQL Alchemy 中的 NULL 安全不等式比较？

目前我知道如何表达 NULL 安全的唯一方法 SQL Alchemy 中的比较其中与 NULL 条目的比较计算结果为 True 而不是 NULL 是 or field None field value 有没有办法在 SQL Alchem
Spark的distinct()函数是否仅对每个分区中的不同元组进行洗牌

据我了解 distinct 哈希分区 RDD 来识别唯一键但它是否针对仅移动每个分区的不同元组进行了优化想象一个具有以下分区的 RDD 1 2 2 1 4 2 2 1 3 3 5 4 5 5 5 在此 RDD 上的不同键上所有重复键
为 pandas 数据透视表中的每个值列定义 aggfunc

试图生成具有多个值列的数据透视表我知道我可以使用 aggfunc 按照我想要的方式聚合值但是如果我不想对两列求和或求平均值而是想要一列的总和同时求另一列的平均值该怎么办那么使用 pandas 可以做到这一点吗 df pd D
如何使用装饰器禁用某些功能的中间件？

我想模仿的行为csrf exempt see here https docs djangoproject com en 1 11 ref csrf django views decorators csrf csrf exempt and h
运行多个 scrapy 蜘蛛的正确方法

我只是尝试使用在同一进程中运行多个蜘蛛新的 scrapy 文档 http doc scrapy org en 1 0 topics practices html但我得到 AttributeError CrawlerProcess objec
在 NumPy 中获取 ndarray 的索引和值

我有一个 ndarrayA任意维数N 我想创建一个数组B元组数组或列表其中第一个N每个元组中的元素是索引最后一个元素是该索引的值A 例如 A array 1 2 3 4 5 6 Then B 0 0 1 0 1 2 0 2 3 1 0
feedparser 在脚本运行期间失败，但无法在交互式 python 控制台中重现

当我运行 eclipse 或在 iPython 中运行脚本时它失败了 ascii codec can t decode byte 0xe2 in position 32 ordinal not in range 128 我不知道为什么但
python pandas 中的双端队列

我正在使用Python的deque 实现一个简单的循环缓冲区 from collections import deque import numpy as np test sequence np array range 100 2 resha
Python：字符串不会转换为浮点数[重复]

这个问题在这里已经有答案了我几个小时前写了这个程序 while True print What would you like me to double line raw input gt if line done break else f
Pandas Dataframe 中 bool 值的条件前向填充

问题如何转发 fill boolTruepandas 数据框中的值如果是当天的第一个条目 True 到一天结束时请参阅以下示例和所需的输出 Data import pandas as pd import numpy as np df
如何将 numpy.matrix 提高到非整数幂？

The 运算符为numpy matrix不支持非整数幂 gt gt gt m matrix 1 0 0 5 0 5 gt gt gt m 2 5 TypeError exponent must be an integer 我想要的是 oct
如何将 PIL 图像转换为 NumPy 数组？

如何转换 PILImage来回转换为 NumPy 数组这样我就可以比 PIL 进行更快的像素级转换PixelAccess允许我可以通过以下方式将其转换为 NumPy 数组 pic Image open foo jpg pix numpy
设置 torch.gather(...) 调用的结果

我有一个形状为 n x m 的 2D pytorch 张量我想使用索引列表来索引第二个维度可以使用 torch gather 完成然后然后还设置新值到索引的结果 Example data torch tensor 0 1 2 3 4
如何从没有结尾的管道中读取 python 中的 stdin

当管道来自打开时不知道正确的名称我无法从 python 中的标准输入或管道读取数据文件我有作为例子管道测试 py import sys import time k 0 try for line in sys stdin k k
在 Pandas DataFrame Python 中添加新列[重复]

这个问题在这里已经有答案了例如我在 Pandas 中有数据框 Col1 Col2 A 1 B 2 C 3 现在如果我想再添加一个名为 Col3 的列并且该值基于 Col2 式中如果Col2 gt 1 则Col3为0 否则为1 所以
在python中，如何仅搜索所选子字符串之前的一个单词

给定文本文件中的长行列表我只想返回紧邻其前面的子字符串例如单词狗描述狗的单词例如假设有这些行包含狗 hotdog big dog is dogged dog spy with my dog brown dogs 在这种情况下期望
Spark.read 在 Databricks 中给出 KrbException

我正在尝试从 databricks 笔记本连接到 SQL 数据库以下是我的代码 jdbcDF spark read format com microsoft sqlserver jdbc spark option url jdbc sql

随机推荐

再见！微软官宣放弃Mac 版 Visual Studio IDE

程序员的成长之路互联网程序员技术资料共享关注阅读本文大概需要 5 分钟来自撰稿丨千山对于Visual Studio 只要是开发者或多或少都接触过发布于1997年的Visual Studio标志着微软第一次将这么多开发工
关系代数之专门的关系运算（选择、投影）

选择选择运算是从指定的关系中选出满足给定条件用逻辑表达式表达的元组而组成一个新的关系进行选择运算的对象是一个关系当中某一个属性的值选择运算是将一张表当中的某一属性进行筛选比如将性别 sex 这列当中性别为女的元组筛选出来组成
constraint的一些用法总结

主要就是增加约束的以下几种约束并一一列举 1 主键约束要对一个列加主键约束的话这列就必须要满足的条件就是分空因为主键约束就是对一个列进行了约束约束为非空不重复以下是代码要对一个列加主键列名为id 表名为emp 格式
刷脸支付：双12刷脸支付5折省翻天，政策持续补贴预热双十二

刷脸支付成潮流三家巨头争江山刷脸支付的使用场景正在深入拓展进入每一个与人民生活息息相关的行业在政府综合政务社会公交运输商超营销运营酒店景区服务等各方面都出现了刷脸支付的身影我们能够看到科技是在进步的社会是在进步的二维码
Caffe2——cifar10数据集创建lmdb或leveldb类型的数据

cifar10数据集和mnist数据集存储方式不同 cifar10数据集把标签和图像数据以bin文件的方式存放在同一个文件内这种存放方式使得每个子cifar数据bin文件的结构相同所以cifar转换数据代码比mnist的代码更加的模块化
Vue项目提示 doesn‘t work properly without JavaScript enabled. Please enable it to continue

由于本地是用docker部署了一套微服务为了避免跨越问题前端使用的nginx配置转发后端路径访问返回状态时200 但是在response返回We re sorry but doesn t work properly without J
「雕爷学编程」Arduino动手做（37）——MQ-3乙醇易燃气酒精传感器模块

37款传感器与模块的提法在网络上广泛流传其实Arduino能够兼容的传感器模块肯定是不止37种的鉴于本人手头积累了一些传感器和模块依照实践出真知一定要动手做的理念以学习和交流为目的这里准备逐一动手试试做实验不管成功与否都
Android studio心得——fragment动态加载

前言在Android应用程序中 Fragment是一种可以嵌入Activity中的组件通过 Fragment 我们可以将UI 目录前言一什么是Android Studio 二简介Fragment 三学期知识汇总四什么是碎片
C++类与对象--static修饰符

C 类与对象 static修饰符 1 类静态数据成员的定义及初始化 1 1 声明 1 2 初始化 1 3 调用 1 4 案例 1 5 小结 2 类静态成员函数的定义 2 1 声明 2 2 调用 2 3 案例 2 4 小结 3 static
数据库字段类型

太长时间没有操作数据库收集了部分有用的资料一创建数据表 CREATE TABLE mytable id VARCHAR 4 NOT NULL name VARCHAR 10 sex CHAR 1 createtime DATE age
ROS系统

参考 https blog csdn net qq 28087491 article details 119053810 https www bilibili com video BV1zt411G7Vn spm id from 333 3
静态网页怎样实现动态交互?-JavaScript

在Html基础上 javascript能够开发交互式web网页 javascript的出现使得网页和用户之间实现了一种实时性的动态的交互性的关系 javascript短小精悍又是在客户机上执行的大大提高了网页的浏览速度和交互能力同
Python高级培训第三次作业

任务作业 import threading 导入threading库 import time 导入time库 class Get time object 创建类Get time 用于获取当前时间 def init self each ti
“msg“:“Request method ‘GET‘ not supported“,“code“:500原因及解决

GetMapping add parentId 这里的路径纠错漏了 controller 缺少add的保存方法 GetMapping add parentId 及其以下 Html出现错误如下图
B树及其基本操作、B+树的基本概念

B树及其基本操作 B 树的基本概念 1 B树 B 树的基本概念 1 B树的基本概念及性质 2 B 树的基本概念及性质 2 B树与B 树的区别 3 B树的基本操作 1 B树的查找 2 B树的插入 3 B树的删除 1 B树 B 树的基本概念 1
SpringBoot集成海康设备网络SDK

文章目录 SDK介绍概述功能下载对接指南集成初始化项目初始化SDK 初始化SDK概述新建AppRunner 新建SdkInitService 新建InitSdkTask 新建 HCNetSDK 调用业务接口部署拷贝so库
解决鼠标右击菜单的新建中没有“文本文档”的问题

解决鼠标右击菜单的新建中没有文本文档的问题原创丶无殇 2022 2 12 注意博主测试平台为WIN10系统其他系统不保证一定可以一问题现象在桌面右击打开新建菜单时没有文本文档这个选项二问题原因有以下可能安装某个软件
单链表的数据结构和基本操作

单链表的基本操作头结点单链表的基本操作头结点单链表的数据结构头结点的初始化插入新结点头插法插入新结点尾插法插入新结点按位置插入新结点删除节点头删尾删按位置删头指针单链表的基本操作实现代码链表是一种线性结构在存储
UE4 技能系统(GAS插件的使用) 01--Build Basic Classes 创建基础类

翻译自Udemy的视频课程introduction to unreal engine 4 ability system Build Basic Classes 创建基础类 1 创建工程和人物设置 01 创建Git仓库使用UE4 22 VS
CUDA学习（二十三）

Direct3D互操作性 Direct3D 9Ex Direct3D 10和Direct3D 11支持Direct3D互操作性 CUDA上下文只能与满足以下条件的Direct3D设备互操作必须使用设置为D3DDEVTYPE HAL的Dev

CUDA学习（二十三）

CUDA学习（二十三） 的相关文章

随机推荐

热门标签

CUDA学习（二十三）的相关文章