CUDA实现多batch基数排序

2023-11-12

CUDA实现多batch基数排序

基数排序是一种迭代次数固定的排序算法，它从最低位到最高位逐位比较，从而完成对数值的排序。GPU版的基数排序将数据分为N个部分，各部分并行进行基数排序，随后通过并行规约合并得到排序后的数组。
这里实现了一版多batch的基数排序实例，并同时输出原数组的序号，基本实现了argsort的功能，代码如下所示：

#include<iostream>
#include<cuda_runtime.h>


/**
 * Rewrites the floats of one batch row, in place, as unsigned keys whose
 * unsigned-integer order matches the numeric order of the original floats.
 *
 * Values with the sign bit set have every bit inverted; all other values only
 * get the sign bit set. Aeprocess_float applies the inverse mapping.
 *
 * @param data    buffer holding `batch` rows of `numData` floats each
 * @param batch   number of rows in `data`
 * @param numData number of elements per row
 * @param tidx    row handled by the calling thread
 * @param tidy    first element handled by the calling thread; the thread then
 *                advances in steps of blockDim.y
 */
__device__ void preprocess_float(float* const data, int batch, int numData, int tidx,int tidy)
{
    // Valid rows are 0 .. batch-1, so row == batch must be rejected as well.
    if(tidx >= batch) return;

    // The loop condition already handles tidy >= numData (no iterations).
    for(int i = tidy; i < numData; i += blockDim.y)
    {
        unsigned int* bits = reinterpret_cast<unsigned int*>(&data[i + tidx*numData]);
        const unsigned int raw = *bits;
        *bits = (raw >> 31) ? ~raw : (raw | 0x80000000u);
    }
}

/**
 * Inverse of preprocess_float: turns the order-preserving unsigned keys of one
 * batch row back into their IEEE-754 float bit patterns, in place.
 *
 * Keys with the top bit set get that bit cleared; all other keys have every
 * bit inverted.
 *
 * @param data    buffer holding `batch` rows of `numData` keys each
 * @param batch   number of rows in `data`
 * @param numData number of elements per row
 * @param tidx    row handled by the calling thread
 * @param tidy    first element handled by the calling thread; the thread then
 *                advances in steps of blockDim.y
 */
__device__ void Aeprocess_float(float* const data, int batch, int numData, int tidx,int tidy)
{
    // Same row guard as preprocess_float so both transforms cover the same rows.
    if(tidx >= batch) return;

    for(int i = tidy; i < numData; i += blockDim.y)
    {
        unsigned int* bits = reinterpret_cast<unsigned int*>(&data[i + tidx*numData]);
        const unsigned int key = *bits;
        *bits = (key >> 31) ? (key & 0x7fffffffu) : ~key;
    }
}


/**
 * Per-thread LSD radix sort over 32 key bits.
 *
 * The calling thread owns the elements tidy, tidy + blockDim.y,
 * tidy + 2*blockDim.y, ... of row `tidx` and sorts exactly those elements
 * among themselves; it never reads or writes positions owned by another
 * thread. After the call each thread's strided sub-list in data0 is ordered
 * by unsigned key, but the row as a whole is not yet sorted.
 *
 * @param data0   keys to sort (in place), `batch` rows of `numData` elements
 * @param data1   scratch buffer of the same shape as data0
 * @param tidx    row handled by the calling thread
 * @param tidy    first element of the calling thread's strided sub-list
 * @param batch   number of rows
 * @param numData number of elements per row
 */
__device__ void radixKernel(float* data0,float* data1,int tidx,int tidy,int batch,int numData){
    if(tidx >= batch) return;

    float* const keys = data0 + tidx*numData;
    float* const scratch = data1 + tidx*numData;
    const int stride = blockDim.y;

    for(int bit=0;bit<32;bit++){
        // Unsigned literal: shifting a signed 1 into bit 31 is not well defined.
        const unsigned int mask = 1u << bit;
        int zeros = 0;   // offset of the next free "bit == 0" position
        int ones = 0;    // offset of the next free "bit == 1" position

        for(int i=tidy;i<numData;i+=stride){
            const float value = keys[i];
            if(*reinterpret_cast<const unsigned int*>(&keys[i]) & mask){
                scratch[tidy + ones] = value;
                ones += stride;
            }
            else{
                // The write position never runs ahead of the read position i,
                // so compacting the zero-bit elements in place is safe.
                keys[tidy + zeros] = value;
                zeros += stride;
            }
        }

        // Append the one-bit elements after the zero-bit ones; keeping their
        // relative order makes every pass stable.
        for(int j=0;j<ones;j+=stride){
            keys[tidy + zeros + j] = scratch[tidy + j];
        }
    }
}

/**
 * Merges the per-thread sorted sub-lists of each row into one sorted row.
 *
 * Thread (tidx, tidy) owns the elements tidy, tidy + blockDim.y, ... of row
 * tidx in data0, which must already be ordered by unsigned key. For every
 * output position the threads of a row publish the head of their list in
 * shared memory, a tree reduction selects the smallest key, and thread
 * tidy == 0 writes it to data1 and advances the winning list.
 *
 * Preconditions visible from the code:
 *  - every thread of the block must call this function (it contains
 *    __syncthreads());
 *  - tidx must be < batch, otherwise the shared-memory slots overlap;
 *  - blockDim.y must be a power of two, otherwise the halving reduction
 *    skips some slots;
 *  - the launch must provide at least 4 * batch * blockDim.y * 4 bytes of
 *    dynamic shared memory.
 *
 * NOTE: `index` receives the position inside the row of data0 from which the
 * value was read. That is the element's original index only if data0 was not
 * permuted beforehand, e.g. when numData <= blockDim.y and every thread owns
 * at most one element.
 * NOTE: a real key equal to 0xffffffff cannot be told apart from the
 * "list exhausted" sentinel.
 *
 * @param data0   input keys, `batch` rows of `numData` elements
 * @param data1   output keys, same shape
 * @param index   output positions, same shape
 * @param tidx    row handled by the calling thread
 * @param tidy    lane of the calling thread inside the row
 * @param batch   number of rows
 * @param numData number of elements per row
 */
__device__ void mergeKernel(float* data0,float* data1,int* index,int tidx,int tidy,int batch,int numData){
    const int lanes = blockDim.y;
    // Integer ceil-div; a float-based ceil loses precision for large numData.
    const int numPerList = (numData + lanes - 1) / lanes;

    extern __shared__ int listIndexrecordValrecordTid[];
    int* listIndex = listIndexrecordValrecordTid;                           // read cursor of every list
    float* recordVal = (float*)listIndexrecordValrecordTid + batch*lanes;   // candidate key per lane
    int* recordTid = listIndexrecordValrecordTid + 2*batch*lanes;           // slot that owns the candidate
    int* recordSrcIndex = listIndexrecordValrecordTid + 3*batch*lanes;      // position the candidate came from

    const int rowBase = tidx * lanes;
    const int slot = rowBase + tidy;

    listIndex[slot] = 0;
    __syncthreads();

    for(int i=0;i<numData;i++){
        recordTid[slot] = slot;
        recordSrcIndex[slot] = 0;

        const int cursor = listIndex[slot];
        const int element = tidy + cursor*lanes;
        if(cursor < numPerList && element < numData){
            const int src_index = element + tidx*numData;
            recordVal[slot] = data0[src_index];
            recordSrcIndex[slot] = src_index;
        }
        else{
            // List exhausted: publish the largest key so this lane never wins.
            *reinterpret_cast<unsigned int*>(&recordVal[slot]) = 0xffffffffu;
        }
        __syncthreads();

        // Tree reduction for the minimum key of the row. Strict '<' keeps the
        // lower lane on ties.
        for(int half = lanes >> 1; half != 0; half >>= 1){
            if(tidy < half){
                const unsigned int mine = *reinterpret_cast<const unsigned int*>(&recordVal[slot]);
                const unsigned int other = *reinterpret_cast<const unsigned int*>(&recordVal[slot + half]);
                if(other < mine){
                    recordVal[slot] = recordVal[slot + half];
                    recordTid[slot] = recordTid[slot + half];
                    recordSrcIndex[slot] = recordSrcIndex[slot + half];
                }
            }
            __syncthreads();
        }

        if(tidy==0){
            listIndex[recordTid[rowBase]]++;   // advance the list that supplied the minimum
            data1[i + tidx * numData] = recordVal[rowBase];
            index[i + tidx * numData] = recordSrcIndex[rowBase] % numData;
        }
        // Make the cursor update visible before the next round starts.
        __syncthreads();
    }
}

/**
 * Sorts every row of a batch of floats in ascending order.
 *
 * Pipeline: map floats to order-preserving keys, radix-sort each thread's
 * strided sub-list, merge the sub-lists of every row, then map the keys in
 * the output back to floats.
 *
 * Only threadIdx is read (never blockIdx): threadIdx.x selects the row and
 * threadIdx.y the lane inside the row. src_data is overwritten with
 * intermediate keys; the sorted floats end up in dst_data and the positions
 * reported by mergeKernel in index. mergeKernel uses dynamic shared memory,
 * so the launch has to supply it.
 *
 * @param src_data input values, `batch` rows of `dataLen` floats (clobbered)
 * @param dst_data output buffer of the same shape
 * @param index    output positions of the same shape
 * @param batch    number of rows
 * @param dataLen  number of elements per row
 */
__global__ void radixSortGpu(float* src_data, float* dst_data, int* index,int batch, int dataLen){
    const int row = threadIdx.x;
    const int lane = threadIdx.y;

    // 1) floats -> sortable unsigned keys
    preprocess_float(src_data, batch, dataLen, row, lane);
    __syncthreads();

    // 2) each thread sorts its own strided sub-list
    radixKernel(src_data, dst_data, row, lane, batch, dataLen);
    __syncthreads();

    // 3) merge the sub-lists of every row into dst_data / index
    mergeKernel(src_data, dst_data, index, row, lane, batch, dataLen);
    __syncthreads();

    // 4) keys -> floats
    Aeprocess_float(dst_data, batch, dataLen, row, lane);
}


/**
 * Demo driver: fills `batch` rows with pseudo-random floats, sorts every row
 * with radixSortGpu and prints the inputs, the sorted values and the reported
 * indices. Returns 1 if any CUDA call fails, 0 otherwise.
 */
int main(){
    using namespace std;
    const int batch = 4;
    const int inputsLen = 40;
    // Threads per row (blockDim.y). The merge step halves this value
    // repeatedly, so it has to be a power of two.
    const int threadsPerRow = 128;
    const size_t elemCount = static_cast<size_t>(inputsLen) * batch;

    float* inputs = nullptr;
    float* outputs = nullptr;
    int* index = nullptr;
    srand(100);

    // Reports a failed CUDA call; returns true on success.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if(status == cudaSuccess) return true;
        cerr<<what<<" failed: "<<cudaGetErrorString(status)<<endl;
        return false;
    };
    // cudaFree(nullptr) is a no-op, so this is safe on every exit path.
    auto release = [&]() {
        cudaFree(inputs);
        cudaFree(outputs);
        cudaFree(index);
    };

    if(!ok(cudaMallocManaged(&inputs,sizeof(float)*elemCount),"cudaMallocManaged(inputs)") ||
       !ok(cudaMallocManaged(&outputs,sizeof(float)*elemCount),"cudaMallocManaged(outputs)") ||
       !ok(cudaMallocManaged(&index,sizeof(int)*elemCount),"cudaMallocManaged(index)")){
        release();
        return 1;
    }

    cout<<"input rand :"<<endl;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            inputs[i+j*inputsLen] = (float)rand()/(float)RAND_MAX;
            index[i+j*inputsLen] = i;
            cout<<inputs[i+j*inputsLen]<<", ";
        }
        cout<<" "<<endl;
    }
    cout<<" "<<endl;

    // One block: x selects the row, y the lane inside the row. The merge step
    // needs four shared arrays of batch * threadsPerRow 4-byte entries.
    const dim3 blockSize(batch,threadsPerRow);
    const dim3 gridSize(1,1);
    const size_t sharedBytes = 4*batch*threadsPerRow*sizeof(float);
    radixSortGpu<<<gridSize,blockSize,sharedBytes>>>(inputs,outputs,index,batch,inputsLen);
    if(!ok(cudaGetLastError(),"radixSortGpu launch") ||
       !ok(cudaDeviceSynchronize(),"radixSortGpu execution")){
        release();
        return 1;
    }

    cout<<"output :"<<endl;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            cout<<outputs[i+j*inputsLen]<<", ";
        }
        cout<<" "<<endl;
    }
    cout<<" "<<endl;

    cout<<"index :"<<endl;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            cout<<index[i+j*inputsLen]<<", ";
        }
        cout<<" "<<endl;
    }
    cout<<" "<<endl;

    release();
    return 0;
}

编译执行代码，可以得到结果:

input rand :
0.315598, 0.284943, 0.240601, 0.484127, 0.375793, 0.0537027, 0.570274, 0.970005, 0.515422, 0.429529, 0.408115, 0.150135, 0.586551, 0.631635, 0.61386, 0.411339, 0.107092, 0.871626, 0.264386, 0.621543, 0.670743, 0.358033, 0.208356, 0.534175, 0.384512, 0.844556, 0.883552, 0.461531, 0.650512, 0.772418, 0.496347, 0.96611, 0.0573612, 0.736949, 0.450236, 0.433154, 0.790652, 0.0205103, 0.403159, 0.306074,  
0.450039, 0.811274, 0.456208, 0.0365907, 0.442909, 0.0700681, 0.44793, 0.550001, 0.941694, 0.712316, 0.171544, 0.612436, 0.0703487, 0.3799, 0.146612, 0.45486, 0.224456, 0.0301636, 0.916391, 0.874968, 0.802581, 0.412738, 0.841078, 0.859943, 0.149687, 0.291314, 0.293097, 0.940339, 0.311825, 0.696256, 0.246413, 0.761864, 0.50753, 0.702621, 0.798455, 0.950439, 0.772689, 0.246385, 0.50044, 0.714383,  
0.9587, 0.671984, 0.326819, 0.0290491, 0.0518843, 0.473431, 0.483909, 0.27634, 0.503595, 0.4003, 0.151308, 0.306176, 0.813039, 0.992386, 0.166119, 0.962726, 0.2837, 0.459215, 0.903065, 0.595525, 0.155472, 0.149477, 0.357389, 0.663002, 0.852098, 0.155843, 0.613441, 0.624787, 0.402228, 0.113881, 0.33917, 0.360928, 0.785866, 0.665989, 0.389977, 0.83775, 0.13942, 0.873886, 0.11409, 0.643015,  
0.274187, 0.265398, 0.949191, 0.0872253, 0.257784, 0.115309, 0.0499512, 0.541484, 0.574525, 0.953016, 0.137009, 0.729996, 0.102493, 0.494398, 0.392998, 0.954591, 0.650241, 0.00643936, 0.579378, 0.0524684, 0.120321, 0.918549, 0.413396, 0.906187, 0.584538, 0.803373, 0.743937, 0.723958, 0.67726, 0.858027, 0.366973, 0.951447, 0.123425, 0.316164, 0.0386718, 0.38121, 0.431473, 0.0886231, 0.922694, 0.00599772,  
 
output :
0.0205103, 0.0537027, 0.0573612, 0.107092, 0.150135, 0.208356, 0.240601, 0.264386, 0.284943, 0.306074, 0.315598, 0.358033, 0.375793, 0.384512, 0.403159, 0.408115, 0.411339, 0.429529, 0.433154, 0.450236, 0.461531, 0.484127, 0.496347, 0.515422, 0.534175, 0.570274, 0.586551, 0.61386, 0.621543, 0.631635, 0.650512, 0.670743, 0.736949, 0.772418, 0.790652, 0.844556, 0.871626, 0.883552, 0.96611, 0.970005,  
0.0301636, 0.0365907, 0.0700681, 0.0703487, 0.146612, 0.149687, 0.171544, 0.224456, 0.246385, 0.246413, 0.291314, 0.293097, 0.311825, 0.3799, 0.412738, 0.442909, 0.44793, 0.450039, 0.45486, 0.456208, 0.50044, 0.50753, 0.550001, 0.612436, 0.696256, 0.702621, 0.712316, 0.714383, 0.761864, 0.772689, 0.798455, 0.802581, 0.811274, 0.841078, 0.859943, 0.874968, 0.916391, 0.940339, 0.941694, 0.950439,  
0.0290491, 0.0518843, 0.113881, 0.11409, 0.13942, 0.149477, 0.151308, 0.155472, 0.155843, 0.166119, 0.27634, 0.2837, 0.306176, 0.326819, 0.33917, 0.357389, 0.360928, 0.389977, 0.4003, 0.402228, 0.459215, 0.473431, 0.483909, 0.503595, 0.595525, 0.613441, 0.624787, 0.643015, 0.663002, 0.665989, 0.671984, 0.785866, 0.813039, 0.83775, 0.852098, 0.873886, 0.903065, 0.9587, 0.962726, 0.992386,  
0.00599772, 0.00643936, 0.0386718, 0.0499512, 0.0524684, 0.0872253, 0.0886231, 0.102493, 0.115309, 0.120321, 0.123425, 0.137009, 0.257784, 0.265398, 0.274187, 0.316164, 0.366973, 0.38121, 0.392998, 0.413396, 0.431473, 0.494398, 0.541484, 0.574525, 0.579378, 0.584538, 0.650241, 0.67726, 0.723958, 0.729996, 0.743937, 0.803373, 0.858027, 0.906187, 0.918549, 0.922694, 0.949191, 0.951447, 0.953016, 0.954591,  
 
index :
37, 5, 32, 16, 11, 22, 2, 18, 1, 39, 0, 21, 4, 24, 38, 10, 15, 9, 35, 34, 27, 3, 30, 8, 23, 6, 12, 14, 19, 13, 28, 20, 33, 29, 36, 25, 17, 26, 31, 7,  
17, 3, 5, 12, 14, 24, 10, 16, 37, 30, 25, 26, 28, 13, 21, 4, 6, 0, 15, 2, 38, 32, 7, 11, 29, 33, 9, 39, 31, 36, 34, 20, 1, 22, 23, 19, 18, 27, 8, 35,  
3, 4, 29, 38, 36, 21, 10, 20, 25, 14, 7, 16, 11, 2, 30, 22, 31, 34, 9, 28, 17, 5, 6, 8, 19, 26, 27, 39, 23, 33, 1, 32, 12, 35, 24, 37, 18, 0, 15, 13,  
39, 17, 34, 6, 19, 3, 37, 12, 5, 20, 32, 10, 4, 1, 0, 33, 30, 35, 14, 22, 36, 13, 7, 8, 18, 24, 16, 28, 27, 11, 26, 25, 29, 23, 21, 38, 2, 31, 9, 15,

对比numpy中的结果，确信结果无误。

或者使用nvidia官方的cub库也可以实现同样的效果，具体代码如下：

#include<cuda_runtime.h>
#include<iostream>
#include<cub/cub.cuh>


/**
 * Demo driver for cub::DeviceSegmentedRadixSort::SortPairs: sorts `batch`
 * rows of pseudo-random floats, carrying the per-row element index along as
 * the value, and prints inputs, sorted keys and permuted indices.
 * Returns 1 if any CUDA/CUB call fails, 0 otherwise.
 */
int main(){
    using namespace std;
    const int batch = 4;
    const int inputsLen = 40;
    const size_t elemCount = static_cast<size_t>(inputsLen) * batch;

    int* d_offset = nullptr;
    float* inputs = nullptr;
    float* outputs = nullptr;
    int* index = nullptr;
    int* outIndex = nullptr;
    void* d_temp_storage = nullptr;
    size_t temp_storage_bytes = 0;
    srand(100);

    // Reports a failed CUDA call; returns true on success.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if(status == cudaSuccess) return true;
        cerr<<what<<" failed: "<<cudaGetErrorString(status)<<endl;
        return false;
    };
    // cudaFree(nullptr) is a no-op, so this is safe on every exit path.
    auto release = [&]() {
        cudaFree(d_temp_storage);
        cudaFree(inputs);
        cudaFree(outputs);
        cudaFree(index);
        cudaFree(outIndex);
        cudaFree(d_offset);
    };

    if(!ok(cudaMallocManaged(&inputs,sizeof(float)*elemCount),"cudaMallocManaged(inputs)") ||
       !ok(cudaMallocManaged(&outputs,sizeof(float)*elemCount),"cudaMallocManaged(outputs)") ||
       !ok(cudaMallocManaged(&index,sizeof(int)*elemCount),"cudaMallocManaged(index)") ||
       !ok(cudaMallocManaged(&outIndex,sizeof(int)*elemCount),"cudaMallocManaged(outIndex)") ||
       !ok(cudaMallocManaged(&d_offset,sizeof(int)*(batch+1)),"cudaMallocManaged(d_offset)")){
        release();
        return 1;
    }

    cout<<"input rand :"<<endl;
    d_offset[0] = 0;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            inputs[i+j*inputsLen] = (float)rand()/(float)RAND_MAX;
            index[i+j*inputsLen] = i;
            cout<<inputs[i+j*inputsLen]<<", ";
        }
        // Segment j spans [d_offset[j], d_offset[j+1]); with equal-length rows
        // the offsets are 0, inputsLen, 2*inputsLen, ...
        d_offset[j+1] = inputsLen*(j+1);
        cout<<" "<<endl;
    }
    cout<<" "<<endl;

    // First call: d_temp_storage is null, so only the scratch size is returned.
    if(!ok(cub::DeviceSegmentedRadixSort::SortPairs(
               d_temp_storage, temp_storage_bytes,
               inputs, outputs,
               index, outIndex,
               batch * inputsLen, batch,
               d_offset, d_offset + 1),"SortPairs (size query)") ||
       !ok(cudaMalloc(&d_temp_storage, temp_storage_bytes),"cudaMalloc(d_temp_storage)") ||
       // Second call performs the segmented sort.
       !ok(cub::DeviceSegmentedRadixSort::SortPairs(
               d_temp_storage, temp_storage_bytes,
               inputs, outputs,
               index, outIndex,
               batch * inputsLen, batch,
               d_offset, d_offset + 1),"SortPairs (sort)") ||
       !ok(cudaDeviceSynchronize(),"cudaDeviceSynchronize")){
        release();
        return 1;
    }

    cout<<"output :"<<endl;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            cout<<outputs[i+j*inputsLen]<<", ";
        }
        cout<<" "<<endl;
    }
    cout<<" "<<endl;

    cout<<"index :"<<endl;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            cout<<outIndex[i+j*inputsLen]<<", ";
        }
        cout<<" "<<endl;
    }
    cout<<" "<<endl;

    release();
    return 0;
}

更新：

实现了一个基于cuda加速多batch的双线性插值resize，稍加改造可以用于视频流模型输入时的resize方法，具体代码如下：

#include<cstdlib>
#include<fstream>
#include<iostream>
#include<sstream>
#include<string>
#include<string.h>
#include<vector>
#include<getopt.h>
#include<cuda_runtime.h>
#include<opencv2/opencv.hpp>

using namespace std;
using namespace cv;

/**
 * Fetches pixel (x, y) of image `b` from a batch of H x W images stored
 * back to back, row by row.
 *
 * Coordinates outside [0, W) x [0, H) yield a black pixel instead of a
 * memory access.
 *
 * @param input batch of images, H*W uchar3 pixels per image
 * @param x     column
 * @param y     row
 * @param b     image index inside the batch
 * @param H     image height in pixels
 * @param W     image width in pixels
 */
__device__ uchar3 getValues(uchar3* input,int x,int y,int b, int H,int W){
    // x == W and y == H are already outside the image: x == W would alias the
    // first pixel of the next row, y == H the next image or unowned memory.
    if (x < 0 || x >= W || y < 0 || y >= H) return make_uchar3(0,0,0);

    // Widen before multiplying so large batches cannot overflow int.
    const size_t offset = (size_t)b*H*W + (size_t)y*W + x;
    return input[offset];
}

/**
 * Bilinear resize of a batch of images; one thread per output pixel.
 *
 * Thread layout: the x dimension (threadIdx.x + blockDim.x * blockIdx.x)
 * selects the image, the y dimension the flattened output pixel
 * outY * outW + outX. Threads outside the batch or the output image exit
 * without writing.
 *
 * Source coordinates use the pixel-centre convention
 *     src = (out - shift + 0.5) * scale - 0.5
 * and neighbours outside the source image contribute black (see getValues).
 *
 * @param input   batch of source images, oriH*oriW uchar3 pixels each
 * @param output  batch of destination images, outH*outW uchar3 pixels each
 * @param oriH    source height
 * @param oriW    source width
 * @param outH    destination height
 * @param outW    destination width
 * @param batch   number of images
 * @param scaleX  source pixels per destination pixel, horizontally
 * @param scaleY  source pixels per destination pixel, vertically
 * @param shiftX  horizontal offset of the image inside the output, in whole
 *                output pixels (a fractional argument is truncated by the
 *                conversion to int)
 * @param shiftY  vertical offset, same convention as shiftX
 */
__global__ void bilinearKernel( uchar3*input, uchar3*output, int oriH, int oriW, int outH, int outW, int batch, float scaleX, float scaleY, int shiftX,int shiftY ){

    const int b = threadIdx.x + blockDim.x * blockIdx.x;
    const int outXY = threadIdx.y + blockDim.y * blockIdx.y;
    const int outX = outXY % outW;
    const int outY = outXY / outW;
    if (outX>=outW || outY>=outH || b>=batch) return;

    const float srcX = (outX - shiftX + 0.5f) * scaleX - 0.5f;
    const float srcY = (outY - shiftY + 0.5f) * scaleY - 0.5f;

    // floorf, not an int cast: truncation rounds negative coordinates towards
    // zero, which would give fractions outside [0, 1) and negative weights.
    const int x0 = (int)floorf(srcX);
    const int y0 = (int)floorf(srcY);
    const int x1 = x0 + 1;
    const int y1 = y0 + 1;

    // Fractional position inside the cell, both in [0, 1).
    const float fx = srcX - x0;
    const float fy = srcY - y0;

    // Each corner is weighted by the area of the rectangle opposite to it,
    // so a sample sitting exactly on (x0, y0) returns that pixel unchanged.
    const float w00 = (1.0f - fx) * (1.0f - fy);
    const float w10 = fx * (1.0f - fy);
    const float w11 = fx * fy;
    const float w01 = (1.0f - fx) * fy;

    const uchar3 v00 = getValues(input,x0,y0,b,oriH,oriW);
    const uchar3 v10 = getValues(input,x1,y0,b,oriH,oriW);
    const uchar3 v11 = getValues(input,x1,y1,b,oriH,oriW);
    const uchar3 v01 = getValues(input,x0,y1,b,oriH,oriW);

    uchar3 result;
    result.x = static_cast<unsigned char>(w00 * v00.x + w10 * v10.x + w11 * v11.x + w01 * v01.x);
    result.y = static_cast<unsigned char>(w00 * v00.y + w10 * v10.y + w11 * v11.y + w01 * v01.y);
    result.z = static_cast<unsigned char>(w00 * v00.z + w10 * v10.z + w11 * v11.z + w01 * v01.z);

    // Widen before multiplying so large batches cannot overflow int.
    output[(size_t)b*outW*outH + (size_t)outY*outW + outX] = result;
}

/**
 * Splits `str` at every occurrence of `split` and appends the pieces to `res`.
 *
 * Existing elements of `res` are kept. Two adjacent separators produce an
 * empty piece; an empty input or a trailing separator adds nothing extra
 * (std::getline semantics).
 *
 * @param str   text to split
 * @param split separator character
 * @param res   receives the pieces, in order
 */
void stringSplit(std::string str, const char split, std::vector<std::string>& res)
{
	std::istringstream iss(str);	// stream over the input text
	std::string token;		// current piece
	while (std::getline(iss, token, split))	// read up to the next separator
	{
		res.push_back(token);
	}
}

/**
 * Command-line bilinear resize on the GPU.
 *
 * Options: -outH <n> -outW <n> -imgPath <file> -outPath <path>
 *          [-keepRatio] [-keepCenter]
 *
 * If imgPath contains ".jpg" or ".png" the single image is resized and
 * written to outPath. If it contains ".txt" the file is read as a list of
 * image paths (one per line); all images must have the same size and each
 * result is written to outPath + "result_" + <file name>.
 *
 * Returns 0 on success and 1 on invalid arguments, unreadable input or a
 * failed CUDA call.
 */
int main(int argc,char**argv){

    int outH = 0;
    int outW = 0;
    int keepRatio = 0;
    int keepCenter = 0;
    string imgPath;
    string outPath;

    int opt=0,option_index = 0;
    static struct option opts[]=
    {
        {"outH",required_argument,nullptr,'h'},// long name, whether an argument follows, flag pointer, value returned
        {"outW",required_argument,nullptr,'w'},
        {"keepRatio",no_argument,nullptr,'r'},
        {"keepCenter",no_argument,nullptr,'c'},
        {"imgPath",required_argument,nullptr,'i'},
        {"outPath",required_argument,nullptr,'o'},
        {0,0,0,0}
    };

    while((opt=getopt_long_only(argc,argv,"h:w:i:o:rc",opts,&option_index))!=-1)
    {
        switch (opt)
        {
        case 'h':outH = atoi(optarg);break;
        case 'w':outW = atoi(optarg);break;
        case 'i':imgPath = string(optarg);break;
        case 'o':outPath = string(optarg);break;
        case 'r':keepRatio = 1;break;
        case 'c':keepCenter =1;break;

        default:
            break;
        }
    }

    // The output size drives allocations and a division in the kernel.
    if(outH <= 0 || outW <= 0){
        cerr<<"outH and outW must be positive integers"<<endl;
        return 1;
    }

    // Input images and, in the same order, the path each result is written to.
    vector<Mat> imgs;
    vector<string> outNames;

    // Loads one image and checks that it matches the first one, because the
    // kernel takes a single source size for the whole batch.
    auto addImage = [&](const string& path, const string& outName) -> bool {
        Mat img = imread(path);
        if(img.empty()){
            cerr<<"can not read image "<<path<<endl;
            return false;
        }
        if(!imgs.empty() && (img.cols != imgs[0].cols || img.rows != imgs[0].rows)){
            cerr<<"image "<<path<<" differs in size from the first image"<<endl;
            return false;
        }
        // The upload copies rows*cols pixels from one contiguous buffer.
        imgs.push_back(img.isContinuous() ? img : img.clone());
        outNames.push_back(outName);
        return true;
    };

    if(imgPath.find(".jpg") != string::npos || imgPath.find(".png") != string::npos){
        if(!addImage(imgPath,outPath)) return 1;
    }
    else if (imgPath.find(".txt") != string::npos)
    {
        cout<<"read image list "<<imgPath<<endl;
        ifstream inputImageNameList(imgPath);
        if(!inputImageNameList.is_open()){
            cout<<"can not read image list "<<imgPath<<endl;
            return 1;
        }
        string strLine;
        while (getline(inputImageNameList,strLine)){
            vector<string> strList;
            stringSplit(strLine, '/', strList);	// path components; the last one is the file name
            if(strList.empty()) continue;		// blank line
            if(!addImage(strLine,outPath+"result_"+strList.back())) return 1;
        }
        inputImageNameList.close();
    }
    else{
        cerr<<"imgPath must name a .jpg/.png image or a .txt image list"<<endl;
        return 1;
    }

    if(imgs.empty()){
        cerr<<"no input images found in "<<imgPath<<endl;
        return 1;
    }

    const int batch = (int)imgs.size();
    const int oriWidth = imgs[0].cols;
    const int oriHeight = imgs[0].rows;
    const size_t inPixels = (size_t)oriWidth*oriHeight;
    const size_t outPixels = (size_t)outH*outW;

    uchar3* inputs = nullptr;
    uchar3* outputs = nullptr;

    // Reports a failed CUDA call; returns true on success.
    auto cudaOk = [](cudaError_t status, const char* what) -> bool {
        if(status == cudaSuccess) return true;
        cerr<<what<<" failed: "<<cudaGetErrorString(status)<<endl;
        return false;
    };
    // cudaFree(nullptr) is a no-op, so this is safe on every exit path.
    auto release = [&]() {
        cudaFree(inputs);
        cudaFree(outputs);
    };

    if(!cudaOk(cudaMallocManaged(&inputs,sizeof(uchar3)*inPixels*batch),"cudaMallocManaged(inputs)") ||
       !cudaOk(cudaMallocManaged(&outputs,sizeof(uchar3)*outPixels*batch),"cudaMallocManaged(outputs)")){
        release();
        return 1;
    }

    float scaleX = (oriWidth*1.0f / outW);
    float scaleY = (oriHeight*1.0f / outH);
    float shiftX = 0.f ,shiftY = 0.f;
    // Keeping the aspect ratio means using the larger scale on both axes.
    if(keepRatio)scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;
    if(keepRatio && keepCenter){shiftX = (outW - oriWidth/scaleX)/2.f;shiftY = (outH - oriHeight/scaleY)/2.f;}

    for(int b=0;b<batch;++b){
        if(!cudaOk(cudaMemcpy(inputs+inPixels*b,imgs[b].data,sizeof(uchar3)*inPixels,cudaMemcpyHostToDevice),"cudaMemcpy(input)")){
            release();
            return 1;
        }
    }

    // x dimension: one block per image; y dimension: 512 output pixels per block.
    dim3 blockSize(1,512);
    dim3 gridSize(batch,(unsigned int)((outPixels+512-1)/512));
    // The kernel takes integer shifts; the fractional part is dropped here.
    bilinearKernel<<<gridSize,blockSize>>>(inputs,outputs,oriHeight,oriWidth,outH,outW,batch,scaleX,scaleY,(int)shiftX,(int)shiftY);
    if(!cudaOk(cudaGetLastError(),"bilinearKernel launch") ||
       !cudaOk(cudaDeviceSynchronize(),"bilinearKernel execution")){
        release();
        return 1;
    }

    int status = 0;
    Mat outImg(outH,outW,CV_8UC3,Scalar(0,0,0));
    for(int b=0;b<batch;++b){
        if(!cudaOk(cudaMemcpy(outImg.data,outputs+outPixels*b,sizeof(uchar3)*outPixels,cudaMemcpyDeviceToHost),"cudaMemcpy(output)")){
            status = 1;
            break;
        }
        if(!imwrite(outNames[b],outImg)){
            cerr<<"can not write image "<<outNames[b]<<endl;
            status = 1;
        }
    }

    release();
    return status;

}

本文内容由网友自发贡献，版权归原作者所有，本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容，请联系:hwhale#tublm.com(使用前将#替换为@)

学习所得

c

算法

CUDA实现多batch基数排序的相关文章

用于代数简化和求解的 C# 库 [关闭]

Closed 这个问题正在寻求书籍工具软件库等的推荐不满足堆栈溢出指南 help closed questions 目前不接受答案网络上有很多代数求解器和简化器例如 algebra com 上不错的代数求解器和简化器然而我正在
注销租约抛出 InvalidOperationException

我有一个使用插件的应用程序我在另一个应用程序域中加载插件我使用 RemoteHandle 类http www pocketsilicon com post Things That Make My Life Hell Part 1 App
在 DataView 的 RowFilter 中选择 DISTINCT

我试图根据与另一个表的关系缩小 DataView 中的行范围我使用的 RowFilter 如下 dv new DataView myDS myTable id IN SELECT DISTINCT parentID FROM myOthe
复制 std::function 的成本有多高？

While std function是可移动的但在某些情况下不可能或不方便复制它会受到重大处罚吗它是否可能取决于捕获变量的大小如果它是使用 lambda 表达式创建的它依赖于实现吗 std function通常被实现为值语义小缓
单个对象的 Monogame XNA 变换矩阵？

我读过一些解释 XNA Monogame 变换矩阵的教程问题是这些矩阵应用于 SpriteBatch Begin matrix 这意味着所有 Draw 代码都将被转换如何将变换矩阵应用于单个可绘制对象就我而言我想转换滚动背景使其自
获取两个工作日之间的天数差异

这听起来很简单但我不明白其中的意义那么获取两次之间的天数的最简单方法是什么DayOfWeeks当第一个是起点时如果下一个工作日较早则应考虑在下周 The DayOfWeek 枚举 http 20 20 5B1 5D 3a 20htt
使用接口有什么好处？

使用接口有什么用我听说它用来代替多重继承并且还可以用它来完成数据隐藏还有其他优点吗哪些地方使用了接口程序员如何识别需要该接口有什么区别explicit interface implementation and implicit
在 Visual Studio 2010 中从 Fortran 调用 C++ 函数

我想从 Fortran 调用 C 函数为此我在 Visual Studio 2010 中创建了一个 FORTRAN 项目之后我将一个 Cpp 项目添加到该 FORTRAN 项目中当我要构建程序时出现以下错误 Error 1 unr
qdbusxml2cpp 未知类型

在使用 qdbusxml2cpp 程序将以下 xml 转换为 Qt 类时我收到此错误 qdbusxml2cpp c ObjectManager a ObjectManager ObjectManager cpp xml object ma
DbContext 和 ObjectContext 有什么区别

From MSDN 表示工作单元和存储库模式的组合使您能够查询数据库并将更改分组在一起然后将这些更改作为一个单元写回存储 DbContext在概念上类似于ObjectContext 我虽然DbContext只处理与数据库的连接以及针对数
如何检测表单的任何控件的变化？

如何检测 C 中表单的任何控件的更改由于我在一个表单上有许多控件并且如果表单中的任何控件值发生更改我需要禁用按钮我正在寻找一些内置函数事件处理程序属性并且不想为此创建自定义函数不我不知道任何时候都会触发任何事件any控制表
Azure 辅助角色“请求输入之一超出范围”的内部异常。

我在辅助角色中调用 CloudTableClient CreateTableIfNotExist 方法但收到一个异常其中包含请求输入之一超出范围的内部异常我做了一些研究发现这是由于将表命名为非法表名引起的但是我尝试为我的表命
外键与独立关系 - Entity Framework 5 有改进吗？

我读过了several http www ladislavmrnka com 2011 05 foreign key vs independent associations in ef 4 文章和问题 https stackoverflow
动态添加 ASP.Net 控件

我有一个存储过程它根据数据库中存储的记录数返回多行现在我想有一种方法来创建 div 带有包含该行值的控件的标记如果从数据库返回 10 行则 10 div 必须创建标签我有下面的代码来从数据库中获取结果但我不知道如何从这里继续 S
Cmake 链接共享库：包含库中的头文件时“没有这样的文件或目录”

我正在学习使用 CMake 构建库构建库的代码结构如下 include Test hpp ITest hpp interface src Test cpp ITest cpp 在 CMakeLists txt 中我用来构建库的句子是 f
使用 C# 读取 Soap 消息
不同类型指针之间的减法[重复]

这个问题在这里已经有答案了我试图找到两个变量之间的内存距离具体来说我需要找到 char 数组和 int 之间的距离 char data 5 int a 0 printf p n p n data 5 a long int distan
调用堆栈中的“外部代码”是什么意思？

我在 Visual Studio 中调用一个方法并尝试通过检查调用堆栈来调试它其中一些行标记为外部代码这到底是什么意思方法来自 dll已被处决外部代码意味着该dll没有可用的调试信息你能做的就是在Call Stack窗口中单
当从finally中抛出异常时，Catch块不会被评估

出现这个问题的原因是之前在 NET 4 0 中运行的代码在 NET 4 5 中因未处理的异常而失败部分原因是 try finallys 如果您想了解详细信息请阅读更多内容微软连接 https connect microsoft com
从列表中选择项目以求和

我有一个包含数值的项目列表我需要使用这些项目求和我需要你的帮助来构建这样的算法下面是一个用 C 编写的示例描述了我的问题 int sum 21 List

随机推荐

js数据类型之对象object类型(数组与自定义对象)

对象object 数组与自定义对象 JavaScript 中的所有事物都是对象字符串数值数组函数此外 JavaScript 允许自定义对象 JavaScript 提供多个内建对象比如 String Date Array 等等对
CoordinatorLayout+AppBarLayout+CollapsingToolbarLayout+Toolbar实现渐变透明的状态栏

在之前的一篇博文里面我已经说明了CoordinatorLayout使用过程中遇到的问题之后又发现结合CollapsingToolbarLayout使用时的另一个问题 CollapsingToolbarLayout里面的ImageView为
[架构之路-208]- 人人都是产品经理 - 什么是产品经理？产品经理具体是做什么的？

目录一什么是产品经理产品经理具体做什么二产品经理的岗位职责三产品经理的职业规划一什么是产品经理产品经理具体做什么在外行人看来产品经理常常被误认为是经理其实产品经理只是一个岗位名称并不是真正意义上的经理或者说
深入浅出UML类图（五）

实例分析3 售票机控制程序某运输公司决定为新的售票机开发车票销售的控制软件图I给出了售票机的面板示意图以及相关的控制部件图I 售票机面板示意图售票机相关部件的作用如下所述 1 目的地键盘用来输入行程目的地的代码例如 200表示总站
python的几个重要基本概念

1 整数小数布尔值和空值整数 int类型计算机中整数是有最大值的与计算机的存储能力有关即使是这样计算机中的整数值也是很大很大的这一点基本上不需要担心的小数也称浮点数 float类型小数就是带小数点的数包括 1 0 等等
sqlite数据库-------清除数据，数据库文件大小不变解决方法

现象删除表格的全部数据 DELETE FROM Name 原因当在sqlite中删除了大量数据后数据库文件的大小还是那样没有变原因是从Sqlite删除数据后未使用的磁盘空间被添加到一个内在的空闲列表中用于存储你下次插入的数
IP协议号与传输层端口

网络层数据包的包格式里面有个很重要的字段叫做协议号比如在传输层如果是tcp连接那么在网络层ip包里面的协议号就将会有个值是6 如果是udp的话那个值就是17 传输层传输层通过接口关联端口的字段叫做端口应用层协议号是存在于IP
设计模式：桥接模式（c++实现案例）

桥接模式桥接模式是一种结构型设计模式可将业务逻辑或一个大类拆分为不同的层次结构从而能独立地进行开发桥接模式通过将继承改为组合的方式来解决这个问题具体来说就是抽取其中一个维度并使之成为独立的类层次这样就可以在初始类中引用这个新层
2016年蓝桥杯省赛C/C++ A组-寒假作业

题目现在小学的数学题目也不是那么好玩的看看这个寒假作业每个方块代表1 13中的某一个数字但不能重复比如 6 7 13 9 8 1 3 4 12 10 2 5 以及 7 6 13 9 8 1 3 4 12 10 2 5 就算两种解法
ug产品摆正高级技巧_UG NX如何摆正产品零件模型

原标题 UG NX如何摆正产品零件模型有时我们拿到一个产品模型按F8也是一个歪的视图如图那么该如何才能将产品摆正呢其实很简单我们只需要移动下就好了按ctrl t移动对象选中模型变化选项里面运动选择从csys到csys 指
数据结构——串——王道

目录串定义串和线性表的联系及不同串的基本操作存储结构顺序存储链式存储基本操作的实现字符串模式匹配算法朴素模式匹配算法 KMP算法串定义串即字符串 String 是由零个或多个字符组成的有限序列一般记为其中 S
编写cuda代码查看自己机器的显卡信息

代码如下 cudaDeviceProp 是一个结构体一些参数就是cudaDeviceProp 结构体中的参数 cudaGetDeviceProperties cudade 0 这个函数就是实例化第一个参数就是实例化第二个参数是哪一个显
Windows如何开启虚拟化，以安装虚拟机？

参考 Windows如何开启虚拟化以安装虚拟机作者一只青木呀发布时间 2020 08 28 20 58 45 网址 https blog csdn net weixin 45309916 article details 108286
STM32Cube 中的STM32例程EXAMPLE

STM32 Cube 其实包含的大量的程序例子一些模块程序编写不需要再去找其他例子在这就能找到在这里插入图片描述 https img blog csdnimg cn 7862d4aba5b54d4da2db87586b4ae859 p
Spring源码--Bean的加载

bean的加载在AbstractBeanFactory的doGetBean中 protected
Allegro怎么进行走线

1 切换到走线模式点击走线在options中进行走线参数设置如下图
2022蓝桥杯省赛——顺子日期

问题描述本题为填空题只需要算出结果后在代码中使用输出语句将所填结果输出即可小明特别喜欢顺子顺子指的就是连续的三个数字 123 456 等顺子日期指的就是在日期的 yyyymmdd 表示法中存在任意连续的三位数是一个顺子的日期
并行CRC—Verilog代码实现

CRC并行Verilog 1 构造系数方阵F 2 计算CRC系数 3 计算DATA系数 3 1 DATA系数真相 4 合并系数 5 验证 6 其他选项 6 1输入翻转 6 2预设初值 6 3输出翻转取反 7 Matlab生成并行crc代码
vue实现文件批量上传及进度条展示

主要实现功能 1 多文件拖拽上传 2 显示文件上传进度条 3 上传期间还可以继续拖拽上传 4 可以支持手动修改上传并发任务数 5 上传失败自动重试最大重试次数3次 6 上传过程中可以手动取消上传本次使用的是element ui 上传拖
CUDA实现多batch基数排序

CUDA实现多batch基数排序基数排序是具有固定迭代次数的排序算法其通过对最低位到最高位的一一比较对数值排序 GPU版的基数排序将数据分为N个部分并行进行基数排序随后并行规约得到排序后的数组这里实现了一版多batch的基数排序实

CUDA实现多batch基数排序

CUDA实现多batch基数排序

更新：

CUDA实现多batch基数排序 的相关文章

随机推荐

热门标签

CUDA实现多batch基数排序的相关文章