How to optimize this CUDA kernel

2024-04-23

I've profiled my model, and it looks like this kernel accounts for roughly 2/3 of my total runtime. I've been looking for suggestions on how to optimize it. The code is below.

__global__ void calcFlux(double* concs, double* fluxes, double* dt)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    fluxes[idx]=knowles_flux(idx, concs);
    //fluxes[idx]=flux(idx, concs);
}

__device__ double knowles_flux(int r, double *conc)
{
    double frag_term = 0;
    double flux = 0;
    if (r == ((maxlength)-1))
    {
        //Calculation type : "Max"
        flux = -km*(r)*conc[r]+2*(ka)*conc[r-1]*conc[0];
    }
    else if (r > ((nc)-1))
    {
        //Calculation type : "F"
        //arrSum3(conc, &frag_term, r+1, maxlength-1);
        for (int s = r+1; s < (maxlength); s++)
        {
            frag_term += conc[s];
        }
        flux = -(km)*(r)*conc[r] + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0] + 2*(ka)*conc[r-1]*conc[0];
    }
    else if (r == ((nc)-1))
    {
        //Calculation type : "N"
        //arrSum3(conc, &frag_term, r+1, maxlength-1);
        for (int s = r+1; s < (maxlength); s++)
        {
            frag_term += conc[s];
        }
        flux = (kn)*pow(conc[0],(nc)) + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0];
    }
    else if (r < ((nc)-1))
    {
    //Calculation type : "O"
        flux = 0;
    }
    return flux;
}

Just to give you an idea of why the for loop is a problem: this kernel is launched over an array of roughly maxlength = 9000 elements. For our present purposes, nc ranges from 2 to 6. Below is an illustration of how this kernel handles the incoming array (conc). Five different types of calculation need to be applied to different groups of elements of this array.

Array element : 0 1 2 3 4 5 6 7 8 9 ... 8955 8956 8957 8958 8959 8960
Type of calc  : M O O O O O N F F F ...   F   F    F    F    F   Max

The potential problems I've been trying to tackle are the branch divergence from the quadruple if-else, and the for loop.

My idea for handling the branch divergence is to break this kernel up into four separate device functions or kernels that each handle one region, and launch them simultaneously. I'm not sure whether that's any better than just letting the branches diverge, which, if I remember correctly, causes the four calculation types to run serially.
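Here's a rough sketch of what I mean, just to make the idea concrete (fluxF and fluxN are hypothetical per-region kernels that I don't actually have yet; the region bounds just follow the picture above, and the "Max" and "O" regions would get the same treatment):

// One kernel per calculation type, each launched on its own stream so the
// launches can overlap instead of diverging inside a single kernel.
__global__ void fluxF(double* concs, double* fluxes, int first, int count);
__global__ void fluxN(double* concs, double* fluxes, int ncIdx);

void launchByRegion(double* d_concs, double* d_fluxes, int ncVal, int maxLen)
{
    cudaStream_t sF, sN;
    cudaStreamCreate(&sF);
    cudaStreamCreate(&sN);

    int firstF = ncVal;                     // "F" covers r = ncVal .. maxLen-2
    int countF = (maxLen - 1) - ncVal;      // the element at maxLen-1 is "Max"

    fluxF<<<(countF + 255) / 256, 256, 0, sF>>>(d_concs, d_fluxes, firstF, countF);
    fluxN<<<1, 1, 0, sN>>>(d_concs, d_fluxes, ncVal - 1);   // the single "N" element

    cudaDeviceSynchronize();
    cudaStreamDestroy(sF);
    cudaStreamDestroy(sN);
}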

To deal with the for loop, you'll notice the commented-out arrSum3 function, which I wrote based on a (probably poorly written) parallel-reduction kernel I'd written earlier. Using it in place of the for loop dramatically increased my runtime. I feel like there's a clever way to accomplish what the for loop is doing, but I'm just not that clever, and my advisor is tired of me "wasting time" thinking about it.

Any help is appreciated.

EDIT

The full code is here: https://stackoverflow.com/q/21170233/1218689


Assuming sgn() and abs() are not derived from "if" and "else":

__device__ double knowles_flux(int r, double *conc)
{
    double frag_term = 0;
    double flux = 0;
    double fluxA = 0, fluxB = 0, fluxC = 0; //per-region contributions; at most one is non-zero

        //Calculation type : "Max"
        //no divergence
        //should prefer 20-30 extra cycles instead of a branching.
        //may not be good for CPU
        fluxA = (1-abs(sgn(r-(maxlength-1)))) * (-km*(r)*conc[r]+2*(ka)*conc[r-1]*conc[0]);
        //is zero if r and maxlength-1 are not equal

        //always compute this in shared memory so work will be equal for all cores, no divergence

        // you should divide kernel into several pieces to do a reduction
        // but if you dont want that, then you can try :
        for (int s = 0;s<someLimit ; s++) // all count for same number of cycles so no divergence
        {
            frag_term += conc[s] * (   abs(sgn( s-maxlength ))*sgn(1- sgn( s-maxlength ))  )* (      sgn(1+sgn(s-(r+1)))  );
        }
         //but you can make easier of this using "add and assign" operation
         // in shared memory (__shared__ in CUDA)
         //  global conc[] to local concL[] memory(using all cores)(100 cycles)
         // for(others from zero to upper_limit)
         // if(localID==0)
         // {
         //    frag_termL[0]+=concL[s]             // local to local (10 cycles/assign.)
         //    frag_termL[0+others]=frag_termL[0]; // local to local (10 cycles/assign.)
         // }  -----> uses nearly same number of cycles but uses much less energy
         //using single core (2000 instr. with single core vs 1000 instr. with 2k cores)
         // in local memory, then copy it to private registers accordingly using all cores



        //Calculation type : "F"

        fluxB = (  abs(sgn(r-(nc-1)))*sgn(1+sgn(r-(nc-1)))   )*(-(km)*(r)*conc[r] + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0] + 2*(ka)*conc[r-1]*conc[0]);
        // is zero if r is not greater than (nc-1)



        //Calculation type : "N"


        fluxC = (   1-abs(sgn(r-(nc-1)))   )*((kn)*pow(conc[0],(nc)) + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0]);
        //zero if r and nc-1 are not equal



    flux=fluxA+fluxB+fluxC; //only one of these can be different than zero

    flux=flux*(   sgn(1+sgn(r-(nc-1)))   );
    //zero if r < (nc-1), i.e. calculation type "O"

    return flux;
}
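The code above assumes a branchless sgn() is already available (CUDA's math library does not provide one that returns -1/0/+1). A minimal sketch of the kind of helper I mean, written with comparisons rather than branches:

__device__ int sgn(double x)
{
    // branchless sign: -1 for negative, 0 for zero, +1 for positive
    // (the comparisons compile to predicated/select instructions, not jumps)
    return (x > 0.0) - (x < 0.0);
}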

OK, let me unpack this a bit:

if(a>b) x+=y;

can be seen as

if a-b is negative, sgn(a-b) is -1
then adding 1 to that -1 gives zero ==> this covers the lower part of the comparison (a<b)
x+= y*(sgn(a-b) +1) adds y*0, so x is unchanged if a<b (not a>b)

if(a-b) is zero, sgn(a-b) is zero
then we should multiply the upper solution with sgn(a-b) too!
x+= y*(sgn(a-b) +1)*sgn(a-b)
means
x+= y*( 0  +  1) * 0 = 0           a==b is satisfied too!

lets check what happens if a>b
x+= y*(sgn(a-b) +1)*sgn(a-b)
x+= y*(1 +1)*1  ==> y*2 is not acceptable, needs another sgn on the outer side

x+= y* sgn((sgn(a-b)+1)*sgn(a-b))

x+= y* sgn((1+1)*1)

x+= y* sgn(2)   

x+= y only when a is greater than b
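Packaged as a small device helper (an illustrative sketch built from the expression above, using the sgn() sketched earlier; the name gtMask is mine):

__device__ double gtMask(double a, double b)
{
    // 1.0 only when a > b, 0.0 when a <= b, with no branch
    double s = sgn(a - b);
    return (double)sgn((s + 1.0) * s);
}

// usage:  x += y * gtMask(a, b);   // adds y only when a > b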

When you have too many instances of

abs(sgn(r-(nc-1)))

then you can reuse it as

tmp=abs(sgn(r-(nc-1)));

.....  *tmp*(tmp-1) ....
...... +tmp*zxc[s] .....
......  ......
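Applied to the kernel above, that factoring looks something like this (just an illustrative rewrite of the fluxB and fluxC lines, no new math):

double tmp = abs(sgn(r-(nc-1)));   // 1 if r != nc-1, 0 if r == nc-1

//Calculation type : "F"  (r > nc-1)
fluxB = tmp*sgn(1+sgn(r-(nc-1))) * (-(km)*(r)*conc[r] + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0] + 2*(ka)*conc[r-1]*conc[0]);

//Calculation type : "N"  (r == nc-1)
fluxC = (1-tmp) * ((kn)*pow(conc[0],(nc)) + 2*(km)*frag_term - 2*(ka)*conc[r]*conc[0]);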

This reduces the total cycle count further! Register access is in the TB/s range, so it shouldn't be the problem. Do the same for global accesses:

tmpGlobal= conc[r];

...... tmpGlobal * tmp .....
.... tmpGlobal +x -y ....

All the private registers do their operations at TB/s rates.

Warning: reading from conc[-1] won't cause any error as long as the actual address of conc[0] isn't literally zero, since whatever is read from conc[-1] gets multiplied by zero anyway. But writing there is dangerous.

If you need to keep away from conc[-1] anyway, you can also take the absolute value of the index! Look:

 tmp=conc[i-1] becomes tmp=conc[abs((i-1))], which always reads from a non-negative index; the value gets multiplied by zero later anyway. This is lower-bound protection.
 You can apply upper-bound protection too, it just adds even more cycles.

If handling pure scalar values when accessing conc[r-1] and conc[r+1] isn't fast enough, consider vector shuffle operations. Shuffling values between vector elements is faster than copying them to another core/thread through local memory.
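In CUDA the vector-shuffle equivalent is a warp shuffle. Here is a sketch of a helper that fetches the neighbouring conc values from adjacent lanes; the helper is my own illustration, and it assumes CUDA 9+ (__shfl_*_sync) with consecutive r values mapped to consecutive lanes:

__device__ void loadNeighbours(const double* conc, int r, int maxLen,
                               double& c, double& cLeft, double& cRight)
{
    c      = conc[r];
    cLeft  = __shfl_up_sync(0xffffffff, c, 1);    // lane i receives lane i-1's conc[r]
    cRight = __shfl_down_sync(0xffffffff, c, 1);  // lane i receives lane i+1's conc[r]

    // Lanes sitting on a warp boundary have their neighbour in another warp,
    // so fall back to a plain global load for those two lanes only.
    int lane = threadIdx.x & 31;
    if (lane == 0  && r > 0)          cLeft  = conc[r - 1];
    if (lane == 31 && r < maxLen - 1) cRight = conc[r + 1];
}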
