如何解决英特尔 FPGA 的 OpenCL 中本地内存阵列的 RAM 消耗量如此之大
我在 OpenCL 中为 FPGA 板编写了一个简单的代码。我使用 DE10 nano 仅共享板和 Intel SDK 18.1 。主要问题是 Ram 消耗过多。 HTML 报告主要显示本地内存数组中的问题。在 ND 范围内核中,这个问题变得更糟!
(积极的编译器优化:将不必要的存储移除到本地内存)
顺便说一下,在循环分析选项卡中有 II : ~1 并且在详细信息窗格中提到:
(II 是由于以下可停止指令的近似值:加载操作 #no,存储操作 #no)。我怎样才能解决它并达到 II 的确切 1 ?!
代码:
#define IDX(i,j,n) ((i) * (n) + (j))
//#include<stdlib.h>
__kernel void PushKernel( uint column,__global int * restrict height,__global int * restrict excessFlow,__global int * restrict netFlowOutS,__global int * restrict netFlowInT,uint s,uint t,uint row,__global int * restrict residualFlow_up,__global int * restrict residualFlow_down,__global int * restrict residualFlow_right,__global int * restrict residualFlow_left)
{
const uint num_column=6;
const uint num_row=4;
int FlowOutS=*netFlowOutS;
int FlowInT=*netFlowInT;
uint source=s;
uint destination=t;
uint index;
__local int heights_horizontal_cache[6];
__local int excessFlow_horizontal_cache[6];
__local int excessFlow_horizontal_cache_temp[6];
__local int residualFlow_right_cache[6];
__local int residualFlow_left_cache[6];
__local int outS_cache;
//#pragma unroll
//#pragma loop_coalesce
#pragma ivdep
//#pragma ii 1
for(int i=0; i<num_row; i++){index=IDX(i,num_column);
#pragma unroll
#pragma ivdep
for(int j=0; j<num_column; j++){//index=IDX(i,num_column);
heights_horizontal_cache[j]=height[index+j];
excessFlow_horizontal_cache[j]=excessFlow[index+j];
excessFlow_horizontal_cache_temp[j]=0;
residualFlow_right_cache[j]=residualFlow_right[index+j];
residualFlow_left_cache[j]=residualFlow_left[index+j];
outS_cache=0;
}
//mem_fence(CLK_GLOBAL_MEM_FENCE);
///////////////////////////////////////////////////////////////////////push to right
//#pragma ivdep array (residualFlow_right_cache)
#pragma ivdep
#pragma unroll
for(int j=0; j<num_column-1; j++){
//index=IDX(i,num_column);
if(index+j != source && index+j != destination && excessFlow_horizontal_cache[j]>0 && residualFlow_right_cache[j]>0 && heights_horizontal_cache[j]==heights_horizontal_cache[j+1]+1){
int delta = min(excessFlow_horizontal_cache[j],residualFlow_right_cache[j]);
residualFlow_right_cache[j]-=delta;
residualFlow_left_cache[j+1]+=delta;
excessFlow_horizontal_cache[j]-=delta;
//excessFlow_horizontal_cache[j+1]+=delta;
excessFlow_horizontal_cache_temp[j+1]=delta;
if (IDX(i,j+1,num_column) == s) {
//FlowOutS-=delta;
outS_cache=delta;
}
else if (IDX(i,num_column) == t) {
FlowInT+=delta;}
}
///////////////////////////////////////////////////////////////////////results back to global
//mem_fence(CLK_GLOBAL_MEM_FENCE);
}
#pragma unroll
#pragma ivdep
for(int j=0; j<num_column; j++){
excessFlow_horizontal_cache[j]+=excessFlow_horizontal_cache_temp[j];
}
#pragma unroll
#pragma ivdep
for(int j=0; j<num_column; j++){
//index=IDX(i,num_column);
excessFlow[index+j]=excessFlow_horizontal_cache[j];
residualFlow_right[index+j]=residualFlow_right_cache[j];
residualFlow_left[index+j]=residualFlow_left_cache[j];
}
}
FlowOutS-=outS_cache;
*netFlowOutS=FlowOutS;
*netFlowInT=FlowInT;
}
这里是 HTML 报告:
HTML Report
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。