如何解决在 C++ 中使用 Openacc并行化时应该如何设置 copy()?
我正在使用 gcc 编译器。 (g++ -o test testfile.cpp)
我想使用 Openacc 来并行化我的代码,但我对正确使用 #pragma 有点困惑。
以下是我使用并行化的部分。
即使在使用 Openacc 之后,代码也不比以前快。
我猜这与“数据移动”有关。
所以我想我需要在这里使用#pragma acc 数据复制。但我不确定如何正确使用它。
有什么帮助吗? 提前致谢。
#include <iostream>
#include <cstdio>
#include <chrono>
#include <vector>
#include <math.h> // power
#include <cmath> // abs
#include <fstream>
using namespace std;
using namespace chrono;
// Dynamically allocation with values(float)
void dallo_fn(float**** pMat,int Na,int Nd,int Ny) {
float*** Mat = new float** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new float* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new float[Ny];
fill_n(Mat[i][j],Ny,1);
}
}
*pMat = Mat;
}
// Dynamically allocation without values(float)
void dallo_fn0(float**** pMat,int Ny) {
float*** Mat = new float** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new float* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new float[Ny];
}
}
*pMat = Mat;
}
// Dynamically allocation without values(int)
void dallo_fn1(int**** pMat,int Ny) {
int*** Mat = new int** [Na];
for (int i = 0; i < Na; i++) {
Mat[i] = new int* [Nd];
for (int j = 0; j < Nd; j++) {
Mat[i][j] = new int[Ny];
}
}
*pMat = Mat;
}
// Utility function
float utility(float a,float a_f,float d,float d_f,float y,double sig,double psi,double delta,double R) {
float C;
C = y + a - a_f / R - (d_f - (1 - delta) * d);
float result;
if (C > 0) {
result = 1 / (1 - 1 / sig) * pow(pow(C,psi) * pow(d_f,1 - psi),(1 - 1 / sig));
}
else {
result = -999999;
}
return result;
}
int main()
{
float duration;
// Iteration Parameters
double tol = 0.000001;
int itmax = 200;
int H = 15;
// Model Parameters and utility function
double sig = 0.75;
double beta = 0.95;
double psi = 0.5;
double delta = 0.1;
double R = 1 / beta - 0.00215;
// =============== 2. discretizing the state space =========================
// Size of arrays
const int Na = 1 * 91;
const int Nd = 1 * 71;
const int Ny = 3;
// Variables for discretization of state space
const float amin = -2;
const float amax = 7;
const float dmin = 0.01;
const float dmax = 7;
const float ymin = 0.5;
const float ymax = 1.5;
const float Ptrans[3] = { 0.2,0.6,0.2 };
// discretization of state space
float ca = (amax - amin) / (Na - 1.0);
float cd = (dmax - dmin) / (Nd - 1.0);
float cy = (ymax - ymin) / (Ny - 1.0);
float* A = new float[Na];
float* Y = new float[Ny];
float* D = new float[Nd];
for (int i = 0; i < Na; i++) {
A[i] = amin + i * ca;
}
for (int i = 0; i < Nd; i++) {
D[i] = dmin + i * cd;
}
for (int i = 0; i < Ny; i++) {
Y[i] = ymin + i * cy;
}
// === 3. Initial guesses,Variable initialization and Transition matrix ===
// Initial guess for value function
float*** V;
dallo_fn(&V,Na,Nd,Ny);
float*** Vnew;
dallo_fn(&Vnew,Ny);
// Initialization of other variables
float Val[Na][Nd];
float** Vfuture = new float* [Na];
for (int i = 0; i < Na; i++)
{
Vfuture[i] = new float[Nd];
}
float** temphoward = new float* [Na];
for (int i = 0; i < Na; i++)
{
temphoward[i] = new float[Nd];
}
float*** Vhoward;
dallo_fn0(&Vhoward,Ny);
float*** tempdiff;
dallo_fn0(&tempdiff,Ny);
int*** maxposition_a;
dallo_fn1(&maxposition_a,Ny);
int*** maxposition_d;
dallo_fn1(&maxposition_d,Ny);
float** mg_A_v = new float* [Na];
for (int i = 0; i < Na; i++)
{
mg_A_v[i] = new float[Nd];
}
for (int j = 0; j < Nd; j++) {
for (int i = 0; i < Na; i++) {
mg_A_v[i][j] = A[i];
}
}
float** mg_D_v = new float* [Na];
for (int i = 0; i < Na; i++)
{
mg_D_v[i] = new float[Nd];
}
for (int j = 0; j < Nd; j++) {
for (int i = 0; i < Na; i++) {
mg_D_v[i][j] = D[j];
}
}
float***** Uvec = new float**** [Na];
for (int i = 0; i < Na; i++) {
Uvec[i] = new float*** [Nd];
for (int j = 0; j < Nd; j++) {
Uvec[i][j] = new float** [Ny];
for (int k = 0; k < Ny; k++) {
Uvec[i][j][k] = new float* [Na];
for (int l = 0; l < Na; L++) {
Uvec[i][j][k][l] = new float[Nd];
}
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
for (int l = 0; l < Na; L++) {
for (int m = 0; m < Nd; m++) {
Uvec[i][j][k][l][m] = utility(A[i],mg_A_v[l][m],D[j],mg_D_v[l][m],Y[k],sig,psi,delta,R);
}
}
}
}
}
// Value function iteration
int it;
float dif;
float max;
it = 0;
dif = 1;
// ================ 4. Value function iteration ============================
while (dif >= tol && it <= itmax) {
system_clock::time_point start = system_clock::Now();
it = it + 1;
// V = Vnew;
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
V[i][j][k] = Vnew[i][j][k];
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
Vfuture[i][j] = 0;
for (int k = 0; k < Ny; k++) {
Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k]; // + beta * Ptrans[1] * Vnew[i][j][1] + beta * Ptrans[2] * Vnew[i][j][2]; // Why is this different from Vfuture[i][j] += beta * Vnew[i][j][k] * Ptrans[k]; with for k
}
}
}
#pragma acc kernels
for (int a = 0; a < Na; a++) {
for (int b = 0; b < Nd; b++) {
for (int c = 0; c < Ny; c++) {
max = -99999;
for (int d = 0; d < Na; d++) {
for (int e = 0; e < Nd; e++) {
Val[d][e] = Uvec[a][b][c][d][e] + Vfuture[d][e];
if (max < Val[d][e]) {
max = Val[d][e];
maxposition_a[a][b][c] = d;
maxposition_d[a][b][c] = e;
}
}
}
Vnew[a][b][c] = max;
}
}
}
// Howard improvement
for (int h = 0; h < H; h++) {
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
Vhoward[i][j][k] = Vnew[i][j][k];
}
}
}
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
}
}
}
}
// Calculate Diff
dif = -100000;
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
if (tempdiff[i][j][k] > dif) {
dif = tempdiff[i][j][k];
}
}
}
}
system_clock::time_point end = system_clock::Now();
std::chrono::duration<float> sec = end - start;
cout << dif << endl;
cout << it << endl;
cout << sec.count() << endl;
}
}
解决方法
您是否使用该标志来启用 OpenACC,即“-fopenacc”?如果不是,则 OpenACC 指令将被忽略。
请注意,您需要使用更新的 GNU 版本,最好是 10.2,因为 GNU 对 OpenACC 的支持多年来一直在改进。我相信他们的编译器循环依赖性分析仍然缺乏,因此将在设备上按顺序运行“内核”计算区域。因此,就目前而言,您需要坚持使用“平行”区域。如果你真的想使用“内核”,我建议切换到 NVIDIA HPC 编译器(完全公开,我为 NVIDIA 工作)
现在我认为最初的问题只是您没有启用 OpenACC 以及为什么它的速度相同。实际上,如果您尝试卸载它,我希望这种情况会非常慢。除了在设备上按顺序运行“内核”区域外,每个时间步长都需要在主机和设备之间来回传输数据。
最佳策略是在 while 循环之外有一个数据区域,需要时使用“更新”指令来同步阵列的设备和主机副本,然后确保所有计算都已卸载到设备上。
由于您没有发布完整的复制器,我无法测试此代码,因此无法验证它是否正确。但是为了让您了解此策略,我修改了以下代码:
#pragma acc enter data copyin(Vnew[:Na][:Nd][:Ny],Ptrans[:Ny]) \
create(Vfuture[:Na][Nd],V[:Na][:Nd][:Ny],maxposition_a[:Na][:Nd][:Ny],maxposition_b[:Na][:Nd][:Ny]) \
create(Vhoward[:Na][:Nd][:Ny]) // add others here as needed
while (dif >= tol && it <= itmax) {
system_clock::time_point start = system_clock::now();
it = it + 1;
// V = Vnew;
#pragma acc parallel loop collapse(3) default(present)
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
V[i][j][k] = Vnew[i][j][k];
}
}
}
#pragma acc parallel loop collapse(2) default(present)
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
Vfuture[i][j] = 0;
for (int k = 0; k < Ny; k++) {
Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k]; // + beta * Ptrans[1] * Vnew[i][j][1] + beta * Ptrans[2] * Vnew[i][j][2]; // Why is this different from Vfuture[i][j] += beta * Vnew[i][j][k] * Ptrans[k]; with for k
}
}
}
#pragma acc parallel loop collapse(3) default(present)
for (int a = 0; a < Na; a++) {
for (int b = 0; b < Nd; b++) {
for (int c = 0; c < Ny; c++) {
max = -99999;
for (int d = 0; d < Na; d++) {
for (int e = 0; e < Nd; e++) {
Val[d][e] = Uvec[a][b][c][d][e] + Vfuture[d][e];
if (max < Val[d][e]) {
max = Val[d][e];
maxposition_a[a][b][c] = d;
maxposition_d[a][b][c] = e;
}
}
}
Vnew[a][b][c] = max;
}
}
}
// Howard improvement
for (int h = 0; h < H; h++) {
#pragma acc parallel loop collapse(3) default(present)
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
Vhoward[i][j][k] = Vnew[i][j][k];
}
}
}
#pragma acc parallel loop collapse(2) default(present)
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
// I'm unclear why your using a 2D array for temphoward. It's preventing
// parallelzation of the inner loop and could be replaced with a scalar.
temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
+ beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
}
}
}
}
// Calculate Diff
dif = -100000;
#pragma acc parallel loop collapse(3) reduction(max:dif)
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
// Again,why aren't you using a scalar here for tempdiff?
tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
if (tempdiff[i][j][k] > dif) {
dif = tempdiff[i][j][k];
}
}
}
}
system_clock::time_point end = system_clock::now();
std::chrono::duration<float> sec = end - start;
cout << dif << endl;
cout << it << endl;
cout << sec.count() << endl;
}
#pragma acc update self(Vnew[:Na][:Nd][:Ny])
for (int k = 0; k < Ny; k++) {
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
cout << Vnew[i][j][k];
}
cout << '\n';
}
}
}
#pragma acc exit data delete(Vnew,Ptrans,Vfuture,V,maxposition_a,maxposition_b,Vhoward)
// add others here as needed
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。