在 C++ 中使用 OpenACC 并行化时，应该如何设置 copy()？

问题：在 C++ 中使用 OpenACC 并行化时，应该如何设置 copy()？

我正在使用 gcc 编译器。 (g++ -o test testfile.cpp)

我想使用 Openacc 来并行化我的代码，但我对正确使用 #pragma 有点困惑。

以下是我使用并行化的部分。

即使在使用 Openacc 之后，代码也不比以前快。

我猜这与“数据移动”有关。

所以我想我需要在这里使用 #pragma acc data copy。但我不确定如何正确使用它。

有人能帮忙吗？提前致谢。

#include <iostream>
#include <cstdio>
#include <chrono>
#include <vector>
#include <math.h>       // power
#include <cmath>        // abs
#include <fstream>

using namespace std;
using namespace chrono;

// Allocates a jagged Na x Nd x Ny array of floats, sets every element to 1,
// and hands the root pointer back through pMat. The caller owns the memory.
void dallo_fn(float**** pMat,int Na,int Nd,int Ny) {
    float*** cube = new float** [Na];
    for (int a = 0; a < Na; ++a) {
        float** plane = new float* [Nd];
        for (int d = 0; d < Nd; ++d) {
            float* row = new float[Ny];
            fill_n(row,Ny,1);
            plane[d] = row;
        }
        cube[a] = plane;
    }
    *pMat = cube;
}

// Allocates a jagged Na x Nd x Ny array of floats WITHOUT initialising the
// elements and hands the root pointer back through pMat.
//
// pMat       : receives the root pointer of the new array
// Na, Nd, Ny : extents of the first, second and third index
//
// The extents are explicit parameters (same order as dallo_fn) because the
// function cannot see the sizes declared inside main. The caller owns the
// memory and must write every element before reading it.
void dallo_fn0(float**** pMat,int Na,int Nd,int Ny) {
    float*** Mat = new float** [Na];
    for (int i = 0; i < Na; i++) {
        Mat[i] = new float* [Nd];
        for (int j = 0; j < Nd; j++) {
            Mat[i][j] = new float[Ny];
        }
    }
    *pMat = Mat;
}

// Allocates a jagged Na x Nd x Ny array of ints WITHOUT initialising the
// elements and hands the root pointer back through pMat.
//
// pMat       : receives the root pointer of the new array
// Na, Nd, Ny : extents of the first, second and third index
//
// The extents are explicit parameters (same order as dallo_fn) because the
// function cannot see the sizes declared inside main. The caller owns the
// memory and must write every element before reading it.
void dallo_fn1(int**** pMat,int Na,int Nd,int Ny) {
    int*** Mat = new int** [Na];
    for (int i = 0; i < Na; i++) {
        Mat[i] = new int* [Nd];
        for (int j = 0; j < Nd; j++) {
            Mat[i][j] = new int[Ny];
        }
    }
    *pMat = Mat;
}

// Utility function.
// Forms the residual  y + a - a_f / R - (d_f - (1 - delta) * d).
// A residual that is not strictly positive yields the penalty -999999;
// otherwise the result is
//   1 / (1 - 1/sig) * (residual^psi * d_f^(1 - psi))^(1 - 1/sig).
float utility(float a,float a_f,float d,float d_f,float y,double sig,double psi,double delta,double R) {
    const float residual = y + a - a_f / R - (d_f - (1 - delta) * d);
    if (!(residual > 0)) {
        return -999999;
    }
    const double exponent = 1 - 1 / sig;
    const float value = 1 / exponent * pow(pow(residual,psi) * pow(d_f,1 - psi),exponent);
    return value;
}


// Runs value function iteration with Howard improvement on a discretised
// (a, d, y) state space. Every outer iteration prints the largest absolute
// change of the value function, the iteration counter and the elapsed time
// in seconds.
int main()
{
    // Iteration Parameters
    double tol = 0.000001;
    int itmax = 200;
    int H = 15;

    // Model Parameters and utility function
    double sig = 0.75;
    double beta = 0.95;
    double psi = 0.5;
    double delta = 0.1;
    double R = 1 / beta - 0.00215;

    // =============== 2. discretizing the state space =========================

    // Size of arrays
    const int Na = 1 * 91;
    const int Nd = 1 * 71;
    const int Ny = 3;

    // Variables for discretization of state space
    const float amin = -2;
    const float amax = 7;
    const float dmin = 0.01;
    const float dmax = 7;
    const float ymin = 0.5;
    const float ymax = 1.5;
    const float Ptrans[3] = { 0.2,0.6,0.2 };

    // discretization of state space: evenly spaced grids
    float ca = (amax - amin) / (Na - 1.0);
    float cd = (dmax - dmin) / (Nd - 1.0);
    float cy = (ymax - ymin) / (Ny - 1.0);

    float* A = new float[Na];
    float* Y = new float[Ny];
    float* D = new float[Nd];

    for (int i = 0; i < Na; i++) {
        A[i] = amin + i * ca;
    }
    for (int i = 0; i < Nd; i++) {
        D[i] = dmin + i * cd;
    }
    for (int i = 0; i < Ny; i++) {
        Y[i] = ymin + i * cy;
    }

    // === 3. Initial guesses,Variable initialization and Transition matrix ===

    // Initial guess for value function (both arrays start at 1 everywhere)
    float*** V;
    dallo_fn(&V,Na,Nd,Ny);
    float*** Vnew;
    dallo_fn(&Vnew,Na,Nd,Ny);

    // Initialization of other variables
    float** Vfuture = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        Vfuture[i] = new float[Nd];
    }
    float** temphoward = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        temphoward[i] = new float[Nd];
    }

    // Work arrays: each is fully written before it is read, so they are
    // allocated without initial values.
    float*** Vhoward;
    dallo_fn0(&Vhoward,Na,Nd,Ny);
    float*** tempdiff;
    dallo_fn0(&tempdiff,Na,Nd,Ny);
    int*** maxposition_a;
    dallo_fn1(&maxposition_a,Na,Nd,Ny);
    int*** maxposition_d;
    dallo_fn1(&maxposition_d,Na,Nd,Ny);

    // mg_A_v[i][j] = A[i] and mg_D_v[i][j] = D[j]: the two choice grids
    // spread over the (Na x Nd) choice space.
    float** mg_A_v = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        mg_A_v[i] = new float[Nd];
    }
    for (int j = 0; j < Nd; j++) {
        for (int i = 0; i < Na; i++) {
            mg_A_v[i][j] = A[i];
        }
    }

    float** mg_D_v = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        mg_D_v[i] = new float[Nd];
    }
    for (int j = 0; j < Nd; j++) {
        for (int i = 0; i < Na; i++) {
            mg_D_v[i][j] = D[j];
        }
    }

    // Uvec[i][j][k][l][m]: utility of choosing (l, m) from state (i, j, k).
    // It does not change between iterations, so it is tabulated once.
    float***** Uvec = new float**** [Na];
    for (int i = 0; i < Na; i++) {
        Uvec[i] = new float*** [Nd];
        for (int j = 0; j < Nd; j++) {
            Uvec[i][j] = new float** [Ny];
            for (int k = 0; k < Ny; k++) {
                Uvec[i][j][k] = new float* [Na];
                for (int l = 0; l < Na; l++) {
                    Uvec[i][j][k][l] = new float[Nd];
                }
            }
        }
    }

    for (int i = 0; i < Na; i++) {
        for (int j = 0; j < Nd; j++) {
            for (int k = 0; k < Ny; k++) {
                for (int l = 0; l < Na; l++) {
                    for (int m = 0; m < Nd; m++) {
                        Uvec[i][j][k][l][m] = utility(A[i],mg_A_v[l][m],D[j],mg_D_v[l][m],Y[k],sig,psi,delta,R);
                    }
                }
            }
        }
    }

    // Value function iteration
    int it;
    float dif;
    it = 0;
    dif = 1;

    // ================ 4. Value function iteration ============================

    while (dif >= tol && it <= itmax) {
        system_clock::time_point start = system_clock::now();
        it = it + 1;
        // V = Vnew;
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    V[i][j][k] = Vnew[i][j][k];
                }
            }
        }

        // Discounted continuation value, weighted by Ptrans over the third index.
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                Vfuture[i][j] = 0;
                for (int k = 0; k < Ny; k++) {
                    Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k];
                }
            }
        }

        // Maximisation step. The running maximum and the candidate value are
        // declared inside the (a, b, c) iteration so that iterations share no
        // scalar state; the directive only takes effect when the compiler is
        // invoked with OpenACC enabled.
        #pragma acc kernels
        for (int a = 0; a < Na; a++) {
            for (int b = 0; b < Nd; b++) {
                for (int c = 0; c < Ny; c++) {
                    float best = -99999;
                    for (int d = 0; d < Na; d++) {
                        for (int e = 0; e < Nd; e++) {
                            const float val = Uvec[a][b][c][d][e] + Vfuture[d][e];
                            if (best < val) {
                                best = val;
                                maxposition_a[a][b][c] = d;
                                maxposition_d[a][b][c] = e;
                            }
                        }
                    }
                    Vnew[a][b][c] = best;
                }
            }
        }

        // Howard improvement: re-evaluate H times with the maximisers held fixed.
        for (int h = 0; h < H; h++) {
            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        Vhoward[i][j][k] = Vnew[i][j][k];
                    }
                }
            }

            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
                            + beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
                            + beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
                        Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
                    }
                }
            }
        }


        // Calculate Diff: largest absolute change between V and Vnew
        dif = -100000;
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
                    if (tempdiff[i][j][k] > dif) {
                        dif = tempdiff[i][j][k];
                    }
                }
            }
        }

        system_clock::time_point end = system_clock::now();
        std::chrono::duration<float> sec = end - start;


        cout << dif << endl;
        cout << it << endl;
        cout << sec.count() << endl;
    }

}

解决方法

您是否使用了启用 OpenACC 的编译选项，即 “-fopenacc”？如果没有，OpenACC 指令会被忽略。

请注意，您需要使用较新的 GNU 编译器版本，最好是 10.2，因为 GNU 对 OpenACC 的支持这些年来一直在改进。我认为他们的编译器在循环依赖分析方面仍有欠缺，因此 “kernels” 计算区域会在设备上串行执行。所以就目前而言，您需要坚持使用 “parallel” 区域。如果您确实想使用 “kernels”，我建议改用 NVIDIA HPC 编译器（利益声明：我在 NVIDIA 工作）。

我认为最初的问题只是您没有启用 OpenACC，这也解释了为什么速度没有变化。实际上，如果真的把这段代码卸载到设备上，我预计它会非常慢：除了 “kernels” 区域在设备上串行执行之外，每次迭代都需要在主机和设备之间来回传输数据。

最佳策略是在 while 循环之外设置一个数据区域，在需要时使用 “update” 指令同步数组在设备端和主机端的副本，并确保所有计算都已卸载到设备上。

由于您没有发布完整的可复现示例（reproducer），我无法测试这段代码，因此不能保证它是正确的。不过为了说明这一策略，我对代码做了如下修改：

// Unstructured data region around the whole iteration: inputs are copied to
// the device once and scratch arrays are only created there, so no array is
// transferred inside the while loop. Every array touched by a compute region
// below must be listed here because the regions use default(present).
#pragma acc enter data copyin(Vnew[:Na][:Nd][:Ny],Ptrans[:Ny],Uvec[:Na][:Nd][:Ny][:Na][:Nd]) \
        create(Vfuture[:Na][:Nd],V[:Na][:Nd][:Ny],maxposition_a[:Na][:Nd][:Ny],maxposition_d[:Na][:Nd][:Ny]) \
        create(Vhoward[:Na][:Nd][:Ny]) // add others here as needed

while (dif >= tol && it <= itmax) {
        system_clock::time_point start = system_clock::now();
        it = it + 1;
        // V = Vnew;
        #pragma acc parallel loop collapse(3) default(present)
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    V[i][j][k] = Vnew[i][j][k];
                }
            }
        }

        // Discounted continuation value, weighted by Ptrans over the third index.
        #pragma acc parallel loop collapse(2) default(present)
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                Vfuture[i][j] = 0;
                for (int k = 0; k < Ny; k++) {
                    Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k];
                }
            }
        }

        // Maximisation step. The running maximum and the candidate value are
        // scalars declared inside the collapsed iteration, so each (a, b, c)
        // iteration has its own copy; a shared `max` / `Val` would be written
        // by all parallel iterations at once.
        #pragma acc parallel loop collapse(3) default(present)
        for (int a = 0; a < Na; a++) {
            for (int b = 0; b < Nd; b++) {
                for (int c = 0; c < Ny; c++) {
                    float best = -99999;
                    for (int d = 0; d < Na; d++) {
                        for (int e = 0; e < Nd; e++) {
                            const float val = Uvec[a][b][c][d][e] + Vfuture[d][e];
                            if (best < val) {
                                best = val;
                                maxposition_a[a][b][c] = d;
                                maxposition_d[a][b][c] = e;
                            }
                        }
                    }
                    Vnew[a][b][c] = best;
                }
            }
        }

        // Howard improvement
        for (int h = 0; h < H; h++) {
            #pragma acc parallel loop collapse(3) default(present)
            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        Vhoward[i][j][k] = Vnew[i][j][k];
                    }
                }
            }

            // A scalar replaces the 2D temphoward array: the array made the
            // k iterations write the same element and blocked parallelising
            // the inner loop; with a scalar all three loops can be collapsed.
            #pragma acc parallel loop collapse(3) default(present)
            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        const int pa = maxposition_a[i][j][k];
                        const int pd = maxposition_d[i][j][k];
                        const float continuation = beta * Vhoward[pa][pd][0] * Ptrans[0]
                            + beta * Vhoward[pa][pd][1] * Ptrans[1]
                            + beta * Vhoward[pa][pd][2] * Ptrans[2];
                        Vnew[i][j][k] = continuation + Uvec[i][j][k][pa][pd];
                    }
                }
            }
        }


        // Calculate Diff: max-reduction over the absolute change. A scalar
        // replaces the tempdiff array, so no extra device array is needed.
        dif = -100000;
        #pragma acc parallel loop collapse(3) default(present) reduction(max:dif)
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    const float change = abs(V[i][j][k] - Vnew[i][j][k]);
                    if (change > dif) {
                        dif = change;
                    }
                }
            }
        }

        system_clock::time_point end = system_clock::now();
        std::chrono::duration<float> sec = end - start;


        cout << dif << endl;
        cout << it << endl;
        cout << sec.count() << endl;
    }

// Bring the converged value function back to the host before printing it.
#pragma acc update self(Vnew[:Na][:Nd][:Ny])
    for (int k = 0; k < Ny; k++) {
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                cout << Vnew[i][j][k] << ' ';
            }
            cout << '\n';
        }
    }

// Release the device copies. This directive has to sit inside main, before
// its closing brace, and name the same arrays as the enter data directive.
#pragma acc exit data delete(Vnew[:Na][:Nd][:Ny],Ptrans[:Ny],Uvec[:Na][:Nd][:Ny][:Na][:Nd]) \
        delete(Vfuture[:Na][:Nd],V[:Na][:Nd][:Ny],maxposition_a[:Na][:Nd][:Ny],maxposition_d[:Na][:Nd][:Ny]) \
        delete(Vhoward[:Na][:Nd][:Ny])
// add others here as needed

}