我可以使用 GPU 在 Visual Studio 中并行化 C++ 代码吗?

如何解决我可以使用 GPU 在 Visual Studio 中并行化 C++ 代码吗?

我正在尝试使用 GPU 在 Visual Studio C++ 中并行化我的代码。

目前,我使用 OpenMP 来使用 CPU 并行化。

但我正在考虑使用 GPU 并行化,因为我认为如果我在计算中使用更大尺寸的数组会更快。

以下是我正在处理的代码。我只使用了一次并行化。

我发现为了使用 GPU 并行化,我需要使用 OpenCL 或 Cuda。

OpenCL 和 Cuda 似乎需要更改整个代码。所以我想知道是否有一种方法可以在不更改整个代码的情况下使用 GPU 并行化(也许只是更改 '#pragma omp parallel for')

#include <iostream>
#include <cstdio>
#include <chrono>
#include <vector>
#include <math.h>       // power
#include <cmath>        // abs
#include <fstream>
#include <omp.h>

using namespace std;
using namespace chrono;

// Dynamically allocation with values(float)
void dallo_fn(float**** pMat,int Na,int Nd,int Ny) {
    float*** Mat = new float** [Na];
    for (int i = 0; i < Na; i++) {
        Mat[i] = new float* [Nd];
        for (int j = 0; j < Nd; j++) {
            Mat[i][j] = new float[Ny];
            fill_n(Mat[i][j],Ny,1);
        }
    }
    *pMat = Mat;
}

// Dynamically allocation without values(float)
void dallo_fn0(float**** pMat,int Ny) {
    float*** Mat = new float** [Na];
    for (int i = 0; i < Na; i++) {
        Mat[i] = new float* [Nd];
        for (int j = 0; j < Nd; j++) {
            Mat[i][j] = new float[Ny];
        }
    }
    *pMat = Mat;
}

// Dynamically allocation without values(int)
void dallo_fn1(int**** pMat,int Ny) {
    int*** Mat = new int** [Na];
    for (int i = 0; i < Na; i++) {
        Mat[i] = new int* [Nd];
        for (int j = 0; j < Nd; j++) {
            Mat[i][j] = new int[Ny];
        }
    }
    *pMat = Mat;
}

// Utility function
float utility(float a,float a_f,float d,float d_f,float y,double sig,double psi,double delta,double R) {
    float C;
    C = y + a - a_f / R - (d_f - (1 - delta) * d);
    float result;
    if (C > 0) {
        result = 1 / (1 - 1 / sig) * pow(pow(C,psi) * pow(d_f,1 - psi),(1 - 1 / sig));
    }
    else {
        result = -999999;
    }
    return result;
}


int main()
{
    
#if defined _OPENMP
    omp_set_num_threads(8);
#endif

    float duration;

    // Iteration Parameters
    double tol = 0.000001;
    int itmax = 200;
    int H = 15;

    // Model Parameters and utility function
    double sig = 0.75;
    double beta = 0.95;
    double psi = 0.5;
    double delta = 0.1;
    double R = 1 / beta - 0.00215;

    // =============== 2. Discretizing the state space =========================

    // Size of arrays
    const int Na = 1 * 91;
    const int Nd = 1 * 71;
    const int Ny = 3;

    // Variables for discretization of state space
    const float amin = -2;
    const float amax = 7;
    const float dmin = 0.01;
    const float dmax = 7;
    const float ymin = 0.5;
    const float ymax = 1.5;
    const float Ptrans[3] = { 0.2,0.6,0.2 };

    // Discretization of state space
    float ca = (amax - amin) / (Na - 1.0);
    float cd = (dmax - dmin) / (Nd - 1.0);
    float cy = (ymax - ymin) / (Ny - 1.0);

    float* A = new float[Na];
    float* Y = new float[Ny];
    float* D = new float[Nd];

    for (int i = 0; i < Na; i++) {
        A[i] = amin + i * ca;
    }
    for (int i = 0; i < Nd; i++) {
        D[i] = dmin + i * cd;
    }
    for (int i = 0; i < Ny; i++) {
        Y[i] = ymin + i * cy;
    }

    // === 3. Initial guesses,Variable initialization and Transition matrix ===

    // Initial guess for value function
    float*** V;
    dallo_fn(&V,Na,Nd,Ny);
    float*** Vnew;
    dallo_fn(&Vnew,Ny);

    // Initialization of other variables
    float Val[Na][Nd];
    float** Vfuture = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        Vfuture[i] = new float[Nd];
    }
    float** temphoward = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        temphoward[i] = new float[Nd];
    }

    float*** Vhoward;
    dallo_fn0(&Vhoward,Ny);
    float*** tempdiff;
    dallo_fn0(&tempdiff,Ny);
    int*** maxposition_a;
    dallo_fn1(&maxposition_a,Ny);
    int*** maxposition_d;
    dallo_fn1(&maxposition_d,Ny);

    float** mg_A_v = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        mg_A_v[i] = new float[Nd];
    }
    for (int j = 0; j < Nd; j++) {
        for (int i = 0; i < Na; i++) {
            mg_A_v[i][j] = A[i];
        }
    }

    float** mg_D_v = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        mg_D_v[i] = new float[Nd];
    }
    for (int j = 0; j < Nd; j++) {
        for (int i = 0; i < Na; i++) {
            mg_D_v[i][j] = D[j];
        }
    }

    float***** Uvec = new float**** [Na];
    for (int i = 0; i < Na; i++) {
        Uvec[i] = new float*** [Nd];
        for (int j = 0; j < Nd; j++) {
            Uvec[i][j] = new float** [Ny];
            for (int k = 0; k < Ny; k++) {
                Uvec[i][j][k] = new float* [Na];
                for (int l = 0; l < Na; l++) {
                    Uvec[i][j][k][l] = new float[Nd];
                }
            }
        }
    }

    for (int i = 0; i < Na; i++) {
        for (int j = 0; j < Nd; j++) {
            for (int k = 0; k < Ny; k++) {
                for (int l = 0; l < Na; l++) {
                    for (int m = 0; m < Nd; m++) {
                        Uvec[i][j][k][l][m] = utility(A[i],mg_A_v[l][m],D[j],mg_D_v[l][m],Y[k],sig,psi,delta,R);
                    }
                }
            }
        }
    }

    // Value function iteration
    int it;
    float dif;
    float max;
    it = 0;
    dif = 1;

    // ================ 4. Value function iteration ============================

    while (dif >= tol && it <= itmax) {
        system_clock::time_point start = system_clock::now();
        it = it + 1;
        // V = Vnew;
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    V[i][j][k] = Vnew[i][j][k];
                }
            }
        }

        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                Vfuture[i][j] = 0;
                for (int k = 0; k < Ny; k++) {
                    Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k]; // + beta * Ptrans[1] * Vnew[i][j][1] + beta * Ptrans[2] * Vnew[i][j][2]; // Why is this different from Vfuture[i][j] += beta * Vnew[i][j][k] * Ptrans[k]; with for k
                }
            }
        }
        #pragma omp parallel for private(Val)     // USE PARALLELIZATION
        for (int a = 0; a < Na; a++) {
            for (int b = 0; b < Nd; b++) {
                for (int c = 0; c < Ny; c++) {
                    max = -99999;
                    for (int d = 0; d < Na; d++) {
                        for (int e = 0; e < Nd; e++) {
                            Val[d][e] = Uvec[a][b][c][d][e] + Vfuture[d][e];
                            if (max < Val[d][e]) {
                                max = Val[d][e];
                                maxposition_a[a][b][c] = d;
                                maxposition_d[a][b][c] = e;
                            }
                        }
                    }
                    Vnew[a][b][c] = max;
                }
            }
        }

        // Howard improvement
        for (int h = 0; h < H; h++) {
            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        Vhoward[i][j][k] = Vnew[i][j][k];
                    }
                }
            }

            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
                            + beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
                            + beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
                        Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
                    }
                }
            }
        }


        // Calculate Diff
        dif = -100000;
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
                    if (tempdiff[i][j][k] > dif) {
                        dif = tempdiff[i][j][k];
                    }
                }
            }
        }

        system_clock::time_point end = system_clock::now();
        std::chrono::duration<float> sec = end - start;


        cout << dif << endl;
        cout << it << endl;
        cout << sec.count() << endl;
    }

    for (int k = 0; k < Ny; k++) {
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                cout << Vnew[i][j][k];
            }
            cout << '\n';
        }
    }
    cout << omp_get_max_threads() << endl;

}

解决方法

没有方便的方法来添加 #pragma,一切都神奇地在 GPU 上运行。

但是您的代码非常适合 GPU 加速:在您的循环中,元素彼此独立。您尤其可以在 GPU 上并行化 NaNdNy 循环。您需要:

  1. 包括 OpenCL C++ 标头,请参阅 here
  2. 线性化三重循环:创建一个线性索引n = (i*Nd+j)*Ny+k;,将三个循环合而为一
  3. 将您的代码转移到 OpenCL C 并摆脱线性循环,一个内核外观的简单示例是 here
  4. 创建缓冲区(在 GPU 上分配内存)
  5. 在 C++ 中创建内核对象(每个线性化三重循环一个)并将缓冲区链接为内核参数
  6. 手动处理 CPUGPU 内存传输(enqueueReadBuffer/enqueueWriteBuffer)
  7. 运行内核(enqueueNDRangeKernel)

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。

相关推荐


使用本地python环境可以成功执行 import pandas as pd import matplotlib.pyplot as plt # 设置字体 plt.rcParams[&#39;font.sans-serif&#39;] = [&#39;SimHei&#39;] # 能正确显示负号 p
错误1:Request method ‘DELETE‘ not supported 错误还原:controller层有一个接口,访问该接口时报错:Request method ‘DELETE‘ not supported 错误原因:没有接收到前端传入的参数,修改为如下 参考 错误2:cannot r
错误1:启动docker镜像时报错:Error response from daemon: driver failed programming external connectivity on endpoint quirky_allen 解决方法:重启docker -&gt; systemctl r
错误1:private field ‘xxx‘ is never assigned 按Altʾnter快捷键,选择第2项 参考:https://blog.csdn.net/shi_hong_fei_hei/article/details/88814070 错误2:启动时报错,不能找到主启动类 #
报错如下,通过源不能下载,最后警告pip需升级版本 Requirement already satisfied: pip in c:\users\ychen\appdata\local\programs\python\python310\lib\site-packages (22.0.4) Coll
错误1:maven打包报错 错误还原:使用maven打包项目时报错如下 [ERROR] Failed to execute goal org.apache.maven.plugins:maven-resources-plugin:3.2.0:resources (default-resources)
错误1:服务调用时报错 服务消费者模块assess通过openFeign调用服务提供者模块hires 如下为服务提供者模块hires的控制层接口 @RestController @RequestMapping(&quot;/hires&quot;) public class FeignControl
错误1:运行项目后报如下错误 解决方案 报错2:Failed to execute goal org.apache.maven.plugins:maven-compiler-plugin:3.8.1:compile (default-compile) on project sb 解决方案:在pom.
参考 错误原因 过滤器或拦截器在生效时,redisTemplate还没有注入 解决方案:在注入容器时就生效 @Component //项目运行时就注入Spring容器 public class RedisBean { @Resource private RedisTemplate&lt;String
使用vite构建项目报错 C:\Users\ychen\work&gt;npm init @vitejs/app @vitejs/create-app is deprecated, use npm init vite instead C:\Users\ychen\AppData\Local\npm-
参考1 参考2 解决方案 # 点击安装源 协议选择 http:// 路径填写 mirrors.aliyun.com/centos/8.3.2011/BaseOS/x86_64/os URL类型 软件库URL 其他路径 # 版本 7 mirrors.aliyun.com/centos/7/os/x86
报错1 [root@slave1 data_mocker]# kafka-console-consumer.sh --bootstrap-server slave1:9092 --topic topic_db [2023-12-19 18:31:12,770] WARN [Consumer clie
错误1 # 重写数据 hive (edu)&gt; insert overwrite table dwd_trade_cart_add_inc &gt; select data.id, &gt; data.user_id, &gt; data.course_id, &gt; date_format(
错误1 hive (edu)&gt; insert into huanhuan values(1,&#39;haoge&#39;); Query ID = root_20240110071417_fe1517ad-3607-41f4-bdcf-d00b98ac443e Total jobs = 1
报错1:执行到如下就不执行了,没有显示Successfully registered new MBean. [root@slave1 bin]# /usr/local/software/flume-1.9.0/bin/flume-ng agent -n a1 -c /usr/local/softwa
虚拟及没有启动任何服务器查看jps会显示jps,如果没有显示任何东西 [root@slave2 ~]# jps 9647 Jps 解决方案 # 进入/tmp查看 [root@slave1 dfs]# cd /tmp [root@slave1 tmp]# ll 总用量 48 drwxr-xr-x. 2
报错1 hive&gt; show databases; OK Failed with exception java.io.IOException:java.lang.RuntimeException: Error in configuring object Time taken: 0.474 se
报错1 [root@localhost ~]# vim -bash: vim: 未找到命令 安装vim yum -y install vim* # 查看是否安装成功 [root@hadoop01 hadoop]# rpm -qa |grep vim vim-X11-7.4.629-8.el7_9.x
修改hadoop配置 vi /usr/local/software/hadoop-2.9.2/etc/hadoop/yarn-site.xml # 添加如下 &lt;configuration&gt; &lt;property&gt; &lt;name&gt;yarn.nodemanager.res