OpenCL - 通过蒙特卡罗模拟逼近 Pi - 错误值

如何解决OpenCL - 通过蒙特卡罗模拟逼近 Pi - 错误值

我目前正在开发应该近似 Pi 的蒙特卡罗模拟。我通过 OpenCL 进行并行化，但通过 OpenCL 获得的时间值比未并行化的时间值要差得多。我究竟做错了什么？我有一台配备 Intel Iris、Intel cpu 和 AMD 显卡的 MacBookPro。

实现必须在 OpenCL 中进行，而不是其他标准。

提前致谢。

我的主要代码：

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <limits.h>
#include <math.h>
#include <sys/time.h>
#include <unistd.h>
 
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
 
#define MAX_SOURCE_SIZE (0x100000)
 
int main(void) {

    // ------- 

    // -------
    struct timeval start;
    struct timeval stop;
    //struct timeval differenz;
    // Create the two input vectors
    unsigned long i,j;
    unsigned long inner = 0;
    unsigned long outer = 0;
    const unsigned long LIST_SIZE = pow(2,16);
    const unsigned long LAUF_VAR = 1;
    unsigned long *A = (unsigned long*)malloc(sizeof(unsigned long)*LIST_SIZE);
    unsigned long *C = (unsigned long*)malloc(sizeof(unsigned long)*LIST_SIZE);
    unsigned long *B = (unsigned long*)malloc(sizeof(unsigned long)*LIST_SIZE);
 
    // Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;
 
    fp = fopen("square.cl","r");
    if (!fp) {
        fprintf(stderr,"Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread( source_str,1,MAX_SOURCE_SIZE,fp);
    fclose( fp );
 
    // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;   
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1,&platform_id,&ret_num_platforms);
    ret = clGetdeviceids( platform_id,CL_DEVICE_TYPE_ALL,&device_id,&ret_num_devices);
 
    // Create an OpenCL context
    cl_context context = clCreateContext( NULL,NULL,&ret);
 
 
    // Create memory buffers on the device for each vector 
    cl_mem a_mem_obj = clCreateBuffer(context,CL_MEM_READ_ONLY,LIST_SIZE * sizeof(unsigned long),&ret);
    cl_mem b_mem_obj = clCreateBuffer(context,&ret);
    cl_mem c_mem_obj = clCreateBuffer(context,CL_MEM_WRITE_ONLY,&ret);
 
    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context,(const char **)&source_str,(const size_t *)&source_size,&ret);
 
    // Build the program
    ret = clBuildProgram(program,NULL);
    if(ret != CL_SUCCESS)
    {                 
        printf("Error: Failed to build program executable!\n");
        perror("");
        exit(1);
    }

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context,device_id,&ret);
 
    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program,"square_val",&ret);
 
    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel,sizeof(cl_mem),(void *)&a_mem_obj);
    ret = clSetKernelArg(kernel,(void *)&b_mem_obj);
    ret = clSetKernelArg(kernel,2,(void *)&c_mem_obj);

    // Execute the OpenCL kernel on the list
    size_t global_item_size = LIST_SIZE; // Process the entire lists
    size_t local_item_size = 64; // Divide work items into groups of 64
 
    gettimeofday(&start,NULL);

    // display the result to the screen
    /*
    for(i = 0; i < LIST_SIZE; i++)
    {
        printf("%ld^2 + %ld^2 = %ld\n",A[i],B[i],C[i]);
    }
    */
    for(j = 0; j < LAUF_VAR; j++)
    {
        srand(time(NULL));

        for(i = 0; i < LIST_SIZE; i++) {
            A[i] = (unsigned long) rand() % LIST_SIZE;   
            B[i] = (unsigned long) rand() % LIST_SIZE;
        }

        // copy the lists A and B to their respective memory buffers
        ret = clEnqueueWriteBuffer(command_queue,a_mem_obj,CL_TRUE,A,NULL);
        ret = clEnqueueWriteBuffer(command_queue,b_mem_obj,B,NULL);
        

        ret = clEnqueueNDRangeKernel(command_queue,kernel,&global_item_size,&local_item_size,NULL);
 
        // Read the memory buffer C on the device to the local variable C
        ret = clEnqueueReadBuffer(command_queue,c_mem_obj,C,NULL);

        for(i = 0; i < LIST_SIZE; i++)
        {
            if(C[i] <= (LIST_SIZE * LIST_SIZE))
            {
                inner++;
            } else {
                outer++;
            }
        }
    }
    gettimeofday(&stop,NULL);


    printf("- - - - - - - - - - - - - - -\n");
    printf("Parallel\n");
    printf("- - - - - - - - - - - - - - -\n");
    printf("Inner as unsigned long: %ld\n",inner);
    printf("Outer as unsigned long: %ld\n",outer);
    printf("- - - - - - - - - - - - - - -\n");
    double innerDouble = (double) (inner);
    double outerDouble = (double) (outer);
    double val = (innerDouble/LIST_SIZE);
    printf("Approximation for PI: %.20lf\n",(val*4) / LAUF_VAR);
    printf("@Walk NR: %ld\n",j);
    printf("LIST_SIZE: %lu\n",LIST_SIZE);
    printf("- - - - - - - - - - - - - - -\n");
    printf("took %lu us\n",(stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec); 
    //printf("%ju\n",(uintmax_t)SIZE_MAX);

    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(a_mem_obj);
    ret = clReleaseMemObject(b_mem_obj);
    ret = clReleaseMemObject(c_mem_obj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
    free(A);
    free(B);
    free(C);

    // Create the two input vectors
    inner = 0;
    outer = 0;
    unsigned long *D = (unsigned long*)malloc(sizeof(unsigned long)*LIST_SIZE);
    unsigned long *E = (unsigned long*)malloc(sizeof(unsigned long)*LIST_SIZE);
    unsigned long *F = (unsigned long*)malloc(sizeof(unsigned long)*LIST_SIZE);

    gettimeofday(&start,NULL);
    for(j = 0; j < LAUF_VAR; j++)
    {
        srand(time(NULL));

        for(i = 0; i < LIST_SIZE; i++) {
            D[i] = (unsigned long) rand() % LIST_SIZE;   
            E[i] = (unsigned long) rand() % LIST_SIZE;
        }
        for(i = 0; i < LIST_SIZE; i++) {
            F[i] = D[i] * D[i] + E[i] * E[i];
            if(F[i] <= (LIST_SIZE * LIST_SIZE))
            {
                inner++;
            } else {
                outer++;
            }
        }
    }
    gettimeofday(&stop,NULL);

    printf("\n");
    printf("\n");
    printf("\n");


    printf("- - - - - - - - - - - - - - -\n");
    printf("Seriell\n");
    printf("- - - - - - - - - - - - - - -\n");
    printf("Inner as unsigned long: %ld\n",outer);
    printf("- - - - - - - - - - - - - - -\n");
    innerDouble = (double) (inner);
    outerDouble = (double) (outer);
    val = (innerDouble/LIST_SIZE);
    printf("Approximation for PI: %.20lf\n",LIST_SIZE);
    printf("took %lu us\n",(stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec); 

    free(D);
    free(E);
    free(F);

    return 0;
}

内核：

__kernel void square_val(__global const unsigned long *A,__global const unsigned long *B,__global unsigned long *C) {
 
    // Get the index of the current element to be processed
    unsigned long i = get_global_id(0);
 
    // Do the operation
    C[i] = A[i] * A[i] + B[i] * B[i];
}

结果：

 ./test                                                                                                                                                   
- - - - - - - - - - - - - - -
Parallel
- - - - - - - - - - - - - - -
Inner as unsigned long: 51383
Outer as unsigned long: 14153
- - - - - - - - - - - - - - -
Approximation for PI: 3.13616943359375000000
@Walk NR: 1
LIST_SIZE: 65536
- - - - - - - - - - - - - - -
took 3678 us



- - - - - - - - - - - - - - -
Seriell
- - - - - - - - - - - - - - -
Inner as unsigned long: 51383
Outer as unsigned long: 14153
- - - - - - - - - - - - - - -
Approximation for PI: 3.13616943359375000000
@Walk NR: 1
LIST_SIZE: 65536
took 2010 us

解决方法

我不愿意在没有细粒度分析数据的情况下对您的代码中什么是快或什么是慢做出笼统的陈述，但这里有一些如何改进的候选者：

您在 CPU 和 GPU 之间相当笨拙地拆分算法，并且正在执行最大数量的内存复制，这可能是纯 CPU 版本无法做到的。在 GPU 上进行尽可能多的计算，在设备和主机之间复制尽可能少的数据。
A 和 B 元素的值在 0..65535 范围内。无需将每个元素都设为 64 位整数。
特别是如果您使用的是使用共享内存的 Iris GPU，请使用零拷贝缓冲区。对此有详细的解释，但基本上是：
- 不要：分配主机内存，填充它，然后创建一个 CL 缓冲区并复制到该缓冲区。
- 相反：创建一个 CL 缓冲区，将其映射到主机内存空间，通过映射指针直接填充，然后取消映射。
在 GPU 上生成随机数将为您节省大量内存带宽 - 无需将 A 和 B 复制到设备内存。不过，并非所有随机数生成器都适用于此，当然也没有内置于 OpenCL 中的随机数生成器。
这：if(C[i] <= (LIST_SIZE * LIST_SIZE)) 不必要地在主机上进行计算。是的，比较就是计算。如果您在内核中执行此检查，则不需要写入数组 C - 或者至少，您可以将 0 或 1 写入字节数组而不是 64 位整数。这将节省您的内存带宽和主机端执行时间。
如果您实施了上述建议，您就会意识到最好只增加 GPU 上的内部/外部计数器。
1. 您不需要 2 个计数器，可以通过从总迭代次数中减去第一个来推断出第二个。
2. OpenCL 中最简单的正确方法是在每个工作项中使用原子增量。
3. 从每个工作项原子更新单个内存位置不会有很好的效果。更好：使用工作组。计算使用本地内存为组的所有元素增加多少计数器，然后仅在组的一个工作项中对全局计数器执行原子添加。
在进行上述更改后，您可能希望尝试为每个工作项处理多个 A/B 对，以进一步减少累积计数的开销。

几乎可以肯定，内核在全局大小和每个工作项的工作量方面都太短了。 IE。卸载时间在实际计算中占主导地位。即使在集成显卡上，对于简单内核来说，64k 工作项的全局大小也很小。其次，内核对每个工作项所做的工作很少：2 muls + 1 add。想象一下将一个简单的数学问题发送到火星上一个非常广泛的计算器并等待答案。我们希望为每个工作项做更多的工作。

例如，如果您达到 L3，较旧的 Intel HD（Skylake 或更早版本）加载需要大约 100-150c（100 到 150 纳秒），甚至在未命中时需要更多时间。即使您使用零复制缓冲区（如果可以，您仍然应该这样做），这仍然成立。我没有仔细检查主机代码，但我认为这是流式访问（每个工作项的内存不同），因此缓存不会很好。最后，对于英特尔部分上面的给定内核，线程调度程序将无法完全占用计算单元。也就是说，线程退出/结束的速度比加载新线程的速度要快（“线程调度受限”）。

我不能说你列出的其他部分。但是，我会担心将如此多的数据传送到一个离散的部分，只需要很少的工作（然后将其发回）。几乎可以肯定它不会像 CPU 一样好。

这里有三个想法。

将随机变量工作移到 GPU 上。找到一个对 GPU 友好的简单线性（或置换）同余随机数生成器 (RNG)。您必须为每个工作项独立设置种子，并且最好能从中获取一些随机变量，而不仅仅是一个。我不是 RNG 分裂者，但可能 (uint)get_global_id(0) 是 XORWOW 或 PCG 之类的足够种子。远离 GPU 上的 Mersenne Twister（状态过多）。
先减少 GPU 上的结果，然后再减少主机上的结果。 GPU 上的 Monte Carlo (MC) pi 近似值通常将最终的减少分布到 GPU 上。也就是说，让一个工作组计算自己的 inner 和 outer 副本，然后在主机上或每个工作组原子地将它们全部相加。这是一个包含 256 个工作项的工作组，将其答案减少了一个 inner 和一个 outer。然后要么原子地将该单个值添加到全局副本中，要么将内存分开并让主机对工作组求和进行求和。我不确定哪个更好。
批处理工作。让一个工作项计算 K 个元素，以便摊销 RNG 和归约工作。

还有一个给 pmdj 的帽子提示。他还就可以尝试的事情提出了一些非常好的建议，我肯定他的大部分观点。

有趣的东西！