错误：内核参数具有非平凡可复制构造的类/结构体类型（SYCL + TBB）

如何解决该错误：内核参数具有非平凡可复制构造的类/结构体类型（SYCL + TBB）

我正在尝试实现一种 “map” 骨架（skeleton）：它封装 oneAPI 调用，通过参数指定目标设备类型（CPU 或 GPU/加速器），从而隐藏针对具体硬件的细节。我的 map 骨架接收一个函数、该函数的导数以及牛顿法的初始点。但编译时出现了如下错误：

kernel parameter has non-trivially copy constructible class/struct type 'std::function<double (double)>'

我的代码是：

#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <functional>
#include <iostream>
#include <numeric>
#include <queue>
#include <random>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

#include <CL/sycl.hpp>
#include <tbb/blocked_range.h>
#include <tbb/global_control.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>
#include <tbb/tbb.h>

#include "uTimer.cpp"

// Disabled includes kept from the original experiment:
//#include <dos.h> //for delay
//#include <conio.h> //for getch()
//#include <complex>
#define EPSILON 0.000001 // Convergence threshold the Newton loops in Map compare against

using namespace tbb;

/// Minimal stopwatch that reports elapsed wall time in seconds.
///
/// The stopwatch starts running on construction. Call start() to restart it
/// and stop() to read the time elapsed since the most recent (re)start.
/// stop() does not halt the stopwatch; it may be called repeatedly.
class Clock {
private:
    // steady_clock is monotonic, so the measured interval can never be
    // negative even if the system time is adjusted while measuring.
    using clock_type = std::chrono::steady_clock;
    clock_type::time_point t;
public:
    /// Creates the stopwatch and starts it immediately.
    Clock() : t(clock_type::now()) {}

    /// Restarts the measurement from the current instant.
    void start() {
        t = clock_type::now();
    }

    /// @return Seconds elapsed since construction or the last start() call.
    double stop() const {
        const std::chrono::duration<double> elapsed = clock_type::now() - t;
        return elapsed.count();
    }
};


//std::complex<double> mycomplex(10.0,2.0);

template<class Tin,class Tout>
class Map {
        private:
                std::function<Tout(Tin)> fun;
                std::function<Tout(Tin)> dfun;
        public:
                Map() {};
                Map(std::function<Tout(Tin)> f,std::function<Tout(Tin)> df) {
                    fun = f;
                    dfun = df;
                };


    void operator()(bool use_tbb,Tin &x1) {
        int iter=100;
        Tout x;
        if (use_tbb) {
            uTimer *timer = new uTimer("Executing Code On cpu");
            tbb::parallel_for(tbb::blocked_range < int > (0,iter),[&](tbb::blocked_range<int> t) {
                        for (int index = t.begin(); index < t.end(); ++index) {
                            do
                            {
                                x = x1;
                                x1 = x - (fun(x) / dfun(x));
                            }while (std::abs(x1 - x) >= EPSILON);
                        }
                });
            timer->~uTimer();
        }else {
            sycl::buffer<Tin,1> x1_buffer(&x1,iter);
            sycl::buffer<Tout,1> x_buffer(&x,iter);
            //Profiling GPU

            // Initialize property list with profiling information
            sycl::property_list propList {
                    sycl::property::queue::enable_profiling() };
            // Build the command queue (constructed to handle event profling)
            sycl::queue gpuQueue = cl::sycl::queue(sycl::gpu_selector(),propList);
            // print out the device information used for the kernel code
            std::cout << "Device: "
                    << gpuQueue.get_device().get_info<sycl::info::device::name>()
                    << std::endl;

            std::cout << "Compute Units: "
                    << gpuQueue.get_device().get_info<
                            sycl::info::device::max_compute_units>()
                    << std::endl;

            auto start_overall = std::chrono::system_clock::Now();
            auto event = gpuQueue.submit([&](sycl::handler &h) {
                //local copy of fun
                auto f = fun;
                auto df = dfun;
                sycl::accessor x1_accessor(x1_buffer,h,sycl::read_write);
                sycl::accessor x_accessor(x_buffer,sycl::read_write);
                h.parallel_for(iter,[=](sycl::id<1> index) {
                    do
                    {
                        x_accessor[index] = x1_accessor[index];
                        x1_accessor[index] = x_accessor[index] - (f(x_accessor[index]) / df(x_accessor[index]));
                    }while (sycl::fabs(f(x1_accessor[index]))>= EPSILON);

                });
            });
            event.wait();
            auto end_overall = std::chrono::system_clock::Now();
            cl_ulong submit_time = event.template get_profiling_info<
                    cl::sycl::info::event_profiling::command_submit>();
            cl_ulong start_time = event.template get_profiling_info<
                    cl::sycl::info::event_profiling::command_start>();
            cl_ulong end_time = event.template get_profiling_info<
                    cl::sycl::info::event_profiling::command_end>();
            auto submission_time = (start_time - submit_time) / 1000000.0f;
            std::cout << "Submit Time: " << submission_time << " ms"
                    << std::endl;
            auto execution_time = (end_time - start_time) / 1000000.0f;
            std::cout << "Execution Time: " << execution_time << " ms"
                    << std::endl;
            auto execution_overall = std::chrono::duration_cast
                    < std::chrono::milliseconds > (end_overall - start_overall);
            std::cout << "Overall Execution Time: " << execution_overall.count()
                    << " ms" << std::endl;
        };
    };
};





/// Demo driver: solves x^3 = 0 with Newton's method on the CPU backend,
/// starting from x = 3, and prints the resulting root estimate.
int main(int argc,char *argv[]) {

    // Function whose root is sought: f(x) = x^3
    const auto f = [](double x) {return std::pow(x,3);};
    // Its derivative: f'(x) = 3 * x^2
    const auto df = [](double x) {return std::pow(x,2) *3;};
    // Define an instance of Map class
    auto m1 = Map<double,double>(f,df);
    // Initial guess; the call below overwrites it with the result.
    double x1 = 3;
    m1(true,x1);
    // Report the result instead of silently discarding it.
    std::cout << "Root estimate: " << x1 << std::endl;
    return 0;
}

此外，即使不考虑这个错误，我也觉得代码中还有其他地方不太对，但我找不出具体是哪里。

解决方法

你想要的做法无法实现。即使去掉 std::function 改用函数指针（虽然函数指针可以平凡复制），也仍然行不通。在 SYCL 以及其他同类编程模型（CUDA、HIP、OpenCL 等）中，设备编译器必须能够编译内核所执行/调用的全部函数。所以，不行，你不能在运行时把一个函数“传入”内核。这与你之前提出的一个问题本质相同，该问题已在此处（here）得到回答。

你可以尝试把这些 lambda 改为在别处定义的普通函数，然后在内核中直接调用它们。如果希望在运行时从多个函数中进行选择，可以编写一个模板化的内核（例如以枚举值作为模板参数），并在内核内部通过 if constexpr 分派调用，从而避免运行时开销（以及代码重复）。最终会实例化出 n 个 SYCL 内核，每个内核调用其中一个函数，它们都能被设备编译器等正确编译。