微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

Thrust 的exclusive_scan_by_key 函数与顺序实现花费的时间相同吗?

如何解决Thrust 的exclusive_scan_by_key 函数与顺序实现花费的时间相同吗?

我对 Thrust 比较陌生,正在尝试执行分段扫描。这是我的代码,您应该可以按原样运行:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <chrono>

// Sequential scan for cpu
float* test_seqScan(float* in,int s,int m) {
    float* out = new float[s * m];

    for (unsigned int i = 0; i < s; i++) {
        out[i * m] = 0;
    }

    for (unsigned int i = 0; i < s; i++) {
        for (unsigned int j = 1; j < m; j++) {
            out[i * m + j] = out[i * m + j - 1] + in[i * m + j - 1];
        }
    }

    return out;
}

void test_sumScan(thrust::device_vector<float> dev_in,thrust::device_vector<int> dev_keys,int m) {
    // Allocate device memory for output
    thrust::device_vector<float> dev_out(s * m);

    thrust::exclusive_scan_by_key(thrust::device,dev_keys.begin(),dev_keys.end(),dev_in.begin(),dev_out.begin());
}

int main(){
    int s = 100;
    int m = 100000;

    float* seq_in = new float[s * m];

    for (int i = 0; i < s; i++) {
        for (int j = 0; j < m; j++) {
            seq_in[i * m + j] = j + 1;
        }
    }

    thrust::host_vector<float> par_in(s * m);
    for (int i = 0; i < s; i++) {
        for (int j = 0; j < m; j++) {
            par_in[i * m + j] = j + 1;
        }
    }

    thrust::host_vector<int> keys(s * m);
    for (int i = 0; i < s; i++) {
        for (int j = 0; j < m; j++) {
            keys[i * m + j] = i;
        }
    }

    thrust::device_vector<float> dev_in = par_in;
    thrust::device_vector<int> dev_keys = keys;

    auto t1 = std::chrono::high_resolution_clock::Now();
    test_seqScan(seq_in,s,m);
    auto t2 = std::chrono::high_resolution_clock::Now();
    auto duration1 = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();

    std::cout << "Sequential duration: " << duration1 << "\n\n";

    auto t3 = std::chrono::high_resolution_clock::Now();
    test_sumScan(dev_in,dev_keys,m);
    auto t4 = std::chrono::high_resolution_clock::Now();
    auto duration2 = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();

    std::cout << "Parallel duration: " << duration2 << "\n\n";
}

我的问题是,无论我将 sm 设置为多小或多大,这两个代码片段都需要完全相同的时间来运行。我认为我做错了什么,但我不知道是什么;谁能指出这个问题?

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。