微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

c – POSIX Threads在C中没有加速

我正在使用Pthreads学习并行处理.我有一个四核处理器.不幸的是,以下代码的并行化部分运行速度比非并行化代码慢大约5倍.我在这做错了什么?在此先感谢您的帮助.
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#define NTHREADS 4
#define SIZE NTHREADS*10000000

struct params {
  int * arr;
  int sum;
};

/* The worker function for the pthreads */
void * myFun (void * x){
  int i;
  struct params * b = (struct params *) x;
  for (i = 0; i < (int)(SIZE/NTHREADS); ++i){
    b->sum += b->arr[i];
  }
  return NULL;
}

/* unparallelized summing function*/
int arrSum(int * arr,int size){
  int sum = 0;
  for (int i = 0; i != size; ++i){
    sum += arr[i];
  }
  return sum;
}

int main(int argc,char * argv[]){
  clock_t begin,end;
  double runTime;
  int rc,i;
  int sum1,sum2 = 0;
  pthread_t threads[NTHREADS];

  /* create array to sum over */
  int * myArr = NULL;
  myArr = (int *) calloc(SIZE,sizeof(int));
  if (myArr == NULL){
    printf("problem allocating memory\n");
    return 1; 
  }
  for (int i = 0; i < SIZE; ++i){
    myArr[i] = 1;
  }

  /* create array of params structs to Feed to threads */
  struct params p;
  p.sum = 0;
  struct params inputs[NTHREADS];
  for(i = 0; i != NTHREADS; ++i){
    p.arr = myArr + i*(int)(SIZE/NTHREADS);
    inputs[i] = p;
  }

  /* spawn the threads */
  begin = clock();
  for(i = 0; i != NTHREADS; ++i){
    rc = pthread_create(&threads[i],NULL,myFun,(void *) &inputs[i]);
  }

  /* wait for threads to finish */
  for(i = 0; i != NTHREADS; ++i){
    rc = pthread_join(threads[i],NULL);
  }
  end = clock();
  runTime = (double)(end - begin)/CLOCKS_PER_SEC;
  printf("Parallelized code run time: %f\n",runTime);

  /* run the unparallelized code */
  begin = clock();
  sum2 = arrSum(myArr,SIZE);
  end = clock();
  runTime = (double)(end - begin)/CLOCKS_PER_SEC;
  printf("Unparallelized code run time: %f\n",runTime);

  /* consolidate and print results from threads */
  for(i = 0; i != NTHREADS; ++i){
    sum1 += inputs[i].sum;
  }
  printf("sum1,sum2: %d,%d \n",sum1,sum2);

  free(myArr);

  /* be disappointed when my parallelized code showed no speedup */
  return 1;
}

解决方法

您缺少并行编程的一个重要方面.

每个进程需要创建一次工作线程,而不是每个任务.

创建和销毁线程需要时间.

解决方案是使用线程池并将任务发送到池中.

我的建议是使用OpenMP,它可以大大简化这项任务,并与许多编译器配合使用.

例:

int sum = 0
#pragma omp for shared(sum)
 for(int i=0; i<SIZE; ++i)
 {
   #pragma omp atomic
   sum += myArr[i]
 }

为了使这项工作更快,请进行一些循环展开 – 例如计算单个for循环范围中的8个数字的总和.

原文地址:https://www.jb51.cc/c/118949.html

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。

相关推荐