c++ - 使用 TBB 的并行性——我们的 list 中应该包含什么？

coder 2024-02-21 原文

直到最近，并行编程的前景才引起了我的注意。从那时起，我使用了各种并行编程库。也许我的第一站是英特尔线程构建模块(TBB)。但是，经常成为瓶颈的是由于舍入等因素以及这些程序在不同处理器架构中的不可预测行为而导致的错误。下面是一段代码，用于计算两组值的皮尔逊(Pearson)相关系数。它采用了 TBB 的非常基本的并行模式——*parallel_for* 和 *parallel_reduce*:

    // A programme to calculate Pearson's correlation coefficient

#include <math.h>
#include <stdlib.h>
#include <iostream>
#include <tbb/task_scheduler_init.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>
#include <tbb/blocked_range.h>
#include <tbb/tick_count.h>




// Make the standard-library and TBB names visible unqualified for the rest of the file.
using namespace std;
using namespace tbb;
// Number of elements in each of the two sample arrays.
const size_t n=100000;
// Mutable file-scope counter, initialised to 0. It carries no synchronisation,
// so it is only safe to read or write from one thread at a time.
double global=0;

// Namespace holding the state of the serial part.
// Everything here has static storage duration, so the members without an
// explicit initialiser start out as zero / null.
namespace s
{
// The two sample arrays.
double *a;
double *b;

// Integer index variable.
int j;

// Means of a and b.
double mean_a;
double mean_b;

// Standard deviations of a and b.
double sd_a = 0;
double sd_b = 0;

// Pearson correlation coefficient.
double pcc = 0;

// Element sums of a and b.
double sum_a;
double sum_b;

// Floating-point counter variable.
double i;
}

// Namespace holding the state of the parallel part.
// Everything here has static storage duration, so every member starts out
// as zero / null.
namespace p
{
// The two sample arrays.
double *a;
double *b;

// Means of a and b.
double mean_a;
double mean_b;

// Pearson correlation coefficient.
double pcc;

// Element sums of a and b.
double sum_a;
double sum_b;

// Floating-point counter variable.
double i;

// Standard deviations of a and b.
double sd_a;
double sd_b;
}


// Single-threaded reference implementation.
//
// All state lives in namespace `s`. The three member functions are meant to
// be called in order: computemean_serial(), computesd_serial(),
// pearson_correlation_coefficient_serial(). Each one prints its results to
// std::cout. Loop indices are function-local; s::j and s::i are not touched.
class serials
{
public:
    // Allocates s::a and s::b (n doubles each), fills them with
    // a[k] = sin(k) and b[k] = cos(k), and stores/prints the two means.
    // Terminates the process with EXIT_FAILURE if the allocation fails.
    void computemean_serial()
    {
        using namespace s;

        // Release buffers left over from an earlier call. On the first call
        // both pointers are still null and free() does nothing.
        free(a);
        free(b);
        a = static_cast<double*>(malloc(n * sizeof(double)));
        b = static_cast<double*>(malloc(n * sizeof(double)));
        if (!a || !b)
        {
            std::cerr << "\nserials: failed to allocate the sample arrays\n";
            exit(EXIT_FAILURE);
        }

        sum_a = 0;
        sum_b = 0;
        for (size_t k = 0; k < n; ++k)
        {
            const double x = static_cast<double>(k);
            a[k] = sin(x);
            b[k] = cos(x);

            sum_a += a[k];
            sum_b += b[k];
        }
        mean_a = sum_a / n;
        mean_b = sum_b / n;
        cout<<"\nMean of a :"<<mean_a;
        cout<<"\nMean of b :"<<mean_b;
    }

    // Computes and prints the population standard deviation of s::a and
    // s::b. Requires computemean_serial() to have run first.
    void computesd_serial()
    {
        using namespace s;

        // Start from zero on every call so the function can be repeated.
        sd_a = 0;
        sd_b = 0;
        for (size_t k = 0; k < n; ++k)
        {
            const double dev_a = a[k] - mean_a;
            const double dev_b = b[k] - mean_b;
            sd_a += dev_a * dev_a;
            sd_b += dev_b * dev_b;
        }
        sd_a = sqrt(sd_a / n);
        sd_b = sqrt(sd_b / n);
        cout<<"\nStandard deviation of a :"<<sd_a;
        cout<<"\nStandard deviation of b :"<<sd_b;
    }

    // Computes and prints the Pearson correlation coefficient of s::a and
    // s::b. Requires both functions above to have run first.
    void pearson_correlation_coefficient_serial()
    {
        using namespace s;

        pcc = 0;
        for (size_t k = 0; k < n; ++k)
        {
            pcc += (a[k] - mean_a) * (b[k] - mean_b);
        }
        pcc = pcc / (n * sd_a * sd_b);
        cout<<"\nPearson Correlation Coefficient: "<<pcc;
    }

};


class parallel
{
public:

class compute_mean 
{

double *store1,*store2;
public: 

double mean_a,mean_b;

    void operator()( const blocked_range<size_t>& r)
    {
    double *a= store1;
    double *b= store2;

    for(size_t i =r.begin();i!=r.end(); ++i)
    {    
         mean_a+=a[i];
         mean_b+=b[i];
    }
    }
    compute_mean( compute_mean& x, split) : store1(x.store1),store2(x.store2),mean_a(0),mean_b(0){}

    void join(const compute_mean& y) {mean_a+=y.mean_a;mean_b+=y.mean_b;}
    compute_mean(double* a,double* b): store1(a),store2(b),mean_a(0),mean_b(0){}
};

               class read_array
                {
               double *const a,*const b;

                 public:

             read_array(double* vec1, double* vec2) : a(vec1),b(vec2){}  // constructor copies the arguments into local store 
             void operator() (const blocked_range<size_t> &r) const {              // opration to be used in parallel_for 

                     for(size_t k = r.begin(); k!=r.end(); k++,global++)
                     {   
                         a[k]=sin(global);
                         b[k]=cos(global);
                     }

                 }};

            void computemean_parallel()
                        {
                        using namespace p;
                        i=0;
                        a=(double*) malloc(n*sizeof(double));
                        b=(double*) malloc(n*sizeof(double));

                parallel_for(blocked_range<size_t>(0,n,5000),read_array(a,b));
                compute_mean sf(a,b);
                parallel_reduce(blocked_range<size_t>(0,n,5000),sf);
                mean_a=sf.mean_a/n;
                mean_b=sf.mean_b/n;
                cout<<"\nMean of a :"<<mean_a;
                cout<<"\nMean of b :"<<mean_b;
               }

class compute_sd 
{
double *store1,*store2;
double store3,store4;
public: 
double sd_a,sd_b,dif_a,dif_b,temp_pcc;
void operator()( const blocked_range<size_t>& r)
{
    double *a= store1;
    double *b= store2;
    double mean_a=store3;
    double mean_b=store4;
    for(size_t i =r.begin();i!=r.end(); ++i)
    { 
     dif_a=a[i]-mean_a;
     dif_b=b[i]-mean_b;
     temp_pcc+=dif_a*dif_b;
     sd_a+=pow(dif_a,2);
     sd_b+=pow(dif_b,2);
    }}
    compute_sd( compute_sd& x, split) : store1(x.store1),store2(x.store2),store3(p::mean_a),store4(p::mean_b),sd_a(0),sd_b(0),temp_pcc(0){}
    void join(const compute_sd& y) {sd_a+=y.sd_a;sd_b+=y.sd_b;}
    compute_sd(double* a,double* b,double mean_a,double mean_b): store1(a),store2(b),store3(mean_a),store4(mean_b),sd_a(0),sd_b(0),temp_pcc(0){}
};


               void computesd_and_pearson_correlation_coefficient_parallel()
               {
               using namespace p;
               compute_sd obj2(a,b,mean_a,mean_b);
               parallel_reduce(blocked_range<size_t>(0,n,5000),obj2);
               sd_a=obj2.sd_a;
               sd_b=obj2.sd_b;
               sd_a=sd_a/n;
               sd_a=sqrt(sd_a);
               sd_b=sd_b/n;
               sd_b=sqrt(sd_b);
               cout<<"\nStandard deviation of a :"<<sd_a;
               cout<<"\nStandard deviation of b :"<<sd_b;
               pcc=obj2.temp_pcc;
               pcc=pcc/(n*sd_a*sd_b);
               cout<<"\nPearson Correlation Coefficient: "<<pcc;
               }
};

// Runs the serial and the parallel computation one after the other, prints
// the results of each, and finally prints the wall-clock time each part took.
// The return type is spelled out: C++ has no implicit-int rule for main.
int main()
{
    serials obj_s;
    parallel obj_p;

    // Serial part, timed as a whole.
    cout<<"\nSerial Part";
    cout<<"\n-----------";
    tick_count start_s=tick_count::now();
    obj_s.computemean_serial();
    obj_s.computesd_serial();
    obj_s.pearson_correlation_coefficient_serial();
    tick_count end_s=tick_count::now();
    cout<<"\n";

    // The scheduler object is created before the timed parallel section.
    task_scheduler_init init;

    // Parallel part, timed as a whole.
    cout<<"\nParallel Part";
    cout<<"\n-------------";
    tick_count start_p=tick_count::now();
    obj_p.computemean_parallel();
    obj_p.computesd_and_pearson_correlation_coefficient_parallel();
    tick_count end_p=tick_count::now();
    cout<<"\n";

    cout<<"\nTime Estimates";
    cout<<"\n--------------";
    cout<<"\nSerial Time :"<<(end_s-start_s).seconds()<<" Seconds";
    cout<<"\nParallel time :"<<(end_p-start_p).seconds()<<" Seconds\n";

    return 0;
}

嗯!它在装有 Core i5 的 Windows 机器上运行良好。它为输出中的每个参数提供了完全相同的值，而且并行代码比串行代码快得多。这是我的输出:

操作系统:Windows 7 Ultimate 64 位 处理器:core i5

Serial Part
-----------
Mean of a :1.81203e-05
Mean of b :1.0324e-05
Standard deviation of a :0.707107
Standard deviation of b :0.707107
Pearson Correlation Coefficient: 3.65091e-07

Parallel Part
-------------
Mean of a :1.81203e-05
Mean of b :1.0324e-05
Standard deviation of a :0.707107
Standard deviation of b :0.707107
Pearson Correlation Coefficient: 3.65091e-07

Time Estimates
--------------
Serial Time : 0.0204829 Seconds
Parallel Time : 0.00939971 Seconds

那么其他机器呢？如果我说它会正常工作，那么至少我的一些朋友会说“等等，伙计!有些可疑。”尽管并行代码总是比串行代码快，但在不同机器上，并行代码和串行代码生成的答案之间存在细微差别。那么是什么造成了这些差异呢？我们得出的结论是，这种异常行为源于过度并行和处理器架构差异所带来的舍入误差。

这引出了我的问题:

在代码中使用并行处理库以利用多核处理器时，我们需要注意什么？
在哪些情况下，即使有多个处理器可用，我们也不应该使用并行方法？
为了避免舍入误差，我们能做的最好的事情是什么？(需要说明的是，我指的不是强制使用互斥锁和屏障——它们有时会限制并行性的范围——而是一些有时可以派上用场的简单编程技巧)

很高兴看到您对这些问题的建议。如果您时间有限，请随意选择最适合您的部分来回答。

编辑 - 我在此处包含了更多结果

操作系统 : Linux Ubuntu 64 位 处理器 : core i5

    Serial Part
    -----------
    Mean of a :1.81203e-05
    Mean of b :1.0324e-05
    Standard deviation of a :0.707107
    Standard deviation of b :0.707107
    Pearson Correlation Coefficient: 3.65091e-07

    Parallel Part
    -------------
    Mean of a :-0.000233041
    Mean of b :0.00414375
    Standard deviation of a :2.58428
    Standard deviation of b :54.6333
    Pearson Correlation Coefficient: -0.000538456

    Time Estimates
    --------------
    Serial Time :0.0161237 Seconds
    Parallel Time :0.0103125 Seconds

操作系统 : Linux Fedora 64 位 处理器 : core i3

Serial Part
-----------
Mean of a :1.81203e-05
Mean of b :1.0324e-05
Standard deviation of a :0.707107
Standard deviation of b :0.707107
Pearson Correlation Coefficient: 3.65091e-07

Parallel Part
-------------
Mean of a :-0.00197118
Mean of b :0.00124329
Standard deviation of a :0.707783
Standard deviation of b :0.703951
Pearson Correlation Coefficient: -0.129055

Time Estimates
--------------
Serial Time :0.02257 Seconds
Parallel Time :0.0107966 Seconds

编辑:在 timday 建议的更改之后

操作系统 :Linux Ubuntu 64位 处理器 : corei5

Serial Part
-----------
Mean of a :1.81203e-05
Mean of b :1.0324e-05
Standard deviation of a :0.707107
Standard deviation of b :0.707107
Pearson Correlation Coefficient: 3.65091e-07

Parallel Part
-------------
Mean of a :-0.000304446
Mean of b :0.00172593
Standard deviation of a :0.708465
Standard deviation of b :0.7039
Pearson Correlation Coefficient: -0.140716

Time Estimates
--------------
Serial Time :0.0235391 Seconds
Parallel time :0.00810775 Seconds

最好的问候。

注 1:我不保证上面这段代码是正确的，但我相信它是正确的。

注 2:这段代码也在 Linux 机器上进行了测试。

注 3:尝试了不同的粒度组合和自动分区选项。

最佳答案

我对 compute_mean( compute_mean& x, split) 构造函数中注释掉的 /*,mean_a(0),mean_b(0)*/ 深表怀疑。似乎您的差异可能是由于未初始化的数据污染了结果。我猜想在您获得一致结果的机器上，没有发生任务拆分，或者这些成员恰好位于零内存上。

同样，您的 compute_sd( compute_sd& x, split) 未初始化 store3 和 store4。

关于c++ - 使用 TBB 的并行性——我们的 list 中应该包含什么？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/12502370/

有关c++ - 使用 TBB 的并行性——我们的 list 中应该包含什么？的更多相关文章

ruby - 如何使用 Nokogiri 的 xpath 和 at_xpath 方法 - 2
我正在学习如何使用Nokogiri，根据这段代码我遇到了一些问题:require'rubygems'require'mechanize'post_agent=WWW::Mechanize.newpost_page=post_agent.get('http://www.vbulletin.org/forum/showthread.php?t=230708')puts"\nabsolutepathwithtbodygivesnil"putspost_page.parser.xpath('/html/body/div/div/div/div/div/table/tbody/tr/td/div
ruby - 使用 RubyZip 生成 ZIP 文件时设置压缩级别 - 2
我有一个Ruby程序，它使用rubyzip压缩XML文件的目录树。gem。我的问题是文件开始变得很重，我想提高压缩级别，因为压缩时间不是问题。我在rubyzipdocumentation中找不到一种为创建的ZIP文件指定压缩级别的方法。有人知道如何更改此设置吗？是否有另一个允许指定压缩级别的Ruby库？最佳答案这是我通过查看rubyzip内部创建的代码。level=Zlib::BEST_COMPRESSIONZip::ZipOutputStream.open(zip_file)do|zip|Dir.glob("**/*")d
ruby - 为什么我可以在 Ruby 中使用 Object#send 访问私有(private)/ protected 方法？ - 2
类classAprivatedeffooputs:fooendpublicdefbarputs:barendprivatedefzimputs:zimendprotecteddefdibputs:dibendendA的实例a=A.new测试a.foorescueputs:faila.barrescueputs:faila.zimrescueputs:faila.dibrescueputs:faila.gazrescueputs:fail测试输出failbarfailfailfail.发送测试[:foo,:bar,:zim,:dib,:gaz].each{|m|a.send(m)resc
ruby-on-rails - 使用 Ruby on Rails 进行自动化测试 - 最佳实践 - 2
很好奇，就使用rubyonrails自动化单元测试而言，你们正在做什么？您是否创建了一个脚本来在cron中运行rake作业并将结果邮寄给您？git中的预提交Hook？只是手动调用？我完全理解测试，但想知道在错误发生之前捕获错误的最佳实践是什么。让我们理所当然地认为测试本身是完美无缺的，并且可以正常工作。下一步是什么以确保他们在正确的时间将可能有害的结果传达给您？最佳答案不确定您到底想听什么，但是有几个级别的自动代码库控制:在处理某项功能时，您可以使用类似autotest的内容获得关于哪些有效，哪些无效的即时反馈。要确保您的提
ruby - 在 Ruby 中使用匿名模块 - 2
假设我做了一个模块如下:m=Module.newdoclassCendend三个问题:除了对m的引用之外，还有什么方法可以访问C和m中的其他内容？我可以在创建匿名模块后为其命名吗(就像我输入“module...”一样)？如何在使用完匿名模块后将其删除，使其定义的常量不再存在？最佳答案三个答案:是的，使用ObjectSpace.此代码使c引用你的类(class)C不引用m:c=nilObjectSpace.each_object{|obj|c=objif(Class===objandobj.name=~/::C$/)}当然这取决于
ruby - 使用 ruby 和 savon 的 SOAP 服务 - 2
我正在尝试使用ruby和Savon来使用网络服务。测试服务为http://www.webservicex.net/WS/WSDetails.aspx?WSID=9&CATID=2require'rubygems'require'savon'client=Savon::Client.new"http://www.webservicex.net/stockquote.asmx?WSDL"client.get_quotedo|soap|soap.body={:symbol=>"AAPL"}end返回SOAP异常。检查soap信封，在我看来soap请求没有正确的命名空间。任何人都可以建议我
python - 如何使用 Ruby 或 Python 创建一系列高音调和低音调的蜂鸣声？ - 2
关闭。这个问题是opinion-based.它目前不接受答案。想要改进这个问题？更新问题，以便editingthispost可以用事实和引用来回答它.关闭4年前。Improvethisquestion我想在固定时间创建一系列低音和高音调的哔哔声。例如:在150毫秒时发出高音调的蜂鸣声在151毫秒时发出低音调的蜂鸣声200毫秒时发出低音调的蜂鸣声250毫秒的高音调蜂鸣声有没有办法在Ruby或Python中做到这一点？我真的不在乎输出编码是什么(.wav、.mp3、.ogg等等)，但我确实想创建一个输出文件。
ruby-on-rails - Rails - 子类化模型的设计模式是什么？ - 2
我有一个模型:classItem项目有一个属性“商店”基于存储的值，我希望Item对象对特定方法具有不同的行为。Rails中是否有针对此的通用设计模式？如果方法中没有大的if-else语句，这是如何干净利落地完成的？最佳答案通常通过Single-TableInheritance. 关于ruby-on-rails-Rails-子类化模型的设计模式是什么？，我们在StackOverflow上找到一个类似的问题： https://stackoverflow.co
ruby-on-rails - 'compass watch' 是如何工作的/它是如何与 rails 一起使用的 - 2
我在我的项目目录中完成了compasscreate.和compassinitrails。几个问题:我已将我的.sass文件放在public/stylesheets中。这是放置它们的正确位置吗？当我运行compasswatch时，它不会自动编译这些.sass文件。我必须手动指定文件:compasswatchpublic/stylesheets/myfile.sass等。如何让它自动运行？文件ie.css、print.css和screen.css已放在stylesheets/compiled。如何在编译后不让它们重新出现的情况下删除它们？我自己编译的.sass文件编译成compiled/t
ruby - 使用 ruby 将 HTML 转换为纯文本并维护结构/格式 - 2
我想将html转换为纯文本。不过，我不想只删除标签，我想智能地保留尽可能多的格式。为插入换行符标签，检测段落并格式化它们等。输入非常简单，通常是格式良好的html(不是整个文档，只是一堆内容，通常没有anchor或图像)。我可以将几个正则表达式放在一起，让我达到80%，但我认为可能有一些现有的解决方案更智能。最佳答案首先，不要尝试为此使用正则表达式。很有可能你会想出一个脆弱/脆弱的解决方案，它会随着HTML的变化而崩溃，或者很难管理和维护。您可以使用Nokogiri快速解析HTML并提取文本:require'nokogiri'h

c++ - 使用 TBB 的并行性——我们的 list 中应该包含什么？

有关c++ - 使用 TBB 的并行性——我们的 list 中应该包含什么？的更多相关文章

随机推荐