c - 多线程对速度没有改进 - 在 C 中使用 pthread

c - 多线程对速度没有改进 - 在 C 中使用 pthread - 为什么？

转载作者：太空宇宙更新时间：2023-11-04 05:50:09

为了更好地熟悉多线程，我编写了一个带有“密集型”计算的小 C 程序。它生成 mandelbrot 集的图片：每个像素单独计算，然后把像素缓冲成行。每个线程分到相等份额的行数。例如，如果选择的线程数为两个，那么高度为 1000 行的图片应该被分成两个各 500 行的块来计算。因此我预期耗时会减少一半，但实际上没有任何改善。为什么？？？我不明白，因为一切都能正常运行，而且看起来合乎逻辑。如果有人可以给我提示，我将不胜感激。下面您可以看到 main 函数，以及一个由 main 调用、用于计算 mandelbrot 集的函数。

/*
 * Renders the Mandelbrot set as a 24-bit BMP on stdout, splitting the image
 * rows across worker threads (myDo2), and prints the elapsed wall-clock time
 * in microseconds on stderr.
 *
 * Arguments: argv[1] = width, argv[2] = height, argv[3] = number of threads.
 *
 * Returns 0 on success, 1 on bad arguments or when a thread cannot be
 * created, 2 when a thread cannot be joined.
 */
int main(int argc, char ** argv, char ** envp) {

    if (argc != 4)
    {
        printf("Bitte genau 3 Argumente eingeben.\n");
        return 1;
    }
    (void)envp;

    /* Timestamps for the stopwatch. */
    struct timeval start, ende;

    width  = str2num(argv[1]);
    height = str2num(argv[2]);
    int threads = str2num(argv[3]);

    /* Zero or negative values would lead to divisions by zero below. */
    if (width < 1 || height < 1 || threads < 1) {
        fprintf(stderr, "width, height and thread count must all be positive\n");
        return 1;
    }
    /* A thread needs at least one row, so never use more threads than rows. */
    if (threads > height) {
        threads = height;
    }

    int i;
    int y;

    unsigned char info[BMPHEADER_SIZE] = {
                  //size
        'B','M',  0,0,0,0, 0,0, 0,0, 54,0,0,0,
                  //width  //height
        40,0,0,0, 0,0,0,0, 0,0,0,0,  1,0, 24,0,
                  // datasize
        0,0,0,0,  0,0,0,0
    };

    // BMP lines must be of lengths divisible by 4
    char span[4] = {0};
    int spanBytes = (4 - ((width * 3) % 4)) % 4;
    int psize = ((width * 3) + spanBytes) * height;

    /*
     * Fill in the header fields with memcpy: the offsets are not int-aligned
     * and the array is unsigned char, so storing through an int pointer
     * would be a misaligned, aliasing-violating access.
     */
    int fileSize  = BMPHEADER_SIZE + psize;
    int bmpWidth  = width;
    int bmpHeight = height;
    memcpy(&info[2],  &fileSize,  sizeof fileSize);
    memcpy(&info[18], &bmpWidth,  sizeof bmpWidth);
    memcpy(&info[22], &bmpHeight, sizeof bmpHeight);
    memcpy(&info[34], &psize,     sizeof psize);

    write(1, (char *) info, BMPHEADER_SIZE);

    /*
     * Split the rows into one slice per thread. Every slice has `blocksize`
     * rows; the last one additionally takes the `reminder` rows left over by
     * the integer division.
     */
    int reminder  = height % threads;
    int blocksize = height / threads;
    int rounds    = threads;
    int begin     = 1;

    threadinfo *tinfoptr = getptr(rounds);
    if (tinfoptr == NULL) {
        fprintf(stderr, "Error allocating thread info\n");
        return 1;
    }

    for (i = 0; i < rounds; ++i) {
        threadinfo *ti = &tinfoptr[i];
        int res = blocksize * (i + 1);
        if (i == rounds - 1) {
            res += reminder;
        }

        ti->from = begin;
        ti->to = res;
        ti->span = span;
        ti->spanBytes = spanBytes;
        ti->width = width;
        ti->height = res - begin + 1;
        ti->results = NULL;
        ti->threadno = i + 1;
        ti->blocksizeperthread = -1;

        /* The next slice starts on the row after this slice's last row. */
        begin = res + 1;
    }

    fprintf(stderr,"inti abgeschlossen, starte threads\n");

    /* Start the stopwatch before the workers start computing. */
    gettimeofday(&start, NULL);

    pthread_t myThread[rounds];
    for (i = 0; i < rounds; ++i) {
        fprintf(stderr,"Rufe Thread %d auf\n", i + 1);
        if (pthread_create(&myThread[i], NULL, myDo2, (void*)&tinfoptr[i])) {
            fprintf(stderr, "Error creating thread\n");
            return 1;
        }
    }

    for (i = 0; i < rounds; ++i) {
        if (pthread_join(myThread[i], NULL)) {
            fprintf(stderr, "Error joining thread\n");
            return 2;
        }
    }
    /* Stop the stopwatch once every worker has finished. */
    gettimeofday(&ende, NULL);

    /* All workers are done: emit their rows in thread order. */
    for (i = 0; i < rounds; i++) {
        threadinfo *ti = &tinfoptr[i];
        /* myDo2 stores the index of its last row here; -1 means no rows. */
        int lastRow = ti->blocksizeperthread;
        for (y = 0; y <= lastRow; y++) {
            write(1, ti->results[y], width * 3);
            // BMP lines must be of lengths divisible by 4
            write(1, span, spanBytes);
        }

        /* myDo2 allocates one buffer per row of its slice plus the row table. */
        if (ti->results != NULL) {
            for (y = 0; y < ti->height; y++) {
                free(ti->results[y]);
            }
            free(ti->results);
            ti->results = NULL;
        }
    }

    /* Subtract field by field so the intermediate values stay small. */
    long elapsed = (long)(ende.tv_sec - start.tv_sec) * 1000000L
                 + (long)(ende.tv_usec - start.tv_usec);
    fprintf(stderr, "\nDauer: %ld Mikrosekunden\n", elapsed);

    return 0;
}

这里调用的函数是:

/*
 * Thread entry point: computes the Mandelbrot rows from..to (inclusive)
 * described by the threadinfo passed in tiptr.
 *
 * On success the row buffers are stored in ->results and the index of the
 * last computed row (row count minus one) in ->blocksizeperthread.
 * If memory cannot be allocated, everything allocated so far is released and
 * the threadinfo is left untouched.
 *
 * Always returns NULL.
 */
void* myDo2(void* tiptr){
    threadinfo* mythread = (threadinfo*)tiptr;
    /* Copy the parameters out of the struct into locals. */
    int l_from = mythread->from;
    int l_to = mythread->to;
    int l_width = mythread->width;
    int l_height = mythread->height;
    /* Three bytes (blue, green, red) per pixel. */
    size_t rowBytes = (size_t)l_width * 3;

    char **container = malloc(l_height * sizeof *container);
    if (container == NULL) {
        fprintf(stderr, "Error allocating row table\n");
        return NULL;
    }
    for (int i = 0; i < l_height; i++) {
        container[i] = malloc(rowBytes);
        if (container[i] == NULL) {
            fprintf(stderr, "Error allocating row buffer\n");
            while (i > 0) {
                free(container[--i]);
            }
            free(container);
            return NULL;
        }
    }

    int x,y;
    char iterate=0;
    Complex c    = {0,0};
    Complex newz = {0,0};
    float imageRelation = (float)l_width/(float)height;
    char blueGreenRed[3];
    int counter = 0;

    /* Never write past the l_height rows allocated above. */
    for (y = l_from; y <= l_to && counter < l_height; ++y)
    {
        /* Pixels are written straight into this row's result buffer. */
        char *row = container[counter];

        for (x = 1; x <= l_width; ++x) {
            Complex z = {0,0};
            float quad = 0;

            c.re = zoom * (-1.0 + imageRelation * ( (x-1.0) / (width-1.0)) );
            c.im = zoom * ( 0.5 - (y-1.0) / (height-1.0) );

            // iterate
            for ( iterate=1; iterate < colorLimit && quad < quadLimit; ++iterate ) {
                quad = z.re * z.re + z.im * z.im;

                newz.re = (z.re * z.re) - (z.im * z.im) + c.re;
                newz.im =  z.re * z.im * 2.0            + c.im;

                z = newz;
            }
            toRGB(iterate, blueGreenRed);
            /* Pixel x occupies bytes (x-1)*3 .. (x-1)*3+2 of the row. */
            memcpy(row + (x-1)*3, blueGreenRed, 3);
        }
        counter++;
    }

    mythread->blocksizeperthread = counter-1;
    mythread->results = container;
    fprintf(stderr, "Ich bin Thread-Nr. %d\n", mythread->threadno);
    fprintf(stderr, "und habe eine Menge Zeilen von %d\n", mythread->blocksizeperthread);
    fprintf(stderr, "und habe berechnet von %d\n", l_from);
    fprintf(stderr, "und habe berechnet bis %d\n", l_to);
    return NULL;
}

非常感谢，你的错误

最佳答案

简而言之，答案是该模型可以工作，但您需要为每个线程分配足够多的工作，才足以抵消启动、停止和同步线程的开销。并且您必须在能够同时运行多个线程的计算机(多核机器)上运行。

我采用了您提供的应用程序并对其进行了修改以实际编译。如果我在有许多可用 CPU 内核的 Linux 机器上运行它并且给 myDo2 工作线程足够的工作，那么我会看到类似于以下的结果:

./test width height num_threads
./test 10000 10000 1
Dauer: 17,660,185 Mikrosekunden

./test 10000 10000 2 
Dauer: 7,864,508 Mikrosekunden

./test 10000 10000 8
Dauer: 1,100,126 Mikrosekunden

这意味着使用 8 个线程时，总挂钟时间从 17.6 秒减少到 1.1 秒，这是一个超过 8 倍的改进(可能是由于更好的内存和缓存使用)。

但是，如果我给每个线程的工作太少，那么我的时间似乎并没有改善，实际上在某些时候会变得更糟。

./test 10 10 1
Dauer: 70 Mikrosekunden

./test 10 10 2
Dauer: 60 Mikrosekunden

./test 10 10 4 
Dauer: 205 Mikrosekunden

在这里您可以看到启动线程、然后停止线程并与该线程同步的开销大于线程内部完成的工作量。

所以编程模型可以工作，但您需要正确使用它。

我在 RedHat 上使用以下命令编译了下面的代码:

gcc -std=gnu99 test.c -o test -l pthread

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <pthread.h>
#include <string.h>

/*
 * Work order for one worker thread, plus the slots in which the thread
 * leaves its output. main fills in the inputs; myDo2 fills in the outputs.
 */
typedef struct threadinfo
{
    int from;                 /* first row to compute (inclusive) */
    int to;                   /* last row to compute (inclusive) */
    int width;                /* pixels per row */
    int height;               /* number of rows in this thread's slice */
    int blocksizeperthread;   /* output: rows computed minus one; -1 until set */
    char **results;           /* output: one buffer of width*3 bytes per row; NULL until set */
    int threadno;             /* thread number, used in log messages */
} threadinfo;

/* A complex number in single precision. */
typedef struct cplx
{
    float re;   /* real part */
    float im;   /* imaginary part */
} Complex;

/*
 * Thread entry point: computes the Mandelbrot rows from..to (inclusive)
 * described by the threadinfo passed in tiptr.
 *
 * On success the row buffers are stored in ->results and the index of the
 * last computed row (row count minus one) in ->blocksizeperthread.
 * If memory cannot be allocated, everything allocated so far is released and
 * the threadinfo is left untouched.
 *
 * Always returns NULL.
 */
void* myDo2( void *tiptr )
{
    threadinfo *mythread = (threadinfo *)tiptr;
    /* Copy the parameters out of the struct into locals. */
    int l_from = mythread->from;
    int l_to = mythread->to;
    int l_width = mythread->width;
    int l_height = mythread->height;
    /* Three bytes (blue, green, red) per pixel. */
    size_t rowBytes = (size_t)l_width * 3;

    char **container = malloc(l_height * sizeof *container);
    if (container == NULL)
    {
        fprintf(stderr, "Error allocating row table\n");
        return NULL;
    }
    for (int i = 0; i < l_height; i++)
    {
        container[i] = malloc(rowBytes);
        if (container[i] == NULL)
        {
            fprintf(stderr, "Error allocating row buffer\n");
            while (i > 0)
            {
                free(container[--i]);
            }
            free(container);
            return NULL;
        }
    }

    int x, y;
    char iterate = 0;
    Complex c    = { 0, 0 };
    Complex newz = { 0, 0 };
    /*
     * NOTE(review): the aspect ratio and the imaginary coordinate below are
     * scaled by this thread's own row count (l_height), because threadinfo
     * carries no total image height. Confirm whether that is intended.
     */
    float imageRelation = (float)l_width / (float)l_height;
    char blueGreenRed[3];
    int counter = 0;
    float zoom = 1.0;
    float colorLimit = 10.0;
    float quadLimit = 10.0;

    /* Never write past the l_height rows allocated above. */
    for (y = l_from; y <= l_to && counter < l_height; ++y)
    {
        /* Pixels are written straight into this row's result buffer. */
        char *row = container[counter];

        for (x = 1; x <= l_width; ++x)
        {
            Complex z = { 0, 0 };
            float quad = 0;

            c.re = zoom * (-1.0 + imageRelation * ((x - 1.0) / (l_width - 1.0)));
            c.im = zoom * (0.5 - (y - 1.0) / (l_height - 1.0));

            // iterate
            for (iterate = 1; iterate < colorLimit && quad < quadLimit; ++iterate)
            {
                quad = z.re * z.re + z.im * z.im;

                newz.re = (z.re * z.re) - (z.im * z.im) + c.re;
                newz.im =  z.re * z.im * 2.0            + c.im;

                z = newz;
            }
            /*
             * No toRGB() is available in this listing, so use the iteration
             * count as a grey value. This gives the pixel defined bytes and
             * keeps the computed result in use.
             */
            memset(blueGreenRed, iterate, sizeof blueGreenRed);
            /* Pixel x occupies bytes (x-1)*3 .. (x-1)*3+2 of the row. */
            memcpy(row + (x - 1) * 3, blueGreenRed, 3);
        }
        counter++;
    }

    mythread->blocksizeperthread = counter - 1;
    mythread->results = container;
    fprintf(stderr, "Ich bin Thread-Nr. %d\n", mythread->threadno);
    fprintf(stderr, "und habe eine Menge Zeilen von %d\n", mythread->blocksizeperthread);
    fprintf(stderr, "und habe berechnet von %d\n", l_from);
    fprintf(stderr, "und habe berechnet bis %d\n", l_to);
    return NULL;
}

/*
 * Benchmark driver: splits `height` rows across worker threads running
 * myDo2 and prints the elapsed wall-clock time in microseconds on stderr.
 *
 * Arguments: argv[1] = width, argv[2] = height, argv[3] = number of threads.
 *
 * Returns 0 on success, 1 on bad arguments, allocation failure or when a
 * thread cannot be created, 2 when a thread cannot be joined.
 */
int main(int argc, char **argv, char **envp)
{
    (void)envp;

    if (argc != 4)
    {
        printf("Bitte genau 3 Argumente eingeben.\n");
        return 1;
    }

    int width   = atoi(argv[1]);
    int height  = atoi(argv[2]);
    int threads = atoi(argv[3]);

    /* atoi yields 0 for non-numeric input; zero would also divide by zero below. */
    if (width < 1 || height < 1 || threads < 1)
    {
        fprintf(stderr, "width, height and thread count must all be positive\n");
        return 1;
    }
    /* A thread needs at least one row, so never use more threads than rows. */
    if (threads > height)
    {
        threads = height;
    }

    /*
     * Split the rows into one slice per thread. Every slice has `blocksize`
     * rows; the last one additionally takes the `reminder` rows left over by
     * the integer division.
     */
    int i;
    int reminder  = height % threads;
    int blocksize = height / threads;
    int rounds    = threads;
    int begin     = 1;

    threadinfo *tinfoptr = malloc(sizeof *tinfoptr * rounds);
    if (tinfoptr == NULL)
    {
        fprintf(stderr, "Error allocating thread info\n");
        return 1;
    }

    for (i = 0; i < rounds; ++i)
    {
        threadinfo *ti = &tinfoptr[i];
        int res = blocksize * (i + 1);
        if (i == rounds - 1)
        {
            res += reminder;
        }

        ti->from = begin;
        ti->to = res;
        ti->width = width;
        ti->height = res - begin + 1;
        ti->results = NULL;
        ti->threadno = i + 1;
        ti->blocksizeperthread = -1;

        /* The next slice starts on the row after this slice's last row. */
        begin = res + 1;
    }

    fprintf(stderr, "inti abgeschlossen, starte threads\n");

    /* Timestamps for the stopwatch; start it before the workers start. */
    struct timeval start, ende;
    gettimeofday(&start, NULL);

    pthread_t myThread[rounds];
    for (i = 0; i < rounds; ++i)
    {
        fprintf(stderr, "Rufe Thread %d auf\n", i + 1);
        if (pthread_create(&myThread[i], NULL, myDo2, (void *)&tinfoptr[i]))
        {
            fprintf(stderr, "Error creating thread\n");
            return 1;
        }
    }

    for (i = 0; i < rounds; ++i)
    {
        if (pthread_join(myThread[i], NULL))
        {
            fprintf(stderr, "Error joining thread\n");
            return 2;
        }
    }
    /* Stop the stopwatch once every worker has finished. */
    gettimeofday(&ende, NULL);

    /* Subtract field by field so the intermediate values stay small. */
    long elapsed = (long)(ende.tv_sec - start.tv_sec) * 1000000L
                 + (long)(ende.tv_usec - start.tv_usec);
    fprintf(stderr, "\nDauer: %ld Mikrosekunden\n", elapsed);

    /* myDo2 allocates one buffer per row of its slice plus the row table. */
    for (i = 0; i < rounds; ++i)
    {
        threadinfo *ti = &tinfoptr[i];
        if (ti->results != NULL)
        {
            for (int row = 0; row < ti->height; ++row)
            {
                free(ti->results[row]);
            }
            free(ti->results);
        }
    }
    free(tinfoptr);

    return 0;
}

关于c - 多线程对速度没有改进 - 在 C 中使用 pthread - 为什么？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/44007554/

文章推荐： linux - Bash 脚本重命名子目录中的文件 - 基于文件名

文章推荐： linux -/etc/fstab 和/etc/mtab 有什么区别

文章推荐： c - 如何在这个函数中正确使用断言？

c++ - C c;之间有什么区别吗？和 C c = C();?
#include using namespace std; class C{ private: int value; public: C(){ value = 0;
c++ - C 风格字符串差异 : C/C++
这个问题已经有答案了: What is the difference between char a[] = ?string?; and char *p = ?string?;? (8 个回答) 已关闭
c++ - c\c++ 转换为 C#
关闭。此题需要details or clarity 。目前不接受答案。想要改进这个问题吗？通过 editing this post 添加详细信息并澄清问题. 已关闭 7 年前。此帖子已于 8 个月
c# - C、C++、C# 的功能测试工具
除了调试之外，是否有任何针对 c、c++ 或 c# 的测试工具，其工作原理类似于将独立函数复制粘贴到某个文本框，然后在其他文本框中输入参数？最佳答案也许您会考虑单元测试。我推荐你谷歌测试和谷歌模拟
c# - C/C++/C# 在监视器上设置窗口位置
我想在第二台显示器中移动一个窗口 (HWND)。问题是我尝试了很多方法，例如将分辨率加倍或输入负值，但它永远无法将窗口放在我的第二台显示器上。关于如何在 C/C++/c# 中执行此操作的任何线索最
c# - C/C++/C#中的DES实现
我正在寻找 C/C++/C## 中不同类型 DES 的现有实现。我的运行平台是Windows XP/Vista/7。我正在尝试编写一个 C# 程序，它将使用 DES 算法进行加密和解密。我需要一些实
c# - 在条件中使用赋值是否安全？ C/C++、C#
很难说出这里要问什么。这个问题模棱两可、含糊不清、不完整、过于宽泛或夸夸其谈，无法以目前的形式得到合理的回答。如需帮助澄清此问题以便重新打开，visit the help center . 关闭 1
c++ - C/C++/C# 强制窗口在最上面
有没有办法强制将另一个窗口置于顶部？不是应用程序的窗口，而是另一个已经在系统上运行的窗口。 (Windows, C/C++/C#) 最佳答案 SetWindowPos(that_window_ha
c# - 套接字服务器应用程序的选择 : C/C++ or C#
假设您可以在 C/C++ 或 Csharp 之间做出选择，并且您打算在 Windows 和 Linux 服务器上运行同一服务器的多个实例，那么构建套接字服务器应用程序的最明智选择是什么？最佳答案如
c++ - C/C++ 运行时库和 C/C++ 标准库的区别
你们能告诉我它们之间的区别吗？顺便问一下，有什么叫C++库或C库的吗？最佳答案 C++ 标准库和 C 标准库是 C++ 和 C 标准定义的库，提供给 C++ 和 C 程序使用。那是那些词的共同
c++ - &C::c 和 &(C::c) 有什么区别？
下面的测试代码，我将输出信息放在注释中。我使用的是 gcc 4.8.5 和 Centos 7.2。 #include #include class C { public:
c++ - 什么 C++(通用 (c/c++) 与 (通用 c)/c++ )
很难说出这里问的是什么。这个问题是含糊的、模糊的、不完整的、过于宽泛的或修辞性的，无法以目前的形式得到合理的回答。如需帮助澄清此问题以便重新打开它，visit the help center 。已关
c# - 通过网络在 C/C++ 服务器、C/C++ 和 C# 客户端之间发送数据结构
我的客户将使用名为 annoucement 的结构/类与客户通信。我想我会用 C++ 编写服务器。会有很多不同的类继承annoucement。我的问题是通过网络将这些类发送给客户端我想也许我应该使用
c# - C/C++ - 如何将 Buffer.BlockCopy (C#) 转换为 C/C++
我在 C# 中有以下函数: public Matrix ConcatDescriptors(IList> descriptors) { int cols = descriptors[0].Co
c++ - C/C++ - 对其他人隐藏 C 或 C++ 函数代码
我有一个项目要编写一个函数来对某些数据执行某些操作。我可以用 C/C++ 编写代码，但我不想与雇主共享该函数的代码。相反，我只想让他有权在他自己的代码中调用该函数。是否可以？我想到了这两种方法 - 在
c# - 在托管代码(C++、C、C++/CLI、C#)中使用非托管代码时处理错误
我使用的是编写糟糕的第 3 方 (C/C++) Api。我从托管代码(C++/CLI)中使用它。有时会出现“访问冲突错误”。这使整个应用程序崩溃。我知道我无法处理这些错误[如果指针访问非法内存位置等，
c# - C#、C/C++ 或 Objective-C 中的眼动追踪库
关闭。这个问题不符合Stack Overflow guidelines .它目前不接受答案。我们不允许提问寻求书籍、工具、软件库等的推荐。您可以编辑问题，以便用事实和引用来回答。关闭 7 年前。
c++ - C/C++/Objective-C 文本识别库
已关闭。此问题不符合Stack Overflow guidelines 。目前不接受答案。要求我们推荐或查找工具、库或最喜欢的场外资源的问题对于 Stack Overflow 来说是偏离主题的，因为
c# - 将 C/C++ 函数导入 C#
我有一些 C 代码，将使用 P/Invoke 从 C# 调用。我正在尝试为这个 C 函数定义一个 C# 等效项。 SomeData* DoSomething(); struct SomeData {
c - C语言中 "c -= --c - c++;"的结果应该是什么？
这个问题已经有答案了: Why are these constructs using pre and post-increment undefined behavior? (14 个回答) 已关闭 6

太空宇宙

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c - 多线程对速度没有改进 - 在 C 中使用 pthread - 为什么？