gpt4 book ai didi

c# - 在双核上使用 Parallel.For... 实现 5 倍性能?

转载 作者:太空狗 更新时间:2023-10-29 20:05:34 24 4
gpt4 key购买 nike

我正在出于兴趣做一些实验性计算,这时遇到了一个有趣的结果:

Completed 1024x1024 pixels with 700 points in...
For Loop (Inline): 19636ms
For Loop: 12612ms
Parallel.For Loop: 3835ms

这不是我所期望的。

系统:Windows 7 64位,i3 2120 [双核,4线程],Visual Studio 2010。

构建:优化开启, Release模式 [无调试器],32 位。

其次值得关注的是令人失望的 64 位性能。虽然它在各项耗时的比率上更符合我的预期,但这是以整体变慢为代价实现的。

Completed 1024x1024 pixels with 700 points in...
For Loop (Inline): 23409ms
For Loop: 24373ms
Parallel.For Loop: 6839ms

计算很简单:对于索引 x 和 y,找到最接近的 Vector3 并将其存储在二维数组中。

如果你愿意接受挑战,问题是:请解释为什么内联的 for 循环这么慢。如果还能解释 64 位版本为何性能不佳,则有额外加分。

using System;
using System.Diagnostics;
using System.Threading.Tasks;

// Benchmark program from the question. For every cell (x, y) of a
// textureSize x textureSize grid it stores the point-cloud entry with the
// smallest squared distance to (x, y, 0), and times three variants of that work.
namespace TextureFromPoints
{
    class Program
    {
        // Number of random points generated for each pass of the benchmark.
        const int numPoints = 700;
        // Width and height of every result grid; also passed to the Vector3(float)
        // constructor as the scale of the random coordinates.
        const int textureSize = 1024;

        // Shared random source, read by the Vector3(float) constructor below.
        static Random rnd = new Random();

        /// <summary>
        /// Runs the benchmark in an endless loop. Each pass builds a fresh random
        /// point cloud, fills three result grids (search loop written directly in
        /// Main, loop calling <see cref="Computation"/>, and Parallel.For calling
        /// <see cref="Computation"/>), prints the three elapsed times, reports
        /// whether the grids are equal, and then waits for a line of console input.
        /// </summary>
        /// <param name="args">Command-line arguments; not read.</param>
        static void Main(string[] args)
        {
            // No exit condition: every pass ends by blocking on Console.ReadLine().
            while (true)
            {
                Console.WriteLine("Starting");
                Console.WriteLine();

                var pointCloud = new Vector3[numPoints];

                // Each entry gets three random components scaled by textureSize.
                for (int i = 0; i < numPoints; i++)
                    pointCloud[i] = new Vector3(textureSize);

                // One output grid per variant so the results can be compared afterwards.
                var result1 = new Vector3[textureSize, textureSize];
                var result2 = new Vector3[textureSize, textureSize];
                var result3 = new Vector3[textureSize, textureSize];

                // Variant 1 ("For Loop (Inline)"): the search is written out here in
                // Main instead of calling Computation; the statements are the same.
                var sw1 = Stopwatch.StartNew();
                for (int x = 0; x < textureSize; x++)
                    for (int y = 0; y < textureSize; y++)
                    {
                        var targetPos = new Vector3(x, y, 0);
                        var nearestV3 = pointCloud[0];
                        var nearestV3Distance = nearestV3.DistanceToPoint(targetPos);

                        for (int i = 1; i < numPoints; i++)
                        {
                            var currentV3 = pointCloud[i];
                            var currentV3Distance = currentV3.DistanceToPoint(targetPos);
                            // Strict '<': on equal distances the earlier index is kept.
                            if (currentV3Distance < nearestV3Distance)
                            {
                                nearestV3 = currentV3;
                                nearestV3Distance = currentV3Distance;
                            }
                        }
                        result1[x, y] = nearestV3;
                    }
                sw1.Stop();

                // Variant 2 ("For Loop"): same nested loops, one Computation call per cell.
                var sw2 = Stopwatch.StartNew();
                for (int x = 0; x < textureSize; x++)
                    for (int y = 0; y < textureSize; y++)
                        Computation(pointCloud, result2, x, y);
                sw2.Stop();


                // Variant 3 ("Parallel.For Loop"): the outer x loop is handed to
                // Parallel.For. The lambda captures the locals pointCloud and result3,
                // and each x iteration writes only the cells result3[x, *].
                var sw3 = Stopwatch.StartNew();

                Parallel.For(0, textureSize, x =>
                {
                    for (int y = 0; y < textureSize; y++)
                        Computation(pointCloud, result3, x, y);
                });
                sw3.Stop();

                Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints);
                Console.WriteLine("{0}: {1}ms", "For Loop (Inline)", sw1.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "For Loop", sw2.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "Parallel.For Loop", sw3.ElapsedMilliseconds);
                Console.WriteLine();
                Console.Write("Verifying Data: ");
                // "Valid" only when variants 2 and 3 both match variant 1 cell for cell.
                Console.WriteLine(CheckResults(result1, result2) && CheckResults(result1, result3) ? "Valid" : "Error");
                Console.WriteLine(); Console.WriteLine();
                Console.ReadLine();
            }
        }

        /// <summary>
        /// Compares two grids cell by cell over the range
        /// [0, textureSize) x [0, textureSize) using Vector3.Equals.
        /// </summary>
        /// <param name="lhs">First grid.</param>
        /// <param name="rhs">Second grid.</param>
        /// <returns>false at the first cell that differs; true if none differ.</returns>
        private static bool CheckResults(Vector3[,] lhs, Vector3[,] rhs)
        {
            // The bounds come from the textureSize constant, not from the arrays themselves.
            for (int x = 0; x < textureSize; x++)
                for (int y = 0; y < textureSize; y++)
                    if (!lhs[x, y].Equals(rhs[x, y]))
                        return false;
            return true;
        }

        /// <summary>
        /// Finds the entry of <paramref name="pointCloud"/> with the smallest
        /// DistanceToPoint value to (x, y, 0) and stores it in result[x, y].
        /// </summary>
        /// <param name="pointCloud">Candidate points; indices 0 to numPoints - 1 are read.</param>
        /// <param name="result">Grid that receives the chosen point at [x, y].</param>
        /// <param name="x">First grid index, also used as the target's x coordinate.</param>
        /// <param name="y">Second grid index, also used as the target's y coordinate.</param>
        private static void Computation(Vector3[] pointCloud, Vector3[,] result, int x, int y)
        {
            var targetPos = new Vector3(x, y, 0);
            // Start with entry 0 as the best candidate, then scan the rest.
            var nearestV3 = pointCloud[0];
            var nearestV3Distance = nearestV3.DistanceToPoint(targetPos);

            // Loop bound is the numPoints constant rather than pointCloud.Length.
            for (int i = 1; i < numPoints; i++)
            {
                var currentV3 = pointCloud[i];
                var currentV3Distance = currentV3.DistanceToPoint(targetPos);
                // Strict '<': on equal distances the earlier index is kept.
                if (currentV3Distance < nearestV3Distance)
                {
                    nearestV3 = currentV3;
                    nearestV3Distance = currentV3Distance;
                }
            }
            result[x, y] = nearestV3;
        }

        /// <summary>Three-component float vector with public mutable fields.</summary>
        struct Vector3
        {
            public float x;
            public float y;
            public float z;

            /// <summary>Creates a vector from explicit components.</summary>
            public Vector3(float x, float y, float z)
            {
                this.x = x;
                this.y = y;
                this.z = z;
            }

            /// <summary>
            /// Creates a vector whose components are each an independent
            /// rnd.NextDouble() value, cast to float and multiplied by
            /// <paramref name="randomDistance"/>.
            /// </summary>
            public Vector3(float randomDistance)
            {
                this.x = (float)rnd.NextDouble() * randomDistance;
                this.y = (float)rnd.NextDouble() * randomDistance;
                this.z = (float)rnd.NextDouble() * randomDistance;
            }

            /// <summary>Component-wise difference a - b, returned as a new vector.</summary>
            public static Vector3 operator -(Vector3 a, Vector3 b)
            {
                return new Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
            }

            /// <summary>Sum of the squared components (squared length).</summary>
            public float sqrMagnitude()
            {
                return x * x + y * y + z * z;
            }

            /// <summary>
            /// Squared length of (this - point). Despite the name, no square root
            /// is taken, so the value is a squared distance.
            /// </summary>
            public float DistanceToPoint(Vector3 point)
            {
                return (this - point).sqrMagnitude();
            }
        }
    }
}

更新:感谢 Drew Marsh 的努力,我们现在有了这个内联所有 V3 操作的 super 优化版本。

using System;
using System.Diagnostics;
using System.Threading.Tasks;

// Revised benchmark from the question's update: each timed variant lives in
// its own method, and all vector arithmetic is written out by hand with field
// accesses (this version's Vector3 has constructors only).
namespace TextureFromPoints
{
    class RevisedProgram
    {
        // Number of random points generated for each pass of the benchmark.
        const int numPoints = 700;
        // Width and height of every result grid; also passed to the Vector3(float)
        // constructor as the scale of the random coordinates.
        const int textureSize = 1024;

        // Shared random source, read by the Vector3(float) constructor below.
        static Random rnd = new Random();

        /// <summary>
        /// Runs the benchmark in an endless loop. Each pass builds a fresh random
        /// point cloud, runs Inline, NotInline and Parallelized on separate result
        /// grids, prints the three elapsed times, reports whether the grids are
        /// equal, and then waits for a line of console input.
        /// </summary>
        /// <param name="args">Command-line arguments; not read.</param>
        static void Main(string[] args)
        {
            // No exit condition: every pass ends by blocking on Console.ReadLine().
            while (true)
            {
                Console.WriteLine("Starting REVISED");
                Console.WriteLine();

                var pointCloud = new Vector3[numPoints];

                // Each entry gets three random components scaled by textureSize.
                for (int i = 0; i < numPoints; i++)
                    pointCloud[i] = new Vector3(textureSize);

                // One output grid per variant so the results can be compared afterwards.
                var result1 = new Vector3[textureSize, textureSize];
                var result2 = new Vector3[textureSize, textureSize];
                var result3 = new Vector3[textureSize, textureSize];

                // Each helper returns the stopped Stopwatch that timed its own work.
                var sw1 = Inline(pointCloud, result1);

                var sw2 = NotInline(pointCloud, result2);

                var sw3 = Parallelized(pointCloud, result3);

                Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints);
                Console.WriteLine("{0}: {1}ms", "For Loop (Inline)", sw1.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "For Loop", sw2.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "Parallel.For Loop", sw3.ElapsedMilliseconds);
                Console.WriteLine();
                Console.Write("Verifying Data: ");
                // "Valid" only when variants 2 and 3 both match variant 1 cell for cell.
                Console.WriteLine(CheckResults(result1, result2) && CheckResults(result1, result3) ? "Valid" : "Error");
                Console.WriteLine();
                Console.WriteLine();
                Console.ReadLine();
            }
        }

        /// <summary>
        /// Fills <paramref name="result3"/> by running the outer x loop through
        /// Parallel.For and calling <see cref="Computation"/> for every cell.
        /// </summary>
        /// <returns>The stopped Stopwatch that timed the Parallel.For call.</returns>
        private static Stopwatch Parallelized(Vector3[] pointCloud, Vector3[,] result3)
        {
            var sw3 = Stopwatch.StartNew();

            // The lambda captures the two parameters; each x iteration writes only
            // the cells result3[x, *].
            Parallel.For(0, textureSize, x =>
            {
                for (int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result3, x, y);
            });
            sw3.Stop();
            return sw3;
        }

        /// <summary>
        /// Fills <paramref name="result2"/> with sequential nested loops that call
        /// <see cref="Computation"/> once per cell.
        /// </summary>
        /// <returns>The stopped Stopwatch that timed the loops.</returns>
        private static Stopwatch NotInline(Vector3[] pointCloud, Vector3[,] result2)
        {
            var sw2 = Stopwatch.StartNew();
            for (int x = 0; x < textureSize; x++)
                for (int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result2, x, y);
            sw2.Stop();
            return sw2;
        }

        /// <summary>
        /// Fills <paramref name="result1"/> with sequential nested loops whose body
        /// is the nearest-point search written out in place rather than a call to
        /// <see cref="Computation"/>.
        /// </summary>
        /// <returns>The stopped Stopwatch that timed the loops.</returns>
        private static Stopwatch Inline(Vector3[] pointCloud, Vector3[,] result1)
        {
            var sw1 = Stopwatch.StartNew();
            for (int x = 0; x < textureSize; x++)
                for (int y = 0; y < textureSize; y++)
                {
                    var targetPos = new Vector3(x, y, 0);
                    var nearestV3 = pointCloud[0];
                    // Difference vector and its squared length, spelled out field by field.
                    Vector3 temp1 = new Vector3(nearestV3.x - targetPos.x, nearestV3.y - targetPos.y, nearestV3.z - targetPos.z);
                    var nearestV3Distance = temp1.x * temp1.x + temp1.y * temp1.y + temp1.z * temp1.z;

                    for (int i = 1; i < numPoints; i++)
                    {
                        var currentV3 = pointCloud[i];
                        Vector3 temp2 = new Vector3(currentV3.x - targetPos.x, currentV3.y - targetPos.y, currentV3.z - targetPos.z);
                        var currentV3Distance = temp2.x * temp2.x + temp2.y * temp2.y + temp2.z * temp2.z;
                        // Strict '<': on equal distances the earlier index is kept.
                        if (currentV3Distance < nearestV3Distance)
                        {
                            nearestV3 = currentV3;
                            nearestV3Distance = currentV3Distance;
                        }
                    }
                    result1[x, y] = nearestV3;
                }
            sw1.Stop();
            return sw1;
        }

        /// <summary>
        /// Compares two grids cell by cell over the range
        /// [0, textureSize) x [0, textureSize) using Vector3.Equals.
        /// </summary>
        /// <returns>false at the first cell that differs; true if none differ.</returns>
        private static bool CheckResults(Vector3[,] lhs, Vector3[,] rhs)
        {
            // The bounds come from the textureSize constant, not from the arrays themselves.
            for (int x = 0; x < textureSize; x++)
                for (int y = 0; y < textureSize; y++)
                    if (!lhs[x, y].Equals(rhs[x, y]))
                        return false;
            return true;
        }

        /// <summary>
        /// Finds the entry of <paramref name="pointCloud"/> with the smallest squared
        /// distance to (x, y, 0) and stores it in result[x, y]. The subtraction and
        /// squared length are computed directly from the fields.
        /// </summary>
        /// <param name="pointCloud">Candidate points; indices 0 to numPoints - 1 are read.</param>
        /// <param name="result">Grid that receives the chosen point at [x, y].</param>
        /// <param name="x">First grid index, also used as the target's x coordinate.</param>
        /// <param name="y">Second grid index, also used as the target's y coordinate.</param>
        private static void Computation(Vector3[] pointCloud, Vector3[,] result, int x, int y)
        {
            var targetPos = new Vector3(x, y, 0);
            // Start with entry 0 as the best candidate, then scan the rest.
            var nearestV3 = pointCloud[0];
            Vector3 temp1 = new Vector3(nearestV3.x - targetPos.x, nearestV3.y - targetPos.y, nearestV3.z - targetPos.z);

            var nearestV3Distance = temp1.x * temp1.x + temp1.y * temp1.y + temp1.z * temp1.z;

            // Loop bound is the numPoints constant rather than pointCloud.Length.
            for (int i = 1; i < numPoints; i++)
            {
                var currentV3 = pointCloud[i];
                Vector3 temp2 = new Vector3(currentV3.x - targetPos.x, currentV3.y - targetPos.y, currentV3.z - targetPos.z);
                var currentV3Distance = temp2.x * temp2.x + temp2.y * temp2.y + temp2.z * temp2.z;
                // Strict '<': on equal distances the earlier index is kept.
                if (currentV3Distance < nearestV3Distance)
                {
                    nearestV3 = currentV3;
                    nearestV3Distance = currentV3Distance;
                }
            }
            result[x, y] = nearestV3;
        }


        /// <summary>
        /// Three-component float vector with public mutable fields. This version
        /// has constructors only; callers do the arithmetic on the fields.
        /// </summary>
        struct Vector3
        {
            public float x;
            public float y;
            public float z;

            /// <summary>Creates a vector from explicit components.</summary>
            public Vector3(float x, float y, float z)
            {
                this.x = x;
                this.y = y;
                this.z = z;
            }

            /// <summary>
            /// Creates a vector whose components are each an independent
            /// rnd.NextDouble() value, cast to float and multiplied by
            /// <paramref name="randomDistance"/>.
            /// </summary>
            public Vector3(float randomDistance)
            {
                this.x = (float)rnd.NextDouble() * randomDistance;
                this.y = (float)rnd.NextDouble() * randomDistance;
                this.z = (float)rnd.NextDouble() * randomDistance;
            }
        }
    }
}

结果如下:

x86

Completed 1024x1024 pixels with 700 points in...
For Loop (Inline): 3820ms
For Loop: 3962ms
Parallel.For Loop: 1681ms

x64

Completed 1024x1024 pixels with 700 points in...
For Loop (Inline): 10978ms
For Loop: 10924ms
Parallel.For Loop: 3073ms

所以好消息是我们可以大幅提高此代码的性能 - 并使单线程版本的运行速度与其并行表亲保持一致。

坏消息是,这意味着完全放弃 x64 并手动内联所有数学。

在这个阶段,我对编译器的性能感到非常失望 - 我希望它们会好得多。

结论

这真是既荒唐又可悲……虽然我们并不确切知道原因,但可以有根据地猜测这是由一个愚蠢的编译器造成的。仅仅把目标平台从 x64 改为 x86,再做一些手动内联,就能把 24 秒缩短到 3.8 秒,这并不是我所期望的。不过,我已经完成了正在编写的概念验证:借助一个简单的空间哈希,我可以在 0.7 秒内计算出一幅包含 70,000 个“点”的 1024 x 1024 图像——比我最初的 x64 场景快约 340000%,而且没有使用多线程或内联。因此,我已经接受了一个答案——迫切的需求已经消失,不过我仍会继续调查这个问题。

代码可从原帖中的链接(here、here)获取 - 作为副作用,它还会生成一幅漂亮的 Voronoi 图 :P

最佳答案

所有数据来自 8 核 i7、Win7、x64

令人惊讶的是,您确实获得了 5 倍的提升。不过,您编写的这个测试有一个问题:您把三种实现都放在了 Main 方法中,这迫使编译器为了满足 Parallel.For 所用闭包的需要,生成并同步维护一堆额外的代码(gobbledygook),而这妨碍了内联版本的执行。如果按如下方式把工作拆分开,您会发现三种实现的性能都明显提高……至少对于 x86 是这样:

x86 之前:

For Loop (Inline): 24313ms 
For Loop: 25236ms
Parallel.For Loop: 3840ms

x86 之后:

For Loop (Inline): 13007ms
For Loop: 13013ms
Parallel.For Loop: 2208ms

因此,看一下我的 x86 Parallel.For 结果,您会发现它的加速比约为 5.9 倍,并且每个版本在拆分成独立方法后都快得多。

接下来,值得注意的是,经过同样的更改后 x64 绝对没有任何增益。事实上,在 3 项测试中的 2 项测试中,每次运行的结果都略高。

x64 之前

For Loop (Inline): 24222ms
For Loop: 25197ms
Parallel.For Loop: 3810ms

x64 之后

For Loop (Inline): 25302ms
For Loop: 25209ms
Parallel.For Loop: 3821ms

除了人们一直想出这样的代码使 x64 JIT 看起来很糟糕之外,我没有直接的答案为什么 x64 会如此糟糕,所以也许其他人可以插话。

也就是说,对于这样的实现,还有一件事您可能需要研究:缓存行失效(cache line invalidation)。@StephenToub 撰写了一篇很棒的 MSDN 文章,解释了这一切。长话短说(tl;dr):因为您的所有数据都存储在同一个数组中,而拥有各自本地(L2)缓存的不同内核会修改该数组的不同部分,所以它们必须与数据区域和自己重叠的其他内核同步数据。如果不同内核正在处理的数组区段靠得太近,就会产生大量这样的同步,从而吞噬并行带来的收益。这篇文章展示了一种技术:在工作数组中分配足够的额外空间,把各内核要处理的实际数据区段隔开,这样当这些内核处理数据时,就不必使其他内核的缓存失效。这或许可以解释为什么我的 Parallel.For 循环只得到约 5.9 倍的加速,而不是更接近 8 倍。我敢打赌,如果您花功夫解决缓存行失效问题,还可以再挤出 10% 以上的性能。请记住,设置和协调并行工作总会有一些开销,因此您永远不会得到 100% 的完美加速。

这是您的程序的修订版,其中每种方法都分解为单独的方法:

using System;
using System.Diagnostics;
using System.Threading.Tasks;

// Revised benchmark from the answer: the same three variants as the original
// program, but each one is moved out of Main into its own method that returns
// the Stopwatch that timed it. Distances still go through Vector3.DistanceToPoint.
namespace TextureFromPoints
{
    class RevisedProgram
    {
        // Number of random points generated for each pass of the benchmark.
        const int numPoints = 700;
        // Width and height of every result grid; also passed to the Vector3(float)
        // constructor as the scale of the random coordinates.
        const int textureSize = 1024;

        // Shared random source, read by the Vector3(float) constructor below.
        static Random rnd = new Random();

        /// <summary>
        /// Runs the benchmark in an endless loop. Each pass builds a fresh random
        /// point cloud, runs Inline, NotInline and Parallelized on separate result
        /// grids, prints the three elapsed times, reports whether the grids are
        /// equal, and then waits for a line of console input.
        /// </summary>
        /// <param name="args">Command-line arguments; not read.</param>
        static void Main(string[] args)
        {
            // No exit condition: every pass ends by blocking on Console.ReadLine().
            while(true)
            {
                Console.WriteLine("Starting REVISED");
                Console.WriteLine();

                var pointCloud = new Vector3[numPoints];

                // Each entry gets three random components scaled by textureSize.
                for(int i = 0; i < numPoints; i++)
                    pointCloud[i] = new Vector3(textureSize);

                // One output grid per variant so the results can be compared afterwards.
                var result1 = new Vector3[textureSize, textureSize];
                var result2 = new Vector3[textureSize, textureSize];
                var result3 = new Vector3[textureSize, textureSize];

                // Each helper returns the stopped Stopwatch that timed its own work.
                var sw1 = Inline(pointCloud, result1);

                var sw2 = NotInline(pointCloud, result2);


                var sw3 = Parallelized(pointCloud, result3);

                Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints);
                Console.WriteLine("{0}: {1}ms", "For Loop (Inline)", sw1.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "For Loop", sw2.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "Parallel.For Loop", sw3.ElapsedMilliseconds);
                Console.WriteLine();
                Console.Write("Verifying Data: ");
                // "Valid" only when variants 2 and 3 both match variant 1 cell for cell.
                Console.WriteLine(CheckResults(result1, result2) && CheckResults(result1, result3) ? "Valid" : "Error");
                Console.WriteLine();
                Console.WriteLine();
                Console.ReadLine();
            }
        }

        /// <summary>
        /// Fills <paramref name="result3"/> by running the outer x loop through
        /// Parallel.For and calling <see cref="Computation"/> for every cell.
        /// </summary>
        /// <returns>The stopped Stopwatch that timed the Parallel.For call.</returns>
        private static Stopwatch Parallelized(Vector3[] pointCloud, Vector3[,] result3)
        {
            var sw3 = Stopwatch.StartNew();

            // The lambda captures the two parameters; each x iteration writes only
            // the cells result3[x, *].
            Parallel.For(0, textureSize, x =>
            {
                for(int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result3, x, y);
            });
            sw3.Stop();
            return sw3;
        }

        /// <summary>
        /// Fills <paramref name="result2"/> with sequential nested loops that call
        /// <see cref="Computation"/> once per cell.
        /// </summary>
        /// <returns>The stopped Stopwatch that timed the loops.</returns>
        private static Stopwatch NotInline(Vector3[] pointCloud, Vector3[,] result2)
        {
            var sw2 = Stopwatch.StartNew();
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result2, x, y);
            sw2.Stop();
            return sw2;
        }

        /// <summary>
        /// Fills <paramref name="result1"/> with sequential nested loops whose body
        /// is the nearest-point search written out in place rather than a call to
        /// <see cref="Computation"/>; the statements are the same.
        /// </summary>
        /// <returns>The stopped Stopwatch that timed the loops.</returns>
        private static Stopwatch Inline(Vector3[] pointCloud, Vector3[,] result1)
        {
            var sw1 = Stopwatch.StartNew();
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                {
                    var targetPos = new Vector3(x, y, 0);
                    var nearestV3 = pointCloud[0];
                    var nearestV3Distance = nearestV3.DistanceToPoint(targetPos);

                    for(int i = 1; i < numPoints; i++)
                    {
                        var currentV3 = pointCloud[i];
                        var currentV3Distance = currentV3.DistanceToPoint(targetPos);
                        // Strict '<': on equal distances the earlier index is kept.
                        if(currentV3Distance < nearestV3Distance)
                        {
                            nearestV3 = currentV3;
                            nearestV3Distance = currentV3Distance;
                        }
                    }
                    result1[x, y] = nearestV3;
                }
            sw1.Stop();
            return sw1;
        }

        /// <summary>
        /// Compares two grids cell by cell over the range
        /// [0, textureSize) x [0, textureSize) using Vector3.Equals.
        /// </summary>
        /// <returns>false at the first cell that differs; true if none differ.</returns>
        private static bool CheckResults(Vector3[,] lhs, Vector3[,] rhs)
        {
            // The bounds come from the textureSize constant, not from the arrays themselves.
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                    if(!lhs[x, y].Equals(rhs[x, y]))
                        return false;
            return true;
        }

        /// <summary>
        /// Finds the entry of <paramref name="pointCloud"/> with the smallest
        /// DistanceToPoint value to (x, y, 0) and stores it in result[x, y].
        /// </summary>
        /// <param name="pointCloud">Candidate points; indices 0 to numPoints - 1 are read.</param>
        /// <param name="result">Grid that receives the chosen point at [x, y].</param>
        /// <param name="x">First grid index, also used as the target's x coordinate.</param>
        /// <param name="y">Second grid index, also used as the target's y coordinate.</param>
        private static void Computation(Vector3[] pointCloud, Vector3[,] result, int x, int y)
        {
            var targetPos = new Vector3(x, y, 0);
            // Start with entry 0 as the best candidate, then scan the rest.
            var nearestV3 = pointCloud[0];
            var nearestV3Distance = nearestV3.DistanceToPoint(targetPos);

            // Loop bound is the numPoints constant rather than pointCloud.Length.
            for(int i = 1; i < numPoints; i++)
            {
                var currentV3 = pointCloud[i];
                var currentV3Distance = currentV3.DistanceToPoint(targetPos);
                // Strict '<': on equal distances the earlier index is kept.
                if(currentV3Distance < nearestV3Distance)
                {
                    nearestV3 = currentV3;
                    nearestV3Distance = currentV3Distance;
                }
            }
            result[x, y] = nearestV3;
        }

        /// <summary>Three-component float vector with public mutable fields.</summary>
        struct Vector3
        {
            public float x;
            public float y;
            public float z;

            /// <summary>Creates a vector from explicit components.</summary>
            public Vector3(float x, float y, float z)
            {
                this.x = x;
                this.y = y;
                this.z = z;
            }

            /// <summary>
            /// Creates a vector whose components are each an independent
            /// rnd.NextDouble() value, cast to float and multiplied by
            /// <paramref name="randomDistance"/>.
            /// </summary>
            public Vector3(float randomDistance)
            {
                this.x = (float)rnd.NextDouble() * randomDistance;
                this.y = (float)rnd.NextDouble() * randomDistance;
                this.z = (float)rnd.NextDouble() * randomDistance;
            }

            /// <summary>Component-wise difference a - b, returned as a new vector.</summary>
            public static Vector3 operator -(Vector3 a, Vector3 b)
            {
                return new Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
            }

            /// <summary>Sum of the squared components (squared length).</summary>
            public float sqrMagnitude()
            {
                return x * x + y * y + z * z;
            }

            /// <summary>
            /// Squared length of (this - point). Despite the name, no square root
            /// is taken, so the value is a squared distance.
            /// </summary>
            public float DistanceToPoint(Vector3 point)
            {
                return (this - point).sqrMagnitude();
            }
        }
    }
}

更新:

根据 Feng Yuan 指出的 x64 JIT 未内联的方法,您可以将程序更改为内联计算,并从 x64 版本获得比 x86 版本更好的性能。这显然很糟糕,但这是我以前见过 x64 JIT 破坏的那种东西。这是新数字:

内联 x64 之后:

For Loop (Inline): 19032ms
For Loop: 19209ms
Parallel.For Loop: 3015ms

代码的内联版本:

using System;
using System.Diagnostics;
using System.Threading.Tasks;

// Inlined benchmark from the answer's update: each timed variant lives in its
// own method, the vector difference uses Vector3's operator -, and the squared
// length is written out by hand at every call site instead of going through a
// helper method.
namespace TextureFromPoints
{
    class RevisedProgram
    {
        // Number of random points generated for each pass of the benchmark.
        const int numPoints = 700;
        // Width and height of every result grid; also passed to the Vector3(float)
        // constructor as the scale of the random coordinates.
        const int textureSize = 1024;

        // Shared random source, read by the Vector3(float) constructor below.
        static Random rnd = new Random();

        /// <summary>
        /// Runs the benchmark in an endless loop. Each pass builds a fresh random
        /// point cloud, runs Inline, NotInline and Parallelized on separate result
        /// grids, prints the three elapsed times, reports whether the grids are
        /// equal, and then waits for a line of console input.
        /// </summary>
        /// <param name="args">Command-line arguments; not read.</param>
        static void Main(string[] args)
        {
            // No exit condition: every pass ends by blocking on Console.ReadLine().
            while(true)
            {
                Console.WriteLine("Starting REVISED");
                Console.WriteLine();

                var pointCloud = new Vector3[numPoints];

                // Each entry gets three random components scaled by textureSize.
                for(int i = 0; i < numPoints; i++)
                    pointCloud[i] = new Vector3(textureSize);

                // One output grid per variant so the results can be compared afterwards.
                var result1 = new Vector3[textureSize, textureSize];
                var result2 = new Vector3[textureSize, textureSize];
                var result3 = new Vector3[textureSize, textureSize];

                // Each helper returns the stopped Stopwatch that timed its own work.
                var sw1 = Inline(pointCloud, result1);

                var sw2 = NotInline(pointCloud, result2);


                var sw3 = Parallelized(pointCloud, result3);

                Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints);
                Console.WriteLine("{0}: {1}ms", "For Loop (Inline)", sw1.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "For Loop", sw2.ElapsedMilliseconds);
                Console.WriteLine("{0}: {1}ms", "Parallel.For Loop", sw3.ElapsedMilliseconds);
                Console.WriteLine();
                Console.Write("Verifying Data: ");
                // "Valid" only when variants 2 and 3 both match variant 1 cell for cell.
                Console.WriteLine(CheckResults(result1, result2) && CheckResults(result1, result3) ? "Valid" : "Error");
                Console.WriteLine();
                Console.WriteLine();
                Console.ReadLine();
            }
        }

        /// <summary>
        /// Fills <paramref name="result3"/> by running the outer x loop through
        /// Parallel.For and calling <see cref="Computation"/> for every cell.
        /// </summary>
        /// <returns>The stopped Stopwatch that timed the Parallel.For call.</returns>
        private static Stopwatch Parallelized(Vector3[] pointCloud, Vector3[,] result3)
        {
            var sw3 = Stopwatch.StartNew();

            // The lambda captures the two parameters; each x iteration writes only
            // the cells result3[x, *].
            Parallel.For(0, textureSize, x =>
            {
                for(int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result3, x, y);
            });
            sw3.Stop();
            return sw3;
        }

        /// <summary>
        /// Fills <paramref name="result2"/> with sequential nested loops that call
        /// <see cref="Computation"/> once per cell.
        /// </summary>
        /// <returns>The stopped Stopwatch that timed the loops.</returns>
        private static Stopwatch NotInline(Vector3[] pointCloud, Vector3[,] result2)
        {
            var sw2 = Stopwatch.StartNew();
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                    Computation(pointCloud, result2, x, y);
            sw2.Stop();
            return sw2;
        }

        /// <summary>
        /// Fills <paramref name="result1"/> with sequential nested loops whose body
        /// is the nearest-point search written out in place rather than a call to
        /// <see cref="Computation"/>; the statements are the same.
        /// </summary>
        /// <returns>The stopped Stopwatch that timed the loops.</returns>
        private static Stopwatch Inline(Vector3[] pointCloud, Vector3[,] result1)
        {
            var sw1 = Stopwatch.StartNew();
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                {
                    var targetPos = new Vector3(x, y, 0);
                    var nearestV3 = pointCloud[0];
                    // Difference via operator -, squared length expanded by hand.
                    Vector3 temp1 = nearestV3 - targetPos;
                    var nearestV3Distance = temp1.x * temp1.x + temp1.y * temp1.y + temp1.z * temp1.z;

                    for(int i = 1; i < numPoints; i++)
                    {
                        var currentV3 = pointCloud[i];
                        Vector3 temp2 = currentV3 - targetPos;
                        var currentV3Distance = temp2.x * temp2.x + temp2.y * temp2.y + temp2.z * temp2.z;
                        // Strict '<': on equal distances the earlier index is kept.
                        if(currentV3Distance < nearestV3Distance)
                        {
                            nearestV3 = currentV3;
                            nearestV3Distance = currentV3Distance;
                        }
                    }
                    result1[x, y] = nearestV3;
                }
            sw1.Stop();
            return sw1;
        }

        /// <summary>
        /// Compares two grids cell by cell over the range
        /// [0, textureSize) x [0, textureSize) using Vector3.Equals.
        /// </summary>
        /// <returns>false at the first cell that differs; true if none differ.</returns>
        private static bool CheckResults(Vector3[,] lhs, Vector3[,] rhs)
        {
            // The bounds come from the textureSize constant, not from the arrays themselves.
            for(int x = 0; x < textureSize; x++)
                for(int y = 0; y < textureSize; y++)
                    if(!lhs[x, y].Equals(rhs[x, y]))
                        return false;
            return true;
        }

        /// <summary>
        /// Finds the entry of <paramref name="pointCloud"/> with the smallest squared
        /// distance to (x, y, 0) and stores it in result[x, y].
        /// </summary>
        /// <param name="pointCloud">Candidate points; indices 0 to numPoints - 1 are read.</param>
        /// <param name="result">Grid that receives the chosen point at [x, y].</param>
        /// <param name="x">First grid index, also used as the target's x coordinate.</param>
        /// <param name="y">Second grid index, also used as the target's y coordinate.</param>
        private static void Computation(Vector3[] pointCloud, Vector3[,] result, int x, int y)
        {
            var targetPos = new Vector3(x, y, 0);
            // Start with entry 0 as the best candidate, then scan the rest.
            var nearestV3 = pointCloud[0];
            // Difference via operator -, squared length expanded by hand.
            Vector3 temp1 = nearestV3 - targetPos;
            var nearestV3Distance = temp1.x * temp1.x + temp1.y * temp1.y + temp1.z * temp1.z;

            // Loop bound is the numPoints constant rather than pointCloud.Length.
            for(int i = 1; i < numPoints; i++)
            {
                var currentV3 = pointCloud[i];
                Vector3 temp2 = currentV3 - targetPos;
                var currentV3Distance = temp2.x * temp2.x + temp2.y * temp2.y + temp2.z * temp2.z;
                // Strict '<': on equal distances the earlier index is kept.
                if(currentV3Distance < nearestV3Distance)
                {
                    nearestV3 = currentV3;
                    nearestV3Distance = currentV3Distance;
                }
            }
            result[x, y] = nearestV3;
        }

        /// <summary>Three-component float vector with public mutable fields.</summary>
        struct Vector3
        {
            public float x;
            public float y;
            public float z;

            /// <summary>Creates a vector from explicit components.</summary>
            public Vector3(float x, float y, float z)
            {
                this.x = x;
                this.y = y;
                this.z = z;
            }

            /// <summary>
            /// Creates a vector whose components are each an independent
            /// rnd.NextDouble() value, cast to float and multiplied by
            /// <paramref name="randomDistance"/>.
            /// </summary>
            public Vector3(float randomDistance)
            {
                this.x = (float)rnd.NextDouble() * randomDistance;
                this.y = (float)rnd.NextDouble() * randomDistance;
                this.z = (float)rnd.NextDouble() * randomDistance;
            }

            /// <summary>Component-wise difference a - b, returned as a new vector.</summary>
            public static Vector3 operator -(Vector3 a, Vector3 b)
            {
                return new Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
            }
        }
    }
}

关于c# - 在双核上使用 Parallel.For... 实现 5 倍性能?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/11572635/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com