- html - 出于某种原因,IE8 对我的 Sass 文件中继承的 html5 CSS 不友好?
- JMeter 在响应断言中使用 span 标签的问题
- html - 在 :hover and :active? 上具有不同效果的 CSS 动画
- html - 相对于居中的 html 内容固定的 CSS 重复背景?
我一直在尝试使用 Alea GPU 在 F# 中编写并行 Floyd-Warshall 算法,并以另一个用户在此处提供的 CUDA 代码为基础
The Floyd-Warshall algorithm in CUDA
我写了以下简单的实现
type FWModule<'T>(target:GPUModuleTarget, tileDim:int) =
    inherit GPUModule(target)

    /// Kernel: one relaxation step of Floyd-Warshall for a fixed pivot k.
    /// Each block handles one matrix row; threads cover the columns in tiles
    /// of tileDim. `data` is the flattened width*width distance matrix,
    /// updated in place.
    [<Kernel;ReflectedDefinition>]
    member this.FloydWKernel (width:int) (k:int) (data:deviceptr<float>) =
        let col = blockIdx.x * blockDim.x + threadIdx.x
        let row = blockIdx.y
        // d(row, k) is the same for every thread of this row: load it once
        // into shared memory instead of once per thread.
        let best = __shared__.Variable<float>()
        if threadIdx.x = 0 then best := data.[width*row+k]
        __syncthreads()
        // BUG FIX: the original wrote `if col >= width then ()` as a guard,
        // but in F# that expression is a no-op and execution falls through,
        // so out-of-bounds threads still read and wrote past the matrix.
        // The real guard goes here, after the barrier, so that every thread
        // of the block reaches __syncthreads().
        if col < width then
            let index = width * row + col
            let candidate = !best + data.[k*width+col]
            data.[index] <- min data.[index] candidate

    /// Launch configuration: a 1-D block of tileDim threads,
    /// and a grid of (ceil(width/tileDim), width) blocks.
    member this.LaunchParams width =
        let blockdim = dim3(tileDim)
        let griddim = dim3(divup width tileDim, width)
        LaunchParam(griddim, blockdim)

    /// Launches one pivot step on the device.
    member this.FloydW (width:int) (k:int) (data:deviceptr<float>) =
        let lp = this.LaunchParams width
        // BUG FIX: the original launched with undefined `idata`/`odata`;
        // the kernel takes a single pointer and updates it in place.
        this.GPULaunch <@ this.FloydWKernel @> lp width k data

    /// All-pairs shortest paths on a flattened size*size matrix A.
    /// Copies A to the device, runs one kernel launch per pivot
    /// (the pivot loop is inherently sequential), and gathers the result.
    member this.FW(size:int, A:float[]) =
        use deviceArr = this.GPUWorker.Malloc(A)
        for k in 0 .. size-1 do
            // BUG FIX: the original passed the pointer twice, which does not
            // match FloydW's single-pointer signature.
            this.FloydW size k deviceArr.Ptr
        deviceArr.Gather()

let tileDim = 256
let apsp = new FWModule<float>(GPUModuleTarget.DefaultWorker, tileDim)
在 fsi 中运行以下代码时:
let m = [|0.0 ; 5.0 ; 9.0 ; infinity;
infinity; 0.0 ; 1.0 ; infinity;
infinity; infinity; 0.0 ; 2.0;
infinity; 3.0 ; infinity; 0.0|];;
apsp.FW (4,m);;
[|0.0; 5.0; 6.0; 8.0;
4.0; 0.0; 1.0; 3.0;
3.0; 3.0; 0.0; 1.0;
1.0; 1.0; 1.0; 0.0|]
作为对照,下面是 CPU 上的 floydwarshall 参考实现:
/// Reference CPU implementation of Floyd-Warshall on a flattened l*l
/// distance matrix. Returns a fresh array; `mat` itself is not modified.
let floydwarshall (l:int, mat:float[]) =
    let dist = Array.copy mat
    // Flat index of entry (r, c) in the row-major l*l matrix.
    let idx r c = r * l + c
    for pivot in 0 .. l - 1 do
        for row in 0 .. l - 1 do
            for col in 0 .. l - 1 do
                // Relax row->col through the pivot vertex.
                let viaPivot = dist.[idx row pivot] + dist.[idx pivot col]
                dist.[idx row col] <- min dist.[idx row col] viaPivot
    dist
floydwarshall (4,m);;
[|0.0 ; 5.0; 6.0; 8.0;
infinity; 0.0; 1.0; 3.0;
infinity; 5.0; 0.0; 2.0;
infinity; 3.0; 4.0; 0.0|]
最佳答案
这是我们示例库中的一些源代码片段,您可以在 Alea GPU 示例库中找到 http://www.quantalea.com/gallery .
这是单阶段算法。它不是最快的,但相当容易理解。
public static class FloydWarshallSingleStage
{
    // Side length of a (square) thread block tiling the n x n matrix.
    const int BlockWidth = 16;

    /// <summary>
    /// Kernel for the parallel Floyd-Warshall algorithm on the GPU:
    /// relaxes every path [v1, v2] through the pivot vertex u.
    /// One thread handles one (v1, v2) pair; the vertex count n is taken
    /// from the first dimension of d.
    /// </summary>
    /// <param name="u">Pivot vertex through which paths [v1, v2] are relaxed</param>
    /// <param name="d">Matrix of shortest paths d(G), updated in place</param>
    /// <param name="p">Matrix of predecessors p(G), updated in place</param>
    public static void KernelSingleStage(int u, int[,] d, int[,] p)
    {
        var n = d.GetLength(0);
        var v1 = blockDim.y * blockIdx.y + threadIdx.y;
        var v2 = blockDim.x * blockIdx.x + threadIdx.x;
        // Guard against the partial blocks at the matrix border.
        if (v1 < n && v2 < n)
        {
            var newPath = d[v1, u] + d[u, v2];
            var oldPath = d[v1, v2];
            if (oldPath > newPath)
            {
                d[v1, v2] = newPath;
                // The predecessor of v2 on the new path is its predecessor
                // on the pivot's path u -> v2.
                p[v1, v2] = p[u, v2];
            }
        }
    }

    /// <summary>
    /// Runs the single-stage Floyd-Warshall algorithm: one kernel launch per
    /// pivot vertex. The pivot loop is sequential because each pivot step
    /// depends on the result of the previous one.
    /// </summary>
    /// <param name="gpu">The GPU on which the kernels should run</param>
    /// <param name="d">Matrix of shortest paths d(G), updated in place</param>
    /// <param name="p">Matrix of predecessors p(G), updated in place</param>
    [GpuManaged]
    public static void Run(Gpu gpu, int[,] d, int[,] p)
    {
        var n = d.GetLength(0);
        // Ceiling division: enough BlockWidth x BlockWidth blocks to cover n x n.
        var gridDim = new dim3((n - 1) / BlockWidth + 1, (n - 1) / BlockWidth + 1, 1);
        var blockDim = new dim3(BlockWidth, BlockWidth, 1);
        var lp = new LaunchParam(gridDim, blockDim);
        for (var u = 0; u < n; u++)
        {
            gpu.Launch(KernelSingleStage, lp, u, d, p);
        }
    }
}
public class FloydWarshallMultiStage
{
    // Marker for "no predecessor" in the predecessor matrix p(G).
    private const int None = -1;
    // Large finite value used as "infinity" (0x3F3F3F3F) so that
    // Inf + Inf still fits in an int without overflowing.
    private const int Inf = 1061109567;

    //[GpuParam]
    //private readonly Constant<int> BlockSize;
    //[GpuParam]
    //private readonly Constant<int> ThreadSize;
    //[GpuParam]
    //private readonly Constant<int> VirtualBlockSize;

    private const int BlockSize = 16;
    private const int ThreadSize = 2;
    // Side length of the matrix tile one thread block is responsible for.
    private const int VirtualBlockSize = BlockSize*ThreadSize;

    // NOTE(review): the constructor parameters are currently ignored — the
    // sizes are compile-time constants above; the runtime Constant<int>
    // variant is commented out.
    public FloydWarshallMultiStage(int blockSize, int threadSize)
    {
        //BlockSize = new Constant<int>(blockSize);
        //ThreadSize = new Constant<int>(threadSize);
        //VirtualBlockSize = new Constant<int>(blockSize * threadSize);
    }

    /// <summary>
    /// Kernel for parallel Floyd Warshall algorithm on GPU computing independent blocks.
    /// Phase one: relaxes the diagonal (primary) tile of the current round
    /// entirely in shared memory.
    /// </summary>
    /// <param name="block">Number block of which is performed relaxation paths [v1, v2]</param>
    /// <param name="n">Number of vertices in the graph G:=(V,E), n := |V(G)|</param>
    /// <param name="pitch">Width to get to next row in number of int (unused in this kernel — TODO confirm caller contract)</param>
    /// <param name="d">Matrix of shortest paths d(G)</param>
    /// <param name="p">Matrix of predecessors p(G)</param>
    public void KernelPhaseOne(int block, int n, int pitch, int[,] d, int[,] p)
    {
        var newPred = 0;
        var tx = threadIdx.x;
        var ty = threadIdx.y;
        // (v1, v2) addresses this thread's element inside the diagonal tile.
        var v1 = VirtualBlockSize*block + ty;
        var v2 = VirtualBlockSize*block + tx;
        var primaryD = __shared__.Array2D<int>(VirtualBlockSize, VirtualBlockSize);
        var primaryP = __shared__.Array2D<int>(VirtualBlockSize, VirtualBlockSize);
        // Stage the tile in shared memory; out-of-range entries are padded
        // with Inf/None so the relaxation loop needs no bounds checks.
        if (v1 < n && v2 < n)
        {
            primaryD[ty, tx] = d[v1, v2];
            primaryP[ty, tx] = p[v1, v2];
            newPred = primaryP[ty, tx];
        }
        else
        {
            primaryD[ty, tx] = Inf;
            primaryP[ty, tx] = None;
        }
        DeviceFunction.SyncThreads();
        // In-tile Floyd-Warshall: every vertex of the tile serves as pivot.
        for (var i = 0; i < VirtualBlockSize; i++)
        {
            var newPath = primaryD[ty, i] + primaryD[i, tx];
            // Barriers keep reads of the previous iteration's values and
            // writes of this iteration's values from racing.
            DeviceFunction.SyncThreads();
            if (newPath < primaryD[ty, tx])
            {
                primaryD[ty, tx] = newPath;
                newPred = primaryP[i, tx];
            }
            DeviceFunction.SyncThreads();
            primaryP[ty, tx] = newPred;
        }
        // Write the relaxed tile back to global memory.
        if (v1 < n && v2 < n)
        {
            d[v1, v2] = primaryD[ty, tx];
            p[v1, v2] = primaryP[ty, tx];
        }
    }

    /// <summary>
    /// Kernel for parallel Floyd Warshall algorithm on GPU to compute block depending on a single independent block.
    /// Phase two: relaxes the tiles in the same row (blockIdx.y == 0) and the
    /// same column (blockIdx.y == 1) as the diagonal tile of this round.
    /// </summary>
    /// <param name="block">Number block of which is performed relaxation paths [v1, v2]</param>
    /// <param name="n">Number of vertices in the graph G:=(V,E), n := |V(G)|</param>
    /// <param name="pitch">Unused in this kernel — TODO confirm caller contract</param>
    /// <param name="d">Matrix of shortest paths d(G)</param>
    /// <param name="p">Matrix of predecessors p(G)</param>
    public void KernelPhaseTwo(int block, int n, int pitch, int[,] d, int[,] p)
    {
        // The diagonal tile itself was already handled in phase one.
        if (blockIdx.x == block) return;
        var newPath = 0;
        var newPred = 0;
        var tx = threadIdx.x;
        var ty = threadIdx.y;
        // First address the diagonal (primary) tile of this round.
        var v1 = VirtualBlockSize*block + ty;
        var v2 = VirtualBlockSize*block + tx;
        var primaryD = __shared__.Array2D<int>(VirtualBlockSize, VirtualBlockSize);
        var currentD = __shared__.Array2D<int>(VirtualBlockSize, VirtualBlockSize);
        var primaryP = __shared__.Array2D<int>(VirtualBlockSize, VirtualBlockSize);
        var currentP = __shared__.Array2D<int>(VirtualBlockSize, VirtualBlockSize);
        if (v1 < n && v2 < n)
        {
            primaryD[ty, tx] = d[v1, v2];
            primaryP[ty, tx] = p[v1, v2];
        }
        else
        {
            primaryD[ty, tx] = Inf;
            primaryP[ty, tx] = None;
        }
        // load i-aligned singly dependent blocks
        if (blockIdx.y == 0)
        {
            v1 = VirtualBlockSize*block + ty;
            v2 = VirtualBlockSize*blockIdx.x + tx;
        }
        // load j-aligned singly dependent blocks
        else
        {
            v1 = VirtualBlockSize*blockIdx.x + ty;
            v2 = VirtualBlockSize*block + tx;
        }
        if (v1 < n && v2 < n)
        {
            currentD[ty, tx] = d[v1, v2];
            currentP[ty, tx] = p[v1, v2];
            newPred = currentP[ty, tx];
        }
        else
        {
            currentD[ty, tx] = Inf;
            currentP[ty, tx] = None;
        }
        DeviceFunction.SyncThreads();
        // The branch below is uniform per block (blockIdx.y is the same for
        // every thread of a block), so the barriers inside are safe.
        // compute i-aligned singly dependent blocks
        if (blockIdx.y == 0)
        {
            for (var i = 0; i < VirtualBlockSize; i++)
            {
                newPath = primaryD[ty, i] + currentD[i, tx];
                DeviceFunction.SyncThreads();
                if (newPath < currentD[ty, tx])
                {
                    currentD[ty, tx] = newPath;
                    newPred = currentP[i, tx];
                }
                DeviceFunction.SyncThreads();
                currentP[ty, tx] = newPred;
            }
        }
        // compute j-aligned singly dependent blocks
        else
        {
            for (var i = 0; i < VirtualBlockSize; i++)
            {
                newPath = currentD[ty, i] + primaryD[i, tx];
                DeviceFunction.SyncThreads();
                if (newPath < currentD[ty, tx])
                {
                    currentD[ty, tx] = newPath;
                    currentP[ty, tx] = primaryP[i, tx];
                }
                DeviceFunction.SyncThreads();
            }
        }
        // Write the relaxed row/column tile back to global memory.
        if (v1 < n && v2 < n)
        {
            d[v1, v2] = currentD[ty, tx];
            p[v1, v2] = currentP[ty, tx];
        }
    }

    /// <summary>
    /// Kernel for parallel Floyd Warshall algorithm on GPU to compute dependent block depending on the singly dependent blocks.
    /// Phase three: relaxes all remaining tiles using the already-relaxed
    /// row tile (primaryRow*) and column tile (primaryCol*) from phase two.
    /// Each thread processes a ThreadSize x ThreadSize sub-tile.
    /// </summary>
    /// <param name="block">Number block of which is performed relaxation paths [v1, v2]</param>
    /// <param name="n">Number of vertices in the graph G:=(V,E), n := |V(G)|</param>
    /// <param name="pitch">Unused in this kernel — TODO confirm caller contract</param>
    /// <param name="d">Matrix of shortest paths d(G)</param>
    /// <param name="p">Matrix of predecessors p(G)</param>
    public void KernelPhaseThree(int block, int n, int pitch, int[,] d, int[,] p)
    {
        // Tiles in the pivot row or pivot column were handled in phases one/two.
        if (blockIdx.x == block || blockIdx.y == block) return;
        var tx = threadIdx.x*ThreadSize;
        var ty = threadIdx.y*ThreadSize;
        // Top-left corner of this thread's sub-tile in the full matrix.
        var v1 = blockDim.y*blockIdx.y*ThreadSize + ty;
        var v2 = blockDim.x*blockIdx.x*ThreadSize + tx;
        var primaryRowD = __shared__.Array2D<int>(BlockSize*ThreadSize, BlockSize*ThreadSize);
        var primaryColD = __shared__.Array2D<int>(BlockSize*ThreadSize, BlockSize*ThreadSize);
        var primaryRowP = __shared__.Array2D<int>(BlockSize*ThreadSize, BlockSize*ThreadSize);
        var v1Row = BlockSize*block*ThreadSize + ty;
        var v2Col = BlockSize*block*ThreadSize + tx;
        // load data for virtual block
        for (var i = 0; i < ThreadSize; i++)
        {
            for (var j = 0; j < ThreadSize; j++)
            {
                var idx = tx + j;
                var idy = ty + i;
                // Row tile: (pivot row, this block's columns).
                if (v1Row + i < n && v2 + j < n)
                {
                    primaryRowD[idy, idx] = d[v1Row + i, v2 + j];
                    primaryRowP[idy, idx] = p[v1Row + i, v2 + j];
                }
                else
                {
                    primaryRowD[idy, idx] = Inf;
                    primaryRowP[idy, idx] = None;
                }
                // Column tile: (this block's rows, pivot columns).
                if (v1 + i < n && v2Col + j < n)
                {
                    primaryColD[idy, idx] = d[v1 + i, v2Col + j];
                }
                else
                {
                    primaryColD[idy, idx] = Inf;
                }
            }
        }
        DeviceFunction.SyncThreads();
        // compute data for virtual block
        for (var i = 0; i < ThreadSize; i++)
        {
            for (var j = 0; j < ThreadSize; j++)
            {
                if (v1 + i < n && v2 + j < n)
                {
                    var path = d[v1 + i, v2 + j];
                    var predecessor = p[v1 + i, v2 + j];
                    var idy = ty + i;
                    var idx = tx + j;
                    // Relax through every pivot vertex of this round's tile.
                    for (var k = 0; k < BlockSize*ThreadSize; k++)
                    {
                        var newPath = primaryColD[idy, k] + primaryRowD[k, idx];
                        if (path > newPath)
                        {
                            path = newPath;
                            predecessor = primaryRowP[k, idx];
                        }
                    }
                    d[v1 + i, v2 + j] = path;
                    p[v1 + i, v2 + j] = predecessor;
                }
            }
        }
    }

    /// <summary>
    /// Parallel multi-stage Floyd Warshall algorithm on GPU.
    /// Runs the three phases once per tile on the diagonal; the outer loop is
    /// sequential because each round depends on the previous one.
    /// </summary>
    /// <param name="gpu">The GPU on which the kernels should run</param>
    /// <param name="d">Matrix of shortest paths d(G), updated in place</param>
    /// <param name="p">Matrix of predecessors p(G), updated in place</param>
    /// <param name="verbose">If true, prints the launch configuration of each phase</param>
    public void Run(Gpu gpu, int[,] d, int[,] p, bool verbose = false)
    {
        var n = d.GetLength(0);
        // Phase 1: one block (the diagonal tile).
        var gridDim1 = new dim3(1, 1, 1);
        // Phase 2: one row of tiles and one column of tiles (y = 0 / y = 1).
        var gridDim2 = new dim3((n - 1)/VirtualBlockSize + 1, 2, 1);
        // Phase 3: every tile of the matrix.
        var gridDim3 = new dim3((n - 1)/VirtualBlockSize + 1, (n - 1)/VirtualBlockSize + 1, 1);
        var blockDim1 = new dim3(VirtualBlockSize, VirtualBlockSize, 1);
        var blockDim2 = new dim3(VirtualBlockSize, VirtualBlockSize, 1);
        var blockDim3 = new dim3(BlockSize, BlockSize, 1);
        var numOfBlock = (n - 1)/VirtualBlockSize + 1;
        var pitchInt = n;
        if (verbose)
        {
            Console.WriteLine($"|V| {n}");
            Console.WriteLine($"Phase 1: grid dim {gridDim1} block dim {blockDim1}");
            Console.WriteLine($"Phase 2: grid dim {gridDim2} block dim {blockDim2}");
            Console.WriteLine($"Phase 3: grid dim {gridDim3} block dim {blockDim3}");
        }
        for (var block = 0; block < numOfBlock; block++)
        {
            gpu.Launch(KernelPhaseOne, new LaunchParam(gridDim1, blockDim1), block, n, pitchInt, d, p);
            gpu.Launch(KernelPhaseTwo, new LaunchParam(gridDim2, blockDim2), block, n, pitchInt, d, p);
            gpu.Launch(KernelPhaseThree, new LaunchParam(gridDim3, blockDim3), block, n, pitchInt, d, p);
        }
    }
}
关于f# - Floyd Warshall 使用 Alea GPU,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/36482098/
一 问题描述 求节点0到节点2的最短路径。 二 代码 package graph.floyd; import java.util.Scanner; public class Floyd {
一 背景介绍 如果求解任意两个节点之间的最短路径,则需要以每个节点为源点,重复调用 n 次 DijKstra 算法。其实是完全没有必要这么麻烦,Floyd 算法可用于求解任意两个节点之间的最短距离。F
我已经为一个 100 x 100 的邻接矩阵编写了代码,它表示以下有向图: 我正在尝试使用 Floyd-Warshall 算法为图中所有蓝色节点对找到最短路径。你如何只找到所选节点的所有对最短路径?这
我收到一个关于 Floyd's cycle-finding algorithm 的面试问题: Floyd 的循环查找算法何时会失败? 我的意思是,是否有规则可以找到快指针和慢指针之间的步长? 最佳答案
我正在尝试使用 java.awt.image.BufferedImage 在 Java 中实现 Floyd Steinberg 算法。 我使用了描述的算法 here使用自定义调色板,我希望获得与维基百
我正在尝试实现 Floyd-Warshall 算法(所有对最短路径)。在下面的代码中,当我输入一些数字时,它会给出最后一个数字作为输入。我知道代码不完整。 现在我应该怎么做才能为每个 i 和 j 打印
我试图在 .NET 的 C++ 上找到这个算法,但找不到,我找到了这个: // Best solution function boolean hasLoop(Node startNode){ No
我想在这个图问题中重建从源到目标顶点的路径。 如何存储路径,以及在找到从 s 到 d 的最小成本后如何检索它? 请帮我找到一个简单的答案? 例如在这一点上, adjmat[i][j] = Math.m
是否可以使用邻接表对 Floyd Warshall 进行编码?我必须处理文本文件中的一百万个顶点,因此邻接矩阵不是解决方案。任何实现已经可用?请帮忙。 最佳答案 您不能将 Floyd Warshall
我实现了 Floyd-Warshall 算法。根据他们的矩阵,我可以得到正确的结果,关于两个地方之间的最短路径和他们的距离。我的问题是如何打印从 i 到 j 的最短距离。我做了一些研究,发现了一个类似
这可能是一个糟糕的问题,因为我的代表很低,但我已经研究了几个小时的其他解决方案,我的代码似乎与我遇到的工作解决方案几乎相同。请不要忽略基于低代表的问题。 输出矩阵 d[][] 包含给定顶点对之间最短路
Mr. Rowan plans to make a walking tour of Paris. However, since he is a little lazy, he wants to tak
使用 Floyd 时如何使用我电脑上的数据文件? 运行后: floyd init floyd run --gpu --env tensorflow-1.3 "python model.py" 我得到的
假设我有 9 个顶点。所以我有 9x9 解决方案矩阵和 matrix[6,0] = infinity, matrix[6,9]=1, matrix[9,0]=1 现在算法的工作原理如下: for k
This维基百科页面解释了 Floyd Warshall 算法,用于查找图中节点之间的最短路径。维基百科页面使用图像左侧的图表 作为起始图(在 k = 0 时的第一次迭代之前),然后显示剩余的迭代(k
考虑以下链表: 1->2->3->4->5->6->7->8->9->4->...->9->4..... 上面的列表有一个循环如下: [4->5->6->7->8->9->4] 在白板上绘制链表,我尝
我知道当图中有负权重环时,没有找到最小距离的方法,也就没有最小距离的意义了。我的问题是,如果我们向 Floyd Warshall 算法提供具有负权重循环的图,会发生什么情况?它会在 O(n3) 内无限
有人可以告诉我这个过程在 for 迭代中的时间复杂度吗?这段代码是FloydWarshall算法的“重构路径”部分。prev[n][n]是最短路径中源节点和目的节点之间的节点矩阵。printAllSP
问题陈述:https://www.hackerrank.com/challenges/floyd-city-of-blinding-lights 代码: import scala.io.StdIn._
我能够理解 Floyd 循环查找算法工作原理的基本原理。我唯一无法理解的是 while 循环条件,如下所示: while(slow && fast && fast->next){ sl
我是一名优秀的程序员,十分优秀!