c# - 如何使用多线程C#实现Eratosthenes筛网？-6ren

c# - 如何使用多线程C#实现Eratosthenes筛网？

转载作者：行者123 更新时间：2023-11-30 13:11:38

我正在尝试使用Mutithreading实现Eratosthenes筛网。这是我的实现:

using System;
using System.Collections.Generic;
using System.Threading;

namespace Sieve_Of_Eratosthenes 
{
    class Controller 
        {
        public static int upperLimit = 1000000000;
        public static bool[] primeArray = new bool[upperLimit];

        static void Main(string[] args) 
        {
        DateTime startTime = DateTime.Now;

        Initialize initial1 = new Initialize(0, 249999999);
        Initialize initial2 = new Initialize(250000000, 499999999);
        Initialize initial3 = new Initialize(500000000, 749999999);
        Initialize initial4 = new Initialize(750000000, 999999999);

        initial1.thread.Join();
        initial2.thread.Join();
        initial3.thread.Join();
        initial4.thread.Join();

        int sqrtLimit = (int)Math.Sqrt(upperLimit);

        Sieve sieve1 = new Sieve(249999999);
        Sieve sieve2 = new Sieve(499999999);
        Sieve sieve3 = new Sieve(749999999);
        Sieve sieve4 = new Sieve(999999999);

        for (int i = 3; i < sqrtLimit; i += 2) 
            {
            if (primeArray[i] == true) 
                {
                int squareI = i * i;

                    if (squareI <= 249999999) 
                    {
                sieve1.set(i);
                sieve2.set(i);
                sieve3.set(i);
                sieve4.set(i);
                sieve1.thread.Join();
                sieve2.thread.Join();
                sieve3.thread.Join();
                sieve4.thread.Join();
            } 
                    else if (squareI > 249999999 & squareI <= 499999999) 
                    {
                sieve2.set(i);
                sieve3.set(i);
                sieve4.set(i);
                sieve2.thread.Join();
                sieve3.thread.Join();
                sieve4.thread.Join();
            } 
                    else if (squareI > 499999999 & squareI <= 749999999) 
                    {
                sieve3.set(i);
                sieve4.set(i);
                sieve3.thread.Join();
                sieve4.thread.Join();
            } 
                    else if (squareI > 749999999 & squareI <= 999999999) 
                    {
                sieve4.set(i);
                sieve4.thread.Join();
            }
            }
        }    

        int count = 0;
        primeArray[2] = true;
        for (int i = 2; i < upperLimit; i++) 
            {
            if (primeArray[i]) 
                {
                count++;
            }
        }

        Console.WriteLine("Total: " + count);

        DateTime endTime = DateTime.Now;
        TimeSpan elapsedTime = endTime - startTime;
        Console.WriteLine("Elapsed time: " + elapsedTime.Seconds);
        }

        public class Initialize 
        {
            public Thread thread;
        private int lowerLimit;
        private int upperLimit;

        public Initialize(int lowerLimit, int upperLimit) 
            {
            this.lowerLimit = lowerLimit;
            this.upperLimit = upperLimit;
            thread = new Thread(this.InitializeArray);
            thread.Priority = ThreadPriority.Highest;
            thread.Start();
        }

        private void InitializeArray() 
            {
            for (int i = this.lowerLimit; i <= this.upperLimit; i++) 
                {
                if (i % 2 == 0) 
                    {
                    Controller.primeArray[i] = false;
            } 
                    else 
                    {
                Controller.primeArray[i] = true;
            }
            }
        }
        }

        public class Sieve 
            {
            public Thread thread;
            public int i;
            private int upperLimit;

            public Sieve(int upperLimit) 
                {
                this.upperLimit = upperLimit;
            }

        public void set(int i) 
            {
            this.i = i;
            thread = new Thread(this.primeGen);
            thread.Start();
        }

        public void primeGen() 
            {
            for (int j = this.i * this.i; j <= this.upperLimit; j += i) 
                {
                Controller.primeArray[j] = false;
            }
        }
        }
    }
}

这需要30秒才能产生输出，有什么办法可以加快速度吗？

编辑:
这是TPL的实现:

public LinkedList<int> GetPrimeList(int limit) {
        LinkedList<int> primeList = new LinkedList<int>();
        bool[] primeArray = new bool[limit];

        Console.WriteLine("Initialization started...");

        Parallel.For(0, limit, i => {
            if (i % 2 == 0) {
                primeArray[i] = false;
            } else {
                primeArray[i] = true;
            }
        }
        );
        Console.WriteLine("Initialization finished...");

        /*for (int i = 0; i < limit; i++) {
            if (i % 2 == 0) {
                primeArray[i] = false;
            } else {
                primeArray[i] = true;
            }
        }*/

        int sqrtLimit = (int)Math.Sqrt(limit);
        Console.WriteLine("Operation started...");
        Parallel.For(3, sqrtLimit, i => {
            lock (this) {
                if (primeArray[i]) {
                    for (int j = i * i; j < limit; j += i) {
                        primeArray[j] = false;
                    }

                }
            }
        }
        );
        Console.WriteLine("Operation finished...");
        /*for (int i = 3; i < sqrtLimit; i += 2) {
            if (primeArray[i]) {
                for (int j = i * i; j < limit; j += i) {
                    primeArray[j] = false;
                }
            }
        }*/

        //primeList.AddLast(2);
        int count = 1;
        Console.WriteLine("Counting started...");
        Parallel.For(3, limit, i => {
            lock (this) {
                if (primeArray[i]) {
                    //primeList.AddLast(i);
                    count++;
                }
            }
        }
        );
        Console.WriteLine("Counting finished...");
        Console.WriteLine(count);

        /*for (int i = 3; i < limit; i++) {
            if (primeArray[i]) {
                primeList.AddLast(i);
            }
        }*/

        return primeList;
    }

谢谢你。

最佳答案

编辑:

我对这个问题的答案是:是的，您可以肯定地使用任务并行库(TPL)来查找质数达到十亿的速度。问题中给定的代码很慢，因为它没有有效地使用内存或多处理程序，并且最终输出也没有效率。

因此，不仅是多处理，还可以做很多事情来加快Eratosthenese筛的速度，如下所示:

您将所有数字(偶数和奇数)过筛，这两个数字都占用更多内存
(十亿个字节代表您的十亿个范围)，并且由于速度较慢
进行不必要的处理。仅使用两个是事实
仅偶数素数，因此使数组仅表示奇数素数
一半的内存需求并减少了复合数量
剔除操作数超过两倍，因此该操作
在您的计算机上可能需要20秒钟左右的时间才能准备好
十亿。

如此庞大的复合数之所以被淘汰的部分原因
内存阵列是如此之慢，以至于它大大超出了CPU缓存
大小，以便在某种程度上可以对主内存进行许多访问
随机的方式意味着剔除给定的复合数
表示法可能需要一百多个CPU时钟周期，而如果
它们都在L1缓存中，只需要一个周期，
L2缓存仅大约四个周期；并不是所有的访问都会受到最坏的影响
案例时间，但这无疑会减慢处理速度。使用一点
代表主要候选人的压缩数组将减少使用
内存的八分之一，使最坏情况下的访问减少
常见的。虽然访问会有计算开销
单个位，您会发现时间净增
节省减少平均内存访问时间将大于
这个费用。实现此目的的简单方法是使用BitArray
而不是一堆 bool 。使用编写自己的位访问
移位和“与”操作将比使用
BitArray类。您会发现使用BitArray和
另一个因素是您要对一个进行两次自己的位运算
大约有十或十二秒的线程性能
这个变化。

您发现的素数计数输出不是很有效，因为它
需要数组访问和每个候选素数的if条件。
一旦将筛网缓冲区作为数组的压缩字数组
位，您可以通过计数查找来更有效地执行此操作
表(LUT)消除了if条件，只需要两个
数组按每个打包字访问。这样做，时间到了
与时间相比，计数成为工作的微不足道的部分
剔除合成数字，以便进一步节省下来
大约八秒钟的素数达到十亿。

可以进一步减少已处理的主要候选对象的数量
是应用车轮因式分解的结果，因此删除了
素数2、3和5的因数由处理和
调整位打包的方法也可以增加有效
给定大小的位缓冲区的最大范围是另一个的两倍。
这样可以减少复合数字剔除操作的数量
高达三倍的另一个巨大因素，尽管代价是
具有更高的计算复杂度。

为了进一步减少内存使用，使内存访问甚至
效率更高，并为每页的多处理做准备
段，您可以将工作划分为多个页面
比L1或L2缓存大小大。这就需要一个基地
所有素数到最大值的平方根的素数表
主要候选项，并重新计算的起始地址参数
跨给定页面段使用的每个基本素数，
但这仍然比使用大型剔除阵列更有效。一个
实现此页面分割的另一个好处是
不必事先指定筛分上限，但
而是可以根据需要将基础素数扩展为更高的基数
页面已处理。经过所有的优化，
您可能会产生多达10亿个素数
大约2.5秒。

最后，您可以对页面的多处理进行最后的润色
使用TPL或Threads分割，使用的缓冲区大小约为
L2缓存大小(每个内核)将产生一个额外的增益
双核非超线程(HT)较旧时的两倍
处理器作为英特尔E7500 Core2Duo的执行时间来查找
素数达到1.25秒左右的十亿个。

我已经实现了一个多线程的Eratosthenes筛网作为answer to another thread，以表明Atkin的Sieve筛网比Eratosthenes的筛网没有任何优势。它使用Tasks和TaskFactory中的Task Parallel Library(TPL)，因此至少需要DotNet Framework4。我使用上面讨论的所有优化方法an alternate answer to the same quesion进一步调整了该代码。我在此处重新发布了经过调整的代码，其中添加了注释和易于阅读的格式，如下所示:
using System; using System.Collections; using System.Collections.Generic; using System.Linq; using System.Threading; using System.Threading.Tasks; class UltimatePrimesSoE : IEnumerable<ulong> { #region const and static readonly field's, private struct's and classes //one can get single threaded performance by setting NUMPRCSPCS = 1 static readonly uint NUMPRCSPCS = (uint)Environment.ProcessorCount + 1; //the L1CACHEPOW can not be less than 14 and is usually the two raised to the power of the L1 or L2 cache const int L1CACHEPOW = 14, L1CACHESZ = (1 << L1CACHEPOW), MXPGSZ = L1CACHESZ / 2; //for buffer ushort[] const uint CHNKSZ = 17; //this times BWHLWRDS (below) times two should not be bigger than the L2 cache in bytes //the 2,3,57 factorial wheel increment pattern, (sum) 48 elements long, starting at prime 19 position static readonly byte[] WHLPTRN = { 2,3,1,3,2,1,2,3,3,1,3,2,1,3,2,3,4,2,1,2,1,2,4,3, 2,3,1,2,3,1,3,3,2,1,2,3,1,3,2,1,2,1,5,1,5,1,2,1 }; const uint FSTCP = 11; static readonly byte[] WHLPOS; static readonly byte[] WHLNDX; //look up wheel position from index and vice versa static readonly byte[] WHLRNDUP; //to look up wheel rounded up index positon values, allow for overflow in size static readonly uint WCRC = WHLPTRN.Aggregate(0u, (acc, n) => acc + n); //small wheel circumference for odd numbers static readonly uint WHTS = (uint)WHLPTRN.Length; static readonly uint WPC = WHTS >> 4; //number of wheel candidates static readonly byte[] BWHLPRMS = { 2,3,5,7,11,13,17 }; const uint FSTBP = 19; //big wheel primes, following prime //the big wheel circumference expressed in number of 16 bit words as in a minimum bit buffer size static readonly uint BWHLWRDS = BWHLPRMS.Aggregate(1u, (acc, p) => acc * p) / 2 / WCRC * WHTS / 16; //page size and range as developed from the above static readonly uint PGSZ = MXPGSZ / BWHLWRDS * BWHLWRDS; static readonly uint PGRNG = PGSZ * 16 / WHTS * WCRC; //buffer size (multiple chunks) as produced from the above static readonly uint BFSZ = CHNKSZ * PGSZ, BFRNG = CHNKSZ * PGRNG; //number of uints even number of caches in chunk static readonly ushort[] MCPY; //a Master Copy page used to hold the lower base primes preculled version of the page struct Wst { public ushort msk; public byte mlt; public byte xtr; public ushort nxt; } static readonly byte[] PRLUT; /*Wheel Index Look Up Table */ static readonly Wst[] WSLUT; //Wheel State Look Up Table static readonly byte[] CLUT; // a Counting Look Up Table for very fast counting of primes class Bpa { //very efficient auto-resizing thread-safe read-only indexer class to hold the base primes array byte[] sa = new byte[0]; uint lwi = 0, lpd = 0; object lck = new object(); public uint this[uint i] { get { if (i >= this.sa.Length) lock (this.lck) { var lngth = this.sa.Length; while (i >= lngth) { var bf = (ushort[])MCPY.Clone(); if (lngth == 0) { for (uint bi = 0, wi = 0, w = 0, msk = 0x8000, v = 0; w < bf.Length; bi += WHLPTRN[wi++], wi = (wi >= WHTS) ? 0 : wi) { if (msk >= 0x8000) { msk = 1; v = bf[w++]; } else msk <<= 1; if ((v & msk) == 0) { var p = FSTBP + (bi + bi); var k = (p * p - FSTBP) >> 1; if (k >= PGRNG) break; var pd = p / WCRC; var kd = k / WCRC; var kn = WHLNDX[k - kd * WCRC]; for (uint wrd = kd * WPC + (uint)(kn >> 4), ndx = wi * WHTS + kn; wrd < bf.Length; ) { var st = WSLUT[ndx]; bf[wrd] |= st.msk; wrd += st.mlt * pd + st.xtr; ndx = st.nxt; } } } } else { this.lwi += PGRNG; cullbf(this.lwi, bf); } var c = count(PGRNG, bf); var na = new byte[lngth + c]; sa.CopyTo(na, 0); for (uint p = FSTBP + (this.lwi << 1), wi = 0, w = 0, msk = 0x8000, v = 0; lngth < na.Length; p += (uint)(WHLPTRN[wi++] << 1), wi = (wi >= WHTS) ? 0 : wi) { if (msk >= 0x8000) { msk = 1; v = bf[w++]; } else msk <<= 1; if ((v & msk) == 0) { var pd = p / WCRC; na[lngth++] = (byte)(((pd - this.lpd) << 6) + wi); this.lpd = pd; } } this.sa = na; } } return this.sa[i]; } } } static readonly Bpa baseprms = new Bpa(); //the base primes array using the above class struct PrcsSpc { public Task tsk; public ushort[] buf; } //used for multi-threading buffer array processing #endregion #region private static methods static int count(uint bitlim, ushort[] buf) { //very fast counting method using the CLUT look up table if (bitlim < BFRNG) { var addr = (bitlim - 1) / WCRC; var bit = WHLNDX[bitlim - addr * WCRC] - 1; addr *= WPC; for (var i = 0; i < 3; ++i) buf[addr++] |= (ushort)((unchecked((ulong)-2) << bit) >> (i << 4)); } var acc = 0; for (uint i = 0, w = 0; i < bitlim; i += WCRC) acc += CLUT[buf[w++]] + CLUT[buf[w++]] + CLUT[buf[w++]]; return acc; } static void cullbf(ulong lwi, ushort[] b) { //fast buffer segment culling method using a Wheel State Look Up Table ulong nlwi = lwi; for (var i = 0u; i < b.Length; nlwi += PGRNG, i += PGSZ) MCPY.CopyTo(b, i); //copy preculled lower base primes. for (uint i = 0, pd = 0; ; ++i) { pd += (uint)baseprms[i] >> 6; var wi = baseprms[i] & 0x3Fu; var wp = (uint)WHLPOS[wi]; var p = pd * WCRC + PRLUT[wi]; var k = ((ulong)p * (ulong)p - FSTBP) >> 1; if (k >= nlwi) break; if (k < lwi) { k = (lwi - k) % (WCRC * p); if (k != 0) { var nwp = wp + (uint)((k + p - 1) / p); k = (WHLRNDUP[nwp] - wp) * p - k; } } else k -= lwi; var kd = k / WCRC; var kn = WHLNDX[k - kd * WCRC]; for (uint wrd = (uint)kd * WPC + (uint)(kn >> 4), ndx = wi * WHTS + kn; wrd < b.Length; ) { var st = WSLUT[ndx]; b[wrd] |= st.msk; wrd += st.mlt * pd + st.xtr; ndx = st.nxt; } } } static Task cullbftsk(ulong lwi, ushort[] b, Action<ushort[]> f) { // forms a task of the cull buffer operaion return Task.Factory.StartNew(() => { cullbf(lwi, b); f(b); }); } //iterates the action over each page up to the page including the top_number, //making an adjustment to the top limit for the last page. //this method works for non-dependent actions that can be executed in any order. static void IterateTo(ulong top_number, Action<ulong, uint, ushort[]> actn) { PrcsSpc[] ps = new PrcsSpc[NUMPRCSPCS]; for (var s = 0u; s < NUMPRCSPCS; ++s) ps[s] = new PrcsSpc { buf = new ushort[BFSZ], tsk = Task.Factory.StartNew(() => { }) }; var topndx = (top_number - FSTBP) >> 1; for (ulong ndx = 0; ndx <= topndx; ) { ps[0].tsk.Wait(); var buf = ps[0].buf; for (var s = 0u; s < NUMPRCSPCS - 1; ++s) ps[s] = ps[s + 1]; var lowi = ndx; var nxtndx = ndx + BFRNG; var lim = topndx < nxtndx ? (uint)(topndx - ndx + 1) : BFRNG; ps[NUMPRCSPCS - 1] = new PrcsSpc { buf = buf, tsk = cullbftsk(ndx, buf, (b) => actn(lowi, lim, b)) }; ndx = nxtndx; } for (var s = 0u; s < NUMPRCSPCS; ++s) ps[s].tsk.Wait(); } //iterates the predicate over each page up to the page where the predicate paramenter returns true, //this method works for dependent operations that need to be executed in increasing order. //it is somewhat slower than the above as the predicate function is executed outside the task. static void IterateUntil(Func<ulong, ushort[], bool> prdct) { PrcsSpc[] ps = new PrcsSpc[NUMPRCSPCS]; for (var s = 0u; s < NUMPRCSPCS; ++s) { var buf = new ushort[BFSZ]; ps[s] = new PrcsSpc { buf = buf, tsk = cullbftsk(s * BFRNG, buf, (bfr) => { }) }; } for (var ndx = 0UL; ; ndx += BFRNG) { ps[0].tsk.Wait(); var buf = ps[0].buf; var lowi = ndx; if (prdct(lowi, buf)) break; for (var s = 0u; s < NUMPRCSPCS - 1; ++s) ps[s] = ps[s + 1]; ps[NUMPRCSPCS - 1] = new PrcsSpc { buf = buf, tsk = cullbftsk(ndx + NUMPRCSPCS * BFRNG, buf, (bfr) => { }) }; } } #endregion #region initialization /// <summary> /// the static constructor is used to initialize the static readonly arrays. /// </summary> static UltimatePrimesSoE() { WHLPOS = new byte[WHLPTRN.Length + 1]; //to look up wheel position index from wheel index for (byte i = 0, acc = 0; i < WHLPTRN.Length; ++i) { acc += WHLPTRN[i]; WHLPOS[i + 1] = acc; } WHLNDX = new byte[WCRC + 1]; for (byte i = 1; i < WHLPOS.Length; ++i) { for (byte j = (byte)(WHLPOS[i - 1] + 1); j <= WHLPOS[i]; ++j) WHLNDX[j] = i; } WHLRNDUP = new byte[WCRC * 2]; for (byte i = 1; i < WHLRNDUP.Length; ++i) { if (i > WCRC) WHLRNDUP[i] = (byte)(WCRC + WHLPOS[WHLNDX[i - WCRC]]); else WHLRNDUP[i] = WHLPOS[WHLNDX[i]]; } Func<ushort, int> nmbts = (v) => { var acc = 0; while (v != 0) { acc += (int)v & 1; v >>= 1; } return acc; }; CLUT = new byte[1 << 16]; for (var i = 0; i < CLUT.Length; ++i) CLUT[i] = (byte)nmbts((ushort)(i ^ -1)); PRLUT = new byte[WHTS]; for (var i = 0; i < PRLUT.Length; ++i) { var t = (uint)(WHLPOS[i] * 2) + FSTBP; if (t >= WCRC) t -= WCRC; if (t >= WCRC) t -= WCRC; PRLUT[i] = (byte)t; } WSLUT = new Wst[WHTS * WHTS]; for (var x = 0u; x < WHTS; ++x) { var p = FSTBP + 2u * WHLPOS[x]; var pr = p % WCRC; for (uint y = 0, pos = (p * p - FSTBP) / 2; y < WHTS; ++y) { var m = WHLPTRN[(x + y) % WHTS]; pos %= WCRC; var posn = WHLNDX[pos]; pos += m * pr; var nposd = pos / WCRC; var nposn = WHLNDX[pos - nposd * WCRC]; WSLUT[x * WHTS + posn] = new Wst { msk = (ushort)(1 << (int)(posn & 0xF)), mlt = (byte)(m * WPC), xtr = (byte)(WPC * nposd + (nposn >> 4) - (posn >> 4)), nxt = (ushort)(WHTS * x + nposn) }; } } MCPY = new ushort[PGSZ]; foreach (var lp in BWHLPRMS.SkipWhile(p => p < FSTCP)) { var p = (uint)lp; var k = (p * p - FSTBP) >> 1; var pd = p / WCRC; var kd = k / WCRC; var kn = WHLNDX[k - kd * WCRC]; for (uint w = kd * WPC + (uint)(kn >> 4), ndx = WHLNDX[(2 * WCRC + p - FSTBP) / 2] * WHTS + kn; w < MCPY.Length; ) { var st = WSLUT[ndx]; MCPY[w] |= st.msk; w += st.mlt * pd + st.xtr; ndx = st.nxt; } } } #endregion #region public class // this class implements the enumeration (IEnumerator). // it works by farming out tasks culling pages, which it then processes in order by // enumerating the found primes as recognized by the remaining non-composite bits // in the cull page buffers. class nmrtr : IEnumerator<ulong>, IEnumerator, IDisposable { PrcsSpc[] ps = new PrcsSpc[NUMPRCSPCS]; ushort[] buf; public nmrtr() { for (var s = 0u; s < NUMPRCSPCS; ++s) ps[s] = new PrcsSpc { buf = new ushort[BFSZ] }; for (var s = 1u; s < NUMPRCSPCS; ++s) { ps[s].tsk = cullbftsk((s - 1u) * BFRNG, ps[s].buf, (bfr) => { }); } buf = ps[0].buf; } ulong _curr, i = (ulong)-WHLPTRN[WHTS - 1]; int b = -BWHLPRMS.Length - 1; uint wi = WHTS - 1; ushort v, msk = 0; public ulong Current { get { return this._curr; } } object IEnumerator.Current { get { return this._curr; } } public bool MoveNext() { if (b < 0) { if (b == -1) b += buf.Length; //no yield!!! so automatically comes around again else { this._curr = (ulong)BWHLPRMS[BWHLPRMS.Length + (++b)]; return true; } } do { i += WHLPTRN[wi++]; if (wi >= WHTS) wi = 0; if ((this.msk <<= 1) == 0) { if (++b >= BFSZ) { b = 0; for (var prc = 0; prc < NUMPRCSPCS - 1; ++prc) ps[prc] = ps[prc + 1]; ps[NUMPRCSPCS - 1u].buf = buf; ps[NUMPRCSPCS - 1u].tsk = cullbftsk(i + (NUMPRCSPCS - 1u) * BFRNG, buf, (bfr) => { }); ps[0].tsk.Wait(); buf = ps[0].buf; } v = buf[b]; this.msk = 1; } } while ((v & msk) != 0u); _curr = FSTBP + i + i; return true; } public void Reset() { throw new Exception("Primes enumeration reset not implemented!!!"); } public void Dispose() { } } #endregion #region public instance method and associated sub private method /// <summary> /// Gets the enumerator for the primes. /// </summary> /// <returns>The enumerator of the primes.</returns> public IEnumerator<ulong> GetEnumerator() { return new nmrtr(); } /// <summary> /// Gets the enumerator for the primes. /// </summary> /// <returns>The enumerator of the primes.</returns> IEnumerator IEnumerable.GetEnumerator() { return new nmrtr(); } #endregion #region public static methods /// <summary> /// Gets the count of primes up the number, inclusively. /// </summary> /// <param name="top_number">The ulong top number to check for prime.</param> /// <returns>The long number of primes found.</returns> public static long CountTo(ulong top_number) { if (top_number < FSTBP) return BWHLPRMS.TakeWhile(p => p <= top_number).Count(); var cnt = (long)BWHLPRMS.Length; IterateTo(top_number, (lowi, lim, b) => { Interlocked.Add(ref cnt, count(lim, b)); }); return cnt; } /// <summary> /// Gets the sum of the primes up the number, inclusively. /// </summary> /// <param name="top_number">The uint top number to check for prime.</param> /// <returns>The ulong sum of all the primes found.</returns> public static ulong SumTo(uint top_number) { if (top_number < FSTBP) return (ulong)BWHLPRMS.TakeWhile(p => p <= top_number).Aggregate(0u, (acc, p) => acc += p); var sum = (long)BWHLPRMS.Aggregate(0u, (acc, p) => acc += p); Func<ulong, uint, ushort[], long> sumbf = (lowi, bitlim, buf) => { var acc = 0L; for (uint i = 0, wi = 0, msk = 0x8000, w = 0, v = 0; i < bitlim; i += WHLPTRN[wi++], wi = wi >= WHTS ? 0 : wi) { if (msk >= 0x8000) { msk = 1; v = buf[w++]; } else msk <<= 1; if ((v & msk) == 0) acc += (long)(FSTBP + ((lowi + i) << 1)); } return acc; }; IterateTo(top_number, (pos, lim, b) => { Interlocked.Add(ref sum, sumbf(pos, lim, b)); }); return (ulong)sum; } /// <summary> /// Gets the prime number at the zero based index number given. /// </summary> /// <param name="index">The long zero-based index number for the prime.</param> /// <returns>The ulong prime found at the given index.</returns> public static ulong ElementAt(long index) { if (index < BWHLPRMS.Length) return (ulong)BWHLPRMS.ElementAt((int)index); long cnt = BWHLPRMS.Length; var ndx = 0UL; var cycl = 0u; var bit = 0u; IterateUntil((lwi, bfr) => { var c = count(BFRNG, bfr); if ((cnt += c) < index) return false; ndx = lwi; cnt -= c; c = 0; do { var w = cycl++ * WPC; c = CLUT[bfr[w++]] + CLUT[bfr[w++]] + CLUT[bfr[w]]; cnt += c; } while (cnt < index); cnt -= c; var y = (--cycl) * WPC; ulong v = ((ulong)bfr[y + 2] << 32) + ((ulong)bfr[y + 1] << 16) + bfr[y]; do { if ((v & (1UL << ((int)bit++))) == 0) ++cnt; } while (cnt <= index); --bit; return true; }); return FSTBP + ((ndx + cycl * WCRC + WHLPOS[bit]) << 1); } #endregion }

上面的代码将枚举到四核i7-2700K(3.5 GHz)的四核(包括HT在内的八个线程)上，在大约1.55秒内将质数提高到10亿，而E7500的速度可能会降低多达四倍，这是因为线程较少且略微较低的时钟速度。大约有四分之三的时间只是运行枚举MoveNext()方法和Current属性的时间，因此我提供了公共(public)静态方法“CountTo”，“SumTo”和“ElementAt”来计算a中的质数或总和。不使用枚举的范围和第n个基于零的素数。使用UltimatePrimesSoE.CountTo(1000000000)静态方法在我的计算机上大约在0.32秒内产生50847534，因此在Intel E7500上花费的时间不应超过大约1.28秒。

有趣的是，此代码在x86 32位模式下的运行速度比在x64 64位模式下快30％，这可能是因为避免了将uint32数字扩展为ulong的额外开销。以上所有时序均适用于64位模式。 END_EDIT_ADD

在将近300行(密集)的代码行中，此实现并不简单，但这是进行所有描述的优化以使此代码如此高效的代价。 the other answer by Aaron Murgatroyd并不需要更多的代码行；尽管他的代码不那么密集，但他的代码也慢了大约四倍。实际上，几乎所有执行时间都花在了我的代码的私有(private)静态“cullbf”方法的最后一个“for循环”中，该方法只有四个语句，加上范围条件检查。所有其他代码只是为了支持该循环的重复应用。

该代码比其他答案更快的原因与该代码比您的代码更快的原因相同，除了他仅处理奇数素数候选者的步骤(1)优化之外。他对多处理的使用几乎完全无效，因为只有30％的优势，而不是正确应用时在真正的四核CPU上应该有的四分之一，因为它按每个素数而不是小页面上的所有素数进行线程化，与不直接使用包含边界检查的数组直接使用数组相比，使用不安全的指针数组访问作为消除每个循环的数组边界检查的DotNet计算成本的方法实际上会使代码变慢，因为DotNet Just In Time(JIT)编译器会产生效率很低的代码用于指针访问。另外，他的代码像我的代码一样枚举素数，该枚举每个枚举素数的CPU时钟周期成本为10，这在他的情况下也稍差一些，因为他使用的内置C#迭代器要少一些比我的“自己动手” IEnumerator接口(interface)更有效。但是，为了获得最大速度，我们应该完全避免枚举。但是，即使他提供的“Count”实例方法也使用“foreach”循环，这意味着枚举。

总而言之，此答案代码产生的主要答案比您的问题代码在E7500 CPU上快25倍(在具有更多内核/线程的CPU上，其答案快很多倍)占用的内存更少，并且不限于较小的主要范围。 32位数字范围，但以增加代码复杂度为代价。

关于c# - 如何使用多线程C#实现Eratosthenes筛网？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/4700107/

文章推荐： c# - 用逗号打印整数作为千位分隔符(1234567 到 "1,234,567")

文章推荐： c# - 系统参数异常 : DataGridViewComboBoxCell value is not valid

c++ - C c;之间有什么区别吗？和 C c = C();?
#include using namespace std; class C{ private: int value; public: C(){ value = 0;
c++ - C 风格字符串差异 : C/C++
这个问题已经有答案了: What is the difference between char a[] = ?string?; and char *p = ?string?;? (8 个回答) 已关闭
c++ - c\c++ 转换为 C#
关闭。此题需要details or clarity 。目前不接受答案。想要改进这个问题吗？通过 editing this post 添加详细信息并澄清问题. 已关闭 7 年前。此帖子已于 8 个月
c# - C、C++、C# 的功能测试工具
除了调试之外，是否有任何针对 c、c++ 或 c# 的测试工具，其工作原理类似于将独立函数复制粘贴到某个文本框，然后在其他文本框中输入参数？最佳答案也许您会考虑单元测试。我推荐你谷歌测试和谷歌模拟
c# - C/C++/C# 在监视器上设置窗口位置
我想在第二台显示器中移动一个窗口 (HWND)。问题是我尝试了很多方法，例如将分辨率加倍或输入负值，但它永远无法将窗口放在我的第二台显示器上。关于如何在 C/C++/c# 中执行此操作的任何线索最
c# - C/C++/C#中的DES实现
我正在寻找 C/C++/C## 中不同类型 DES 的现有实现。我的运行平台是Windows XP/Vista/7。我正在尝试编写一个 C# 程序，它将使用 DES 算法进行加密和解密。我需要一些实
c# - 在条件中使用赋值是否安全？ C/C++、C#
很难说出这里要问什么。这个问题模棱两可、含糊不清、不完整、过于宽泛或夸夸其谈，无法以目前的形式得到合理的回答。如需帮助澄清此问题以便重新打开，visit the help center . 关闭 1
c++ - C/C++/C# 强制窗口在最上面
有没有办法强制将另一个窗口置于顶部？不是应用程序的窗口，而是另一个已经在系统上运行的窗口。 (Windows, C/C++/C#) 最佳答案 SetWindowPos(that_window_ha
c# - 套接字服务器应用程序的选择 : C/C++ or C#
假设您可以在 C/C++ 或 Csharp 之间做出选择，并且您打算在 Windows 和 Linux 服务器上运行同一服务器的多个实例，那么构建套接字服务器应用程序的最明智选择是什么？最佳答案如
c++ - C/C++ 运行时库和 C/C++ 标准库的区别
你们能告诉我它们之间的区别吗？顺便问一下，有什么叫C++库或C库的吗？最佳答案 C++ 标准库和 C 标准库是 C++ 和 C 标准定义的库，提供给 C++ 和 C 程序使用。那是那些词的共同
c++ - &C::c 和 &(C::c) 有什么区别？
下面的测试代码，我将输出信息放在注释中。我使用的是 gcc 4.8.5 和 Centos 7.2。 #include #include class C { public:
c++ - 什么 C++(通用 (c/c++) 与 (通用 c)/c++ )
很难说出这里问的是什么。这个问题是含糊的、模糊的、不完整的、过于宽泛的或修辞性的，无法以目前的形式得到合理的回答。如需帮助澄清此问题以便重新打开它，visit the help center 。已关
c# - 通过网络在 C/C++ 服务器、C/C++ 和 C# 客户端之间发送数据结构
我的客户将使用名为 annoucement 的结构/类与客户通信。我想我会用 C++ 编写服务器。会有很多不同的类继承annoucement。我的问题是通过网络将这些类发送给客户端我想也许我应该使用
c# - C/C++ - 如何将 Buffer.BlockCopy (C#) 转换为 C/C++
我在 C# 中有以下函数: public Matrix ConcatDescriptors(IList> descriptors) { int cols = descriptors[0].Co
c++ - C/C++ - 对其他人隐藏 C 或 C++ 函数代码
我有一个项目要编写一个函数来对某些数据执行某些操作。我可以用 C/C++ 编写代码，但我不想与雇主共享该函数的代码。相反，我只想让他有权在他自己的代码中调用该函数。是否可以？我想到了这两种方法 - 在
c# - 在托管代码(C++、C、C++/CLI、C#)中使用非托管代码时处理错误
我使用的是编写糟糕的第 3 方 (C/C++) Api。我从托管代码(C++/CLI)中使用它。有时会出现“访问冲突错误”。这使整个应用程序崩溃。我知道我无法处理这些错误[如果指针访问非法内存位置等，
c# - C#、C/C++ 或 Objective-C 中的眼动追踪库
关闭。这个问题不符合Stack Overflow guidelines .它目前不接受答案。我们不允许提问寻求书籍、工具、软件库等的推荐。您可以编辑问题，以便用事实和引用来回答。关闭 7 年前。
c++ - C/C++/Objective-C 文本识别库
已关闭。此问题不符合Stack Overflow guidelines 。目前不接受答案。要求我们推荐或查找工具、库或最喜欢的场外资源的问题对于 Stack Overflow 来说是偏离主题的，因为
c# - 将 C/C++ 函数导入 C#
我有一些 C 代码，将使用 P/Invoke 从 C# 调用。我正在尝试为这个 C 函数定义一个 C# 等效项。 SomeData* DoSomething(); struct SomeData {
c - C语言中 "c -= --c - c++;"的结果应该是什么？
这个问题已经有答案了: Why are these constructs using pre and post-increment undefined behavior? (14 个回答) 已关闭 6

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c# - 如何使用多线程C#实现Eratosthenes筛网？