c# - 如何在 C# 中解析文本文件并进行 io 绑定(bind)？-6ren

c# - 如何在 C# 中解析文本文件并进行 io 绑定(bind)？

转载作者：太空狗更新时间：2023-10-29 20:14:34

众所周知，如果您从磁盘读取数据，您将受到 IO 限制，并且您可以比从磁盘读取数据更快地处理/解析读取的数据。

但是这种常识(神话？)并没有在我的测试中得到反映。当我读取一个每行包含以空格分隔的一个 double 和一个 int 的文本文件时，解析速度比我的物理磁盘读取速度慢得多(约 6 倍)。
文本文件看起来像这样

1,1 0
2,1 1
3,1 2

更新
当我在一次读取中使用完整缓冲区执行 ReadFile 以获得“真实”性能时，我已经包含了 PInvoke 性能。

ReadFile 性能 - ReadFileIntoByteBuffer

StreamReader.ReadLine 性能 - CountLines

StreamReader.ReadLine + 不安全(unsafe)解析性能 - ParseLinesUnsafe

StreamReader.Read + 不安全(unsafe)字符缓冲区解析性能 - ParseLinesUnsafeCharBuf

StreamReader.ReadLine + 解析性能 - ParseLines

结果是

Did native read 179,0MB in                    0,4s, 484,2MB/s
Did read 10.000.000 lines in                  1,6s, 112,7MB/s
Did parse and read unsafe 179,0MB in          2,3s,  76,5MB/s
Did parse and read unsafe char buf 179,0MB in 2,8s,  63,5MB/s
Did read and parse 179,0MB in                 9,3s,  19,3MB/s

尽管我确实尝试在 ParseLinesUnsafeCharBuf 中跳过字符串构造的开销，但它仍然比每次分配一个新字符串的版本慢很多。它仍然比使用最简单的解决方案得到的最初约 20 MB/s 好得多，但我确实认为 .NET 应该能够做得更好。如果移除解析字符串的逻辑，我会得到 258,8 MB/s，这是非常好的并且接近 native 速度。但是我没有看到使用不安全代码使我的解析更简单的方法。我必须处理不完整的行，这使得它非常复杂。

更新
从数字可以清楚地看出，一个简单的 string.Split 确实花费太多了。但是 StreamReader 也确实花费了很多。一个高度优化、更接近真实磁盘速度的解决方案应该是什么样子？我尝试了很多使用不安全代码和字符缓冲区的方法，性能提升大约只有 30%，远没有达到我需要的数量级。我可以接受 100MB/s 的解析速度。这应该可以通过托管代码实现，还是我错了？

难道 C# 的解析速度不可能比我从硬盘读取的速度快吗？它是英特尔 Postville X25M。 CPU 是较旧的 Intel Dual Core。我有 3 GB RAM Windows 7 .NET 3.5 SP1 和 .NET 4。

但我确实在普通硬盘上也看到了相同的结果。使用今天的硬盘，线性读取速度可以达到 400MB/s。这是否意味着我应该重构我的应用程序以在实际需要时按需读取数据，而不是急切地将其读入内存，代价是由于增加的对象图导致 GC 周期更长，GC 次数增加。

我注意到，如果我的托管应用程序使用超过 500MB 的内存，它的响应速度就会大大降低。一个主要的影响因素似乎是对象图的复杂性。因此，在需要时读取数据可能会更好。至少这是我对当前数据的结论。

这是代码

using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Diagnostics;
using System.Runtime.InteropServices;
using Microsoft.Win32.SafeHandles;
using System.ComponentModel;

namespace IOBound
{
    /// <summary>
    /// Micro-benchmark that compares one raw Win32 read of a text file with several
    /// managed ways of reading and parsing the same file. Every data line holds a
    /// decimal number and an integer separated by white space, e.g. "1,1 0".
    /// </summary>
    class Program
    {
        /// <summary>Size in bytes of the buffer handed to the single native ReadFile call.</summary>
        private const int NativeReadBufferSize = 200 * 1024 * 1024;

        /// <summary>Number of chars buffered per read by <see cref="ParseLinesUnsafeCharBuf"/>.</summary>
        private const int CharBufferSize = 10 * 1000 * 1000;

        /// <summary>
        /// Runs every benchmark once against the data file and prints the elapsed time
        /// and throughput of each. The file is created first if it does not exist.
        /// </summary>
        /// <param name="args">Optional: args[0] overrides the default data file path.</param>
        static void Main(string[] args)
        {
            string data = args.Length > 0 ? args[0] : @"C:\Source\IOBound\NumericData.txt";
            if (!File.Exists(data))
            {
                CreateTestData(data);
            }

            // Floating point on purpose: the values are printed with one decimal,
            // integer division would silently truncate them.
            double MB = new FileInfo(data).Length / (1024.0 * 1024.0);

            var sw = Stopwatch.StartNew();
            uint bytes = ReadFileIntoByteBuffer(data);
            sw.Stop();
            Console.WriteLine("Did native read {0:F1}MB in {1:F1}s, {2:F1}MB/s",
                bytes / (1024.0 * 1024.0), sw.Elapsed.TotalSeconds, MB / sw.Elapsed.TotalSeconds);

            sw = Stopwatch.StartNew();
            int n = CountLines(data);
            sw.Stop();
            Console.WriteLine("Did read {0:N0} lines in {1:F1}s, {2:F1}MB/s",
                n, sw.Elapsed.TotalSeconds, MB / sw.Elapsed.TotalSeconds);

            sw = Stopwatch.StartNew();
            ParseLinesUnsafe(data);
            sw.Stop();
            Console.WriteLine("Did parse and read unsafe {0:F1}MB in {1:F1}s, {2:F1}MB/s",
                MB, sw.Elapsed.TotalSeconds, MB / sw.Elapsed.TotalSeconds);

            sw = Stopwatch.StartNew();
            ParseLinesUnsafeCharBuf(data);
            sw.Stop();
            Console.WriteLine("Did parse and read unsafe char buf {0:F1}MB in {1:F1}s, {2:F1}MB/s",
                MB, sw.Elapsed.TotalSeconds, MB / sw.Elapsed.TotalSeconds);

            sw = Stopwatch.StartNew();
            ParseLines(data);
            sw.Stop();
            Console.WriteLine("Did read and parse {0:F1}MB in {1:F1}s, {2:F1}MB/s",
                MB, sw.Elapsed.TotalSeconds, MB / sw.Elapsed.TotalSeconds);
        }

        /// <summary>
        /// Reads the file with one native ReadFile call into a pinned managed buffer.
        /// </summary>
        /// <param name="data">Path of the file to read.</param>
        /// <returns>
        /// Number of bytes the single call delivered; at most
        /// <see cref="NativeReadBufferSize"/>, so larger files are only read partially.
        /// </returns>
        /// <exception cref="Win32Exception">ReadFile reported a failure.</exception>
        private unsafe static uint ReadFileIntoByteBuffer(string data)
        {
            // Read access is all that is needed; this also allows read-only files.
            using (var stream = new FileStream(data, FileMode.Open, FileAccess.Read))
            {
                byte[] buf = new byte[NativeReadBufferSize];
                fixed (byte* pBuf = buf)
                {
                    uint dwRead;
                    // Request exactly the buffer size so request and allocation cannot drift apart.
                    if (!ReadFile(stream.SafeFileHandle, pBuf, (uint)buf.Length, out dwRead, IntPtr.Zero))
                    {
                        throw new Win32Exception(Marshal.GetLastWin32Error());
                    }
                    return dwRead;
                }
            }
        }

        /// <summary>Counts the lines of the file using StreamReader.ReadLine.</summary>
        /// <param name="data">Path of the file to read.</param>
        /// <returns>Number of lines read.</returns>
        private static int CountLines(string data)
        {
            using (var reader = new StreamReader(data))
            {
                int count = 0;
                while (reader.ReadLine() != null)
                {
                    count++;
                }

                return count;
            }
        }

        /// <summary>True for the ASCII digits '0'..'9' only.</summary>
        /// <remarks>
        /// char.IsNumber is deliberately not used: it also accepts characters such as
        /// superscripts or non-Latin digits for which "c - '0'" is not the digit value.
        /// </remarks>
        private static bool IsDigit(char c)
        {
            return c >= '0' && c <= '9';
        }

        /// <summary>
        /// True for ',' and '.'. Both are accepted because CreateTestData formats the
        /// numbers with the current culture, so the separator depends on the machine
        /// that produced the file.
        /// </summary>
        private static bool IsDecimalSeparator(char c)
        {
            return c == ',' || c == '.';
        }

        /// <summary>
        /// Parses one line of the form "&lt;digits&gt;[&lt;sep&gt;&lt;digits&gt;] &lt;digits&gt;" from the
        /// half-open char range [pCur, pEnd). Never reads at or beyond pEnd.
        /// Signs, exponents and group separators are not supported; anything after
        /// the integer (for example a trailing '\r') is ignored.
        /// </summary>
        /// <param name="pCur">First char of the line.</param>
        /// <param name="pEnd">One past the last char of the line.</param>
        /// <param name="d">Receives the decimal number.</param>
        /// <param name="i">Receives the integer.</param>
        /// <returns>False if the line does not match the expected shape.</returns>
        private unsafe static bool TryParseLine(char* pCur, char* pEnd, out double d, out int i)
        {
            d = 0;
            i = 0;

            if (pCur == pEnd || !IsDigit(*pCur))
            {
                return false;
            }

            long a = 0;
            while (pCur != pEnd && IsDigit(*pCur))
            {
                a = a * 10 + (*pCur++ - '0');
            }

            d = a;
            if (pCur != pEnd && IsDecimalSeparator(*pCur))
            {
                pCur++;
                long b = 0;
                long div = 1;
                while (pCur != pEnd && IsDigit(*pCur))
                {
                    // Plain assignment: "b += b * 10 + digit" would add the old value twice.
                    b = b * 10 + (*pCur++ - '0');
                    div *= 10;
                }
                d = a + ((double)b) / div;
            }

            if (pCur == pEnd || !char.IsWhiteSpace(*pCur))
            {
                return false;
            }
            while (pCur != pEnd && char.IsWhiteSpace(*pCur))
            {
                pCur++;
            }

            if (pCur == pEnd || !IsDigit(*pCur))
            {
                return false;
            }
            while (pCur != pEnd && IsDigit(*pCur))
            {
                i = i * 10 + (*pCur++ - '0');
            }

            return true;
        }

        /// <summary>
        /// Reads the file in large char blocks and parses the lines directly from the
        /// pinned buffer, without creating a string per line. A line that is cut off
        /// at the end of a block is moved to the start of the buffer and completed by
        /// the next read; a final line without a trailing newline is parsed as well.
        /// The parsed values are collected in local lists and discarded.
        /// </summary>
        /// <param name="data">Path of the file to read.</param>
        /// <exception cref="InvalidDataException">A single line is longer than the buffer.</exception>
        unsafe private static void ParseLinesUnsafeCharBuf(string data)
        {
            var doubles = new List<double>();
            var ints = new List<int>();

            using (var reader = new StreamReader(data))
            {
                char[] buffer = new char[CharBufferSize];
                int carried = 0; // chars of an unfinished line kept at the start of the buffer

                fixed (char* ln = buffer)
                {
                    bool endOfInput = false;
                    while (!endOfInput)
                    {
                        // With a full buffer Read would be asked for 0 chars and return 0,
                        // which would be indistinguishable from the end of the file.
                        if (carried == buffer.Length)
                        {
                            throw new InvalidDataException("A single line does not fit into the read buffer.");
                        }

                        int readChars = reader.Read(buffer, carried, buffer.Length - carried);
                        endOfInput = readChars == 0;

                        char* pEnd = ln + carried + readChars;
                        char* pLineStart = ln;

                        while (pLineStart != pEnd)
                        {
                            char* pLineEnd = pLineStart;
                            while (pLineEnd != pEnd && *pLineEnd != '\n')
                            {
                                pLineEnd++;
                            }

                            // No terminator yet: the rest of the line arrives with the next
                            // read, unless the input is exhausted and this is the last line.
                            if (pLineEnd == pEnd && !endOfInput)
                            {
                                break;
                            }

                            double d;
                            int i;
                            if (TryParseLine(pLineStart, pLineEnd, out d, out i))
                            {
                                doubles.Add(d);
                                ints.Add(i);
                            }

                            pLineStart = pLineEnd == pEnd ? pEnd : pLineEnd + 1;
                        }

                        // Always recompute the carry-over so no stale offset survives an iteration.
                        carried = (int)(pEnd - pLineStart);
                        if (carried > 0 && pLineStart != ln)
                        {
                            Buffer.BlockCopy(buffer, (int)(pLineStart - ln) * sizeof(char), buffer, 0, carried * sizeof(char));
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Reads the file line by line with StreamReader.ReadLine and parses every
        /// line through a pinned char pointer instead of string.Split/Parse.
        /// Lines that do not match the expected shape are skipped. The parsed values
        /// are collected in local lists and discarded.
        /// </summary>
        /// <param name="data">Path of the file to read.</param>
        unsafe private static void ParseLinesUnsafe(string data)
        {
            var doubles = new List<double>();
            var ints = new List<int>();

            using (var reader = new StreamReader(data))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    fixed (char* ln = line)
                    {
                        // All parser state lives inside TryParseLine, so nothing leaks
                        // from one line into the next.
                        double d;
                        int i;
                        if (TryParseLine(ln, ln + line.Length, out d, out i))
                        {
                            doubles.Add(d);
                            ints.Add(i);
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Baseline: reads the file line by line, splits each line at ' ' and converts
        /// the two parts with double.Parse and int.Parse (current culture). Lines that
        /// do not split into exactly two parts are skipped. The parsed values are
        /// collected in local lists and discarded.
        /// </summary>
        /// <param name="data">Path of the file to read.</param>
        private static void ParseLines(string data)
        {
            var doubles = new List<double>();
            var ints = new List<int>();

            using (var reader = new StreamReader(data))
            {
                string line;
                char[] sep = new char[] { ' ' };
                while ((line = reader.ReadLine()) != null)
                {
                    var parts = line.Split(sep);
                    if (parts.Length == 2)
                    {
                        doubles.Add(double.Parse(parts[0]));
                        ints.Add(int.Parse(parts[1]));
                    }
                }
            }
        }

        /// <summary>
        /// Creates (or overwrites) the test file with 10,000,000 lines "{1.1 + i} {i}",
        /// UTF-8 encoded and formatted with the current culture.
        /// </summary>
        /// <param name="fileName">Path of the file to write.</param>
        static void CreateTestData(string fileName)
        {
            // The writer opens and owns the file, so disposing it releases the handle
            // on every path.
            using (var writer = new StreamWriter(fileName, false, Encoding.UTF8))
            {
                for (int i = 0; i < 10 * 1000 * 1000; i++)
                {
                    writer.WriteLine("{0} {1}", 1.1d + i, i);
                }
            }
        }

        /// <summary>
        /// kernel32 ReadFile. The Win32 BOOL result is marshalled as bool; on false
        /// the error code is available through Marshal.GetLastWin32Error.
        /// </summary>
        [DllImport("kernel32.dll", SetLastError = true)]
        [return: MarshalAs(UnmanagedType.Bool)]
        unsafe static extern bool ReadFile(SafeFileHandle hFile, [Out] byte* lpBuffer, uint nNumberOfBytesToRead, out uint lpNumberOfBytesRead, IntPtr lpOverlapped);

    }
}

最佳答案

所以这里有几个问题。其他人已经评论了 Windows 的 IO 缓存以及实际的硬件缓存，所以我将不理会它。

另一个问题是您测量 read() + parse() 的组合操作，并将其与 read() 的速度进行比较。本质上，您需要意识到 A + B 总是大于 A(假设非负)这一事实。

因此，要确定您是否受 IO 限制，您需要了解读取文件需要多长时间。你已经做到了。在我的机器上，您的测试运行时间约为 220 毫秒以读取文件。

现在您需要测量解析这么多不同的字符串需要多长时间。这有点难以隔离。因此，假设我们将它们放在一起，并从总时间中减去读取所需的时间。此外，我们不是要衡量您对数据的处理，而只是衡量解析本身，所以扔掉 List<double> 和 List<int>，只做解析。在我的机器上运行它大约需要 1000 毫秒，减去 220 毫秒的读取时间，您的解析代码每 100 万行大约需要 780 毫秒。

那么为什么它这么慢(比读取慢 3-4 倍)？让我们再排除一些因素。注释掉 int.Parse 和 double.Parse 并再次运行。结果好多了：总共 460 毫秒，减去 220 毫秒的读取时间，现在解析时间为 240 毫秒。当然，此时的“解析”只是调用 string.Split()。嗯，看起来 string.Split 的开销与磁盘 IO 差不多，考虑到 .NET 处理字符串的方式，这也就不足为奇了。

那么 C# 解析的速度可以比从磁盘读取更快吗？嗯，是的，它可以，但你将不得不变得讨厌。您会看到 int.Parse 和 double.Parse 受制于它们具有文化意识的事实。由于这一点以及这些解析例程处理许多格式的事实，它们在您的示例中有些昂贵。我的意思是说我们每微秒(百万分之一秒)解析一个 double 和 int ，这通常还不错。

因此，为了匹配磁盘读取的速度(因此受 IO 限制)，我们需要重写处理文本行的方式。这是一个讨厌的例子，但它适用于你的例子......

// Parses one text line of the form "<digits>[.<digits>] <digits>" without
// string.Split / double.Parse / int.Parse. Signs, exponents and group
// separators are not handled. `line` is the string read by the surrounding code.
int len = line.Length;
fixed (char* ln = line)
{
    double d;
    long a = 0, b = 0;
    int ix = 0;

    // Integer part. Only ASCII digits are accepted; char.IsNumber would also let
    // through characters for which "c - '0'" is not the digit value.
    while (ix < len && ln[ix] >= '0' && ln[ix] <= '9')
    {
        a = a * 10 + (ln[ix++] - '0');
    }

    if (ix < len && ln[ix] == '.')
    {
        ix++;
        long div = 1;
        while (ix < len && ln[ix] >= '0' && ln[ix] <= '9')
        {
            // Plain assignment: "b += b * 10 + digit" would add the old value twice
            // and corrupt every fraction with more than one digit.
            b = b * 10 + (ln[ix++] - '0');
            div *= 10;
        }
        d = a + ((double)b) / div;
    }
    else
    {
        // No fractional part: the value is the integer part.
        d = a;
    }

    while (ix < len && char.IsWhiteSpace(ln[ix]))
    {
        ix++;
    }

    int i = 0;
    while (ix < len && ln[ix] >= '0' && ln[ix] <= '9')
    {
        i = i * 10 + (ln[ix++] - '0');
    }
}

运行这段糟糕的代码大约需要 450 毫秒，也就是读取时间的大约 2 倍。所以，假设你认为上面的代码片段是可以接受的(我希望你不会)，你可以让一个线程读取字符串、另一个线程解析，这样就接近受 IO 限制(IO bound)了。用两个线程来解析，您就会受 IO 限制。你是否应该这样做是另一个问题。

那么让我们回到你最初的问题:

It is known that if you read data from disc you are IO bound and you can process/parse the read data much faster than you can read it from disc.

But this common wisdom (myth?)

嗯，不，我不会把这称为神话。事实上，我认为你的原始代码仍然是受 IO 限制的(IO bound)。您碰巧是在孤立的环境中运行测试，因此影响很小，仅占从设备读取时间的 1/6。但是考虑一下如果该磁盘繁忙会发生什么？如果您的防病毒扫描程序正在扫描每个文件怎么办？简单地说，您的程序会随着硬盘活动的增加而变慢，并且可能变成受 IO 限制。

恕我直言，这种“常识”的原因是:
It's easier to get IO bound on writes than on reads.

写入设备需要更长的时间，并且通常比生成数据更昂贵。如果您想查看 IO Bound 的实际效果，请查看您的“CreateTestData”方法。您的 CreateTestData 方法将数据写入磁盘所需的时间是调用 String.Format(...) 的 2 倍。这是完全缓存。关闭缓存 ( FileOptions.WriteThrough ) 并重试...现在 CreateTestData 慢了 3-4 倍。使用以下方法自己尝试一下:
// Writes `linecount` lines of the form "{1.1 + row} {row}" to fileName as UTF-8.
// The file is created (or overwritten) with FileOptions.WriteThrough, exclusive
// access (FileShare.None) and a 4096-byte FileStream buffer.
// Returns linecount. `linecount` is declared outside this method.
static int CreateTestData(string fileName)
{
    var target = new FileStream(fileName, FileMode.Create, FileAccess.Write, FileShare.None, 4096, FileOptions.WriteThrough);
    using (var output = new StreamWriter(target, Encoding.UTF8))
    {
        int row = 0;
        while (row < linecount)
        {
            output.WriteLine("{0} {1}", 1.1d + row, row);
            row++;
        }
    }
    return linecount;
}
// Formats `linecount` strings with the pattern "{0} {1}" and throws each result
// away, so only the cost of String.Format is incurred. Nothing is written;
// the fileName parameter is not used. Returns linecount.
// `linecount` is declared outside this method.
static int PrintTestData(string fileName)
{
    int row = 0;
    while (row < linecount)
    {
        String.Format("{0} {1}", 1.1d + row, row);
        row++;
    }
    return linecount;
}
这还只是个开始，如果您真的想体验受 IO 限制(IO bound)，您可以开始使用直接 IO。请参阅 CreateFile 文档中关于 FILE_FLAG_NO_BUFFERING 的说明。当您开始绕过硬件缓存并等待 IO 完成时，写入速度会变慢。这是传统数据库写入速度非常慢的一个主要原因。它们必须强制硬件完成写入并等待其完成。只有这样它们才能将事务称为“已提交”，即数据已经在物理设备上的文件中。

更新

好的阿洛伊斯，看来你只是在寻找你能走多快。为了更快，您需要停止处理字符串和字符并删除分配以加快速度。以下代码将上面的行/字符解析器改进了大约一个数量级(仅计算行数增加了大约 30 毫秒)，同时仅在堆上分配了一个缓冲区。

警告：你需要意识到，我只是在证明它可以做得很快。我并不是建议你走这条路。这段代码有一些严重的限制和/或错误。比如，当你遇到“1.2589E+19”这种形式的 double 时会发生什么？坦率地说，我认为您应该坚持使用原始代码，而不必费心去优化它。或者将文件格式改为二进制而不是文本(请参阅 BinaryWriter)。如果您使用的是二进制文件，您可以使用以下代码的变体，配合 BitConverter.ToDouble/ToInt32，它会更快。
// Reads the file as raw bytes in blocks and parses lines of the form
// "<digits>[.<digits>] <digits>" straight from the pinned byte buffer, calling
// DoSomething(d, i) for each parsed line. DoSomething is defined outside this
// method.
//
// Only lines that are complete (terminated by '\n', or the last line at the end
// of the file) are parsed; an unfinished line at the end of a block is moved to
// the start of the buffer and completed by the next read. Leading bytes that are
// not ASCII digits (for example a UTF-8 byte order mark) are skipped, and lines
// without any digit are ignored.
//
// Limitations: no signs, no exponents ("1.2589E+19"), '.' is the only decimal
// separator, and a single line must fit into the buffer.
//
// data:    path of the file to read.
// returns: number of lines passed to DoSomething.
// throws:  InvalidDataException if one line is longer than the buffer.
private static unsafe int ParseFast(string data)
{
    int count = 0, valid = 0, temp;
    byte[] buffer = new byte[ushort.MaxValue];

    const byte Zero = (byte) '0';
    const byte Nine = (byte) '9';
    const byte Dot = (byte)'.';
    const byte Space = (byte)' ';
    const byte Tab = (byte) '\t';
    const byte Line = (byte) '\n';

    fixed (byte *ptr = buffer)
    {
        using (Stream reader = File.OpenRead(data))
        {
            bool endOfInput = false;
            while (!endOfInput)
            {
                // A full buffer would turn the next Read into a zero-byte request whose
                // result (0) could not be told apart from the end of the file.
                if (valid == buffer.Length)
                {
                    throw new InvalidDataException("A single line does not fit into the read buffer.");
                }

                temp = reader.Read(buffer, valid, buffer.Length - valid);
                endOfInput = temp == 0;
                valid += temp;

                int lineStart = 0;
                while (lineStart < valid)
                {
                    int lineEnd = lineStart;
                    while (lineEnd < valid && ptr[lineEnd] != Line)
                    {
                        lineEnd++;
                    }

                    // Unterminated line: wait for more data unless the input is exhausted.
                    if (lineEnd == valid && !endOfInput)
                    {
                        break;
                    }

                    int pos = lineStart;
                    while (pos < lineEnd && !(ptr[pos] >= Zero && ptr[pos] <= Nine))
                    {
                        pos++;
                    }

                    if (pos < lineEnd)
                    {
                        double d;
                        long a = 0, b = 0;
                        while (pos < lineEnd && ptr[pos] >= Zero && ptr[pos] <= Nine)
                        {
                            a = a*10 + (ptr[pos++] - Zero);
                        }

                        if (pos < lineEnd && ptr[pos] == Dot)
                        {
                            pos++;
                            long div = 1;
                            while (pos < lineEnd && ptr[pos] >= Zero && ptr[pos] <= Nine)
                            {
                                // Plain assignment: "b += b*10 + digit" would add the old value twice.
                                b = b*10 + (ptr[pos++] - Zero);
                                div *= 10;
                            }
                            d = a + ((double) b)/div;
                        }
                        else
                        {
                            d = a;
                        }

                        while (pos < lineEnd && (ptr[pos] == Space || ptr[pos] == Tab))
                        {
                            pos++;
                        }

                        int i = 0;
                        while (pos < lineEnd && ptr[pos] >= Zero && ptr[pos] <= Nine)
                        {
                            i = i*10 + (ptr[pos++] - Zero);
                        }

                        DoSomething(d, i);
                        count++;
                    }

                    lineStart = lineEnd < valid ? lineEnd + 1 : valid;
                }

                // Keep the unfinished tail (if any) for the next round.
                int remaining = valid - lineStart;
                if (remaining > 0 && lineStart > 0)
                {
                    Buffer.BlockCopy(buffer, lineStart, buffer, 0, remaining);
                }
                valid = remaining;
            }
        }
    }
    return count;
}

关于c# - 如何在 C# 中解析文本文件并进行 io 绑定(bind)？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/7153315/

文章推荐： c# - 确定 MSI 的版本而不安装它

文章推荐： c# - 如何比较 C# 中的相对文件或目录路径？

文章推荐： c++ - 结构中的字符

太空狗

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c# - 如何在 C# 中解析文本文件并进行 io 绑定(bind)？