gpt4 book ai didi

c# - 对于非常长的字符串列表,什么是合适的搜索/检索方法?

转载 作者:IT王子 更新时间:2023-10-29 03:43:20 26 4
gpt4 key购买 nike

这不是一个非常罕见的问题,但我似乎仍然无法找到真正解释该选择的答案。

我有一个非常大的字符串列表(准确地说是 SHA-256 哈希的 ASCII 表示),我需要查询该列表中是否存在字符串。

此列表中可能有超过 1 亿个条目,我将需要多次重复查询条目是否存在。

鉴于大小,我怀疑我能否将其全部塞入 HashSet<string> .什么是合适的检索系统来最大限度地提高性能?

我可以对列表进行预排序,我可以将它放入 SQL 表中,我可以将它放入文本文件中,但我不确定在我的应用程序中什么才是最有意义的。

这些方法或其他检索方法在性能方面是否有明显的赢家?

最佳答案

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Security.Cryptography;

namespace HashsetTest
{
abstract class HashLookupBase
{
protected const int BucketCount = 16;

private readonly HashAlgorithm _hasher;

protected HashLookupBase()
{
_hasher = SHA256.Create();
}

public abstract void AddHash(byte[] data);
public abstract bool Contains(byte[] data);

private byte[] ComputeHash(byte[] data)
{
return _hasher.ComputeHash(data);
}

protected Data256Bit GetHashObject(byte[] data)
{
var hash = ComputeHash(data);
return Data256Bit.FromBytes(hash);
}

public virtual void CompleteAdding() { }
}

class HashsetHashLookup : HashLookupBase
{
private readonly HashSet<Data256Bit>[] _hashSets;

public HashsetHashLookup()
{
_hashSets = new HashSet<Data256Bit>[BucketCount];

for(int i = 0; i < _hashSets.Length; i++)
_hashSets[i] = new HashSet<Data256Bit>();
}

public override void AddHash(byte[] data)
{
var item = GetHashObject(data);
var offset = item.GetHashCode() & 0xF;
_hashSets[offset].Add(item);
}

public override bool Contains(byte[] data)
{
var target = GetHashObject(data);
var offset = target.GetHashCode() & 0xF;
return _hashSets[offset].Contains(target);
}
}

class ArrayHashLookup : HashLookupBase
{
private Data256Bit[][] _objects;
private int[] _offsets;
private int _bucketCounter;

public ArrayHashLookup(int size)
{
size /= BucketCount;
_objects = new Data256Bit[BucketCount][];
_offsets = new int[BucketCount];

for(var i = 0; i < BucketCount; i++) _objects[i] = new Data256Bit[size + 1];

_bucketCounter = 0;
}

public override void CompleteAdding()
{
for(int i = 0; i < BucketCount; i++) Array.Sort(_objects[i]);
}

public override void AddHash(byte[] data)
{
var hashObject = GetHashObject(data);
_objects[_bucketCounter][_offsets[_bucketCounter]++] = hashObject;
_bucketCounter++;
_bucketCounter %= BucketCount;
}

public override bool Contains(byte[] data)
{
var hashObject = GetHashObject(data);
return _objects.Any(o => Array.BinarySearch(o, hashObject) >= 0);
}
}

struct Data256Bit : IEquatable<Data256Bit>, IComparable<Data256Bit>
{
public bool Equals(Data256Bit other)
{
return _u1 == other._u1 && _u2 == other._u2 && _u3 == other._u3 && _u4 == other._u4;
}

public int CompareTo(Data256Bit other)
{
var rslt = _u1.CompareTo(other._u1); if (rslt != 0) return rslt;
rslt = _u2.CompareTo(other._u2); if (rslt != 0) return rslt;
rslt = _u3.CompareTo(other._u3); if (rslt != 0) return rslt;

return _u4.CompareTo(other._u4);
}

public override bool Equals(object obj)
{
if (ReferenceEquals(null, obj))
return false;
return obj is Data256Bit && Equals((Data256Bit) obj);
}

public override int GetHashCode()
{
unchecked
{
var hashCode = _u1.GetHashCode();
hashCode = (hashCode * 397) ^ _u2.GetHashCode();
hashCode = (hashCode * 397) ^ _u3.GetHashCode();
hashCode = (hashCode * 397) ^ _u4.GetHashCode();
return hashCode;
}
}

public static bool operator ==(Data256Bit left, Data256Bit right)
{
return left.Equals(right);
}

public static bool operator !=(Data256Bit left, Data256Bit right)
{
return !left.Equals(right);
}

private readonly long _u1;
private readonly long _u2;
private readonly long _u3;
private readonly long _u4;

private Data256Bit(long u1, long u2, long u3, long u4)
{
_u1 = u1;
_u2 = u2;
_u3 = u3;
_u4 = u4;
}

public static Data256Bit FromBytes(byte[] data)
{
return new Data256Bit(
BitConverter.ToInt64(data, 0),
BitConverter.ToInt64(data, 8),
BitConverter.ToInt64(data, 16),
BitConverter.ToInt64(data, 24)
);
}
}

class Program
{
private const int TestSize = 150000000;

static void Main(string[] args)
{
GC.Collect(3);
GC.WaitForPendingFinalizers();

{
var arrayHashLookup = new ArrayHashLookup(TestSize);
PerformBenchmark(arrayHashLookup, TestSize);
}

GC.Collect(3);
GC.WaitForPendingFinalizers();

{
var hashsetHashLookup = new HashsetHashLookup();
PerformBenchmark(hashsetHashLookup, TestSize);
}

Console.ReadLine();
}

private static void PerformBenchmark(HashLookupBase hashClass, int size)
{
var sw = Stopwatch.StartNew();

for (int i = 0; i < size; i++)
hashClass.AddHash(BitConverter.GetBytes(i * 2));

Console.WriteLine("Hashing and addition took " + sw.ElapsedMilliseconds + "ms");

sw.Restart();
hashClass.CompleteAdding();
Console.WriteLine("Hash cleanup (sorting, usually) took " + sw.ElapsedMilliseconds + "ms");

sw.Restart();
var found = 0;

for (int i = 0; i < size * 2; i += 10)
{
found += hashClass.Contains(BitConverter.GetBytes(i)) ? 1 : 0;
}

Console.WriteLine("Found " + found + " elements (expected " + (size / 5) + ") in " + sw.ElapsedMilliseconds + "ms");
}
}
}

结果非常有希望。他们运行单线程。哈希集版本可以在 7.9GB RAM 使用率下达到每秒超​​过 100 万次查找。基于数组的版本使用较少的 RAM (4.6GB)。两者之间的启动时间几乎相同(388 对 391 秒)。哈希集以 RAM 换取查找性能。由于内存分配限制,两者都必须分桶。

Array performance:

Hashing and addition took 307408ms

Hash cleanup (sorting, usually) took 81892ms

Found 30000000 elements (expected 30000000) in 562585ms [53k searches per second]

======================================

Hashset performance:

Hashing and addition took 391105ms

Hash cleanup (sorting, usually) took 0ms

Found 30000000 elements (expected 30000000) in 74864ms [400k searches per second]

关于c# - 对于非常长的字符串列表,什么是合适的搜索/检索方法?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/22389931/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com