gpt4 book ai didi

javascript - 用于计算大基数的 LogLog 和 HyperLogLog 算法

转载 作者:IT王子 更新时间:2023-10-29 02:55:55 25 4
gpt4 key购买 nike

我在哪里可以找到 LogLog algorithm 的有效实现? ?曾尝试自己实现,但我的实现草案产生了奇怪的结果。

Here它是:

function LogLog(max_error, max_count)
{
function log2(x)
{
return Math.log(x) / Math.LN2;
}

var m = 1.30 / max_error;
var k = Math.ceil(log2(m * m));
m = Math.pow(2, k);

var k_comp = 32 - k;

var l = log2(log2(max_count / m));
if (isNaN(l)) l = 1; else l = Math.ceil(l);
var l_mask = ((1 << l) - 1) >>> 0;

var M = [];
for (var i = 0; i < m; ++i) M[i] = 0;

function count(hash)
{
if (hash !== undefined)
{
var j = hash >>> k_comp;

var rank = 0;
for (var i = 0; i < k_comp; ++i)
{
if ((hash >>> i) & 1)
{
rank = i + 1;
break;
}
}

M[j] = Math.max(M[j], rank & l_mask);
}
else
{
var c = 0;
for (var i = 0; i < m; ++i) c += M[i];
return 0.79402 * m * Math.pow(2, c / m);
}
}

return {count: count};
}

function fnv1a(text)
{
var hash = 2166136261;
for (var i = 0; i < text.length; ++i)
{
hash ^= text.charCodeAt(i);
hash += (hash << 1) + (hash << 4) + (hash << 7) +
(hash << 8) + (hash << 24);
}
return hash >>> 0;
}

var words = ['aardvark', 'abyssinian', ... ,'zoology']; // about 2 300 words

var log_log = LogLog(0.01, 100000);
for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]));
alert(log_log.count());

由于未知原因,实现对max_error参数非常敏感,它是决定结果大小的主要因素。我敢肯定,有一些愚蠢的错误:)

更新:此问题已在 newer version 中解决的算法。我稍后会发布它的实现。

最佳答案

Here它是基于newer paper的算法的更新版本:

var pow_2_32 = 0xFFFFFFFF + 1;

function HyperLogLog(std_error)
{
function log2(x)
{
return Math.log(x) / Math.LN2;
}

function rank(hash, max)
{
var r = 1;
while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; }
return r;
}

var m = 1.04 / std_error;
var k = Math.ceil(log2(m * m)), k_comp = 32 - k;
m = Math.pow(2, k);

var alpha_m = m == 16 ? 0.673
: m == 32 ? 0.697
: m == 64 ? 0.709
: 0.7213 / (1 + 1.079 / m);

var M = []; for (var i = 0; i < m; ++i) M[i] = 0;

function count(hash)
{
if (hash !== undefined)
{
var j = hash >>> k_comp;
M[j] = Math.max(M[j], rank(hash, k_comp));
}
else
{
var c = 0.0;
for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]);
var E = alpha_m * m * m / c;

// -- make corrections

if (E <= 5/2 * m)
{
var V = 0;
for (var i = 0; i < m; ++i) if (M[i] == 0) ++V;
if (V > 0) E = m * Math.log(m / V);
}
else if (E > 1/30 * pow_2_32)
E = -pow_2_32 * Math.log(1 - E / pow_2_32);

// --

return E;
}
}

return {count: count};
}

function fnv1a(text)
{
var hash = 2166136261;
for (var i = 0; i < text.length; ++i)
{
hash ^= text.charCodeAt(i);
hash += (hash << 1) + (hash << 4) + (hash << 7) +
(hash << 8) + (hash << 24);
}
return hash >>> 0;
}

var words = ['aardvark', 'abyssinian', ..., 'zoology']; // 2336 words

var seed = Math.floor(Math.random() * pow_2_32); // make more fun

var log_log = HyperLogLog(0.065);
for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]) ^ seed);
var count = log_log.count();
alert(count + ', error ' +
(count - words.length) / (words.length / 100.0) + '%');

关于javascript - 用于计算大基数的 LogLog 和 HyperLogLog 算法,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/5990713/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com