gpt4 book ai didi

postgresql - 如何在不首先连接所有内容的情况下逐步散列 postgresql 中的聚合结果

转载 作者:行者123 更新时间:2023-12-04 10:56:56 25 4
gpt4 key购买 nike

我想计算一组结果的哈希 (sha256),我知道该怎么做:

SELECT digest( string_agg(id_,':'), 'sha256') from mytable order by id_;

它有效,但它首先连接所有记录,最后计算哈希值。我希望它是增量的,以避免大型数据集(数百万行)可能出现的内存问题,例如:

SELECT digest_agg(id_, ':', 'sha256') from mytable order by id_;

确保哈希在途中逐行递增计算。

最佳答案

好的,我终于为此编写了一个 c 扩展。我将它张贴在这里以防它对某人有用。

/* ------- OS Includes ------------ */
#include <stdio.h>
#include <openssl/rsa.h>
#include <openssl/pem.h>
#include <openssl/err.h>

/* ----- PostgreSQL Includes -------*/
#include "postgres.h"
#include "fmgr.h"
#include "funcapi.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
PG_MODULE_MAGIC;

typedef struct sha256_state
{
SHA256_CTX* sha256;
bool has_data;
} SHA256_STATE;

/* --------- Prototypes ---------- */
void _PG_init(void);
void _PG_fini(void);
Datum dv_sha256_agg_sfunc(PG_FUNCTION_ARGS);
Datum dv_sha256_agg_final(PG_FUNCTION_ARGS);
SHA256_STATE* sha256_init(void);
void sha256_update(SHA256_STATE* state, char* string, int32 string_size);
void sha256_final(SHA256_STATE* state, char outputBuffer[65]);

/* ------ Version1 convention ---- */
PG_FUNCTION_INFO_V1(dv_sha256_agg_sfunc);
PG_FUNCTION_INFO_V1(dv_sha256_agg_final);

// -----------------------------------------------------------
// Implementations
// -----------------------------------------------------------

void _PG_init(void) {/* Do nothing */}
void _PG_fini(void) {/* Do nothing */}

Datum dv_sha256_agg_sfunc(PG_FUNCTION_ARGS) {

SHA256_STATE* state;
bool is_first = PG_ARGISNULL(0);
text* string;
int32 string_size;
char* data;
char SEP[] = {'\n'}; // Hardcoded separator (can be improved)

if (is_first) {
// First iteration: Create the state
state = sha256_init();
}
else {
// Next iterations: Restore the state
state = (SHA256_STATE*) PG_GETARG_POINTER(0);
}

if (PG_ARGISNULL(1)) {
PG_RETURN_POINTER(state);
}

// Get current chunk
string = PG_GETARG_TEXT_PP(1);
string_size = VARSIZE_ANY_EXHDR(string);
data = VARDATA_ANY(string);

// Add separator
if (state->has_data) {
sha256_update(state, SEP, 1);
}

// Update state
sha256_update(state, data, string_size);
state->has_data = true;

// Updated state
PG_RETURN_POINTER(state);

}

Datum dv_sha256_agg_final(PG_FUNCTION_ARGS) {
SHA256_STATE* state;
char out[65];
text* hash = (text*) palloc(65 + VARHDRSZ);
if (PG_ARGISNULL(0)) {
PG_RETURN_NULL();
}
else {
state = (SHA256_STATE*) PG_GETARG_POINTER(0);
sha256_final(state, out);
SET_VARSIZE(hash, 65 + VARHDRSZ);
memcpy(VARDATA(hash), out, 65);
PG_RETURN_TEXT_P(hash);
}
}

SHA256_STATE* sha256_init() {
SHA256_STATE* state = (SHA256_STATE*) palloc(sizeof(SHA256_STATE));
state->sha256 = (SHA256_CTX*) palloc(sizeof(SHA256_CTX));
SHA256_Init(state->sha256);
state->has_data = false;
return state;
}

void sha256_update(SHA256_STATE* state, char* string, int32 string_size) {
SHA256_Update(state->sha256, string, string_size);
}

void sha256_final(SHA256_STATE* state, char outputBuffer[65]) {
int i;
unsigned char hash[SHA256_DIGEST_LENGTH];
SHA256_Final(hash, state->sha256);
for(i = 0; i < SHA256_DIGEST_LENGTH; i++) {
sprintf(outputBuffer + (i * 2), "%02x", hash[i]);
}
outputBuffer[64] = 0;
}

聚合定义:

CREATE FUNCTION dv_sha256_agg_sfunc(state internal, input text)
RETURNS internal AS 'MODULE_PATHNAME',
'dv_sha256_agg_sfunc'
LANGUAGE C VOLATILE;

CREATE FUNCTION dv_sha256_agg_final(state internal)
RETURNS text AS 'MODULE_PATHNAME',
'dv_sha256_agg_final'
LANGUAGE C VOLATILE;

CREATE AGGREGATE dv_sha256_agg(input text) (
SFUNC = dv_sha256_agg_sfunc,
STYPE = internal,
FINALFUNC = dv_sha256_agg_final
);

测试:

select 
dv_sha256_agg(id_::text),
encode(digest(string_agg(id_::text,E'\n'),'sha256'),'hex')
from
generate_series(1,100) id_(id_);

结果

                          dv_sha256_agg                           |                              encode                              
------------------------------------------------------------------+------------------------------------------------------------------
4187fe63fa78d8b4333e6ffc9122e0273ddf90251ced32e1e5b398639c193c87 | 4187fe63fa78d8b4333e6ffc9122e0273ddf90251ced32e1e5b398639c193c87
(1 row)

注意事项:

  • 分隔符硬编码为\n
  • 忽略空值

关于postgresql - 如何在不首先连接所有内容的情况下逐步散列 postgresql 中的聚合结果,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/59109684/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com