gpt4 book ai didi

c - C 中最快的去交织操作?

转载 作者:太空狗 更新时间:2023-10-29 16:43:59 25 4
gpt4 key购买 nike

我有一个指向字节数组mixed 的指针,它包含两个不同数组array1array2 的交错字节。假设 mixed 看起来像这样:

a1b2c3d4...

我需要做的是对字节进行去交错处理,以便得到 array1 = abcd...array2 = 1234...。提前知道了mixed的长度,array1array2的长度是等价的,都等于mixed/2

这是我当前的实现(array1array2 已经分配):

int i, j;
int mixedLength_2 = mixedLength / 2;
for (i = 0, j = 0; i < mixedLength_2; i++, j += 2)
{
array1[i] = mixed[j];
array2[i] = mixed[j+1];
}

这避免了任何昂贵的乘法或除法运算,但运行速度仍然不够快。我希望有像 memcpy 这样的东西,它采用一个索引器,可以使用低级 block 复制操作来加速这个过程。有没有比我目前拥有的更快的实现方式?

编辑

目标平台是适用于 iOS 和 Mac 的 Objective-C。 iOS 设备的快速操作更为重要,因此专门针对 iOS 的解决方案总比没有好。

更新

感谢大家的回复,尤其是 Stephen Canon、Graham Lee 和 Mecki。这是我的“主”函数,它使用 Stephen 的 NEON 内在函数(如果可用),否则使用 Graham 的 union 游标,按照 Mecki 的建议减少迭代次数。

void interleave(const uint8_t *srcA, const uint8_t *srcB, uint8_t *dstAB, size_t dstABLength)
{
#if defined __ARM_NEON__
// attempt to use NEON intrinsics

// iterate 32-bytes at a time
div_t dstABLength_32 = div(dstABLength, 32);
if (dstABLength_32.rem == 0)
{
while (dstABLength_32.quot --> 0)
{
const uint8x16_t a = vld1q_u8(srcA);
const uint8x16_t b = vld1q_u8(srcB);
const uint8x16x2_t ab = { a, b };
vst2q_u8(dstAB, ab);
srcA += 16;
srcB += 16;
dstAB += 32;
}
return;
}

// iterate 16-bytes at a time
div_t dstABLength_16 = div(dstABLength, 16);
if (dstABLength_16.rem == 0)
{
while (dstABLength_16.quot --> 0)
{
const uint8x8_t a = vld1_u8(srcA);
const uint8x8_t b = vld1_u8(srcB);
const uint8x8x2_t ab = { a, b };
vst2_u8(dstAB, ab);
srcA += 8;
srcB += 8;
dstAB += 16;
}
return;
}
#endif

// if the bytes were not aligned properly
// or NEON is unavailable, fall back to
// an optimized iteration

// iterate 8-bytes at a time
div_t dstABLength_8 = div(dstABLength, 8);
if (dstABLength_8.rem == 0)
{
typedef union
{
uint64_t wide;
struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; uint8_t a3; uint8_t b3; uint8_t a4; uint8_t b4; } narrow;
} ab8x8_t;

uint64_t *dstAB64 = (uint64_t *)dstAB;
int j = 0;
for (int i = 0; i < dstABLength_8.quot; i++)
{
ab8x8_t cursor;
cursor.narrow.a1 = srcA[j ];
cursor.narrow.b1 = srcB[j++];
cursor.narrow.a2 = srcA[j ];
cursor.narrow.b2 = srcB[j++];
cursor.narrow.a3 = srcA[j ];
cursor.narrow.b3 = srcB[j++];
cursor.narrow.a4 = srcA[j ];
cursor.narrow.b4 = srcB[j++];
dstAB64[i] = cursor.wide;
}
return;
}

// iterate 4-bytes at a time
div_t dstABLength_4 = div(dstABLength, 4);
if (dstABLength_4.rem == 0)
{
typedef union
{
uint32_t wide;
struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; } narrow;
} ab8x4_t;

uint32_t *dstAB32 = (uint32_t *)dstAB;
int j = 0;
for (int i = 0; i < dstABLength_4.quot; i++)
{
ab8x4_t cursor;
cursor.narrow.a1 = srcA[j ];
cursor.narrow.b1 = srcB[j++];
cursor.narrow.a2 = srcA[j ];
cursor.narrow.b2 = srcB[j++];
dstAB32[i] = cursor.wide;
}
return;
}

// iterate 2-bytes at a time
div_t dstABLength_2 = div(dstABLength, 2);
typedef union
{
uint16_t wide;
struct { uint8_t a; uint8_t b; } narrow;
} ab8x2_t;

uint16_t *dstAB16 = (uint16_t *)dstAB;
for (int i = 0; i < dstABLength_2.quot; i++)
{
ab8x2_t cursor;
cursor.narrow.a = srcA[i];
cursor.narrow.b = srcB[i];
dstAB16[i] = cursor.wide;
}
}

void deinterleave(const uint8_t *srcAB, uint8_t *dstA, uint8_t *dstB, size_t srcABLength)
{
#if defined __ARM_NEON__
// attempt to use NEON intrinsics

// iterate 32-bytes at a time
div_t srcABLength_32 = div(srcABLength, 32);
if (srcABLength_32.rem == 0)
{
while (srcABLength_32.quot --> 0)
{
const uint8x16x2_t ab = vld2q_u8(srcAB);
vst1q_u8(dstA, ab.val[0]);
vst1q_u8(dstB, ab.val[1]);
srcAB += 32;
dstA += 16;
dstB += 16;
}
return;
}

// iterate 16-bytes at a time
div_t srcABLength_16 = div(srcABLength, 16);
if (srcABLength_16.rem == 0)
{
while (srcABLength_16.quot --> 0)
{
const uint8x8x2_t ab = vld2_u8(srcAB);
vst1_u8(dstA, ab.val[0]);
vst1_u8(dstB, ab.val[1]);
srcAB += 16;
dstA += 8;
dstB += 8;
}
return;
}
#endif

// if the bytes were not aligned properly
// or NEON is unavailable, fall back to
// an optimized iteration

// iterate 8-bytes at a time
div_t srcABLength_8 = div(srcABLength, 8);
if (srcABLength_8.rem == 0)
{
typedef union
{
uint64_t wide;
struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; uint8_t a3; uint8_t b3; uint8_t a4; uint8_t b4; } narrow;
} ab8x8_t;

uint64_t *srcAB64 = (uint64_t *)srcAB;
int j = 0;
for (int i = 0; i < srcABLength_8.quot; i++)
{
ab8x8_t cursor;
cursor.wide = srcAB64[i];
dstA[j ] = cursor.narrow.a1;
dstB[j++] = cursor.narrow.b1;
dstA[j ] = cursor.narrow.a2;
dstB[j++] = cursor.narrow.b2;
dstA[j ] = cursor.narrow.a3;
dstB[j++] = cursor.narrow.b3;
dstA[j ] = cursor.narrow.a4;
dstB[j++] = cursor.narrow.b4;
}
return;
}

// iterate 4-bytes at a time
div_t srcABLength_4 = div(srcABLength, 4);
if (srcABLength_4.rem == 0)
{
typedef union
{
uint32_t wide;
struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; } narrow;
} ab8x4_t;

uint32_t *srcAB32 = (uint32_t *)srcAB;
int j = 0;
for (int i = 0; i < srcABLength_4.quot; i++)
{
ab8x4_t cursor;
cursor.wide = srcAB32[i];
dstA[j ] = cursor.narrow.a1;
dstB[j++] = cursor.narrow.b1;
dstA[j ] = cursor.narrow.a2;
dstB[j++] = cursor.narrow.b2;
}
return;
}

// iterate 2-bytes at a time
div_t srcABLength_2 = div(srcABLength, 2);
typedef union
{
uint16_t wide;
struct { uint8_t a; uint8_t b; } narrow;
} ab8x2_t;

uint16_t *srcAB16 = (uint16_t *)srcAB;
for (int i = 0; i < srcABLength_2.quot; i++)
{
ab8x2_t cursor;
cursor.wide = srcAB16[i];
dstA[i] = cursor.narrow.a;
dstB[i] = cursor.narrow.b;
}
}

最佳答案

在我的脑海中,我不知道用于解交织 2 channel 字节数据的库函数。但是,值得向 Apple 提交错误报告以请求此类功能。

与此同时,使用 NEON 或 SSE 内在函数对此类函数进行矢量化非常容易。具体来说,在 ARM 上,您需要使用 vld1q_u8 从每个源数组加载一个 vector ,使用 vuzpq_u8 对它们进行解交错,使用 vst1q_u8存储结果 vector ;这是我尚未测试甚至未尝试构建的粗略草图,但它应该可以说明总体思路。更复杂的实现绝对是可能的(特别是,NEON 可以在一条指令中加载/存储两个 16B 寄存器,编译器可能不会这样做,并且可能需要一些流水线和/或展开有益的取决于你的缓冲区有多长):

#if defined __ARM_NEON__
# include <arm_neon.h>
#endif
#include <stdint.h>
#include <stddef.h>

void deinterleave(uint8_t *mixed, uint8_t *array1, uint8_t *array2, size_t mixedLength) {
#if defined __ARM_NEON__
size_t vectors = mixedLength / 32;
mixedLength %= 32;
while (vectors --> 0) {
const uint8x16_t src0 = vld1q_u8(mixed);
const uint8x16_t src1 = vld1q_u8(mixed + 16);
const uint8x16x2_t dst = vuzpq_u8(src0, src1);
vst1q_u8(array1, dst.val[0]);
vst1q_u8(array2, dst.val[1]);
mixed += 32;
array1 += 16;
array2 += 16;
}
#endif
for (size_t i=0; i<mixedLength/2; ++i) {
array1[i] = mixed[2*i];
array2[i] = mixed[2*i + 1];
}
}

关于c - C 中最快的去交织操作?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/14567786/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com