c - ARM 和 Intel 内部函数是否对 AES 使用相同的子 key 计划？-6ren

c - ARM 和 Intel 内部函数是否对 AES 使用相同的子 key 计划？

转载作者：行者123 更新时间：2023-12-04 12:34:42

我正在尝试使用内部函数在 ARMv8 上插入 AES 实现。我有一个 C++ 实现，我有一个 Intel 内部函数实现。

这些实现应该是等效的，所以我尝试使用 Intel 作为 ARMv8 的蓝图。有一些差异，但它们已被考虑在内。问题是，我得到了不同的结果。

// Encrypt one 16-byte block from `in` into `out` with caller-supplied round
// keys. The ARMv8 Crypto branch is compiled when __ARM_FEATURE_CRYPTO is
// defined, the Intel AES-NI branch when __AES__ is defined; if neither macro
// is defined the body is empty and `out` is not written.
//
// NOTE(review): the two branches do not consume the same keys. The Intel
// branch reads rdkeys[0..rounds]. The ARM branch loops `rounds` times over
// rdkeys[0..rounds-1] and then reads rdkeys[rounds] and rdkeys[rounds+1],
// which is one AES round more than the Intel branch and one element past the
// end of a key array that holds rounds+1 entries.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
#if defined(__ARM_FEATURE_CRYPTO)

    // Load the 16-byte input block
    uint8x16_t data = vld1q_u8(in);

    // AES encryption with ARM intrinsics:
    // rnds-1 (9 for AES128) cycles of AES:
    // (Add, Shift, Sub) plus Mix Columns
    // NOTE(review): the bound below is `rounds`, not `rounds-1` as the
    // comment above describes.
    unsigned int i;
    for (i=0; i<rounds; ++i)
    {
        // AES single round encryption
        data = vaeseq_u8(data, rdkeys[i]);
        // AES mix columns
        data = vaesmcq_u8(data);
    }
    // One round of encryption: AES, no Mix Columns
    // (uses rdkeys[rounds]; the post-increment moves i to rounds+1)
    data = vaeseq_u8(data, rdkeys[i++]);
    // Final Add (bitwise Xor) with rdkeys[rounds+1]
    data = veorq_u8(data, rdkeys[i]);
    // Store the 16-byte result
    vst1q_u8(out, data);

#elif defined(__AES__)

    // Load the input block (unaligned load) and XOR it with rdkeys[0]
    __m128i data = _mm_loadu_si128((const __m128i*)in);
    data = _mm_xor_si128(data, rdkeys[0]);
    // aesenc with rdkeys[1..rounds-2]
    for (unsigned int i=1; i<rounds-1; ++i)
    {
        data = _mm_aesenc_si128(data, rdkeys[i]);
    }
    // aesenc with rdkeys[rounds-1], then aesenclast with rdkeys[rounds]
    data = _mm_aesenc_si128(data, rdkeys[rounds-1]);
    data = _mm_aesenclast_si128(data, rdkeys[rounds]);
    // Store the result (unaligned store)
    _mm_storeu_si128((__m128i*)out, data);

#endif
}

在这一点上，我正试图避开子 key 计算。我对这两种实现使用了相同的一组轮 key :

// Common type names for both back ends: RoundKey is the vector type the
// selected intrinsics operate on (uint8x16_t on ARM, __m128i on Intel) and
// Byte is an unsigned 8-bit value. Neither name is defined when neither
// feature macro is set.
#if defined(__ARM_FEATURE_CRYPTO)
typedef uint8x16_t RoundKey;
typedef uint8_t Byte;
#elif defined(__AES__)
typedef __m128i RoundKey;
typedef uint8_t Byte;
#endif

// Avoid subkey scheduling at this point: instead of deriving round keys from
// a cipher key, fill every byte of round key i with the value (i<<4)|i
// (0x00 for i=0, 0x11 for i=1, 0x22 for i=2, ...). The array holds ROUNDS+1
// keys, so valid indices are 0..ROUNDS.
RoundKey rdkeys[ROUNDS+1];
for (size_t i=0; i<COUNTOF(rdkeys); ++i)
    memset(&rdkeys[i], (i<<4)|i, sizeof(RoundKey));

但是，我得出了不同的结果。以下是转储产生的内容:

英特尔 AES-NI:

In: FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF
...
Key: 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
Data: 07 07 07 07 07 07 07 07 07 07 07 07 07 07 07 07
...
Key: 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
Data: 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33
Key: AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA
Data: 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69
...

Out: 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69

ARMv8 AES:

In: FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF
...
Key: 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
Data: C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5
...
Key: 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
Data: C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3
Key: AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA
Data: F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9
...
Out: F9 F9 F9 F9 F9 F9 F9 F9 B1 FF B9 F9 F9 F9 F9 F9

我一直在为结果绞尽脑汁。添加更多 printf 无助于识别问题。我开始认为 Intel 和 ARM 内在函数使用不同的子 key 计划。

ARM 和 Intel 内部函数是否对 AES 使用相同的子 key 计划？

下图来自 Cynthia Crutchfield 的论文。该论文考察了 Intel 内部函数与 ARM 内部函数之间的对应关系。

下面是完整的程序。还列出了构建它们的命令行。

英特尔:

g++ -Wall -maes aes-test.cxx -o aes-test.exe

ARMv8:

 g++ -Wall -march=armv8-a+crc+crypto -mtune=cortex-a53 aes-test.cxx -o aes-test.exe

程序:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

#if defined(__ARM_FEATURE_CRYPTO)
# include <arm_neon.h>
# include <arm_acle.h>
#elif defined(__AES__)
# include <wmmintrin.h>
# include <emmintrin.h>
#endif

// Common type names for both back ends: RoundKey is the vector type the
// selected intrinsics operate on (uint8x16_t on ARM, __m128i on Intel) and
// Byte is an unsigned 8-bit value. Neither name is defined when neither
// feature macro is set.
#if defined(__ARM_FEATURE_CRYPTO)
typedef uint8x16_t RoundKey;
typedef uint8_t Byte;
#elif defined(__AES__)
typedef __m128i RoundKey;
typedef uint8_t Byte;
#endif

// Number of elements in the array x. `(sizeof(x)[0])` parses as
// sizeof((x)[0]), i.e. the size of one element. x must be an actual array;
// a pointer would give a meaningless result.
#define COUNTOF(x) (sizeof(x)/(sizeof(x)[0]))

// Round count handed to AES_encrypt by main(); the key array there holds
// ROUNDS+1 entries.
static const unsigned int ROUNDS=10;
// Single-block routines. AES_encrypt is defined later in this listing;
// no definition of AES_decrypt appears in this listing.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds);
void AES_decrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds);

void Print(const char* label, const Byte *in, size_t len, bool lf=false)
{
    if (label)
        printf("%s: ", label);

    for (size_t i=0; in && i<len; ++i)
        printf("%02X ", in[i]);    
    printf("\n");

    if (lf)
        printf("\n");
}

// Test driver: encrypts a single all-0xFF block with a synthetic set of round
// keys so the ARM and Intel code paths can be compared without a real key
// schedule. AES_encrypt prints the intermediate state itself.
int main(int argc, char* argv[])
{
    const Byte plain[16] = {
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF
    };
    Byte cipher[16], recover[16];

    // Avoid subkey scheduling at this point: every byte of round key k is
    // set to (k<<4)|k, giving 0x00, 0x11, 0x22, ... for successive keys.
    RoundKey rdkeys[ROUNDS+1];
    size_t k = 0;
    while (k < COUNTOF(rdkeys))
    {
        const int fill = (int)((k<<4)|k);
        memset(&rdkeys[k], fill, sizeof(RoundKey));
        ++k;
    }

    AES_encrypt(plain, cipher, rdkeys, ROUNDS);

    return 0;
}

// Instrumented single-block encryption: encrypts the 16 bytes at `in` into
// `out` with the supplied round keys and prints the key and state after each
// step via Print(). The ARMv8 Crypto branch is compiled when
// __ARM_FEATURE_CRYPTO is defined, the Intel AES-NI branch when __AES__ is
// defined; with neither macro only the "In" and "Out" lines are printed and
// `out` is never written.
//
// NOTE(review): the two branches do not consume the same keys. The Intel
// branch reads rdkeys[0..rounds]. The ARM branch loops `rounds` times over
// rdkeys[0..rounds-1] and then reads rdkeys[rounds] and rdkeys[rounds+1].
// main() passes an array of ROUNDS+1 keys with rounds == ROUNDS, so the last
// ARM read is one element past the end of that array.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
    Print("In", in, 16);

#if defined(__ARM_FEATURE_CRYPTO)

    // Load the block
    uint8x16_t data = vld1q_u8(in);

    Print("Data (in)", (Byte*)&data, 16, true);

    // AES encryption with ARM intrinsics:
    // rnds-1 (9 for AES128) cycles of AES:
    // (Add, Shift, Sub) plus Mix Columns
    // NOTE(review): the bound below is `rounds`, not `rounds-1` as the
    // comment above describes.
    unsigned int i;
    for (i=0; i<rounds; ++i)
    {
        // AES single round encryption
        data = vaeseq_u8(data, rdkeys[i]);
        // AES mix columns
        data = vaesmcq_u8(data);

        // Dump the key just used and the state after this iteration
        Print("Key", (Byte*)&rdkeys[i], 16);
        Print("Data", (Byte*)&data, 16, true);
    }

    // i == rounds here; this key is printed before it is applied
    Print("Key", (Byte*)&rdkeys[i], 16);

    // One round of encryption: AES, no Mix Columns
    // (the post-increment moves i to rounds+1)
    data = vaeseq_u8(data, rdkeys[i++]);

    Print("Data", (Byte*)&data, 16, true);

    // Final Add (bitwise Xor) with rdkeys[rounds+1]; this key is not printed
    data = veorq_u8(data, rdkeys[i]);

    Print("Data (xor)", (Byte*)&data, 16);

    // Store the output data
    vst1q_u8(out, data);

#elif defined(__AES__)

    // Load the block (unaligned load)
    __m128i data = _mm_loadu_si128((const __m128i*)in);

    Print("Data (in)", (Byte*)&data, 16);

    // XOR the input with rdkeys[0]
    data = _mm_xor_si128(data, rdkeys[0]);

    Print("Key", (Byte*)&rdkeys[0], 16);
    Print("Data (xor)", (Byte*)&data, 16, true);

    // aesenc with rdkeys[1..rounds-2], dumping key and state each time
    for (unsigned int i=1; i<rounds-1; ++i)
    {
        data = _mm_aesenc_si128(data, rdkeys[i]);

        Print("Key", (Byte*)&rdkeys[i], 16);
        Print("Data", (Byte*)&data, 16, true);
    }
    // aesenc with rdkeys[rounds-1]
    data = _mm_aesenc_si128(data, rdkeys[rounds-1]);

    Print("Key", (Byte*)&rdkeys[rounds-1], 16);
    Print("Data", (Byte*)&data, 16, true);

    // aesenclast with rdkeys[rounds], the last key in the array
    data = _mm_aesenclast_si128(data, rdkeys[rounds]);

    Print("Key", (Byte*)&rdkeys[rounds], 16);
    Print("Data", (Byte*)&data, 16, true);

    // Store the output data (unaligned store)
    _mm_storeu_si128((__m128i*)out, data);

#endif

    Print("Out", out, 16);
}

最佳答案

Does ARM and Intel intrinsics use the same subkey schedule for AES?

看来答案是肯定的。我仍然需要针对真实的 key 调度进行测试，但我能够使用相同的 key 调度对 Intel 和 ARMv8 内部函数产生相同的结果。

看起来 Crutchfield 的参考实现中存在一个差一（off-by-one）错误：循环控制条件应该使用 rounds-1，而不是 rounds。这意味着我在 ARMv8 上实际测试的是 11 轮，而不是 10 轮。当 ARMv8 代码输出 F9 F9 F9 F9 F9 F9 F9 F9 B1 FF B9 F9 F9 F9 F9 F9 而不是 F9 F9 ... F9 F9 时，我就应该对此有所怀疑。

这是更新后的代码:

// Encrypt one 16-byte block from `in` into `out` using the round keys
// rdkeys[0..rounds]. Both back ends consume the same rounds+1 keys; they
// differ only in where the key XOR sits relative to the round instruction.
// With neither __ARM_FEATURE_CRYPTO nor __AES__ defined the body is empty
// and `out` is not written.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
#if defined(__ARM_FEATURE_CRYPTO)

    uint8x16_t block = vld1q_u8(in);

    // rounds-1 iterations of AESE (Add, Shift, Sub) followed by AESMC
    // (Mix Columns), using rdkeys[0..rounds-2].
    unsigned int r = 0;
    while (r < rounds-1)
    {
        block = vaesmcq_u8(vaeseq_u8(block, rdkeys[r]));
        ++r;
    }

    // Last AESE without Mix Columns, then the final key XOR.
    block = vaeseq_u8(block, rdkeys[r]);
    block = veorq_u8(block, rdkeys[r+1]);

    vst1q_u8(out, block);

#elif defined(__AES__)

    // Load the block and XOR it with rdkeys[0].
    __m128i block = _mm_xor_si128(_mm_loadu_si128((const __m128i*)in), rdkeys[0]);

    // aesenc with rdkeys[1..rounds-2].
    unsigned int r = 1;
    while (r < rounds-1)
    {
        block = _mm_aesenc_si128(block, rdkeys[r]);
        ++r;
    }

    // aesenc with the next key, then aesenclast with the one after it.
    block = _mm_aesenc_si128(block, rdkeys[r]);
    block = _mm_aesenclast_si128(block, rdkeys[r+1]);
    _mm_storeu_si128((__m128i*)out, block);

#endif
}

关于c - ARM 和 Intel 内部函数是否对 AES 使用相同的子 key 计划？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/45528569/

文章推荐： r - 使用 Rcpp Sugar 将均值和标准差传递给 dnorm()

文章推荐： python - 用 pybind11 包装 C++ 抽象类时出错

文章推荐： Firebase 托管重写为外部 url

文章推荐： r - ggplot2 binwidth 在 facet_wrap 直方图中没有响应

intel-pin - intel pin工具中图像的含义
我是Intel pin工具的新手，最近开始研究pin工具。在教程中，描述了pin工具的模式: Sometimes, however, it can be useful to look at diffe
intel-pin - intel pin工具中图像的含义
我是Intel pin工具的新手，最近开始研究pin工具。在教程中，描述了pin工具的模式: Sometimes, however, it can be useful to look at diffe
intel - 如何开始使用库 intel ipp？
我得到了这份工作:1。产生一个正弦信号。2。使用 FFT 构建其频谱。首先，我为 visual studio 2010 安装了 Intel Parallel Studio XE 2011。在 vs 2
opencl - intel-compute-runtime、intel-opencl-runtime 和 intel-opencl-sdk 之间有什么区别？
看起来 Intel 提供了许多 OpenCL 实现。 ArchWiki描述 OpenCL 实现。它说 beignet 和 intel-opencl 已弃用。那么，intel-compute-runti
intel - 如何读取 "Intel Intrinsics Guide"？
我正在尝试通过阅读 Intel Intrinsics Guide 来开始使用 AVX512 内在函数但到目前为止我发现它没有定义命名数据类型或用于解释的伪代码语法。没有这样的定义，所谓的指南对我起码没
intel - AMD 与 Intel 处理器制作可执行文件
关闭。这个问题是opinion-based 。目前不接受答案。想要改进这个问题吗？更新问题，以便 editing this post 可以用事实和引文来回答它。 . 已关闭 4 年前。 Improv
android-studio - "Intel Atom Image"、 "Google APIs Intel Atom image"和 "Google play Intel Atom Image"之间有什么区别？
在 Android SDK 管理器中，我可以看到 3 种类型的 Intel Atom 图像。有人可以解释“Intel Atom Image”、“Google APIs Intel Atom Image
intel-pin - 使用 intel pintool 记录所有指令
我写了这个 pintool: #include "pin.H" #include #include VOID Instruction(INS ins, VOID *v) { cou
intel - 了解 Intel Intrinsics Guide 中的代码示例
我正在尝试了解 _mm256_permute2f128_ps() 的作用，但无法完全理解 intel's code-example . DEFINE SELECT4(src1, src2, contr
intel - 使用 Intel 内在函数 SSSE3 的替代方案时性能下降
我正在开发一个性能关键应用程序，该应用程序必须移植到仅支持 MMX、SSE、SSE2 和 SSE3 的英特尔凌动处理器中。我以前的应用程序支持 SSSE3 和 AVX，现在我想将其降级为 Intel
intel-pin - Intel Pin 3.0无法识别MPX指令？
我有最新版本的 Intel Pin 3.0 版本 76887。我有一个支持 MPX 的玩具示例: #include int g[10]; int main(int argc, char **arg
intel - 在 Intel 上使用 OpenSolaris 研究 SPARC 可执行结构
我想研究和比较elf、SPARC和PA-RISC的可执行文件结构。为了进行研究，我想在 Intel 机器 (Core2Duo) 上安装 OpenSolaris。但我有一个基本的疑问，它会起作用吗？
intel-mkl - 无法使用 g++ 将数学库与 intel mkl 链接
我尝试使用 g++ 用 intel mkl 11.1 进行编译: g++ -m32 test.c -lmkl_intel -lmkl_intel_thread -lmkl_core -liomp5 -
c++ - 我如何使用 intel 编译器和 intel mpi 安装 boost？
我正在按照以下说明进行操作: https://software.intel.com/en-us/articles/building-boost-with-intel-c-compiler-150 Co
c++ - -masm=intel 标志不适用于使用 Intel 语法在 gcc 编译器中运行汇编语言
我正在尝试在我的 C 程序中使用内联汇编程序 __asm，使用 Intel 语法而不是 AT&T 语法。我正在使用 gcc -S -masm=intel test.c 进行编译但它给出了错误。下面是我
c++ - Intel HD GPU 与 Intel CPU 性能比较
我是 OpenCL 的新手，目前对其性能有一些疑问。我有 Intel(R) Core(TM) i5-4460 CPU @ 3.20GHz + ubuntu + Beignet(Intel 开源 op
Makefile:Intel fortran，文件夹中的源文件，和 Intel Math Kernel Library
我在/ex 文件夹中有一个 main.f90。 f77 子程序文件在/ex/src 中。子程序文件再次使用 BLAS 和 LAPACK 库。对于 BLAS 和 LAPACK，我必须使用英特尔数学核心函
c++ - 为什么此代码链接到 Intel Compiler 2015 而不是 Intel Compiler 2018？
我的团队最近从 2015 年英特尔编译器(并行工作室)升级到 2018 年版本，我们遇到了一个链接器问题，让每个人都焦头烂额。我有以下类(为简洁起见进行了适度编辑)，用于处理子进程的包装以及与它们对
intel - 为什么 Intel Haswell XEON CPU 偶尔会错误计算 FFT 和 ART？
在最后几天，我观察到我无法解释的新工作站的行为。对这个问题做一些研究，INTEL Haswell architecture 中可能存在一个可能的错误。以及在当前的 Skylake Generation
android-emulator - Intel HAXM 安装错误 - 此计算机不支持 Intel 虚拟化技术 (VT-x)
我的 HAXM 安装存在问题。事情是这样的。每次尝试为我的计算机安装 HAXM 时，我都会收到此错误: 问题是，我的计算机支持虚拟化技术(见下图)。知道如何解决这个问题吗？最佳答案只需执行以下步骤

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c - ARM 和 Intel 内部函数是否对 AES 使用相同的子 key 计划？