gpt4 book ai didi

c++ - SSE/AVX 对齐内存上的 valarray

转载 作者:塔克拉玛干 更新时间:2023-11-03 00:45:02 32 4
gpt4 key购买 nike

有没有办法确保 valarray 使用对齐内存,以便它可以用 SSE 和 AVX 进行矢量化?据我所知,STL 不保证对齐,而且无法将分配器传递给 valarray。还有其他方法可以实现吗?

提前致谢!

最佳答案

我通常使用 std::vector 并搭配我自己的分配器,该分配器将对齐值作为模板参数,并调用 _mm_malloc() 或 _aligned_malloc()。这非常有效,也适用于 AVX(32 字节对齐)。适当编写的模板化用户代码会自动选择所需的对齐方式。

下面是 AlignmentAllocator<> 及其辅助类的代码,已在 gcc 和 icpc 下测试。

/// Raw-storage helper that hands out blocks aligned to `alignment` bytes.
///
/// Storage is obtained with _mm_malloc/_mm_free when __GNUC__ or
/// __INTEL_COMPILER is defined, and with _aligned_malloc/_aligned_free
/// otherwise.
template<std::size_t alignment>
struct static_allocator {
  /// Upper bound on the byte count accepted by allocate().
  static std::size_t max_size ()
  {
    return std::numeric_limits<std::size_t>::max();
  }

  /// Obtain `n` bytes of aligned storage.
  /// @param n  number of bytes requested
  /// @return   a null pointer when `n` is zero, otherwise the new block
  /// @throws   std::bad_alloc if `n` exceeds max_size() or the underlying
  ///           allocation routine returns a null pointer
  static void* allocate(std::size_t n)
  {
    if (n == 0)
      return nullptr;
    if (max_size() < n)
      throw std::bad_alloc();
#if defined(__GNUC__) || defined (__INTEL_COMPILER)
    void* const block = _mm_malloc(n, alignment);
#else
    void* const block = _aligned_malloc(n, alignment);
#endif
    if (block == nullptr)
      throw std::bad_alloc();
    return block;
  }

  /// Release a block through the routine matching the one used by allocate().
  /// @param p  pointer to hand back
  static void deallocate(void* p)
  {
#if defined(__GNUC__) || defined (__INTEL_COMPILER)
    _mm_free(p);
#else
    _aligned_free(p);
#endif
  }
};

/// Specialization for alignment 1: plain byte storage obtained with new[]
/// and released with delete[], with no extra alignment requested.
template<>
struct static_allocator<1> {
  /// Obtain `n` bytes of storage.
  /// @param n  number of bytes requested
  /// @return   a null pointer when `n` is zero, otherwise a new char array
  static void* allocate(std::size_t n)
  {
    return n == 0 ? nullptr : static_cast<void*>(new char[n]);
  }

  /// Release storage previously returned by allocate().
  /// @param p  pointer to hand back; treated as the start of a char array
  static void deallocate(void* p)
  {
    char* const bytes = static_cast<char*>(p);
    delete[] bytes;
  }

  /// Upper bound on the byte count that can be requested.
  static std::size_t max_size () noexcept
  {
    return std::numeric_limits<std::size_t>::max();
  }
};

// Alignment 0 is only declared here, not defined, so static_allocator<0>
// remains an incomplete type in this section and its members cannot be used.
// NOTE(review): presumably intended to reject a zero alignment at compile
// time -- confirm no definition exists elsewhere.
template<> struct static_allocator<0>;

/// Allocator with explicit alignment.
///
/// Stateless allocator that obtains storage for objects of type _Tp from
/// static_allocator<alignment>. All instances compare equal, so storage
/// allocated through one instance may be released through any other.
///
/// @tparam _Tp        element type
/// @tparam alignment  alignment forwarded to static_allocator (default 16)
template<typename _Tp, std::size_t alignment = 16>
class AlignmentAllocator
{
  typedef static_allocator<alignment> static_alloc;
public:
  typedef size_t size_type;
  typedef ptrdiff_t difference_type;
  typedef _Tp* pointer;
  typedef const _Tp* const_pointer;
  typedef _Tp& reference;
  typedef const _Tp& const_reference;
  typedef _Tp value_type;

  /// Maps this allocator to the one for _Tp1 with the same alignment.
  template <typename _Tp1>
  struct rebind
  { typedef AlignmentAllocator<_Tp1, alignment> other; };

  AlignmentAllocator() {}

  AlignmentAllocator(const AlignmentAllocator&) {}

  /// Converting constructor from an allocator for another element type;
  /// there is no state to copy.
  template <typename _Tp1>
  AlignmentAllocator(const AlignmentAllocator<_Tp1, alignment> &) {}

  ~AlignmentAllocator() {}

  /// Address of x, obtained without going through an overloaded operator&.
  pointer address (reference x) const
  {
#if __cplusplus >= 201103L
    return std::addressof(x);
#else
    return reinterpret_cast<_Tp*>(&reinterpret_cast<char&>(x));
#endif
  }

  /// Address of x, obtained without going through an overloaded operator&.
  const_pointer address (const_reference x) const
  {
#if __cplusplus >= 201103L
    return std::addressof(x);
#else
    return reinterpret_cast<const _Tp*>(&reinterpret_cast<const char&>(x));
#endif
  }

  /// Allocate uninitialised storage for n objects of value_type.
  /// @param n  number of objects; the second (hint) argument is ignored
  /// @return   pointer to the storage as returned by static_alloc::allocate
  /// @throws   std::bad_alloc if n exceeds max_size(), or if the underlying
  ///           static_alloc::allocate throws
  pointer allocate (size_type n, const void* = 0)
  {
    // n * sizeof(value_type) wraps around for n > max_size() and would
    // silently request a block that is too small, so reject it up front.
    if (n > max_size())
      throw std::bad_alloc();
    return static_cast<pointer>(static_alloc::allocate(n * sizeof(value_type)));
  }

  /// Release storage previously obtained from allocate(); the element count
  /// is not needed by the underlying helper and is ignored.
  void deallocate (pointer p, size_type)
  { static_alloc::deallocate(p); }

  /// Largest number of objects whose total byte size still fits the limit
  /// reported by static_alloc::max_size().
  size_type max_size () const
  { return static_alloc::max_size() / sizeof (value_type); }

#if __cplusplus >= 201103L

  /// Construct a _Up at p by placement-new, forwarding args to its constructor.
  template<typename _Up, typename... _Args>
  void construct(_Up* p, _Args&&... args)
  { ::new(static_cast<void*>(p)) _Up(std::forward<_Args>(args)...); }

  /// Run the destructor of the object at p without releasing its storage.
  template<typename _Up>
  void destroy(_Up* p)
  { p->~_Up(); }

#else

  /// Copy-construct a value_type at p by placement-new.
  void construct (pointer p, const_reference val)
  { ::new(static_cast<void*>(p)) value_type(val); }

  /// Run the destructor of the object at p without releasing its storage.
  void destroy (pointer p)
  { p->~value_type (); }

#endif

  /// Always false: the allocator is stateless, so instances never differ.
  bool operator!=(const AlignmentAllocator&) const
  { return false; }

  // Returns true if and only if storage allocated from *this
  // can be deallocated from other, and vice versa.
  // Always returns true for stateless allocators.
  bool operator==(const AlignmentAllocator&) const
  { return true; }

};// class AlignmentAllocator<>

/// Partial specialization of AlignmentAllocator for void.
///
/// Provides only the member typedefs and rebind; it declares no allocation
/// or construction members.
template<std::size_t alignment>
class AlignmentAllocator<void, alignment>
{
public:
  typedef void value_type;
  typedef void* pointer;
  typedef const void* const_pointer;
  typedef size_t size_type;
  typedef ptrdiff_t difference_type;

  /// Maps this allocator to the one for _Up with the same alignment.
  template<typename _Up>
  struct rebind
  {
    typedef AlignmentAllocator<_Up, alignment> other;
  };
};

关于c++ - SSE/AVX 对齐内存上的 valarray,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/13711649/

32 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com