gpt4 book ai didi

c++ - 为什么我的自定义分配器比默认分配器慢

转载 作者:行者123 更新时间:2023-11-30 03:26:41 28 4
gpt4 key购买 nike

我是 C++ 分配器的新手,花了一整天时间尝试构建自己的分配器。我以 A. Alexandrescu 的 Loki 分配器为基础,并参照了这个教程。最终,我做出了一个可以工作的分配器,正准备休息一下,却发现这个自定义分配器比默认分配器慢得多。这是完整的代码:

#include <chrono>
#include <cstddef>
#include <iostream>
#include <limits>
#include <list>
#include <string>
#include <vector>
// Brings the std::chrono names (clocks, duration_cast, milliseconds) into scope for the timing code.
using namespace std::chrono;

// Byte type used both for raw chunk storage and for block indices/counts.
using uchar = unsigned char;

/// A contiguous slab of equally sized blocks whose free list is threaded
/// through the unused blocks themselves.
///
/// The first byte of every free block stores the index of the next free
/// block, which is why indices and counts are `uchar`. The chunk remembers
/// neither its block size nor its block count; the caller passes them to every
/// operation. All members are private and reachable only through the
/// FixedAllocator friend.
class Chunk
{
    friend class FixedAllocator;

    /// Acquires storage for `blocks` blocks of `blockSize` bytes and links
    /// them into the free list.
    void init(size_t blockSize, uchar blocks);

    /// Frees the storage acquired by init().
    void release();

    /// Hands out one free block, or a null pointer when none is left.
    void* allocate(size_t blockSize);

    /// Gives the block at `p` back to this chunk.
    void deallocate(void* p, size_t blockSize);

    /// Reports whether `p` points into this chunk's storage.
    ///
    /// @param p        Address to test.
    /// @param chunkLen Total size of the chunk's storage in bytes.
    bool hasBlock(void* p, size_t chunkLen) const
    {
        uchar* pc = static_cast<uchar*>(p);
        // Half-open range: pData + chunkLen is one past the end of this
        // chunk and may be the first byte of a different chunk, so it must
        // not be reported as owned.
        return (pData <= pc) && (pc < (pData + chunkLen));
    }

    /// True when every one of the `numBlocks` blocks is free again.
    bool releasable(uchar numBlocks) const
    {
        return blocksAvailable == numBlocks;
    }

    /// Start of the chunk's storage.
    uchar* pData;

    /// Index of the head of the free list, and number of blocks still free.
    uchar firstAvailableBlock, blocksAvailable;
};


void Chunk::init(size_t blockSize, uchar blocks)
{
// for n of Ts it will allocate n * sizeof(T) memory
pData = new uchar[blockSize * blocks];
firstAvailableBlock = 0;
blocksAvailable = blocks;
uchar i = 0;
uchar* p = pData;
// used by allocate method to move forward firstAvailableBlock
for (; i != blocks; p += blockSize)
{
*p = ++i;
}
}


/// Frees the chunk's storage.
///
/// The storage is obtained with `new uchar[...]` in Chunk::init, so it has to
/// be returned with the array form of delete; pairing it with the plain
/// `::operator delete` is undefined behaviour.
void Chunk::release()
{
    delete[] pData;
    // Leave no dangling pointer behind so a repeated release() is harmless.
    pData = nullptr;
}


void* Chunk::allocate(size_t blockSize)
{
if (!blocksAvailable) return 0;
// move firstAvailableBlock one block ahead
uchar* pResult = pData + firstAvailableBlock * blockSize;
firstAvailableBlock = *pResult;
--blocksAvailable;
return pResult;
}


/// Pushes the block at `p` back onto the front of the free list.
///
/// @param p         Block previously handed out by Chunk::allocate.
/// @param blockSize Size in bytes of one block of this chunk.
void Chunk::deallocate(void* p, size_t blockSize)
{
    uchar* toRelease = static_cast<uchar*>(p);
    // Link the freed block to the current head before it becomes the new
    // head. Without this store the block keeps whatever byte the user left in
    // it, and Chunk::allocate would follow a garbage index afterwards.
    *toRelease = firstAvailableBlock;
    firstAvailableBlock = static_cast<uchar>((toRelease - pData) / blockSize);
    ++blocksAvailable;
}


/// Allocator for blocks of one fixed size, backed by a growing list of Chunks.
///
/// The destructor releases the storage of every chunk, so the object owns that
/// storage. Copying is disabled: a copy would hold the same chunk pointers and
/// release them a second time.
class FixedAllocator
{
private:
    /// Size in bytes of every block handed out.
    size_t blockSize;
    /// Number of blocks per chunk.
    uchar blocks;
    using Chunks = std::vector<Chunk>;
    /// All chunks currently owned by this allocator.
    Chunks chunks;
    /// Chunk tried first by allocate(); null when there is no candidate.
    Chunk* allocChunk;

public:
    FixedAllocator();
    ~FixedAllocator();

    FixedAllocator(const FixedAllocator&) = delete;
    FixedAllocator& operator=(const FixedAllocator&) = delete;

    /// Sets the block size and derives the per-chunk block count from `pageSize`.
    void init(size_t blockSize, size_t pageSize);
    /// Returns one block of `blockSize` bytes.
    void* allocate();
    /// Returns a block previously obtained from allocate().
    void deallocate(void* p);
};


/// Creates an allocator with no chunks and a zero block size and block count;
/// init() supplies the real geometry.
FixedAllocator::FixedAllocator()
    : blockSize(0), blocks(0), chunks(), allocChunk(nullptr)
{
}


/// Releases the storage of every chunk that is still alive.
FixedAllocator::~FixedAllocator()
{
    for (Chunk& chunk : chunks)
    {
        chunk.release();
    }
}


void FixedAllocator::init(size_t blockSize_, size_t pageSize)
{
blockSize = blockSize_;
size_t numBlocks = pageSize / blockSize;
blocks = static_cast<uchar>(numBlocks);
}


void* FixedAllocator::allocate()
{
if (!allocChunk || allocChunk->blocksAvailable == 0)
{
Chunks::iterator it = chunks.begin();
for (;;++it)
{
if (it == chunks.end())
{
// allocate memory for one more chunk
chunks.reserve(chunks.size() + 1);
Chunk newChunk;
newChunk.init(blockSize, blocks);
// add new chunk to memory pool
chunks.push_back(newChunk);
// points to new just initiated chunk
allocChunk = &chunks.back();
break;
}
if (it->blocksAvailable > 0)
{
// points to chunk with available blocks
allocChunk = &*it;
break;
}
}
}
return allocChunk->allocate(blockSize);
}


void FixedAllocator::deallocate(void* p)
{
size_t chunkLen = blocks * blockSize;
Chunks::iterator it;
int cPos = 0;
for (it = chunks.begin(); it != chunks.end(); ++it, ++cPos)
{
if (it->hasBlock(p, chunkLen))
{
it->deallocate(p, blockSize);
if (it->releasable(blocks)) {
it->release();
chunks.erase(chunks.begin() + cPos);
// allocChunk may point to deleted chunk
// so, reset it
if (!chunks.empty()) {
allocChunk = &chunks.back();
} else {
allocChunk = nullptr;
}
} else {
// there are free blocks in chunk
// so, reset allocChunk for fast search
allocChunk = &*it;
}
break;
}
}
}


class SmallObjAllocator
{
public:
SmallObjAllocator(size_t pageSize, size_t maxObjectSize);
void* allocate(size_t numBytes);
void deallocate(void* p, size_t numBytes);
private:
FixedAllocator* pool;
size_t maxObjectSize;
};


/// Builds one FixedAllocator for every size from 1 to `maxObjectSize_` bytes,
/// each configured with the same `pageSize`.
SmallObjAllocator::SmallObjAllocator(size_t pageSize, size_t maxObjectSize_)
    : pool(new FixedAllocator[maxObjectSize_]),
      maxObjectSize(maxObjectSize_)
{
    for (size_t index = 0; index != maxObjectSize; ++index)
    {
        // Slot `index` serves blocks of index + 1 bytes.
        FixedAllocator& slot = pool[index];
        slot.init(index + 1, pageSize);
    }
}


/// Returns storage for `numBytes` bytes.
///
/// Requests of 1..maxObjectSize bytes are served by the matching fixed-size
/// pool. Everything else goes to the global `operator new`. That includes
/// zero-byte requests, for which `numBytes - 1` would wrap around and index
/// far outside `pool`.
void* SmallObjAllocator::allocate(size_t numBytes)
{
    if (numBytes == 0 || numBytes > maxObjectSize)
    {
        return ::operator new(numBytes);
    }
    return pool[numBytes - 1].allocate();
}


/// Returns storage obtained from allocate() with the same `numBytes`.
///
/// Uses the same routing rule as allocate(), so every pointer goes back to
/// where it came from.
void SmallObjAllocator::deallocate(void* p, size_t numBytes)
{
    if (numBytes == 0 || numBytes > maxObjectSize)
    {
        ::operator delete(p);
        return;
    }
    pool[numBytes - 1].deallocate(p);
}


/// Standard-library style allocator that forwards to a SmallObjAllocator
/// shared by all instances of the same specialization.
///
/// @tparam T         Element type.
/// @tparam numBlocks Forwarded to rebound allocators and used to size the
///                   shared SmallObjAllocator.
template<typename T, size_t numBlocks = 64>
class Allocator
{
public:
    using value_type = T;

    Allocator() noexcept {}

    /// Converting constructor used when a container rebinds the allocator.
    /// There is no per-instance state to copy, but the constructor needs a
    /// definition: a declaration alone fails at link time once it is used.
    template<typename U, size_t N>
    Allocator(Allocator<U, N> const&) noexcept {}

    template<typename U>
    struct rebind
    {
        using other = Allocator<U, numBlocks>;
    };

    /// Returns uninitialized storage for `cnt` objects of type T.
    T* allocate(size_t cnt)
    {
        return static_cast<T*>(allocator.allocate(sizeof(T) * cnt));
    }

    /// Returns storage obtained from allocate() with the same `cnt`.
    void deallocate(T* p, size_t cnt)
    {
        allocator.deallocate(p, sizeof(T) * cnt);
    }

    /// Copy-constructs a T from `val` in the storage at `p`.
    void construct(T* p, T const& val)
    {
        ::new (static_cast<void*>(p)) T(val);
    }

    /// Runs the destructor of the object at `p` without freeing its storage.
    void destroy(T* p)
    {
        p->~T();
    }

    /// All instances of one specialization use the same static pool, so
    /// storage from any of them can be returned through any other.
    bool operator==(Allocator const&) const noexcept { return true; }
    bool operator!=(Allocator const&) const noexcept { return false; }

private:
    static SmallObjAllocator allocator;
};


// Definition of the pool shared by every Allocator<Elem, Count> instance:
// the page size is Count * sizeof(Elem) bytes and the largest pooled request
// is sizeof(Elem) bytes.
template<typename Elem, size_t Count>
SmallObjAllocator Allocator<Elem, Count>::allocator(Count * sizeof(Elem), sizeof(Elem));


/// Prints `comment`, appends the integers 0..9999 to `l` with push_back and
/// prints the elapsed wall time in whole milliseconds followed by "ms".
///
/// @param comment Label written before the measurement.
/// @param l       Container to fill; taken by value, so the caller's object is
///                left untouched.
template<class List>
void test(std::string comment, List l)
{
    std::cout << comment;
    // steady_clock is monotonic; high_resolution_clock may alias the system
    // clock, which can jump and distort a measured interval.
    const auto start_time = std::chrono::steady_clock::now();
    for (int i = 0; i < 10000; ++i)
    {
        l.push_back(i);
    }
    const auto end_time = std::chrono::steady_clock::now();
    const auto elapsed =
        std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
    // '\n' instead of std::endl: no flush is needed after each measurement.
    std::cout << elapsed.count() << "ms" << '\n';
}


int main() {
test("default list ", std::list<int>());
test("list with custom allocator ", std::list<int, Allocator<int, 10000>>());
return 0;
}

如您所见,我在客户端代码中做了一些性能测量:默认列表的填充时间为 0 毫秒,而使用自定义分配器的列表的填充时间为 3 毫秒。我原以为问题全出在 deallocate 方法上,于是把它注释掉,但得到的结果仍然完全相同。那么,这种性能下降的原因可能是什么?我遗漏了什么?

最佳答案

默认分配器( std::allocator )通常实现为对 new 和 delete 的一层相对较薄的包装。

您示例中的分配器似乎是一个混合的 sub/bump(增量)分配器。总而言之,如果分配器的内存耗尽,它会向系统申请一块内存,然后在这块可用内存中以 bump 方式进行分配。

除其他事项外,请考虑:

  • 它不是线程安全的。并发访问最终会破坏它。这对于使用单线程的独立分析来说无关紧要,但仍然是一个重要的考虑因素。
  • 它在所有地方都手动管理内存。例如,Chunk 管理内存却没有析构函数,必须显式调用 Chunk::release 才能销毁它(即在 ~FixedAllocator() 中)。请使用 RAII 避免手动内存管理(即使在编写分配器时也是如此):

    class Chunk
    {
    // private: not required, classes are private by default.
    friend class FixedAllocator;

    // Replaced init(...) with constructor.
    Chunk(size_t blockSize, uchar block) :
    pData(new uchar[blockSize * blocks]),
    firstAvailableBlock(0),
    blocksAvailable(blocks)
    {
    uchar* p = pData;
    for (uchar i = 0; i != blocks; p += blockSize)
    {
    *p = ++i;
    }
    }
    Chunk(const Chunk& other) = delete; // Disable copy construction.
    Chunk(Chunk&& other) :
    pData(std::move(other.pData)),
    firstAvailableBlock(other.firstAvailableBlock),
    blocksAvailable(other.blocksAvailable)
    {
    other.firstAvailableBlock = 0;
    other.blocksAvailable = 0;
    }

    Chunk& operator=(const Chunk&& other) = delete; // Disable copy assignment.
    Chunk& operator=(Chunk&& other)
    {
    pData = std::move(other.pData);
    firstAvailableBlock = other.firstAvailableBlock;
    blocksAvailable = other.blocksAvailable;
    other.firstAvailableBlock = 0;
    other.blocksAvailable = 0;
    return *this;
    }

    //...
    void release()
    {
    pData.reset();
    }
    //...

    std::unique_ptr<uchar[]> pData; // Automatically deleted in the implicitly generated destructor.
    uchar firstAvailableBlock, blocksAvailable;
    };

    // And of course don't forget to update chunk creation:
    //...
    Chunk newChunk(blockSize, blocks);
    chunks.push_back(std::move(newChunk));
    //...
  • Chunk::hasBlock 没有考虑空洞。如果您依次分配 10 字节/5 字节/10 字节,之后释放那个 5 字节的块,那么对于该 5 字节块范围内的地址,hasBlock 会返回 false,即使这段空间实际上是可用的。要正确解决这个问题,需要一个跟踪分配情况的系统。

它比较慢,因为它比典型的 std::allocator 做更多的整体工作实现。

  • 小对象大小被设置为 sizeof(int),这很可能是 4。而 std::list 节点的大小至少为 12(后向指针(4-8)、前向指针(4-8)、对象(4+))。因此,至少对于列表节点,SmallObjAllocator::allocate() 和 SmallObjAllocator::deallocate() 不会调用 new 或 delete,而是始终调用 FixedAllocator::allocate() 和 FixedAllocator::deallocate()。

  • FixedAllocator::allocate() 和 FixedAllocator::deallocate() 很慢。它们都执行线性搜索,在最坏的情况下这意味着要遍历所有块。即使在一般情况下,也有大量时间花在分配器上,而不是您的程序上。优化这两个函数的收益最大。

  • 您的分配器的 blockSize 被设置为 sizeof(int) * 10000(大约 40k)。而向 std::list<int> 插入 10k 次至少需要 120kb( sizeof(node) * 10000 ),因此在您的示例中 FixedAllocator 很可能至少要调整两次大小(假设采用加倍的扩容策略)。您可以把 blockSize 设置得足够大,使其永远不需要调整大小,从而消除这一开销。
    Allocator<int, 100000> (100k) 对于您的示例来说应该绰绰有余。

分配器是一个非常复杂的主题,老实说,其中的细节太多,不写一篇短篇小说就无法完整解释如何优化您的示例。我建议阅读有关分配器设计的资料,并研究现实世界中使用的分配器,以便更好地理解该主题。

参见:

关于c++ - 为什么我的自定义分配器比默认分配器慢,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/48102916/

28 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com