gpt4 book ai didi

c++ - 在文本中搜索 25 000 个单词

转载 作者:IT老高 更新时间:2023-10-28 22:07:21 25 4
gpt4 key购买 nike

我需要在文本中找到大约 25 000 个单词。为此目的最合适的算法/库是什么?

目标语言是 C++

最佳答案

I once used the Boyer-Moore algorithm and it was quite fast.

Boyer-Moore 不适合高效地搜索多个单词。实际上有一种非常有效的算法可以做到这一点,称为 Wu-Manber 算法。我将发布一个引用实现。但是请注意,我前段时间这样做只是为了教育目的。因此,该实现并不适合直接使用,而且还可以提高效率。

它还使用 Dinkumware STL 中的 stdext::hash_map。替换为 std::tr1::unordered_map 或适当的实现。

lecture script 中有对该算法的解释。摘自 Knut Reinert 在柏林自由大学举办的一场讲座。

original paper也在网上(刚刚又找到了),但我不是特别喜欢那里提供的伪代码。

#ifndef FINDER_HPP
#define FINDER_HPP

#include <string>

namespace thru { namespace matching {

class Finder {
public:
virtual bool find() = 0;

virtual std::size_t position() const = 0;

virtual ~Finder() = 0;

protected:
static size_t code_from_chr(char c) {
return static_cast<size_t>(static_cast<unsigned char>(c));
}
};

inline Finder::~Finder() { }

} } // namespace thru::matching

#endif // !defined(FINDER_HPP)

#include <vector>
#include <hash_map>

#include "finder.hpp"

#ifndef WUMANBER_HPP
#define WUMANBER_HPP

namespace thru { namespace matching {

class WuManberFinder : public Finder {
public:

WuManberFinder(std::string const& text, std::vector<std::string> const& patterns);

bool find();

std::size_t position() const;

std::size_t pattern_index() const;

private:

template <typename K, typename V>
struct HashMap {
typedef stdext::hash_map<K, V> Type;
};

typedef HashMap<std::string, std::size_t>::Type shift_type;
typedef HashMap<std::string, std::vector<std::size_t> >::Type hash_type;

std::string const& m_text;
std::vector<std::string> const& m_patterns;
shift_type m_shift;
hash_type m_hash;
std::size_t m_pos;
std::size_t m_find_pos;
std::size_t m_find_pattern_index;
std::size_t m_lmin;
std::size_t m_lmax;
std::size_t m_B;
};

} } // namespace thru::matching

#endif // !defined(WUMANBER_HPP)

#include <cmath>
#include <iostream>

#include "wumanber.hpp"

using namespace std;

namespace thru { namespace matching {

WuManberFinder::WuManberFinder(string const& text, vector<string> const& patterns)
: m_text(text)
, m_patterns(patterns)
, m_shift()
, m_hash()
, m_pos()
, m_find_pos(0)
, m_find_pattern_index(0)
, m_lmin(m_patterns[0].size())
, m_lmax(m_patterns[0].size())
, m_B()
{
for (size_t i = 0; i < m_patterns.size(); ++i) {
if (m_patterns[i].size() < m_lmin)
m_lmin = m_patterns[i].size();
else if (m_patterns[i].size() > m_lmax)
m_lmax = m_patterns[i].size();
}

m_pos = m_lmin;
m_B = static_cast<size_t>(ceil(log(2.0 * m_lmin * m_patterns.size()) / log(256.0)));

for (size_t i = 0; i < m_patterns.size(); ++i)
m_hash[m_patterns[i].substr(m_patterns[i].size() - m_B)].push_back(i);

for (size_t i = 0; i < m_patterns.size(); ++i) {
for (size_t j = 0; j < m_patterns[i].size() - m_B + 1; ++j) {
string bgram = m_patterns[i].substr(j, m_B);
size_t pos = m_patterns[i].size() - j - m_B;

shift_type::iterator old = m_shift.find(bgram);
if (old == m_shift.end())
m_shift[bgram] = pos;
else
old->second = min(old->second, pos);
}
}
}

bool WuManberFinder::find() {
while (m_pos <= m_text.size()) {
string bgram = m_text.substr(m_pos - m_B, m_B);
shift_type::iterator i = m_shift.find(bgram);
if (i == m_shift.end())
m_pos += m_lmin - m_B + 1;
else {
if (i->second == 0) {
vector<size_t>& list = m_hash[bgram];
// Verify all patterns in list against the text.
++m_pos;
for (size_t j = 0; j < list.size(); ++j) {
string const& str = m_patterns[list[j]];
m_find_pos = m_pos - str.size() - 1;
size_t k = 0;

for (; k < str.size(); ++k)
if (str[k] != m_text[m_find_pos + k])
break;

if (k == str.size()) {
m_find_pattern_index = list[j];
return true;
}
}
}
else
m_pos += i->second;
}
}

return false;
}

size_t WuManberFinder::position() const {
return m_find_pos;
}

size_t WuManberFinder::pattern_index() const {
return m_find_pattern_index;
}

} } // namespace thru::matching

使用示例:

vector<string> patterns;
patterns.push_back("announce");
patterns.push_back("annual");
patterns.push_back("annually");

WuManberFinder wmf("CPM_annual_conference_announce", patterns);

while (wmf.find())
cout << "Pattern \"" << patterns[wmf.pattern_index()] <<
"\" found at position " << wmf.position() << endl;

关于c++ - 在文本中搜索 25 000 个单词,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/154365/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com