gpt4 book ai didi

mysql - 检测重复英文名

转载 作者:行者123 更新时间:2023-11-29 09:00:21 26 4
gpt4 key购买 nike

我试图找到一个示例,演示如何用 Lucene 或其他类型的索引来检查英文名字和姓氏的组合是否可能重复。重复检查需要能够考虑常见的昵称(例如 Robert 的昵称 Bob、William 的昵称 Bill)以及拼写错误。有谁知道这样的例子吗?

我计划在用户注册期间执行重复项搜索。需要根据从存储用户名的数据库表构建的索引来检查新的用户记录。

最佳答案

我会在索引时对firstName使用SynonymFilter,这样你就有了所有可能的组合(Bob -> Robert,Robert -> Bob,等等...)。为您现有的用户建立索引。

然后使用 QueryParser(其分析器中不包含 SynonymFilter)来执行一些模糊查询。

这是我想出的代码:

/**
 * Demonstrates detecting likely-duplicate English names with a Lucene index.
 *
 * <p>First names are expanded with nickname synonyms at index time only, so a
 * query for any variant of a name (e.g. "Bob") finds documents indexed under
 * another variant of the same group (e.g. "Robert"). Misspelled last names are
 * handled at query time with fuzzy queries ({@code term~}).
 */
public class NameDuplicateTests {
    /** Maximum number of hits fetched for each query. */
    private static final int MAX_HITS = 3;

    /**
     * Maps every known first-name variant to all members of its nickname
     * group, the variant itself included.
     */
    private static final Multimap<String, String> firstNameSynonyms = buildFirstNameSynonyms();

    private Analyzer analyzer;
    private IndexSearcher searcher;
    private IndexReader reader;
    private QueryParser qp;

    /** Builds the nickname table used by the index-time analyzer. */
    private static Multimap<String, String> buildFirstNameSynonyms() {
        Multimap<String, String> synonyms = HashMultimap.create();
        addSynonymGroup(synonyms, ImmutableList.of("Bob", "Bobby", "Robert"));
        addSynonymGroup(synonyms, ImmutableList.of("William", "Will", "Bill", "Billy"));
        return synonyms;
    }

    /** Registers {@code group} so that each of its names maps to the whole group. */
    private static void addSynonymGroup(Multimap<String, String> target, List<String> group) {
        for (String name : group) {
            target.putAll(name, group);
        }
    }

    /**
     * Creates the index-time analyzer.
     *
     * <p>Every field is split on whitespace; the {@code firstName} field is
     * additionally passed through a {@link SynonymFilter} backed by the
     * nickname table.
     *
     * @return a new analyzer instance
     */
    public static Analyzer createAnalyzer() {
        return new Analyzer() {
            @Override
            public TokenStream tokenStream(String fieldName, Reader reader) {
                TokenStream stream = new WhitespaceTokenizer(reader);
                // Constant-first comparison stays safe if fieldName is null.
                if ("firstName".equals(fieldName)) {
                    stream = new SynonymFilter(stream, new SynonymEngine() {
                        @Override
                        public String[] getSynonyms(String s) throws IOException {
                            // Names without an entry yield an empty array.
                            return firstNameSynonyms.get(s).toArray(new String[0]);
                        }
                    });
                }
                return stream;
            }
        };
    }

    /**
     * Builds an in-memory index of five sample users and opens the searcher,
     * reader and query parser used by the tests.
     *
     * @throws Exception if the index cannot be written or opened
     */
    @Before
    public void setUp() throws Exception {
        Directory dir = new RAMDirectory();
        analyzer = createAnalyzer();

        // Parallel lists: entry i of each list forms the i-th user.
        ImmutableList<String> firstNames = ImmutableList.of("William", "Robert", "Bobby", "Will", "Anton");
        ImmutableList<String> lastNames = ImmutableList.of("Robert", "Williams", "Mayor", "Bob", "FunkyMother");

        IndexWriter writer = new IndexWriter(dir, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
        try {
            for (int id = 0; id < firstNames.size(); id++) {
                Document doc = new Document();
                doc.add(new Field("id", String.valueOf(id), Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field("firstName", firstNames.get(id), Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("lastName", lastNames.get(id), Field.Store.YES, Field.Index.NOT_ANALYZED));
                writer.addDocument(doc);
            }
        } finally {
            // Close the writer even when indexing fails so it is not leaked.
            writer.close();
        }

        // The query side deliberately uses a plain analyzer without the
        // synonym filter: synonyms are already present in the index.
        qp = new QueryParser(Version.LUCENE_30, "firstName", new WhitespaceAnalyzer());
        searcher = new IndexSearcher(dir);
        reader = searcher.getIndexReader();
    }

    /**
     * Closes the searcher opened by {@link #setUp()}.
     *
     * @throws Exception if closing the searcher fails
     */
    @After
    public void tearDown() throws Exception {
        // The searcher is still null when setUp() failed before creating it;
        // guarding here keeps that original failure from being masked.
        if (searcher != null) {
            searcher.close();
        }
    }

    /**
     * Checks that a nickname query, with an exact and with a misspelled last
     * name, finds the single matching user "Robert Williams".
     *
     * @throws Exception if parsing or searching fails
     */
    @Test
    public void testNameFilter() throws Exception {
        assertOnlyHit("Robert Williams", search("+firstName:Bob +lastName:Williams"));
        assertOnlyHit("Robert Williams", search("+firstName:Bob +lastName:Wolliam~"));
    }

    /**
     * Fails unless {@code hits} contains exactly the one expected name.
     *
     * @param expected the expected "firstName lastName" string
     * @param hits the names returned by {@link #search(String)}
     */
    private static void assertOnlyHit(String expected, List<String> hits) {
        if (hits.size() != 1 || !expected.equals(hits.get(0))) {
            throw new AssertionError("Expected exactly [" + expected + "] but found " + hits);
        }
    }

    /**
     * Parses and runs {@code query}, printing the parsed query and each hit.
     *
     * @param query a query in Lucene query-parser syntax
     * @return the "firstName lastName" of each hit, in the order returned by the searcher
     * @throws ParseException if the query string cannot be parsed
     * @throws IOException if the index cannot be read
     */
    private List<String> search(String query) throws ParseException, IOException {
        Query parsed = qp.parse(query);
        System.out.println(parsed);

        ImmutableList.Builder<String> names = ImmutableList.builder();
        TopDocs hits = searcher.search(parsed, MAX_HITS);
        for (ScoreDoc hit : hits.scoreDocs) {
            Document doc = reader.document(hit.doc);
            String fullName = doc.get("firstName") + " " + doc.get("lastName");
            System.out.println("Found " + fullName);
            names.add(fullName);
        }
        return names.build();
    }
}

结果是:

+firstName:Bob +lastName:Williams
Found Robert Williams
+firstName:Bob +lastName:wolliam~0.5
Found Robert Williams

希望有帮助!

关于mysql - 检测重复英文名,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/8814190/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com