gpt4 book ai didi

python - 从Python使用Elasticsearch API时已删除的文档

转载 作者:行者123 更新时间:2023-12-03 02:37:48 25 4
gpt4 key购买 nike

我对Elasticsearch相对较新,并且在确定为什么python dataframe中的记录数与索引文档计数Elasticsearch不同时遇到了问题。

我首先通过运行以下命令创建索引:您可以看到有62932条记录。

我正在使用以下方法在elasticsearch中创建索引:
Python code

当我检查Kibana Management/Index Management中的索引时,只有62630个文档。根据“统计”窗口,有302个已删除计数。我不知道这是什么意思。

以下是STATS窗口的输出
{
"_shards": {
"total": 2,
"successful": 1,
"failed": 0
},
"stats": {
"uuid": "egOx_6EwTFysBr0WkJyR1Q",
"primaries": {
"docs": {
"count": 62630,
"deleted": 302
},
"store": {
"size_in_bytes": 4433722
},
"indexing": {
"index_total": 62932,
"index_time_in_millis": 3235,
"index_current": 0,
"index_failed": 0,
"delete_total": 0,
"delete_time_in_millis": 0,
"delete_current": 0,
"noop_update_total": 0,
"is_throttled": false,
"throttle_time_in_millis": 0
},
"get": {
"total": 0,
"time_in_millis": 0,
"exists_total": 0,
"exists_time_in_millis": 0,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 140,
"query_time_in_millis": 1178,
"query_current": 0,
"fetch_total": 140,
"fetch_time_in_millis": 1233,
"fetch_current": 0,
"scroll_total": 1,
"scroll_time_in_millis": 6262,
"scroll_current": 0,
"suggest_total": 0,
"suggest_time_in_millis": 0,
"suggest_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 2,
"total_time_in_millis": 417,
"total_docs": 62932,
"total_size_in_bytes": 4882755,
"total_stopped_time_in_millis": 0,
"total_throttled_time_in_millis": 0,
"total_auto_throttle_in_bytes": 20971520
},
"refresh": {
"total": 26,
"total_time_in_millis": 597,
"external_total": 24,
"external_total_time_in_millis": 632,
"listeners": 0
},
"flush": {
"total": 1,
"periodic": 0,
"total_time_in_millis": 10
},
"warmer": {
"current": 0,
"total": 23,
"total_time_in_millis": 0
},
"query_cache": {
"memory_size_in_bytes": 17338,
"total_count": 283,
"hit_count": 267,
"miss_count": 16,
"cache_size": 4,
"cache_count": 4,
"evictions": 0
},
"fielddata": {
"memory_size_in_bytes": 0,
"evictions": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 2,
"memory_in_bytes": 22729,
"terms_memory_in_bytes": 17585,
"stored_fields_memory_in_bytes": 2024,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 512,
"points_memory_in_bytes": 2112,
"doc_values_memory_in_bytes": 496,
"index_writer_memory_in_bytes": 0,
"version_map_memory_in_bytes": 0,
"fixed_bit_set_memory_in_bytes": 0,
"max_unsafe_auto_id_timestamp": -1,
"file_sizes": {}
},
"translog": {
"operations": 62932,
"size_in_bytes": 17585006,
"uncommitted_operations": 0,
"uncommitted_size_in_bytes": 55,
"earliest_last_modified_age": 0
},
"request_cache": {
"memory_size_in_bytes": 0,
"evictions": 0,
"hit_count": 0,
"miss_count": 0
},
"recovery": {
"current_as_source": 0,
"current_as_target": 0,
"throttle_time_in_millis": 0
}
},
"total": {
"docs": {
"count": 62630,
"deleted": 302
},
"store": {
"size_in_bytes": 4433722
},
"indexing": {
"index_total": 62932,
"index_time_in_millis": 3235,
"index_current": 0,
"index_failed": 0,
"delete_total": 0,
"delete_time_in_millis": 0,
"delete_current": 0,
"noop_update_total": 0,
"is_throttled": false,
"throttle_time_in_millis": 0
},
"get": {
"total": 0,
"time_in_millis": 0,
"exists_total": 0,
"exists_time_in_millis": 0,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 140,
"query_time_in_millis": 1178,
"query_current": 0,
"fetch_total": 140,
"fetch_time_in_millis": 1233,
"fetch_current": 0,
"scroll_total": 1,
"scroll_time_in_millis": 6262,
"scroll_current": 0,
"suggest_total": 0,
"suggest_time_in_millis": 0,
"suggest_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 2,
"total_time_in_millis": 417,
"total_docs": 62932,
"total_size_in_bytes": 4882755,
"total_stopped_time_in_millis": 0,
"total_throttled_time_in_millis": 0,
"total_auto_throttle_in_bytes": 20971520
},
"refresh": {
"total": 26,
"total_time_in_millis": 597,
"external_total": 24,
"external_total_time_in_millis": 632,
"listeners": 0
},
"flush": {
"total": 1,
"periodic": 0,
"total_time_in_millis": 10
},
"warmer": {
"current": 0,
"total": 23,
"total_time_in_millis": 0
},
"query_cache": {
"memory_size_in_bytes": 17338,
"total_count": 283,
"hit_count": 267,
"miss_count": 16,
"cache_size": 4,
"cache_count": 4,
"evictions": 0
},
"fielddata": {
"memory_size_in_bytes": 0,
"evictions": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 2,
"memory_in_bytes": 22729,
"terms_memory_in_bytes": 17585,
"stored_fields_memory_in_bytes": 2024,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 512,
"points_memory_in_bytes": 2112,
"doc_values_memory_in_bytes": 496,
"index_writer_memory_in_bytes": 0,
"version_map_memory_in_bytes": 0,
"fixed_bit_set_memory_in_bytes": 0,
"max_unsafe_auto_id_timestamp": -1,
"file_sizes": {}
},
"translog": {
"operations": 62932,
"size_in_bytes": 17585006,
"uncommitted_operations": 0,
"uncommitted_size_in_bytes": 55,
"earliest_last_modified_age": 0
},
"request_cache": {
"memory_size_in_bytes": 0,
"evictions": 0,
"hit_count": 0,
"miss_count": 0
},
"recovery": {
"current_as_source": 0,
"current_as_target": 0,
"throttle_time_in_millis": 0
}
}
}
}

为什么文档计数与索引总数不同?我已导出数据,且记录数与文档数匹配。我如何找出为什么删除文档并确保将来不再删除它们?

最佳答案

可能的原因:

  • 删除的文档占用了索引中的磁盘空间。
  • 内存中按文档的数据结构(例如规范或字段数据)仍将占用已删除文档的RAM。
  • 搜索吞吐量较低,因为每次搜索都必须为每个潜在的命中检查已删除的位集。在下面的更多内容。
  • 用于查询评分的汇总术语统计信息仍将反射(reflect)已删除的术语和文档。合并完成后,术语统计将突然跳近其真实值,从而更改命中分数。实际上,这种影响是很小的,除非删除的文档的统计数据与索引的其余部分不同。
  • 删除的文档将单个分片的最大2.1 B文档中的文档ID bundle 在一起。如果您的碎片接近该极限(不建议!),这可能很重要。
  • 模糊查询的结果可能略有不同,因为它们可能与虚假词匹配。

  • https://www.elastic.co/guide/en/elasticsearch/reference/current//cat-indices.html
    https://www.elastic.co/blog/lucenes-handling-of-deleted-documents

    关于python - 从Python使用Elasticsearch API时已删除的文档,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/58526750/

    25 4 0
    Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
    广告合作:1813099741@qq.com 6ren.com