
python - How to split one html page into multiple html files based on keywords in the text


I want to split a single html file into multiple html files based on the keyword PART. The given html file contains text that mentions four parts - PART I, PART II, PART III and PART IV.

I want to split the html into 5 parts:

  • Part 0 - should contain the text from the beginning of the html up to PART I
  • Part I - should contain the text from PART I up to PART II
  • Part II - should contain the text from PART II up to PART III
  • Part III - should contain the text from PART III up to PART IV
  • Part IV - should contain the text from PART IV to the end.

Here are some sample html files:

https://www.sec.gov/Archives/edgar/data/763744/000076374419000018/lcii-20181231.htm
https://www.sec.gov/Archives/edgar/data/820027/000082002719000010/amp12312018.htm
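Both sample filings separate printed pages with <hr> elements whose inline styles carry page-break rules, which is what my code keys on. A quick sanity check (a minimal sketch, reusing the first sample URL above):

import re
from urllib.request import urlopen

# Count the inline page-break markers in the first sample filing.
sample = "https://www.sec.gov/Archives/edgar/data/763744/000076374419000018/lcii-20181231.htm"
html = urlopen(sample).read().decode("utf-8")
print(len(re.findall(r"page-break-(?:before|after)", html, re.IGNORECASE)))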

Please refer to my code below:

import sys
import re
from bs4 import BeautifulSoup
import os
import numpy as np
from urllib.request import urlopen
import pandas as pd

list_values_page_number = []


type_parts = ['PART 0', 'PART I', 'PART II', 'PART III', 'PART IV']
output_path = r"D:\Tasks\10K\SEGMENTATION\2_segmentation"
input_files = ['https://www.sec.gov/Archives/edgar/data/763744/000076374419000018/lcii-20181231.htm',
               'https://www.sec.gov/Archives/edgar/data/820027/000082002719000010/amp12312018.htm']
input_folder = r'D:\Tasks\10K\input_files'
#content_segmentation_file_name = '/home/mobius365/Downloads/10-K_financial_documents/content_segmentation.csv'

#co_ent_nbr_links = dict(zip(list(input_data_frame["CO_Ent_Nbr"]),list(input_data_frame["Updated_Links"])))


def page_segmentation(list_of_content, prev_index, page_number):
    global Part_page_number
    global previous_index
    global count
    global store_index_list
    global output_file_storage_folder
    global file_content_prettified_list
    global part_repeat_storage_list
    global indices
    page_soup = BeautifulSoup(" ".join(list_of_content), "lxml")
    values_with_part = page_soup.findAll(text=re.compile("Par|PAR|ART"))
    list_of_values = []
    values_with_part = [values_list.strip() for values_list in values_with_part]
    for Part_values in values_with_part:
        if (("ART" in Part_values.strip()[:5] or "art" in Part_values.strip()[:5]) and Part_values.strip()[-1] in ["I", "V"] and len(Part_values) < 9):
            list_of_values.append(Part_values)
        elif (len(Part_values.strip()) < 6):
            list_of_values.append(Part_values)
        else:
            pass

    if len(list_of_values) == 1:
        values_parents_finder = page_soup.find(text=re.compile(list_of_values[0]))
        parent_0_value = values_parents_finder.findParents()[0].text.strip().upper()
        parent_1_value = values_parents_finder.findParents()[1].text.strip().upper()
        parent_0_value = parent_0_value.replace(u'\xa0', u' ')
        parent_1_value = parent_1_value.replace(u'\xa0', u' ')
        parent_0_value = re.sub(' +', '', parent_0_value)
        parent_1_value = re.sub(' +', '', parent_1_value)
        if ((parent_0_value[0] == 'P' and parent_0_value[-1] in ["I", "V"]) or (parent_1_value[0] == 'P' and (parent_1_value[-1] in ["I", "V"] or parent_1_value[-2:] in ["I.", "V."]))):

            if (parent_0_value[:4].upper() == 'PART' and (parent_0_value[-1] in ["I", "V"] or parent_0_value[-2:] in ["I.", "V."])):
                temp_name = re.sub('t', 't ', parent_0_value)
                temp_name = re.sub('T', 'T ', parent_0_value)
            else:
                temp_name = re.sub('t', 't ', parent_1_value)
                temp_name = re.sub('T', 'T ', parent_1_value)

            if (temp_name not in part_repeat_storage_list):
                part_repeat_storage_list.append(temp_name)
                Part_page_number[temp_name.upper()] = page_number
                next_level_index = prev_index
                with open(output_file_storage_folder + "/" + type_parts[count] + ".html", "w", encoding='utf-8') as file:
                    file.write(" ".join(file_content_prettified_list[previous_index:next_level_index]))
                    file.close()
                store_index_list.append((previous_index, next_level_index))
                previous_index = next_level_index
                count += 1
            else:
                pass
    elif len(list_of_values) == 2:
        for two_values in list_of_values:
            values_parents_finder = page_soup.find(text=re.compile(two_values[0]))
            parent_0_value = values_parents_finder.findParents()[0].text.strip().upper()
            parent_1_value = values_parents_finder.findParents()[1].text.strip().upper()
            parent_0_value = parent_0_value.replace(u'\xa0', u' ')
            parent_1_value = parent_1_value.replace(u'\xa0', u' ')
            parent_0_value = re.sub(' +', '', parent_0_value)
            parent_1_value = re.sub(' +', '', parent_1_value)
            if ((parent_0_value[0] == 'P' and parent_0_value[-1] in ["I", "V"]) or (parent_1_value[0] == 'P' and (parent_1_value[-1] in ["I", "V"] or parent_1_value[-2:] in ["I.", "V."]))):
                if (parent_0_value[:4].upper() == 'PART' and parent_0_value[-1] in ["I", "V"]):
                    temp_name = re.sub('t', 't ', parent_0_value)
                    temp_name = re.sub('T', 'T ', parent_0_value)
                else:
                    temp_name = re.sub('t', 't ', parent_1_value)
                    temp_name = re.sub('T', 'T ', parent_1_value)
                if (temp_name not in part_repeat_storage_list):

                    part_repeat_storage_list.append(temp_name)
                    next_level_index = prev_index
                    Part_page_number[temp_name.upper()] = page_number
                    with open(output_file_storage_folder + "/" + type_parts[count] + ".html", "w", encoding='utf-8') as file:
                        file.write(" ".join(file_content_prettified_list[previous_index:indices[indices.index(next_level_index) + 1]]))
                        file.close()
                    store_index_list.append((previous_index, next_level_index))
                    previous_index = next_level_index
                    count += 1


for link in input_files:
    html = urlopen(link).read().decode('utf-8')
    name = link.split('/')[-1]
    with open(input_folder + "/" + name, 'w', encoding='utf-8') as f:
        f.write(html)
        f.close()


for links in input_files:
    files = links.split("/")[-1]
    file_name = os.path.join(input_folder, files)
    print(file_name)
    output_file_storage_folder = os.path.join(output_path, files)
    if not os.path.exists(output_file_storage_folder):
        os.makedirs(output_file_storage_folder)
    try:
        file_content_reading = open(file_name, encoding="utf8").read()
    except Exception as e:
        print(e)
    file_content_bs = BeautifulSoup(file_content_reading, 'lxml')
    file_content_prettified_list = file_content_bs.prettify().split("\n")
    file_content_space_removed = [tags_values.strip() for tags_values in file_content_prettified_list]

    page_splits = file_content_bs.find_all(attrs={'style': re.compile('page-break-before|page-break-after', re.IGNORECASE)})
    if (len(page_splits) < 90):
        page_splits = page_splits
        indices = [index_number for index_number, html_tags in enumerate(file_content_space_removed) if ('page-break-after' in html_tags.lower() or 'page-break-before' in html_tags.lower())]
    else:
        page_splits = [tag_value for tag_value in page_splits if str(tag_value)[:2] != "<p"]
        indices = [index_number for index_number, html_tags in enumerate(file_content_space_removed) if ('page-break-after' in html_tags.lower() or 'page-break-before' in html_tags.lower())]

    type_parts = ['PART 0', 'PART I', 'PART II', 'PART III', 'PART IV']
    previous_index = 0
    store_index_list = []
    part_repeat_storage_list = []
    count = 0

    Part_page_number = {"PART 0": 0, "PART I": np.nan, "PART II": np.nan, "PART III": np.nan, "PART IV": np.nan}

    prev_index = 0
    count_page_number = 1

    for index_value in indices:
        next_index = index_value
        page_segmentation(file_content_space_removed[prev_index:index_value], prev_index, count_page_number)
        prev_index = next_index
        count_page_number += 1
    page_segmentation(file_content_space_removed[next_index:], prev_index, count_page_number)

    if (len(store_index_list) != 0):
        with open(output_file_storage_folder + "/" + type_parts[count] + ".html", "w", encoding='utf-8') as file:
            file.write(" ".join(file_content_prettified_list[store_index_list[-1][-1]:]))
            file.close()
    else:
        with open(output_file_storage_folder + "/" + type_parts[count] + ".html", "w", encoding='utf-8') as file:
            file.write(" ".join(file_content_prettified_list[:]))
            file.close()

    Part_page_number['File_Name'] = files
    list_values_page_number.append(Part_page_number)

df_summary = pd.DataFrame(list_values_page_number)
df_summary.to_excel("summary_10K_Page_Segmentation.xlsx", index=False)

With the code above, I am unable to split the html files the way I want.

EDIT:

I have added a new set of URLs.

https://www.sec.gov/Archives/edgar/data/887921/000088792119000004/rev201810-k.htm
https://www.sec.gov/Archives/edgar/data/104918/000010491819000053/ava-20181231x10k.htm
https://www.sec.gov/Archives/edgar/data/886982/000119312519050198/d669877d10k.htm
https://www.sec.gov/Archives/edgar/data/878927/000156459019004755/odfl-10k_20181231.htm
https://www.sec.gov/Archives/edgar/data/785161/000078516119000011/ehc10k123118.htm
https://www.sec.gov/Archives/edgar/data/1393818/000119312519061011/d663205d10k.htm
https://www.sec.gov/Archives/edgar/data/86521/000008652119000014/sre20181231form10k.htm
https://www.sec.gov/Archives/edgar/data/76282/000007628219000021/pkoh20181231-10k.htm
https://www.sec.gov/Archives/edgar/data/883237/000088323719000026/vrts1231201810-k.htm
https://www.sec.gov/Archives/edgar/data/883945/000088394519000016/usak-20181231.htm
https://www.sec.gov/Archives/edgar/data/1000623/000100062319000048/swmform10-k12312018.htm

Best Answer

Well, I wrote this quickly, so it is somewhat convoluted.

Let me explain the code:

  1. Split the document on the elements that separate the pages ( <hr style="page-break-after:always"></hr> ).
  2. Within each split page, find the text that marks a PART and combine the content accordingly.
  3. Save.

I will paste the code below. I hope it helps.

import requests
from bs4 import BeautifulSoup

# verify=False skips TLS certificate verification; drop it if your environment trusts sec.gov.
response = requests.get("https://www.sec.gov/Archives/edgar/data/763744/000076374419000018/lcii-20181231.htm", verify=False)
file_content_reading = response.text

# 1. Split the document on the <hr> elements that separate printed pages.
split_pages = file_content_reading.split('<hr style="page-break-after:always"></hr>')

# Pages containing these strings (the table of contents) are passed over without a cut.
skip_words = ['INDEX']

part_strings = [['PART I', 'PART I.', 'PART I. '],
                ['PART II', 'PART II.', 'PART II. '],
                ['PART III', 'PART III.', 'PART III. '],
                ['PART IV', 'PART IV.', 'PART IV. ']]

part_content_list = []
append_content = ""
part = 0


def matching_result(content_soup, list_string):
    # Return the first string in list_string that appears as a <span>'s text, else None.
    result = None
    for match_string in list_string:
        if content_soup.find("span", text=match_string) is not None:
            result = match_string
            break
    return result


# 2. Walk the pages; each time the next "PART ..." heading appears, cut the page there:
# everything before the heading closes the previous part, the rest opens the new one.
for page in split_pages:
    content = BeautifulSoup(page, "lxml")
    if (part < len(part_strings)) and matching_result(content, skip_words) is None:
        output = matching_result(content, part_strings[part])
        if output is not None:
            part += 1
            index = page.find(str(content.find("span", text=output)))
            first = page[:index]
            second = page[index:]
            part_content_list.append(append_content + first)
            # Carry only the remainder of this page into the new part.
            append_content = second
        else:
            append_content += page + '<hr style="page-break-after:always"></hr>'
    else:
        append_content += page

part_content_list.append(append_content)

# 3. Save each accumulated part as its own html file.
num = 0
for part in part_content_list:
    soup = BeautifulSoup(part, "lxml")
    with open("output" + str(num) + ".html", "w", encoding="utf-8") as file:
        file.write(str(soup))
    num += 1
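Two practical notes on running this. First, SEC.gov may reject automated requests that do not declare a User-Agent, so a bare requests.get can come back with an error page rather than the filing. Second, not every filing writes the page-break <hr> in exactly this form; the question's own code matches both page-break-before and page-break-after, case-insensitively. A hedged variant of the download-and-split step covering both points (the User-Agent value is a placeholder; substitute your own contact details):

import re
import requests

url = "https://www.sec.gov/Archives/edgar/data/763744/000076374419000018/lcii-20181231.htm"
# SEC EDGAR asks automated clients to identify themselves; this value is a placeholder.
headers = {"User-Agent": "Sample Company admin@example.com"}
file_content_reading = requests.get(url, headers=headers).text

# Split on any <hr ...> whose style mentions a page break, not just one exact tag string.
split_pages = re.split(
    r'<hr[^>]*page-break-(?:before|after)[^>]*>(?:\s*</hr>)?',
    file_content_reading,
    flags=re.IGNORECASE,
)
print(len(split_pages), "pages")

The resulting split_pages list can be fed straight into the loop above in place of the exact-string split.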

Regarding python - how to split one html page into multiple html files based on keywords in the text, a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/59550860/
