gpt4 book ai didi

html - 从命令行操作、处理 HTML

转载 作者:太空狗 更新时间:2023-10-29 11:06:46 25 4
gpt4 key购买 nike

我正在寻找一种从命令行处理 HTML 代码的方法(可能使用 XPATH)。

例如,我想删除 .container 类,或者在 .container 类之后添加一个新的 <div>。

输入:

<div class="bg-detail2" id="geometry">
<div class="container">
<h2>Title</h2>
<div class="line"></div>
<div class="fix"></div>
<div class="col50">
Content
</div>
<div class="col50">
Another Content
</div>
</div>
</div>

输出:

<div class="bg-detail2" id="geometry">
<div class="container">
<div class="newdiv">
<div class="line"></div>
<div class="fix"></div>
<div class="col50">
Content
</div>
<div class="col50">
Another Content
</div>
</div>
</div>
</div>

我的第一个想法是使用 sed,但这种方法并不可靠。我知道 xmllint,但它只能读取 HTML 文件。

是否有任何其他可用于命令行的工具?

最佳答案

我找不到一个程序来做你想做的事。所以我做了一个。 现在可以使用了!

#!python3

from html.parser import HTMLParser

class HTMLPass(HTMLParser):
    """Echo a parsed HTML document back to standard output.

    Every handler re-serialises the construct it is given and prints it
    without a trailing newline, so feeding a document through this parser
    reproduces it on stdout.  Subclasses override individual handlers to
    drop or rewrite parts of the stream.

    All handlers except ``handle_starttag`` need no parser state and are
    static methods, so they can also be called directly on the class.
    """

    def __init__(self, *a, convert_charrefs=False, **k):
        """Create the parser.

        ``convert_charrefs`` defaults to ``False`` here so that character
        and entity references reach ``handle_charref`` and
        ``handle_entityref`` and can be echoed in their original spelling.
        All other arguments are forwarded to ``HTMLParser`` unchanged.
        """
        super().__init__(*a, convert_charrefs=convert_charrefs, **k)

    def handle_starttag(self, tag, attrs):
        """Print the start tag using the exact text it had in the input."""
        print(end=self.get_starttag_text())

    @staticmethod
    def handle_endtag(tag):
        """Print ``</tag>``; only the tag name is available to rebuild it."""
        print(end="</" + tag + ">")

    # A self-closing tag is echoed from its source text as well.
    handle_startendtag = handle_starttag

    @staticmethod
    def handle_data(data):
        """Print text content unchanged."""
        print(end=data)

    @staticmethod
    def handle_entityref(name):
        """Print a named reference as ``&name;``."""
        print(end="&"+name+";")

    @staticmethod
    def handle_charref(name):
        """Print a numeric reference as ``&#name;``."""
        print(end="&#"+name+";")

    @staticmethod
    def handle_comment(data):
        """Print a comment as ``<!--data-->``."""
        print(end="<!--"+data+"-->")

    @staticmethod
    def handle_decl(decl):
        """Print a declaration such as a doctype as ``<!decl>``."""
        print(end="<!"+decl+">")

    @staticmethod
    def handle_pi(data):
        """Print a processing instruction as ``<?data>``."""
        print(end="<?"+data+">")

    @staticmethod
    def unknown_decl(data):
        """Print a marked section such as ``<![CDATA[...]]>``.

        ``data`` is the text between the opening ``<![`` and the closing
        bracket sequence, so both have to be written back.  Sections whose
        keyword is ``if``/``else``/``endif`` are closed with ``]>``; every
        other section is closed with ``]]>``.
        """
        keyword = data.lstrip().lower()
        closer = "]>" if keyword.startswith(("if", "else", "endif")) else "]]>"
        print(end="<![" + data + closer)

class HTMLPassMod(HTMLPass):
    """Echo HTML to stdout while editing the elements picked by selectors.

    Each argument has the form ``<sep><selector><sep><instruction>``, where
    ``<sep>`` is any single character that appears nowhere else in the
    argument.

    ``selector`` is one or more steps joined by ``>``.  Each step is an
    optional tag name (``*`` or nothing means "any tag") followed by any
    mix of ``.class`` and ``#id`` parts.  The last step is tested against
    the element being opened, the one before it against its parent, and
    so on up the stack of open elements.

    ``instruction`` starts with a one-letter command:

    * ``d`` - drop the start and end tag, keep the children.
    * ``r`` - print the rest of the instruction instead of the start tag
      and drop the end tag, keep the children.
    * ``i`` - ``i<sep2><first><sep2><second>``: print ``first`` right after
      the start tag and ``second`` right after the end tag.
    * ``k`` - drop the element together with everything inside it.

    The first argument whose selector matches an element decides what
    happens to it.  Optional end tags are not inferred: an element stays
    open until an end tag closes it or one of its ancestors.
    """

    # Elements that never have an end tag; they are closed as soon as
    # they are opened so they cannot pile up on the stack.
    _VOID_ELEMENTS = frozenset({
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    })

    def __init__(self, *a, argv=None, **k):
        """Create the parser and parse the edit arguments.

        ``argv`` is an iterable of argument strings; when it is ``None``
        the command-line arguments ``sys.argv[1:]`` are used.  All other
        arguments are forwarded to the base class.

        Raises ``ValueError`` for an argument that is empty, does not
        consist of exactly a selector and an instruction, or names an
        unknown command.
        """
        super().__init__(*a, **k)
        # Open elements, outermost first, as (tag, attrs, action) tuples.
        # ``action`` is None for an element that is passed through.
        self.stack = []
        # One [instruction, step, step, ...] list per argument; a step is
        # [tag, (attribute, value), ...].
        self.args = []
        if argv is None:
            import sys
            argv = sys.argv[1:]
        for arg in argv:
            self.args.append(self._parse_argument(arg))

    def _parse_argument(self, arg):
        """Turn "/a#link-1.external/d" into ['d', ['a', ('id', 'link-1'), ('class', 'external')]]."""
        if not arg:
            raise ValueError("empty argument")
        pieces = arg[1:].split(arg[0])
        if len(pieces) != 2:
            raise ValueError(
                "expected <sep><selector><sep><instruction>: %r" % (arg,))
        selector, action = pieces
        # A tuple, not a string: '"" in "drik"' would accept an empty action.
        if action[:1] not in ("d", "r", "i", "k"):
            raise ValueError("unknown instruction in argument %r" % (arg,))
        return [action] + [self._parse_compound(step)
                           for step in selector.split(">")]

    @staticmethod
    def _parse_compound(selector):
        """Split one selector step into [tag, (attribute, value), ...]."""
        # Putting a "." in front of every "#" lets a single split separate
        # all parts while the leading "#" still marks which ones are ids.
        tag, *qualifiers = selector.strip().replace("#", ".#").split(".")
        compound = [tag]
        for qualifier in qualifiers:
            if qualifier.startswith("#"):
                compound.append(("id", qualifier[1:]))
            else:
                compound.append(("class", qualifier))
        return compound

    @staticmethod
    def _insert_texts(action):
        """Return the (first, second) parameters of an ``i`` instruction.

        A missing parameter is an empty string; extra ones are ignored.
        """
        if len(action) < 2:
            return "", ""
        parts = action[2:].split(action[1]) + [""]
        return parts[0], parts[1]

    @staticmethod
    def _frame_matches(frame, compound):
        """Tell whether one open element satisfies one selector step."""
        tag, attrs, _ = frame
        wanted_tag = compound[0].replace("*", "").strip().lower()
        if wanted_tag and tag != wanted_tag:
            return False
        # Only the first class attribute counts; a valueless one is empty.
        classes = next(
            ((value or "").split() for name, value in attrs
             if name == "class"),
            [])
        for attr, val in compound[1:]:
            if attr == "class":
                if val not in classes:
                    return False
            elif not any(name == attr and value == val
                         for name, value in attrs):
                return False
        return True

    def _matches(self, chain):
        """Tell whether the innermost open elements satisfy every step."""
        # Without this check a selector with more steps than there are
        # open elements would match on its trailing steps alone.
        if len(chain) > len(self.stack):
            return False
        return all(self._frame_matches(frame, compound)
                   for frame, compound
                   in zip(reversed(self.stack), reversed(chain)))

    def _killed(self):
        """Tell whether the innermost open element is being dropped."""
        if not self.stack:
            return False
        action = self.stack[-1][2]
        return action is not None and action[:1] == "k"

    def _open(self, tag, attrs):
        """Push a frame for the element and print whatever replaces its start tag."""
        if self._killed():
            # kill means kill: mark the child as killed too, so that its
            # own children and text are dropped as well
            self.stack.append((tag, attrs, "k"))
            return
        self.stack.append((tag, attrs, None))
        for action, *chain in self.args:
            if not self._matches(chain):
                continue
            self.stack[-1] = (tag, attrs, action)
            command = action[0]
            if command == "r":
                print(end=action[1:])
            elif command == "i":
                super().handle_starttag(tag, attrs)
                print(end=self._insert_texts(action)[0])
            # "d" and "k" print nothing for the start tag
            return
        super().handle_starttag(tag, attrs)

    def handle_starttag(self, tag, attrs):
        """Open an element; a void element is closed again immediately."""
        self._open(tag, attrs)
        if tag in self._VOID_ELEMENTS:
            self.stack.pop()

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag: open the element and close it at once."""
        self._open(tag, attrs)
        self.stack.pop()

    def handle_endtag(self, tag):
        """Close the innermost open ``tag`` element and finish its action.

        Elements opened after it that were never closed are closed
        together with it.  An end tag with no open element of that name
        is echoed unchanged, unless the output is currently being dropped.
        """
        if all(frame[0] != tag for frame in self.stack):
            if not self._killed():
                super().handle_endtag(tag)
            return
        while True:
            frame = self.stack.pop()
            # Only the frame the end tag belongs to prints an end tag;
            # frames closed along the way had none in the input.
            explicit = frame[0] == tag
            action = frame[2]
            if action is None:
                if explicit:
                    super().handle_endtag(tag)
            elif action[:1] == "i":
                if explicit:
                    super().handle_endtag(tag)
                print(end=self._insert_texts(action)[1])
            # "d", "r" and "k" print nothing for the end tag
            if explicit:
                return

    def handle_data(self, data):
        """Echo text unless it lies inside a killed element."""
        if not self._killed():
            super().handle_data(data)

    def handle_entityref(self, name):
        """Echo a named reference unless it lies inside a killed element."""
        if not self._killed():
            super().handle_entityref(name)

    def handle_charref(self, name):
        """Echo a numeric reference unless it lies inside a killed element."""
        if not self._killed():
            super().handle_charref(name)

    def handle_comment(self, data):
        """Echo a comment unless it lies inside a killed element."""
        if not self._killed():
            super().handle_comment(data)

    def handle_decl(self, decl):
        """Echo a declaration unless it lies inside a killed element."""
        if not self._killed():
            super().handle_decl(decl)

    def handle_pi(self, data):
        """Echo a processing instruction unless it lies inside a killed element."""
        if not self._killed():
            super().handle_pi(data)

    def unknown_decl(self, data):
        """Echo a marked section unless it lies inside a killed element."""
        if not self._killed():
            super().unknown_decl(data)

def run(pass_through=HTMLPassMod):
    """Filter standard input through a parser that prints to stdout.

    ``pass_through`` is called with no arguments to build the parser, so
    the default class takes its edit instructions from the command line.

    Lines are fed exactly as read, so an input that does not end in a
    newline does not gain one.
    """
    import sys

    parser = pass_through()
    for line in sys.stdin:
        parser.feed(line)
    # Flush whatever the parser is still holding back.
    parser.close()

# Act as a command-line filter only when executed as a script, not on import.
if __name__ == "__main__":
    run()

此代码糟糕,但实际上可以正常运行,包括在许多边缘情况下。

示例用法:

wizzwizz4@wizzwizz4Laptop:~$ cat example_input.html
<div class="bg-detail2" id="geometry">
<div class="container">
<h2>Title</h2>
<div class="line"></div>
<div class="fix"></div>
<div class="col50">
Content
</div>
<div class="col50">
Another Content
</div>
</div>
</div>
wizzwizz4@wizzwizz4Laptop:~$ <example_input.html ./rubbish_program.py '~div.newdiv~r<h2>Title</h2>'
<div class="bg-detail2" id="geometry">
<div class="container">
<h2>Title</h2>
<div class="line"></div>
<div class="fix"></div>
<div class="col50">
Content
</div>
<div class="col50">
Another Content
</div>
</div>
</div>
wizzwizz4@wizzwizz4Laptop:~$ cat example_input_2.html
<div class="bg-detail2" id="geometry">
<div class="container">
<h2>Title</h2>
<div class="line"></div>
<div class="fix"></div>
<div class="col50">
Content
</div>
<div class="col50">
Another Content
</div>
</div>
</div>
wizzwizz4@wizzwizz4Laptop:~$ <example_input_2.html ./rubbish_program.py 'Jdiv.containerJi~<div class="newdiv">~</div>' '\.container > h2\k'
<div class="bg-detail2" id="geometry">
<div class="container"><div class="newdiv">

<div class="line"></div>
<div class="fix"></div>
<div class="col50">
Content
</div>
<div class="col50">
Another Content
</div>
</div></div>
</div>

语法

./rubbish_program.py [argument...]

其中 argument 的形式为:

<separator><selector><separator><instruction>

其中:

  • separator 是单个字符,不得出现在 selector 或 instruction 中。
  • selector 是一系列形如 tag.class1.class2#id.class3 的项,各项之间由 > 分隔;每一项中 tag 是可选的,最多只能有一个 #id,.classn 的数量不限。示例:div#geometry > .container > h2
  • instruction 是以下形式的指令:

    <command><parameters>

    其中 command 是以下之一:

    • d – 移除元素而不移除其子元素。不带参数。
    • r – 用 parameters 替换开始标签,并删除结束标签,但不删除元素的子元素。
    • i – 有两种不同的行为,具体取决于标签是否自闭合。

      • 如果不是自闭合标签,则在开始标签之后(内容之前)插入第一个参数,在结束标签之后插入第二个参数。
      • 如果是自闭合标签,则紧跟在标签后面插入第一个参数,并忽略其余参数。

      parameters 的形式为:

      <separator2><first parameter><separator2><second parameter>[<separator2>discarded]

      separator2 不得出现在任何一个参数中,并且必须不同于 separator。它在不同的调用中可以有不同的值。

    • k – 删除元素及其子元素。不带参数。

关于html - 从命令行操作、处理 HTML,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/54700212/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com