gpt4 book ai didi

python - 在 Python 2.5 中将未知编码的文本解码为 Unicode 的最佳方法

转载 作者:太空狗 更新时间:2023-10-29 14:05:34 32 4
gpt4 key购买 nike

<分区>

我的理解是否正确?无论如何,我正在解析大量 HTML,但并不总是知道它实际使用的是什么编码(声明了错误编码的页面数量多得惊人)。下面的代码展示了我目前的做法,但我确信有更好的方法。非常感谢您的建议。

import logging
import codecs
from utils.error import Error

class UnicodingError(Error):
    """Raised when no candidate encoding could decode the input."""
    pass

# Candidate codec names, listed most-likely-first: keeping the probable
# encodings at the front is what saves time when searching for a match.
encodings = [
    # Plain ASCII and UTF-8
    "ascii", "utf_8",
    # Traditional Chinese
    "big5", "big5hkscs",
    # IBM / Windows code pages
    "cp037", "cp424", "cp437", "cp500", "cp737", "cp775",
    "cp850", "cp852", "cp855", "cp856", "cp857",
    "cp860", "cp861", "cp862", "cp863", "cp864", "cp865", "cp866", "cp869",
    "cp874", "cp875", "cp932", "cp949", "cp950",
    "cp1006", "cp1026", "cp1140",
    "cp1250", "cp1251", "cp1252", "cp1253", "cp1254",
    "cp1255", "cp1256", "cp1257", "cp1258",
    # EUC, GB and HZ families
    "euc_jp", "euc_jis_2004", "euc_jisx0213", "euc_kr",
    "gb2312", "gbk", "gb18030", "hz",
    # ISO-2022 family
    "iso2022_jp", "iso2022_jp_1", "iso2022_jp_2", "iso2022_jp_2004",
    "iso2022_jp_3", "iso2022_jp_ext", "iso2022_kr",
    # ISO-8859 family (latin_1 is ISO-8859-1)
    "latin_1",
    "iso8859_2", "iso8859_3", "iso8859_4", "iso8859_5", "iso8859_6",
    "iso8859_7", "iso8859_8", "iso8859_9", "iso8859_10",
    "iso8859_13", "iso8859_14", "iso8859_15",
    # Johab and KOI8
    "johab", "koi8_r", "koi8_u",
    # Macintosh code pages
    "mac_cyrillic", "mac_greek", "mac_iceland",
    "mac_latin2", "mac_roman", "mac_turkish",
    "ptcp154",
    # Shift-JIS family
    "shift_jis", "shift_jis_2004", "shift_jisx0213",
    # Remaining UTF variants
    "utf_32", "utf_32_be", "utf_32_le",
    "utf_16", "utf_16_be", "utf_16_le",
    "utf_7", "utf_8_sig",
]

def unicode(string, candidates=None):
    """Decode a byte string by trying candidate encodings in order.

    Args:
        string: The encoded byte string to decode.
        candidates: Optional ordered iterable of codec names to try.
            Defaults to the module-level ``encodings`` list.

    Returns:
        The text produced by the first codec that decodes ``string``
        without raising.

    Raises:
        UnicodingError: If every candidate encoding fails.

    NOTE: a successful decode only means the bytes were *valid* in that
    codec, not that the codec is the one the author used; permissive
    single-byte codecs accept nearly any input, so order matters.
    """
    if candidates is None:
        candidates = encodings
    for enc in candidates:
        logging.debug("unicoder is trying %s encoding", enc)
        try:
            # Use the decode method rather than the ``unicode`` builtin:
            # this function shadows that name, so calling it here would
            # recurse into ourselves instead of decoding.
            decoded = string.decode(enc)
        except (UnicodeError, LookupError):
            # UnicodeError: the bytes are not valid in this codec.
            # LookupError: the codec name is unknown to this interpreter.
            continue
        logging.info("unicoder is using %s encoding", enc)
        return decoded
    # Reached only when no candidate succeeded (or none were supplied).
    raise UnicodingError("still don't recognise encoding after trying do guess.")

32 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com