gpt4 book ai didi

python - 压缩此数据的更快方法是什么?

转载 作者:行者123 更新时间:2023-11-28 18:48:49 25 4
gpt4 key购买 nike

我有一个程序可以计算大量数据并将其写入文件。我的数据只是一堆 0-16 的数字(17 个不同的值),我计算了每个数字在数据中出现的频率。我需要使用尽可能少的磁盘空间和尽可能少的 RAM,所以我用纯 python 编写了一个小的 Huffman 编码/解码模块,它一次在内存中写入/读取带有尽可能少的编码符号的压缩数据。有没有python自带的模块可以做类似的事情?下面是代码,其中包含一些如何使用它的示例(警告代码有点长,注释得体):

def makeTree(data):
"""data is a list of tuples, whos first entry is a priority/frequency
number, and whos second entry is tuple containing the data to
be encoded. The tree uses an internal tag system to tell where the
branch ends. (End nodes are terminated with a False)"""
def tag(data):
taggedData = []
for priority, datum in data:
#all of the initial data is an end branch
taggedData += [(priority, (datum, False))]
return taggedData
#get the tagged data into decending order of priority/frequency
tree = sorted(tag(data), reverse=True)
while len(tree)>1:
#get the two lowest priority branches
bottomR, bottomL = tree.pop(), tree.pop()
#and stick them together into a new branch with combined priority
new_elem = bottomR[0]+bottomL[0], ((bottomL, bottomR), True)
#then add them back to the list of branches and sort
tree += [new_elem]
tree.sort(reverse=True)
return tree[0]

def makeTable(tree, code=""):
"""Takes a tree such as generated by makeTree and returns a dictionary
of code:value pairs."""
#if this is an end branch, return code:value pair
if tree[1][1]==False:
return {code:tree[1][0]}
#otherwise find the tables for the left and right branches
#add them to the main table, and return
table = {}
table.update(makeTable(tree[1][0][0], code+'0')) #left
table.update(makeTable(tree[1][0][1], code+'1')) #right
return table

class Decoder:
"""this class creates a Decoder object which is used to decode a compressed
file using the appropriate decoding table (duh). It used to be a function,
but it was buggy and would also be ugly if I left it a function. (this
class was written After the Encdoer class.)
"""
def __init__(self, fname, table):
self.file = open(fname)
self.table = table
self.byte = None
self.bit = 7
self.newByte = True

def decode(self, size=1):
"""Decodes and yields size number of symbols from the file.
Size defaults to 1"""
#a counter for how many symbols were read
read = 0
code = ''
while read<size:
if self.newByte:
self.byte = ord(self.file.read(1))
for n in xrange(self.bit, -1, -1):
code += str((self.byte & 1<<n) >> n)
self.byte &= (1<<n)-1
if code in self.table:
yield self.table[code]
read += 1
code = ''
if read==size:
self.bit = n-1
self.newByte = False
raise StopIteration
self.bit = 7
self.newByte = True

def close(self):
self.file.close()

class Encoder:
"""This class creates an encoder object, which is used to write encoded data
to a file. It was initially going to be a function, but I couldn't
accomplish that without code getting really ugly. :p """
def __init__(self, fname, table):
self.file = open(fname, 'w')
self.table = table
self.code = ''

def encode(self, datum):
"""Attempts to write encoded datum to file. If their isn't enough
code to write a whole number amount of bytes, then the code is saved up
until there is."""
self.code += self.table[datum]
if len(self.code)%8==0:
self.__write_code_chunk()
return

def end_encode(self):
"""Writes any remaining code to the file, appending the code with
trailing zeros to fit within a byte, then closes the file."""
#if the length of the code remaining isn't a multiple of 8 bits
if len(self.code)%8:
#then add zeros to the end so that it is
self.code += "0"*(8 - len(self.code)%8)
self.__write_code_chunk()
self.file.close()
return

def __write_code_chunk(self):
bytes = len(self.code)/8
#for every byte (or every 8 bits)...
for _ in xrange(bytes):
#turn those bits into an number using int with base 2,
#then turn the number into an ascii character,
#and finally write the data to the file.
self.file.write(chr(int(self.code[:8], 2)))
#then get rid of the 8 bits just read
self.code = self.code[8:]
#make sure there is no code left over
assert self.code==''
return

if __name__=="__main__":
import random

mandelbrotData = [
(0.10776733333333334, 0),
(0.24859, 1),
(0.12407666666666667, 2),
(0.07718866666666667, 3),
(0.04594733333333333, 4),
(0.03356, 5),
(0.023286666666666664, 6),
(0.018338, 7),
(0.014030666666666667, 8),
(0.011918, 9),
(0.009500666666666668, 10),
(0.008396666666666667, 11),
(0.006936, 12),
(0.006365999999999999, 13),
(0.005466, 14),
(0.0048920000000000005, 15),
(0.2537393333333333, 16)]
decode_table = makeTable(makeTree(mandelbrotData))
encode_table = {val:key for key, val in decode_table.iteritems()}
approx_data = sum([[val]*int(round(freq*10**3/2)) for freq, val in mandelbrotData], [])
random.shuffle(approx_data)

testname = 'hufftest'
encoder = Encoder(testname, encode_table)

for val in approx_data:
encoder.encode(val)
encoder.end_encode()

decoder = Decoder(testname, decode_table)
decoded = list(decoder.decode(len(approx_data)/2))
decoded += list(decoder.decode(len(approx_data)/2))
print approx_data == decoded

是否有可以更快地执行类似操作的模块?如果没有,有什么方法可以更改我的代码以使其更快?

最佳答案

  • 在内存中,同一值的不同出现只会占用一个位置。尽管重复了对它们的引用。
  • 对于磁盘存储,我可能只是用 zlib 压缩它.

关于python - 压缩此数据的更快方法是什么?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/16223787/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com