gpt4 book ai didi

c++ - LZW减压

转载 作者:行者123 更新时间:2023-12-02 10:02:12 38 4
gpt4 key购买 nike

我正在C++中实现LZW算法。

字典的大小是用户输入的,但最小值为256,因此它应适用于二进制文件。如果到达字典的末尾,它将绕到索引0,然后从那里开始覆盖它。

例如,如果我放入alice in wonderland script并使用字典大小512对其进行压缩,则会得到this dictionary

但是我对解压缩和从解压缩压缩文件looks like this的输出字典有问题。

我的解压缩代码如下所示

struct dictionary
{
vector<unsigned char> entry;
vector<bool> bits;
};

void decompress(dictionary dict[], vector<bool> file, int dictionarySize, int numberOfBits)
{
//in this example
//dictionarySize = 512, tells the max size of the dictionary, and goes back to 0 if it reaches 513
//numberOfBits = log2(512) = 9
//dictionary dict[] contains bits and strings (strings can be empty)
// dict[0] =
// entry = (unsigned char)0
// bits = (if numberOfBits = 9) 000000001
// dict[255] =
// entry = (unsigned char)255
// bits = (if numberOfBits = 9) 011111111
// so the next entry will be dict[next] (next is currently 256)
// dict[256] =
// entry = what gets added in the code below
// bits = 100000000
// all the bits are already set previously (dictionary size is int dictionarySize) so in this case all the bits from 0 to 511 are already set, entries are set from 0 to 255, so extended ASCII


vector<bool> currentCode;
vector<unsigned char> currentString;
vector<unsigned char> temp;

int next=256;
bool found=false;

for(int i=0;i<file.size();i+=numberOfBits)
{
for(int j=0;j<numberOfBits;j++)
{
currentCode.push_back(file[i+j]);
}

for(int j=0;j<dictionarySize;j++)
{
// when the currentCode (size numberOfBits) gets found in the dictionary
if(currentCode==dict[j].bits)
{
currentString = dict[j].entry;

// if the current string isnt empty, then it means it found the characted in the dictionary
if(!currentString.empty())
{
found = true;
}
}
}

//if the currentCode in the dictionary has a string value attached to it
if(found)
{
for(int j=0;j<currentString.size();j++)
{
cout<<currentString[j];
}

temp.push_back(currentString[0]);

// so it doesnt just push 1 character into the dictionary
// example, if first read character is 'r', it is already in the dictionary so it doesnt get added
if(temp.size()>1)
{
// if next is more than 511, writing to that index would cause an error, so it resets back to 0 and goes back up
if(next>dictionarySize-1) //next > 512-1
{
next = 0;
}
dict[next].entry.clear();
dict[next].entry = temp;
next++;
}

//temp = currentString;
}
else
{
currentString = temp;
currentString.push_back(temp[0]);

for(int j=0;j<currentString.size();j++)
{
cout<<currentString[j];
}

// if next is more than 511, writing to that index would cause an error, so it resets back to 0 and goes back up
if(next>dictionarySize-1)
{
next = 0;
}
dict[next].entry.clear();
dict[next].entry = currentString;
next++;

//break;
}

temp = currentString;

// currentCode gets cleared, and written into in the next iteration
currentCode.clear();

//cout<<endl;
found = false;
}
}

我目前陷入困境,不知道该怎么解决以修复输出。
我还注意到,如果我把字典放的足够大,那么它就不会绕过字典(它不会到达末尾并从0开始重新开始),它可以工作。

最佳答案

  • 开始小

    您使用的文件中有太多数据需要调试。从字符串开始。我从Wikli举了一个很好的例子:
    Input: "abacdacacadaad"

    step input match output new_entry new_index
    a 0
    b 1
    c 2
    d 3
    1 abacdacacadaad a 0 ab 4
    2 bacdacacadaad b 1 ba 5
    3 acdacacadaad a 0 ac 6
    4 cdacacadaad c 2 cd 7
    5 dacacadaad d 3 da 8
    6 acacadaad ac 6 aca 9
    7 acadaad aca 9 acad 10
    8 daad da 8 daa 11
    9 ad a 0 ad 12
    10 d d 3

    Output: "0102369803"

    因此,您可以通过交叉匹配输入/输出和字典内容来逐步调试代码。一旦正确完成,就可以对解码进行相同的操作:
    Input: "0102369803"

    step input output new_entry new_index
    a 0
    b 1
    c 2
    d 3
    1 0 a
    2 1 b ab 4
    3 0 a ba 5
    4 2 c ac 6
    5 3 d cd 7
    6 6 ac da 8
    7 9 aca aca 9
    8 8 da acad 10
    9 0 a daa 11
    10 3 d ad 12

    Output: "abacdacacadaad"

    然后,移至文件并清除字典处理。
  • 比特流

    成功完成小写字母的LZW后,您可以尝试使用完整的字母和位编码。您知道LZW流可以以任何位长编码(不仅仅是8/16/32/64位),这可以极大地影响压缩率(就使用的数据属性而言)。因此,我将尝试对变量(或预定义的位长)的数据进行统一访问。

  • 有点好奇,所以我为压缩编码了一个简单的C++ / VCL示例:

    //---------------------------------------------------------------------------
    // LZW
    const int LZW_bits=12; // encoded bitstream size
    const int LZW_size=1<<LZW_bits; // dictinary size
    // bitstream R/W
    DWORD bitstream_tmp=0;
    //---------------------------------------------------------------------------
    // return LZW_bits from dat[adr,bit] and increment position (adr,bit)
    DWORD bitstream_read(BYTE *dat,int siz,int &adr,int &bit,int bits)
    {
    DWORD a=0,m=(1<<bits)-1;
    // save tmp if enough bits
    if (bit>=bits){ a=(bitstream_tmp>>(bit-bits))&m; bit-=bits; return a; }
    for (;;)
    {
    // insert byte
    bitstream_tmp<<=8;
    bitstream_tmp&=0xFFFFFF00;
    bitstream_tmp|=dat[adr]&255;
    adr++; bit+=8;
    // save tmp if enough bits
    if (bit>=bits){ a=(bitstream_tmp>>(bit-bits))&m; bit-=bits; return a; }
    // end of data
    if (adr>=siz) return 0;
    }
    }
    //---------------------------------------------------------------------------
    // write LZW_bits from a to dat[adr,bit] and increment position (adr,bit)
    // return true if buffer is full
    bool bitstream_write(BYTE *dat,int siz,int &adr,int &bit,int bits,DWORD a)
    {
    a<<=32-bits; // align to MSB
    // save tmp if aligned
    if ((adr<siz)&&(bit==32)){ dat[adr]=(bitstream_tmp>>24)&255; adr++; bit-=8; }
    if ((adr<siz)&&(bit==24)){ dat[adr]=(bitstream_tmp>>16)&255; adr++; bit-=8; }
    if ((adr<siz)&&(bit==16)){ dat[adr]=(bitstream_tmp>> 8)&255; adr++; bit-=8; }
    if ((adr<siz)&&(bit== 8)){ dat[adr]=(bitstream_tmp )&255; adr++; bit-=8; }
    // process all bits of a
    for (;bits;bits--)
    {
    // insert bit
    bitstream_tmp<<=1;
    bitstream_tmp&=0xFFFFFFFE;
    bitstream_tmp|=(a>>31)&1;
    a<<=1; bit++;
    // save tmp if aligned
    if ((adr<siz)&&(bit==32)){ dat[adr]=(bitstream_tmp>>24)&255; adr++; bit-=8; }
    if ((adr<siz)&&(bit==24)){ dat[adr]=(bitstream_tmp>>16)&255; adr++; bit-=8; }
    if ((adr<siz)&&(bit==16)){ dat[adr]=(bitstream_tmp>> 8)&255; adr++; bit-=8; }
    if ((adr<siz)&&(bit== 8)){ dat[adr]=(bitstream_tmp )&255; adr++; bit-=8; }
    }
    return (adr>=siz);
    }
    //---------------------------------------------------------------------------
    bool str_compare(char *s0,int l0,char *s1,int l1)
    {
    if (l1<l0) return false;
    for (;l0;l0--,s0++,s1++)
    if (*s0!=*s1) return false;
    return true;
    }
    //---------------------------------------------------------------------------
    AnsiString LZW_encode(AnsiString raw)
    {
    AnsiString lzw="";
    int i,j,k,l;
    int adr,bit;
    DWORD a;
    const int siz=32; // bitstream buffer
    BYTE buf[siz];
    AnsiString dict[LZW_size]; // dictionary
    int dicts=0; // actual size of dictionary

    // init dictionary
    for (dicts=0;dicts<256;dicts++) dict[dicts]=char(dicts); // full 8bit binary alphabet
    // for (dicts=0;dicts<4;dicts++) dict[dicts]=char('a'+dicts); // test alphabet "a,b,c,d"

    l=raw.Length();
    adr=0; bit=0;
    for (i=0;i<l;)
    {
    i&=i;
    // find match in dictionary
    for (j=dicts-1;j>=0;j--)
    if (str_compare(dict[j].c_str(),dict[j].Length(),raw.c_str()+i,l-i))
    {
    i+=dict[j].Length();
    if (i<l) // add new entry in dictionary (if not end of input)
    {
    // clear dictionary if full
    if (dicts>=LZW_size) dicts=256; // full 8bit binary alphabet
    // if (dicts>=LZW_size) dicts=4; // test alphabet "a,b,c,d"
    else{
    dict[dicts]=dict[j]+AnsiString(raw[i+1]); // AnsiString index starts from 1 hence the +1
    dicts++;
    }
    }
    a=j; j=-1; break; // full binary output
    // a='0'+j; j=-1; break; // test ASCII output
    }
    // store result to bitstream
    if (bitstream_write(buf,siz,adr,bit,LZW_bits,a))
    {
    // append buf to lzw
    k=lzw.Length();
    lzw.SetLength(k+adr);
    for (j=0;j<adr;j++) lzw[j+k+1]=buf[j];
    // reset buf
    adr=0;
    }
    }
    if (bit)
    {
    // store the remainding bits with zeropad
    bitstream_write(buf,siz,adr,bit,LZW_bits-bit,0);
    }
    if (adr)
    {
    // append buf to lzw
    k=lzw.Length();
    lzw.SetLength(k+adr);
    for (j=0;j<adr;j++) lzw[j+k+1]=buf[j];
    }
    return lzw;
    }
    //---------------------------------------------------------------------------
    AnsiString LZW_decode(AnsiString lzw)
    {
    AnsiString raw="";
    int adr,bit,siz,ix;
    DWORD a;
    AnsiString dict[LZW_size]; // dictionary
    int dicts=0; // actual size of dictionary

    // init dictionary
    for (dicts=0;dicts<256;dicts++) dict[dicts]=char(dicts); // full 8bit binary alphabet
    // for (dicts=0;dicts<4;dicts++) dict[dicts]=char('a'+dicts); // test alphabet "a,b,c,d"

    siz=lzw.Length();
    adr=0; bit=0; ix=-1;
    for (adr=0;(adr<siz)||(bit>=LZW_bits);)
    {
    a=bitstream_read(lzw.c_str(),siz,adr,bit,LZW_bits);
    // a-='0'; // test ASCII input
    // clear dictionary if full
    if (dicts>=LZW_size){ dicts=4; ix=-1; }
    // new dictionary entry
    if (ix>=0)
    {
    if (a>=dicts){ dict[dicts]=dict[ix]+AnsiString(dict[ix][1]); dicts++; }
    else { dict[dicts]=dict[ix]+AnsiString(dict[a ][1]); dicts++; }
    } ix=a;
    // update decoded output
    raw+=dict[a];
    }
    return raw;
    }
    //---------------------------------------------------------------------------

    并使用 // test ASCII input行输出:
    txt="abacdacacadaad"
    enc="0102369803"
    dec="abacdacacadaad"

    其中 AnsiString是我使用的唯一VCL东西,它只是自我分配的字符串变量,请注意其索引始于 1
    AnsiString s;
    s[5] // character access (1 is first character)
    s.Length() // returns size
    s.c_str() // returns char*
    s.SetLength(size) // resize

    所以只要使用您得到的任何字符串...

    如果您没有 BYTE,DWORD,请改用 unsigned charunsigned int ...

    看起来它也适用于长文本(大于字典和/或位流缓冲区的大小)。但是请注意,清除可能会在几个不同的代码位置进行,但必须在编码器/解码器中都进行同步,否则清除数据后会破坏数据。

    该示例可以只使用 "a,b,c,d"字母,也可以使用完整的8it字母。当前设置为8bit。如果要更改它,只需取消rem的 // test ASCII input行并取消rem的代码中的 // full 8bit binary alphabet行。

    要测试交叉缓冲区和边界,您可以玩:
    const int LZW_bits=12;              // encoded bitstream size
    const int LZW_size=1<<LZW_bits; // dictinary size

    以及:
    const int siz=32;                   // bitstream buffer

    常数...也会影响效果,因此请根据自己的喜好进行调整。
    当心 bitstream_write没有经过优化,可以大大提高速度...

    同样,为了调试4位对齐的编码,我使用了十六进制打印编码数据(十六进制字符串是其ASCII版本的两倍),如下所示(忽略VCL内容):
    AnsiString txt="abacdacacadaadddddddaaaaaaaabcccddaaaaaaaaa",enc,dec,hex;
    enc=LZW_encode(txt);
    dec=LZW_decode(enc);

    // convert to hex
    hex=""; for (int i=1,l=enc.Length();i<=l;i++) hex+=AnsiString().sprintf("%02X",enc[i]);

    mm_log->Lines->Add("\""+txt+"\"");
    mm_log->Lines->Add("\""+hex+"\"");
    mm_log->Lines->Add("\""+dec+"\"");
    mm_log->Lines->Add(AnsiString().sprintf("ratio: %i%",(100*enc.Length()/dec.Length())));

    结果:
    "abacdacacadaadddddddaaaaaaaabcccddaaaaaaaaa"
    "06106206106306410210510406106410FFFFFF910A10706110FFFFFFD10E06206311110910FFFFFFE11410FFFFFFD0"
    "abacdacacadaadddddddaaaaaaaabcccddaaaaaaaaa"
    ratio: 81%

    关于c++ - LZW减压,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/62086402/

    38 4 0
    Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
    广告合作:1813099741@qq.com 6ren.com