数据压缩原理与应用 | 实验2 | LZW编解码

张开发
2026/4/4 8:33:10 15 分钟阅读
数据压缩原理与应用 | 实验2 | LZW编解码
目录一、LZW原理概述二、文件格式设计三、实验代码LZW编码过程LZW解码过程定长编码输出定长编码读写批处理Python完整代码四、结果分析部分数据集​编辑分析五、经验总结六、碎碎念一、LZW原理概述编码先构造一个包含所有ASCLL码的“字节编号”字典然后从输入数据中逐个取字符再和缓存字符串拼接如果新的字符串在字典中有编号则继续读取否则将旧的缓存字符串的编号输出并把新的字符串存入字典然后将此轮读取的字符作为缓存字符串。若字典长度大于X则不再将未保存的字符串组合存入字典。解码先构造一个包含所有ASCLL码的“编号字节”字典然后先读取一个字符作为缓存字符串接下来再逐个读取并对称解码如果遇到未保存的编号则取缓存字符串的首位与其自身拼接。二、文件格式设计本实验设计了两种文件格式分别为12bit编码和16bit编码对应字典最大长度为4096和65536两种设计方案。为了满足8bit的字节封装包采用位运算的方式将读入流对8取模拆到不同字节中存储读取文件时再对称拆包拼接得到原bit数码字。三、实验代码LZW编码过程def lzw_compress(input_bytes, max_table_sizeMAX_SIZE): bytes_dictionary {bytes([i]): i for i in range(256)} string b result [] dict_size 256 for symbol in input_bytes: symbol bytes([symbol]) combined string symbol if combined in bytes_dictionary: string combined else: result.append(bytes_dictionary[string]) if dict_size max_table_size: bytes_dictionary[combined] dict_size dict_size 1 string symbol if string: result.append(bytes_dictionary[string]) return resultLZW解码过程def lzw_decompress(compressed, max_table_sizeMAX_SIZE): if not compressed: return b num_dictionary {i: bytes([i]) for i in range(256)} dict_size 256 result bytearray() prev_code compressed[0] result num_dictionary[prev_code] string num_dictionary[prev_code] for number in compressed[1:]: if number in num_dictionary: entry num_dictionary[number] elif number dict_size: entry string string[:1] else: raise ValueError(Bad compressed code) result entry if dict_size max_table_size: num_dictionary[dict_size] string entry[:1] dict_size 1 string entry return bytes(result)定长编码输出实验采用定长编码输出码字本实验尝试了12bit和16bit两种方式分别对应字典最大长度为4096和65536两种情况因为4096是2^1265536是2^16编码比特数由编码位数决定。为了解决编码位数和字节位数不匹配的问题即x mod 8 ! 0代码采用位运算来循环模8将码字拆到字节中。需要即时删除已写入的部分否则随着写入码字增多位运算的开销会非常恐怖。定长编码读写def write_bit_codes(number_list, output_file): buffer 0 buffer_bits 0 with open(output_file, wb) as f: for number in number_list: buffer (buffer 16) | (number 0xFFFF) buffer_bits 16 while buffer_bits 8: buffer_bits - 8 byte (buffer buffer_bits) 0xFF f.write(bytes([byte])) # 只保留剩余未写出的低 buffer_bits 位 if buffer_bits 0: buffer (1 buffer_bits) - 1 else: buffer 0 if buffer_bits 0: byte (buffer (8 - buffer_bits)) 0xFF f.write(bytes([byte])) def read_bit_codes(input_file): with open(input_file, rb) as f: data f.read() buffer 0 buffer_bits 0 codes [] for byte in data: buffer (buffer 8) | byte buffer_bits 8 while buffer_bits 16: buffer_bits - 16 code (buffer buffer_bits) 0xFFFF codes.append(code) # 只保留剩余未读取的低 buffer_bits 位 if buffer_bits 0: buffer (1 buffer_bits) - 1 else: buffer 0 return codes批处理可以写一个.bat文件来运行代码功能与常见的main函数相同。echo off setlocal enabledelayedexpansion set BASE%~dp0 set INPUT_DIR%BASE%testset set COMPRESS_DIR%BASE%LZW_compressed set DECOMPRESS_DIR%BASE%LZW_decompressed if not exist %COMPRESS_DIR% mkdir %COMPRESS_DIR% if not exist %DECOMPRESS_DIR% mkdir %DECOMPRESS_DIR% for %%F in (%INPUT_DIR%\*) do ( if exist %%~fF ( echo 正在压缩: %%~nxF python %BASE%main_65536.py compress %%~fF %COMPRESS_DIR%\%%~nxF.lzw ) ) for %%F in (%COMPRESS_DIR%\*.lzw) do ( if exist %%~fF ( echo 正在解压: %%~nxF python %BASE%main_65536.py decompress %%~fF %DECOMPRESS_DIR%\%%~nF ) ) echo. echo 全部处理完成 pausePython完整代码import sys import os import time MAX_SIZE 65536 def lzw_compress(input_bytes, max_table_sizeMAX_SIZE): bytes_dictionary {bytes([i]): i for i in range(256)} string b result [] dict_size 256 for symbol in input_bytes: symbol bytes([symbol]) combined string symbol if combined in bytes_dictionary: string combined else: result.append(bytes_dictionary[string]) if dict_size max_table_size: bytes_dictionary[combined] dict_size dict_size 1 string symbol if string: result.append(bytes_dictionary[string]) return result def lzw_decompress(compressed, max_table_sizeMAX_SIZE): if not compressed: return b num_dictionary {i: bytes([i]) for i in range(256)} dict_size 256 result bytearray() prev_code compressed[0] result num_dictionary[prev_code] string num_dictionary[prev_code] for number in compressed[1:]: if number in num_dictionary: entry num_dictionary[number] elif number dict_size: entry string string[:1] else: raise ValueError(Bad compressed code) result entry if dict_size max_table_size: num_dictionary[dict_size] string entry[:1] dict_size 1 string entry return bytes(result) def write_bit_codes(number_list, output_file): buffer 0 buffer_bits 0 with open(output_file, wb) as f: for number in number_list: buffer (buffer 16) | (number 0xFFFF) buffer_bits 16 while buffer_bits 8: buffer_bits - 8 byte (buffer buffer_bits) 0xFF f.write(bytes([byte])) # 只保留剩余未写出的低 buffer_bits 位 if buffer_bits 0: buffer (1 buffer_bits) - 1 else: buffer 0 if buffer_bits 0: byte (buffer (8 - buffer_bits)) 0xFF f.write(bytes([byte])) def read_bit_codes(input_file): with open(input_file, rb) as f: data f.read() buffer 0 buffer_bits 0 codes [] for byte in data: buffer (buffer 8) | byte buffer_bits 8 while buffer_bits 16: buffer_bits - 16 code (buffer buffer_bits) 0xFFFF codes.append(code) # 只保留剩余未读取的低 buffer_bits 位 if buffer_bits 0: buffer (1 buffer_bits) - 1 else: buffer 0 return codes def compress_file(input_path, output_path): original_size os.path.getsize(input_path) with open(input_path, rb) as f: data f.read() start_time time.perf_counter() compressed lzw_compress(data) write_bit_codes(compressed, output_path) end_time time.perf_counter() compressed_size os.path.getsize(output_path) encode_time_ms (end_time - start_time) * 1000 if compressed_size 0: compression_ratio float(inf) else: compression_ratio original_size / compressed_size print(Compression completed.) print(f原始文件大小: {original_size} 字节) print(f压缩后文件大小: {compressed_size} 字节) print(f压缩比: {compression_ratio:.4f}) print(f编码时间: {encode_time_ms:.3f} ms) def decompress_file(input_path, output_path): compressed_size os.path.getsize(input_path) start_time time.perf_counter() codes read_bit_codes(input_path) decompressed lzw_decompress(codes) with open(output_path, wb) as f: f.write(decompressed) end_time time.perf_counter() decompressed_size os.path.getsize(output_path) decode_time_ms (end_time - start_time) * 1000 print(f压缩文件名{input_path}) print(f压缩文件大小: {compressed_size} 字节) print(f解压后文件大小: {decompressed_size} 字节) print(f解码时间: {decode_time_ms:.3f} ms) def main_bat(): if len(sys.argv) ! 4: print(Usage:) print( python lzw12.py compress inputfile outputfile) print( python lzw12.py decompress inputfile outputfile) sys.exit(1) mode sys.argv[1] input_file sys.argv[2] output_file sys.argv[3] if mode compress: compress_file(input_file, output_file) elif mode decompress: decompress_file(input_file, output_file) else: print(Unknown mode:, mode) sys.exit(1) if __name__ __main__: main_bat()四、结果分析部分数据集分析经过实验对比发现字典长度为65535比4096更加压缩率更高表格中的数据均为65535字典长度压缩得到的结果。对于未经压缩且重复信息多的文件压缩比均可以超过1即LZW编码是有意义的而对于已压缩格式文件压缩比几乎都小于1即压缩后再进行LZW编码会起到反效果。不同文件的解码时间均小于编码时间这是因为在LZW编码时每次读取字符后都要遍历字典以查找新字符串是否已被存储而解码时只需要直接按照编号寻找字符串就可以了省去了查找时间。五、经验总结在理解LZW时一定要先理清两个字典里面的对应关系作为键值对编号和字符串谁是键谁是值。虽然定长编码输出在码字位数mod 8!0时会更让压缩比更高但如果不对应修改字典长度和位运算长度及掩码位数就会报错导致解压缩后无法打开文件。六、碎碎念chatgpt神力但没看懂的代码千万不要随便跑‍♀️否则就会越来越乱套

更多文章