C#哈夫曼编码文件压缩

作者:追风剑情 发布于:2024-10-16 20:43 分类:Algorithms

  因为需要将统计表保存到文件中,以便解码时重建哈夫曼树,所以当数据量小时,可能压缩后的文件比原文件还大。哈夫曼编码适合用来压缩数据量大且字符出现频率高的文件。算法原理参见 哈夫曼树(Huffman Tree)

一、哈夫曼算法实现

  1. using System;
  2. using System.Collections;
  3. using System.Collections.Generic;
  4. using System.IO;
  5. using System.Text;
  6.  
  7. namespace HuffmanTest
  8. {
  9. internal class Program
  10. {
  11. static void Main(string[] args)
  12. {
  13. //读取测试数据
  14. string s = File.ReadAllText("test.txt");
  15.  
  16. //压缩
  17. string base64 = HuffmanCoding.Compress(s);
  18. File.WriteAllText("encode.txt", base64);
  19.  
  20. //解压
  21. s = HuffmanCoding.Decompress(base64);
  22. File.WriteAllText("decode.txt", s);
  23.  
  24. Console.ReadKey();
  25. }
  26. }


  1. using System;
  2. using System.Collections;
  3. using System.Collections.Generic;
  4. using System.Text;
  5. using System.IO;
  6. /// <summary>
  7. /// 哈夫曼编码压缩算法
  8. /// </summary>
  9. public sealed class HuffmanCoding
  10. {
  11. //哈夫曼树节点
  12. public sealed class Node : IComparable<Node>
  13. {
  14. //值
  15. public byte value;
  16. //出现频率
  17. public int frequency;
  18. public Node left;
  19. public Node right;
  20.  
  21. public Node(byte value, int frequency)
  22. {
  23. this.value = value;
  24. this.frequency = frequency;
  25. }
  26.  
  27. public int CompareTo(Node other)
  28. {
  29. return frequency.CompareTo(other.frequency);
  30. }
  31. }
  32.  
  33. // 生成统计表
  34. private static Dictionary<byte, Node> GenerateStatistics(byte[] data)
  35. {
  36. //统计不同字节出现的频率
  37. var statistics = new Dictionary<byte, Node>();
  38. foreach (byte b in data)
  39. {
  40. if (statistics.ContainsKey(b))
  41. statistics[b].frequency++;
  42. else
  43. statistics.Add(b, new Node(b, 1));
  44. }
  45. return statistics;
  46. }
  47.  
  48. // 生成哈夫曼编码表
  49. private static Dictionary<byte, string> GenerateCodes(Node node, string code = "", Dictionary<byte, string> codeTable = null)
  50. {
  51. if (codeTable == null)
  52. codeTable = new Dictionary<byte, string>();
  53.  
  54. //叶子节点
  55. if (node.left == null && node.right == null)
  56. {
  57. codeTable[node.value] = code;
  58. return codeTable;
  59. }
  60.  
  61. //树的广度遍历
  62. GenerateCodes(node.left, code + "0", codeTable);
  63. GenerateCodes(node.right, code + "1", codeTable);
  64.  
  65. return codeTable;
  66. }
  67.  
  68. // 生成啥夫曼树
  69. private static Node GenerateTree(Dictionary<byte, Node> statistics)
  70. {
  71. List<Node> nodes = new List<Node>();
  72. nodes.AddRange(statistics.Values);
  73. nodes.Sort();
  74.  
  75. //构造哈夫曼树
  76. while (nodes.Count > 1)
  77. {
  78. //取出频率最小的两个节点
  79. var leftNode = nodes[0];
  80. var rightNode = nodes[1];
  81. nodes.RemoveRange(0, 2);
  82.  
  83. //构造一棵子树
  84. var node = new Node(0, leftNode.frequency + rightNode.frequency);
  85. node.left = leftNode;
  86. node.right = rightNode;
  87.  
  88. //将子树加入列表并重新排序
  89. nodes.Add(node);
  90. nodes.Sort();
  91. }
  92. //哈夫曼树的根节点
  93. var rootNode = nodes[0];
  94.  
  95. return rootNode;
  96. }
  97.  
  98. // 压缩数据, 返回 base64
  99. public static string Compress(string s)
  100. {
  101. byte[] bytes = Encoding.UTF8.GetBytes(s);
  102. byte[] encodeData = HuffmanCoding.Compress(bytes);
  103. string base64 = Convert.ToBase64String(encodeData);
  104. return base64;
  105. }
  106.  
  107. // 压缩数据
  108. public static byte[] Compress(byte[] data)
  109. {
  110. if (data == null)
  111. return data;
  112.  
  113. //频率统计表
  114. var statistics = GenerateStatistics(data);
  115. //哈夫曼树根节点
  116. var rootNode = GenerateTree(statistics);
  117. //哈夫曼编码表
  118. var codeTable = GenerateCodes(rootNode);
  119. PrintCodeTable("code_table.txt", codeTable);
  120. //生成压缩数据
  121. byte[] encodeData = Encode(statistics, codeTable, data);
  122.  
  123. return encodeData;
  124. }
  125.  
  126. // 解压数据
  127. public static string Decompress(string base64)
  128. {
  129. byte[] bytes = Convert.FromBase64String(base64);
  130. byte[] decodeData = HuffmanCoding.Decompress(bytes);
  131. return Encoding.UTF8.GetString(decodeData);
  132. }
  133.  
  134. // 解压数据
  135. public static byte[] Decompress(byte[] data)
  136. {
  137. if (data == null)
  138. return data;
  139.  
  140. MemoryStream ms = new MemoryStream(data);
  141. BinaryReader br = new BinaryReader(ms);
  142.  
  143. //频率统计表
  144. var statistics = ReadStatistics(br);
  145. //哈夫曼树根节点
  146. var rootNode = GenerateTree(statistics);
  147. //解码数据
  148. var decodeData = Decode(br, rootNode);
  149.  
  150. return decodeData;
  151. }
  152.  
  153. // 从内存流读统计表
  154. private static Dictionary<byte, Node> ReadStatistics(BinaryReader br)
  155. {
  156. var statistics = new Dictionary<byte, Node>();
  157. //读统计表长度
  158. int length = br.ReadInt32();
  159. for (int i = 0; i < length; i++)
  160. {
  161. byte b = br.ReadByte();
  162. int f = br.ReadInt32();
  163. var node = new Node(b, f);
  164. statistics.Add(b, node);
  165. }
  166. return statistics;
  167. }
  168.  
  169. // 解码内容
  170. private static byte[] Decode(BinaryReader br, Node rootNode)
  171. {
  172. //内容长度
  173. int length = br.ReadInt32();
  174. //压缩数据
  175. var data = br.ReadBytes(length);
  176.  
  177. //数据解码
  178. Node node = rootNode;
  179. List<byte> result = new List<byte>();
  180. BitArray bits = new BitArray(data);
  181. for (int i = 0; i < bits.Length; i++)
  182. {
  183. bool b = bits[i];
  184. node = b ? node.right : node.left;
  185. //如果是叶子节点
  186. if (node.left == null && node.right == null)
  187. {
  188. result.Add(node.value);
  189. //继续从根节点查找叶子
  190. node = rootNode;
  191. //内容已全部解码
  192. if (result.Count >= length)
  193. break;
  194. }
  195. }
  196. return result.ToArray();
  197. }
  198.  
  199. // 写统计表到内存流
  200. private static void WriteStatistics(Dictionary<byte, Node> statistics, BinaryWriter bw)
  201. {
  202. //写入统计表长度
  203. bw.Write(statistics.Count);
  204.  
  205. foreach (byte k in statistics.Keys)
  206. {
  207. //字节
  208. bw.Write(k);
  209. //出现的频率
  210. bw.Write(statistics[k].frequency);
  211. }
  212. }
  213.  
  214. // 写内容到内存流
  215. private static void WriteData(Dictionary<byte, Node> statistics, Dictionary<byte, string> codeTable, byte[] data, BinaryWriter bw)
  216. {
  217. //写入内容长度
  218. bw.Write(data.Length);
  219.  
  220. //计算需要用多少bit来存储经哈夫曼编码后的数据
  221. int bitCount = 0;
  222. foreach (var b in statistics.Keys)
  223. {
  224. int frequence = statistics[b].frequency;
  225. string code = codeTable[b];
  226. bitCount += code.Length * frequence;
  227. }
  228.  
  229. //将数据的哈夫曼编码存储到BitArray中
  230. BitArray bitArray = new BitArray(bitCount);
  231. int i = 0;
  232. foreach (var b in data)
  233. {
  234. string code = codeTable[b];
  235. foreach (var c in code)
  236. {
  237. bitArray.Set(i++, c.Equals('1'));
  238. }
  239. }
  240.  
  241. //将BitArray中的数据复制到buffer中
  242. int size = bitArray.Count / 8;
  243. if (bitArray.Count % 8 > 0)
  244. size++;
  245. byte[] buffer = new byte[size];
  246. bitArray.CopyTo(buffer, 0);
  247.  
  248. //将编码后的数据写入内存流
  249. bw.Write(buffer);
  250. }
  251.  
  252. // 生成编码后的数据
  253. private static byte[] Encode(Dictionary<byte, Node> statistics, Dictionary<byte, string> codeTable, byte[] data)
  254. {
  255. MemoryStream ms = new MemoryStream();
  256. BinaryWriter bw = new BinaryWriter(ms);
  257.  
  258. WriteStatistics(statistics, bw);
  259. WriteData(statistics, codeTable, data, bw);
  260.  
  261. byte[] bytes = ms.ToArray();
  262. return bytes;
  263. }
  264.  
  265. // 输出编码表
  266. private static void PrintCodeTable(string filePath, Dictionary<byte, string> codeTable)
  267. {
  268. StringBuilder sb = new StringBuilder();
  269. foreach (var b in codeTable.Keys)
  270. {
  271. sb.AppendLine(string.Format("{0}: {1}", (char)b, codeTable[b]));
  272. }
  273. File.WriteAllText(filePath, sb.ToString());
  274. }
  275. }


二、运行测试

22222.png

111111.png

标签: Algorithms

Powered by emlog  蜀ICP备18021003号-1   sitemap

川公网安备 51019002001593号