因为需要将统计表保存到文件中,以便解码时重建哈夫曼树,所以当数据量小时,可能压缩后的文件比原文件还大。哈夫曼编码适合用来压缩数据量大且字符出现频率高的文件。算法原理参见 哈夫曼树(Huffman Tree)
using System; using System.Collections; using System.Collections.Generic; using System.IO; using System.Text; namespace HuffmanTest { internal class Program { static void Main(string[] args) { //读取测试数据 string s = File.ReadAllText("test.txt"); //压缩 string base64 = HuffmanCoding.Compress(s); File.WriteAllText("encode.txt", base64); //解压 s = HuffmanCoding.Decompress(base64); File.WriteAllText("decode.txt", s); Console.ReadKey(); } }
using System; using System.Collections; using System.Collections.Generic; using System.Text; using System.IO; /// <summary> /// 哈夫曼编码压缩算法 /// </summary> public sealed class HuffmanCoding { //哈夫曼树节点 public sealed class Node : IComparable<Node> { //值 public byte value; //出现频率 public int frequency; public Node left; public Node right; public Node(byte value, int frequency) { this.value = value; this.frequency = frequency; } public int CompareTo(Node other) { return frequency.CompareTo(other.frequency); } } // 生成统计表 private static Dictionary<byte, Node> GenerateStatistics(byte[] data) { //统计不同字节出现的频率 var statistics = new Dictionary<byte, Node>(); foreach (byte b in data) { if (statistics.ContainsKey(b)) statistics[b].frequency++; else statistics.Add(b, new Node(b, 1)); } return statistics; } // 生成哈夫曼编码表 private static Dictionary<byte, string> GenerateCodes(Node node, string code = "", Dictionary<byte, string> codeTable = null) { if (codeTable == null) codeTable = new Dictionary<byte, string>(); //叶子节点 if (node.left == null && node.right == null) { codeTable[node.value] = code; return codeTable; } //树的广度遍历 GenerateCodes(node.left, code + "0", codeTable); GenerateCodes(node.right, code + "1", codeTable); return codeTable; } // 生成啥夫曼树 private static Node GenerateTree(Dictionary<byte, Node> statistics) { List<Node> nodes = new List<Node>(); nodes.AddRange(statistics.Values); nodes.Sort(); //构造哈夫曼树 while (nodes.Count > 1) { //取出频率最小的两个节点 var leftNode = nodes[0]; var rightNode = nodes[1]; nodes.RemoveRange(0, 2); //构造一棵子树 var node = new Node(0, leftNode.frequency + rightNode.frequency); node.left = leftNode; node.right = rightNode; //将子树加入列表并重新排序 nodes.Add(node); nodes.Sort(); } //哈夫曼树的根节点 var rootNode = nodes[0]; return rootNode; } // 压缩数据, 返回 base64 public static string Compress(string s) { byte[] bytes = Encoding.UTF8.GetBytes(s); byte[] encodeData = HuffmanCoding.Compress(bytes); string base64 = Convert.ToBase64String(encodeData); return base64; } // 压缩数据 public static byte[] Compress(byte[] data) { if (data == null) return data; //频率统计表 var statistics = GenerateStatistics(data); //哈夫曼树根节点 var rootNode = GenerateTree(statistics); //哈夫曼编码表 var codeTable = GenerateCodes(rootNode); PrintCodeTable("code_table.txt", codeTable); //生成压缩数据 byte[] encodeData = Encode(statistics, codeTable, data); return encodeData; } // 解压数据 public static string Decompress(string base64) { byte[] bytes = Convert.FromBase64String(base64); byte[] decodeData = HuffmanCoding.Decompress(bytes); return Encoding.UTF8.GetString(decodeData); } // 解压数据 public static byte[] Decompress(byte[] data) { if (data == null) return data; MemoryStream ms = new MemoryStream(data); BinaryReader br = new BinaryReader(ms); //频率统计表 var statistics = ReadStatistics(br); //哈夫曼树根节点 var rootNode = GenerateTree(statistics); //解码数据 var decodeData = Decode(br, rootNode); return decodeData; } // 从内存流读统计表 private static Dictionary<byte, Node> ReadStatistics(BinaryReader br) { var statistics = new Dictionary<byte, Node>(); //读统计表长度 int length = br.ReadInt32(); for (int i = 0; i < length; i++) { byte b = br.ReadByte(); int f = br.ReadInt32(); var node = new Node(b, f); statistics.Add(b, node); } return statistics; } // 解码内容 private static byte[] Decode(BinaryReader br, Node rootNode) { //内容长度 int length = br.ReadInt32(); //压缩数据 var data = br.ReadBytes(length); //数据解码 Node node = rootNode; List<byte> result = new List<byte>(); BitArray bits = new BitArray(data); for (int i = 0; i < bits.Length; i++) { bool b = bits[i]; node = b ? node.right : node.left; //如果是叶子节点 if (node.left == null && node.right == null) { result.Add(node.value); //继续从根节点查找叶子 node = rootNode; //内容已全部解码 if (result.Count >= length) break; } } return result.ToArray(); } // 写统计表到内存流 private static void WriteStatistics(Dictionary<byte, Node> statistics, BinaryWriter bw) { //写入统计表长度 bw.Write(statistics.Count); foreach (byte k in statistics.Keys) { //字节 bw.Write(k); //出现的频率 bw.Write(statistics[k].frequency); } } // 写内容到内存流 private static void WriteData(Dictionary<byte, Node> statistics, Dictionary<byte, string> codeTable, byte[] data, BinaryWriter bw) { //写入内容长度 bw.Write(data.Length); //计算需要用多少bit来存储经哈夫曼编码后的数据 int bitCount = 0; foreach (var b in statistics.Keys) { int frequence = statistics[b].frequency; string code = codeTable[b]; bitCount += code.Length * frequence; } //将数据的哈夫曼编码存储到BitArray中 BitArray bitArray = new BitArray(bitCount); int i = 0; foreach (var b in data) { string code = codeTable[b]; foreach (var c in code) { bitArray.Set(i++, c.Equals('1')); } } //将BitArray中的数据复制到buffer中 int size = bitArray.Count / 8; if (bitArray.Count % 8 > 0) size++; byte[] buffer = new byte[size]; bitArray.CopyTo(buffer, 0); //将编码后的数据写入内存流 bw.Write(buffer); } // 生成编码后的数据 private static byte[] Encode(Dictionary<byte, Node> statistics, Dictionary<byte, string> codeTable, byte[] data) { MemoryStream ms = new MemoryStream(); BinaryWriter bw = new BinaryWriter(ms); WriteStatistics(statistics, bw); WriteData(statistics, codeTable, data, bw); byte[] bytes = ms.ToArray(); return bytes; } // 输出编码表 private static void PrintCodeTable(string filePath, Dictionary<byte, string> codeTable) { StringBuilder sb = new StringBuilder(); foreach (var b in codeTable.Keys) { sb.AppendLine(string.Format("{0}: {1}", (char)b, codeTable[b])); } File.WriteAllText(filePath, sb.ToString()); } }