C#哈夫曼编码文件压缩

作者:追风剑情 发布于:2024-10-16 20:43 分类:Algorithms

  因为需要将统计表保存到文件中,以便解码时重建哈夫曼树,所以当数据量小时,可能压缩后的文件比原文件还大。哈夫曼编码适合用来压缩数据量大且字符出现频率高的文件。算法原理参见 哈夫曼树(Huffman Tree)

一、哈夫曼算法实现

using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace HuffmanTest
{
    internal class Program
    {
        static void Main(string[] args)
        {
            //读取测试数据
            string s = File.ReadAllText("test.txt");

            //压缩
            string base64 = HuffmanCoding.Compress(s);
            File.WriteAllText("encode.txt", base64);

            //解压
            s = HuffmanCoding.Decompress(base64);
            File.WriteAllText("decode.txt", s);

            Console.ReadKey();
        }
    }


using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.IO;
/// <summary>
/// 哈夫曼编码压缩算法
/// </summary>
public sealed class HuffmanCoding
{
    //哈夫曼树节点
    public sealed class Node : IComparable<Node>
    {
        //值
        public byte value;
        //出现频率
        public int frequency;
        public Node left;
        public Node right;

        public Node(byte value, int frequency)
        {
            this.value = value;
            this.frequency = frequency;
        }

        public int CompareTo(Node other)
        {
            return frequency.CompareTo(other.frequency);
        }
    }

    // 生成统计表
    private static Dictionary<byte, Node> GenerateStatistics(byte[] data)
    {
        //统计不同字节出现的频率
        var statistics = new Dictionary<byte, Node>();
        foreach (byte b in data)
        {
            if (statistics.ContainsKey(b))
                statistics[b].frequency++;
            else
                statistics.Add(b, new Node(b, 1));
        }
        return statistics;
    }

    // 生成哈夫曼编码表
    private static Dictionary<byte, string> GenerateCodes(Node node, string code = "", Dictionary<byte, string> codeTable = null)
    {
        if (codeTable == null)
            codeTable = new Dictionary<byte, string>();

        //叶子节点
        if (node.left == null && node.right == null)
        {
            codeTable[node.value] = code;
            return codeTable;
        }

        //树的广度遍历
        GenerateCodes(node.left, code + "0", codeTable);
        GenerateCodes(node.right, code + "1", codeTable);

        return codeTable;
    }

    // 生成啥夫曼树
    private static Node GenerateTree(Dictionary<byte, Node> statistics)
    {
        List<Node> nodes = new List<Node>();
        nodes.AddRange(statistics.Values);
        nodes.Sort();

        //构造哈夫曼树
        while (nodes.Count > 1)
        {
            //取出频率最小的两个节点
            var leftNode = nodes[0];
            var rightNode = nodes[1];
            nodes.RemoveRange(0, 2);

            //构造一棵子树
            var node = new Node(0, leftNode.frequency + rightNode.frequency);
            node.left = leftNode;
            node.right = rightNode;

            //将子树加入列表并重新排序
            nodes.Add(node);
            nodes.Sort();
        }
        //哈夫曼树的根节点
        var rootNode = nodes[0];

        return rootNode;
    }

    // 压缩数据, 返回 base64
    public static string Compress(string s)
    {
        byte[] bytes = Encoding.UTF8.GetBytes(s);
        byte[] encodeData = HuffmanCoding.Compress(bytes);
        string base64 = Convert.ToBase64String(encodeData);
        return base64;
    }

    // 压缩数据
    public static byte[] Compress(byte[] data)
    {
        if (data == null)
            return data;

        //频率统计表
        var statistics = GenerateStatistics(data);
        //哈夫曼树根节点
        var rootNode = GenerateTree(statistics);
        //哈夫曼编码表
        var codeTable = GenerateCodes(rootNode);
        PrintCodeTable("code_table.txt", codeTable);
        //生成压缩数据
        byte[] encodeData = Encode(statistics, codeTable, data);

        return encodeData;
    }

    // 解压数据
    public static string Decompress(string base64)
    {
        byte[] bytes = Convert.FromBase64String(base64);
        byte[] decodeData = HuffmanCoding.Decompress(bytes);
        return Encoding.UTF8.GetString(decodeData);
    }

    // 解压数据
    public static byte[] Decompress(byte[] data)
    {
        if (data == null)
            return data;

        MemoryStream ms = new MemoryStream(data);
        BinaryReader br = new BinaryReader(ms);

        //频率统计表
        var statistics = ReadStatistics(br);
        //哈夫曼树根节点
        var rootNode = GenerateTree(statistics);
        //解码数据
        var decodeData = Decode(br, rootNode);

        return decodeData;
    }

    // 从内存流读统计表
    private static Dictionary<byte, Node> ReadStatistics(BinaryReader br)
    {
        var statistics = new Dictionary<byte, Node>();
        //读统计表长度
        int length = br.ReadInt32();
        for (int i = 0; i < length; i++)
        {
            byte b = br.ReadByte();
            int f = br.ReadInt32();
            var node = new Node(b, f);
            statistics.Add(b, node);
        }
        return statistics;
    }

    // 解码内容
    private static byte[] Decode(BinaryReader br, Node rootNode)
    {
        //内容长度
        int length = br.ReadInt32();
        //压缩数据
        var data = br.ReadBytes(length);

        //数据解码
        Node node = rootNode;
        List<byte> result = new List<byte>();
        BitArray bits = new BitArray(data);
        for (int i = 0; i < bits.Length; i++)
        {
            bool b = bits[i];
            node = b ? node.right : node.left;
            //如果是叶子节点
            if (node.left == null && node.right == null)
            {
                result.Add(node.value);
                //继续从根节点查找叶子
                node = rootNode;
                //内容已全部解码
                if (result.Count >= length)
                    break;
            }
        }
        return result.ToArray();
    }

    // 写统计表到内存流
    private static void WriteStatistics(Dictionary<byte, Node> statistics, BinaryWriter bw)
    {
        //写入统计表长度
        bw.Write(statistics.Count);

        foreach (byte k in statistics.Keys)
        {
            //字节
            bw.Write(k);
            //出现的频率
            bw.Write(statistics[k].frequency);
        }
    }

    // 写内容到内存流
    private static void WriteData(Dictionary<byte, Node> statistics, Dictionary<byte, string> codeTable, byte[] data, BinaryWriter bw)
    {
        //写入内容长度
        bw.Write(data.Length);

        //计算需要用多少bit来存储经哈夫曼编码后的数据
        int bitCount = 0;
        foreach (var b in statistics.Keys)
        {
            int frequence = statistics[b].frequency;
            string code = codeTable[b];
            bitCount += code.Length * frequence;
        }

        //将数据的哈夫曼编码存储到BitArray中
        BitArray bitArray = new BitArray(bitCount);
        int i = 0;
        foreach (var b in data)
        {
            string code = codeTable[b];
            foreach (var c in code)
            {
                bitArray.Set(i++, c.Equals('1'));
            }
        }

        //将BitArray中的数据复制到buffer中
        int size = bitArray.Count / 8;
        if (bitArray.Count % 8 > 0)
            size++;
        byte[] buffer = new byte[size];
        bitArray.CopyTo(buffer, 0);

        //将编码后的数据写入内存流
        bw.Write(buffer);
    }

    // 生成编码后的数据
    private static byte[] Encode(Dictionary<byte, Node> statistics, Dictionary<byte, string> codeTable, byte[] data)
    {
        MemoryStream ms = new MemoryStream();
        BinaryWriter bw = new BinaryWriter(ms);

        WriteStatistics(statistics, bw);
        WriteData(statistics, codeTable, data, bw);

        byte[] bytes = ms.ToArray();
        return bytes;
    }

    // 输出编码表
    private static void PrintCodeTable(string filePath, Dictionary<byte, string> codeTable)
    {
        StringBuilder sb = new StringBuilder();
        foreach (var b in codeTable.Keys)
        {
            sb.AppendLine(string.Format("{0}: {1}", (char)b, codeTable[b]));
        }
        File.WriteAllText(filePath, sb.ToString());
    }
}


二、运行测试

22222.png

111111.png

标签: Algorithms

Powered by emlog  蜀ICP备18021003号-1   sitemap

川公网安备 51019002001593号