c++实现霍夫曼编码,计算信源的熵、平均码长、编码效率、冗余度与压缩比
考虑到指针可能对新手不太友好,这里用的是vector容器(用法类似数组,可以动态扩容)存储树形结构,大致原理就是n号结点的左右子树分别是2n和2n+1号结点。
HuffmanCode.h
#include <stack>
#include <cmath>
#include "myhead.h"
typedef struct {
char ch; // 存储的字符
int weight; // 权重
int parent; // 父节点
int lChild;
int rChild;
} Node;
// 选择最小的权重节点
void findMin(vector<Node> ht, int n, int* s1, int* s2) {
int min = INT_MAX;
for (int i = 1; i <= n; i++) {
if (ht[i].parent == 0) { // 如果没有父节点
min = i;
break;
}
}
for (int i = 1; i <= n; i++) {
if (ht[i].parent == 0) {
if (ht[i].weight < ht[min].weight) {
min = i; // 权重比记录更小,则更新
}
}
}
*s1 = min;
//遍历全部结点
for (int i = 1; i <= n; i++) {
if (ht[i].parent == 0 && i != (*s1)) {
min = i;
break;
}
}
for (int i = 1; i <= n; i++) {
if (ht[i].parent == 0 && i != (*s1)) {
if (ht[i].weight < ht[min].weight) {
min = i; // 更小则更新
}
}
}
*s2 = min;
}
// 生成哈夫曼树
void generateTree(vector<Node>& ht, int times[], int n) {
int m = 2 * n - 1; // 节点总数
int s1;
int s2;
Node firstNode; // 填充下标为0
firstNode.ch = '0';
firstNode.weight = 0;
firstNode.lChild = 0;
firstNode.rChild = 0;
firstNode.parent = 0;
ht.push_back(firstNode);
for (int i = 1; i <= n; i++) {
Node temp;
temp.ch = char(96 + i);// a的ASCII码为97
temp.weight = times[i];
temp.lChild = 0;
temp.rChild = 0;
temp.parent = 0;
ht.push_back(temp);
}
for (int i = n + 1; i <= m; i++) {
Node temp;
temp.ch = '0';
temp.weight = 0;
temp.lChild = 0;
temp.rChild = 0;
temp.parent = 0;
ht.push_back(temp);
}
for (int i = n + 1; i <= m; i++) {
findMin(ht, i - 1, &s1, &s2); // 选择最小的两个权重节点
ht[s1].parent = i;
ht[s2].parent = i;
ht[i].lChild = s1;
ht[i].rChild = s2;
ht[i].weight = ht[s1].weight + ht[s2].weight; //更新权重
}
}
// 从叶节点到根节点找霍夫曼编码
void generateCode(vector<Node>& ht, vector<string>& code, int n) {
code.push_back(""); // 0下标的占位
int p;
int c;
for (int i = 1; i <= n; i++) {
stack<char> st; // 因为是逆向存储,所以使用栈
int start = n - 1;
for (c = i, p = ht[i].parent; p != 0; c = p, p = ht[p].parent) {
if (ht[p].lChild == c) {
st.push('0'); // 是左孩子则压入0
}
else {
st.push('1');
}
}
int s_len = st.size();
string hfm_code = "";
// 从栈中取出
for (int i = 0; i < s_len; i++) {
hfm_code += st.top();
st.pop();
}
code.push_back(hfm_code);
}
cout << "霍夫曼编码结果如下" << endl;
for (int i = 1; i <= n; i++)
{
cout << ht[i].ch << ":" << code[i] << endl;
}
}
// 计算信源的熵
double CalEntropy(int times[], int len, int n) {
double temp = 0;
for (int i = 1; i < len; i++) {
double p = double(times[i]) / n;
temp += p * log2(p);
}
temp = -temp;
return temp;
}
// 计算霍夫曼编码的平均码长
double CalAvgLen(vector<string> code, int times[], int len, int n) {
double temp = 0;
for (int i = 1; i < len; i++) {
int len = code[i].length();
double p = double(times[i]) / n;
temp += double(len) * p;
}
return temp;
}
myhead.h
#include <iostream>
#include <string>
#include <vector>
#include <time.h>
using namespace std;
// 随机生成字符串,这里只使用a~z,可以使用ASCII码这样就是全部字符,但是会增加遍历时间
string rand_str(int len) {
string str;
char c;
for (int i = 0; i < len; i++) {
c = 'a' + rand() % 26;
str.push_back(c);
}
return str;
}
// 生成十条随机字符串
vector<string> getChar() {
vector<string> ten_str;
srand(unsigned int(time(NULL))); // 转换为无符号整型避免安全提示
for (int i = 0; i < 10; i++) {
int length = 1000 + rand() % 9000;
ten_str.push_back(rand_str(length));
// cout << length << endl;
}
return ten_str;
}
// 统计每个字符出现的次数
void getNum(string str) {
int nums[26] = { 0 };
for (unsigned int i = 0; i < str.length(); i++) {
nums[int(str[i]) - 97] ++; // ASCII码中a为97
}
cout << "a~z的出现次数分别为:";
for (int i = 0; i < 26; i++) {
// cout << char(i + 97) << "的次数为:" << nums[i] << endl;
cout << nums[i] << " ";
}
cout << endl;
}
#include <iostream>
#include "HuffmanCode.h"
int main() {
string a = rand_str(5);
vector<string> ten_str = getChar(); // 生成十条长度在1000~10000的字符串
for (int i = 0; i < 10; i++) {
cout << "第" << i + 1 << "条随机字符串长度为" << ten_str[i].length() << endl;
// getNum(ten_str[i]);
vector<Node> ht;
vector<string> code;
// 因为字符串非常长,所以默认随机生成的时候全部26个字符都包含了,简化算法复杂程度,如果字符串很短需要遍历查询
int n = 26;
int times[27] = { 0 }; // 出现的次数
for (unsigned int j = 0; j < ten_str[i].length(); j++) {
times[int(ten_str[i][j]) - 96] ++; // ASCII码中a为97
}
cout << "a~z的出现次数分别为:";
for (int i = 1; i < 27; i++) {
cout << times[i] << " ";
}
cout << endl;
generateTree(ht, times, n);
generateCode(ht, code, n);
// 计算信源的熵
double entropy = CalEntropy(times, 27, ten_str[i].length());
cout << "信源的熵为:" << entropy << endl;
//计算霍夫曼编码的平均码长
double avg_len = CalAvgLen(code, times, 27, ten_str[i].length());
cout << "平均码长为:" << avg_len << endl;
cout << "霍夫曼编码效率为:" << entropy / avg_len << endl;
cout << "冗余度为:" << (1 - entropy / avg_len) * 100 << "%" << endl;
cout << "压缩比为:" << 5 / avg_len << endl;
cout << endl;
}
return 0;
}