#pragma once
#include <memory.h>
class huffman {
template<int n>
struct bits { // 位存储器,不使用std::bitset,实际存储顺序有问题,多次位运算性能也存在问题
static const int size = n / 8;
unsigned char v[size] = {0};
unsigned char get(int i) const { // i 从前往后
int a = i / 8;
int b = 7 - i % 8;
return (v[a] & (1 << b)) >> b;
}
void or_tail(unsigned char p) {
v[size - 1] |= p;
}
void or_merge(const bits& p, int from, int target, int len) {
for (int i = 0; i < len; i++) {
int a = (i + target) / 8;
int b = (i + target) % 8;
v[a] |= p.get(i + from) << (7 - b);
}
}
};
typedef bits<256> code_type;
struct element {
unsigned int count;
unsigned short i; // 索引,也是char的ascci码
unsigned short left;
unsigned short right;
code_type code; // 编码存放,这里256位,树的最大层级是255层
// 按照 count 降序
static int compare1(element* a, element* b) {
if (b->count > a->count)
return 1;
if (b->count < a->count)
return -1;
return 0;
}
// 按照index升序
static int compare2(element* a, element* b) {
if (a->i > b->i)
return 1;
if (a->i < b->i)
return -1;
return 0;
}
};
static void quick_sort(element* eles, int start, int end, int (*compare)(element* a, element* b)) {
if (start < end) {
int i = start, j = end;
element it = eles[i]; // 比较值,且空出来一个位置
while (i < j) {
// 从end->start查找一个比it大的,放到空出来的位置i,且产生新的空位j
while (i < j && compare(&eles[j], &it) >= 0) {
j--;
}
if (i < j) {
eles[i] = eles[j];
i++;
}
// 从start->end查找一个比it小的,放到空出来的位置j,且产生新的空位i
while (i < j && compare(&eles[i], &it) < 0) {
i++;
}
if (i < j) {
eles[j] = eles[i];
j--;
}
}
eles[i] = it;
quick_sort(eles, start, i - 1, compare);
quick_sort(eles, i + 1, end, compare);
}
}
// 0 左,1 右
static void encoding(element* root, element* index) {
if (root->left == 0 && root->right == 0) {
return;
}
index[root->left].count = root->count + 1;
index[root->right].count = root->count + 1;
const unsigned int buf_size = sizeof(root->code) * 8;
index[root->left].code = {0};
index[root->left].code.or_merge(root->code, buf_size - root->count, buf_size - root->count - 1, root->count);
index[root->right].code = index[root->left].code;
index[root->right].code.or_tail(1);
encoding(&index[root->left], index);
encoding(&index[root->right], index);
}
static element* tree(element* index) {
// 排序统计
quick_sort(index, 0, 255, element::compare1);
int size = 0;
for (; size < 256; size++) {
if (index[size].count == 0) {
break;
}
}
// 构建树,每次减少一个节点,直到只有一个节点。最后一个就是根节点
for (int i = size; i > 1; i--) {
element* it = &index[size];
int m = size - 1;
int n = size - 2;
it->left = index[m].i;
it->right = index[n].i;
it->count = index[m].count + index[n].count;
index[m].count = 0xffffffff; // 排到最前边,可视为删除了
index[n].count = 0xffffffff;
size++;
quick_sort(index, 0, size - 1, element::compare1);
}
// 获得的位置
unsigned short root = index[size - 1].i;
// 已经是乱序, 恢复需要的索引顺序
quick_sort(index, 0, size - 1, element::compare2);
for (int i = size - 1; i >= 0; i--) {
index[i].count = 0;
index[index[i].i] = index[i];
}
// 获取编码
encoding(&index[root], index);
return &index[root];
}
public:
static const int MAX_HEAD_SIZE = 8 + (4 + 1) * 256;
static unsigned int compress(unsigned char* input_data, unsigned int input_len, unsigned char* output_data) {
if (!input_data || input_len == 0 || !output_data) {
return 0;
}
element index[512] = {{0}};
// 初始化索引,因为排序会打乱,所以记录位置以供恢复
for (unsigned int i = 0; i < 512; i++) {
index[i].i = i;
}
// 统计
for (unsigned int i = 0; i < input_len; i++) {
index[input_data[i]].count += 1;
}
// 记录头
unsigned int out_offset = 0;
out_offset += 4; // 预占头大小
memcpy(output_data + out_offset, &input_len, 4);
out_offset += 4;
for (unsigned int i = 0; i < 256; i++) {
unsigned char c = i;
if (index[i].count > 0) {
memcpy(output_data + out_offset, &index[i].count, 4);
out_offset += 4;
memcpy(output_data + out_offset, &c, 1);
out_offset += 1;
}
}
memcpy(output_data, &out_offset, 4);
// 只有一个字符,直接不需要压缩
if (out_offset == 13) {
return out_offset;
}
// 构建树
tree(index);
// flush
code_type buf = {0};
unsigned int buf_len = 0;
const unsigned int buf_size = sizeof(buf) * 8;
for (unsigned int i = 0; i < input_len; i++) {
unsigned char c = input_data[i];
element* it = &index[c];
unsigned int left = buf_size - buf_len;
if (left <= it->count) {
buf.or_merge(it->code, buf_size - it->count, buf_len, left);
// 写入
memcpy(output_data + out_offset, &buf, sizeof(buf));
out_offset += sizeof(buf);
buf = {0};
buf_len = 0;
buf.or_merge(it->code, buf_size - it->count + left, buf_len, it->count - left);
buf_len += it->count - left;
} else {
buf.or_merge(it->code, buf_size - it->count, buf_len, it->count);
buf_len += it->count;
}
}
if (buf_len > 0) {
// 写入
memcpy(output_data + out_offset, &buf, buf_len / 8 + 1);
out_offset += sizeof(buf);
}
return out_offset;
}
static unsigned int decompress(unsigned char* input_data, unsigned int input_len, unsigned char* output_data) {
if (!input_data || input_len == 0 || !output_data) {
return 0;
}
element index[512] = {{0}};
unsigned int out_size = 0;
// 初始化索引,因为排序会打乱,所以记录位置
for (unsigned int i = 0; i < 512; i++) {
index[i].i = i;
}
// 获得统计
unsigned int head_size = 0;
unsigned int data_size = 0;
unsigned int offset = 0;
memcpy(&head_size, input_data + offset, 4);
offset += 4;
memcpy(&data_size, input_data + offset, 4);
offset += 4;
unsigned int _count = 0;
unsigned char _c = 0;
while (offset < head_size) {
memcpy(&_count, input_data + offset, 4);
offset += 4;
memcpy(&_c, input_data + offset, 1);
offset += 1;
index[_c].count = _count;
}
// 特殊处理一个单字符
if (offset == 13) {
for (unsigned int i = 0; i < _count; i++) {
*(output_data + out_size) = _c;
out_size += 1;
}
return out_size;
}
// 构建树
element* root = tree(index);
// 解析数据
unsigned char buf = 0;
element* current = root;
const int bits_count = sizeof(buf) * 8;
while (offset < input_len && out_size < data_size) {
memcpy(&buf, input_data + offset, sizeof(buf));
offset += sizeof(buf);
for (int i = 0; i < bits_count; i++) {
unsigned char tmp = (buf >> (bits_count - 1 - i)) & 1;
if (tmp == 0) {
current = &index[current->left];
} else {
current = &index[current->right];
}
if (current->left == 0 && current->right == 0) {
*(output_data + out_size) = current->i;
out_size += 1;
current = root;
}
if (out_size >= data_size) {
break;
}
}
}
return out_size;
}
};
测试例子:
#include "huffman.h"
#include <stdio.h>
#include <string>
#include <iostream>
#include <chrono>
int main(int argc, char const *argv[]) {
if (argc > 2) {
return 0;
}
auto start = std::chrono::high_resolution_clock::now();
FILE *file = fopen(argv[1], "rb");
if (!file) {
printf("not find file: %s\n", argv[1]);
return 0;
}
fseek(file, 0, SEEK_END);
unsigned int len = ftell(file);
fseek(file, 0, SEEK_SET);
unsigned char *text = new unsigned char[len];
memset(text, 0, len);
fread(text, 1, len, file);
fclose(file);
printf("input len : %u\n", len);
auto end = std::chrono::high_resolution_clock::now();
std::cout << "read: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
start = end;
unsigned char *output = new unsigned char[len + huffman::MAX_HEAD_SIZE];
memset(output, 0, len + huffman::MAX_HEAD_SIZE);
unsigned int olen = huffman::compress(text, len, output);
printf("output len: %u\n", olen);
end = std::chrono::high_resolution_clock::now();
int times = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
std::cout << "compress: " << times << "ms" << std::endl;
std::cout << " " << 1.0f * len / times * 1000 / 1024 << "kb/s" << std::endl;
start = end;
FILE *ofile = fopen((std::string(argv[1]) + ".hzip").c_str(), "wb+");
fwrite(output, 1, olen, ofile);
fclose(ofile);
end = std::chrono::high_resolution_clock::now();
std::cout << "write: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
start = end;
memset(text, 0, len);
len = huffman::decompress(output, olen, text);
FILE *rfile = fopen((std::string(argv[1]) + ".huzip").c_str(), "wb+");
fwrite(text, 1, len, rfile);
fclose(rfile);
end = std::chrono::high_resolution_clock::now();
times = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
std::cout << "decompress: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
std::cout << " " << 1.0f * len / times * 1000 / 1024 << "kb/s" << std::endl;
start = end;
return 0;
}
输出示例:
input len : 139246592
read: 48ms
output len: 136604744
compress: 8726ms
15583.7kb/s
write: 676ms
decompress: 10629ms
12793.6kb/s