From 589e7b711fafda07907e5b32c32e63d5c15e826f Mon Sep 17 00:00:00 2001 From: mehbark Date: Sat, 4 Oct 2025 02:05:43 -0400 Subject: [PATCH] initial huffman; DRY main --- src/compression_scheme.rs | 2 +- src/huffman.rs | 158 ++++++++++++++++++++++++++++++++++++++ src/main.rs | 35 ++++++--- src/test.rs | 8 +- 4 files changed, 192 insertions(+), 11 deletions(-) create mode 100644 src/huffman.rs diff --git a/src/compression_scheme.rs b/src/compression_scheme.rs index 9fc5c25..5fbc187 100644 --- a/src/compression_scheme.rs +++ b/src/compression_scheme.rs @@ -9,7 +9,7 @@ pub trait CompressionScheme { /// Encode some bytes into `buf`, returning a [`Header`][Self::Header]. /// - /// This does not necessarily have to be deterministic, + /// This does not have to be deterministic, /// but it **must** be decodable by [`decode`](Self::decode). /// That is, [`decode`](Self::decode) ∘ [`encode`](Self::encode) = `id`. fn encode(src: &[u8], buf: &mut BitVec) -> Self::Header; diff --git a/src/huffman.rs b/src/huffman.rs new file mode 100644 index 0000000..62070b0 --- /dev/null +++ b/src/huffman.rs @@ -0,0 +1,158 @@ +use std::{ + collections::{BinaryHeap, HashMap}, + fmt, +}; + +use bitvec::prelude::{BitSlice, BitVec}; + +use crate::CompressionScheme; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct Huffman; + +impl CompressionScheme for Huffman { + type Header = Option; + + fn encode(src: &[u8], buf: &mut BitVec) -> Self::Header { + let mut counts = HashMap::new(); + for byte in src { + *counts.entry(*byte).or_default() += 1; + } + + let tree = Node::build(&counts)?; + + Some(tree) + } + + fn decode(src: &BitSlice, header: &Self::Header, buf: &mut Vec) { + if let Some(node) = header { + todo!() + } + } + + fn header_size(header: &Self::Header) -> usize { + header.as_ref().map_or(0, Node::byte_size) + } +} + +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum Node { + Leaf { byte: u8 }, + Branch { left: Box, right: Box }, +} + +impl Node { + fn build(counts: &HashMap) -> Option { + WeightedNode::build(counts).map(WeightedNode::unburden) + } + + /// Write to `buf` the sequence of bits that this tree has assigned `byte`. + /// + /// # Panics + /// Panics if called with a byte that was not passed to [`build`]. + fn encode_byte(&self, byte: u8, buf: &mut BitVec) { + let mut current = self; + todo!() + } + + /// Return the next encoded byte in the stream of bits. + fn decode_byte(src: &BitSlice) -> Option { + todo!() + } + + fn byte_size(&self) -> usize { + match self { + Node::Leaf { .. } => 1, + Node::Branch { left, right, .. } => 1 + left.byte_size() + right.byte_size(), + } + } + + fn write_depth(&self, f: &mut fmt::Formatter<'_>, depth: usize) -> fmt::Result { + for _ in 0..depth { + write!(f, " ")?; + } + match self { + Node::Leaf { byte } => writeln!(f, "| {:?}", *byte as char), + Node::Branch { left, right } => { + writeln!(f, "{depth}+")?; + left.write_depth(f, depth + 1)?; + right.write_depth(f, depth + 1) + } + } + } +} + +impl fmt::Debug for Node { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.write_depth(f, 0) + } +} + +#[derive(Clone, PartialEq, Eq)] +enum WeightedNode { + Leaf { + byte: u8, + count: u32, + }, + Branch { + count: u32, + left: Box, + right: Box, + }, +} + +impl WeightedNode { + fn build(counts: &HashMap) -> Option { + let mut queue = BinaryHeap::new(); + + for (&byte, &count) in counts { + queue.push(WeightedNode::Leaf { byte, count }); + } + + loop { + let first = queue.pop()?; + + let Some(second) = queue.pop() else { + return Some(first); + }; + + queue.push(WeightedNode::join(first, second)); + } + } + + fn count(&self) -> u32 { + match self { + Self::Branch { count, .. } | Self::Leaf { count, .. } => *count, + } + } + + fn join(left: Self, right: Self) -> Self { + Self::Branch { + count: left.count() + right.count(), + left: Box::new(left), + right: Box::new(right), + } + } + + fn unburden(self) -> Node { + match self { + WeightedNode::Leaf { byte, .. } => Node::Leaf { byte }, + WeightedNode::Branch { left, right, .. } => Node::Branch { + left: Box::new(left.unburden()), + right: Box::new(right.unburden()), + }, + } + } +} + +impl PartialOrd for WeightedNode { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for WeightedNode { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.count().cmp(&other.count()).reverse() + } +} diff --git a/src/main.rs b/src/main.rs index 888c047..1f0d4a7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,12 @@ -use std::io::{self, Read}; +use std::{ + env, + fmt::Debug, + io::{self, Read}, +}; mod compression_scheme; mod freq; +mod huffman; mod rle; #[cfg(test)] mod test; @@ -9,9 +14,12 @@ mod test; use bitvec::vec::BitVec; pub use compression_scheme::CompressionScheme; pub use freq::Freq; +pub use huffman::Huffman; pub use rle::Rle; fn main() -> Result<(), io::Error> { + let debug = env::args().any(|arg| arg == "--debug" || arg == "-d"); + let mut buf = Vec::new(); let len_src = io::stdin().read_to_end(&mut buf)?; @@ -19,14 +27,23 @@ fn main() -> Result<(), io::Error> { let mut bitbuf = BitVec::new(); - let () = Rle::encode(&buf, &mut bitbuf); - let len_rle = bitbuf.len().div_ceil(8); - println!(" Rle'd: {len_rle}"); - - bitbuf.clear(); - let header = Freq::encode(&buf, &mut bitbuf); - let len_freq = Freq::header_size(&header) + bitbuf.len().div_ceil(8); - println!(" Freq'd: {len_freq}"); + run::(&buf, &mut bitbuf, "rle", debug); + run::(&buf, &mut bitbuf, "freq", debug); + run::(&buf, &mut bitbuf, "Huffman", debug); Ok(()) } + +fn run(buf: &[u8], bitbuf: &mut BitVec, name: &str, debug: bool) +where + Scheme: CompressionScheme
, + Header: Debug, +{ + bitbuf.clear(); + let header = Scheme::encode(buf, bitbuf); + let len_freq = Scheme::header_size(&header) + bitbuf.len().div_ceil(8); + println!("{name}'d: {len_freq}"); + if debug { + eprintln!("{name} header: {header:#?}\n"); + } +} diff --git a/src/test.rs b/src/test.rs index 4777530..05acade 100644 --- a/src/test.rs +++ b/src/test.rs @@ -1,6 +1,6 @@ use quickcheck_macros::quickcheck; -use crate::{CompressionScheme, Freq, Rle}; +use crate::{CompressionScheme, Freq, Huffman, Rle}; #[allow(clippy::needless_pass_by_value)] #[quickcheck] @@ -13,3 +13,9 @@ fn roundtrip_freq(src: Vec) -> bool { fn roundtrip_rle(src: Vec) -> bool { Rle::idempotent_on(&src) } + +#[allow(clippy::needless_pass_by_value)] +#[quickcheck] +fn roundtrip_huffman(src: Vec) -> bool { + Huffman::idempotent_on(&src) +}