From 154501dcf5dee9e8d6e4f6c377d7ee9c90bf89c6 Mon Sep 17 00:00:00 2001 From: mehbark Date: Thu, 16 Apr 2026 14:01:48 -0400 Subject: [PATCH] add ngram count --- .gitignore | 1 + src/main.rs | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index ea8c4bf..a3ccf3a 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /target +flamegraph.svg diff --git a/src/main.rs b/src/main.rs index 5c68322..02cc1ea 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,13 +9,16 @@ fn main() { io::stdin().read_to_end(&mut src).unwrap(); for n in 1..=20 { - let freqs = ngram_freqs(n, &src); + let (ngrams, freqs) = ngram_freqs(n, &src); let bits = entropy(freqs); - println!("{n:2}: {bits:6.3} bits, {:.3} bits/letter", bits / n as f64); + println!( + "{n:2}: {bits:6.3} bits, {:.3} bits/letter ({ngrams:7} unique ngrams)", + bits / n as f64 + ); } } -fn ngram_freqs(n: usize, src: &[u8]) -> impl Iterator { +fn ngram_freqs(n: usize, src: &[u8]) -> (usize, impl Iterator) { assert!(n > 0); let mut counts: HashMap, u64> = HashMap::new(); @@ -26,9 +29,12 @@ fn ngram_freqs(n: usize, src: &[u8]) -> impl Iterator { let total_count: u64 = counts.values().sum(); - counts - .into_values() - .map(move |p| p as f64 / total_count as f64) + ( + counts.len(), + counts + .into_values() + .map(move |p| p as f64 / total_count as f64), + ) } fn entropy(probs: impl Iterator) -> f64 {