add ngram count

This commit is contained in:
mehbark 2026-04-16 14:01:48 -04:00
parent 62fa7ccd41
commit 154501dcf5
Signed by: mbk
GPG key ID: E333EC1335FFCCDB
2 changed files with 13 additions and 6 deletions

1
.gitignore vendored
View file

@@ -1 +1,2 @@
/target /target
flamegraph.svg

View file

@@ -9,13 +9,16 @@ fn main() {
io::stdin().read_to_end(&mut src).unwrap(); io::stdin().read_to_end(&mut src).unwrap();
for n in 1..=20 { for n in 1..=20 {
let freqs = ngram_freqs(n, &src); let (ngrams, freqs) = ngram_freqs(n, &src);
let bits = entropy(freqs); let bits = entropy(freqs);
println!("{n:2}: {bits:6.3} bits, {:.3} bits/letter", bits / n as f64); println!(
"{n:2}: {bits:6.3} bits, {:.3} bits/letter ({ngrams:7} unique ngrams)",
bits / n as f64
);
} }
} }
fn ngram_freqs(n: usize, src: &[u8]) -> impl Iterator<Item = f64> { fn ngram_freqs(n: usize, src: &[u8]) -> (usize, impl Iterator<Item = f64>) {
assert!(n > 0); assert!(n > 0);
let mut counts: HashMap<Box<[u8]>, u64> = HashMap::new(); let mut counts: HashMap<Box<[u8]>, u64> = HashMap::new();
@@ -26,9 +29,12 @@ fn ngram_freqs(n: usize, src: &[u8]) -> impl Iterator<Item = f64> {
let total_count: u64 = counts.values().sum(); let total_count: u64 = counts.values().sum();
counts (
.into_values() counts.len(),
.map(move |p| p as f64 / total_count as f64) counts
.into_values()
.map(move |p| p as f64 / total_count as f64),
)
} }
fn entropy(probs: impl Iterator<Item = f64>) -> f64 { fn entropy(probs: impl Iterator<Item = f64>) -> f64 {