add ngram count
This commit is contained in:
parent
62fa7ccd41
commit
154501dcf5
2 changed files with 13 additions and 6 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -1 +1,2 @@
|
||||||
/target
|
/target
|
||||||
|
flamegraph.svg
|
||||||
|
|
|
||||||
18
src/main.rs
18
src/main.rs
|
|
@ -9,13 +9,16 @@ fn main() {
|
||||||
io::stdin().read_to_end(&mut src).unwrap();
|
io::stdin().read_to_end(&mut src).unwrap();
|
||||||
|
|
||||||
for n in 1..=20 {
|
for n in 1..=20 {
|
||||||
let freqs = ngram_freqs(n, &src);
|
let (ngrams, freqs) = ngram_freqs(n, &src);
|
||||||
let bits = entropy(freqs);
|
let bits = entropy(freqs);
|
||||||
println!("{n:2}: {bits:6.3} bits, {:.3} bits/letter", bits / n as f64);
|
println!(
|
||||||
|
"{n:2}: {bits:6.3} bits, {:.3} bits/letter ({ngrams:7} unique ngrams)",
|
||||||
|
bits / n as f64
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn ngram_freqs(n: usize, src: &[u8]) -> impl Iterator<Item = f64> {
|
fn ngram_freqs(n: usize, src: &[u8]) -> (usize, impl Iterator<Item = f64>) {
|
||||||
assert!(n > 0);
|
assert!(n > 0);
|
||||||
|
|
||||||
let mut counts: HashMap<Box<[u8]>, u64> = HashMap::new();
|
let mut counts: HashMap<Box<[u8]>, u64> = HashMap::new();
|
||||||
|
|
@ -26,9 +29,12 @@ fn ngram_freqs(n: usize, src: &[u8]) -> impl Iterator<Item = f64> {
|
||||||
|
|
||||||
let total_count: u64 = counts.values().sum();
|
let total_count: u64 = counts.values().sum();
|
||||||
|
|
||||||
counts
|
(
|
||||||
.into_values()
|
counts.len(),
|
||||||
.map(move |p| p as f64 / total_count as f64)
|
counts
|
||||||
|
.into_values()
|
||||||
|
.map(move |p| p as f64 / total_count as f64),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn entropy(probs: impl Iterator<Item = f64>) -> f64 {
|
fn entropy(probs: impl Iterator<Item = f64>) -> f64 {
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue