File: length_distribution.rs

package info (click to toggle)
rust-coreutils 0.7.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 505,620 kB
  • sloc: ansic: 103,594; asm: 28,570; sh: 8,910; python: 5,581; makefile: 472; cpp: 97; javascript: 72
file content (89 lines) | stat: -rw-r--r-- 2,855 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
/* Copyright 2018 Torbjørn Birch Moltu
 *
 * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
 * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
 * http://opensource.org/licenses/MIT>, at your option. This file may not be
 * copied, modified, or distributed except according to those terms.
 */

//! Counts the number of codepoints of each UTF-8 length in files

use std::env::args_os;
use std::fs::File;
use std::io::{self, Read, stdin};
use std::borrow::Cow;
extern crate encode_unicode;
use encode_unicode::U8UtfExt;

/// Tally of what `read()` found in one input.
#[derive(Default)]
struct Distribution {
    /// Total number of bytes read from the input.
    bytes: usize,
    /// Number of bytes for which `extra_utf8_bytes()` returned `Ok(i)`,
    /// stored at index `i`; i.e. `utf8[i]` counts the first bytes of
    /// codepoints that are `i + 1` bytes long.
    utf8: [usize; 4],
}

/// Reads `file` to the end and tallies its bytes by UTF-8 sequence length.
///
/// The input is consumed in chunks of up to 4096 bytes. Every byte adds to
/// `Distribution::bytes`; every byte for which `extra_utf8_bytes()` returns
/// `Ok(i)` additionally increments `Distribution::utf8[i]`. Bytes for which
/// it returns `Err` are not counted in `utf8`.
///
/// Whenever a byte announcing three extra bytes is found, a window of the
/// current chunk around it (up to 20 bytes before it and up to 23 bytes
/// starting at it) is printed to stdout, lossily decoded as UTF-8.
///
/// Returns the tally gathered so far together with the I/O error that
/// stopped reading, or `None` if the end of the input was reached.
/// `ErrorKind::Interrupted` is not treated as a failure: the read is retried.
fn read(file: &mut dyn Read) -> (Distribution, Option<io::Error>) {
    let mut r = Distribution::default();
    let mut buf = [0u8; 4096];
    loop {
        let len = match file.read(&mut buf) {
            Ok(0) => return (r, None),
            Ok(n) => n,
            // An interrupted read is non-fatal and transferred no data,
            // so simply try again instead of abandoning the input.
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return (r, Some(e)),
        };
        r.bytes += len;
        for (o, &b) in buf[..len].iter().enumerate() {
            if let Ok(i) = b.extra_utf8_bytes() {
                r.utf8[i] += 1;
                if i == 3 {
                    // Show some context around four-byte codepoints,
                    // clamped to the part of the buffer that was just filled.
                    let min = o.saturating_sub(20);
                    let max = (o + 23).min(len);
                    println!("{}", String::from_utf8_lossy(&buf[min..max]));
                }
            }
        }
    }
}

/// Prints the tally for one input to stdout.
///
/// Two lines are always printed: the first shows `name`, the total byte
/// count and how many bytes are occupied by codepoints of each UTF-8 length;
/// the second shows the number of codepoints and the same byte counts as
/// percentages of the total. If `err` is `Some`, the error is printed on a
/// third line.
///
/// * `name_pad`: width `name` is right-aligned to; the following lines are
///   indented by the same amount so the columns line up.
/// * `name`: label for the input.
/// * `r`: the tally to report.
/// * `err`: error that stopped reading the input, if any.
///
/// For an empty input (`r.bytes == 0`) all percentages are reported as zero.
fn display(name_pad: usize,  name: Cow<str>,
           r: Distribution,  err: Option<io::Error>) {
    let c = r.utf8;
    let characters = c[0]+c[1]+c[2]+c[3];
    // A codepoint counted in c[i] is i+1 bytes long, so this is bytes per length.
    let s = [c[0], c[1]*2, c[2]*3, c[3]*4];
    let total = r.bytes;
    // Convert to float before scaling so the multiplication by 100 cannot
    // overflow `usize`, and avoid 0/0 (NaN) when nothing was read.
    let percent = |part: usize| -> f64 {
        if total == 0 {
            0.0
        } else {
            part as f64 * 100.0 / total as f64
        }
    };
    let p = [percent(s[0]), percent(s[1]), percent(s[2]), percent(s[3])];
    println!("{:>6$}: bytes: {:7}, UTF-8 distribution: [{:7}, {:6}, {:6}, {:6}]",
        name, r.bytes, s[0], s[1], s[2], s[3], name_pad
    );
    println!("{5:6$}  chars: {:7}, UTF-8 percentages:  [{:>6.2}%, {:>5.2}%, {:>5.2}%, {:>5.2}%]",
        characters, p[0], p[1], p[2], p[3], "", name_pad
    );
    if let Some(err) = err {
        println!("{1:2$}  {}", err, "", name_pad);
    }
}

/// Reports the UTF-8 length distribution of every file named on the command
/// line, or of standard input when no file names are given.
///
/// Files that cannot be opened are reported on stderr and skipped.
fn main() {
    // Width of the longest file name (in codepoints), used to align the
    // reports; `None` means there were no arguments at all.
    let widest = args_os().skip(1)
        .map(|arg| arg.to_string_lossy().chars().count())
        .max();
    match widest {
        Some(pad) => {
            for arg in args_os().skip(1) {
                let label = arg.to_string_lossy();
                match File::open(&arg) {
                    Ok(mut file) => {
                        let (dist, err) = read(&mut file);
                        display(pad, label, dist, err);
                    }
                    Err(err) => eprintln!("{}:\t{}", label, err),
                }
            }
        }
        None => {
            let input = stdin();
            let (dist, err) = read(&mut input.lock());
            display(0, Cow::Borrowed("stdin"), dist, err);
        }
    }
}