File: ingest-tar.rs

package info (click to toggle)
rust-rust-unixfs 0.4.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 652 kB
  • sloc: sh: 17; makefile: 2
file content (148 lines) | stat: -rw-r--r-- 5,185 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use multihash::Multihash;

/// Registers the `ingest-tar` benchmark with criterion.
///
/// The whole of `benchmark.tar` is read into memory up front so that the measured closure only
/// covers [`ingest_tar`] itself. When the archive does not exist a hint is printed to stderr and
/// no benchmark is registered.
///
/// # Panics
///
/// Panics if reading the archive fails for any reason other than the file being absent.
pub fn criterion_benchmark(c: &mut Criterion) {
    let file = "benchmark.tar";

    let tar_bytes = match std::fs::read(file) {
        Ok(contents) => contents,
        Err(e) => {
            // Only a missing archive is tolerated; every other I/O failure is fatal.
            if e.kind() != std::io::ErrorKind::NotFound {
                panic!("failed to read the {file:?}: {e}");
            }

            eprintln!("could not find {file:?}:");
            eprintln!("please download a linux kernel and unpack it to enable benchmark. specific version doesn't matter.");
            return;
        }
    };

    // Both scratch buffers start out empty; the warmup iterations grow them to their working
    // size before any measurement is taken.
    let mut buffer = Vec::new();
    let mut path = String::new();

    c.bench_function("ingest-tar", |bencher| {
        bencher.iter(|| ingest_tar(&tar_bytes, &mut buffer, &mut path))
    });
}

/// Ingests an in-memory tar archive into a UnixFS directory tree.
///
/// Each archive entry is handled according to its kind:
///
/// * entries carrying a link name are serialized as a symlink block, hashed with SHA2-256 and
///   linked into the tree under a CIDv0,
/// * entries whose path ends with `/` only get default metadata recorded for the directory,
/// * everything else is streamed through a [`FileAdder`] and linked under the last `Cid` it
///   produced, together with the summed length of all produced blocks.
///
/// The produced blocks are never stored anywhere; only their lengths and `Cid`s are used. Once
/// all entries are processed the tree is built and the `(cid, total_size, block length)` of the
/// final tree node is passed to `black_box` so the work cannot be optimized away.
///
/// # Parameters
///
/// * `bytes` - the raw bytes of the tar archive.
/// * `buffer` - scratch space for symlink blocks and file reads; it is grown as needed and its
///   contents are overwritten.
/// * `path` - scratch string that holds the path of the entry currently being processed.
///
/// # Panics
///
/// This is benchmark code and panics on anything unexpected: an unreadable archive or entry, a
/// path or link target that is not valid UTF-8, a rejected tree operation, or an archive which
/// yields no tree nodes at all.
fn ingest_tar(bytes: &[u8], buffer: &mut Vec<u8>, path: &mut String) {
    // NOTE(review): this bare import presumably lost its crate prefix at some point; confirm
    // where `Cid` is meant to come from.
    use Cid;
    use rust_unixfs::dir::builder::{BufferingTreeBuilder, TreeOptions};
    use rust_unixfs::file::adder::FileAdder;
    use sha2::{Digest, Sha256};
    use std::io::Read;

    let mut archive = tar::Archive::new(std::io::Cursor::new(bytes));
    let entries = archive.entries().unwrap();

    let mut opts = TreeOptions::default();
    opts.wrap_with_directory();
    let mut tree = BufferingTreeBuilder::new(opts);

    for entry in entries {
        let mut entry = entry.expect("assuming good tar");

        // Copy the entry path into the reusable string instead of allocating one per entry.
        let path_bytes = entry.path_bytes();
        let tmp_path = std::str::from_utf8(&path_bytes).unwrap();
        path.clear();
        path.push_str(tmp_path);

        if let Some(link_name) = entry.link_name_bytes() {
            let link_name =
                std::str::from_utf8(&link_name).expect("symlink targets should be utf8");

            buffer.clear();
            rust_unixfs::symlink::serialize_symlink_block(link_name, buffer);

            let len = buffer.len();

            let mh = Multihash::wrap(
                multihash_codetable::Code::Sha2_256.into(),
                &Sha256::digest(&buffer),
            )
            .unwrap();
            let cid = Cid::new_v0(mh).expect("sha2_256 is the correct multihash for cidv0");

            tree.put_link(path, cid, len as u64).unwrap();

            // A real ingester would persist `&buffer[..]` (the symlink block) at this point.

            continue;
        }

        if let Some(dir) = path.strip_suffix('/') {
            // Directory entry: nothing to hash, only record metadata under the path without
            // its trailing slash.
            tree.set_metadata(dir, rust_unixfs::Metadata::default())
                .unwrap();
            continue;
        }

        // Regular file from here on.
        //
        // TODO: reuse the adder between files.
        //
        // With `std::io::Read` it would be nicer to read straight into the `FileAdder`, but
        // reaching into its internal buffer does not seem to be the way to go, so the shared
        // scratch buffer is used as the read target instead.
        let mut adder = FileAdder::default();

        // Make the buffer at least as long as the adder's preferred size. It is never shrunk,
        // and bytes left over from an earlier entry are simply overwritten by the reads below.
        let wanted = adder.size_hint();
        if buffer.len() < wanted {
            buffer.resize(wanted, 0);
        }

        let mut total_written = 0usize;

        loop {
            match entry.read(&mut buffer[0..]).unwrap() {
                0 => {
                    // End of the entry: flush the adder. The last block it yields provides the
                    // `Cid` the file is linked under.
                    let (cid, subtotal) = adder
                        .finish()
                        .fold(
                            None,
                            |acc: Option<(Cid, usize)>, (cid, bytes): (Cid, Vec<u8>)| {
                                let so_far = acc.map_or(0, |(_, total)| total);
                                Some((cid, so_far + bytes.len()))
                            },
                        )
                        .expect("this is probably always present");

                    total_written += subtotal;

                    tree.put_link(path, cid, total_written as u64).unwrap();
                    break;
                }
                n => {
                    // `push` reports how many bytes it consumed, which may be fewer than
                    // offered, so keep feeding until all `n` freshly read bytes are taken.
                    let mut read = 0;
                    while read < n {
                        let (blocks, consumed) = adder.push(&buffer[read..n]);
                        read += consumed;
                        total_written += blocks.map(|(_, bytes)| bytes.len()).sum::<usize>();
                    }
                }
            }
        }
    }

    let mut iter = tree.build();

    // Walk every node the tree builder yields and remember only the final one.
    let mut last: Option<(Cid, u64, usize)> = None;

    while let Some(res) = iter.next_borrowed() {
        let res = res.unwrap();
        last = Some((res.cid.to_owned(), res.total_size, res.block.len()));
    }

    let last = last.unwrap();

    black_box(last);
}

// Put `criterion_benchmark` into a benchmark group named `benches` and hand that group to
// `criterion_main!`, which supplies the entry point of the benchmark binary.
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);