File: lib.rs

package info (click to toggle)
chromium 145.0.7632.159-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 5,976,224 kB
  • sloc: cpp: 36,198,469; ansic: 7,634,080; javascript: 3,564,060; python: 1,649,622; xml: 838,470; asm: 717,087; pascal: 185,708; sh: 88,786; perl: 88,718; objc: 79,984; sql: 59,811; cs: 42,452; fortran: 24,101; makefile: 21,144; tcl: 15,277; php: 14,022; yacc: 9,066; ruby: 7,553; awk: 3,720; lisp: 3,233; lex: 1,328; ada: 727; jsp: 228; sed: 36
file content (248 lines) | stat: -rw-r--r-- 10,767 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#![feature(portable_simd)]

// Modules public for testing, don't expect stable API.
mod cxx;
pub mod decoder;
pub mod dither;
pub mod quant;
pub mod selectors;

use std::simd::prelude::*;
use std::simd::Simd;

use bytemuck::cast_slice;
use bytemuck::cast_slice_mut;

use crate::decoder::decode_etc1_block;
use crate::dither::dither;
use crate::quant::{quantize_averages, QuantResult};
use crate::selectors::search_table_and_selectors;

// We primarily compute with 16-bit integers and a width of 8 fills a 128-bit
// wide lane (SSE, NEON). TODO(b/393494744): When we introduce multiversioning
// and support for AVX2 etc. this should be converted to a template parameter
// that varies based on the target architecture.
const SIMD_WIDTH: usize = 8;
// Lane counts for vectors whose elements are twice (32-bit) and four times
// (64-bit) as wide as the base 16-bit element; the total register size stays
// constant across all three widths.
const HALF_WIDTH: usize = SIMD_WIDTH / 2;
const QUARTER_WIDTH: usize = SIMD_WIDTH / 4;
// Working vector types: signed 16-bit, signed 32-bit, and unsigned 16-bit
// lanes, all spanning the same register width.
type Reg = Simd<i16, SIMD_WIDTH>;
type Reg32 = Simd<i32, SIMD_WIDTH>;
type UReg = Simd<u16, SIMD_WIDTH>;

// Each compressed ETC1 block is one 64-bit (8-byte) codeword.
const ETC1_BLOCK_BYTES: usize = 8;

/// Define a helper to interleave elements from two vectors, reinterpret
/// it as a type twice as large, and return the resulting vector.
/// Each argument / return value is an array of vectors; conceptually, this
/// represents a vector that is <width> * <len> large; however, since std::simd
/// types have upper limits on their width we represent them using arrays to be
/// portable.
macro_rules! define_interleave {
    ($fn_name:ident, $src_ty:ty, $dst_ty:ty, $src_width:expr, $dst_width:expr, $src_len:literal) => {
        fn $fn_name(
            a: [Simd<$src_ty, $src_width>; $src_len],
            b: [Simd<$src_ty, $src_width>; $src_len],
        ) -> [Simd<$dst_ty, $dst_width>; $src_len * 2] {
            // Interleave pairwise (a[i], b[i]) so that element i of `a`
            // precedes element i of `b`, then bitcast each half: doubling the
            // element width halves the lane count, keeping total size equal.
            let mut iter = (0..$src_len).flat_map(|i| {
                let (a, b) = a[i].interleave(b[i]);
                [a, b].map(|x| bytemuck::cast(x))
            });
            // Drain exactly `$src_len * 2` vectors into the output array; the
            // trailing assert catches a width/length mismatch at runtime.
            let res = std::array::from_fn(|_| iter.next().unwrap());
            assert!(iter.next().is_none());
            res
        }
    };
}

/// Convert individual codewords laid out as [15..0, 31..16, 47..32, 63..48]
/// into interleaved u64 arrays, while flipping the endianness (our internal
/// representation is little endian while ETC1 requires big endian).
#[inline]
pub fn interleave_etc1(regs: [UReg; 4]) -> [Simd<u64, QUARTER_WIDTH>; 4] {
    // The interleaving assumes little endian.
    #[cfg(target_endian = "big")]
    compile_error!("Big endian is not supported");

    define_interleave!(conv_16_to_32, u16, u32, SIMD_WIDTH, HALF_WIDTH, 1);
    define_interleave!(conv_32_to_64, u32, u64, HALF_WIDTH, QUARTER_WIDTH, 2);
    // Step 1: make each u16 codeword big-endian
    let regs = regs.map(|r| r.swap_bytes());
    // Step 2: [aaaa, bbbb] to [baba, baba]
    // The higher-order half goes first in each interleave: on a little-endian
    // target the first interleaved element occupies the low bytes, which are
    // written to memory first — exactly where big-endian output wants the
    // most significant bits.
    let regs = [conv_16_to_32([regs[1]], [regs[0]]), conv_16_to_32([regs[3]], [regs[2]])];
    // Step 3: [baba, baba], [dcdc, dcdc] to [dcba, dcba], [dcba, dcba]
    let regs = conv_32_to_64(regs[1], regs[0]);
    regs
}

/// Load `SIMD_WIDTH` blocks from a region `4*SIMD_WIDTH` wide and `4` tall,
/// starting at `base_x` and `base_y`.
///
/// Out of bounds pixels are padded with mirroring. For example, `abcdxy`
/// becomes `abcdxyyx`.
///
/// Returns a 3D array of SIMD vectors. Each block is mapped to a SIMD lane
/// (from left to right), and each pixel in the block is accessed as
/// `[y][x][channel]`.
#[inline]
pub fn load_input_block(
    src: &[u32],
    width: u32,
    height: u32,
    row_width: u32,
    base_x: u32,
    base_y: u32,
) -> [[[Reg; 3]; 4]; 4] {
    // For now, input load and output store are not vectorized. The main reason is
    // that efficient loading requires shuffling and is poorly supported
    // by std::simd and the wide crate (which we plan to use for
    // supporting stable toolchain). Input load currently accounts for
    // ~20% of the runtime. If shuffle support improves this would be a
    // good candidate for optimization.
    let mut data = [[[Reg::default(); 3]; 4]; 4];
    for row in 0..4 {
        let y = base_y + row as u32;
        // Mirror out-of-bound rows. If height is 1 the mirror index can
        // underflow, so the subtraction saturates. In-bound rows map to
        // themselves.
        let ym = if y < height { y } else { (height - 1).saturating_sub(y - height) };
        for col in 0..4 {
            let mut pixels = [0u32; SIMD_WIDTH];
            for (lane, slot) in pixels.iter_mut().enumerate() {
                let x = base_x + (lane * 4 + col) as u32;
                // Mirror out-of-bound columns, saturating for width == 1.
                let xm = if x < width { x } else { (width - 1).saturating_sub(x - width) };
                *slot = src[(ym * row_width + xm) as usize];
            }
            // Split packed RGBX pixels into per-channel 16-bit vectors.
            let packed = Simd::from_array(pixels);
            let channel = |shift: u32| (packed >> shift).cast::<i16>() & Simd::splat(0xFF);
            data[row][col] = [channel(0), channel(8), channel(16)];
        }
    }
    data
}

/// Compress RGB pixels to ETC1.
///
/// - `src` should be in RGBA format (the least significant byte is red).
/// - `dst` will be filled with compressed ETC1 blocks.
/// - `src_width` and `src_height` specifies the logical size of the image in
///   pixels. These does not need to be multiple of 4. The boundary pixels will
///   be padded with unspecified values.
/// - `src_row_width` and `dst_row_width` specifies the in-memory length of each
///   row, in pixels and blocks, respectively.
///
/// Note that `src` takes an aligned 32-bit buffer while `dst` takes a byte
/// buffer, even though each ETC1 codeword is 64-bit. This is due to two
/// reasons:
/// - 32-bit alignment is practical to get even on 32-bit platforms, whereas
///   64-bit values are not aligned to 8 bytes on 32-bit ARM.
/// - We require extensive shuffling when loading inputs, but store to the
///   output straight in the order of blocks. Dealing with unaligned buffers in
///   the latter case is significantly easier.
pub fn compress_etc1(
    src: &[u32],
    dst: &mut [u8],
    src_width: u32,
    src_height: u32,
    src_row_width: u32,
    dst_row_width: u32,
) {
    // Note: We deliberately do not declare the block size (4x4) of ETC1 as a
    //       constant. While magic constants in general are discouraged, the
    //       block size appears way too frequent that naming it would make the
    //       code verbose and less readable.
    let blocks_wide = src_width.div_ceil(4);
    let blocks_tall = src_height.div_ceil(4);
    // Aligned staging buffer; one SIMD chunk covers `SIMD_WIDTH` blocks. Data
    // is copied into the potentially unaligned destination at the end of each
    // row.
    let mut staging = vec![[Simd::splat(0); 4]; (blocks_wide as usize).div_ceil(SIMD_WIDTH)];
    let row_bytes = blocks_wide as usize * ETC1_BLOCK_BYTES;
    // Note on vectorization scheme:
    //
    // We process one 4x4 block per SIMD lane, instead of the more common practice
    // of processing pixels within the same block in parallel using multiple
    // lanes. The one-block-per-lane scheme, more akin to SPMD programming,
    // allows most of our code to be shuffle-free, and works much better with
    // portable SIMD than schemes that heavily shuffles.
    for block_y in 0..blocks_tall {
        for (chunk_idx, chunk) in staging.iter_mut().enumerate() {
            let block_x = (chunk_idx * SIMD_WIDTH) as u32;
            let pixels = load_input_block(
                src,
                src_width,
                src_height,
                src_row_width,
                block_x * 4,
                block_y * 4,
            );
            let pixels = dither(&pixels);
            let QuantResult { lo: hdr0, hi: hdr1, scaled0: ep0, scaled1: ep1 } =
                quantize_averages(&pixels);
            let best_fit = search_table_and_selectors(hdr0, hdr1, &pixels, [ep0, ep1]);
            *chunk = interleave_etc1(best_fit);
        }
        // Flush the aligned staging row into the (byte-addressed) output.
        let out_offset = (block_y * dst_row_width) as usize * ETC1_BLOCK_BYTES;
        let staging_bytes: &[u8] = cast_slice(staging.as_slice());
        dst[out_offset..out_offset + row_bytes].copy_from_slice(&staging_bytes[..row_bytes]);
    }
}

/// Decompress ETC1 blocks to RGBA pixels.
///
/// - `src` should hold ETC1 blocks (64-bit big-endian codewords).
/// - `dst` will be filled with RGBA pixels.
/// - `dst_width` and `dst_height` should be the dimensions of `dst` in
///   pixels. If they are not multiples of 4, the edges become partial blocks
///   and block pixels falling out of bounds are discarded.
/// - `src_row_width` should be the in-memory row length of the ETC1 image in
///   blocks; `dst_row_width` should be the in-memory row length of the RGBA
///   image in pixels.
pub fn decompress_etc1(
    src: &[u8],
    dst: &mut [u32],
    dst_width: u32,
    dst_height: u32,
    src_row_width: u32,
    dst_row_width: u32,
) {
    // We access `src` as an array of u64s, but `src` is not always 8-byte
    // aligned because of constraints at the callsite (b/464139989). To solve
    // the alignment issue, we copy the data from `src` into a temporary
    // buffer that is guaranteed to be 8-byte aligned. To balance between
    // copying overhead and memory overhead, we copy one row at a time.
    let mut staging_row_u64 = vec![0u64; src_row_width as usize];
    let bytes_per_row = src_row_width as usize * ETC1_BLOCK_BYTES;
    for y in (0..dst_height).step_by(4) {
        let src_y = (y / 4) as usize;
        let row_start = src_y * bytes_per_row;
        // The staging buffer's byte view is exactly `bytes_per_row` long, so
        // the whole row is copied in one shot.
        let staging_row_bytes: &mut [u8] = cast_slice_mut(&mut staging_row_u64);
        staging_row_bytes.copy_from_slice(&src[row_start..row_start + bytes_per_row]);

        for x in (0..dst_width).step_by(4) {
            // The ETC1 specification ("Khronos Data Format Specification v1.1 rev 9")
            // defines the 64-bit block data as big endian.
            let src_x = (x / 4) as usize;
            let output_rgba_block = decode_etc1_block(u64::from_be(staging_row_u64[src_x]));
            // Scatter the decoded 4x4 block into `dst`, dropping pixels of
            // partial edge blocks that fall outside the image.
            for y_in_block in 0..4 {
                for x_in_block in 0..4 {
                    let dst_x = x + x_in_block;
                    let dst_y = y + y_in_block;

                    if dst_y < dst_height && dst_x < dst_width {
                        let dst_idx = (dst_y * dst_row_width + dst_x) as usize;

                        dst[dst_idx] =
                            output_rgba_block[y_in_block as usize][x_in_block as usize];
                    }
                }
            }
        }
    }
}