File: lib.rs

// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#![feature(portable_simd)]

// Modules are public for testing; don't expect a stable API.
mod cxx;
pub mod dither;
pub mod quant;
pub mod selectors;

use std::simd::prelude::*;
use std::simd::Simd;

use bytemuck::cast_slice;

use crate::dither::dither;
use crate::quant::{quantize_averages, QuantResult};
use crate::selectors::search_table_and_selectors;

// We primarily compute with 16-bit integers and a width of 8 fills a 128-bit
// wide lane (SSE, NEON). TODO(b/393494744): When we introduce multiversioning
// and support for AVX2 etc. this should be converted to a template parameter
// that varies based on the target architecture.
const SIMD_WIDTH: usize = 8;
const HALF_WIDTH: usize = SIMD_WIDTH / 2;
const QUARTER_WIDTH: usize = SIMD_WIDTH / 4;
type Reg = Simd<i16, SIMD_WIDTH>;
type Reg32 = Simd<i32, SIMD_WIDTH>;
type UReg = Simd<u16, SIMD_WIDTH>;

/// Defines a helper that interleaves the elements of two vectors, reinterprets
/// the result as an element type twice as wide, and returns the resulting
/// vector. Each argument and return value is an array of vectors; conceptually
/// this represents a single vector that is <width> * <len> elements large.
/// However, since std::simd types have an upper limit on their width, we
/// represent them as arrays to stay portable.
macro_rules! define_interleave {
    ($fn_name:ident, $src_ty:ty, $dst_ty:ty, $src_width:expr, $dst_width:expr, $src_len:literal) => {
        fn $fn_name(
            a: [Simd<$src_ty, $src_width>; $src_len],
            b: [Simd<$src_ty, $src_width>; $src_len],
        ) -> [Simd<$dst_ty, $dst_width>; $src_len * 2] {
            let mut iter = (0..$src_len).flat_map(|i| {
                let (a, b) = a[i].interleave(b[i]);
                [a, b].map(|x| bytemuck::cast(x))
            });
            let res = std::array::from_fn(|_| iter.next().unwrap());
            assert!(iter.next().is_none());
            res
        }
    };
}
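
// A minimal sketch (test-only, not part of the library surface) of the
// interleave-and-widen step performed by the macro above. The 4-lane width is
// chosen purely for readability; the real instantiations use SIMD_WIDTH and
// HALF_WIDTH. Like the rest of this module, the sketch assumes a
// little-endian target.
#[cfg(test)]
mod interleave_sketch {
    use std::simd::Simd;

    #[test]
    fn interleave_widens_pairs() {
        // Interleaving [a0, a1, ...] with [b0, b1, ...] yields
        // [a0, b0, a1, b1, ...]; each adjacent (a, b) pair is then
        // reinterpreted as a single element twice as wide.
        let a = Simd::<u16, 4>::from_array([0x0001, 0x0002, 0x0003, 0x0004]);
        let b = Simd::<u16, 4>::from_array([0x0010, 0x0020, 0x0030, 0x0040]);
        let (lo, hi) = a.interleave(b);
        assert_eq!(lo.to_array(), [0x0001, 0x0010, 0x0002, 0x0020]);
        assert_eq!(hi.to_array(), [0x0003, 0x0030, 0x0004, 0x0040]);
        // On little endian, the first u16 of each pair lands in the low half
        // of the widened u32.
        let lo32: Simd<u32, 2> = bytemuck::cast(lo);
        assert_eq!(lo32.to_array(), [0x0010_0001, 0x0020_0002]);
    }
}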

/// Convert individual codewords laid out as [15..0, 31..16, 47..32, 63..48]
/// into interleaved u64 arrays, while flipping the endianness (our internal
/// representation is little endian while ETC1 requires big endian).
#[inline]
pub fn interleave_etc1(regs: [UReg; 4]) -> [Simd<u64, QUARTER_WIDTH>; 4] {
    // The interleaving assumes little endian.
    #[cfg(target_endian = "big")]
    compile_error!("Big endian is not supported");

    define_interleave!(conv_16_to_32, u16, u32, SIMD_WIDTH, HALF_WIDTH, 1);
    define_interleave!(conv_32_to_64, u32, u64, HALF_WIDTH, QUARTER_WIDTH, 2);
    // Step 1: make each u16 codeword big-endian
    let regs = regs.map(|r| r.swap_bytes());
    // Step 2: [aaaa, bbbb] to [baba, baba]
    let regs = [conv_16_to_32([regs[1]], [regs[0]]), conv_16_to_32([regs[3]], [regs[2]])];
    // Step 3: [baba, baba], [dcdc, dcdc] to [dcba, dcba], [dcba, dcba]
    let regs = conv_32_to_64(regs[1], regs[0]);
    regs
}
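
// A test-only sketch of the byte-order guarantee documented above: feeding a
// single block's codeword 0x0123_4567_89AB_CDEF through `interleave_etc1`
// (split into the [15..0, 31..16, 47..32, 63..48] layout) should yield a u64
// whose in-memory bytes are the big-endian encoding ETC1 expects. The codeword
// value itself is an arbitrary illustrative choice.
#[cfg(test)]
mod interleave_etc1_sketch {
    use super::{interleave_etc1, UReg};

    #[test]
    fn block0_becomes_big_endian_bytes() {
        // Block 0 occupies lane 0 of each input register; the remaining lanes
        // (blocks 1..7) stay zero.
        let mut regs = [UReg::splat(0); 4];
        regs[0].as_mut_array()[0] = 0xCDEF; // bits 15..0
        regs[1].as_mut_array()[0] = 0x89AB; // bits 31..16
        regs[2].as_mut_array()[0] = 0x4567; // bits 47..32
        regs[3].as_mut_array()[0] = 0x0123; // bits 63..48
        let out = interleave_etc1(regs);
        // Lane 0 of the first output vector holds block 0; its little-endian
        // byte order equals the big-endian encoding of the codeword.
        assert_eq!(
            out[0].to_array()[0].to_le_bytes(),
            [0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF]
        );
    }
}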

/// Load `SIMD_WIDTH` blocks from a region `4*SIMD_WIDTH` wide and `4` tall,
/// starting at `base_x` and `base_y`.
///
/// Out-of-bounds pixels are padded by mirroring. For example, `abcdxy`
/// becomes `abcdxyyx`.
///
/// Returns a 3D array of SIMD vectors. Each block is mapped to a SIMD lane
/// (from left to right), and each pixel in the block is accessed as
/// `[y][x][channel]`.
#[inline]
pub fn load_input_block(
    src: &[u32],
    width: u32,
    height: u32,
    row_width: u32,
    base_x: u32,
    base_y: u32,
) -> [[[Reg; 3]; 4]; 4] {
    let mut data = [[[Reg::default(); 3]; 4]; 4];
    // For now, the input load and output store are not vectorized. The main
    // reason is that efficient loading requires shuffling, which is poorly
    // supported by std::simd and the wide crate (which we plan to use to
    // support the stable toolchain). The input load currently accounts for
    // ~20% of the runtime. If shuffle support improves, this would be a
    // good candidate for optimization.
    for i in 0..4 {
        for j in 0..4 {
            let mut buf = [0u32; SIMD_WIDTH];
            for block in 0..SIMD_WIDTH as u32 {
                let x = base_x + block * 4 + j as u32;
                let y = base_y + i as u32;
                buf[block as usize] = if x < width && y < height {
                    // Fast path: load in-bound pixel
                    src[(y * row_width + x) as usize]
                } else {
                    // Slow path: mirror out-of-bound pixels
                    // If width or height is 1, mirroring can overflow, so make it saturate.
                    let xm = if x >= width { (width - 1).saturating_sub(x - width) } else { x };
                    let ym = if y >= height { (height - 1).saturating_sub(y - height) } else { y };
                    src[(ym * row_width + xm) as usize]
                };
            }
            let rgbx = Simd::from_array(buf);
            let extract_channel = |x: Simd<u32, SIMD_WIDTH>, shift: u32| {
                (x >> shift).cast::<i16>() & Simd::splat(0xFF)
            };
            data[i][j][0] = extract_channel(rgbx, 0);
            data[i][j][1] = extract_channel(rgbx, 8);
            data[i][j][2] = extract_channel(rgbx, 16);
        }
    }
    data
}
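
// A small, test-only sketch of the mirrored-padding rule described above,
// factored into a standalone helper for clarity; the real code inlines this
// computation per coordinate. The `mirror` helper name is purely
// illustrative.
#[cfg(test)]
mod mirror_sketch {
    // Reflect an out-of-bounds index back into [0, len), saturating so that
    // len == 1 cannot underflow.
    fn mirror(i: u32, len: u32) -> u32 {
        if i < len {
            i
        } else {
            (len - 1).saturating_sub(i - len)
        }
    }

    #[test]
    fn mirrors_past_the_edge() {
        // For a row `abcdxy` (len 6), indices 6 and 7 map to 5 and 4,
        // producing the padded row `abcdxyyx` from the doc comment.
        assert_eq!(mirror(3, 6), 3); // in bounds: unchanged
        assert_eq!(mirror(6, 6), 5); // first padded pixel mirrors the edge
        assert_eq!(mirror(7, 6), 4); // then walks back inward
        assert_eq!(mirror(2, 1), 0); // len 1: saturate instead of underflow
    }
}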

/// Compress RGB pixels to ETC1.
///
/// `src` should be in RGBA.
/// `dst` will be filled with compressed ETC1 blocks.
/// `width` and `height` do not need to be multiples of 4; boundary blocks are
/// padded with unspecified pixel values.
/// `src_row_width` and `dst_row_width` specify the strides, in units of pixels
/// and blocks, respectively.
///
/// Note that `src` is assumed to be an aligned 32-bit buffer while `dst` has
/// no alignment requirement. This is for two reasons: 32-bit alignment is
/// practical to obtain even on 32-bit platforms, whereas 64-bit alignment does
/// not hold on 32-bit ARM. Additionally, loading the input requires extensive
/// shuffling, but the output is stored straight in pixel order; dealing with
/// an unaligned buffer is significantly easier in the latter case.
pub fn compress_etc1(
    src: &[u32],
    dst: &mut [u8],
    width: u32,
    height: u32,
    src_row_width: u32,
    dst_row_width: u32,
) {
    let dst_height = height.div_ceil(4);
    let dst_width = width.div_ceil(4);
    // Aligned staging buffer. Data is copied into the potentially unaligned
    // destination buffer at the end of each row.
    let mut staging_row = vec![[Simd::splat(0); 4]; (dst_width as usize).div_ceil(SIMD_WIDTH)];
    let copy_len = dst_width as usize * 8;
    // Note on the vectorization scheme:
    //
    // We process one 4x4 block per SIMD lane, instead of the more common
    // practice of processing the pixels within a single block in parallel
    // across lanes. The one-block-per-lane scheme, more akin to SPMD
    // programming, keeps most of our code shuffle-free and works much better
    // with portable SIMD than schemes that shuffle heavily.
    for dst_y in 0..dst_height {
        for dst_x0 in (0..dst_width).step_by(SIMD_WIDTH) {
            let data = load_input_block(src, width, height, src_row_width, dst_x0 * 4, dst_y * 4);

            let data = dither(&data);
            let QuantResult { lo: hdr0, hi: hdr1, scaled0: ep0, scaled1: ep1 } =
                quantize_averages(&data);
            let best_fit = search_table_and_selectors(hdr0, hdr1, &data, [ep0, ep1]);
            let codewords = interleave_etc1(best_fit);
            staging_row[dst_x0 as usize / SIMD_WIDTH] = codewords;
        }
        let dst_row = &mut dst[(dst_y * dst_row_width * 8) as usize..];
        let staging_row_bytes = cast_slice(&*staging_row);
        dst_row[..copy_len].copy_from_slice(&staging_row_bytes[..copy_len]);
    }
}
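
// A minimal usage sketch (test-only): compressing a small 7x5 solid-color
// RGBA image with no extra row padding. The dimensions, pixel value, and
// stride choices are illustrative assumptions; the key points are that each
// 4x4 block produces 8 output bytes and that odd sizes are handled by the
// mirrored padding above.
#[cfg(test)]
mod compress_sketch {
    use super::compress_etc1;

    #[test]
    fn compresses_odd_sized_image() {
        let (width, height) = (7u32, 5u32);
        // One u32 per pixel with R in the low byte, as read by
        // `load_input_block`; the source stride equals the image width.
        let src = vec![0xFF80_4020u32; (width * height) as usize];
        // ceil(7/4) x ceil(5/4) = 2 x 2 blocks, 8 bytes per block.
        let (blocks_x, blocks_y) = (width.div_ceil(4), height.div_ceil(4));
        let mut dst = vec![0u8; (blocks_x * blocks_y * 8) as usize];
        // Smoke test: the call should fill `dst` without panicking.
        compress_etc1(&src, &mut dst, width, height, width, blocks_x);
    }
}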