// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#![feature(portable_simd)]
// Modules are public for testing; do not expect a stable API.
mod cxx;
pub mod dither;
pub mod quant;
pub mod selectors;
use std::simd::prelude::*;
use std::simd::Simd;
use bytemuck::cast_slice;
use crate::dither::dither;
use crate::quant::{quantize_averages, QuantResult};
use crate::selectors::search_table_and_selectors;
// We primarily compute with 16-bit integers, and a width of 8 fills a 128-bit
// vector register (SSE, NEON). TODO(b/393494744): When we introduce
// multiversioning and support for AVX2 etc., this should be converted to a
// template parameter that varies based on the target architecture.
const SIMD_WIDTH: usize = 8;
const HALF_WIDTH: usize = SIMD_WIDTH / 2;
const QUARTER_WIDTH: usize = SIMD_WIDTH / 4;
type Reg = Simd<i16, SIMD_WIDTH>;
type Reg32 = Simd<i32, SIMD_WIDTH>;
type UReg = Simd<u16, SIMD_WIDTH>;
/// Define a helper that interleaves elements from two vectors, reinterprets
/// the result as a type twice as wide, and returns the resulting vector.
/// Each argument / return value is an array of vectors; conceptually, this
/// represents a vector that is <width> * <len> elements large; however, since
/// std::simd types have upper limits on their width, we represent them using
/// arrays to stay portable.
macro_rules! define_interleave {
($fn_name:ident, $src_ty:ty, $dst_ty:ty, $src_width:expr, $dst_width:expr, $src_len:literal) => {
fn $fn_name(
a: [Simd<$src_ty, $src_width>; $src_len],
b: [Simd<$src_ty, $src_width>; $src_len],
) -> [Simd<$dst_ty, $dst_width>; $src_len * 2] {
let mut iter = (0..$src_len).flat_map(|i| {
let (a, b) = a[i].interleave(b[i]);
[a, b].map(|x| bytemuck::cast(x))
});
let res = std::array::from_fn(|_| iter.next().unwrap());
assert!(iter.next().is_none());
res
}
};
}
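// A minimal sketch of the interleave-and-widen pattern produced by
// `define_interleave`, assuming a little-endian target (asserted in
// `interleave_etc1`): each widened lane holds an element of `a` in its low
// half and the matching element of `b` in its high half.
#[cfg(test)]
mod define_interleave_sketch {
    use super::*;

    #[test]
    fn widens_pairs_of_u16_lanes_into_u32() {
        define_interleave!(conv_16_to_32, u16, u32, SIMD_WIDTH, HALF_WIDTH, 1);
        let a = Simd::from_array([1u16, 2, 3, 4, 5, 6, 7, 8]);
        let b = Simd::from_array([0x10u16, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80]);
        let [lo, hi] = conv_16_to_32([a], [b]);
        // On little endian, lane i of `lo` is (b[i] << 16) | a[i].
        assert_eq!(lo.to_array(), [0x0010_0001, 0x0020_0002, 0x0030_0003, 0x0040_0004]);
        assert_eq!(hi.to_array(), [0x0050_0005, 0x0060_0006, 0x0070_0007, 0x0080_0008]);
    }
}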
/// Convert individual codewords laid out as [15..0, 31..16, 47..32, 63..48]
/// into interleaved u64 arrays, while flipping the endianness (our internal
/// representation is little endian while ETC1 requires big endian).
#[inline]
pub fn interleave_etc1(regs: [UReg; 4]) -> [Simd<u64, QUARTER_WIDTH>; 4] {
// The interleaving assumes little endian.
#[cfg(target_endian = "big")]
compile_error!("Big endian is not supported");
define_interleave!(conv_16_to_32, u16, u32, SIMD_WIDTH, HALF_WIDTH, 1);
define_interleave!(conv_32_to_64, u32, u64, HALF_WIDTH, QUARTER_WIDTH, 2);
// Step 1: make each u16 codeword big-endian
let regs = regs.map(|r| r.swap_bytes());
// Step 2: [aaaa, bbbb] to [baba, baba]
let regs = [conv_16_to_32([regs[1]], [regs[0]]), conv_16_to_32([regs[3]], [regs[2]])];
// Step 3: [baba, baba], [dcdc, dcdc] to [dcba, dcba], [dcba, dcba]
let regs = conv_32_to_64(regs[1], regs[0]);
regs
}
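// A minimal sketch checking that writing a lane of the interleaved output to
// memory (little endian) yields the big-endian byte order ETC1 expects. Every
// lane is given the same codeword so the check does not depend on the
// block-to-lane mapping.
#[cfg(test)]
mod interleave_etc1_sketch {
    use super::*;

    #[test]
    fn codewords_are_emitted_big_endian() {
        // Quarters of the 64-bit codeword 0xEFCD_AB89_6745_2301, replicated
        // across every lane (block).
        let regs = [
            UReg::splat(0x2301), // bits 15..0
            UReg::splat(0x6745), // bits 31..16
            UReg::splat(0xAB89), // bits 47..32
            UReg::splat(0xEFCD), // bits 63..48
        ];
        let out = interleave_etc1(regs);
        assert_eq!(
            out[0].to_array()[0].to_le_bytes(),
            0xEFCD_AB89_6745_2301u64.to_be_bytes()
        );
    }
}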
/// Load `SIMD_WIDTH` blocks from a region `4*SIMD_WIDTH` wide and `4` tall,
/// starting at `base_x` and `base_y`.
///
/// Out of bounds pixels are padded with mirroring. For example, `abcdxy`
/// becomes `abcdxyyx`.
///
/// Returns a 3D array of SIMD vectors. Each block is mapped to a SIMD lane
/// (from left to right), and each pixel in the block is accessed as
/// `[y][x][channel]`.
#[inline]
pub fn load_input_block(
src: &[u32],
width: u32,
height: u32,
row_width: u32,
base_x: u32,
base_y: u32,
) -> [[[Reg; 3]; 4]; 4] {
let mut data = [[[Reg::default(); 3]; 4]; 4];
// For now, the input load and output store are not vectorized. The main
// reason is that efficient loading requires shuffling, which is poorly
// supported by std::simd and the wide crate (which we plan to use to
// support the stable toolchain). The input load currently accounts for
// ~20% of the runtime; if shuffle support improves, this would be a good
// candidate for optimization.
for i in 0..4 {
for j in 0..4 {
let mut buf = [0u32; SIMD_WIDTH];
for block in 0..SIMD_WIDTH as u32 {
let x = base_x + block * 4 + j as u32;
let y = base_y + i as u32;
buf[block as usize] = if x < width && y < height {
// Fast path: load an in-bounds pixel
src[(y * row_width + x) as usize]
} else {
// Slow path: mirror out-of-bounds pixels
// If width or height is 1, mirroring can overflow, so make it saturate.
let xm = if x >= width { (width - 1).saturating_sub(x - width) } else { x };
let ym = if y >= height { (height - 1).saturating_sub(y - height) } else { y };
src[(ym * row_width + xm) as usize]
};
}
let rgbx = Simd::from_array(buf);
let extract_channel = |x: Simd<u32, SIMD_WIDTH>, shift: u32| {
(x >> shift).cast::<i16>() & Simd::splat(0xFF)
};
data[i][j][0] = extract_channel(rgbx, 0);
data[i][j][1] = extract_channel(rgbx, 8);
data[i][j][2] = extract_channel(rgbx, 16);
}
}
data
}
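// A minimal sketch of the mirror padding described above: on a 6-pixel-wide
// image the row `abcdxy` is read back as `abcd` in block 0 and `xyyx` in
// block 1. The red channel of each pixel encodes its x coordinate.
#[cfg(test)]
mod load_input_block_sketch {
    use super::*;

    #[test]
    fn out_of_bounds_columns_are_mirrored() {
        let (width, height) = (6u32, 4u32);
        let src: Vec<u32> = (0..width * height).map(|i| i % width).collect();
        let data = load_input_block(&src, width, height, width, 0, 0);
        // Row 0, red channel: lane 0 is the fully in-bounds block, while
        // lane 1 has its columns 6 and 7 mirrored back to 5 and 4.
        let row: Vec<[i16; 2]> = (0..4)
            .map(|j| {
                let lanes = data[0][j][0].to_array();
                [lanes[0], lanes[1]]
            })
            .collect();
        assert_eq!(row, vec![[0, 4], [1, 5], [2, 5], [3, 4]]);
    }
}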
/// Compress RGB pixels to ETC1.
///
/// `src` should be in RGBA.
/// `dst` will be filled with compressed ETC1 blocks.
/// `width` and `height` do not need to be multiples of 4; boundary pixels
/// will be padded with unspecified values.
/// `src_row_width` and `dst_row_width` specify the stride, in units of pixels
/// and blocks, respectively.
///
/// Note that `src` is assumed to be an aligned 32-bit buffer while `dst` is
/// not. This is for two reasons: 32-bit alignment is practical to obtain even
/// on 32-bit platforms, whereas 64-bit alignment does not hold on 32-bit ARM.
/// Additionally, loading the input requires extensive shuffling, but the
/// output is stored straight in pixel order, and dealing with unaligned
/// buffers in the latter case is significantly easier.
pub fn compress_etc1(
src: &[u32],
dst: &mut [u8],
width: u32,
height: u32,
src_row_width: u32,
dst_row_width: u32,
) {
let dst_height = height.div_ceil(4);
let dst_width = width.div_ceil(4);
// Aligned staging buffer. Data is copied into the potentially unaligned
// destination buffer at the end of each row.
let mut staging_row = vec![[Simd::splat(0); 4]; (dst_width as usize).div_ceil(SIMD_WIDTH)];
let copy_len = dst_width as usize * 8;
// Note on the vectorization scheme:
//
// We process one 4x4 block per SIMD lane, instead of the more common practice
// of processing pixels within the same block in parallel using multiple
// lanes. The one-block-per-lane scheme, more akin to SPMD programming,
// allows most of our code to be shuffle-free, and works much better with
// portable SIMD than schemes that shuffle heavily.
for dst_y in 0..dst_height {
for dst_x0 in (0..dst_width).step_by(SIMD_WIDTH) {
let data = load_input_block(src, width, height, src_row_width, dst_x0 * 4, dst_y * 4);
let data = dither(&data);
let QuantResult { lo: hdr0, hi: hdr1, scaled0: ep0, scaled1: ep1 } =
quantize_averages(&data);
let best_fit = search_table_and_selectors(hdr0, hdr1, &data, [ep0, ep1]);
let codewords = interleave_etc1(best_fit);
staging_row[dst_x0 as usize / SIMD_WIDTH] = codewords;
}
let dst_row = &mut dst[(dst_y * dst_row_width * 8) as usize..];
let staging_row_bytes = cast_slice(&*staging_row);
dst_row[..copy_len].copy_from_slice(&staging_row_bytes[..copy_len]);
}
}
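// A minimal usage sketch showing the buffer sizing implied by the
// documentation above (one 8-byte block per 4x4 tile). The helper
// `compress_rgba_image` and the tightly packed strides are assumptions made
// for the example, not part of the API.
#[cfg(test)]
mod compress_etc1_sketch {
    use super::*;

    // Compress a tightly packed RGBA image into tightly packed ETC1 blocks.
    fn compress_rgba_image(src: &[u32], width: u32, height: u32) -> Vec<u8> {
        let blocks_x = width.div_ceil(4);
        let blocks_y = height.div_ceil(4);
        let mut dst = vec![0u8; (blocks_x * blocks_y * 8) as usize];
        compress_etc1(src, &mut dst, width, height, width, blocks_x);
        dst
    }

    #[test]
    fn produces_one_block_per_4x4_tile() {
        // A 10x6 image needs ceil(10/4) * ceil(6/4) = 3 * 2 = 6 blocks.
        let src = vec![0xFF00_FF00u32; 10 * 6];
        let dst = compress_rgba_image(&src, 10, 6);
        assert_eq!(dst.len(), 6 * 8);
    }
}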