// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#![feature(portable_simd)]
// Modules public for testing, don't expect stable API.
mod cxx;
pub mod decoder;
pub mod dither;
pub mod quant;
pub mod selectors;
use std::simd::prelude::*;
use std::simd::Simd;
use bytemuck::cast_slice;
use bytemuck::cast_slice_mut;
use crate::decoder::decode_etc1_block;
use crate::dither::dither;
use crate::quant::{quantize_averages, QuantResult};
use crate::selectors::search_table_and_selectors;
// We primarily compute with 16-bit integers and a width of 8 fills a 128-bit
// wide lane (SSE, NEON). TODO(b/393494744): When we introduce multiversioning
// and support for AVX2 etc. this should be converted to a template parameter
// that varies based on the target architecture.
const SIMD_WIDTH: usize = 8;
// Lane counts for registers whose elements are twice / four times as wide as
// the base 16-bit lanes (same 128-bit register, fewer lanes).
const HALF_WIDTH: usize = SIMD_WIDTH / 2;
const QUARTER_WIDTH: usize = SIMD_WIDTH / 4;
// Working register types: signed 16-bit (main arithmetic), signed 32-bit
// (intermediate precision), and unsigned 16-bit (codeword packing).
type Reg = Simd<i16, SIMD_WIDTH>;
type Reg32 = Simd<i32, SIMD_WIDTH>;
type UReg = Simd<u16, SIMD_WIDTH>;
// Size in bytes of one compressed ETC1 block (one 64-bit codeword per 4x4
// pixel block).
const ETC1_BLOCK_BYTES: usize = 8;
/// Define a helper to interleave elements from two vectors, reinterpret
/// it as a type twice as large, and return the resulting vector.
/// Each argument / return value is an array of vectors; conceptually, this
/// represents a vector that is <width> * <len> large; however, since std::simd
/// types have upper limits on their width we represent them using arrays to be
/// portable.
macro_rules! define_interleave {
    ($fn_name:ident, $src_ty:ty, $dst_ty:ty, $src_width:expr, $dst_width:expr, $src_len:literal) => {
        fn $fn_name(
            a: [Simd<$src_ty, $src_width>; $src_len],
            b: [Simd<$src_ty, $src_width>; $src_len],
        ) -> [Simd<$dst_ty, $dst_width>; $src_len * 2] {
            // Interleave lane-wise, then bit-cast each result register to the
            // wider element type so each adjacent pair of $src_ty lanes is
            // reinterpreted as one $dst_ty lane. The cast relies on
            // little-endian lane layout (checked at the call site).
            let mut iter = (0..$src_len).flat_map(|i| {
                let (a, b) = a[i].interleave(b[i]);
                [a, b].map(|x| bytemuck::cast(x))
            });
            let res = std::array::from_fn(|_| iter.next().unwrap());
            // from_fn pulled exactly $src_len * 2 elements; anything left over
            // would mean the width bookkeeping above is wrong.
            assert!(iter.next().is_none());
            res
        }
    };
}
/// Convert individual codewords laid out as [15..0, 31..16, 47..32, 63..48]
/// into interleaved u64 arrays, while flipping the endianness (our internal
/// representation is little endian while ETC1 requires big endian).
///
/// Each input register holds one 16-bit slice of `SIMD_WIDTH` codewords; the
/// output holds the same codewords as whole big-endian u64 values, in block
/// order.
#[inline]
pub fn interleave_etc1(regs: [UReg; 4]) -> [Simd<u64, QUARTER_WIDTH>; 4] {
    // The interleaving assumes little endian.
    #[cfg(target_endian = "big")]
    compile_error!("Big endian is not supported");
    define_interleave!(conv_16_to_32, u16, u32, SIMD_WIDTH, HALF_WIDTH, 1);
    define_interleave!(conv_32_to_64, u32, u64, HALF_WIDTH, QUARTER_WIDTH, 2);
    // Step 1: make each u16 codeword big-endian.
    let regs = regs.map(|r| r.swap_bytes());
    // Step 2: [aaaa, bbbb] to [baba, baba]. The higher-order slice goes first
    // because the final u64 must be big endian.
    let regs = [conv_16_to_32([regs[1]], [regs[0]]), conv_16_to_32([regs[3]], [regs[2]])];
    // Step 3: [baba, baba], [dcdc, dcdc] to [dcba, dcba], [dcba, dcba].
    // Return the expression directly (clippy::let_and_return).
    conv_32_to_64(regs[1], regs[0])
}
/// Load `SIMD_WIDTH` blocks from a region `4*SIMD_WIDTH` wide and `4` tall,
/// starting at `base_x` and `base_y`.
///
/// Out of bounds pixels are padded with mirroring. For example, `abcdxy`
/// becomes `abcdxyyx`.
///
/// Returns a 3D array of SIMD vectors. Each block is mapped to a SIMD lane
/// (from left to right), and each pixel in the block is accessed as
/// `[y][x][channel]`.
#[inline]
pub fn load_input_block(
    src: &[u32],
    width: u32,
    height: u32,
    row_width: u32,
    base_x: u32,
    base_y: u32,
) -> [[[Reg; 3]; 4]; 4] {
    let mut data = [[[Reg::default(); 3]; 4]; 4];
    // For now, input load and output store are not vectorized. The main reason is
    // that efficient loading requires shuffling and is poorly supported
    // by std::simd and the wide crate (which we plan to use for
    // supporting stable toolchain). Input load currently accounts for
    // ~20% of the runtime. If shuffle support improves this would be a
    // good candidate for optimization.
    //
    // (i, j) = (row, column) of the pixel inside each 4x4 block.
    for i in 0..4 {
        for j in 0..4 {
            // One pixel from each of the SIMD_WIDTH horizontally adjacent
            // blocks, gathered into a scalar buffer before the SIMD load.
            let mut buf = [0u32; SIMD_WIDTH];
            for block in 0..SIMD_WIDTH as u32 {
                let x = base_x + block * 4 + j as u32;
                let y = base_y + i as u32;
                buf[block as usize] = if x < width && y < height {
                    // Fast path: load in-bound pixel
                    src[(y * row_width + x) as usize]
                } else {
                    // Slow path: mirror out-of-bound pixels
                    // Mirror index is (width - 1) - (x - width), i.e. reflect
                    // about the last column/row.
                    // If width or height is 1, mirroring can overflow, so make it saturate.
                    let xm = if x >= width { (width - 1).saturating_sub(x - width) } else { x };
                    let ym = if y >= height { (height - 1).saturating_sub(y - height) } else { y };
                    src[(ym * row_width + xm) as usize]
                };
            }
            let rgbx = Simd::from_array(buf);
            // Extract one 8-bit channel from packed RGBA (red in the least
            // significant byte) and widen it to i16 lanes.
            let extract_channel = |x: Simd<u32, SIMD_WIDTH>, shift: u32| {
                (x >> shift).cast::<i16>() & Simd::splat(0xFF)
            };
            data[i][j][0] = extract_channel(rgbx, 0);
            data[i][j][1] = extract_channel(rgbx, 8);
            data[i][j][2] = extract_channel(rgbx, 16);
        }
    }
    data
}
/// Compress RGB pixels to ETC1.
///
/// - `src` should be in RGBA format (the least significant byte is red).
/// - `dst` will be filled with compressed ETC1 blocks.
/// - `src_width` and `src_height` specifies the logical size of the image in
///   pixels. These do not need to be multiples of 4; boundary pixels are
///   padded with unspecified values.
/// - `src_row_width` and `dst_row_width` specifies the in-memory length of each
///   row, in pixels and blocks, respectively.
///
/// Note that `src` takes an aligned 32-bit buffer while `dst` takes a byte
/// buffer, even though each ETC1 codeword is 64-bit. This is due to two
/// reasons:
/// - 32-bit alignment is practical to get even on 32-bit platforms, whereas
///   64-bit values are not aligned to 8 bytes on 32-bit ARM.
/// - We require extensive shuffling when loading inputs, but store to the
///   output straight in the order of blocks. Dealing with unaligned buffers in
///   the latter case is significantly easier.
pub fn compress_etc1(
    src: &[u32],
    dst: &mut [u8],
    src_width: u32,
    src_height: u32,
    src_row_width: u32,
    dst_row_width: u32,
) {
    // Note: We deliberately do not declare the block size (4x4) of ETC1 as a
    // constant. While magic constants in general are discouraged, the
    // block size appears way too frequently; naming it would make the
    // code verbose and less readable.
    let dst_height = src_height.div_ceil(4);
    let dst_width = src_width.div_ceil(4);
    // Aligned staging buffer, one SIMD-batch of codewords per entry. Data is
    // copied into the potentially unaligned destination buffer once per row.
    let mut staging_row = vec![[Simd::splat(0); 4]; (dst_width as usize).div_ceil(SIMD_WIDTH)];
    let copy_len = dst_width as usize * ETC1_BLOCK_BYTES;
    // Note on vectorization scheme:
    //
    // We process one 4x4 block per SIMD lane, instead of the more common practice
    // of processing pixels within the same block in parallel using multiple
    // lanes. The one-block-per-lane scheme, more akin to SPMD programming,
    // allows most of our code to be shuffle-free, and works much better with
    // portable SIMD than schemes that heavily shuffle.
    for dst_y in 0..dst_height {
        // Each staging entry corresponds to SIMD_WIDTH consecutive blocks.
        for (chunk_idx, staged) in staging_row.iter_mut().enumerate() {
            let dst_x0 = (chunk_idx * SIMD_WIDTH) as u32;
            let pixels =
                load_input_block(src, src_width, src_height, src_row_width, dst_x0 * 4, dst_y * 4);
            let pixels = dither(&pixels);
            let QuantResult { lo: hdr0, hi: hdr1, scaled0: ep0, scaled1: ep1 } =
                quantize_averages(&pixels);
            let best_fit = search_table_and_selectors(hdr0, hdr1, &pixels, [ep0, ep1]);
            *staged = interleave_etc1(best_fit);
        }
        // Flush the aligned staging row into the (possibly unaligned) output.
        let row_start = (dst_y * dst_row_width) as usize * ETC1_BLOCK_BYTES;
        let staging_row_bytes: &[u8] = cast_slice(staging_row.as_slice());
        dst[row_start..row_start + copy_len].copy_from_slice(&staging_row_bytes[..copy_len]);
    }
}
/// Decompress ETC1 to RGBA.
///
/// - `src` should contain ETC1 blocks.
/// - `dst` will be filled with RGBA pixels (the least significant byte is
///   red).
/// - `dst_width` and `dst_height` are the dimensions of `dst` in pixels. If
///   they are not multiples of 4, the edge blocks are partial and the decoded
///   pixels that fall out of bounds are discarded.
/// - `src_row_width` is the in-memory row length of the ETC1 image in blocks;
///   `dst_row_width` is the in-memory row length of the RGBA image in pixels.
pub fn decompress_etc1(
    src: &[u8],
    dst: &mut [u32],
    dst_width: u32,
    dst_height: u32,
    src_row_width: u32,
    dst_row_width: u32,
) {
    // We access 'src' as array of u64s, but 'src' is not always aligned to 8-byte
    // because of constraints at the callsite (b/464139989). To solve the
    // alignment issue, we copy the data from `src` into a temporary buffer that
    // is guaranteed to be 8-byte aligned. To balance between copying overhead
    // and memory overhead, we copy one row at a time.
    let mut staging_row_u64 = vec![0u64; src_row_width as usize];
    let bytes_per_row = src_row_width as usize * ETC1_BLOCK_BYTES;
    for y in (0..dst_height).step_by(4) {
        let src_y = (y / 4) as usize;
        let copy_start_idx = src_y * bytes_per_row;
        let copy_end_idx = (src_y + 1) * bytes_per_row;
        let staging_row_bytes: &mut [u8] = cast_slice_mut(&mut staging_row_u64);
        staging_row_bytes[..bytes_per_row].copy_from_slice(&src[copy_start_idx..copy_end_idx]);
        for x in (0..dst_width).step_by(4) {
            // The ETC1 specification ("Khronos Data Format Specification v1.1 rev 9")
            // defines the 64-bit block data as big endian.
            let src_x = (x / 4) as usize;
            // `src_x` is already usize; the former `src_x as usize` cast was
            // redundant (clippy::unnecessary_cast).
            let output_rgba_block = decode_etc1_block(u64::from_be(staging_row_u64[src_x]));
            // Scatter the decoded 4x4 block, clipping partial edge blocks.
            for y_in_block in 0..4 {
                for x_in_block in 0..4 {
                    let dst_x = x + x_in_block;
                    let dst_y = y + y_in_block;
                    if dst_y < dst_height && dst_x < dst_width {
                        let dst_idx = dst_y * dst_row_width + dst_x;
                        dst[dst_idx as usize] =
                            output_rgba_block[y_in_block as usize][x_in_block as usize];
                    }
                }
            }
        }
    }
}