1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
|
//===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Lempel–Ziv–Welch encoding/decoding
//
//===----------------------------------------------------------------------===//
#ifndef SANITIZER_LZW_H
#define SANITIZER_LZW_H
#include "sanitizer_dense_map.h"
namespace __sanitizer {
using LzwCodeType = u32;
template <class T, class ItIn, class ItOut>
ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
using Substring =
detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;
// Sentinel value for substrings of len 1.
static constexpr LzwCodeType kNoPrefix =
Min(DenseMapInfo<Substring>::getEmptyKey().first,
DenseMapInfo<Substring>::getTombstoneKey().first) -
1;
DenseMap<Substring, LzwCodeType> prefix_to_code;
{
// Add all substring of len 1 as initial dictionary.
InternalMmapVector<T> dict_len1;
for (auto it = begin; it != end; ++it)
if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
dict_len1.push_back(*it);
// Slightly helps with later delta encoding.
Sort(dict_len1.data(), dict_len1.size());
// For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
// just generate them.
*out = dict_len1.size();
++out;
for (uptr i = 0; i != dict_len1.size(); ++i) {
// Remap after the Sort.
prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
*out = dict_len1[i];
++out;
}
CHECK_EQ(prefix_to_code.size(), dict_len1.size());
}
if (begin == end)
return out;
// Main LZW encoding loop.
LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
++begin;
for (auto it = begin; it != end; ++it) {
// Extend match with the new item.
auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
if (ins.second) {
// This is a new substring, but emit the code for the current match
// (before extend). This allows LZW decoder to recover the dictionary.
*out = match;
++out;
// Reset the match to a single item, which must be already in the map.
match = prefix_to_code.find({kNoPrefix, *it})->second;
} else {
// Already known, use as the current match.
match = ins.first->second;
}
}
*out = match;
++out;
return out;
}
template <class T, class ItIn, class ItOut>
ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
if (begin == end)
return out;
// Load dictionary of len 1 substrings. Theses correspont to lowest codes.
InternalMmapVector<T> dict_len1(*begin);
++begin;
if (begin == end)
return out;
for (auto& v : dict_len1) {
v = *begin;
++begin;
}
// Substrings of len 2 and up. Indexes are shifted because [0,
// dict_len1.size()) stored in dict_len1. Substings get here after being
// emitted to the output, so we can use output position.
InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
code_to_substr;
// Copies already emitted substrings into the output again.
auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
if (code < dict_len1.size()) {
*out = dict_len1[code];
++out;
return out;
}
const auto& s = code_to_substr[code - dict_len1.size()];
for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
return out;
};
// Returns lens of the substring with the given code.
auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
if (code < dict_len1.size())
return 1;
const auto& s = code_to_substr[code - dict_len1.size()];
return s.second - s.first;
};
// Main LZW decoding loop.
LzwCodeType prev_code = *begin;
++begin;
out = copy(prev_code, out);
for (auto it = begin; it != end; ++it) {
LzwCodeType code = *it;
auto start = out;
if (code == dict_len1.size() + code_to_substr.size()) {
// Special LZW case. The code is not in the dictionary yet. This is
// possible only when the new substring is the same as previous one plus
// the first item of the previous substring. We can emit that in two
// steps.
out = copy(prev_code, out);
*out = *start;
++out;
} else {
out = copy(code, out);
}
// Every time encoded emits the code, it also creates substing of len + 1
// including the first item of the just emmited substring. Do the same here.
uptr len = code_to_len(prev_code);
code_to_substr.push_back({start - len, start + 1});
prev_code = code;
}
return out;
}
} // namespace __sanitizer
#endif
|