File: ldm.h

package info (click to toggle)
tarantool 2.6.0-1.4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 85,412 kB
  • sloc: ansic: 513,775; cpp: 69,493; sh: 25,650; python: 19,190; perl: 14,973; makefile: 4,178; yacc: 1,329; sql: 1,074; pascal: 620; ruby: 190; awk: 18; lisp: 7
file content (197 lines) | stat: -rw-r--r-- 6,331 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#ifndef LDM_H
#define LDM_H

#include "mem.h"    // from /lib/common/mem.h

//#include "ldm_params.h"

// =============================================================================
// Modify the parameters in ldm_params.h if "ldm_params.h" is included.
// Otherwise, modify the parameters here.
// =============================================================================

#ifndef LDM_PARAMS_H
  // Defines the size of the hash table.
  // Note that this is not the number of buckets.
  // Currently this should be less than WINDOW_SIZE_LOG + 4.
  #define LDM_MEMORY_USAGE 23

  // The number of entries in a hash bucket.
  #define HASH_BUCKET_SIZE_LOG 3 // The maximum is 4 for now.

  // Defines the lag in inserting elements into the hash table.
  #define LDM_LAG 0

  // The maximum window size when searching for matches.
  // The maximum value is 30
  #define LDM_WINDOW_SIZE_LOG 28

  // The minimum match length.
  // This should be a multiple of four.
  #define LDM_MIN_MATCH_LENGTH 64

  // If INSERT_BY_TAG, insert entries into the hash table as a function of the
  // hash. Certain hashes will not be inserted.
  //
  // Otherwise, insert as a function of the position.
  #define INSERT_BY_TAG 1

  // Store a checksum with the hash table entries for faster comparison.
  // This halves the number of entries the hash table can contain.
  #define USE_CHECKSUM 1
#endif

// Output compression statistics.
#define COMPUTE_STATS

// Output the configuration.
#define OUTPUT_CONFIGURATION

// If defined, forces the probability of insertion to be approximately
// one per (1 << HASH_ONLY_EVERY_LOG). If not defined, the probability will be
// calculated based on the memory usage and window size for "even" insertion
// throughout the window.

// #define HASH_ONLY_EVERY_LOG 8

// =============================================================================

// The number of bytes storing the compressed and decompressed size
// in the header.
#define LDM_COMPRESSED_SIZE 8
#define LDM_DECOMPRESSED_SIZE 8
#define LDM_HEADER_SIZE ((LDM_COMPRESSED_SIZE)+(LDM_DECOMPRESSED_SIZE))

#define ML_BITS 4
#define ML_MASK ((1U<<ML_BITS)-1)
#define RUN_BITS (8-ML_BITS)
#define RUN_MASK ((1U<<RUN_BITS)-1)

// The number of bytes storing the offset.
#define LDM_OFFSET_SIZE 4

#define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG))

// TODO: Match lengths that are too small do not use the hash table efficiently.
// There should be a minimum hash length given the hash table size.
#define LDM_HASH_LENGTH LDM_MIN_MATCH_LENGTH

typedef struct LDM_compressStats LDM_compressStats;
typedef struct LDM_CCtx LDM_CCtx;
typedef struct LDM_DCtx LDM_DCtx;

/**
 *  Compresses src into dst.
 *  Returns the compressed size if successful, 0 otherwise.
 *
 *  NB: This currently ignores maxDstSize and assumes enough space is available.
 *
 *  Block format (see lz4 documentation for more information):
 *  github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md
 *
 *  A block is composed of sequences. Each sequence begins with a token, which
 *  is a one-byte value separated into two 4-bit fields.
 *
 *  The first field uses the four high bits of the token and encodes the literal
 *  length. If the field value is 0, there is no literal. If it is 15,
 *  additional bytes are added (each ranging from 0 to 255) to the previous
 *  value to produce a total length.
 *
 *  Following the token and optional length bytes are the literals.
 *
 *  Next are the 4 bytes representing the offset of the match (2 in lz4),
 *  representing the position to copy the literals.
 *
 *  The lower four bits of the token encode the match length. With additional
 *  bytes added similarly to the additional literal length bytes after the offset.
 *
 *  The last sequence is incomplete and stops right after the literals.
 */
size_t LDM_compress(const void *src, size_t srcSize,
                    void *dst, size_t maxDstSize);

/**
 * Initialize the compression context.
 *
 * Allocates memory for the hash table.
 *
 * Returns 0 if successful, 1 otherwise.
 */
size_t LDM_initializeCCtx(LDM_CCtx *cctx,
                          const void *src, size_t srcSize,
                          void *dst, size_t maxDstSize);

/**
 * Frees up memory allocated in LDM_initializeCCtx().
 */
void LDM_destroyCCtx(LDM_CCtx *cctx);

/**
 * Prints the distribution of offsets in the hash table.
 *
 * The offsets are defined as the distance of the hash table entry from the
 * current input position of the cctx.
 */
void LDM_outputHashTableOffsetHistogram(const LDM_CCtx *cctx);

/**
 * Outputs compression statistics to stdout.
 */
void LDM_printCompressStats(const LDM_compressStats *stats);

/**
 * Encode the literal length followed by the literals.
 *
 * The literal length is written to the upper four bits of pToken, with
 * additional bytes written to the output as needed (see lz4).
 *
 * This is followed by literalLength bytes corresponding to the literals.
 */
void LDM_encodeLiteralLengthAndLiterals(LDM_CCtx *cctx, BYTE *pToken,
                                        const U64 literalLength);

/**
 * Write current block (literals, literal length, match offset,
 * match length).
 */
void LDM_outputBlock(LDM_CCtx *cctx,
                     const U64 literalLength,
                     const U32 offset,
                     const U64 matchLength);

/**
 * Decompresses src into dst.
 *
 * Note: assumes src does not have a header.
 */
size_t LDM_decompress(const void *src, size_t srcSize,
                      void *dst, size_t maxDstSize);

/**
 * Initialize the decompression context.
 */
void LDM_initializeDCtx(LDM_DCtx *dctx,
                        const void *src, size_t compressedSize,
                        void *dst, size_t maxDecompressedSize);

/**
 * Reads the header from src and writes the compressed size and
 * decompressed size into compressedSize and decompressedSize respectively.
 *
 * NB: LDM_compress and LDM_decompress currently do not add/read headers.
 */
void LDM_readHeader(const void *src, U64 *compressedSize,
                    U64 *decompressedSize);

/**
 * Write the compressed and decompressed size.
 */
void LDM_writeHeader(void *memPtr, U64 compressedSize,
                     U64 decompressedSize);

/**
 * Output the configuration used.
 */
void LDM_outputConfiguration(void);

#endif /* LDM_H */