1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
|
package bzip2
// https://en.wikipedia.org/wiki/Bzip2
// https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf
// TODO: multiple streams, possible to figure out length of compressed? use footer magic?
// TODO: empty file, no streams
import (
"compress/bzip2"
"encoding/binary"
"hash/crc32"
"io"
"math/bits"
"github.com/wader/fq/format"
"github.com/wader/fq/pkg/bitio"
"github.com/wader/fq/pkg/decode"
"github.com/wader/fq/pkg/interp"
"github.com/wader/fq/pkg/scalar"
)
var probeGroup decode.Group
func init() {
interp.RegisterFormat(
format.Bzip2,
&decode.Format{
Description: "bzip2 compression",
Groups: []*decode.Group{format.Probe},
DecodeFn: bzip2Decode,
Dependencies: []decode.Dependency{
{Groups: []*decode.Group{format.Probe}, Out: &probeGroup},
},
})
}
const blockMagic = 0x31_41_59_26_53_59
const footerMagic = 0x17_72_45_38_50_90
type bitFlipReader struct {
r io.Reader
}
func (bfr bitFlipReader) Read(p []byte) (n int, err error) {
n, err = bfr.r.Read(p)
for i := 0; i < n; i++ {
p[i] = bits.Reverse8(p[i])
}
return n, err
}
func bzip2Decode(d *decode.D) any {
// moreStreams := true
// d.FieldArray("streams", func(d *decode.D) {
// for moreStreams {
// d.FieldStruct("stream", func(d *decode.D) {
var blockCRCValue *decode.Value
var streamCRCN uint32
d.FieldUTF8("magic", 2, d.StrAssert("BZ"))
d.FieldU8("version")
d.FieldU8("hundred_k_blocksize")
d.FieldStruct("block", func(d *decode.D) {
// if d.PeekBits(48) != blockHeaderMagic {
// moreStreams = false
// return
// }
d.FieldU48("magic", d.UintAssert(blockMagic), scalar.UintHex)
d.FieldU32("crc", scalar.UintHex)
blockCRCValue = d.FieldGet("crc")
d.FieldU1("randomised")
d.FieldU24("origptr")
d.FieldU16("syncmapl1")
d.SeekRel(-16)
ranges := 0
for i := 0; i < 16; i++ {
if d.Bool() {
ranges++
}
}
d.FieldRawLen("syncmapl2", int64(ranges)*16)
numTrees := d.FieldU3("num_trees")
selectorsUsed := d.FieldU15("num_sels")
selectorsI := uint64(0)
d.FieldArrayLoop("selector_list", func() bool { return selectorsI < selectorsUsed }, func(d *decode.D) {
d.FieldU1("selector")
selectorsI++
})
treesI := uint64(0)
d.FieldArrayLoop("trees", func() bool { return treesI < numTrees }, func(d *decode.D) {
d.FieldUintFn("tree", func(d *decode.D) uint64 {
l := d.U5()
if !d.Bool() {
return l
}
if d.Bool() {
l--
} else {
l++
}
return l
})
treesI++
})
})
compressedStart := d.Pos()
readCompressedSize, uncompressedBR, dv, _, _ :=
d.TryFieldReaderRangeFormat("uncompressed", 0, d.Len(), bzip2.NewReader, &probeGroup, format.Probe_In{})
if uncompressedBR != nil {
if dv == nil {
d.FieldRootBitBuf("uncompressed", uncompressedBR)
}
blockCRC32W := crc32.NewIEEE()
d.Copy(blockCRC32W, bitFlipReader{bitio.NewIOReader(uncompressedBR)})
blockCRC32N := bits.Reverse32(binary.BigEndian.Uint32(blockCRC32W.Sum(nil)))
_ = blockCRCValue.TryUintScalarFn(d.UintValidate(uint64(blockCRC32N)))
streamCRCN = blockCRC32N ^ ((streamCRCN << 1) | (streamCRCN >> 31))
// HACK: bzip2.NewReader will read from start of whole buffer and then we figure out compressedSize ourself
// "It is important to note that none of the fields within a StreamBlock or StreamFooter are necessarily byte-aligned"
const footerByteSize = 10
compressedSize := (readCompressedSize - compressedStart) - footerByteSize*8
for i := 0; i < 8; i++ {
d.SeekAbs(compressedStart + compressedSize)
if d.PeekUintBits(48) == footerMagic {
break
}
compressedSize--
}
d.SeekAbs(compressedStart)
d.FieldRawLen("compressed", compressedSize)
d.FieldStruct("footer", func(d *decode.D) {
d.FieldU48("magic", d.UintAssert(footerMagic), scalar.UintHex)
// TODO: crc of block crcs
d.FieldU32("crc", scalar.UintHex, d.UintValidate(uint64(streamCRCN)))
d.FieldRawLen("padding", int64(d.ByteAlignBits()))
})
}
// moreStreams = false
// }
// })
return nil
}
|