File: compress.py

package info (click to toggle)
python-bitarray 3.6.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,288 kB
  • sloc: python: 11,456; ansic: 7,657; makefile: 73; sh: 6
file content (107 lines) | stat: -rw-r--r-- 3,043 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
This program demonstrates how Huffman codes can be used to efficiently
compress and uncompress files (text or binary).
"""
import os
import struct
from itertools import islice
from optparse import OptionParser
from collections import Counter

from bitarray import bitarray
from bitarray.util import (serialize, deserialize,
                           vl_encode, vl_decode, huffman_code)

def encode_code(code):
    """
    Serialize the symbol -> bitarray mapping `code` into a bytearray.

    The result starts with the number of symbols packed as a little-endian
    unsigned 16-bit integer.  Then, for each symbol in ascending order,
    the symbol itself is appended as a single byte, followed by the
    output of vl_encode() for the bitarray assigned to that symbol.
    """
    table = bytearray(struct.pack("<H", len(code)))
    for symbol in sorted(code):
        bits = code[symbol]
        table.append(symbol)
        table.extend(vl_encode(bits))
    return table

def decode_code(stream):
    """
    Read a code table (as written by encode_code()) from `stream`.

    `stream` is an iterator over byte values.  The first two items are
    interpreted as a little-endian unsigned 16-bit symbol count.  Each
    entry that follows is one item (the symbol) and then whatever
    vl_decode() consumes from the stream for that symbol's bitarray.
    Items after the table are left in the iterator for the caller.

    Returns a dict mapping each symbol to its decoded value.
    """
    header = bytes(islice(stream, 2))
    (count,) = struct.unpack("<H", header)
    table = {}
    for _ in range(count):
        symbol = next(stream)
        table[symbol] = vl_decode(stream)
    return table

def create_code(cnt):
    """
    Return the code used to encode data with symbol frequencies `cnt`.

    For a non-empty frequency map the result of huffman_code(cnt) is
    returned.  An empty map (empty input file) has no symbols to build a
    code from, so a fixed one-entry code for symbol 0 is returned instead.
    """
    if len(cnt) == 0:
        # special case for empty file
        return {0: bitarray('0')}
    return huffman_code(cnt)

def encode(filename):
    """
    Compress the file `filename` and write the result to `filename`.huff.

    The output consists of the serialized code table (see encode_code())
    followed by the serialized bitarray holding the encoded file content.
    For non-empty input, the bit counts and a size ratio are printed.
    """
    with open(filename, 'rb') as src:
        plain = src.read()

    code = create_code(Counter(plain))
    with open(filename + '.huff', 'wb') as dst:
        # code table first, so decode() can rebuild it before the payload
        dst.write(encode_code(code))
        bits = bitarray(endian='little')
        bits.encode(code, plain)
        dst.write(serialize(bits))

    n_plain = len(plain)
    if n_plain == 0:
        # nothing was encoded, so there are no statistics to report
        assert len(bits) == 0
        return

    print('Bits: %d / %d' % (len(bits), 8 * n_plain))
    # buffer_info()[1] is presumably the buffer size in bytes -- see the
    # bitarray documentation; the ratio does not include the code table
    n_encoded = bits.buffer_info()[1]
    print('Ratio =%6.2f%%' % (100.0 * n_encoded / n_plain))

def decode(filename):
    """
    Decompress `filename` (which must end with '.huff') and write the
    result next to it, with the '.huff' suffix replaced by '.out'.

    The file is expected to hold a code table readable by decode_code(),
    directly followed by a serialized bitarray with the encoded content.
    """
    assert filename.endswith('.huff')

    with open(filename, 'rb') as src:
        raw = src.read()

    # one shared iterator: decode_code() consumes the table, and whatever
    # remains afterwards is the serialized bitarray
    stream = iter(raw)
    code = decode_code(stream)
    payload = bytes(stream)
    bits = deserialize(payload)

    target = filename[:-5] + '.out'
    with open(target, 'wb') as dst:
        dst.write(bytearray(bits.decode(code)))

def main():
    """
    Command-line entry point.

    Parses the options and exactly one FILE argument, then runs the first
    of the following that was requested:

      -e / --encode   compress FILE to FILE.huff
      -d / --decode   decompress FILE.huff to FILE.out
      -t / --test     encode, decode, compare FILE with FILE.out, and
                      remove the created files

    A usage error (wrong number of arguments, or no option given) is
    reported through OptionParser.error(), which exits the program.
    In test mode an AssertionError is raised when the round trip does
    not reproduce the original file; the created files are then kept.
    """
    p = OptionParser("usage: %prog [options] FILE")
    p.add_option(
        '-e', '--encode',
        action="store_true",
        help="encode (compress) FILE using the Huffman code calculated for "
             "the frequency of characters in FILE itself. "
             "The output is FILE.huff which contains both the Huffman "
             "code and the bitarray resulting from the encoding.")
    p.add_option(
        '-d', '--decode',
        action="store_true",
        help="decode (decompress) FILE.huff and write the output to FILE.out")
    p.add_option(
        '-t', '--test',
        action="store_true",
        help="encode FILE, decode FILE.huff, compare FILE with FILE.out, "
             "and unlink created files.")
    opts, args = p.parse_args()
    if len(args) != 1:
        p.error('exactly one argument required')
    filename = args[0]

    if opts.encode:
        encode(filename)

    elif opts.decode:
        decode(filename + '.huff')

    elif opts.test:
        huff = filename + '.huff'
        out = filename + '.out'
        encode(filename)
        decode(huff)
        # Close both files deterministically before unlinking; relying on
        # garbage collection leaves handles open on some interpreters.
        with open(filename, 'rb') as fi:
            original = fi.read()
        with open(out, 'rb') as fi:
            restored = fi.read()
        # Raise explicitly (rather than using an assert statement) so the
        # round-trip check is not skipped when running with "python -O".
        if original != restored:
            raise AssertionError("%s differs from %s" % (out, filename))
        os.unlink(huff)
        os.unlink(out)

    else:
        p.error("no option provided")


# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()