File: StringTable.java

package info (click to toggle)
osmpbf 1.6.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 420 kB
  • sloc: java: 1,400; cpp: 244; xml: 186; sh: 67; makefile: 20; ansic: 11
file content (133 lines) | stat: -rw-r--r-- 5,310 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/** Copyright (c) 2010 Scott A. Crosby. <scott@sacrosby.com>

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation, either version 3 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

*/

package crosby.binary;

import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;

import com.google.protobuf.ByteString;

/**
 * Class for mapping a set of strings to integers, giving frequently occurring
 * strings small integers.
 */
public class StringTable {
    public StringTable() {
        clear();
    }

    private HashMap<String, Integer> counts;
    private HashMap<String, Integer> stringmap;
    private String[] set;

    /**
     * Increments the count of the given string
     * @param s the string
     */
    public void incr(String s) {
        counts.merge(s, 1, Integer::sum);
    }

    /** After the stringtable has been built, return the offset of a string in it.
     *
     * Note, value '0' is reserved for use as a delimiter and will not be returned.
     * @param s the string to lookup
     * @return the offset of the string
     */
    public int getIndex(String s) {
        return stringmap.get(s);
    }

    public void finish() {
        Comparator<String> comparator = (s1, s2) -> counts.get(s2) - counts.get(s1);

        /* Sort the stringtable */

        /*
        When a string is referenced, strings in the stringtable with indices:
               0                : Is reserved (used as a delimiter in tags
         A:  1 to 127          : Uses can be represented with 1 byte
         B: 128 to 128**2-1 : Uses can be represented with 2 bytes,
         C: 128*128  to X    : Uses can be represented with 3 bytes in the unlikely case we have >16k strings in a block. No block will contain enough strings that we'll need 4 bytes.

        There are goals that will improve compression:
          1. I want to use 1 bytes for the most frequently occurring strings, then 2 bytes, then 3 bytes.
          2. I want to use low integers as frequently as possible (for better
             entropy encoding out of deflate)
          3. I want the stringtable to compress as small as possible.

        Condition 1 is obvious. Condition 2 makes deflate compress stringtable references more effectively.
        When compressing entities, delta coding causes small positive integers to occur more frequently
        than larger integers. Even though a stringtable references to indices of 1 and 127 both use one
        byte in a decompressed file, the small integer bias causes deflate to use fewer bits to represent
        the smaller index when compressed. Condition 3 is most effective when adjacent strings in the
        stringtable have a lot of common substrings.

        So, when I decide on the master stringtable to use, I put the 127 most frequently occurring
        strings into A (accomplishing goal 1), and sort them by frequency (to accomplish goal 2), but
        for B and C, which contain the less progressively less frequently encountered strings, I sort
        them lexicographically, to maximize goal 3 and ignoring goal 2.

        Goal 1 is the most important. Goal 2 helped enough to be worth it, and goal 3 was pretty minor,
        but all should be re-benchmarked.


        */



        set = counts.keySet().toArray(new String[0]);
        if (set.length > 0) {
          // Sort based on the frequency.
          Arrays.sort(set, comparator);
          // Each group of keys that serializes to the same number of bytes is
          // sorted lexicographically.
          // to maximize deflate compression.

          // Don't sort the first array. There's not likely to be much benefit, and we want frequent values to be small.
          //Arrays.sort(set, Math.min(0, set.length-1), Math.min(1 << 7, set.length-1));

          Arrays.sort(set, Math.min(1 << 7, set.length-1), Math.min(1 << 14,
              set.length-1));
          Arrays.sort(set, Math.min(1 << 14, set.length-1), Math.min(1 << 21,
              set.length-1), comparator);
        }
        stringmap = new HashMap<>(2 * set.length);
        for (int i = 0; i < set.length; i++) {
            stringmap.put(set[i], i + 1); // Index 0 is reserved for use as a delimiter.
        }
        counts = null;
    }

    public void clear() {
        counts = new HashMap<>(100);
        stringmap = null;
        set = null;
    }

    public Osmformat.StringTable.Builder serialize() {
        Osmformat.StringTable.Builder builder = Osmformat.StringTable
                .newBuilder();
        builder.addS(ByteString.copyFromUtf8("")); // Add a unused string at offset 0 which is used as a delimiter.
        for (String s : set) {
            builder.addS(ByteString.copyFromUtf8(s));
        }
        return builder;
    }
}