1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
|
#!/usr/bin/env python3
#
# Copyright 2016 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Generate a dictionary for libFuzzer or AFL-based fuzzer.
Invoked manually using a fuzzer binary and target format/protocol specification.
It works best for text formats or protocols; for binary ones it may be useless.
"""
import argparse
import html
import logging
import os
import re
import shutil
import subprocess
import sys
import tempfile

# This is a Python 2-only import despite the file using a Python 3 shebang. This
# implies that this file has been unused for years and has not been properly
# converted to Python 3.
# The import is guarded so that the module can still be loaded under Python 3,
# where no module of this name exists.
try:
  import HTMLParser # pylint: disable=import-error
except ImportError:
  HTMLParser = None
# Encodings tried, one after another, when extracting strings from the binary.
ENCODING_TYPES = ['ascii', 'utf_16_be', 'utf_16_le', 'utf_32_be', 'utf_32_le']
# Default minimal length of a string extracted from the binary.
MIN_STRING_LENGTH = 4
def DecodeHTML(html_data):
  """Decode HTML character references and reduce the result to ASCII.

  Args:
    html_data: Contents of an HTML document, as text or as bytes.

  Returns:
    The data as text, with character references (e.g. '&lt;') replaced by the
    characters they stand for. Every non-ASCII character, whether present in
    the input or produced by a character reference, is dropped.
  """
  if isinstance(html_data, bytes):
    text = html_data.decode('ascii', 'ignore')
  else:
    # Text has no decode(); drop the non-ASCII characters explicitly instead.
    text = html_data.encode('ascii', 'ignore').decode('ascii')
  # html.unescape() is the Python 3 replacement for HTMLParser's unescape().
  unescaped = html.unescape(text)
  return unescaped.encode('ascii', 'ignore').decode('ascii')
def EscapeDictionaryElement(element):
  """Escape all unprintable and control characters in an element.

  Args:
    element: A dictionary word, as text or as bytes. Text is encoded as UTF-8
      before escaping.

  Returns:
    The element as printable ASCII text: backslash and double quote are
    backslash-escaped, every byte outside the printable ASCII range is written
    as a \\xNN hex escape, and everything else is kept as is.
  """
  # The escaping is spelled out because the 'string_escape' codec exists in
  # Python 2 only.
  raw = element if isinstance(element, bytes) else element.encode('utf-8')
  pieces = []
  for byte in bytearray(raw):
    char = chr(byte)
    if char in '\\"':
      pieces.append('\\' + char)
    elif 0x20 <= byte < 0x7F:
      # Printable ASCII. The single quote deliberately stays unescaped because
      # escaping it breaks libFuzzer.
      pieces.append(char)
    else:
      pieces.append('\\x%02x' % byte)
  return ''.join(pieces)
def ExtractWordsFromBinary(filepath, min_length=MIN_STRING_LENGTH):
  """Extract words (split strings) from a binary executable file.

  Args:
    filepath: Path to the binary executable.
    min_length: Minimal length of a run of printable characters for it to be
      considered a string.

  Returns:
    A set of words (as text) found in the data returned by
    PreprocessAndReadRodata().
  """
  rodata = PreprocessAndReadRodata(filepath)
  words = set()
  strings_re = re.compile(r'[^\x00-\x1F\x7F-\xFF]{%d,}' % min_length)

  # Use different encodings for strings extraction.
  for encoding in ENCODING_TYPES:
    decoded = rodata.decode(encoding, 'ignore')
    # Reduce to ASCII but keep working on text: applying a text pattern to
    # bytes raises TypeError on Python 3.
    text = decoded.encode('ascii', 'ignore').decode('ascii')
    for raw_string in strings_re.findall(text):
      words.update(raw_string.split())
  return words
def ExtractWordsFromLines(lines):
  """Collect the distinct whitespace-separated words of all given strings."""
  return {word for line in lines for word in line.split()}
def ExtractWordsFromSpec(filepath, is_html):
  """Return the set of whitespace-separated words of a specification."""
  return set(ReadSpecification(filepath, is_html).split())
def FindIndentedText(text):
  """Find space-indented text blocks, e.g. code or data samples in RFCs.

  A block starts at a line with more leading whitespace than the previous
  non-empty line and grows while the following non-empty lines have the same
  amount. It is saved when a line with less leading whitespace follows or when
  the text ends. An even deeper line starts a new block, replacing the one in
  progress.

  Args:
    text: Text to scan.

  Returns:
    A list of blocks in order of appearance. Each block is a single string: its
    lines joined with a newline, with their leading whitespace removed.
  """
  indented_blocks = []
  current_block = ''
  previous_number_of_spaces = 0

  # Go through every line and concatenate space-indented blocks into lines.
  for line in text.split('\n'):
    if not line:
      # Ignore empty lines.
      continue

    # Space-indented text blocks have more leading spaces than regular text.
    n = FindNumberOfLeadingSpaces(line)
    if n > previous_number_of_spaces:
      # Beginning of a space-indented text block, start concatenation.
      current_block = line[n:]
    elif n == previous_number_of_spaces and current_block:
      # Or continuation of a space-indented text block, concatenate lines.
      current_block += '\n' + line[n:]
    elif n < previous_number_of_spaces and current_block:
      # Current line is not indented, save previously concatenated lines.
      indented_blocks.append(current_block)
      current_block = ''
    previous_number_of_spaces = n

  # A block that runs to the end of the text has no less-indented line after
  # it, so it has to be saved here.
  if current_block:
    indented_blocks.append(current_block)
  return indented_blocks


def FindNumberOfLeadingSpaces(line):
  """Calculate number of leading whitespace characters in the string."""
  return len(line) - len(line.lstrip())
def GenerateDictionary(path_to_binary, path_to_spec, strategy, is_html=False):
  """Build the set of dictionary words for a fuzzer binary and its spec.

  Args:
    path_to_binary: Path to the fuzzer binary.
    path_to_spec: Path to the specification of the target format/protocol.
    strategy: String of strategy letters; 'i', 'q' and 'u' are recognized and
      their results are combined.
    is_html: Whether the specification has to be HTML-decoded when read.

  Returns:
    A set of words. The process exits with status 1 if either path is missing.
  """
  for candidate in (path_to_binary, path_to_spec):
    if os.path.exists(candidate):
      continue
    logging.error("%s doesn't exist. Exit.", candidate)
    sys.exit(1)

  binary_words = ExtractWordsFromBinary(path_to_binary)
  spec_words = ExtractWordsFromSpec(path_to_spec, is_html)

  result = set()

  # Strategy i: only words which are common for binary and for specification.
  if 'i' in strategy:
    result = binary_words & spec_words

  # Strategy q: add words from all quoted strings from specification.
  # TODO(mmoroz): experimental and very noisy. Not recommended to use.
  if 'q' in strategy:
    indented_text = FindIndentedText(ReadSpecification(path_to_spec, is_html))
    result = result | ExtractWordsFromLines(indented_text)

  # Strategy u: add all uppercase words from specification.
  if 'u' in strategy:
    result = result | {word for word in spec_words if word.isupper()}

  return result
def _ToolFailed(command, stderr=None):
  """Run an external tool and tell whether it failed or could not be run."""
  try:
    return subprocess.call(command, stderr=stderr) != 0
  except OSError as error:
    # E.g. the tool is not installed. Callers fall back to unprocessed data.
    logging.warning('Failed to run %s: %s', command[0], error)
    return True


def PreprocessAndReadRodata(filepath):
  """Create a stripped copy of the binary and extract .rodata section.

  Both steps are best effort. If stripping fails, the original file is used as
  the input of the extraction; if the extraction fails, the whole (stripped or
  original) file is read instead.

  Args:
    filepath: Path to the binary executable.

  Returns:
    The contents of the extracted section, or of the fallback file, as bytes.
  """
  # Work inside a private temporary directory rather than on open temporary
  # file handles: the external tools are free to replace or remove the paths
  # they are given, and removing the whole directory copes with either. The
  # context manager also cleans up when an exception is raised.
  with tempfile.TemporaryDirectory(prefix='.dictgen_') as work_dir:
    stripped_filepath = os.path.join(work_dir, 'stripped')
    shutil.copyfile(filepath, stripped_filepath)

    # Strip all symbols to reduce amount of redundant strings.
    if _ToolFailed(['strip', '--strip-all', stripped_filepath]):
      logging.warning('Failed to strip the binary. Using the original version.')
      stripped_filepath = filepath

    # Extract .rodata section to reduce amount of redundant strings.
    rodata_filepath = os.path.join(work_dir, 'rodata')
    objcopy_cmd = [
        'objcopy', '-j', '.rodata', stripped_filepath, rodata_filepath
    ]
    # Hide output from stderr since objcopy prints a warning.
    failed = _ToolFailed(objcopy_cmd, stderr=subprocess.DEVNULL)
    if failed or not os.path.exists(rodata_filepath):
      logging.warning('Failed to extract .rodata section. Using the whole file.')
      rodata_filepath = stripped_filepath

    # Binary mode: the data is arbitrary bytes, not text in some encoding.
    with open(rodata_filepath, 'rb') as file_handle:
      return file_handle.read()
def ReadSpecification(filepath, is_html):
  """Read a specification file and return its contents.

  Args:
    filepath: Path to the specification.
    is_html: Whether to pass the contents through DecodeHTML().

  Returns:
    The contents as text. The file is decoded as UTF-8 and bytes that are not
    valid UTF-8 are dropped.
  """
  # Decode explicitly: the default text encoding depends on the environment and
  # raises on bytes it cannot decode.
  with open(filepath, 'r', encoding='utf-8', errors='ignore') as file_handle:
    data = file_handle.read()
  return DecodeHTML(data) if is_html else data
def WriteDictionary(dictionary_path, dictionary):
  """Write given dictionary to a file.

  The file starts with a comment line, followed by one double-quoted, escaped
  word per line. Empty words are skipped.

  Args:
    dictionary_path: Path of the file to write; an existing file is replaced.
    dictionary: Iterable of words.
  """
  # Text mode: the lines written are text, which a file opened as 'wb' rejects
  # on Python 3.
  with open(dictionary_path, 'w') as file_handle:
    file_handle.write('# This is an automatically generated dictionary.\n')
    # Sorted so that the same set of words always produces the same file.
    for word in sorted(word for word in dictionary if word):
      file_handle.write('"%s"\n' % EscapeDictionaryElement(word))
def main():
  """Parse the command line, generate the dictionary and write it to a file."""
  parser = argparse.ArgumentParser(description='Generate fuzzer dictionary.')
  parser.add_argument('--fuzzer',
                      required=True,
                      help='Path to a fuzzer binary executable. It is '
                      'recommended to use a binary built with '
                      '"use_libfuzzer=false is_asan=false" to get a better '
                      'dictionary with fewer number of redundant elements.')
  parser.add_argument('--spec',
                      required=True,
                      help='Path to a target specification (in textual form).')
  # Parsed as an integer: left as a string, "--html 0" would be the non-empty
  # string '0', which is truthy and would switch the decoding on.
  parser.add_argument('--html',
                      default=0,
                      type=int,
                      choices=(0, 1),
                      help='Decode HTML [01] (0 is default value): '
                      '1 - if specification has HTML entities to be decoded.')
  parser.add_argument('--out',
                      required=True,
                      help='Path to a file to write a dictionary into.')
  parser.add_argument('--strategy',
                      default='iu',
                      help='Generation strategy [iqu] ("iu" is default value): '
                      'i - intersection, q - quoted, u - uppercase.')
  args = parser.parse_args()

  dictionary = GenerateDictionary(args.fuzzer,
                                  args.spec,
                                  args.strategy,
                                  is_html=bool(args.html))
  WriteDictionary(args.out, dictionary)


if __name__ == '__main__':
  main()
|