File: textnorm.c

package info (click to toggle)
clamav 0.99%2Bdfsg-0%2Bdeb6u1
  • links: PTS, VCS
  • area: main
  • in suites: squeeze-lts
  • size: 63,444 kB
  • ctags: 51,567
  • sloc: cpp: 267,214; ansic: 163,108; sh: 35,371; python: 2,630; makefile: 2,249; perl: 1,690; yacc: 1,352; pascal: 1,218; lex: 714; lisp: 184; csh: 117; xml: 38; asm: 32; exp: 4
file content (161 lines) | stat: -rw-r--r-- 5,249 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/*
 *  Generic text normalizer.
 *
 *  Copyright (C) 2015 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 *  Copyright (C) 2008 Sourcefire, Inc.
 *
 *  Authors: Török Edvin
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 *  MA 02110-1301, USA.
 */

#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif

#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include "clamav.h"
#include "textnorm.h"
#include "bignum_fast.h"

/* Prepares @state for normalizing into the caller-supplied buffer @out,
 * which can hold @out_len bytes. The write position and the
 * "whitespace already written" flag start out cleared.
 * Returns CL_ENULLARG if @state is NULL, CL_SUCCESS otherwise. */
int text_normalize_init(struct text_norm_state *state, unsigned char *out, size_t out_len)
{
	if(state == NULL) {
		return CL_ENULLARG;
	}

	/* start from a clean slate ... */
	state->space_written = 0;
	state->out_pos = 0;

	/* ... and attach the output buffer */
	state->out_len = out_len;
	state->out = out;

	return CL_SUCCESS;
}

/* Rewinds @state: the next normalization writes from the start of the
 * output buffer again, and the "whitespace already written" flag is cleared.
 * The output buffer pointer and its length are left as they are. */
void text_normalize_reset(struct text_norm_state *state)
{
	/* forget that the last thing emitted may have been a space */
	state->space_written = 0;
	/* restart writing at the beginning of the output buffer */
	state->out_pos = 0;
}

/* What the normalizer does with a single input byte. */
enum normalize_action {
	NORMALIZE_COPY          = 0, /* emit the byte unchanged */
	NORMALIZE_SKIP          = 1, /* drop the byte, emit nothing */
	NORMALIZE_AS_WHITESPACE = 2, /* whitespace: consecutive ones collapse to a single ' ' */
	NORMALIZE_ADD_32        = 3  /* add 32 to the byte (uppercase to lowercase), then emit it */
};


/* Use shorter names in the table:
 *   IGN - skip the byte
 *   WSP - treat the byte as whitespace
 *   A32 - add 32 to the byte (uppercase to lowercase)
 *   NOP - copy the byte unchanged
 */
#define IGN NORMALIZE_SKIP
#define WSP NORMALIZE_AS_WHITESPACE
#define A32 NORMALIZE_ADD_32
#define NOP NORMALIZE_COPY

/*
 * Action to take for each possible byte value; the table is indexed by the
 * byte itself (one row per 16 values).
 *
 * whitespace (WSP): \t, \n, \v, \f, \r and the space character
 * tolowercase (A32): all uppercase characters, 'A' - 'Z' (0x41 - 0x5a)
 * nop (NOP): all characters 0x20 < c < 0x80 that are neither A32 nor WSP
 * ignore (IGN): control characters < 0x20 that are not whitespace, and all > 0x7f
 */

static const enum normalize_action char_action[256] = {
	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, WSP, WSP, WSP, WSP, WSP, IGN, IGN,/* 0x00 - 0x0f */
	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN,/* 0x10 - 0x1f */
	WSP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP,/* 0x20 - 0x2f */
	NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP,/* 0x30 - 0x3f */
	NOP, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32,/* 0x40 - 0x4f */
        A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, A32, NOP, NOP, NOP, NOP, NOP,/* 0x50 - 0x5f */
	NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP,/* 0x60 - 0x6f */
	NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP,/* 0x70 - 0x7f */
	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN,/* 0x80 - 0x8f */
	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN,/* 0x90 - 0x9f */
	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN,/* 0xa0 - 0xaf */
	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN,/* 0xb0 - 0xbf */
	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN,/* 0xc0 - 0xcf */
	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN,/* 0xd0 - 0xdf */
	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN,/* 0xe0 - 0xef */
	IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN, IGN /* 0xf0 - 0xff */
};

/* Normalizes the text at @buf of length @buf_len, @buf can include \0 characters.
 * Stores the normalized text in @state's buffer. 
 * Returns how many bytes it consumed of the input. */
size_t text_normalize_buffer(struct text_norm_state *state, const unsigned char *buf, const size_t buf_len)
{
	size_t i;
	const unsigned char *out_end = state->out + state->out_len;
	unsigned char *p = state->out + state->out_pos;

	for(i=0; i < buf_len && p < out_end; i++) {
		unsigned char c = buf[i];
		switch(char_action[c]) {
			case NORMALIZE_SKIP:
				continue;
			case NORMALIZE_AS_WHITESPACE:
				/* convert consecutive whitespaces to a single space */
				if(!state->space_written) {
					*p++ = ' ';
				}
				state->space_written = 1;
				continue;
			case NORMALIZE_ADD_32:
				/* aka uppercase to lowercase */
				c += 32;
				/* fall through */
			case NORMALIZE_COPY:
				state->space_written = 0;
				*p++ = c;
		}
	}
	state->out_pos = p - state->out;
	return i;
}

/* Normalizes the text in @map, starting at @offset, and stores the result in
 * @state's buffer.
 * The map is read in chunks of at most one map page (@map->pgsz). Reading
 * stops at the end of the map, when a chunk cannot be obtained from the map,
 * or when text_normalize_buffer() cannot consume any more input.
 * An @offset at or beyond the end of the map yields 0.
 * Returns the number of input bytes consumed from the map, i.e. the sum of
 * the values returned by text_normalize_buffer(). Skipped bytes and collapsed
 * whitespace produce no output, so this can be larger than the number of
 * bytes written; the write position is tracked in @state->out_pos. */
size_t text_normalize_map(struct text_norm_state *state, fmap_t *map, size_t offset)
{
	const unsigned char *map_loc;
	const uint64_t map_len = map->len;
	const unsigned int map_pgsz = map->pgsz;
	const size_t buff_len = state->out_len;
	size_t acc_total = 0;

	while (1) {
		uint64_t map_left;
		size_t acc_len;
		size_t acc;

		/* Clamp rather than subtract blindly: with @offset past the end of
		 * the map, the unsigned map_len - offset would wrap around to a huge
		 * value and the end-of-map check below would never trigger. */
		map_left = (offset < map_len) ? map_len - offset : 0;

		/* Break out if we've reached the end of the map or our buffer. */
		if(!(acc_len = MIN_3(map_pgsz, map_left, buff_len - acc_total))) break;

		/* If map_loc is NULL, then there's nothing left to do but recover. */
		if(!(map_loc = fmap_need_off_once(map, offset, acc_len))) break;
		offset += acc_len;

		/* If we didn't normalize anything, no need to update values, just break out. */
		if(!(acc = text_normalize_buffer(state, map_loc, acc_len))) break;
		acc_total += acc;
	}

	return acc_total;
}