File: filtering.c

package info (click to toggle)
clamav 0.98.7+dfsg-0+deb6u2
  • links: PTS, VCS
  • area: main
  • in suites: squeeze-lts
  • size: 60,204 kB
  • ctags: 49,129
  • sloc: cpp: 267,090; ansic: 152,211; sh: 35,196; python: 2,630; makefile: 2,220; perl: 1,690; pascal: 1,218; lisp: 184; csh: 117; xml: 38; asm: 32; exp: 4
file content (752 lines) | stat: -rw-r--r-- 22,058 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
/*
 *  A fast filter for static patterns.
 *
 *  Copyright (C) 2008 Sourcefire, Inc.
 *
 *  Authors: Török Edvin
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 *  MA 02110-1301, USA.
 */
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif

#include "clamav.h"
#include "filtering.h"
#include "matcher-ac.h"
#include <string.h>
#include <assert.h>
#include "perflogging.h"
/* ----- shift-or filtering -------------- */

/*
 * Description of algorithm:
 *
 * Multiple patterns are added to the filter.
 * The filter retains an approximation of these patterns, which can lead to
 * false positive matches, but not false negative matches.
 *
 * For each position in the filter we retain what qgrams can match at that
 * position, for example (if we'd use characters as qgrams):
 * pattern1: atu
 * pattern2: bzf
 * pattern3: xat
 * 
 * filter accepts:
 * [abx][tza][uft]
 *
 * But it also accepts (false positives):
 * azu, azf, azt, ...
 *
 * It doesn't however accept:
 * aaa, atz, ...
 *
 * This is implemented by having a bit-level state-machine with MAXSOPATLEN (=32) states, 
 * each active bit meaning that a state is active.
 * 
 * The states are activated sequentially, eachtransition decision is made 
 * considering if we can accept the character at position X. 
 * Since we can start a match at any position, position 0 is
 * reactivated each time.
 * When the last position is activated, the filter reports a match.
 * If we can't accept the character at position X, the state remains inactive,
 * and further states aren't activated (unless we activate this state in the
 * future).
 *
 * Essentially this is an automaton like this:
 *
 *  /\    (a|b|x)        (t|z|a)        (u|f|t)
 * [S1] ---------> [S2] -------> [S3] ---------> [S4] -> match
 *  \_______________/             |               
 *  \_____________________________/               
 *
 *
 * But we are tracking multiple active states at each time (or run N automatons
 * in parallel if you like, N = number of states).
 *
 * We can have S3 and S2 active, meaning that if the next character is
 * acceptable, it transitions to S1,S3 and S4 being active, otherwise it
 * transitions to S1 being active.
 *
 * Active states can either be represented as a binary 1 or 0, and using
 * bit-shifting and masking.
 * If we choose 1, we must use &, and after shifting always reactivate bit 0.
 * If we choose 0, we must use |, and after shifting we don't need to do
 * anything (since by shifting a 0 is implicitly introduced).
 *
 * This file implements the latter (shift-or) method.
 *
 * The discussion above considered pattern to be of same length (or truncated to
 * be so). In reality patterns are of variable length, and we often have short
 * pattern.
 *
 * Thus another bitmap was introduced, meaning that if (end[Q] == set), then
 * a pattern can end at this position.
 * Also we would fill the pattern's position filters quite quickly with only 256
 * choices for a position, so the algorithm uses overlapping qgrams of length 2:
 * 'abcd' is 3 qgrams: 'ab','bc','cd'
 *
 * The algorithm is very sensitive to the end[Q] filter, since it can have false
 * positives due to short patterns!
 * For optimal performance we need:
 *   - patterns as long as possible
 *   - probability for end[Q] to match low (avoid 0000, and other common case
 *   - choose the most "diverse" subset from a long pattern
 *
 * diverse = refering to what we are scanning, so that the filter rarely
 * matches, so this actually means that we *want* to avoid adding more
 * characters to the filter, if we have 2 patterns:
 * abxfg, and dalabxpo, it may be preferable to shift the 2nd one so that we
 * don't add new character at the beginning.
 *
 * With NDB signatures there are more challenges to overcome:
 *    e8??0000000aa
 *
 *    will make the filter accept:
 *    e8<all-256-values-here>, <all-256-values>00, ... 000000aa
 *
 *    We should delay the pattern end as long as possible, especially if it is  0000
 *    The problem is that now the filter accepts 0000 on position 3, regardless
 *    of what we have on position 1 (even if we have something else than e8), so
 *    we have to be very careful not to allow 0000 on first position too,
 *    otherwise the filter will happily accept 000000000000.
 *
 * To optimize cache usage there are 2 end filters, one character (fits L1), and one qgram
 * based (fits L2), both must match for the filter to consider it a match.   
 *
 *
 */

/*#define DETAILED_DEBUG*/
#ifdef DETAILED_DEBUG
#define detailed_dbg cli_dbgmsg
#else
#define detailed_dbg(...)
#endif

#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & (1 << ((val) & 0x1f)))
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= (1 << ((val) & 0x1f)))

void filter_init(struct filter *m)
{
	memset(m->B, ~0, sizeof(m->B));
	memset(m->end, ~0, sizeof(m->end));
}

/* because we use uint32_t */
#define MAXSOPATLEN 8

static inline int filter_isset(const struct filter *m, unsigned pos, uint16_t val)
{
	return !(m->B[val] & (1<<pos));
}

static inline void filter_set_atpos(struct filter *m, unsigned pos, uint16_t val)
{
	if (!filter_isset(m, pos, val)) {
		cli_perf_log_count(FILTER_LOAD, pos);
		m->B[val] &= ~(1<<pos);
	}
}


static inline int filter_end_isset(const struct filter *m, unsigned pos, uint16_t a)
{
	return !(m->end[a] & (1<<pos));
}

static inline void filter_set_end(struct filter *m, unsigned pos, uint16_t a)
{
	if (!filter_end_isset(m, pos, a)) {
		cli_perf_log_count(FILTER_END_LOAD, pos);
		m->end[a] &= ~(1 << pos);
	}
}
#define MAX_CHOICES 8
/* just an arbitrary limit, if patterns are longer, we cut
 * the filter can only use MAXSOPATLEN (32) characters,
 * this longer buffer is needed so that we can choose the "best" subpattern from
 * it */
#define MAXPATLEN 255

/* merge another pattern into the filter
 * add('abc'); add('bcd'); will match [ab][bc][cd] */
int filter_add_static(struct filter *m, const unsigned char *pattern, unsigned long len, const char *name)
{
	uint16_t q = 0;
	uint8_t j, maxlen;
	uint32_t best = 0xffffffff;
	uint8_t best_pos = 0;

    UNUSEDPARAM(name);

	cli_perf_log_count(TRIE_ORIG_LEN, len > 8 ? 8 : len);
	/* TODO: choose best among MAXCHOICES */
	/* cut length */
	if(len > MAXPATLEN) {
		len = MAXPATLEN;
	}
	if(len < 2)
		return -1;

	/* we want subsigs to be as long as possible */
	if (len > 4) {
		maxlen = len - 4;
		if (maxlen == 1) maxlen = 2;
	} else
		maxlen = 2;
	for(j=0;(best < 100 && j<MAX_CHOICES) || (j < maxlen) ;j++) {
		uint32_t num = MAXSOPATLEN;
		uint8_t k;
		if (j+2 > len)
			break;
		for(k=j;k<len-1 && (k-j < MAXSOPATLEN);k++) {
			q = cli_readint16( &pattern[k] );
			/* we want to favor subsigs that add as little as
			 * possible to the filter */
			num += filter_isset(m, k-j, q) ? 0 : MAXSOPATLEN - (k-j);
			if ((k == j || k == j+1) && (q == 0x0000 || q == 0xffff))
				num += k==j ?  10000 : 1000;/* bad */
		}
		/* it is very important to keep the end set small */
		num += 10*(filter_end_isset(m, k-j-1, q) ? 0 : 1);
		/* it is very important to have signatures as long as possible
		 * */
		num += 5*(MAXSOPATLEN - (k-j));
		/* if we are lower length than threshold penalize */
		if (k-j+1 < 4)
			num += 200;
		/* favour longer patterns */
		num -= (2*MAXSOPATLEN - (k + 1+j))*(k-j)/2;

		if (num < best) {
			best = num;
			best_pos = j;
		}
	}

	assert(best_pos < len-1);
	if (pattern[best_pos] == 0 && pattern[best_pos+1] == 0) {
		detailed_dbg("filter (warning): subsignature begins with zero (static): %s\n", name);
	}
	pattern += best_pos;
	len -= best_pos;
	/* cut length */
	if(len > MAXSOPATLEN) {
		len = MAXSOPATLEN;
	}
	/* Shift-Or like preprocessing */
	for(j=0;j < len-1;j++) {
		/* use overlapping little-endian 2-grams. We need them overlapping because matching can start at any position */
		q = cli_readint16( &pattern[j] );
		filter_set_atpos(m, j, q);
	}
	/* we use variable length patterns, use last character to mark pattern end,
	 * can lead to false positives.*/
	/* mark that at state j, the q-gram q can end the pattern */
	if(j) {
		j--;
		filter_set_end(m, j, q);
	}
	return j+2;
}

struct char_spec {
	/* if non-null i-th character = alt[start + step*i]; start+step*i < end;
	 */
	struct cli_ac_special *alt;
	uint8_t start;
	uint8_t end;
	uint8_t step;
	uint8_t negative;
};

static inline unsigned char spec_ith_char(const struct char_spec *spec, unsigned i)
{
	const struct cli_ac_special *alt = spec->alt;
	if (alt) {
		assert (alt->type == 1);
		assert (i < alt->num);
		return alt->str[i];
	}
	return i;
}

#ifndef MIN
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif

#define SPEC_FOREACH(spec0, k0, spec1, k1) do {\
    unsigned char c0 = spec_ith_char(spec0, k0);\
    unsigned char c1 = spec_ith_char(spec1, k1);\
    unsigned c0end, c1end, cc0,cc1;\
    c0end = spec0->negative ? 255 : c0;\
    c1end = spec1->negative ? 255 : c1;\
    cc0 = spec0->negative ? 0 : c0;\
    cc1 = spec1->negative ? 0 : c1;\
    for (;cc0 <= c0end;cc0++) {\
	for (;cc1 <= c1end; cc1++) {\
	    uint16_t a = cc0 | (cc1<<8);\
	    if (spec0->negative && cc0 == c0)\
	    continue;\
	    if (spec1->negative && cc1 == c1)\
	    continue;

#define SPEC_END_FOR }}} while(0)

enum badness {
	reject,
	/* try to avoid if possible */
	avoid_first,
	avoid_anywhere, /* includes avoid_first! */
	/* not that bad, but still not best */
	dontlike,
	acceptable,
	like
};
static inline void get_score(enum badness badness, unsigned i, const struct filter *m, const struct char_spec *spec0, const struct char_spec *spec1, int32_t *score, int32_t *score_end)
{
	int32_t base;
	unsigned k0, k1, num_introduced = 0, num_end_introduced = 0;
	switch (badness) {
		case reject:
			/* not reached */
			assert(0);
			base = -0x7fffff;
			break;
		case avoid_first:
			if (!i)
				base = -0x700000;
			else
				base = 0;
			break;
		case avoid_anywhere:
			if (!i)
				base = -0x720000;
			else
				base = -0x1000;
			break;
		case dontlike:
			base = 0;
			break;
		case acceptable:
			base = 0x200;
			break;
		case like:
			/* a bit better only */
			base = 0x201;
			break;
	}
	if (base < 0) {
		*score = base;
		*score_end = base;
		return;
	}
	/* at most 256 iterations here, otherwise base would be negative */
	for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) {
		for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) {
		    SPEC_FOREACH(spec0, k0, spec1, k1) {
			num_introduced += filter_isset(m, i, a);
			num_end_introduced += filter_end_isset(m, i, a);
		    } SPEC_END_FOR;
		}
	}
	*score = base - num_introduced;
	*score_end = base - num_end_introduced;
	if (badness == avoid_first && i) {
		/* what is bad to begin with, is bad at end too */
		*score_end -= 0x1000;
	}
}

struct choice {
	enum badness base;
	unsigned begin;
	unsigned len;
};

static inline void add_choice(struct choice *choices, unsigned *cnt, unsigned i, unsigned ie, enum badness badness)
{
	struct choice *choice;
	int i_neg = -1;
	assert(ie < MAXPATLEN);
	if (ie < i+1)
		return;
	if (*cnt >= MAX_CHOICES)
		return;
	if (badness > avoid_first && *cnt >= (MAX_CHOICES >> 1)) {
		unsigned j;
		/* replace very bad picks if we're full */
		for (j=0;j<*cnt;j++) {
			if (choices[j].base < badness) {
				if (i_neg == -1 || choices[j].base < choices[i_neg].base) {
					i_neg = j;
				}
			}
		}
	}
	if (i_neg != -1) {
		choice = &choices[i_neg];
	} else {
		choice = &choices[(*cnt)++];
	}
	choice->begin = i;
	choice->len = ie - i + 1;
	choice->base = badness;
}

static inline int32_t spec_iter(const struct char_spec *spec)
{
    unsigned count;
    assert(spec->step);
    count = (1 + spec->end - spec->start)/spec->step;
    if (spec->negative) /* all chars except itself are added */
	count *= 254;
    return count;
}

int  filter_add_acpatt(struct filter *m, const struct cli_ac_patt *pat)
{
	unsigned i, j = 0, stop = 0, l=0;
	uint16_t k0, k1;

	struct char_spec chars[MAXPATLEN];
	enum badness char_badness[MAXPATLEN];
	unsigned char patc[MAXPATLEN];
	unsigned altcnt = 0;
	int32_t best_score = -0x7fffffff;
	unsigned best_score_i = 0;
	unsigned best_score_len = 0;
	struct char_spec *spec0 = NULL, *spec1 = NULL;

	struct choice choices[MAX_CHOICES];
	unsigned choices_cnt = 0;
	unsigned prefix_len = pat->prefix_length;
	unsigned speci;

	j = MIN(prefix_len + pat->length, MAXPATLEN);
	for(i=0;i<j;i++) {
		const uint16_t p = i < prefix_len ? pat->prefix[i] : pat->pattern[i - prefix_len];
		if ((p&CLI_MATCH_WILDCARD) != CLI_MATCH_CHAR)
			break;
		patc[i] = (uint8_t)p;
	}
	if (i == j) {
		/* all static, use add_static it has better heuristics for this
		 * case */
		return filter_add_static(m, patc, j, pat->virname);
	}
	cli_perf_log_count(TRIE_ORIG_LEN, j > 8 ? 8 : j);
	i = 0;
	if (!prefix_len) {
	    while ((pat->pattern[i] & CLI_MATCH_WILDCARD) == CLI_MATCH_SPECIAL) {
		/* we support only ALT_CHAR, skip the rest */
		if (pat->special_table[altcnt]->type == 1)
		    break;
		altcnt++;
		i++;
	    }
	}
	/* transform AC characters into our representation */
	for (speci=0;i<j && !stop; speci++,i++) {
		struct char_spec *spec = &chars[speci];
		const uint16_t p = i < prefix_len ? pat->prefix[i] : pat->pattern[i - prefix_len];
		spec->alt = NULL;
		spec->negative = 0;
		switch (p & CLI_MATCH_WILDCARD) {
			case CLI_MATCH_CHAR:
				spec->start = spec->end = (uint8_t)p;
				spec->step  = 1;
				break;
			case CLI_MATCH_IGNORE:
				spec->start = 0x00;
				spec->end   = 0xff;
				spec->step  = 1;
				break;
			case CLI_MATCH_SPECIAL:
				assert(pat->special_table);
				/* assert(altcnt < pat->alt); */
				assert(pat->special_table[altcnt]);
				spec->negative = pat->special_table[altcnt]->negative;
				switch (pat->special_table[altcnt++]->type) {
				    case 1: /* ALT_CHAR */
					spec->start = 0;
					spec->end = pat->special_table[altcnt-1]->num - 1;
					spec->step = 1;
					spec->alt = pat->special_table[altcnt-1];
					break;
				    default:
					stop = 1;
					break;	/* TODO: should something be done here?
					 * */
				}
				break;
			case CLI_MATCH_NIBBLE_HIGH:
				spec->start = (p & 0xf0);
				spec->end   = spec->start | 0x0f;
				spec->step  = 1;
				break;
			case CLI_MATCH_NIBBLE_LOW:
				spec->start = (p & 0xf);
				spec->end   = 0xf0 | spec->start;
				spec->step  = 0x10;
				break;
			default:
				cli_errmsg("filtering: unknown wildcard character: %d\n", p);
				return -1;
		}
	}
	if (stop) --speci;
	j = speci;
	if (j < 2) {
		if (stop)
			cli_warnmsg("Don't know how to create filter for: %s\n",pat->virname);
		else
			cli_warnmsg("Subpattern too short: %s\n", pat->virname);
		return -1;
	}

	for(i=0;i<j-1;i++) {
		int32_t num_iter;
		/* new qgrams added to the filter */
		spec0 = &chars[i];
		spec1 = &chars[i+1];
		num_iter = spec_iter(spec0) * spec_iter(spec1);

		if (num_iter >= 0x100) {
			if (num_iter == 0x10000)
				char_badness[i] = reject;
			else
				char_badness[i] = avoid_anywhere;
		} else {
			int8_t binary = 0;
			enum badness scor = acceptable;
			for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) {
				for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) {
					unsigned char c0 = spec_ith_char(spec0, k0);
					unsigned char c1 = spec_ith_char(spec1, k1);
					if (spec0->negative || spec1->negative) {
					    scor = avoid_anywhere;
					    break;
					}
					if ((!c0 && !c1) || (c0 == 0xff && c1 == 0xff)) {
						scor = avoid_first;
						break;
					}
					if (c0 == c1) {
						scor = dontlike;
						break;
					}
					if ((c0 < 32 || c0 > 127) && (c1 < 32 || c1 >127))
						binary = 1;
				}
			}
			if (scor == acceptable && binary) {
				/* slightly favor binary */
				scor = like;
			}
			char_badness[i] = scor;
		}
	}

	/* try to choose best subpattern */

	/* calculating the score for all possible i start pos
	 * and all possible length is too slow, so choose best among N choices
	 * only */
	for (i=0;i<j-1 && choices_cnt < MAX_CHOICES;i++) {
		enum badness base0 = like, base1 = like;
		unsigned kend = MIN(j-1, (i + MAXSOPATLEN)&~1), k;
		int ki = -0xff;
		/* add 2 scores: pattern with max length, one where we stop at
		 * first negative, and one we stop at last positive, but never
		 * include reject */
		assert(kend-1 < j-1);
		if (char_badness[i]  == reject)
			continue;
		if ((char_badness[i] == avoid_anywhere || char_badness[i] == avoid_first)
				&& choices_cnt > 0)
			/* if we have another choice don't choose this */
			continue;
		while ((kend > i+3) && char_badness[kend-1] == reject) kend--;
		for (k=i;k<kend;k++) {
			enum badness badness = char_badness[k];
			if (badness < acceptable) {
				if (badness == reject) {
					/* this is a never pick */
					kend = k;
					break;
				}
				if (badness == avoid_first && k != i)
					badness = dontlike;
				if (k == i && badness == avoid_anywhere)
					badness = avoid_first;
				if (ki == -0xff)
					ki = k;
			}
			base0 = MIN(base0, badness);
			if (ki == -0xff)
				base1 = MIN(base1, badness);
		}
		add_choice(choices, &choices_cnt, i, kend, base0);
		if (ki > (int)i) {
			/* ki|ki+1|??| */
			/* try subpattern from after the wildcard */
			i = ki;
		}
		/* if score is positive, it replaces a negative choice */
	}
	for(l=0;l<choices_cnt;l++) {
		int32_t score;
		unsigned kend;
		unsigned k;

		i = choices[l].begin;
		kend = i + choices[l].len;
		score = 0;

		for(k = i; k < kend-1; k++) {
			unsigned p = k - i;
			int32_t iscore, score_end;
			assert(k < j);
			get_score(char_badness[k], p, m, &chars[k], &chars[k+1],
				  &iscore, &score_end);
			/* give more importance to the score of the characters
			 * at the beginning */
			/* TODO: tune magic number here */
			if (p < 6) {
				iscore *= (6-p);
				score_end *= (6-p);
			}
			score += iscore;
			if (score + score_end > best_score) {
				/* we may have negative scores, so truncating
				 * the pattern could actually get us a higher
				 * score */
				best_score = score + score_end;
				best_score_len = p + 2;
				best_score_i = i;
				assert(i + best_score_len <= j);
			}
		}
	}

	if (best_score <= -0x7fffffff) {
		cli_warnmsg("filter rejecting %s due to very bad score: %ld\n", pat->virname, (long)best_score);
		return -1;
	}
	if (choices_cnt == 0) {
		cli_warnmsg("filter rejecting %s because there are no viable choices", pat->virname);
		return -1;
	}
	assert(best_score_len >= 2);
	detailed_dbg("filter %s score: %ld, %u (+ %u)\n", pat->virname, (long)best_score, best_score_i, best_score_len);
	/* Shift-Or like preprocessing */
	assert(1 < best_score_len);
	for (i=0;i < best_score_len-1;i++) {
		spec0 = &chars[best_score_i + i];
		spec1 = &chars[best_score_i + i + 1];
		/* use overlapping little-endian 2-grams, overlapping because match can start
		 * at any position (including odd) */

		for(k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) {
			for(k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) {
			    SPEC_FOREACH(spec0, k0, spec1, k1) {
				if (!cc0 && !cc1 && !i) {
					detailed_dbg("filter (warning): subsignature begins with zero: %s\n",pat->virname);
				}
				filter_set_atpos(m, i, a);
			    } SPEC_END_FOR;
			}
		}
	}

	j  = best_score_len - 2;
	if (spec0 && spec1) {
	    for (k0=spec0->start;k0 <= spec0->end;k0 += spec0->step) {
		for (k1=spec1->start;k1 <= spec1->end;k1 += spec1->step) {
		    SPEC_FOREACH(spec0, k0, spec1, k1) {
			if (!cc0 && !cc1) {
			    detailed_dbg("filter (warning): subsignature ends with zero: %s\n",pat->virname);
			}
			filter_set_end(m, j, a);
		    } SPEC_END_FOR;
		}
	    }
	}
	return j+2;
}

/* state 11110011 means that we may have a match of length min 4, max 5 */

__hot__ int filter_search_ext(const struct filter *m, const unsigned char *data, unsigned long len, struct filter_match_info *inf)
{
	size_t j;
	uint8_t state = ~0;
	const uint8_t *B = m->B;
	const uint8_t *End = m->end;

	if (len < 2) return -1;
	/* look for first match */
	for (j=0; j < len-1;j++) {
		uint8_t match_state_end;
		const uint16_t q0 = cli_readint16( &data[j] );

		state = (state << 1) | B[q0];
		match_state_end = state | End[q0];
		if (match_state_end != 0xff) {
			inf->first_match = j;
      return 0;
		}
	}
  /* no match, inf is invalid */
  return -1;
}

/* this is like a FSM, with multiple active states at the same time.
 * each bit in "state" means an active state, when a char is encountered
 * we determine what states can remain active.
 * The FSM transition rules are expressed as bit-masks */
long filter_search(const struct filter *m, const unsigned char *data, unsigned long len)
{
	size_t j;
	uint8_t state = ~0;
	const uint8_t *B = m->B;
	const uint8_t *End = m->end;

	/* we use 2-grams, must be higher than 1 */
	if(len < 2) return -1;
	/* Shift-Or like search algorithm */
	for(j=0;j < len-1; j++) {
		const uint16_t q0 = cli_readint16( &data[j] );
		uint8_t match_end;
		state = (state << 1) | B[q0];
		/* state marks with a 0 bit all active states
		 * End[q0] marks with a 0 bit all states where the q-gram 'q' can end a pattern
		 * if we got two 0's at matching positions, it means we encountered a pattern's end */
		match_end = state | End[q0];
		if(match_end != 0xff) {

			/* if state is reachable, and this character can finish a pattern, assume match */
			/* to reduce false positives check if qgram can finish the pattern */
			/* return position of probable match */
			/* find first 0 starting from MSB, the position of that bit as counted from LSB, is the length of the
			 * longest pattern that could match */
			return j >= MAXSOPATLEN  ? j - MAXSOPATLEN : 0;
		}
	}
	/* no match */
	return -1;
}