File: iv_bm_search_test.cc

package info (click to toggle)
gnome-commander 1.18.4-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 25,796 kB
  • sloc: cpp: 44,542; xml: 3,929; lex: 600; sh: 80; makefile: 13
file content (190 lines) | stat: -rw-r--r-- 7,702 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
/**
 * @file iv_bm_byte_test.cc
 * @brief Part of GNOME Commander - A GNOME based file manager
 *
 * @details In this file all tests are placed which belong to the
 * Boyer–Moore string search algorithm, used in the internal viewer of
 * GCMD. Currently, only a single test is implemented in which a short
 * pattern of integers is searched inside a bigger array of integers.
 *
 * @copyright (C) 2006 Assaf Gordon\n
 * @copyright (C) 2007-2012 Piotr Eljasiak\n
 * @copyright (C) 2013-2024 Uwe Scholz\n
 *
 * @copyright This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * @copyright This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * @copyright You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include "gtest/gtest.h"
#include <bm_byte.h>
#include <gvtypes.h>
#include <viewer-utils.h>
#include <bm_chartype.h>

/**
 *  The fixture for testing class BmByteTest.
 */
class BmByteTest : public ::testing::Test {
  public:
    BmByteTest() : pattern{0x01, 0x04, 0xB4, 0xFE, 0x01, 0x01, 0x04},
                   text{
0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,
0x1f,0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,
0x2e,0x2f,0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,
0x3d,0x3e,0x3f,0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,
0x4c,0x4d,0x4e,0x4f,0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x01,0x04,0x59,0x5a,
0x5b,0x5c,0x5d,0x5e,0x5f,0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,
0x6a,0x6b,0x6c,0x6d,0x01,0x01,0x04,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,
0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x01,0x04,0xB4,0xFE,0x01,0x01,0x04,0x96, //match on this line
0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,
0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,0xb0,0xb1,0xb2,0xb3,0xb4,
0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,0xc0,0xc1,0xc2,0xc3,
0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,0xd0,0x05,0x04,
0xB4,0xFe,0x01,0x01,0x04,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,0xe0,0xe1,
0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,0xf0,
0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0x01,0x04,0xb4,0xfe,0x01,0x01,0x04,0x00} {}; // match on this line, too
  protected:
    const guint8 pattern[7];
    const guint8 text[256];
};

////////////////////////////////////////////////////////////////////////

/**
 * In this test a short pattern of integers is searched inside a 
 * bigger array of integers.
 * (see definitions in @link BmByteTest @endlink)
 */
TEST_F(BmByteTest, match_test) {

    GViewerBMByteData *data;
    int expected_good_data[7] {5, 5, 5, 5, 5, 7, 1};
    int expected_bad_data[4] {1, 5, 4, 3,};

    data = create_bm_byte_data(pattern,sizeof(pattern));

    printf("Good suffices table:\n");
    for (int i = 0; i < data->pattern_len; i++)
    {
        ASSERT_EQ(data->good[i], expected_good_data[i]) << "Vectors data->good and expected_good_data differ at index " << i;

        printf("%d ", data->good[i]);
    }
    printf("\n\n");

    printf("Bad characters table:\nChar(ASCII)\tValue\n");
    for (int i = 0, j = 0; i < 256; i++)
    {
        if (data->bad[i] != data->pattern_len)
        {
            ASSERT_EQ(data->bad[i], expected_bad_data[j]) << "Vectors data->bad and expected_bad_data differ at index " << j;
            j++;
            printf("%c(%d)\t\t%d\n",i , i, data->bad[i]);
        }
    }
    printf("(All other characters have value of %d)\n\n", data->pattern_len);

    // Do the actual search: The test search should find a match at 
    // position 142 and 248, nowhere else in the sample text.
    int m = data->pattern_len;
    int n = sizeof(text);
    int i; 
    int j = 0;
    while (j <= n - m) {
        for (i = m - 1; i >= 0 && pattern[i] == text[i + j]; --i);
        if (i < 0) {
            printf(" Found match at offset = %d\n", j);
            ASSERT_TRUE(j == 142 || j == 248);
            j += data->good[0];
        }
        else
            j += MAX(data->good[i], data->bad[(int)text[i + j]] - m + 1 + i);
    }

    free_bm_byte_data(data);
}

////////////////////////////////////////////////////////////////////////

/**
 * In this test a pattern of UTF-8 encoded letters is searched in a text
 * with UTF-8 encoded letters.
 */
TEST_F(BmByteTest, chartype_test) {
    /* 
     * This is a valid UTF8 string, with four hebrew letters in it:
     *  0xD7 0x90 = Aleph (Unicode U+5D0)
     *  0xD7 0x95 = Vav   (Unicode U+5D5)
     *  0xD7 0x94 = He    (Unicode U+5D4)
     *  0xD7 0x91 = Bet   (Unicode U+5D1)
     *  (Aleph-Vav-He-Bet, pronounced "ohev", means "love" in hebrew, FYI :-)
     */
    const gchar *pattern = "I \xd7\x90\xd7\x95\xd7\x94\xd7\x91 you";


    // This is a valid UTF8 text, with pangrams in several languages (I hope I got it right...)
    const gchar *text = \
        "English:" \
        "The quick brown fox jumps over the lazy dog" \
        "Irish:" \
        "An \xe1\xb8\x83 fuil do \xc4\x8bro\xc3\xad ag buala\xe1\xb8\x8b \xc3\xb3 \xe1\xb8\x9f ait\xc3\xados an \xc4\xa1r\xc3\xa1 a \xe1\xb9\x81 eall lena \xe1\xb9\x97\xc3\xb3g \xc3\xa9 ada \xc3\xb3 \xe1\xb9\xa1l\xc3\xad do leasa \xe1\xb9\xab\xc3\xba\x3f" \
        "Swedish:" \
        "Flygande b\xc3\xa4 ckasiner s\xc3\xb6ka strax hwila p\xc3\xa5 mjuka tuvor" \
        "(our match: I \xd7\x90\xd7\x95\xd7\x94\xd7\x91 You)" \
        "Hebrew:" \
        "\xd7\x96\xd7\x94 \xd7\x9b\xd7\x99\xd7\xa3 \xd7\xa1\xd7\xaa\xd7\x9d \xd7\x9c\xd7\xa9\xd7\x9e\xd7\x95\xd7\xa2 \xd7\x90\xd7\x99\xd7\x9a \xd7\xaa\xd7\xa0\xd7\xa6\xd7\x97 \xd7\xa7\xd7\xa8\xd7\xa4\xd7\x93 \xd7\xa2\xd7\xa5 \xd7\x98\xd7\x95\xd7\x91 \xd7\x91\xd7\x92\xd7\x9f" \
        "French:" \
        "Les na\xc3\xaf fs \xc3\xa6githales h\xc3\xa2tifs pondant \xc3\xa0 No\xc3\xabl o\xc3\xb9 il g\xc3\xa8le sont s\xc3\xbbrs d\x27\xc3\xaatre d\xc3\xa9\xc3\xa7us et de voir leurs dr\xc3\xb4les d\x27\xc5\x93ufs ab\xc3\xaem\xc3\xa9s\x2e";

    int i;
    int j;
    int m;
    int n;
    char_type *ct_text;
    int  ct_text_len;

    GViewerBMChartypeData *data;

    data = create_bm_chartype_data(pattern,FALSE);

    // Convert the UTF8 text string to a chartype array
    ct_text = convert_utf8_to_chartype_array(text, ct_text_len);
    ASSERT_TRUE(ct_text) << "Failed to convert text to 'char_type' array (maybe 'text' is not a valid UTF8 string?)\n";

    // Do the actual search
    m = data->pattern_len;
    n = ct_text_len;
    j = 0;
    int found_at = 0;
    while (j <= n - m)
    {
        for (i = m - 1; i >= 0 && bm_chartype_equal(data,i,ct_text[i + j]); --i);

        if (i < 0)
        {
            printf(" Found match at offset = %d\n", j);
            found_at = j;
            j += bm_chartype_get_good_match_advancement(data);

        }
        else
            j += bm_chartype_get_advancement(data, i, ct_text[i+j]);
    }
    ASSERT_EQ(found_at, 217) << "String with UTF-8 letters not found in text where it should be found.";

    g_free(ct_text);
    free_bm_chartype_data(data);
}