File: test_utils.cc

package info (click to toggle)
gumbo-parser 0.13.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,900 kB
  • sloc: ansic: 30,496; cpp: 3,836; python: 890; makefile: 93; sh: 15
file content (204 lines) | stat: -rw-r--r-- 7,608 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jdtang@google.com (Jonathan Tang)

#include "test_utils.h"

#include "error.h"
#include "util.h"

int GetChildCount(GumboNode* node) {
  if (node->type == GUMBO_NODE_DOCUMENT) {
    return node->v.document.children.length;
  } else {
    return node->v.element.children.length;
  }
}

GumboTag GetTag(GumboNode* node) { return node->v.element.tag; }

GumboNode* GetChild(GumboNode* parent, int index) {
  if (parent->type == GUMBO_NODE_DOCUMENT) {
    return static_cast<GumboNode*>(parent->v.document.children.data[index]);
  } else {
    return static_cast<GumboNode*>(parent->v.element.children.data[index]);
  }
}

int GetAttributeCount(GumboNode* node) {
  return node->v.element.attributes.length;
}

GumboAttribute* GetAttribute(GumboNode* node, int index) {
  return static_cast<GumboAttribute*>(node->v.element.attributes.data[index]);
}

// Convenience function to do some basic assertions on the structure of the
// document (nodes are elements, nodes have the right tags) and then return
// the body node.
void GetAndAssertBody(GumboNode* root, GumboNode** body) {
  GumboNode* html = NULL;
  for (int i = 0; i < GetChildCount(root); ++i) {
    GumboNode* child = GetChild(root, i);
    if (child->type != GUMBO_NODE_ELEMENT) {
      ASSERT_EQ(GUMBO_NODE_COMMENT, child->type);
      continue;
    }
    ASSERT_TRUE(html == NULL);
    html = child;
  }
  ASSERT_TRUE(html != NULL);
  ASSERT_EQ(GUMBO_NODE_ELEMENT, html->type);
  EXPECT_EQ(GUMBO_TAG_HTML, GetTag(html));

  // There may be comment/whitespace nodes; this walks through the children of
  // <html> and assigns head/body based on them, or assert-fails if there are
  // fewer/more than 2 such nodes.
  GumboNode* head = NULL;
  *body = NULL;
  for (int i = 0; i < GetChildCount(html); ++i) {
    GumboNode* child = GetChild(html, i);
    if (child->type != GUMBO_NODE_ELEMENT) {
      continue;
    }

    if (!head) {
      head = child;
      EXPECT_EQ(GUMBO_TAG_HEAD, GetTag(head));
    } else if (!(*body)) {
      *body = child;
      EXPECT_EQ(GUMBO_TAG_BODY, GetTag(*body));
    } else {
      ASSERT_TRUE("More than two elements found inside <html>" != NULL);
    }
  }
  EXPECT_TRUE(head != NULL);
  ASSERT_TRUE(*body != NULL);
}

void SanityCheckPointers(
    const char* input, size_t input_length, const GumboNode* node, int depth) {
  ASSERT_GE(input_length, (size_t) 0);
  ASSERT_TRUE(node != NULL);
  // There are some truly pathological HTML documents out there - the
  // integration tests for this include one where the DOM "tree" is actually a
  // linked list 27,500 nodes deep - and so we need a limit on the recursion
  // depth here to avoid blowing the stack. Alternatively, we could externalize
  // the stack and use an iterative algorithm, but that gets us very little for
  // the additional programming complexity.
  if (node->type == GUMBO_NODE_DOCUMENT || depth > 400) {
    // Don't sanity-check the document as well...we start with the root.
    return;
  }
  if (node->type == GUMBO_NODE_ELEMENT) {
    const GumboElement* element = &node->v.element;
    // Sanity checks on original* pointers, making sure they fall within the
    // original input.
    if (element->original_tag.data && element->original_tag.length) {
      EXPECT_GE(element->original_tag.data, input);
      EXPECT_LT(element->original_tag.data, input + input_length);
      EXPECT_LE(element->original_tag.length, input_length);
    }
    if (element->original_end_tag.data && element->original_tag.length) {
      EXPECT_GE(element->original_end_tag.data, input);
      EXPECT_LT(element->original_end_tag.data, input + input_length);
      EXPECT_LE(element->original_end_tag.length, input_length);
    }
    EXPECT_GE(element->start_pos.offset, 0);
    EXPECT_LE(element->start_pos.offset, input_length);
    EXPECT_GE(element->end_pos.offset, 0);
    EXPECT_LE(element->end_pos.offset, input_length);

    const GumboVector* children = &element->children;
    for (int i = 0; i < children->length; ++i) {
      const GumboNode* child = static_cast<const GumboNode*>(children->data[i]);
      // Checks on parent/child links.
      ASSERT_TRUE(child != NULL);
      EXPECT_EQ(node, child->parent);
      EXPECT_EQ(i, child->index_within_parent);
      SanityCheckPointers(input, input_length, child, depth + 1);
    }
  } else {
    const GumboText* text = &node->v.text;
    EXPECT_GE(text->original_text.data, input);
    EXPECT_LT(text->original_text.data, input + input_length);
    EXPECT_LE(text->original_text.length, input_length);
    EXPECT_GE(text->start_pos.offset, 0);
    EXPECT_LT(text->start_pos.offset, input_length);
  }
}

// Custom allocator machinery to sanity check for memory leaks.  Normally we can
// use heapcheck/valgrind/ASAN for this, but they only give the
// results when the program terminates.  This means that if the parser is run in
// a loop (say, a MapReduce) and there's a leak, it may end up exhausting memory
// before it can catch the particular document responsible for the leak.  These
// allocators let us check each document individually for leaks.

static void* LeakDetectingMalloc(void* userdata, size_t size) {
  MallocStats* stats = static_cast<MallocStats*>(userdata);
  stats->bytes_allocated += size;
  ++stats->objects_allocated;
  // Arbitrary limit of 2G on allocation; parsing any reasonable document
  // shouldn't take more than that.
  assert(stats->bytes_allocated < (1 << 31));
  void* obj = malloc(size);
  // gumbo_debug("Allocated %u bytes at %x.\n", size, obj);
  return obj;
}

static void LeakDetectingFree(void* userdata, void* ptr) {
  MallocStats* stats = static_cast<MallocStats*>(userdata);
  if (ptr) {
    ++stats->objects_freed;
    // gumbo_debug("Freed %x.\n");
    free(ptr);
  }
}

void InitLeakDetection(GumboOptions* options, MallocStats* stats) {
  stats->bytes_allocated = 0;
  stats->objects_allocated = 0;
  stats->objects_freed = 0;

  options->allocator = LeakDetectingMalloc;
  options->deallocator = LeakDetectingFree;
  options->userdata = stats;
}

GumboTest::GumboTest()
    : options_(kGumboDefaultOptions), errors_are_expected_(false), text_("") {
  InitLeakDetection(&options_, &malloc_stats_);
  options_.max_errors = 100;
  parser_._options = &options_;
  parser_._output = static_cast<GumboOutput*>(
      gumbo_parser_allocate(&parser_, sizeof(GumboOutput)));
  gumbo_init_errors(&parser_);
}

GumboTest::~GumboTest() {
  if (!errors_are_expected_) {
    // TODO(jdtang): A googlemock matcher may be a more appropriate solution for
    // this; we only want to pretty-print errors that are not an expected
    // output of the test.
    for (int i = 0; i < parser_._output->errors.length && i < 1; ++i) {
      gumbo_print_caret_diagnostic(&parser_,
          static_cast<GumboError*>(parser_._output->errors.data[i]), text_);
    }
  }
  gumbo_destroy_errors(&parser_);
  gumbo_parser_deallocate(&parser_, parser_._output);
  EXPECT_EQ(malloc_stats_.objects_allocated, malloc_stats_.objects_freed);
}