Package: gumbo-parser | Debian Sources

Package: gumbo-parser / 0.13.0+dfsg-2

03-bs4.patch
From 29e1abb337af2a15ac4b38fb1c28d1b55ed08d54 Mon Sep 17 00:00:00 2001
From: Roman Miroshnychenko <romanvm@yandex.ua>
Date: Tue, 19 Jul 2016 18:25:52 +0300
Subject: [PATCH] Updates soup_adapter to use BeautifulSoup 4

Also fixes the indentation according to PEP-8
---
 python/gumbo/soup_adapter.py | 123 +++++++++++++++++------------------
 1 file changed, 61 insertions(+), 62 deletions(-)

diff --git a/python/gumbo/soup_adapter.py b/python/gumbo/soup_adapter.py
index b18748f..6a247dd 100644
--- a/python/gumbo/soup_adapter.py
+++ b/python/gumbo/soup_adapter.py
@@ -13,66 +13,65 @@
 # limitations under the License.
 #
 
-"""Adapter between Gumbo and BeautifulSoup.
+"""Adapter between Gumbo and BeautifulSoup 4.
 
-This parses an HTML document and gives back a BeautifulSoup object, which you
-can then manipulate like a normal BeautifulSoup parse tree.
+This parses an HTML document and gives back a BeautifulSoup 4 object, which you
+can then manipulate like a normal BeautifulSoup 4 parse tree.
 """
 
 __author__ = 'jdtang@google.com (Jonathan Tang)'
 
-import BeautifulSoup
-
+import bs4 as BeautifulSoup
 import gumboc
 
 
 def _utf8(text):
-  return text.decode('utf-8', 'replace')
+    return text.decode('utf-8', 'replace')
 
 
 def _add_source_info(obj, original_text, start_pos, end_pos):
-  obj.original = str(original_text)
-  obj.line = start_pos.line
-  obj.col = start_pos.column
-  obj.offset = start_pos.offset
-  if end_pos:
-    obj.end_line = end_pos.line
-    obj.end_col = end_pos.column
-    obj.end_offset = end_pos.offset
+    obj.original = str(original_text)
+    obj.line = start_pos.line
+    obj.col = start_pos.column
+    obj.offset = start_pos.offset
+    if end_pos:
+        obj.end_line = end_pos.line
+        obj.end_col = end_pos.column
+        obj.end_offset = end_pos.offset
 
 
 def _convert_attrs(attrs):
-  # TODO(jdtang): Ideally attributes would pass along their positions as well,
-  # but I can't extend the built in str objects with new attributes.  Maybe work
-  # around this with a subclass in some way...
-  return [(_utf8(attr.name), _utf8(attr.value)) for attr in attrs]
+    # TODO(jdtang): Ideally attributes would pass along their positions as well,
+    # but I can't extend the built in str objects with new attributes.  Maybe work
+    # around this with a subclass in some way...
+    return [(_utf8(attr.name), _utf8(attr.value)) for attr in attrs]
 
 
 def _add_document(soup, element):
-  # Currently ignored, since there's no real place for this in the BeautifulSoup
-  # API.
-  pass
+    # Currently ignored, since there's no real place for this in the BeautifulSoup
+    # API.
+    pass
 
 
 def _add_element(soup, element):
-  # TODO(jdtang): Expose next/previous in gumbo so they can be passed along to
-  # BeautifulSoup.
-  tag = BeautifulSoup.Tag(
-      soup, _utf8(element.tag_name), _convert_attrs(element.attributes))
-  for child in element.children:
-    tag.append(_add_node(soup, child))
-  _add_source_info(
-      tag, element.original_tag, element.start_pos, element.end_pos)
-  tag.original_end_tag = str(element.original_end_tag)
-  return tag
+    # TODO(jdtang): Expose next/previous in gumbo so they can be passed along to
+    # BeautifulSoup.
+    tag = BeautifulSoup.Tag(
+        name=_utf8(element.tag_name), attrs=_convert_attrs(element.attributes))
+    for child in element.children:
+        tag.append(_add_node(soup, child))
+    _add_source_info(
+        tag, element.original_tag, element.start_pos, element.end_pos)
+    tag.original_end_tag = str(element.original_end_tag)
+    return tag
 
 
 def _add_text(cls):
-  def add_text_internal(soup, element):
-    text = cls(_utf8(element.text))
-    _add_source_info(text, element.original_text, element.start_pos, None)
-    return text
-  return add_text_internal
+    def add_text_internal(soup, element):
+        text = cls(_utf8(element.text))
+        _add_source_info(text, element.original_text, element.start_pos, None)
+        return text
+    return add_text_internal
 
 
 _HANDLERS = [
@@ -87,35 +86,35 @@ def add_text_internal(soup, element):
 
 
 def _add_node(soup, node):
-  return _HANDLERS[node.type.value](soup, node.contents)
+    return _HANDLERS[node.type.value](soup, node.contents)
 
 
 def _add_next_prev_pointers(soup):
-  def _traverse(node):
-    # .findAll requires the .next pointer, which is what we're trying to add
-    # when we call this, and so we manually supply a generator to yield the
-    # nodes in DOM order.
-    yield node
-    try:
-      for child in node.contents:
-        for descendant in _traverse(child):
-          yield descendant
-    except AttributeError:
-      # Not an element.
-      return
-
-  nodes = sorted(_traverse(soup), key=lambda node: node.offset)
-  if nodes:
-    nodes[0].previous = None
-    nodes[-1].next = None
-  for i, node in enumerate(nodes[1:-1], 1):
-    nodes[i-1].next = node
-    node.previous = nodes[i-1]
+    def _traverse(node):
+        # .findAll requires the .next pointer, which is what we're trying to add
+        # when we call this, and so we manually supply a generator to yield the
+        # nodes in DOM order.
+        yield node
+        try:
+          for child in node.contents:
+            for descendant in _traverse(child):
+                yield descendant
+        except AttributeError:
+            # Not an element.
+            return
+
+    nodes = sorted(_traverse(soup), key=lambda node: node.offset)
+    if nodes:
+        nodes[0].previous_element = None
+        nodes[-1].next_element = None
+    for i, node in enumerate(nodes[1:-1], 1):
+        nodes[i-1].next_element = node
+        node.previous_element = nodes[i-1]
 
 
 def parse(text, **kwargs):
-  with gumboc.parse(text, **kwargs) as output:
-    soup = BeautifulSoup.BeautifulSoup()
-    soup.append(_add_node(soup, output.contents.root.contents))
-    _add_next_prev_pointers(soup)
-    return soup
+    with gumboc.parse(text, **kwargs) as output:
+        soup = BeautifulSoup.BeautifulSoup(features='html.parser')
+        soup.append(_add_node(soup, output.contents.root.contents))
+        _add_next_prev_pointers(soup)
+        return soup