File: flask_api.py

package info (click to toggle)
mu-editor 1.2.0%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 13,492 kB
  • sloc: python: 33,326; makefile: 154; xml: 32; sh: 7
file content (131 lines) | stat: -rw-r--r-- 5,139 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""
The scrapy and beautifulsoup4 packages must be installed.

Usage:

scrapy runspider flask_api.py -o flask.json

"""
import scrapy
from bs4 import BeautifulSoup


# Page the spider starts crawling from: the API reference of the Flask 1.0
# documentation.
URL = "http://flask.pocoo.org/docs/1.0/api/"


class BottleSpider(scrapy.Spider):
    """
    Scrapy spider that scrapes the page at URL and yields one dictionary
    (see to_dict) for each function, class, method and attribute it finds.

    NOTE: despite the "Bottle" in the class and spider names, the page being
    scraped is the Flask API reference. The names are left unchanged so that
    anything referring to them keeps working.
    """

    name = "BottleSpider"
    start_urls = [URL]

    def parse(self, response):
        """
        Scrapes the Flask API page held in response. Yields a dictionary (as
        built by to_dict) for every function and class defined on the page
        and, for each class, for the methods, attributes and data members
        listed inside it. Elements rejected by to_dict are not yielded.
        """
        # Find all the function definitions on the page:
        for func in response.css("dl.function"):
            # The signature is the first dt, the documentation the first dd.
            func_spec = func.css("dt")[0]
            func_doc = func.css("dd")[0]
            d = self.to_dict(
                self._desc_name(func_spec),
                self._em_args(func_spec.css("em")),
                self._plain_text(func_doc.extract()),
            )
            if d:
                yield d
        # Find all the class definitions on the page:
        for classes in response.css("dl.class"):
            # Class details are always first items in dl.
            class_spec = classes.css("dt")[0]
            class_doc = classes.css("dd")[0]
            class_name = self._desc_name(class_spec)
            # Args into __init__. <em class="property"> elements are not
            # arguments, so they are left out.
            init_args = self._em_args(
                class_spec.css("em"), skip_properties=True
            )
            d = self.to_dict(
                class_name, init_args, self._class_description(class_doc)
            )
            if d:
                yield d
            # Remaining dt are methods or attributes
            for methods in classes.css("dl.method"):
                method_name = self._desc_name(methods)
                if method_name.startswith("__"):
                    # Skip this dunder method only. Stopping the loop here
                    # would also drop every method documented after it.
                    continue
                # NOTE(review): this looks for <em> in the whole dl (so also
                # inside the description), and does not filter out
                # <em class="property"> the way the class signature does.
                # Confirm whether that is intended.
                d = self.to_dict(
                    class_name + "." + method_name,
                    self._em_args(methods.css("em")),
                    self._plain_text(methods.css("dd")[0].extract()),
                )
                if d:
                    yield d
            # Attributes first, then data members; neither has arguments.
            for member_css in ("dl.attribute", "dl.data"):
                for data in classes.css(member_css):
                    d = self.to_dict(
                        class_name + "." + self._desc_name(data),
                        None,
                        self._plain_text(data.css("dd")[0].extract()),
                    )
                    if d:
                        yield d

    def _plain_text(self, html):
        """
        Returns the text of the given HTML fragment with the markup removed.
        """
        return BeautifulSoup(html, "html.parser").text

    def _desc_name(self, node):
        """
        Returns the text of the first code.descname element found in node.
        Raises IndexError if node contains no such element.
        """
        return self._plain_text(node.css("code.descname").extract()[0])

    def _em_args(self, ems, skip_properties=False):
        """
        Returns a list with the markup of each <em> selector in ems, with
        the bare <em> and </em> tags removed. If skip_properties is True,
        selectors having "property" as a class attribute are left out.
        """
        args = []
        for em in ems:
            if skip_properties:
                if "property" in em.css("::attr(class)").extract():
                    continue
            args.append(em.extract().replace("<em>", "").replace("</em>", ""))
        return args

    def _class_description(self, class_doc):
        """
        Returns the description of a class: the text of each leading
        paragraph of class_doc, up to and including the first table. Stops
        at the first nested dl, which is where the members start.
        """
        soup = BeautifulSoup(class_doc.extract(), "html.parser")
        parts = []
        for child in soup.contents[0].contents:
            if child.name == "p":
                parts.append(child.text + "\n\n")
            elif child.name == "table":
                # One row per non-blank line of the table's text.
                rows = [
                    r.strip() for r in child.text.split("\n") if r.strip()
                ]
                parts.append("\n" + "\n".join(rows))
                break
            elif child.name == "dl":
                break
        return "".join(parts)

    def to_dict(self, name, args, description):
        """
        Returns a dictionary representation of the API element, with the
        keys "name", "args" and "description". Returns None instead if name
        ends with a double underscore, so such elements can be skipped.
        """
        if name.endswith("__"):
            return None
        return {"name": name, "args": args, "description": description}