File: adafruit_api.py

package info (click to toggle)
mu-editor 1.0.3%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 9,440 kB
  • sloc: python: 16,321; makefile: 129; xml: 29; sh: 7
file content (138 lines) | stat: -rw-r--r-- 5,706 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
The scrapy and beautifulsoup4 (bs4) packages must be installed.

Usage:

scrapy runspider adafruit_api.py -o adafruit.json

"""
import scrapy
from bs4 import BeautifulSoup


# Index page of the CircuitPython 3.x shared-bindings documentation; used as
# the spider's only start URL.
URL = 'https://circuitpython.readthedocs.io/en/3.x/shared-bindings/index.html'


class AdafruitSpider(scrapy.Spider):
    """
    Spider that walks the CircuitPython shared-bindings documentation and
    yields one dictionary (name, args, description) for each documented
    function, class, method, attribute and data member it finds.
    """
    name = 'AdafruitSpider'
    start_urls = [URL, ]

    def parse(self, response):
        """
        Scrapes the list of modules associated with CircuitPython. Causes
        scrapy to follow the links to the module docs and uses a different
        parser to extract the API information contained therein.
        """
        for next_page in response.css('div.toctree-wrapper li a'):
            yield response.follow(next_page, self.parse_api)

    def to_dict(self, name, args, description):
        """
        Returns a dictionary representation of the API element if valid, else
        returns None.

        An element is considered invalid when its name ends with a double
        underscore (e.g. special methods such as __init__).
        """
        if name.endswith('__'):
            return None
        return {
            'name': name,
            'args': args,
            'description': description,
        }

    @staticmethod
    def _first_text(matches):
        """
        Returns the text content (all markup removed) of the first element in
        the given selector list. Raises IndexError if the list is empty.
        """
        return BeautifulSoup(matches[0].extract(), 'html.parser').text

    def _qualified_name(self, spec):
        """
        Returns the dotted name found in a signature element: the text of
        code.descclassname immediately followed by that of code.descname.
        """
        prefix = self._first_text(spec.css('code.descclassname'))
        return prefix + self._first_text(spec.css('code.descname'))

    @staticmethod
    def _em_args(spec, skip_properties=False):
        """
        Returns the argument strings found in the <em> elements below spec,
        with the surrounding <em> tags removed. If skip_properties is True,
        <em> elements whose class attribute is "property" are ignored.
        """
        args = []
        for em in spec.css('em'):
            if skip_properties:
                if 'property' in em.css('::attr(class)').extract():
                    continue
            markup = em.extract()
            args.append(markup.replace('<em>', '').replace('</em>', ''))
        return args

    @staticmethod
    def _class_description(class_doc):
        """
        Returns the description of a class: the leading paragraphs of its
        <dd> element, up to and including the field-list table. Scanning
        stops at that table or at the first nested <dl>, whichever comes
        first.
        """
        soup = BeautifulSoup(class_doc.extract(), 'html.parser')
        description = ''
        # soup.contents[0] is the <dd> element itself; walk its children.
        for child in soup.contents[0].contents:
            if child.name == 'p':
                description += child.text + '\n\n'
            elif child.name == 'table':
                # One row per non-blank line of the table's text.
                lines = child.text.split('\n')
                rows = [r.strip() for r in lines if r.strip()]
                description += '\n'
                description += '\n'.join(rows)
                break
            elif child.name == 'dl':
                break
        return description

    def _parse_members(self, classes, query, class_name):
        """
        Yields a dictionary for each element below classes that matches the
        CSS query (used for attributes and data). Names are prefixed with
        class_name and args is always None.
        """
        for data in classes.css(query):
            short_name = self._first_text(data.css('code.descname'))
            description = self._first_text(data.css('dd'))
            d = self.to_dict(class_name + '.' + short_name, None, description)
            if d:
                yield d

    def parse_api(self, response):
        """
        Parses a *potential* API documentation page.

        Yields the dictionaries produced by to_dict for every function on the
        page and, for every class, for the class itself followed by its
        methods, attributes and data members. Pages without any dl.function
        or dl.class elements yield nothing.
        """
        # Find all the function definitions on the page:
        for func in response.css('dl.function'):
            # The signature is always the first dt, the docs the first dd.
            func_spec = func.css('dt')[0]
            func_name = self._qualified_name(func_spec)
            args = self._em_args(func_spec)
            description = self._first_text(func.css('dd'))
            d = self.to_dict(func_name, args, description)
            if d:
                yield d
        # Find all the class definitions on the page:
        for classes in response.css('dl.class'):
            # Class details are always first items in dl.
            class_spec = classes.css('dt')[0]
            class_doc = classes.css('dd')[0]
            class_name = self._qualified_name(class_spec)
            # Args into __init__ (the "class" keyword is an em.property).
            init_args = self._em_args(class_spec, skip_properties=True)
            description = self._class_description(class_doc)
            d = self.to_dict(class_name, init_args, description)
            if d:
                yield d
            # Remaining dt are methods or attributes
            for methods in classes.css('dl.method'):
                method_name = self._first_text(methods.css('code.descname'))
                if method_name.startswith('__'):
                    # Skip only this special method; the methods documented
                    # after it must still be reported.
                    continue
                method_name = class_name + '.' + method_name
                # NOTE(review): the whole <dl> is searched here, so <em>
                # elements inside the method's description are collected
                # too, unlike for functions and classes - confirm intent.
                method_args = self._em_args(methods)
                description = self._first_text(methods.css('dd'))
                d = self.to_dict(method_name, method_args, description)
                if d:
                    yield d
            yield from self._parse_members(classes, 'dl.attribute',
                                           class_name)
            yield from self._parse_members(classes, 'dl.data', class_name)