1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
|
"""
This module implements the FormRequest class which is a more convenient class
(than Request) to generate Requests based on form data.
See documentation in docs/topics/request-response.rst
"""
import six
from six.moves.urllib.parse import urljoin, urlencode
import lxml.html
from parsel.selector import create_root_node
from w3lib.html import strip_html5_whitespace
from scrapy.http.request import Request
from scrapy.utils.python import to_bytes, is_listlike
from scrapy.utils.response import get_base_url
class FormRequest(Request):
    """Request subclass that is built from form data.

    ``formdata`` (a dict or an iterable of ``(name, value)`` pairs) is
    url-encoded and placed either in the request body (POST) or appended to
    the URL query string (any other method).
    """

    # HTTP methods a ``<form method=...>`` attribute is allowed to select.
    # Any other value found on the form is replaced by GET in
    # ``from_response``, which is the fallback browsers use for invalid
    # ``method`` attribute values.
    valid_form_methods = ['GET', 'POST']

    def __init__(self, *args, **kwargs):
        """Create the request, encoding the optional ``formdata`` keyword.

        When ``formdata`` is non-empty and no ``method`` keyword was given
        (or it is ``None``), the method defaults to ``'POST'``. All other
        arguments are forwarded unchanged to ``Request.__init__``.
        """
        formdata = kwargs.pop('formdata', None)
        if formdata and kwargs.get('method') is None:
            kwargs['method'] = 'POST'

        super(FormRequest, self).__init__(*args, **kwargs)

        if formdata:
            items = formdata.items() if isinstance(formdata, dict) else formdata
            querystr = _urlencode(items, self.encoding)
            if self.method == 'POST':
                # Do not override a Content-Type the caller set explicitly.
                self.headers.setdefault(b'Content-Type', b'application/x-www-form-urlencoded')
                self._set_body(querystr)
            else:
                # Extend an existing query string with '&', otherwise start
                # one with '?'.
                self._set_url(self.url + ('&' if '?' in self.url else '?') + querystr)

    @classmethod
    def from_response(cls, response, formname=None, formid=None, formnumber=0, formdata=None,
                      clickdata=None, dont_click=False, formxpath=None, formcss=None, **kwargs):
        """Build a request pre-populated from a ``<form>`` found in ``response``.

        ``formname``, ``formid``, ``formxpath`` and ``formnumber`` are passed
        to ``_get_form`` to select the form; ``formcss``, when given, is
        translated to XPath and replaces ``formxpath``. ``formdata``,
        ``dont_click`` and ``clickdata`` are passed to ``_get_inputs`` to
        compute the submitted fields.

        Recognised extra keywords: ``encoding`` (defaults to
        ``response.encoding``), ``url`` (resolved against the form's base URL
        by ``_get_form_url``) and ``method``. Remaining keywords are forwarded
        to the class constructor.

        An explicitly passed ``method`` is used as given. When it is omitted,
        the form's own method is used, upper-cased, and replaced by ``'GET'``
        if it is not listed in ``valid_form_methods``.
        """
        kwargs.setdefault('encoding', response.encoding)

        if formcss is not None:
            from parsel.csstranslator import HTMLTranslator
            formxpath = HTMLTranslator().css_to_xpath(formcss)

        form = _get_form(response, formname, formid, formnumber, formxpath)
        formdata = _get_inputs(form, formdata, dont_click, clickdata, response)
        url = _get_form_url(form, kwargs.pop('url', None))

        if 'method' in kwargs:
            # The caller's explicit choice (including None) is respected.
            method = kwargs.pop('method')
        else:
            method = form.method
            if method is not None:
                method = method.upper()
                # e.g. method="dialog" or a typo: fall back to GET instead of
                # sending a request with a meaningless HTTP verb.
                if method not in cls.valid_form_methods:
                    method = 'GET'

        return cls(url=url, method=method, formdata=formdata, **kwargs)
def _get_form_url(form, url):
    """Return the URL a form submission should be sent to.

    An explicit ``url`` wins and is resolved against ``form.base_url``.
    Otherwise the form's ``action`` attribute is used (with HTML5 whitespace
    stripped); a form without an ``action`` submits to ``form.base_url``.
    """
    base = form.base_url
    if url is not None:
        return urljoin(base, url)

    target = form.get('action')
    if action_missing(target):
        return base
    return urljoin(base, strip_html5_whitespace(target))


def action_missing(action):
    """Tell whether the form had no ``action`` attribute at all."""
    return action is None
def _urlencode(seq, enc):
    """Url-encode ``(key, value)`` pairs using encoding ``enc``.

    A list-like value yields one ``key=value`` pair per element; any other
    value yields a single pair. Keys and values are converted to bytes with
    ``to_bytes`` before being handed to ``urlencode``.
    """
    pairs = []
    for key, raw in seq:
        candidates = raw if is_listlike(raw) else [raw]
        for item in candidates:
            pairs.append((to_bytes(key, enc), to_bytes(item, enc)))
    return urlencode(pairs, doseq=True)
def _get_form(response, formname, formid, formnumber, formxpath):
    """Find the form element to submit in ``response``.

    Selection is attempted in this order, moving on to the next criterion
    when one is ``None`` (or, for name/id, matches nothing):

    1. ``formname`` - first ``<form>`` whose ``name`` attribute equals it.
    2. ``formid`` - first ``<form>`` whose ``id`` attribute equals it.
    3. ``formxpath`` - first node matched by the expression, or its nearest
       ``<form>`` ancestor.
    4. ``formnumber`` - index into the list of all forms in the document.

    Returns the matching element, or ``None`` when nothing matched and
    ``formnumber`` is ``None``.

    Raises ``ValueError`` if the document has no ``<form>`` at all or if
    ``formxpath`` does not lead to a form, and ``IndexError`` if
    ``formnumber`` is out of range.
    """
    root = create_root_node(response.text, lxml.html.HTMLParser,
                            base_url=get_base_url(response))
    forms = root.xpath('//form')
    if not forms:
        raise ValueError("No <form> element found in %s" % response)

    # The name/id are passed as XPath variables rather than pasted into the
    # expression, so values containing quotes cannot break or alter the query.
    # '%s' formatting keeps the original stringification of non-str values.
    if formname is not None:
        f = root.xpath('//form[@name=$formname]', formname='%s' % (formname,))
        if f:
            return f[0]

    if formid is not None:
        f = root.xpath('//form[@id=$formid]', formid='%s' % (formid,))
        if f:
            return f[0]

    # Get form element from xpath; if the match is not itself a form, walk up
    # through its ancestors until one is found.
    if formxpath is not None:
        nodes = root.xpath(formxpath)
        if nodes:
            el = nodes[0]
            while el is not None:
                if el.tag == 'form':
                    return el
                el = el.getparent()
        encoded = formxpath if six.PY3 else formxpath.encode('unicode_escape')
        raise ValueError('No <form> element found with %s' % encoded)

    # If we get here, formname/formid were either not given or matched nothing.
    if formnumber is not None:
        try:
            return forms[formnumber]
        except IndexError:
            raise IndexError("Form number %d not found in %s" %
                             (formnumber, response))

    return None
def _get_inputs(form, formdata, dont_click, clickdata, response):
    """Return the list of ``(name, value)`` pairs to submit for ``form``.

    The result is made of, in order:

    * the form's own fields (textarea, select and input elements, excluding
      submit/image/reset inputs and unchecked checkboxes/radios) whose name
      is not overridden by ``formdata``;
    * the clicked control, unless ``dont_click`` is true, it has no name, or
      its name is overridden by ``formdata``;
    * every ``formdata`` pair whose value is not ``None``.

    ``formdata`` may be a mapping or an iterable of ``(name, value)`` pairs.
    Pairs given as an iterable are all kept, so a name may be repeated to
    submit several values. A ``None`` value only suppresses the form's own
    field of that name. ``response`` is accepted but not used here.

    Raises ``ValueError`` if ``formdata`` is neither a mapping nor an
    iterable of pairs.
    """
    try:
        if hasattr(formdata, 'keys'):
            pairs = list(dict(formdata).items())
        else:
            # Materialise once: keeps repeated keys and supports one-shot
            # iterables such as generators.
            pairs = [(k, v) for k, v in (formdata or ())]
        # Names supplied by the caller replace the form's own fields.
        overridden = set(k for k, _ in pairs)
    except (ValueError, TypeError):
        raise ValueError('formdata should be a dict or iterable of tuples')

    inputs = form.xpath('descendant::textarea'
                        '|descendant::select'
                        '|descendant::input[not(@type) or @type['
                        ' not(re:test(., "^(?:submit|image|reset)$", "i"))'
                        ' and (../@checked or'
                        '  not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
                        namespaces={
                            "re": "http://exslt.org/regular-expressions"})
    values = [(k, u'' if v is None else v)
              for k, v in (_value(e) for e in inputs)
              if k and k not in overridden]

    if not dont_click:
        clickable = _get_clickable(clickdata, form)
        if clickable and clickable[0] is not None and clickable[0] not in overridden:
            values.append(clickable)

    values.extend((k, v) for k, v in pairs if v is not None)
    return values
def _value(ele):
    """Return the ``(name, value)`` pair contributed by a form field.

    ``<select>`` elements are delegated to ``_select_value``; every other
    element reports its ``name`` and ``value`` attributes unchanged.
    """
    name, value = ele.name, ele.value
    if ele.tag != 'select':
        return name, value
    return _select_value(ele, name, value)
def _select_value(ele, n, v):
    """Return the ``(name, value)`` pair for a ``<select>`` element.

    ``n`` and ``v`` are the name and value already read from ``ele``.
    """
    is_multiple = ele.multiple

    if v is None:
        if is_multiple:
            return n, v
        # Match browser behaviour for a simple select with no option
        # selected (first option is submitted) and for a select without
        # any options (nothing is submitted).
        options = ele.value_options
        if options:
            return n, options[0]
        return None, None

    if is_multiple:
        # Workaround for a bug in lxml fixed in 2.3.1:
        # https://github.com/lxml/lxml/commit/57f49eed82068a20da3db8f1b18ae00c1bab8b12#L1L1139
        chosen = []
        for option in ele.xpath('.//option[@selected]'):
            text = option.get('value') or option.text or u''
            chosen.append(text.strip())
        v = chosen

    return n, v
def _get_clickable(clickdata, form):
    """Return the ``(name, value)`` pair of the control to "click".

    Returns ``None`` when the form has no clickable element (submit inputs,
    submit buttons, or buttons without a ``type``). Without ``clickdata``
    the first clickable element is used. With ``clickdata`` (a dict), its
    ``nr`` entry, if present and in range, indexes ``form.inputs``;
    otherwise every ``clickdata`` item is treated as an attribute that the
    element must carry with exactly that value.

    A missing ``value`` attribute is reported as ``''``.

    Raises ``ValueError`` when ``clickdata`` matches no element or more than
    one element.
    """
    clickables = list(form.xpath(
        'descendant::*[(self::input or self::button)'
        ' and re:test(@type, "^submit$", "i")]'
        '|descendant::button[not(@type)]',
        namespaces={"re": "http://exslt.org/regular-expressions"}))
    if not clickables:
        return None

    # If we don't have clickdata, we just use the first clickable element
    if clickdata is None:
        el = clickables[0]
        return (el.get('name'), el.get('value') or '')

    # If clickdata is given, we compare it to the clickable elements to find a
    # match. We first look to see if the number is specified in clickdata,
    # because that uniquely identifies the element
    nr = clickdata.get('nr', None)
    if nr is not None:
        try:
            el = list(form.inputs)[nr]
        except IndexError:
            pass
        else:
            return (el.get('name'), el.get('value') or '')

    # We didn't find it, so now we build an XPath expression out of the other
    # arguments. Attribute values are bound as XPath variables instead of
    # being pasted between quotes, so a value containing '"' cannot break the
    # expression. u'%s' keeps the original stringification of the values.
    predicates = []
    variables = {}
    for index, (attr, value) in enumerate(clickdata.items()):
        var_name = 'v%d' % index
        predicates.append(u'[@%s=$%s]' % (attr, var_name))
        variables[var_name] = u'%s' % (value,)
    xpath = u'.//*' + u''.join(predicates)

    el = form.xpath(xpath, **variables)
    if len(el) == 1:
        return (el[0].get('name'), el[0].get('value') or '')
    elif len(el) > 1:
        raise ValueError("Multiple elements found (%r) matching the criteria "
                         "in clickdata: %r" % (el, clickdata))
    else:
        raise ValueError('No clickable element matching clickdata: %r' % (clickdata,))
|