File: www.gov.uk

package info
python-protego 0.6.0+dfsg-1
  • area: main
  • in suites: forky, sid
  • size: 30,048 kB
  • sloc: python: 1,492; perl: 190; cpp: 33; sh: 4; makefile: 3
file content (22 lines) | stat: -rw-r--r-- 848 bytes
User-agent: *
Disallow: /*/print$
# Don't allow indexing of user needs pages
Disallow: /info/*
Sitemap: https://www.gov.uk/sitemap.xml

# https://ahrefs.com/robot/ crawls the site frequently
User-agent: AhrefsBot
Crawl-delay: 10

# https://www.deepcrawl.com/bot/ makes lots of requests. Ideally
# we'd slow it down rather than block it, but it doesn't mention
# whether or not it supports crawl-delay.
User-agent: deepcrawl
Disallow: /

# Complaints of 429 'Too many requests' seem to be coming from SharePoint servers
# (https://social.msdn.microsoft.com/Forums/en-US/3ea268ed-58a6-4166-ab40-d3f4fc55fef4)
# The robot doesn't recognise directives addressed to its User-Agent string; see the MS support article:
# https://support.microsoft.com/en-us/help/3019711/the-sharepoint-server-crawler-ignores-directives-in-robots-txt
User-agent: MS Search 6.0 Robot
Disallow: /
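
This file ships as test data with the python-protego robots.txt parser, so a natural way to read it is through that library. The sketch below is illustrative only: it feeds the first three records into Protego's documented API (Protego.parse, can_fetch, crawl_delay, sitemaps). The bot name "examplebot" and the specific gov.uk paths are invented for the example, and the values in the comments follow standard wildcard/end-anchor robots.txt semantics rather than anything stated in the file itself.

# Minimal sketch (assumptions noted above): parse the first three records of
# this robots.txt with python-protego and query them.
from protego import Protego

robotstxt = """\
User-agent: *
Disallow: /*/print$
Disallow: /info/*
Sitemap: https://www.gov.uk/sitemap.xml

User-agent: AhrefsBot
Crawl-delay: 10

User-agent: deepcrawl
Disallow: /
"""

rp = Protego.parse(robotstxt)

# The trailing $ anchors the pattern, so only URLs ending in /print are blocked,
# while /info/* blocks everything under /info/.
print(rp.can_fetch("https://www.gov.uk/some-page/print", "examplebot"))  # expected: False
print(rp.can_fetch("https://www.gov.uk/some-page", "examplebot"))        # expected: True
print(rp.can_fetch("https://www.gov.uk/info/some-page", "examplebot"))   # expected: False

# Per-agent records: AhrefsBot gets a crawl delay, deepcrawl is blocked outright.
print(rp.crawl_delay("AhrefsBot"))                       # expected: 10.0
print(rp.can_fetch("https://www.gov.uk/", "deepcrawl"))  # expected: False

# Sitemap lines are global, not tied to a user agent.
print(list(rp.sitemaps))  # expected: ['https://www.gov.uk/sitemap.xml']

Note that Protego's can_fetch takes the URL before the user agent (the reverse of urllib.robotparser's argument order), and Crawl-delay is a non-standard extension: Protego parses it, but crawlers are free to ignore it, which is why the deepcrawl record above falls back to a blanket Disallow.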