1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
|
"""Checks that links are viable.
---
layout: post
source: SublimeLinter-annotations
source_url: http://bit.ly/16Q7H41
title: broken links
date: 2014-06-10 12:31:19
categories: writing
---
Check that links are not broken.
"""
import re
import urllib.request as urllib_request # for Python 3
from socket import error as SocketError
from proselint.tools import memoize
@memoize
def check(text):
    """Find links in ``text`` and report the ones that cannot be fetched.

    Args:
        text: The text to scan for URLs.

    Returns:
        A list of ``(start, end, "links.valid", message, None)`` tuples, one
        for every matched link that ``is_broken_link`` reports as broken.
        ``start`` and ``end`` are the offsets of the match within ``text``;
        ``message`` names the URL that was actually requested.
    """
    err = "links.valid"
    msg = "Broken link: {}"

    # Verbose-mode pattern: the line breaks and indentation between the
    # alternatives are ignored by the regex engine (re.X).
    regex = re.compile(
        r"""(?i)\b((?:https?://|www\d{0,3}[.]
        |[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+
        |(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)
        |[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019\u21a9]))""",
        re.U | re.X)

    errors = []
    for m in regex.finditer(text):
        url = m.group(0).strip()

        # The pattern is case-insensitive and also matches scheme-less links
        # ("www.example.com", "example.com/page"), so decide whether a scheme
        # is missing by looking at the *start* of the URL, ignoring case.
        # A plain substring test would double-prefix "HTTP://..." and would
        # skip "example.com/?next=http://..." (leaving it without a scheme).
        if not url.lower().startswith(("http://", "https://")):
            url = "http://" + url

        if is_broken_link(url):
            errors.append((m.start(), m.end(), err, msg.format(url), None))

    return errors
@memoize
def is_broken_link(url):
    """Determine whether ``url`` fails to load.

    A link counts as broken when the request cannot be built or completed:
    an HTTP error status (not only 404), a DNS/connection failure, a timeout,
    a malformed URL, or a malformed HTTP response.

    Args:
        url: Absolute URL, including its scheme.

    Returns:
        True if fetching ``url`` failed, False if the server answered
        successfully.
    """
    # Imported locally so the extra exception type does not require touching
    # the module's import block.
    from http.client import HTTPException

    # Without a timeout a server that accepts the connection but never
    # answers would block the caller indefinitely.
    timeout_seconds = 10

    try:
        request = urllib_request.Request(
            url, headers={'User-Agent': 'Mozilla/5.0'})
        # urlopen raises HTTPError (a URLError) for error statuses as soon as
        # the headers arrive, so the body does not need to be downloaded.
        # The context manager closes the connection instead of leaking it.
        with urllib_request.urlopen(request, timeout=timeout_seconds):
            pass
    except (urllib_request.URLError, SocketError):
        # HTTP error status, DNS/connection failure, or socket timeout.
        return True
    except (ValueError, HTTPException):
        # Malformed or non-ASCII URL, or an unparsable HTTP response; these
        # are not URLError/OSError subclasses and would otherwise propagate.
        return True
    return False
|