File: broken.py

package info (click to toggle)
python3-proselint 0.14.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,220 kB
  • sloc: python: 7,173; sh: 6; makefile: 3
file content (59 lines) | stat: -rw-r--r-- 1,480 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
"""Checks that links are viable.

---
layout:     post
source:     SublimeLinter-annotations
source_url: http://bit.ly/16Q7H41
title:      broken links
date:       2014-06-10 12:31:19
categories: writing
---

Check that links are not broken.

"""
import re
import urllib.request as urllib_request  # for Python 3
from socket import error as SocketError

from proselint.tools import memoize


@memoize
def check(text):
    """Report every link in the text that cannot be fetched.

    Each URL-like substring matched by the pattern below is given an
    ``http://`` prefix when it does not already start with an HTTP(S)
    scheme, and is then passed to ``is_broken_link``.

    Args:
        text: The text to scan for links.

    Returns:
        A list of ``(start, end, err, msg, None)`` tuples, one per broken
        link, where ``start`` and ``end`` are the offsets of the match in
        ``text`` and ``msg`` names the URL that was tried.
    """
    err = "links.valid"
    msg = "Broken link: {}"

    regex = re.compile(
        r"""(?i)\b((?:https?://|www\d{0,3}[.]
        |[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+
        |(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)
        |[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019\u21a9]))""",
        re.U | re.X)

    errors = []
    for m in regex.finditer(text):
        url = m.group(0).strip()

        # The pattern is case-insensitive and also matches bare hosts, so
        # look at how the URL *starts*, ignoring case. A substring test
        # would double-prefix "HTTP://..." and would leave a bare host
        # unprefixed whenever "http://" appears later in its query string.
        if not url.lower().startswith(("http://", "https://")):
            url = "http://" + url

        if is_broken_link(url):
            errors.append((m.start(), m.end(), err, msg.format(url), None))

    return errors


@memoize
def is_broken_link(url):
    """Determine whether the link cannot be opened.

    Sends a GET request with a browser-like ``User-Agent`` header and
    treats any failure to obtain a response as a broken link.

    Args:
        url: The absolute URL to probe.

    Returns:
        ``False`` if the URL could be opened, ``True`` if the request
        failed (HTTP error status, network or socket error, timeout, or a
        URL that cannot be parsed or requested).
    """
    # Local import: only this function needs the HTTP protocol error base.
    from http.client import HTTPException

    # Bound the wait so a single unresponsive host cannot stall the check.
    timeout_seconds = 10

    try:
        request = urllib_request.Request(
            url, headers={'User-Agent': 'Mozilla/5.0'})
        # Opening the URL is enough to learn whether it resolves; the body
        # is not needed, and the context manager closes the connection.
        with urllib_request.urlopen(request, timeout=timeout_seconds):
            return False
    except (urllib_request.URLError, SocketError):
        return True
    except (HTTPException, ValueError):
        # Malformed URLs (e.g. a non-numeric port, an unknown scheme, or a
        # host that cannot be IDNA-encoded) and protocol-level failures
        # are not OSError subclasses; report them as broken instead of
        # letting them abort the whole check.
        return True