File: link_preview.py

package info (click to toggle)
odoo 18.0.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 878,716 kB
  • sloc: javascript: 927,937; python: 685,670; xml: 388,524; sh: 1,033; sql: 415; makefile: 26
file content (90 lines) | stat: -rw-r--r-- 3,734 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# -*- coding: utf-8 -*-
# Part of Odoo. See LICENSE file for full copyright and licensing details.

from lxml import html
import requests
from urllib3.exceptions import LocationParseError


def get_link_preview_from_url(url, request_session=None):
    """
    Get the Open Graph properties of an url. (https://ogp.me/)
    If the url leads directly to an image mimetype, return
    the url as preview image else retrieve the properties from
    the html page.

    Using a stream request to prevent loading the whole page
    as those properties are declared in the <head> tag.

    The request session is optional as in some cases using
    a session could be beneficial performance wise
    (e.g. a lot of url could have the same domain).

    :param url: the url to preview
    :param request_session: optional object exposing a ``get`` method with
        the same signature as ``requests.get``; ``requests.get`` is used
        when it is not given
    :return: ``False`` when the request fails, the response is not ok,
        has no Content-Type or a Content-Type that is neither an image
        nor html. For an image, a dict with the keys ``image_mimetype``,
        ``og_image`` and ``source_url``. For an html page, the result of
        ``get_link_preview_from_html``.
    """
    # Some websites are blocking non browser user agent.
    user_agent = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'}
    http_get = request_session.get if request_session else requests.get
    try:
        response = http_get(url, timeout=3, headers=user_agent, allow_redirects=True, stream=True)
    except (requests.exceptions.RequestException, LocationParseError):
        return False
    # The body of a streamed response is usually not read until the end
    # here (never for images, only up to </head> for html), so the
    # response has to be closed explicitly to release the connection.
    try:
        if not response.ok:
            return False
        content_type = response.headers.get('Content-Type')
        if not content_type:
            return False
        # Content-Type header can return a charset, but we just need the
        # mimetype (eg: image/jpeg;charset=ISO-8859-1). Mimetypes are case
        # insensitive, so normalize before comparing.
        mimetype = content_type.split(';')[0].strip().lower()
        if mimetype.startswith('image/'):
            return {
                'image_mimetype': mimetype,
                'og_image': url,  # If the url mimetype is already an image type, set url as preview image
                'source_url': url,
            }
        if mimetype.startswith('text/html'):
            return get_link_preview_from_html(url, response)
        return False
    finally:
        response.close()

def get_link_preview_from_html(url, response):
    """
    Retrieve the Open Graph properties from the html page. (https://ogp.me/)
    Load the page with chunks of 8kb and stop as soon as the closing
    </head> tag is found, to prevent loading the whole html when we only
    need the <head> tag content. If the tag never shows up, the whole
    body is read and parsed.
    Fallback on the <title> tag if the html doesn't have
    any Open Graph title property.

    :param url: url of the page, returned as is under ``source_url``
    :param response: streamed response whose ``iter_content`` yields the
        body of the page as bytes
    :return: ``False`` when the body is empty or when the page has neither
        an Open Graph title nor a <title> tag, else a dict with the keys
        ``og_description``, ``og_image``, ``og_mimetype``, ``og_title``,
        ``og_type``, ``og_site_name`` (``None`` for a missing property)
        and ``source_url``
    """
    head_end = b'</head>'
    # bytearray: appending a chunk does not copy what was already read
    content = bytearray()
    for chunk in response.iter_content(chunk_size=8192):
        # Everything before the previous end was already scanned, except
        # for the last bytes that may hold the beginning of a tag split
        # across two chunks. Scanning from there works whatever the size
        # of the chunk actually received.
        scan_from = max(len(content) - len(head_end) + 1, 0)
        content += chunk
        # Tag names are case insensitive in html (</HEAD> is valid)
        pos = content[scan_from:].lower().find(head_end)
        # Stop reading once all the <head> data is found
        if pos != -1:
            del content[scan_from + pos + len(head_end):]
            break

    if not content:
        return False
    tree = html.fromstring(bytes(content))

    def first_meta_content(og_property):
        # Content of the first <meta> tag declaring the property, None if there is none
        values = tree.xpath('//meta[@property="%s"]/@content' % og_property)
        return values[0] if values else None

    og_title = first_meta_content('og:title')
    if og_title is None:
        # Fallback on the <title> tag if it exists
        title_tag = tree.find('.//title')
        if title_tag is None:
            return False
        og_title = title_tag.text
    return {
        'og_description': first_meta_content('og:description'),
        'og_image': first_meta_content('og:image'),
        'og_mimetype': first_meta_content('og:image:type'),
        'og_title': og_title,
        'og_type': first_meta_content('og:type'),
        'og_site_name': first_meta_content('og:site_name'),
        'source_url': url,
    }