#!/usr/bin/python3
# encoding=utf-8
#
# Copyright © 2015 Alexandre Detiste <alexandre@detiste.be>
# SPDX-License-Identifier: GPL-2.0-or-later

# A simple spider that locates Wikipedia URLs on per-engine wiki pages
# and records them in the matching data/<game>.yaml file.
# Games that already have a Wikipedia URL are not rescanned.
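#
# Usage: an optional game shortname can be given as the first command-line
# argument to restrict the scan; with no argument every known game is
# considered ('*').  The YAML files are opened through the relative path
# data/<game>.yaml, so the script is presumably meant to be run from the
# top of the source tree.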

import sys
import time
import urllib.error
import urllib.request

from bs4 import BeautifulSoup

from game_data_packager.game import load_games
from game_data_packager.util import AGENT

# games whose wiki page should never be scanned
denylist = ['compet-n', 'hacx']

# optional first argument: shortname of the game to (re)scan;
# default is to consider every known game
try:
    todo = sys.argv[1]
except IndexError:
    todo = '*'


def is_wikipedia(href: str) -> bool:
    """href filter for BeautifulSoup's find_all(); also rejects tags
    whose href attribute is missing (href is then None)."""
    return bool(href) and "wikipedia" in href


def add_wikipedia_to_game_yaml(game: str, url: str) -> None:
    print('Add %s to game %s' % (url, game))
    filename = f"data/{game}.yaml"
    with open(filename) as fh:
        data = fh.read()

    # Insert the new "wikipedia:" line right after the first of these
    # tags that is already present, so related metadata stays grouped
    # together; if none of them is found, prepend it to the file.
    tags = ['wiki', 'genre', 'copyright', 'longname']
    idx = 0
    for tag in tags:
        substring = f"\n{tag}: "
        if substring in data:
            idx1 = data.index(substring) + 1    # start of the tag's line
            idx2 = data[idx1:].index('\n') + 1  # length of that line
            idx = idx1 + idx2                   # just past its newline
            break

    data = data[:idx] + f"wikipedia: {url}\n" + data[idx:]
    with open(filename, "w") as fh:
        fh.write(data)
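
# For instance, with a hypothetical data/foo.yaml starting with
#
#   longname: Foo
#   copyright: © 1998 Example Corp
#
# neither 'wiki' nor 'genre' is present, so the 'copyright' line wins
# and the file would then start with
#
#   longname: Foo
#   copyright: © 1998 Example Corp
#   wikipedia: https://en.wikipedia.org/wiki/Foo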


for shortname, game in load_games(game=todo, use_vfs=False).items():
    if game.wikipedia is not None:
        continue
    if shortname in denylist:
        continue
    if not game.wiki:
        continue

    print('processing %s ...' % shortname)
    url = game.wikibase + game.wiki
    try:
        request = urllib.request.Request(url, headers={'User-Agent': AGENT})
        html = urllib.request.urlopen(request)
    except urllib.error.HTTPError as e:
        print('%s : %s' % (url, e))
        continue

    soup = BeautifulSoup(html, 'lxml')
    # every Wikipedia link found on the page gets recorded; a page that
    # links to Wikipedia more than once will add several lines
    for tag in soup.find_all(href=is_wikipedia):
        print('  ' + tag['href'])
        add_wikipedia_to_game_yaml(shortname, tag['href'])

    # break  # uncomment to stop after the first game when testing
    time.sleep(1)  # don't hammer the wiki server
