# coding: utf-8

# Copyright 2014-2017 Álvaro Justen <https://github.com/turicas/rows/>

#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU Lesser General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.

#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU Lesser General Public License for more details.

#    You should have received a copy of the GNU Lesser General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import unicode_literals

import tempfile
import unittest
from collections import OrderedDict
from io import BytesIO
from textwrap import dedent

import mock

import rows
import rows.plugins.plugin_html
import tests.utils as utils

# TODO: test unescape
# TODO: test colspan
# TODO: test rowspan
# TODO: test more nested tables
#       URL = 'https://finance.yahoo.com/q;_ylt=At7WXTIEGzyrIHemoSMI7I.iuYdG;_ylu=X3oDMTBxdGVyNzJxBHNlYwNVSCAzIERlc2t0b3AgU2VhcmNoIDEx;_ylg=X3oDMTByaDM4cG9kBGxhbmcDZW4tVVMEcHQDMgR0ZXN0AzUxMjAxMw--;_ylv=3?s=GOOG&uhb=uhb2&type=2button&fr=uh3_finance_web_gs'
#       URL_2 = 'http://www.rio.rj.gov.br/dlstatic/10112/2147539/DLFE-237833.htm/paginanova2.0.0.0._B.htm'


def cleanup_lines(lines):
    """Return the non-blank lines of `lines`, each stripped of whitespace.

    `lines` is a single string; it is split on "\\n" and every piece that is
    empty after stripping is dropped from the result.
    """
    cleaned = []
    for raw_line in lines.strip().split("\n"):
        trimmed = raw_line.strip()
        if trimmed:
            cleaned.append(trimmed)
    return cleaned


class PluginHtmlTestCase(utils.RowsTestMixIn, unittest.TestCase):
    # Import/export tests for the HTML plugin (`rows.import_from_html` and
    # `rows.export_to_html`).
    #
    # Temporary files are registered in `self.files_to_delete`; that list (and
    # its clean-up) is presumably provided by `utils.RowsTestMixIn` -- the
    # mix-in is not visible from this file, verify there.

    # NOTE(review): these attributes look like configuration read by the
    # mix-in helpers (e.g. `assert_table_equal`) -- confirm in `tests.utils`.
    plugin_name = "html"
    file_extension = "html"
    filename = "tests/data/all-field-types.html"
    encoding = "utf-8"
    assert_meta_encoding = True

    def test_imports(self):
        # The top-level `rows` shortcuts must be the plugin's own functions.
        self.assertIs(rows.import_from_html, rows.plugins.plugin_html.import_from_html)
        self.assertIs(rows.export_to_html, rows.plugins.plugin_html.export_to_html)

    def test_import_from_html_filename(self):
        table = rows.import_from_html(self.filename, encoding=self.encoding)
        self.assert_table_equal(table, utils.table)

        expected_meta = {
            "imported_from": "html",
            "filename": self.filename,
            "encoding": self.encoding,
        }
        self.assertEqual(table.meta, expected_meta)

    def test_import_from_html_fobj(self):
        # TODO: may test with codecs.open passing an encoding
        with open(self.filename, mode="rb") as fobj:
            table = rows.import_from_html(fobj, encoding=self.encoding)
        self.assert_table_equal(table, utils.table)

        expected_meta = {
            "imported_from": "html",
            "filename": self.filename,
            "encoding": self.encoding,
        }
        self.assertEqual(table.meta, expected_meta)

    @mock.patch("rows.plugins.plugin_html.create_table")
    def test_import_from_html_uses_create_table(self, mocked_create_table):
        # Extra keyword arguments must be forwarded to `create_table`, together
        # with a `meta` dict describing the import.
        mocked_create_table.return_value = 42
        kwargs = {"some_key": 123, "other": 456}
        result = rows.import_from_html(self.filename, encoding="iso-8859-1", **kwargs)
        self.assertTrue(mocked_create_table.called)
        self.assertEqual(mocked_create_table.call_count, 1)
        self.assertEqual(result, 42)

        call = mocked_create_table.call_args
        kwargs["meta"] = {
            "imported_from": "html",
            "filename": self.filename,
            "encoding": "iso-8859-1",
        }
        self.assertEqual(call[1], kwargs)

    def test_export_to_html_filename(self):
        # TODO: may test file contents
        temp = tempfile.NamedTemporaryFile(delete=False)
        # Only the path is needed: close the handle so it is not leaked
        temp.close()
        self.files_to_delete.append(temp.name)
        rows.export_to_html(utils.table, temp.name)

        table = rows.import_from_html(temp.name)
        self.assert_table_equal(table, utils.table)

    def test_export_to_html_fobj(self):
        # TODO: may test with codecs.open passing an encoding
        # TODO: may test file contents
        # The file is re-read by name while the handle is still open (as the
        # original test did); the `with` block only guarantees it gets closed.
        with tempfile.NamedTemporaryFile(delete=False, mode="wb") as temp:
            self.files_to_delete.append(temp.name)
            rows.export_to_html(utils.table, temp.file)
            table = rows.import_from_html(temp.name)

        self.assert_table_equal(table, utils.table)

    @mock.patch("rows.plugins.plugin_html.serialize")
    def test_export_to_html_uses_serialize(self, mocked_serialize):
        temp = tempfile.NamedTemporaryFile(delete=False)
        # Only the path is needed: close the handle so it is not leaked
        temp.close()
        self.files_to_delete.append(temp.name)
        kwargs = {"test": 123, "parameter": 3.14}
        mocked_serialize.return_value = iter([utils.table.fields.keys()])

        rows.export_to_html(utils.table, temp.name, encoding="utf-8", **kwargs)
        self.assertTrue(mocked_serialize.called)
        self.assertEqual(mocked_serialize.call_count, 1)

        # `encoding` is consumed by the exporter; everything else is forwarded
        call = mocked_serialize.call_args
        self.assertEqual(call[0], (utils.table,))
        self.assertEqual(call[1], kwargs)

    @mock.patch("rows.plugins.plugin_html.export_data")
    def test_export_to_html_uses_export_data(self, mocked_export_data):
        temp = tempfile.NamedTemporaryFile(delete=False)
        # Only the path is needed: close the handle so it is not leaked
        temp.close()
        self.files_to_delete.append(temp.name)
        kwargs = {"test": 123, "parameter": 3.14, "encoding": "utf-8"}
        mocked_export_data.return_value = 42

        result = rows.export_to_html(utils.table, temp.name, **kwargs)
        self.assertTrue(mocked_export_data.called)
        self.assertEqual(mocked_export_data.call_count, 1)
        self.assertEqual(result, 42)

        call = mocked_export_data.call_args
        self.assertEqual(call[0][0], temp.name)
        self.assertEqual(call[1], {"mode": "wb"})

    def test_export_to_html_none(self):
        # TODO: may test with codecs.open passing an encoding
        # TODO: may test file contents
        # Exporting without a destination must return the same bytes that are
        # written when a file object is given.
        with tempfile.NamedTemporaryFile(delete=False, mode="rb+") as temp:
            self.files_to_delete.append(temp.name)
            result = rows.export_to_html(utils.table)
            rows.export_to_html(utils.table, temp.file)
            temp.file.seek(0)
            written = temp.file.read()

        self.assertEqual(written, result)

    def test_table_index(self):
        filename = "tests/data/simple-table.html"
        # Both imports read from the same handle (rewound in between)
        with open(filename, mode="rb") as fobj:
            table_1 = rows.import_from_html(fobj)
            fobj.seek(0)
            table_2 = rows.import_from_html(fobj, index=1)

        self.assertEqual(set(table_1.fields.keys()), {"t0r0c0", "t0r0c1"})
        self.assertEqual(len(table_1), 1)
        self.assertEqual(table_1[0].t0r0c0, "t0r1c0")
        self.assertEqual(table_1[0].t0r0c1, "t0r1c1")

        self.assertEqual(set(table_2.fields.keys()), {"t1r0c0", "t1r0c1"})
        self.assertEqual(len(table_2), 2)
        self.assertEqual(table_2[0].t1r0c0, "t1r1c0")
        self.assertEqual(table_2[0].t1r0c1, "t1r1c1")
        self.assertEqual(table_2[1].t1r0c0, "t1r2c0")
        self.assertEqual(table_2[1].t1r0c1, "t1r2c1")

    def test_table_thead_tbody(self):
        filename = "tests/data/table-thead-tbody.html"
        with open(filename, mode="rb") as fobj:
            table = rows.import_from_html(fobj)

        self.assertEqual(set(table.fields.keys()), {"t1", "t2"})
        self.assertEqual(len(table), 2)
        self.assertEqual(table[0].t1, "456")
        self.assertEqual(table[0].t2, "123")
        self.assertEqual(table[1].t1, "qqq")
        self.assertEqual(table[1].t2, "aaa")

    def test_nested_tables_outer(self):
        filename = "tests/data/nested-table.html"
        with open(filename, mode="rb") as fobj:
            table = rows.import_from_html(fobj)

        self.assertEqual(set(table.fields.keys()), {"t00r0c0", "t00r0c1", "t00r0c2"})
        self.assertEqual(len(table), 3)

        self.assertEqual(table[0].t00r0c0, "t0,0r1c0")
        self.assertEqual(table[0].t00r0c1, "t0,0r1c1")
        self.assertEqual(table[0].t00r0c2, "t0,0r1c2")

        # if there are nested tables, the inner ones will be represented as
        # strings (each <td>...</td> element will return only one string, even
        # if there is a <table> inside it)
        inner_table = (
            "t0,1r0c0 t0,1r0c1 t0,1r1c0 t0,1r1c1 t0,1r2c0 "
            "t0,1r2c1 t0,2r0c0 t0,2r0c1 t0,2r1c0 t0,2r1c1 "
            "t0,1r3c1 t0,1r4c0 t0,1r4c1 t0,1r5c0 t0,1r5c1"
        )
        self.assertEqual(table[1].t00r0c0, "t0,0r2c0")
        self.assertEqual(table[1].t00r0c1, inner_table)
        self.assertEqual(table[1].t00r0c2, "t0,0r2c2")

        self.assertEqual(table[2].t00r0c0, "t0,0r3c0")
        self.assertEqual(table[2].t00r0c1, "t0,0r3c1")
        self.assertEqual(table[2].t00r0c2, "t0,0r3c2")

    def test_nested_tables_first_inner(self):
        filename = "tests/data/nested-table.html"
        with open(filename, mode="rb") as fobj:
            table = rows.import_from_html(fobj, index=1)

        self.assertEqual(set(table.fields.keys()), {"t01r0c0", "t01r0c1"})
        self.assertEqual(len(table), 5)

        self.assertEqual(table[0].t01r0c0, "t0,1r1c0")
        self.assertEqual(table[0].t01r0c1, "t0,1r1c1")

        self.assertEqual(table[1].t01r0c0, "t0,1r2c0")
        self.assertEqual(table[1].t01r0c1, "t0,1r2c1")

        # The innermost table is flattened into a single string
        inner_table = "t0,2r0c0 t0,2r0c1 t0,2r1c0 t0,2r1c1"
        self.assertEqual(table[2].t01r0c0, inner_table)
        self.assertEqual(table[2].t01r0c1, "t0,1r3c1")

        self.assertEqual(table[3].t01r0c0, "t0,1r4c0")
        self.assertEqual(table[3].t01r0c1, "t0,1r4c1")

        self.assertEqual(table[4].t01r0c0, "t0,1r5c0")
        self.assertEqual(table[4].t01r0c1, "t0,1r5c1")

    def test_nested_tables_second_inner(self):
        filename = "tests/data/nested-table.html"
        with open(filename, mode="rb") as fobj:
            table = rows.import_from_html(fobj, index=2)

        self.assertEqual(set(table.fields.keys()), {"t02r0c0", "t02r0c1"})
        self.assertEqual(len(table), 1)

        self.assertEqual(table[0].t02r0c0, "t0,2r1c0")
        self.assertEqual(table[0].t02r0c1, "t0,2r1c1")

    def test_preserve_html(self):
        filename = "tests/data/nested-table.html"
        with open(filename, mode="rb") as fobj:
            table = rows.import_from_html(fobj, preserve_html=True)

        # TODO: test without passing encoding
        expected_data = [
            "<table>",
            "<tr>",
            "<td> t0,1r0c0 </td>",
            "<td> t0,1r0c1 </td>",
            "</tr>",
            "<tr>",
            "<td> t0,1r1c0 </td>",
            "<td> t0,1r1c1 </td>",
            "</tr>",
            "<tr>",
            "<td> t0,1r2c0 </td>",
            "<td> t0,1r2c1 </td>",
            "</tr>",
            "<tr>",
            "<td>",
            "<table>",
            "<tr>",
            "<td> t0,2r0c0 </td>",
            "<td> t0,2r0c1 </td>",
            "</tr>",
            "<tr>",
            "<td> t0,2r1c0 </td>",
            "<td> t0,2r1c1 </td>",
            "</tr>",
            "</table>",
            "</td>",
            "<td> t0,1r3c1 </td>",
            "</tr>",
            "<tr>",
            "<td> t0,1r4c0 </td>",
            "<td> t0,1r4c1 </td>",
            "</tr>",
            "<tr>",
            "<td> t0,1r5c0 </td>",
            "<td> t0,1r5c1 </td>",
            "</tr>",
            "</table>",
        ]
        self.assertEqual(cleanup_lines(table[1].t00r0c1), expected_data)

    def test_preserve_html_None(self):
        html = dedent(
            """
        <html>
          <body>
            <table>
              <tr>
                <td><b>f1</b></td>
                <td>f2</td>
                <td>f3</td>
              </tr>
              <tr>
                <td><i>r0f1</i></td>
                <td><i>r0f2</i></td>
                <td><i>r0f3</i></td>
              </tr>
            </table>
          </body>
        </html>
        """
        ).encode("utf-8")
        table = rows.import_from_html(
            BytesIO(html), encoding="utf-8", preserve_html=True
        )
        table2 = rows.import_from_html(
            BytesIO(html), encoding="utf-8", preserve_html=False
        )
        # With `preserve_html=True` the cell markup is kept as-is...
        self.assertEqual(table[0].f1, "<i>r0f1</i>")
        self.assertEqual(table[0].f2, "<i>r0f2</i>")
        self.assertEqual(table[0].f3, "<i>r0f3</i>")
        # ...and with `preserve_html=False` only the text is returned
        self.assertEqual(table2[0].f1, "r0f1")
        self.assertEqual(table2[0].f2, "r0f2")
        self.assertEqual(table2[0].f3, "r0f3")

    @mock.patch("rows.plugins.plugin_html.create_table")
    def test_preserve_html_and_not_skip_header(self, mocked_create_table):
        filename = "tests/data/table-with-sections.html"

        # If `import_from_html` needs to identify field names, then it
        # should not preserve HTML inside first row
        rows.import_from_html(filename, index=1, preserve_html=True)
        call_args = mocked_create_table.call_args
        data = list(call_args[0][0])
        kwargs = call_args[1]

        self.assertIsNone(kwargs.get("fields", None))
        self.assertEqual(len(data), 6)
        self.assertNotIn("<", data[0][1])
        self.assertNotIn(">", data[0][1])
        for row in data[1:]:
            # Second field has HTML
            self.assertIn("<", row[1])
            self.assertIn(">", row[1])

        # If we provide fields and ask to preserve HTML and to don't skip
        # header then the first row is regular data, so HTML must be
        # preserved in every row (including the first one)
        fields = OrderedDict(
            [
                ("first", rows.fields.TextField),
                ("second", rows.fields.TextField),
                ("third", rows.fields.TextField),
                ("fourth", rows.fields.TextField),
            ]
        )
        rows.import_from_html(
            filename, index=1, fields=fields, preserve_html=True, skip_header=False
        )
        # `call_args` always refers to the most recent call
        call_args = mocked_create_table.call_args
        data = list(call_args[0][0])
        kwargs = call_args[1]

        self.assertEqual(kwargs.get("fields", None), fields)
        self.assertEqual(len(data), 6)
        for row in data:
            # Second field has HTML and should not be stripped
            self.assertIn("<", row[1])
            self.assertIn(">", row[1])

    def test_ignore_colspan(self):
        filename = "tests/data/colspan-table.html"

        # Ignoring colspan: the row with the spanning title cell is not used,
        # so "field1"/"field2" become the field names
        with open(filename, mode="rb") as fobj:
            table = rows.import_from_html(fobj, ignore_colspan=True)
        self.assertEqual(set(table.fields.keys()), {"field1", "field2"})
        self.assertEqual(len(table), 2)
        self.assertEqual(table[0].field1, "row1field1")
        self.assertEqual(table[0].field2, "row1field2")
        self.assertEqual(table[1].field1, "row2field1")
        self.assertEqual(table[1].field2, "row2field2")

        # Not ignoring colspan: the spanning title row provides the header and
        # the "field1"/"field2" row becomes regular data
        with open(filename, mode="rb") as fobj:
            table = rows.import_from_html(fobj, ignore_colspan=False)
        self.assertEqual(list(table.fields.keys()), ["huge_title", "field_1"])
        self.assertEqual(len(table), 3)
        expected_data = [
            ["field1", "field2"],
            ["row1field1", "row1field2"],
            ["row2field1", "row2field2"],
        ]
        for row_data, table_row in zip(expected_data, table):
            self.assertEqual(row_data, [table_row.huge_title, table_row.field_1])

    def test_extract_properties(self):
        filename = "tests/data/properties-table.html"
        with open(filename, mode="rb") as fobj:
            table = rows.import_from_html(fobj, properties=True)

        # `properties=True` adds an extra JSON field holding row attributes
        self.assertEqual(table.field_names, ["field1", "field2", "properties"])
        self.assertEqual(
            table.field_types,
            [rows.fields.TextField, rows.fields.TextField, rows.fields.JSONField],
        )
        properties_1 = {"class": "some-class another-class", "data-test": "value"}
        properties_2 = {"class": "css-class", "data-test": "value2"}
        self.assertEqual(len(table), 2)
        self.assertEqual(table[0].field1, "row1field1")
        self.assertEqual(table[0].field2, "row1field2")
        self.assertEqual(table[0].properties, properties_1)
        self.assertEqual(table[1].field1, "row2field1")
        self.assertEqual(table[1].field2, "row2field2")
        self.assertEqual(table[1].properties, properties_2)

    def test_issue_168(self):
        temp = tempfile.NamedTemporaryFile(delete=False)
        # Only the name is used (as a prefix): close the handle and register
        # the placeholder file too, otherwise it is left behind on disk
        temp.close()
        self.files_to_delete.append(temp.name)
        filename = "{}.{}".format(temp.name, self.file_extension)
        self.files_to_delete.append(filename)

        table = rows.Table(fields=OrderedDict([("jsoncolumn", rows.fields.JSONField)]))
        table.append({"jsoncolumn": '{"python": 42}'})
        rows.export_to_html(table, filename)

        table2 = rows.import_from_html(filename)
        self.assert_table_equal(table, table2)

    def test_export_to_html_unescaped_content(self):
        # Special HTML characters in cell values must be escaped on export
        table = rows.Table(
            fields=OrderedDict([("unescaped_content", rows.fields.TextField)])
        )
        table.append({"unescaped_content": "<&>"})
        output = rows.export_to_html(table)
        self.assertIn(b"<td> &lt;&amp;&gt; </td>", output)


class PluginHtmlUtilsTestCase(unittest.TestCase):
    # Tests for the helper functions exposed by `rows.plugins.plugin_html`
    # (`tag_to_dict`, `_extract_node_text`, `extract_text`, `extract_links`).

    # Shared fixture: one anchor tag followed by trailing text
    html = '<a href="some-url" class="some-class"> some text </a> other'

    def test_tag_to_dict(self):
        # The tag's attributes and its inner text are returned as one dict
        expected = {"text": " some text ", "class": "some-class", "href": "some-url"}
        self.assertEqual(rows.plugins.plugin_html.tag_to_dict(self.html), expected)

    def test_extract_node_text(self):
        from lxml.html import document_fromstring

        markup = """<div>
                    some text
                    <a href="#"><b>bold</b> link <b>bold</b> text</a>
                  </div>"""
        document = document_fromstring(markup)
        anchor = document.xpath("//a")[0]
        # Text of the nested <b> elements is part of the anchor's text
        self.assertEqual(
            rows.plugins.plugin_html._extract_node_text(anchor),
            "bold link bold text",
        )

    def test_extract_text_from_html(self):
        # Real HTML from
        # <http://voos.infraero.gov.br/hstvoos/RelatorioPortal.aspx>
        flights_cell = """<td>
                        <span id="GridVoos_ATR_0">0</span>
                        <span id="GridVoos_LABEL2_0">(</span>
                        <span id="GridVoos_LABEL1_0">0</span>
                        <span id="GridVoos_LABEL3_0">%)</span>
                  </td>"""

        # (input HTML, expected extracted text), checked in this order
        cases = [
            (self.html, "some text other"),
            (flights_cell, "0 ( 0 %)"),
            # test HTML unescape
            ("<b>&Aacute;lvaro &amp; Python</b>", "Álvaro & Python"),
        ]
        for markup, expected in cases:
            extracted = rows.plugins.plugin_html.extract_text(markup)
            self.assertEqual(extracted, expected)

    def test_extract_links_from_html(self):
        # Real HTML from
        # <http://wnpp.debian.net/>
        markup = """
          <nobr> abcl
            <a href="http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=608466">[1]</a>
            <a href="http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=701712">[2]</a>
          </nobr>
        """
        links = rows.plugins.plugin_html.extract_links(markup)
        # Links come back in document order
        self.assertEqual(
            links,
            [
                "http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=608466",
                "http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=701712",
            ],
        )