# Licensed under a 3-clause BSD style license - see LICENSE.rst """ This module tests some of the methods related to the ``HTML`` reader/writer and aims to document its functionality. Requires BeautifulSoup (the ``bs4`` package) to be installed. """ from .. import html from .. import core from ....table import Table import numpy as np from ....tests.helper import pytest from ....extern.six.moves import zip as izip from .common import (raises, assert_equal, assert_almost_equal, assert_true, setup_function, teardown_function) try: from itertools import izip except ImportError: izip = zip # Check to see if the BeautifulSoup dependency is present. try: from bs4 import BeautifulSoup HAS_BEAUTIFUL_SOUP = True except ImportError: HAS_BEAUTIFUL_SOUP = False @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_soupstring(): """ Test to make sure the class SoupString behaves properly: it is a str instance that compares equal to the source markup and keeps the original soup in its ``soup`` attribute. """ soup = BeautifulSoup('

foo

') soup_str = html.SoupString(soup) assert isinstance(soup_str, str) assert isinstance(soup_str, html.SoupString) assert soup_str == '

foo

' assert soup_str.soup is soup def test_listwriter(): """ Test to make sure the class ListWriter behaves properly: every write() call appends its argument to the wrapped list. """ lst = [] writer = html.ListWriter(lst) for i in range(5): writer.write(i) for ch in 'abcde': writer.write(ch) assert lst == [0, 1, 2, 3, 4, 'a', 'b', 'c', 'd', 'e'] @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_identify_table(): """ Test to make sure that identify_table() returns whether the given BeautifulSoup tag is the correct table to process. """ # Should return False on non-<table> tags and None soup = BeautifulSoup('') assert html.identify_table(soup, {}, 0) is False assert html.identify_table(None, {}, 0) is False soup = BeautifulSoup('

' \ '

A
B

').table assert html.identify_table(soup, {}, 2) is False assert html.identify_table(soup, {}, 1) is True # Default index of 1 # Same tests, but with explicit parameter assert html.identify_table(soup, {'table_id': 2}, 1) is False assert html.identify_table(soup, {'table_id': 1}, 1) is True # Test identification by string ID assert html.identify_table(soup, {'table_id': 'bar'}, 1) is False assert html.identify_table(soup, {'table_id': 'foo'}, 1) is True @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_missing_data(): """ Test reading a table with missing data """ # First with default where blank => '0' table_in = ['', '', '', '', '

A

1

'] dat = Table.read(table_in, format='ascii.html') assert dat.masked is True assert np.all(dat['A'].mask == [True, False]) assert dat['A'].dtype.kind == 'i' # Now with a specific value '...' => missing table_in = ['', '', '', '', '

A
...
1

'] dat = Table.read(table_in, format='ascii.html', fill_values=[('...', '0')]) assert dat.masked is True assert np.all(dat['A'].mask == [True, False]) assert dat['A'].dtype.kind == 'i' @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_rename_cols(): """ Test reading a table and renaming cols """ table_in = ['', '', '', '

A	B
1	2

'] # Swap column names dat = Table.read(table_in, format='ascii.html', names=['B', 'A']) assert dat.colnames == ['B', 'A'] assert len(dat) == 1 # Swap column names and only include A (the renamed version) dat = Table.read(table_in, format='ascii.html', names=['B', 'A'], include_names=['A']) assert dat.colnames == ['A'] assert len(dat) == 1 assert np.all(dat['A'] == 2) @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_no_names(): """ Test reading a table with no column header """ table_in = ['', '', '', '

'] dat = Table.read(table_in, format='ascii.html') assert dat.colnames == ['col1'] assert len(dat) == 2 dat = Table.read(table_in, format='ascii.html', names=['a']) assert dat.colnames == ['a'] assert len(dat) == 2 @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_identify_table_fail(): """ Raise an exception with an informative error message if table_id is not found. """ table_in = ['', '

A
B

'] with pytest.raises(core.InconsistentTableError) as err: Table.read(table_in, format='ascii.html', htmldict={'table_id': 'bad_id'}, guess=False) assert str(err).endswith("ERROR: HTML table id 'bad_id' not found") with pytest.raises(core.InconsistentTableError) as err: Table.read(table_in, format='ascii.html', htmldict={'table_id': 3}, guess=False) assert str(err).endswith("ERROR: HTML table number 3 not found") @pytest.mark.skipif('HAS_BEAUTIFUL_SOUP') def test_htmlinputter_no_bs4(): """ This should raise an OptionalTableImportError if BeautifulSoup is not installed. """ inputter = html.HTMLInputter() with pytest.raises(core.OptionalTableImportError): inputter.process_lines([]) @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_htmlinputter(): """ Test to ensure that HTMLInputter correctly converts input into a list of SoupStrings representing table elements. """ f = 't/html.html' with open(f) as fd: table = fd.read() inputter = html.HTMLInputter() inputter.html = {} # In absence of table_id, defaults to the first table expected = ['Column 1Column 2Column 3', '1a1.05', '2b2.75', '3c-1.25'] assert [str(x) for x in inputter.get_lines(table)] == expected # Should raise an InconsistentTableError if the table is not found inputter.html = {'table_id': 4} with pytest.raises(core.InconsistentTableError): inputter.get_lines(table) # Identification by string ID inputter.html['table_id'] = 'second' expected = ['Column AColumn BColumn C', '4d10.5', '5e27.5', '6f-12.5'] assert [str(x) for x in inputter.get_lines(table)] == expected # Identification by integer index inputter.html['table_id'] = 3 expected = ['C1C2C3', '7g105.0', '8h275.0', '9i-125.0'] assert [str(x) for x in inputter.get_lines(table)] == expected @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_htmlsplitter(): """ Test to make sure that HTMLSplitter correctly inputs lines of type SoupString to return a generator that gives all header and data elements. 
""" splitter = html.HTMLSplitter() lines = [html.SoupString(BeautifulSoup('Col 1Col 2').tr), html.SoupString(BeautifulSoup('Data 1Data 2').tr)] expected_data = [['Col 1', 'Col 2'], ['Data 1', 'Data 2']] assert list(splitter(lines)) == expected_data # Make sure the presence of a non-SoupString triggers a TypeError lines.append('Data 3Data 4') with pytest.raises(TypeError): list(splitter(lines)) # Make sure that passing an empty list triggers an error with pytest.raises(core.InconsistentTableError): list(splitter([])) @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_htmlheader_start(): """ Test to ensure that the start_line method of HTMLHeader returns the first line of header data. Uses t/html.html for sample input. """ f = 't/html.html' with open(f) as fd: table = fd.read() inputter = html.HTMLInputter() inputter.html = {} header = html.HTMLHeader() lines = inputter.get_lines(table) assert str(lines[header.start_line(lines)]) == \ 'Column 1Column 2Column 3' inputter.html['table_id'] = 'second' lines = inputter.get_lines(table) assert str(lines[header.start_line(lines)]) == \ 'Column AColumn BColumn C' inputter.html['table_id'] = 3 lines = inputter.get_lines(table) assert str(lines[header.start_line(lines)]) == \ 'C1C2C3' # start_line should return None if no valid header is found lines = [html.SoupString(BeautifulSoup('Data').tr), html.SoupString(BeautifulSoup('

Text

').p)] assert header.start_line(lines) is None # Should raise an error if a non-SoupString is present lines.append('Header') with pytest.raises(TypeError): header.start_line(lines) @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_htmldata(): """ Test to ensure that the start_line and end_line methods of HTMLData return the index of the first line of table data and the index one past the last line of table data, respectively. Uses t/html.html for sample input. """ f = 't/html.html' with open(f) as fd: table = fd.read() inputter = html.HTMLInputter() inputter.html = {} data = html.HTMLData() lines = inputter.get_lines(table) assert str(lines[data.start_line(lines)]) == \ '1a1.05' # end_line returns the index of the last data element + 1 assert str(lines[data.end_line(lines) - 1]) == \ '3c-1.25' inputter.html['table_id'] = 'second' lines = inputter.get_lines(table) assert str(lines[data.start_line(lines)]) == \ '4d10.5' assert str(lines[data.end_line(lines) - 1]) == \ '6f-12.5' inputter.html['table_id'] = 3 lines = inputter.get_lines(table) assert str(lines[data.start_line(lines)]) == \ '7g105.0' assert str(lines[data.end_line(lines) - 1]) == \ '9i-125.0' # start_line should raise an error if no table data exists lines = [html.SoupString(BeautifulSoup('').tr), html.SoupString(BeautifulSoup('

Text

').p)] with pytest.raises(core.InconsistentTableError): data.start_line(lines) # end_line should return None if no table data exists assert data.end_line(lines) is None # Should raise an error if a non-SoupString is present lines.append('Data') with pytest.raises(TypeError): data.start_line(lines) with pytest.raises(TypeError): data.end_line(lines) def test_multicolumn_write(): """ Test to make sure that the HTML writer writes multidimensional columns (those with iterable elements) using the colspan attribute of <th>. """ col1 = [1, 2, 3] col2 = [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)] col3 = [('a', 'a', 'a'), ('b', 'b', 'b'), ('c', 'c', 'c')] table = Table([col1, col2, col3], names=('C1', 'C2', 'C3')) expected = """\

C1	C2		C3
1	1.0	1.0	a	a	a
2	2.0	2.0	b	b	b
3	3.0	3.0	c	c	c

""" assert html.HTML().write(table)[0].strip() == expected.strip() def test_write_no_multicols(): """ Test to make sure that the HTML writer will not use multi-dimensional columns if the multicol parameter is False. """ col1 = [1, 2, 3] col2 = [(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)] col3 = [('a', 'a', 'a'), ('b', 'b', 'b'), ('c', 'c', 'c')] table = Table([col1, col2, col3], names=('C1', 'C2', 'C3')) expected = """\

C1	C2	C3
1	1.0 .. 1.0	a .. a
2	2.0 .. 2.0	b .. b
3	3.0 .. 3.0	c .. c

""" assert html.HTML({'multicol':False}).write(table)[0].strip() == \ expected.strip() @pytest.mark.skipif('not HAS_BEAUTIFUL_SOUP') def test_multicolumn_read(): """ Test to make sure that the HTML reader inputs multimensional columns (those with iterable elements) using the colspan attribute of . Ensure that any string element within a multidimensional column casts all elements to string prior to type conversion operations. """ table = Table.read('t/html2.html', format='ascii.html') str_type = np.dtype((np.str, 21)) expected = Table(np.array([(['1', '2.5000000000000000001'], 3), (['1a', '1'], 3.5)], dtype=[('A', str_type, (2,)), ('B', '