1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
|
"""
Tests that ensure tables generated by fpdf2
can be extracted by well-known PDF table extraction tools
"""
import sys
from pathlib import Path
import camelot
from pandas import DataFrame
from pandas.testing import assert_frame_equal
import pytest
from test.table.test_table import TABLE_DATA
# Directory containing this test module; the PDF files under test are
# looked up relative to it.
HERE = Path(__file__).resolve().parent
# TABLE_DATA loaded as-is into a DataFrame (intermediate value only).
_TMP_DF = DataFrame(TABLE_DATA)
# Reference frame for comparisons: row 0 of _TMP_DF becomes the column labels
# and the remaining rows become the data.
# NOTE(review): presumably row 0 of TABLE_DATA holds the headings - confirm
# against test.table.test_table.
TABLE_DATA_AS_DF = DataFrame(_TMP_DF.values[1:], columns=_TMP_DF.iloc[0])
###############################################################################
################################### camelot ###################################
###############################################################################
if sys.platform not in ("cygwin", "win32", "darwin"):
    # The camelot tests are only defined on other platforms: GhostScript is not
    # installed in the GitHub Actions pipeline under Windows or macOS.

    @pytest.mark.parametrize("flavor", ("lattice", "stream"))
    @pytest.mark.parametrize(
        "filename",
        (
            "table_simple.pdf",
            "table_with_images.pdf",
            "table_with_images_and_img_fill_width.pdf",
            "table_with_headings_styled.pdf",
            "table_with_internal_layout.pdf",
        ),
    )
    def test_camelot_extract_simple_table(flavor, filename):
        """Expect one 4-column / 5-row table from each file, with both flavors."""
        pdf_path = HERE / filename
        _test_camelot_parse(pdf_path, flavor, col_count=4, row_count=5)

    @pytest.mark.parametrize(
        "filename",
        (
            "table_with_minimal_layout.pdf",
            "table_with_single_top_line_layout.pdf",
        ),
    )
    def test_camelot_extract_table_ok_with_only_stream_flavor(filename):
        """Expect one 4-column / 5-row table, using the "stream" flavor only."""
        pdf_path = HERE / filename
        _test_camelot_parse(pdf_path, "stream", col_count=4, row_count=5)

    @pytest.mark.parametrize(
        "filename",
        (
            "table_align.pdf",
            # "table_with_cell_fill.pdf",
        ),
    )
    def test_camelot_extract_two_tables(filename):
        """Expect two 4-column / 5-row tables, using the "lattice" flavor."""
        pdf_path = HERE / filename
        _test_camelot_parse(
            pdf_path, "lattice", col_count=4, row_count=5, table_count=2
        )

    @pytest.mark.xfail(
        reason="camelot does not successfully parse tables split on several pages"
    )
    @pytest.mark.parametrize("flavor", ("lattice", "stream"))
    def test_camelot_extract_two_pages_table(flavor):
        """Expect one 2-column / 5-row table; marked as an expected failure."""
        pdf_path = HERE / "table_with_multiline_cells.pdf"
        _test_camelot_parse(pdf_path, flavor, col_count=2, row_count=5)
def _test_camelot_parse(pdf_path, flavor, col_count, row_count, table_count=1):
    """Parse a PDF with camelot and check the shape of every table found.

    Args:
        pdf_path: location of the PDF; converted to ``str`` before being
            handed to ``camelot.read_pdf``.
        flavor: value forwarded as camelot's ``flavor`` argument.
        col_count: number of columns each extracted table must have.
        row_count: number of rows each extracted table must have.
        table_count: number of tables camelot must report (default 1).

    Raises:
        AssertionError: when the table count, or the column / row count of
            any table, differs from the expected value.
    """
    extracted = camelot.read_pdf(str(pdf_path), flavor=flavor)
    found_tables = extracted.n
    assert found_tables == table_count
    for parsed in extracted:
        found_cols = len(parsed.cols)
        assert found_cols == col_count
        found_rows = len(parsed.rows)
        assert found_rows == row_count
###############################################################################
################################### tabula ####################################
###############################################################################
@pytest.mark.skip(reason="tabula-py is not available in debian")
@pytest.mark.parametrize(
    "filename",
    (
        "table_simple.pdf",
        "table_with_headings_styled.pdf",
        # "table_with_internal_layout.pdf",  # tabula only parses the internal cells
        "table_with_minimal_layout.pdf",
        "table_with_single_top_line_layout.pdf",
    ),
)
def test_tabula_extract_simple_table(filename):
    """Check that tabula finds exactly one table equal to TABLE_DATA_AS_DF.

    Args:
        filename: name of a PDF located next to this test module.
    """
    # tabula is not imported at module level, so resolve it here: this avoids
    # a NameError if the skip marker is ever removed, and still skips the test
    # when the module cannot be imported.
    tabula = pytest.importorskip("tabula")
    dataframes = tabula.read_pdf(HERE / filename, pages="all")
    assert len(dataframes) == 1
    for df_raw in dataframes:
        # Cast "Age" to str before comparing.
        # NOTE(review): presumably tabula infers a numeric dtype for that column
        # while the reference frame holds strings - confirm.
        df = df_raw.astype({"Age": str})
        assert_frame_equal(df, TABLE_DATA_AS_DF, check_names=False)
@pytest.mark.skip(reason="tabula-py is not available in debian")
@pytest.mark.parametrize(
    "filename",
    (
        "table_align.pdf",
        "table_with_cell_fill.pdf",
    ),
)
def test_tabula_extract_two_tables(filename):
    """Check that tabula finds two tables, each equal to TABLE_DATA_AS_DF.

    Args:
        filename: name of a PDF located next to this test module.
    """
    # tabula is not imported at module level, so resolve it here: this avoids
    # a NameError if the skip marker is ever removed, and still skips the test
    # when the module cannot be imported.
    tabula = pytest.importorskip("tabula")
    dataframes = tabula.read_pdf(HERE / filename, pages="all")
    assert len(dataframes) == 2
    for df_raw in dataframes:
        # Cast "Age" to str before comparing.
        # NOTE(review): presumably tabula infers a numeric dtype for that column
        # while the reference frame holds strings - confirm.
        df = df_raw.astype({"Age": str})
        assert_frame_equal(df, TABLE_DATA_AS_DF, check_names=False)
@pytest.mark.skip(reason="tabula-py is not available in debian")
@pytest.mark.xfail(
    reason="tabula does not successfully parse tables split on several pages"
)
def test_tabula_extract_two_pages_table():
    """Check that tabula returns two 2-column frames for the multi-page PDF.

    Marked as an expected failure (and currently skipped altogether).
    """
    # tabula is not imported at module level, so resolve it here: this avoids
    # a NameError if the skip marker is ever removed, and still skips the test
    # when the module cannot be imported.
    tabula = pytest.importorskip("tabula")
    dataframes = tabula.read_pdf(HERE / "table_with_multiline_cells.pdf", pages="all")
    assert len(dataframes) == 2
    for df in dataframes:
        # Only the column count is checked; the row count is ignored.
        _rows_count, cols_count = df.shape
        assert cols_count == 2
|