File: test_table_extraction.py

package info (click to toggle)
fpdf2 2.8.4-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 53,828 kB
sloc: python: 39,486; sh: 133; makefile: 12
file content (125 lines) | stat: -rw-r--r-- 4,325 bytes
"""
Tests that ensure tables generated by fpdf2
can be extracted by well-known PDF table extraction tools
"""

import sys
from pathlib import Path

import camelot
from pandas import DataFrame
from pandas.testing import assert_frame_equal
import pytest

from test.table.test_table import TABLE_DATA

# Directory containing this test module; PDF files under test are looked up relative to it.
HERE = Path(__file__).resolve().parent
# Intermediate frame built from the raw TABLE_DATA sequence
# (presumably its first row holds the column headings - confirm against test_table.py).
_TMP_DF = DataFrame(TABLE_DATA)
# Reference frame for comparisons: every row after the first one,
# with the first row of _TMP_DF used as the column labels.
TABLE_DATA_AS_DF = DataFrame(_TMP_DF.values[1:], columns=_TMP_DF.iloc[0])


###############################################################################
################################### camelot ###################################
###############################################################################

if sys.platform not in ("cygwin", "win32", "darwin"):
    # Disabling tests as GhostScript is not installed in GitHub Actions pipeline under Windows or macOS

    @pytest.mark.parametrize("flavor", ("lattice", "stream"))
    @pytest.mark.parametrize(
        "filename",
        (
            "table_simple.pdf",
            "table_with_images.pdf",
            "table_with_images_and_img_fill_width.pdf",
            "table_with_headings_styled.pdf",
            "table_with_internal_layout.pdf",
        ),
    )
    def test_camelot_extract_simple_table(flavor, filename):
        """Check that camelot finds one 4-column, 5-row table with either flavor."""
        pdf_path = HERE / filename
        _test_camelot_parse(pdf_path, flavor, col_count=4, row_count=5)

    @pytest.mark.parametrize(
        "filename",
        (
            "table_with_minimal_layout.pdf",
            "table_with_single_top_line_layout.pdf",
        ),
    )
    def test_camelot_extract_table_ok_with_only_stream_flavor(filename):
        """Check that the "stream" flavor finds one 4-column, 5-row table."""
        pdf_path = HERE / filename
        _test_camelot_parse(pdf_path, "stream", col_count=4, row_count=5)

    @pytest.mark.parametrize(
        "filename",
        (
            "table_align.pdf",
            # "table_with_cell_fill.pdf",
        ),
    )
    def test_camelot_extract_two_tables(filename):
        """Check that the "lattice" flavor finds two 4-column, 5-row tables."""
        pdf_path = HERE / filename
        _test_camelot_parse(
            pdf_path, "lattice", col_count=4, row_count=5, table_count=2
        )

    @pytest.mark.xfail(
        reason="camelot does not successfully parse tables split on several pages"
    )
    @pytest.mark.parametrize("flavor", ("lattice", "stream"))
    def test_camelot_extract_two_pages_table(flavor):
        """Try to extract one 2-column, 5-row table; marked as an expected failure."""
        multi_page_pdf = HERE / "table_with_multiline_cells.pdf"
        _test_camelot_parse(multi_page_pdf, flavor, col_count=2, row_count=5)

    def _test_camelot_parse(pdf_path, flavor, col_count, row_count, table_count=1):
        """
        Read ``pdf_path`` with camelot using the given ``flavor`` and assert
        that ``table_count`` tables are found, each one having ``col_count``
        columns and ``row_count`` rows.
        """
        # camelot is given the path as a plain string
        extracted = camelot.read_pdf(str(pdf_path), flavor=flavor)
        assert extracted.n == table_count
        for parsed_table in extracted:
            found_cols = len(parsed_table.cols)
            assert found_cols == col_count
            found_rows = len(parsed_table.rows)
            assert found_rows == row_count


###############################################################################
################################### tabula ####################################
###############################################################################

@pytest.mark.skip(reason="tabula-py is not available in debian")
@pytest.mark.parametrize(
    "filename",
    (
        "table_simple.pdf",
        "table_with_headings_styled.pdf",
        # "table_with_internal_layout.pdf",  # tabula only parses the internal cells
        "table_with_minimal_layout.pdf",
        "table_with_single_top_line_layout.pdf",
    ),
)
def test_tabula_extract_simple_table(filename):
    """
    Check that tabula extracts exactly one table from the PDF
    and that its content equals TABLE_DATA_AS_DF.
    """
    # tabula is not imported at module level: bind it here so that the body
    # never hits an undefined name, and skip cleanly if the package is missing.
    tabula = pytest.importorskip("tabula")
    dataframes = tabula.read_pdf(HERE / filename, pages="all")
    assert len(dataframes) == 1
    for df_raw in dataframes:
        # Cast the "Age" column to str before comparing with the reference frame
        df = df_raw.astype({"Age": str})
        assert_frame_equal(df, TABLE_DATA_AS_DF, check_names=False)


@pytest.mark.skip(reason="tabula-py is not available in debian")
@pytest.mark.parametrize(
    "filename",
    (
        "table_align.pdf",
        "table_with_cell_fill.pdf",
    ),
)
def test_tabula_extract_two_tables(filename):
    """
    Check that tabula extracts exactly two tables from the PDF
    and that the content of each one equals TABLE_DATA_AS_DF.
    """
    # tabula is not imported at module level: bind it here so that the body
    # never hits an undefined name, and skip cleanly if the package is missing.
    tabula = pytest.importorskip("tabula")
    dataframes = tabula.read_pdf(HERE / filename, pages="all")
    assert len(dataframes) == 2
    for df_raw in dataframes:
        # Cast the "Age" column to str before comparing with the reference frame
        df = df_raw.astype({"Age": str})
        assert_frame_equal(df, TABLE_DATA_AS_DF, check_names=False)


@pytest.mark.skip(reason="tabula-py is not available in debian")
@pytest.mark.xfail(
    reason="tabula does not successfully parse tables split on several pages"
)
def test_tabula_extract_two_pages_table():
    """
    Check that tabula extracts two dataframes, each with 2 columns,
    from "table_with_multiline_cells.pdf"; marked as an expected failure.
    """
    # tabula is not imported at module level: bind it here so that the body
    # never hits an undefined name, and skip cleanly if the package is missing.
    tabula = pytest.importorskip("tabula")
    dataframes = tabula.read_pdf(HERE / "table_with_multiline_cells.pdf", pages="all")
    assert len(dataframes) == 2
    for df in dataframes:
        # shape is (rows, columns): only the column count is checked
        assert df.shape[1] == 2