File: gdalcompare.py

package info (click to toggle)
gdal 3.11.3%2Bdfsg-1~exp1
links: PTS, VCS
area: main
in suites: experimental
size: 89,016 kB
sloc: cpp: 1,165,048; ansic: 208,864; python: 26,958; java: 5,972; xml: 4,611; sh: 3,776; cs: 2,508; yacc: 1,306; makefile: 213
file content (561 lines) | stat: -rw-r--r-- 19,324 bytes
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ******************************************************************************
#
#  Project:  GDAL
#  Purpose:  Compare two files for differences and report.
#  Author:   Frank Warmerdam, warmerdam@pobox.com
#
# ******************************************************************************
#  Copyright (c) 2012, Frank Warmerdam <warmerdam@pobox.com>
#  Copyright (c) 2021, Idan Miara <idan@miara.com>
#
# SPDX-License-Identifier: MIT
# ******************************************************************************

import array
import filecmp
import math
import os
import sys

from osgeo import gdal, osr

#######################################################
from osgeo_utils.auxiliary.base import PathLikeOrStr
from osgeo_utils.auxiliary.util import enable_gdal_exceptions

# Output function used for every difference report emitted by this module.
# NOTE(review): presumably kept as a module-level alias so that callers can
# rebind it to redirect or silence the report - confirm against users.
my_print = print


def compare_metadata(golden_md, new_md, md_id, options=None):
    """Compare two metadata dictionaries and report their differences.

    :param golden_md: reference metadata (mapping of str to str) or None.
    :param new_md: metadata under test (mapping of str to str) or None.
    :param md_id: label used in the report.  The special value ``"RPC"``
        makes the value comparison ignore leading/trailing white space.
    :param options: accepted for signature parity with the other compare_*
        functions; not used here.
    :return: the number of differences found (0 when equivalent).
    """
    if golden_md is None and new_md is None:
        return 0

    # A metadata domain present on one side only must be reported as a
    # difference rather than crash on ``None.keys()``.
    golden_md = {} if golden_md is None else golden_md
    new_md = {} if new_md is None else new_md

    found_diff = 0

    # Keys whose presence and value are never compared.
    dont_care_keys = ("backend", "ERR_BIAS", "ERR_RAND")
    golden_keys = [key for key in golden_md if key not in dont_care_keys]
    new_keys = [key for key in new_md if key not in dont_care_keys]

    if len(golden_keys) != len(new_keys):
        my_print("Difference in %s metadata key count" % md_id)
        my_print("  Golden Keys: " + str(golden_keys))
        my_print("  New Keys: " + str(new_keys))
        found_diff += 1

    is_rpc = md_id == "RPC"
    for key in golden_keys:
        if key not in new_md:
            my_print('New %s metadata lacks key "%s"' % (md_id, key))
            found_diff += 1
            continue

        golden_value = golden_md[key]
        new_value = new_md[key]
        if is_rpc:
            # The strip is because _RPC.TXT files and in-file RPC have a
            # difference in white space that is not otherwise meaningful.
            if new_value.strip() == golden_value.strip():
                continue
            headline = 'RPC Metadata value difference for key "'
        else:
            # NITF_FDT will always have the current date set.
            if new_value == golden_value or key == "NITF_FDT":
                continue
            headline = 'Metadata value difference for key "'

        my_print(headline + key + '"')
        my_print('  Golden: "' + golden_value + '"')
        my_print('  New:    "' + new_value + '"')
        found_diff += 1

    return found_diff


#######################################################
# Review and report on the actual image pixels that differ.
def compare_image_pixels(golden_band, new_band, id, options=None):
    """Report how many pixels differ between two bands and by how much.

    Both bands are read line by line as Float64 and subtracted.  Nothing is
    returned; the pixel count and the maximum absolute difference are printed.

    :param golden_band: reference band; its XSize/YSize drive the reads.
    :param new_band: band under test, read over the same window.
    :param id: band label, also used to build the dump file name.
    :param options: iterable of option strings (None means no option).
        ``DUMP_DIFFS`` writes the per-pixel differences to a GeoTIFF whose
        name is ``<prefix><id>.tif``, ``<prefix>`` being taken from an
        optional ``DUMP_DIFFS_PREFIX=<prefix>`` option.
    """
    # The default value must not break the membership test below.
    options = [] if options is None else options

    diff_count = 0
    max_diff = 0

    out_db = None
    diff_fn = None
    if "DUMP_DIFFS" in options:
        prefix = ""
        for opt in options:
            if opt.startswith("DUMP_DIFFS_PREFIX="):
                prefix = opt[len("DUMP_DIFFS_PREFIX=") :]
                break
        diff_fn = prefix + id.replace(" ", "_") + ".tif"
        out_db = gdal.GetDriverByName("GTiff").Create(
            diff_fn, golden_band.XSize, golden_band.YSize, 1, gdal.GDT_Float32
        )

    xsize = golden_band.XSize
    for line in range(golden_band.YSize):
        golden_line = array.array(
            "d", golden_band.ReadRaster(0, line, xsize, 1, buf_type=gdal.GDT_Float64)
        )
        new_line = array.array(
            "d", new_band.ReadRaster(0, line, xsize, 1, buf_type=gdal.GDT_Float64)
        )
        diff_line = [g - n for g, n in zip(golden_line, new_line)]
        # default=0 keeps an empty line from raising in max().
        max_diff_this_line = max((abs(x) for x in diff_line), default=0)
        max_diff = max(max_diff, max_diff_this_line)
        if max_diff_this_line:
            diff_count += sum(1 for x in diff_line if x)
        if out_db is not None:
            out_db.GetRasterBand(1).WriteRaster(
                0,
                line,
                xsize,
                1,
                array.array("d", diff_line).tobytes(),
                buf_type=gdal.GDT_Float64,
            )

    my_print("  Pixels Differing: " + str(diff_count))
    my_print("  Maximum Pixel Difference: " + str(max_diff))
    if out_db is not None:
        my_print("  Wrote Diffs to: %s" % diff_fn)


#######################################################


def _nodata_values_differ(golden_nodata, new_nodata):
    """Return True when two nodata values (numbers or None) are not the same.

    Two NaN values are treated as the same nodata value even though
    ``nan != nan``.  Infinities need no special casing: ``inf == inf`` already
    holds, and ``inf`` versus ``-inf`` is a genuine difference.
    """
    if golden_nodata is None or new_nodata is None:
        return golden_nodata is not new_nodata
    if math.isnan(golden_nodata) and math.isnan(new_nodata):
        return False
    return golden_nodata != new_nodata


def compare_band(golden_band, new_band, id, options=None):
    """Compare two raster bands and report their differences.

    Size, data type, nodata value, color interpretation, checksum (plus
    min/max for floating point bands), overviews, mask flags / per-dataset
    mask band, metadata and optionally the description are compared.

    :param golden_band: reference band.
    :param new_band: band under test.
    :param id: band label (str) used in the report.
    :param options: iterable of option strings (None means no option).
        ``SKIP_OVERVIEWS`` and ``SKIP_METADATA`` disable the matching checks,
        ``CHECK_BAND_DESC`` enables the description check; the options are
        also forwarded to the pixel comparison and to the recursive calls.
    :return: the number of differences found (0 when equivalent).
    """
    found_diff = 0

    options = [] if options is None else options

    if golden_band.XSize != new_band.XSize or golden_band.YSize != new_band.YSize:
        my_print(
            "Band size mismatch (band=%s golden=[%d,%d], new=[%d,%d])"
            % (id, golden_band.XSize, golden_band.YSize, new_band.XSize, new_band.YSize)
        )
        found_diff += 1

    if golden_band.DataType != new_band.DataType:
        my_print("Band %s pixel types differ." % id)
        my_print("  Golden: " + gdal.GetDataTypeName(golden_band.DataType))
        my_print("  New:    " + gdal.GetDataTypeName(new_band.DataType))
        found_diff += 1

    golden_nodata = golden_band.GetNoDataValue()
    new_nodata = new_band.GetNoDataValue()
    if _nodata_values_differ(golden_nodata, new_nodata):
        my_print("Band %s nodata values differ." % id)
        my_print("  Golden: " + str(golden_nodata))
        my_print("  New:    " + str(new_nodata))
        found_diff += 1

    golden_interp = golden_band.GetColorInterpretation()
    new_interp = new_band.GetColorInterpretation()
    if golden_interp != new_interp:
        my_print("Band %s color interpretation values differ." % id)
        my_print("  Golden: " + gdal.GetColorInterpretationName(golden_interp))
        my_print("  New:    " + gdal.GetColorInterpretationName(new_interp))
        found_diff += 1

    golden_band_checksum = golden_band.Checksum()
    new_band_checksum = new_band.Checksum()
    if golden_band_checksum != new_band_checksum:
        my_print("Band %s checksum difference:" % id)
        my_print("  Golden: " + str(golden_band_checksum))
        my_print("  New:    " + str(new_band_checksum))
        # Pixels are only walked when nothing else (size, type, ...) differs.
        if found_diff == 0:
            compare_image_pixels(golden_band, new_band, id, options)
        found_diff += 1
    elif golden_band.DataType in (gdal.GDT_Float32, gdal.GDT_Float64):
        # Check a bit deeper in case of Float data type for which the
        # Checksum() function is not reliable.
        golden_minmax = golden_band.ComputeRasterMinMax(can_return_none=True)
        new_minmax = new_band.ComputeRasterMinMax(can_return_none=True)
        if golden_minmax != new_minmax:
            my_print("Band %s statistics difference:" % id)
            my_print("  Golden: " + str(golden_band.ComputeBandStats()))
            my_print("  New:    " + str(new_band.ComputeBandStats()))
            # Forward the caller's options so that DUMP_DIFFS is honoured
            # here exactly as in the checksum branch.
            compare_image_pixels(golden_band, new_band, id, options)
            # A reported difference must also be counted.
            found_diff += 1

    # Check overviews
    if "SKIP_OVERVIEWS" not in options:
        golden_ovr_count = golden_band.GetOverviewCount()
        new_ovr_count = new_band.GetOverviewCount()
        if golden_ovr_count != new_ovr_count:
            my_print("Band %s overview count difference:" % id)
            my_print("  Golden: " + str(golden_ovr_count))
            my_print("  New:    " + str(new_ovr_count))
            found_diff += 1
        else:
            for i in range(golden_ovr_count):
                found_diff += compare_band(
                    golden_band.GetOverview(i),
                    new_band.GetOverview(i),
                    id + " overview " + str(i),
                    options,
                )

    # Mask band
    golden_mask_flags = golden_band.GetMaskFlags()
    new_mask_flags = new_band.GetMaskFlags()
    if golden_mask_flags != new_mask_flags:
        my_print("Band %s mask flags difference:" % id)
        my_print("  Golden: " + str(golden_mask_flags))
        my_print("  New:    " + str(new_mask_flags))
        found_diff += 1
    elif golden_mask_flags == gdal.GMF_PER_DATASET:
        # Check mask band if it's GMF_PER_DATASET
        found_diff += compare_band(
            golden_band.GetMaskBand(),
            new_band.GetMaskBand(),
            id + " mask band",
            options,
        )

    # Metadata
    if "SKIP_METADATA" not in options:
        found_diff += compare_metadata(
            golden_band.GetMetadata(), new_band.GetMetadata(), "Band " + id, options
        )

    # Band Description - currently this is opt in since we have not
    # been tracking this in the past.  It would be nice to make it the
    # default at some point.
    if "CHECK_BAND_DESC" in options:
        golden_desc = golden_band.GetDescription()
        new_desc = new_band.GetDescription()
        if golden_desc != new_desc:
            my_print("Band %s descriptions difference:" % id)
            my_print("  Golden: " + str(golden_desc))
            my_print("  New:    " + str(new_desc))
            found_diff += 1

    # TODO: Color Table, gain/bias, units, blocksize, mask, min/max

    return found_diff


#######################################################


def compare_srs(golden_wkt, new_wkt):
    """Compare two SRS definitions given as WKT strings.

    Identical strings count as no difference.  Any textual difference counts
    as one difference, even when IsSame() considers both SRS equivalent; the
    IsSame() verdict and both pretty-printed definitions are reported.

    :return: 0 when the WKT strings are equal, 1 otherwise.
    """
    if golden_wkt != new_wkt:
        my_print("Difference in SRS!")

        reference_srs = osr.SpatialReference(golden_wkt)
        candidate_srs = osr.SpatialReference(new_wkt)

        equivalent = reference_srs.IsSame(candidate_srs)
        my_print(
            "  * IsSame() reports them as equivalent."
            if equivalent
            else "  * IsSame() reports them as different."
        )

        # An empty/None WKT has nothing to pretty print.
        my_print("  Golden:")
        if golden_wkt:
            my_print("  " + reference_srs.ExportToPrettyWkt())
        else:
            my_print("  " + "None")

        my_print("  New:")
        if new_wkt:
            my_print("  " + candidate_srs.ExportToPrettyWkt())
        else:
            my_print("  " + "None")

        return 1

    return 0


#######################################################


def compare_db(golden_db, new_db, options=None):
    """Compare two opened datasets and report their differences.

    SRS, geotransform, dataset / RPC / GEOLOCATION metadata, band count and
    band dimensions are compared; when none of these differ, every band is
    compared with compare_band().

    :param golden_db: reference dataset.
    :param new_db: dataset under test.
    :param options: iterable of option strings (None means no option).
        ``SKIP_INTERLEAVE_CHECK``, ``SKIP_SRS``, ``SKIP_GEOTRANSFORM``,
        ``SKIP_METADATA``, ``SKIP_RPC`` and ``SKIP_GEOLOCATION`` disable the
        matching checks; the options are also forwarded to compare_band().
    :return: the number of differences found (0 when equivalent).
    :raises ValueError: when the golden dataset is pixel interleaved and has
        more than 10 bands (unless ``SKIP_INTERLEAVE_CHECK`` is set).
    """
    found_diff = 0

    options = [] if options is None else options

    # Comparisons are done per-band, so an image with 'INTERLEAVE=PIXEL' and a
    # lot of bands will take hours to complete.
    if "SKIP_INTERLEAVE_CHECK" not in options:
        maxbands = 10
        # Tolerate a missing IMAGE_STRUCTURE domain.
        image_structure = golden_db.GetMetadata("IMAGE_STRUCTURE") or {}
        interleave = image_structure.get("INTERLEAVE", "")
        if golden_db.RasterCount > maxbands and interleave.lower() == "pixel":
            raise ValueError(
                f"Golden file has more than {maxbands} bands and"
                f" INTERLEAVE={interleave} - this"
                f" check will eventually succeed but will take hours due to the"
                f" amount of I/O required for per-band comparisons. Recommend"
                f" testing image encoding directly in your test, and then"
                f" translating to a band interleaved format before calling this"
                f" method: {golden_db.GetDescription()}"
            )

    # SRS
    if "SKIP_SRS" not in options:
        found_diff += compare_srs(golden_db.GetProjection(), new_db.GetProjection())

    # GeoTransform
    if "SKIP_GEOTRANSFORM" not in options:
        golden_gt = golden_db.GetGeoTransform()
        new_gt = new_db.GetGeoTransform()
        if golden_gt != new_gt:
            my_print("GeoTransforms Differ:")
            my_print("  Golden: " + str(golden_gt))
            my_print("  New:    " + str(new_gt))
            found_diff += 1

    # Metadata
    if "SKIP_METADATA" not in options:
        found_diff += compare_metadata(
            golden_db.GetMetadata(), new_db.GetMetadata(), "Dataset", options
        )

    if "SKIP_RPC" not in options:
        found_diff += compare_metadata(
            golden_db.GetMetadata("RPC"), new_db.GetMetadata("RPC"), "RPC", options
        )

    if "SKIP_GEOLOCATION" not in options:
        found_diff += compare_metadata(
            golden_db.GetMetadata("GEOLOCATION"),
            new_db.GetMetadata("GEOLOCATION"),
            "GEOLOCATION",
            options,
        )

    # Bands: without the same band count, no per-band check makes sense.
    if golden_db.RasterCount != new_db.RasterCount:
        my_print(
            "Band count mismatch (golden=%d, new=%d)"
            % (golden_db.RasterCount, new_db.RasterCount)
        )
        found_diff += 1
        return found_diff

    # Dimensions.  Bands are reported with their 1-based GDAL number, the
    # same numbering that is handed to compare_band() below.
    for band_number in range(1, golden_db.RasterCount + 1):
        golden_band = golden_db.GetRasterBand(band_number)
        new_band = new_db.GetRasterBand(band_number)
        golden_size = (golden_band.XSize, golden_band.YSize)
        new_size = (new_band.XSize, new_band.YSize)

        if golden_size != new_size:
            my_print(
                "Band size mismatch (band=%d golden=[%d,%d], new=[%d,%d])"
                % ((band_number,) + golden_size + new_size)
            )
            found_diff += 1

    # If so-far-so-good, then compare pixels
    if found_diff == 0:
        for band_number in range(1, golden_db.RasterCount + 1):
            found_diff += compare_band(
                golden_db.GetRasterBand(band_number),
                new_db.GetRasterBand(band_number),
                str(band_number),
                options,
            )

    return found_diff


#######################################################


def compare_sds(golden_db, new_db, options=None):
    """Compare the subdatasets of two datasets, pair by pair.

    The ``SUBDATASETS`` metadata domain of both datasets is read; every
    ``SUBDATASET_<n>_NAME`` entry of the golden dataset is opened together
    with the entry of the same key in the new dataset, and both are compared
    with compare_db().

    :param golden_db: reference dataset.
    :param new_db: dataset under test.
    :param options: iterable of option strings forwarded to compare_db().
    :return: the number of differences found (0 when equivalent).
    """
    found_diff = 0

    options = [] if options is None else options

    # Tolerate a missing SUBDATASETS domain on either side.
    golden_sds = golden_db.GetMetadata("SUBDATASETS") or {}
    new_sds = new_db.GetMetadata("SUBDATASETS") or {}

    # Each subdataset contributes two entries (_NAME and _DESC).
    golden_count = len(golden_sds) // 2
    new_count = len(new_sds) // 2
    if golden_count != new_count:
        my_print(
            "Subdataset count mismatch (golden=%d, new=%d)"
            % (golden_count, new_count)
        )
        found_diff += 1

    for i in range(golden_count):
        key = "SUBDATASET_%d_NAME" % (i + 1)

        # Report a missing counterpart instead of failing with a KeyError.
        if key not in new_sds:
            my_print('New file lacks subdataset "%s"' % key)
            found_diff += 1
            continue

        sub_golden_db = gdal.Open(golden_sds[key])
        sub_new_db = gdal.Open(new_sds[key])

        sds_diff = compare_db(sub_golden_db, sub_new_db, options)
        found_diff += sds_diff
        if sds_diff > 0:
            my_print(
                "%d differences found between:\n  %s\n  %s"
                % (sds_diff, golden_sds[key], new_sds[key])
            )

    return found_diff


#######################################################


def _compare_binary_vsi(golden_file, new_file):
    """Byte-compare two files through the GDAL VSI API; return 1 if they differ.

    Returns 0 when the files are identical or when the comparison has to be
    skipped (a file cannot be stat'ed or opened, or a read comes up short).
    """
    stat_golden = gdal.VSIStatL(str(golden_file))
    stat_new = gdal.VSIStatL(str(new_file))
    if not stat_golden:
        my_print("Skipped binary file comparison, golden file not in filesystem.")
        return 0
    if not stat_new:
        my_print("Skipped binary file comparison, new file not in filesystem.")
        return 0

    if stat_golden.size != stat_new.size:
        my_print("Files differ at the binary level.")
        return 1

    f_golden = None
    f_new = None
    try:
        f_golden = gdal.VSIFOpenL(str(golden_file), "rb")
        f_new = gdal.VSIFOpenL(str(new_file), "rb")
        if not (f_golden and f_new):
            return 0

        remaining = stat_golden.size
        while remaining > 0:
            to_read = min(remaining, 1024 * 1024)
            golden_chunk = gdal.VSIFReadL(1, to_read, f_golden) or b""
            if len(golden_chunk) < to_read:
                my_print(
                    "Binary file comparison failed: not enough bytes read in golden file"
                )
                return 0
            new_chunk = gdal.VSIFReadL(1, to_read, f_new)
            if golden_chunk != new_chunk:
                my_print("Files differ at the binary level.")
                return 1
            remaining -= to_read
        return 0
    finally:
        # Always release the handles, even when a read raises.
        if f_golden:
            gdal.VSIFCloseL(f_golden)
        if f_new:
            gdal.VSIFCloseL(f_new)


def _compare_binary(golden_file, new_file):
    """Byte-compare two files; return 1 if they differ, 0 otherwise.

    Regular files are compared with filecmp; when that fails with OSError
    the comparison falls back to the GDAL VSI API.
    """
    try:
        os.stat(golden_file)
        os.stat(new_file)
        identical = filecmp.cmp(golden_file, new_file)
    except OSError:
        return _compare_binary_vsi(golden_file, new_file)

    if not identical:
        my_print("Files differ at the binary level.")
        return 1
    return 0


def find_diff(
    golden_file: PathLikeOrStr,
    new_file: PathLikeOrStr,
    check_sds: bool = False,
    options=None,
):
    """Compare two files, first as raw bytes and then as GDAL datasets.

    :param golden_file: path of the reference file.
    :param new_file: path of the file under test.
    :param check_sds: when true, subdatasets are compared as well.
    :param options: iterable of option strings (None means no option).
        ``SKIP_BINARY`` disables the byte-level comparison; the options are
        also forwarded to compare_db() and compare_sds().
    :return: the number of differences found (0 when equivalent).
    """
    found_diff = 0

    options = [] if options is None else options

    if "SKIP_BINARY" not in options:
        # compare raw binary files.
        found_diff += _compare_binary(golden_file, new_file)

    # compare as GDAL Datasets.
    golden_db = gdal.Open(golden_file)
    new_db = gdal.Open(new_file)
    found_diff += compare_db(golden_db, new_db, options)

    if check_sds:
        found_diff += compare_sds(golden_db, new_db, options)

    return found_diff


#######################################################


def Usage(isError=True):
    """Print the command line synopsis and return the matching exit status.

    The text goes to stderr and the status is 2 when ``isError`` is true;
    otherwise the text goes to stdout and the status is 0.
    """
    if isError:
        stream, status = sys.stderr, 2
    else:
        stream, status = sys.stdout, 0

    synopsis = (
        "Usage: gdalcompare [--help] [--help-general]",
        "                      [-dumpdiffs] [-skip_binary] [-skip_overviews]",
        "                      [-skip_geolocation] [-skip_geotransform]",
        "                      [-skip_metadata] [-skip_rpc] [-skip_srs]",
        "                      [-sds] <golden_file> <new_file>",
    )
    for text in synopsis:
        print(text, file=stream)

    return status


#######################################################
#
# Mainline
#


@enable_gdal_exceptions
def main(argv=sys.argv):
    """Command line entry point of gdalcompare.

    :param argv: full argument vector, program name included.
    :return: 0 when GDAL's general option processing consumed the command
        line or ``--help`` was requested, 2 on a usage error.  After an
        actual comparison the function does not return: it calls
        ``sys.exit()`` with the number of differences found.
    """
    # Default GDAL argument parsing.
    argv = gdal.GeneralCmdLineProcessor(argv)
    if argv is None:
        return 0

    # Command line switches that simply turn on a comparison option.
    switch_to_option = {
        "-dumpdiffs": "DUMP_DIFFS",
        "-skip_binary": "SKIP_BINARY",
        "-skip_overviews": "SKIP_OVERVIEWS",
        "-skip_geolocation": "SKIP_GEOLOCATION",
        "-skip_geotransform": "SKIP_GEOTRANSFORM",
        "-skip_metadata": "SKIP_METADATA",
        "-skip_rpc": "SKIP_RPC",
        "-skip_srs": "SKIP_SRS",
    }

    # Script argument parsing.
    golden_file = None
    new_file = None
    check_sds = False
    options = []

    for arg in argv[1:]:
        if arg == "--help":
            return Usage(isError=False)

        if arg == "-sds":
            check_sds = True
        elif arg in switch_to_option:
            options.append(switch_to_option[arg])
        elif golden_file is None:
            golden_file = arg
        elif new_file is None:
            new_file = arg
        else:
            my_print("Unrecognised argument: " + arg)
            return Usage()

    # Both file names are mandatory; checking only for an empty command line
    # would let e.g. a lone switch or a single file reach find_diff() as None.
    if golden_file is None or new_file is None:
        return Usage()

    found_diff = find_diff(golden_file, new_file, check_sds, options)
    print("Differences Found: " + str(found_diff))
    sys.exit(found_diff)


# Run the tool when the module is executed as a script; whatever main()
# returns is used as the process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))