File: cif_find_duplicates

package info (click to toggle)
cod-tools 3.7.0%2Bdfsg-1
links: PTS, VCS
area: main
in suites: bookworm
size: 154,792 kB
sloc: perl: 57,588; sh: 36,842; ansic: 6,402; xml: 1,982; yacc: 1,117; makefile: 727; python: 166
file content (433 lines) | stat: -rwxr-xr-x 15,843 bytes
#! /bin/sh
#!perl -w # --*- Perl -*--
eval 'exec perl -x $0 ${1+"$@"}'
    if 0;
# Launcher preamble: when the file is started by /bin/sh, the 'eval' line
# above is a shell command that re-executes the file with 'perl -x',
# forwarding the command line arguments.  'perl -x' skips everything before
# the '#!perl' line and, because of the trailing 'if 0', never evaluates
# the quoted string itself.
#------------------------------------------------------------------------------
#$Author: antanas $
#$Date: 2021-04-28 19:35:53 +0300 (Wed, 28 Apr 2021) $
#$Revision: 8738 $
#$URL: svn+ssh://www.crystallography.net/home/coder/svn-repositories/cod-tools/tags/v3.7.0/scripts/cif_find_duplicates $
#------------------------------------------------------------------------------
#*
#* Find COD numbers for the .cif files in given directories or file lists.
#*
#* USAGE:
#*    $0 --options my-cif-dir1/ my-cif-dir2/files*.cif COD-cif-dir/
#**

use strict;
use warnings;
use File::Basename qw( basename );
use COD::CIF::Parser qw( parse_cif );
use COD::CIF::Data::CODNumbers qw( cif_fill_data entries_are_the_same );
use COD::CIF::Tags::CanonicalNames qw( canonicalize_all_names );
use COD::SOptions qw( getOptions );
use COD::SUsage qw( usage options );
use COD::ErrorHandler qw( process_warnings
                          process_errors
                          process_parser_messages
                          report_message );
use COD::ToolsVersion qw( get_version_string );

# Termination policy per message severity: a true value makes the script
# stop as soon as a message of that severity is raised.
my ( $die_on_errors, $die_on_warnings, $die_on_notes ) = ( 1, 0, 0 );

# Parser implementation requested from parse_cif(): 'c' or 'perl'.
my $use_parser = q(c);

# Default settings handed to entries_are_the_same().
my %options = (
    max_cell_length_diff => 0.5,    # angstroms
    max_cell_angle_diff  => 1.2,    # degrees
    check_bibliography   => 1,
    check_sample_history => 0,
    use_su               => 1,
);

# Default settings handed to cif_fill_data().
my %cif_fill_data_options = ( use_attached_hydrogens => 0 );

#* OPTIONS:
#*   -c, --always-continue
#*                     Continue processing and return successful return status
#*                     even if errors are diagnosed.
#*   -c-, --always-die
#*                     Stop and return error status if errors are diagnosed.
#*
#*   --max-cell-length-difference 0.5
#*                     Maximum difference of unit cell lengths allowed for
#*                     entries regarded as the same, in angstroms
#*                     (default: 0.5).
#*
#*   --max-cell-angle-difference 1.2
#*                     Maximum difference of unit cell angles allowed for
#*                     entries regarded as the same, in degrees
#*                     (default: 1.2).
#*
#*   --check-bibliography
#*                     Only CIFs that have different bibliography data are
#*                     declared different if all other parameters match.
#*                     CIFs with missing bibliographies are assumed to have
#*                     matching bibliographies (default).
#*   --no-check-bibliography, --dont-check-bibliography
#*                     Ignore bibliographic data of all CIFs; thus even files
#*                     with different bibliographies will be regarded the same
#*                     if their cells, chemical formulae and measurement
#*                     conditions match.
#*
#*   --use-sigma, --dont-ignore-sigma, --no-ignore-sigma
#*                     Use standard deviations (sigmas) when comparing unit
#*                     cell constants (default).
#*   --no-use-sigma, --dont-use-sigma, --ignore-sigma
#*                     Ignore standard uncertainties (sigmas) when comparing
#*                     unit cell constants.
#*
#*   --check-sample-history
#*                     Only CIFs that have different sample history data
#*                     (as recorded in the _exptl_crystal_thermal_history
#*                     and _exptl_crystal_pressure_history tags) are declared
#*                     different if all other parameters match.
#*   --no-check-sample-history,
#*   --dont-check-sample-history,
#*   --disregard-sample-history
#*                     Ignore sample history of all CIFs; thus even files
#*                     with different sample histories will be regarded the
#*                     same if their cells, chemical formulae and measurement
#*                     conditions match (default).
#*
#*   --use-attached-hydrogens
#*                     Include the number of implicit hydrogen atoms
#*                     when calculating the summary chemical formula.
#*                     Implicit hydrogen atoms are specified using
#*                     the _atom_site_attached_hydrogens data item.
#*   --no-use-attached-hydrogens,
#*   --ignore-attached-hydrogens
#*                     Ignore the number of implicit hydrogen atoms
#*                     when calculating the summary chemical formula.
#*                     Implicit hydrogen atoms are specified using
#*                     the _atom_site_attached_hydrogens data item
#*                     (default).
#*
#*   --continue-on-errors
#*                     Do not terminate script if errors are raised.
#*   --die-on-errors, --dont-continue-on-errors,
#*   --do-not-continue-on-errors, --no-continue-on-errors
#*                     Terminate script immediately if errors are raised (default).
#*   --continue-on-warnings
#*                     Do not terminate script if warnings are raised (default).
#*   --die-on-warnings
#*                     Terminate script immediately if warnings are raised.
#*   --continue-on-notes
#*                     Do not terminate script if notes are raised (default).
#*   --die-on-notes
#*                     Terminate script immediately if notes are raised.
#*
#*   --use-perl-parser
#*                     Use Perl parser to parse CIF files.
#*   --use-c-parser
#*                     Use C parser to parse CIF files (default).
#*
#*   --help, --usage
#*                     Output a short usage message (this message) and exit.
#*   --version
#*                     Output version information and exit.
#**
# Parse the command line.  Each key of the table handed to getOptions() is
# an option name (comma-separated when the option has aliases); each value
# is either a scalar reference (presumably filled with the option's
# argument) or a callback run when the option is seen.  The list returned
# by getOptions() replaces @ARGV and is treated by the rest of the script
# as the directory and file arguments.
@ARGV = getOptions(
    '--max-cell-length-difference' => \$options{'max_cell_length_diff'},
    '--max-cell-angle-difference'  => \$options{'max_cell_angle_diff'},

    '--check-bibliography'      => sub { $options{'check_bibliography'} = 1 },
    '--no-check-bibliography'   => sub { $options{'check_bibliography'} = 0 },
    '--dont-check-bibliography' => sub { $options{'check_bibliography'} = 0 },

    '--check-sample-history'
                            => sub { $options{'check_sample_history'} = 1 },
    '--no-check-sample-history'
                            => sub { $options{'check_sample_history'} = 0 },
    '--dont-check-sample-history'
                            => sub { $options{'check_sample_history'} = 0 },
    '--disregard-sample-history'
                            => sub { $options{'check_sample_history'} = 0 },

    # Switch the use of standard uncertainties on ...
    '--use-sigma'         => sub { $options{'use_su'} = 1 },
    '--no-ignore-sigma'   => sub { $options{'use_su'} = 1 },
    '--dont-ignore-sigma' => sub { $options{'use_su'} = 1 },

    # ... and off.  These options must clear the flag, as described in the
    # usage text; setting it to 1 would make them synonyms of '--use-sigma'.
    '--no-use-sigma'      => sub { $options{'use_su'} = 0 },
    '--dont-use-sigma'    => sub { $options{'use_su'} = 0 },
    '--ignore-sigma'      => sub { $options{'use_su'} = 0 },

    '--use-attached-hydrogens'
                => sub { $cif_fill_data_options{'use_attached_hydrogens'} = 1 },
    '--no-use-attached-hydrogens'
                => sub { $cif_fill_data_options{'use_attached_hydrogens'} = 0 },
    '--ignore-attached-hydrogens'
                => sub { $cif_fill_data_options{'use_attached_hydrogens'} = 0 },

    # Set the termination policy for all three severities at once.
    '-c,--always-continue'              => sub { $die_on_errors   = 0;
                                                 $die_on_warnings = 0;
                                                 $die_on_notes    = 0 },
    '-c-,--always-die'                  => sub { $die_on_errors   = 1;
                                                 $die_on_warnings = 1;
                                                 $die_on_notes    = 1 },

    '--continue-on-errors'          => sub { $die_on_errors = 0 },
    '--dont-continue-on-errors'     => sub { $die_on_errors = 1 },
    '--die-on-errors'               => sub { $die_on_errors = 1 },
    '--do-not-continue-on-errors'   => sub { $die_on_errors = 1 },
    '--no-continue-on-errors'       => sub { $die_on_errors = 1 },

    '--continue-on-warnings' => sub { $die_on_warnings = 0 },
    '--die-on-warnings'      => sub { $die_on_warnings = 1 },

    '--continue-on-notes'    => sub { $die_on_notes = 0 },
    '--die-on-notes'         => sub { $die_on_notes = 1 },

    '--use-perl-parser' => sub { $use_parser = 'perl' },
    '--use-c-parser'    => sub { $use_parser = 'c' },

    # Informational options: print the requested text and exit at once.
    '--options'      => sub { options; exit },
    '--help,--usage' => sub { usage; exit },
    '--version'      => sub { print get_version_string(), "\n"; exit }
);

# Severity name => "terminate on this severity" flag, taken from the
# variables set while parsing the command line; this table is what the
# message-processing routines receive.
my $die_on_error_level = {
    ERROR   => $die_on_errors,
    WARNING => $die_on_warnings,
    NOTE    => $die_on_notes,
};

my %has_numeric_value = map { $_ => 1 }
                        qw( _journal_year _journal_volume _journal_issue );

my %skip_tag = ( '_journal_name_full' => 0 );

# Structures read from the COD directory, grouped by formula string.
my %COD;

# Both output streams emit UTF-8.
for my $stream ( \*STDOUT, \*STDERR ) {
    binmode $stream, ':encoding(UTF-8)';
}

# At least one analysed location plus the COD directory must be given.
if( scalar( @ARGV ) < 2 ) {
    my $usage_complaint =
        'please supply at least two directory names on the command line -- '
      . 'names of directories with the analysed CIF files come first and '
      . 'the name of the directory with COD CIF files comes last';

    report_message( {
        'program'   => $0,
        'err_level' => 'ERROR',
        'message'   => $usage_complaint
    }, 1 );
}

# The last command line argument names the directory holding the COD CIF
# files; the remaining arguments stay in @ARGV.
my $COD_cif_dir = pop @ARGV;

# List the COD CIF files, sorted by sort(1).  The directory name is handed
# to the shell as a positional parameter ("$@") instead of being
# interpolated into the command text, so names containing spaces or shell
# metacharacters reach find(1) verbatim and cannot be executed as commands.
open( my $COD_file_list, '-|', '/bin/sh', '-c',
      'find "$@" -name "*.cif" -o -name "*.CIF" | sort',
      'sh', $COD_cif_dir )
    or report_message( {
        'program'   => $0,
        'err_level' => 'ERROR',
        'message'   => "could not run 'find' to list the CIF files in "
                     . "'$COD_cif_dir' -- $!"
    }, 1 );
my @COD_cif_files = <$COD_file_list>;
# The exit status of the pipeline is deliberately not inspected: problems
# reported by find(1) itself appear on STDERR and the files that were
# listed are still processed.
close $COD_file_list;

# Disabled debugging output: number and names of the listed files.
do {
    print int(@COD_cif_files), "\n";
    print "@COD_cif_files";
} if 0;

# Parse every COD file and register its structures in %COD under their
# formula strings.
for my $filename (@COD_cif_files) {
    chomp $filename;
    my $options = { 'parser' => $use_parser, 'no_print' => 1 };
    my ( $data, $err_count, $messages ) = parse_cif( $filename, $options );
    process_parser_messages( $messages, $die_on_error_level );
    # Files that produced parser errors are skipped entirely.
    next if ( $err_count > 0 );

    canonicalize_all_names( $data );

    if( !@{$data} ) {
        report_message( {
            'program'   => $0,
            'filename'  => $filename,
            'err_level' => 'WARNING',
            'message'   => 'file seems to be empty'
        }, $die_on_error_level->{'WARNING'} );
        next;
    }

    # Structures of this file, keyed by the 'id' field of the value
    # returned by cif_fill_data().
    my %structures = ();
    # Counts the structures accepted so far; passed on to cif_fill_data().
    my $index = 0;

    foreach my $dataset ( @{$data} ) {

        my $dataname = 'data_' . $dataset->{'name'};

        # Route warnings raised while this data block is processed through
        # process_warnings().  Perl passes the warning text to the handler
        # as its first argument, so it is taken explicitly rather than
        # spliced into the hash constructor as a list.
        local $SIG{__WARN__} = sub {
            process_warnings( {
                'message'  => $_[0],
                'program'  => $0,
                'filename' => $filename,
                'add_pos'  => $dataname
            }, $die_on_error_level )
        };

        eval {
            my $structure = cif_fill_data(
                                    $dataset,
                                    $filename,
                                    $index,
                                    \%cif_fill_data_options
                            );
            if ( defined $structure ) {
                $structures{$structure->{id}} = $structure;
                $index++;
            }
        };
        if ( $@ ) {
            process_errors( {
              'message'  => $@,
              'program'  => $0,
              'filename' => $filename,
              'add_pos'  => $dataname
            }, $die_on_error_level->{'ERROR'} );
        };
    };

    for my $id (keys %structures) {
        my $formula = $structures{$id}{chemical_formula_sum};

        # Structures without a declared formula are grouped under '?'.
        $formula = '?' unless defined $formula;

        push @{$COD{$formula}}, $structures{$id};

        # Register the structure under its calculated formula as well when
        # that differs from the declared one.  'defined' (rather than
        # 'exists') keeps an undefined calc_formula from being compared and
        # used as a hash key, matching the check made for analysed files.
        my $calc_formula = $structures{$id}{calc_formula};
        if( defined $calc_formula && $calc_formula ne $formula ) {
            push @{$COD{$calc_formula}}, $structures{$id};
        }
    }

}

# Debugging aid, disabled by the constant-false 'if 0': when enabled it
# passes a reference to the collected %COD table to serialiseRef().
# Note that 'use' takes effect at compile time, so COD::Serialise is
# loaded even though the body of the block never runs.
do {
    use COD::Serialise qw( serialiseRef );
    serialiseRef( \%COD );
} if 0;

#------------------------------------------------------------------------------

# List the CIF files to be analysed, sorted by sort(1).  The remaining
# command line arguments are handed to the shell as positional parameters
# ("$@") instead of being interpolated into the command text, so names
# containing spaces or shell metacharacters reach find(1) verbatim and
# cannot be executed as commands.
open( my $cif_file_list, '-|', '/bin/sh', '-c',
      'find "$@" -name "*.cif" -o -name "*.CIF" | sort',
      'sh', @ARGV )
    or report_message( {
        'program'   => $0,
        'err_level' => 'ERROR',
        'message'   => "could not run 'find' to list the analysed CIF "
                     . "files -- $!"
    }, 1 );
my @cif_files = <$cif_file_list>;
# The exit status of the pipeline is deliberately not inspected: problems
# reported by find(1) itself appear on STDERR and the files that were
# listed are still processed.
close $cif_file_list;

# Parse every analysed file and print, for each of its structures, the COD
# entries that entries_are_the_same() reports as matching.
for my $filename (@cif_files) {
    chomp $filename;
    my $options = { 'parser' => $use_parser, 'no_print' => 1 };
    my ( $data, $err_count, $messages ) = parse_cif( $filename, $options );
    process_parser_messages( $messages, $die_on_error_level );
    # Files that produced parser errors are skipped entirely.
    next if ( $err_count > 0 );

    canonicalize_all_names( $data );

    if( !@{$data} ) {
        report_message( {
            'program'   => $0,
            'filename'  => $filename,
            'err_level' => 'WARNING',
            'message'   => 'file seems to be empty'
        }, $die_on_error_level->{'WARNING'} );
        next;
    }

    # Structures of this file, keyed by the 'id' field of the value
    # returned by cif_fill_data().
    my %structures = ();
    # Counts the structures accepted so far; passed on to cif_fill_data().
    my $index = 0;

    foreach my $dataset ( @{$data} ) {

        my $dataname = 'data_' . $dataset->{'name'};

        # Route warnings raised while this data block is processed through
        # process_warnings().  Perl passes the warning text to the handler
        # as its first argument, so it is taken explicitly rather than
        # spliced into the hash constructor as a list.
        local $SIG{__WARN__} = sub {
            process_warnings( {
               'message'  => $_[0],
               'program'  => $0,
               'filename' => $filename,
               'add_pos'  => $dataname
             }, $die_on_error_level )
        };

        eval {
            my $structure = cif_fill_data(
                                    $dataset,
                                    $filename,
                                    $index,
                                    \%cif_fill_data_options
                            );
            if ( defined $structure ) {
                $structures{$structure->{id}} = $structure;
                $index++;
            }
        };
        if ( $@ ) {
            process_errors( {
              'message'  => $@,
              'program'  => $0,
              'filename' => $filename,
              'add_pos'  => $dataname
            }, $die_on_error_level->{'ERROR'} );
        };
    };

    for my $id (keys %structures) {
        my $formula = $structures{$id}{chemical_formula_sum};
        my $calc_formula = $structures{$id}{calc_formula};

        # Without a declared formula fall back to the calculated one, and
        # to '?' when neither is available.
        if (!defined $formula) {
            $formula = defined $calc_formula ? $calc_formula : '?';
        }
        # In case the declared formula and the calculated formula
        # match, the calculated formula should be undefined to avoid
        # getting duplicate entry results
        if (defined $calc_formula && $calc_formula eq $formula) {
            $calc_formula = undef
        }

        # Matching COD entries, keyed by the COD file name so that each
        # COD file is reported at most once per analysed structure.
        my %structures_found = ();

        if( defined $formula && defined $COD{$formula} ) {
            for my $COD_entry (@{$COD{$formula}}) {
                if( entries_are_the_same( $structures{$id},
                                          $COD_entry,
                                          \%options )) {
                    my $COD_key = $COD_entry->{filename};
                    $structures_found{$COD_key} = $COD_entry;
                }
            }
        }
        if( defined $calc_formula && defined $COD{$calc_formula} ) {
            ## print ">>> formula: '$formula', contents: '$calc_formula'\n";
            for my $COD_entry (@{$COD{$calc_formula}}) {
                if( entries_are_the_same( $structures{$id},
                                          $COD_entry,
                                          \%options )) {
                    my $COD_key = $COD_entry->{filename};
                    # A match already found via the declared formula wins.
                    if( !exists $structures_found{$COD_key} ) {
                        $structures_found{$COD_key} = $COD_entry;
                    }
                }
            }
        }

        # Whitespace inside the formula is replaced so that the formula
        # stays a single whitespace-free column in the output.
        my $final_formula = $formula;
        $final_formula =~ s/\s/_/g;

        # Number of distinct COD files that matched.
        my $n = keys %structures_found;

        # Output columns: formula, COD file name ('?' when nothing
        # matched), number of matches, analysed file name.
        if( $n > 0 ) {
            for my $key (sort keys %structures_found) {
                my $COD_entry = $structures_found{$key};
                printf
                    "%-35s %15s %3d %s\n",
                    $final_formula,
                    $COD_entry->{filename}, $n, $filename;
            }
        } else {
            printf "%-35s %15s %3d %s\n", $final_formula, '?', 0, $filename;
        }
    }
}