File: cif_fix_AMCSD_atom_labels

package info (click to toggle)
cod-tools 3.11.0%2Bdfsg-1
links: PTS, VCS
area: main
in suites: trixie
size: 159,136 kB
sloc: perl: 58,707; sh: 41,323; ansic: 7,268; xml: 1,982; yacc: 1,117; makefile: 731; python: 166
file content (515 lines) | stat: -rwxr-xr-x 18,803 bytes
parent folder | download | duplicates (2)
#! /bin/sh
#!perl -w # --*- Perl -*--
eval 'exec perl -x $0 ${1+"$@"}'
    if 0;
#------------------------------------------------------------------------------
#$Author: saulius $
#$Date: 2024-08-15 11:23:14 +0300 (Thu, 15 Aug 2024) $
#$Revision: 10238 $
#$URL: svn+ssh://www.crystallography.net/home/coder/svn-repositories/cod-tools/tags/v3.11.0/scripts/cif_fix_AMCSD_atom_labels $
#------------------------------------------------------------------------------
#*
#* Read the CIF file, assuming that it follows AMCSD atom naming
#* conventions, and correct duplicated _atom_site_label and
#* _atom_site_aniso_label values.
#*
#* USAGE:
#*    $0 --options input1.cif input*.cif
#**

use strict;
use warnings;
use utf8;

use Unicode::Normalize;
use POSIX qw( strftime );
use COD::CIF::Parser qw( parse_cif );
use COD::CIF::Tags::DictTags;
use COD::CIF::Tags::COD;
use COD::CIF::Tags::CanonicalNames qw( canonicalize_all_names );
use COD::CIF::Tags::Print qw( print_cif );
use COD::CIF::Tags::Manage qw( set_loop_tag );
use COD::AtomProperties;
use COD::SOptions qw( getOptions );
use COD::SUsage qw( usage options );
use COD::ErrorHandler qw( process_parser_messages
                          process_warnings );
use COD::ToolsVersion qw( get_version_string );

my $Id = '$Id: cif_fix_AMCSD_atom_labels 10238 2024-08-15 08:23:14Z saulius $';

$Id =~ s/^\$|\s*\$$//g;
$Id = remove_diacritics( $Id );
$Id = get_version_string() . "\n" . $Id;

my $use_parser = 'c';

my $die_on_error_level = {
    ERROR   => 1,
    WARNING => 0,
    NOTE    => 0
};

my $keep_tag_order = 0;

#* OPTIONS:
#*   --keep-tag-order
#*                     Keep the original tag order in CIF file (default).
#*   --sort-tags
#*                     Reorder tags in CIF file according to COD.
#*
#*   --use-perl-parser
#*                     Use Perl parser to parse CIF files.
#*   --use-c-parser
#*                     Use C parser to parse CIF files (default).
#*
#*   --help, --usage
#*                     Output a short usage message (this message) and exit.
#*   --version
#*                     Output version information and exit.
#**
@ARGV = getOptions(
    '--keep-tag-order'  => sub { $keep_tag_order = 1; },
    '--sort-tags'       => sub { $keep_tag_order = 0; },
    '--use-perl-parser' => sub { $use_parser = 'perl' },
    '--use-c-parser'    => sub { $use_parser = 'c' },

    '--options'      => sub { options; exit },
    '--help,--usage' => sub { usage; exit },
    '--version'      => sub { print get_version_string(), "\n"; exit }
);

my @dictionary_tags = ( @COD::CIF::Tags::DictTags::tag_list,
                        @COD::CIF::Tags::COD::tag_list );
my %dictionary_tags = map { $_, $_ } @dictionary_tags;

@ARGV = ('-') unless @ARGV;

# Flush streams on each output:
STDERR->autoflush(1);
STDOUT->autoflush(1);

binmode STDOUT, ':encoding(UTF-8)';
binmode STDERR, ':encoding(UTF-8)';

sub get_max_key($)
{
    # Select maximal (last) key from the given CIF loop
    # column. Assume, as of revision 9535, that all key values are
    # numeric:
    my ($key_column) = @_;

    my $max_key = $key_column->[0];

    for my $key (@{$key_column}) {
        if( $max_key < $key ) {
            $max_key = $key;
        }
    }

    return $max_key;
}

sub get_next_available_key($)
{
    # Assuming that all keys in the given data column of a loop are
    # numeric, get the next available (unused) key value:
    return &get_max_key + 1;
}

sub insert_cod_changelog_entry($$)
{
    my ($dataset, $parameters) = @_;

    my $changed_atom_labels = $parameters->{changed_atom_labels};

    my $values = $dataset->{values};
    my $types = $dataset->{types};

    my $date = strftime('%Y-%m-%dT%H:%M:%S%z', localtime());

    # Convert the date string to strictly RFC 3339 conforming syntax,
    # i.e. add a colon (':') before the last two time zone digits:

    if ( $date =~ m/^(.*[0-9]{2})([0-9]{2})$/ ) {
        $date = $1 . ':' . $2;
    }

    my $change_description = '';

    if( $changed_atom_labels ) {
        $change_description = $changed_atom_labels
    }

    my $message =
        "Changed the following '_atom_site_label' values:\n" .
        $change_description;

    my $category_key = '_cod_changelog_entry_id';

    $message =~ s/\n$//;
    
    if( $change_description ne '' ) {
        if( exists $values->{$category_key} ) {
            my $key_column = $values->{$category_key};
            my $new_key = get_next_available_key( $key_column );

            push( @{$values->{$category_key}}, $new_key );
            push( @{$types->{$category_key}}, 'INT' );

            # Update those data items that already exist in the
            # loop. If the entry was missing, add that entry and fill
            # the previous missing values with the '?' value
            # ("unknown/missing"):
            {
                my $author_column = '_cod_changelog_entry_author';
                if( exists $values->{$author_column} ) {
                    push( @{$values->{$author_column}}, $Id );
                    push( @{$types->{$author_column}},'SQSTRING');
                } else {
                    my $loop_length = int(@{$values->{$category_key}});
                    my $value = [('?') x $loop_length];
                    $value->[-1] = $Id;
                    set_loop_tag( $dataset, $author_column, $category_key,
                                  $value );
                    push( @{$types->{$author_column}},
                          [('SQSTRING') x $loop_length] );
                }
            }

            {
                my $date_column = '_cod_changelog_entry_date';
                if( exists $values->{$date_column} ) {
                    push( @{$values->{$date_column}}, $date );
                    push( @{$types->{$date_column}},'SQSTRING');
                } else {
                    my $loop_length = int(@{$values->{$category_key}});
                    my $value = [('?') x $loop_length];
                    $value->[-1] = $date;
                    set_loop_tag( $dataset, $date_column, $category_key,
                                  $value );
                    push( @{$types->{$date_column}},
                          [('SQSTRING') x $loop_length] );
                }
            }

            {
                my $text_column = '_cod_changelog_entry_text';
                if( exists $values->{$text_column} ) {
                    push( @{$values->{$text_column}}, $message );
                    push( @{$types->{$text_column}},'TEXTFIELD');
                } else {
                    my $loop_length = int(@{$values->{$category_key}});
                    my $value = [('?') x $loop_length];
                    $value->[-1] = $message;
                    set_loop_tag( $dataset, $text_column, $category_key,
                                  $value );
                    push( @{$types->{$text_column}},
                          [('TEXTFIELD') x $loop_length] );
                }
            }

            # Fill the remaining data items with some placeholder values:
            my $loop_id = $dataset->{inloop}{$category_key};
            for my $changelog_loop_item (@{$dataset->{loops}[$loop_id]}) {
                if( scalar(@{$dataset->{values}{$changelog_loop_item}}) <
                    scalar(@{$key_column}) ) {
                    push( @{$values->{$changelog_loop_item}}, '?' );
                    push( @{$types->{$changelog_loop_item}}, 'UQSTRING' );
                }
            }
        } else {
            # Create a new category loop from scratch:
            set_loop_tag( $dataset, $category_key,
                          $category_key, [1] );
            push( @{$types->{$category_key}}, 'INT' );

            set_loop_tag( $dataset, '_cod_changelog_entry_author',
                          $category_key, [$Id] );
            push( @{$types->{_cod_changelog_entry_author}}, 'SQSTRING' );

            set_loop_tag( $dataset, '_cod_changelog_entry_date',
                          $category_key, [$date] );
            push( @{$types->{_cod_changelog_entry_date}}, 'SQSTRING' );

            set_loop_tag( $dataset, '_cod_changelog_entry_text',
                          $category_key, [$message] );
            push( @{$types->{_cod_changelog_entry_text}}, 'TEXTFIELD' );
        }
    } # if( $change_description ne '' ) {...

    return;
}

sub remove_diacritics
{
    my ($string) = @_;

    # from http://ahinea.com/en/tech/accented-translate.html
    # 2011.12.10
    local $_ = NFD($string);

    s/\pM//g;

    # additional normalizations:

    s/\x{00df}/ss/g;  ##  German beta “ß” -> “ss”
    s/\x{00c6}/AE/g;  ##  Æ
    s/\x{00e6}/ae/g;  ##  æ
    s/\x{0132}/IJ/g;  ##  Ĳ
    s/\x{0133}/ij/g;  ##  ĳ
    s/\x{0152}/Oe/g;  ##  Œ
    s/\x{0153}/oe/g;  ##  œ

    tr/\x{00d0}\x{0110}\x{00f0}\x{0111}\x{0126}\x{0127}/DDddHh/; # ÐĐðđĦħ
    tr/\x{0131}\x{0138}\x{013f}\x{0141}\x{0140}\x{0142}/ikLLll/; # ıĸĿŁŀł
    tr/\x{014a}\x{0149}\x{014b}\x{00d8}\x{00f8}\x{017f}/NnnOos/; # ŊŉŋØøſ
    tr/\x{00de}\x{0166}\x{00fe}\x{0167}/TTtt/;                   # ÞŦþŧ

    # clear everything else:
    s/[^\0-\x80]//g;

    return $_;
}

sub make_label($$)
{
    my ($label, $suffix) = @_;

    return $label . ($label =~ /\d$/ ? "_" : "") . $suffix;
}

sub generate_unique_label($$$)
{
    my ($label, $suffix, $existing_labels) = @_;

    for(;;) {
        my $new_label = make_label( $label, $suffix );
        if( !exists $existing_labels->{$new_label} ) {
            # Register the newly generated label:
            $existing_labels->{$new_label} = $new_label;
            # Return bot label and suffix:
            return ($new_label, $suffix);
        }
        if( $suffix == 2**32 - 1 ) {
            die "could not disambiguate label '$label' " .
                "with the max 32 bit integer...";
        }
        # Look for a new numeric suffix:
        $suffix ++;
    }
}

for my $filename (@ARGV) {
    my $options = { 'parser' => $use_parser, 'no_print' => 1 };
    my ( $data, $err_count, $messages ) = parse_cif( $filename, $options );
    process_parser_messages( $messages, $die_on_error_level );
    next if ( $err_count > 0 );

    canonicalize_all_names( $data );

    # The current practice of AMCSD and the COD contents (as of
    # 2023-04-27) show that the 'O-H' atom label is associated with a
    # hydroxyl and *needs* attached hydrogens (AMCSD convention),
    # while the 'OH' label, though also associated with a hydroxil
    # ion, does *not* need attached hydrogens, since hydrogens are
    # usually recorded explicitly. Thus, we will add attached
    # hydrogens to 'O-H' atoms, but only add atom type (and *not* the
    # attached hydrogen count) to 'OH" atoms. Thus different regexps
    # are used below for these atom names.

    # Also, it seems that in some cases a single quote character "'"
    # was changed to the star character "*" in _atom_site_label
    # values.

    # All these changes, however, were not propagated to
    # '_atom_site_aniso_label' data items, which this script will now
    # attempt to rectify.

    for my $dataset (@{$data}) {

        my $changed_aniso_label_description = '';

        local $SIG{__WARN__} = sub {
            process_warnings( {
                'message'  => @_,
                'program'  => $0,
                'filename' => $filename,
                'add_pos'  => 'data_' . $dataset->{'name'},
            }, $die_on_error_level )
        };

        my $values = $dataset->{values};
        my $datablock = "data_" . $dataset->{name};

        # Check if all atoms have unique labels and if the do not, try
        # to fix them:

        my $atom_site_label = '_atom_site_label';
        my $atom_site_aniso_label = '_atom_site_aniso_label';

        my $atom_site_label_values = $values->{$atom_site_label};
        my $atom_site_aniso_label_values = $values->{$atom_site_aniso_label};

        # Go through the existing labels and add disambuguation
        # suffixes where necessary:

        my %seen_atom_labels;

        my $change_log_message = "";
        
        if( !defined $atom_site_label_values ) {
            warn "'_atom_site_label' data items are not defined (?!)";
        } else {
            if( defined $atom_site_aniso_label_values ) {
                if( int(@{$atom_site_label_values}) !=
                    int(@{$atom_site_aniso_label_values}) ) {
                    warn "numbers of labels in '_atom_site_label' and " .
                        "'_atom_site_aniso_label' are not the same"
                } else {
                    print STDERR "$0: $filename $datablock: NOTE, " .
                        "the label and aniso label counts match :)\n";
                }
            } else {
                print STDERR "$0: $filename $datablock: NOTE, no " .
                    "'_atom_site_aniso_label' data\n";
            }

            # Build a catalogue of existing atom labels, to make sure that
            # the newly generated labels do not clash with them:

            my %existing_labels;

            for my $label (@{$atom_site_label_values}) {
                $existing_labels{$label} = $label;
            }

            # Check all labels and disambiguate them:

            for my $i (0 .. $#{$atom_site_label_values}) {
                my $label = $atom_site_label_values->[$i];

                if( !exists $seen_atom_labels{$label} ) {
                    my $suffix = 1;
                    my $new_label;

                    # Generate a unique label for the atom:
                    ($new_label, $suffix) =
                        generate_unique_label($label, $suffix, \%existing_labels);

                    push @{$seen_atom_labels{$label}}, [
                        $i,     # Index of the old label
                        $label, # The old label
                        $suffix # Disambiguation suffix for the old label,
                                # in case we need it.
                        ];
                } else {
                    if( @{$seen_atom_labels{$label}} == 1 ) {
                        # This is the first time we have encountered
                        # the dupicate; we need to change both the
                        # first label and the current one:

                        my $previous_label_index = $seen_atom_labels{$label}[0][0];
                        my $previous_label_value = $seen_atom_labels{$label}[0][1];
                        my $previous_label_suffix = $seen_atom_labels{$label}[0][2];

                        my $new_label = make_label
                            (
                             $previous_label_value,
                             $previous_label_suffix
                            );

                        $atom_site_label_values->
                            [$previous_label_index] = $new_label;

                        if( defined $atom_site_aniso_label_values ) {
                            if( $atom_site_aniso_label_values->
                                [$previous_label_index] eq
                                $previous_label_value ) {
                                $atom_site_aniso_label_values->
                                    [$previous_label_index] = $new_label;
                            } else {
                                my $previous_aniso_label =
                                    $atom_site_aniso_label_values->
                                    [$previous_label_index];

                                warn "the first atom site and aniso labels " .
                                    "'$previous_aniso_label'" .
                                    " and '$previous_label_value' " .
                                    "do not match -- will not change " .
                                    "the aniso label";
                            }
                        }

                        my $current_message = "the first " .
                            "duplicated label " .
                            "'$previous_label_value' " .
                            " converted to '$new_label'" .
                            "\n";

                        $change_log_message .= $current_message;

                        print STDERR "$0: $filename $datablock: WARNING, ",
                            $current_message;
                    }

                    my $last_index = $#{$seen_atom_labels{$label}};
                    my $new_suffix = $seen_atom_labels{$label}[$last_index][2] + 1;

                    # Generate a unique label for the atom:
                    my $new_label;
                    ($new_label, $new_suffix) = generate_unique_label
                        (
                         $label,
                         $new_suffix,
                         \%existing_labels
                        );

                    push @{$seen_atom_labels{$label}}, [
                        $i,         # Index of the current label
                        $label,     # The current label
                        $new_suffix # Disambiguation suffix for the current label
                        ];

                    $atom_site_label_values->[$i] = $new_label;

                    if( defined $atom_site_aniso_label_values ) {
                        if( $atom_site_aniso_label_values->[$i] eq
                            $label ) {
                            $atom_site_aniso_label_values->[$i] = $new_label;
                        } else {
                            warn "the atom site and aniso labels " .
                                "'$atom_site_aniso_label_values->[$i]'" .
                                " and '$label' do not match -- " .
                                "will not change the aniso label";
                        }
                    }

                    my $current_message .= "duplicated " .
                        "label '$label' at index $i converted to " .
                        "'$new_label'\n";

                    $change_log_message .= $current_message;
                    
                    print STDERR "$0: $filename $datablock: WARNING, ",
                        $current_message;
                }
            }
        }

        # Add a change log entry if needed:
        insert_cod_changelog_entry( $dataset,
                                    {
                                        changed_atom_labels => $change_log_message,
                                    } );

        print_cif( $dataset, {
            exclude_misspelled_tags => 0,
            preserve_loop_order => 1,
            fold_long_fields => 0,
            dictionary_tags => \%dictionary_tags,
            dictionary_tag_list => \@dictionary_tags,
            keep_tag_order => $keep_tag_order,
        } );
    }
}