File: smi_canonicalise

package info (click to toggle)
smiles-scripts 0.3.0%2Bsvn878%2Bbranch%2Bsystem%2Bdeps-2
links: PTS, VCS
area: main
in suites: sid, trixie
size: 3,860 kB
sloc: perl: 1,889; java: 1,218; sh: 1,092; makefile: 246
file content (280 lines) | stat: -rwxr-xr-x 9,559 bytes
#!/usr/bin/perl

use strict;
use warnings;

use Chemistry::OpenSMILES qw(
    is_cis_trans_bond
    is_double_bond
    is_single_bond
    valence
);
use Chemistry::OpenSMILES::Aromaticity qw( aromatise kekulise );
use Chemistry::OpenSMILES::Parser;
use Chemistry::OpenSMILES::Stereo qw(
    chirality_to_pseudograph
    cis_trans_to_pseudoedges
    mark_all_double_bonds
);
use Chemistry::OpenSMILES::Writer qw( write_SMILES );
use File::Basename qw( basename );
use Getopt::Long::Descriptive;
use Graph::Nauty qw( canonical_order );
use List::Util qw( any first shuffle );

$Graph::Nauty::worksize = 25600;

my $basename = basename $0;
my( $opt, $usage ) = describe_options( <<"END" . 'OPTIONS',
USAGE
    $basename [<args>] [<files>]

DESCRIPTION
    $basename reads in files with SMILES descriptors and outputs them
    according to stable atom ordering established by Graph::Nauty.
    Moieties, if more than one, are ordered in lexicographic order.

END
    [ raw => hidden => {
        one_of => [
            [ 'infer-hydrogens' =>
                'infer hydrogen atom counts according to valency rules [default]' ],
            [ 'no-infer-hydrogens' =>
                'do not infer hydrogen atom counts' ]
        ],
        default => 'infer_hydrogens'
      }
    ],
    [],
    [ aroma => hidden => {
        one_of => [
            [ 'aromatise' => 'aromatise Kekule structures ' .
                             '(experimental)' ],
            [ 'no-aromatise' => 'do not attempt to aromatise [default]' ]
        ],
        default => 'no-aromatise'
      }
    ],
    [ kekul => hidden => {
        one_of => [
            [ 'kekulise' => 'kekulise simple aromatic structures ' .
                            '(experimental)' ],
            [ 'no-kekulise' => 'do not attempt to kekulise [default]' ]
        ],
        default => 'no-kekulise'
      }
    ],
    [],
    [ haloanions => hidden => {
        one_of => [
            [ 'canonicalise-haloanions'    => 'canonicalise the representation of haloanions [default]' ],
            [ 'no-canonicalise-haloanions' => 'retain input representation of haloanions' ],
        ],
        default => 'canonicalise_haloanions'
      }
    ],
    [ nitro_groups => hidden => {
        one_of => [
            [ 'canonicalise-nitro-groups', 'canonicalise the representation of nitro groups by converting them from *-[N+]([O-])=O to *-N(=O)=O' ],
            [ 'no-canonicalise-nitro-groups', 'retain input representation of nitro groups [default]' ],
        ],
        default => 'no_canonicalise_nitro_groups'
      }
    ],
    [],
    [ 'ignore-class',
      'ignore SMILES atom class in canonicalisation (useful for testing)' ],
    [ 'random-order',
      'instead of canonical, output SMILES in random order (useful for testing)' ],
    [],
    [ 'help', 'print usage message and exit', { shortcircuit => 1 } ],
);

if( $opt->help ) {
    print $usage->text;
    exit;
}

my $errors = 0;
while (<>) {
    chomp;
    my $additional_position = '';
    if( s/\t([^\t]*)$// ) {
        $additional_position = ' ' . $1;
    }

    local $SIG{__WARN__} = sub {
        print STDERR "$basename: $ARGV($.)$additional_position: $_[0]";
    };

    my $parser = Chemistry::OpenSMILES::Parser->new;
    my @moieties;
    eval {
        @moieties = $parser->parse( $_, { raw => $opt->raw eq 'no_infer_hydrogens' } );
    };
    if( $@ ) {
        $@ =~ s/^[^:]+:\s*// if !index( $@, $0 );
        print STDERR "$basename: $ARGV($.)$additional_position: $@";
        $errors++;
    }

    my @smiles_parts;
    for my $moiety (@moieties) {
        aromatise( $moiety ) if $opt->aroma eq 'aromatise';
        canonicalise_haloanions( $moiety )
                    if $opt->haloanions eq 'canonicalise_haloanions';
        canonicalise_nitro_groups( $moiety )
                    if $opt->nitro_groups eq 'canonicalise_nitro_groups';

        my @order;
        if( !$opt->random_order ) {
            # copy() makes a shallow copy without edge attributes, thus they
            # have to be added later:
            my $copy = $moiety->copy;
            for my $bond ($moiety->edges) {
                next unless $moiety->has_edge_attribute( @$bond, 'bond' );
                $copy->set_edge_attribute( @$bond,
                                           'bond',
                                           $moiety->get_edge_attribute( @$bond, 'bond' ) );
            }
            cis_trans_to_pseudoedges( $copy );
            chirality_to_pseudograph( $copy );

            @order = canonical_order( $copy, \&represent_vertex );
            my %order;
            for (0..$#order) {
                $order{$order[$_]} = $_;
            }

            # Collect cis/trans bonds for marking them up
            my @cis_trans_bonds;
            for my $bond ($moiety->edges) {
                next unless is_double_bond( $moiety, @$bond );

                my $subgraph = $copy->subgraph( [ $moiety->neighbours( $bond->[0] ),
                                                  $moiety->neighbours( $bond->[1] ) ] );
                my $cis_trans_bond = first { $copy->has_edge_attribute( @$_, 'pseudo' ) }
                                           $subgraph->edges;
                next unless $cis_trans_bond;

                @$cis_trans_bond = reverse @$cis_trans_bond unless $subgraph->has_edge( $bond->[0], $cis_trans_bond->[0] );
                push @cis_trans_bonds, [ $cis_trans_bond->[0],
                                         $bond->[0],
                                         $bond->[1],
                                         $cis_trans_bond->[1],
                                         $copy->get_edge_attribute( @$cis_trans_bond, 'pseudo' ) ];
            }

            # Drop cis/trans markers from the input graph and mark them
            # anew.
            for my $bond ($moiety->edges) {
                next unless is_cis_trans_bond( $moiety, @$bond );
                $moiety->delete_edge_attribute( @$bond, 'bond' );
            }
            mark_all_double_bonds( $moiety,
                                   \@cis_trans_bonds,
                                   sub { $order{$_[0]} } );
        } else {
            @order = shuffle $moiety->vertices;
        }
        my %order;
        for (0..$#order) {
            $order{$order[$_]} = $_;
        }

        kekulise( $moiety, sub { $order{$_[0]} } ) if $opt->kekul eq 'kekulise';

        eval {
            my $part =
                 write_SMILES( $moiety,
                               {
                                    order_sub =>
                                        sub {
                                            my @sorted = sort { $order{$a} <=> $order{$b} }
                                                              keys %{$_[0]};
                                            return $_[0]->{shift @sorted};
                                        },
                                    raw => $opt->raw eq 'no_infer_hydrogens',
                               } );

            # In a SMILES descriptor, one can substitute all '/' with '\'
            # and vice versa, retaining correct cis/trans settings.
            # Similar rule is explained in O'Boyle (2012), Rule H.
            if( $part =~ /([\/\\])/ && $1 eq '\\' ) {
                $part =~ tr/\/\\/\\\//;
            }
            push @smiles_parts, $part;
        };
        if( $@ ) {
            print STDERR "$basename: $ARGV($.)$additional_position: $@";
            $errors++;
        }
    }

    $additional_position =~ s/^ /\t/;
    print join( '.', sort @smiles_parts ), $additional_position, "\n";
}

exit( $errors > 0 );

sub represent_vertex
{
    my( $vertex ) = @_;

    return '' unless %$vertex;

    my %atom = %$vertex;
    delete $atom{chirality};
    delete $atom{class} if $opt->ignore_class;
    return write_SMILES( \%atom );
}

# See https://projects.ibt.lt/repositories/issues/1622 for rationale and algorithm
sub canonicalise_haloanions
{
    my( $moiety ) = @_;

    return if ( $moiety->vertices < 5 || $moiety->vertices > 7 );

    my @anions = grep  { $moiety->degree($_) == 1 } $moiety->vertices;
    my $center = first { $moiety->degree($_)  > 3 } $moiety->vertices;

    return unless $center;
    return unless @anions == $moiety->vertices - 1;
    return unless $center->{symbol} =~ /^(As|Se|Si|[BPS])$/;
    return if any { $_->{symbol} !~ /^(At|Br|Cl|[FI])$/ } @anions;

    return unless any { $_->{charge} } @anions;
    return if any { $_->{charge} && $_->{charge} > 0 } @anions;

    for (@anions) {
        next unless exists $_->{charge};
        $center->{charge} += $_->{charge};
        delete $_->{charge};
    }
}

sub canonicalise_nitro_groups
{
    my( $moiety ) = @_;

    my @N = grep { $_->{symbol} eq 'N' &&
                   $_->{charge} &&
                   $_->{charge} == 1 &&
                   $moiety->degree( $_  ) == 3 &&
                   valence( $moiety, $_ ) == 4 } $moiety->vertices;
    for my $N (@N) {
        my @O = grep { $_->{symbol} eq 'O' && $moiety->degree( $_ ) == 1 }
                     $moiety->neighbours( $N );
        next unless @O >= 2;
        my $ketone = first { is_double_bond( $moiety, $N, $_ ) } @O;
        my $O      = first { is_single_bond( $moiety, $N, $_ ) &&
                             $_->{charge} &&
                             $_->{charge} == -1 } @O;
        next unless $ketone && $O;

        delete $N->{charge};
        delete $O->{charge};
        $moiety->set_edge_attribute( $N, $O, 'bond', '=' );
    }
}