File: smi_canonicalise

package info (click to toggle)
smiles-scripts 0.2.0%2Bdfsg1-4
links: PTS, VCS
area: main
in suites: bookworm
size: 3,208 kB
sloc: perl: 1,189; java: 1,094; sh: 1,052; makefile: 249
file content (178 lines) | stat: -rwxr-xr-x 5,683 bytes
#!/usr/bin/perl

use strict;
use warnings;

use Chemistry::OpenSMILES qw( is_cis_trans_bond );
use Chemistry::OpenSMILES::Aromaticity qw( aromatise kekulise );
use Chemistry::OpenSMILES::Parser;
use Chemistry::OpenSMILES::Stereo qw(
    chirality_to_pseudograph
    cis_trans_to_pseudoedges
    mark_all_double_bonds
);
use Chemistry::OpenSMILES::Writer qw( write_SMILES );
use File::Basename qw( basename );
use Getopt::Long::Descriptive;
use Graph::Nauty qw( canonical_order );
use List::Util qw( shuffle );

sub represent_vertex
{
    my( $vertex ) = @_;

    return '' unless %$vertex;

    my %atom = %$vertex;
    delete $atom{chirality};
    return write_SMILES( \%atom );
}

my %parser_options;
my $basename = basename $0;
my( $opt, $usage ) = describe_options( <<"END" . 'OPTIONS',
USAGE
    $basename [<args>] [<files>]

DESCRIPTION
    $basename reads in files with SMILES descriptors and outputs them
    according to stable atom ordering established by Graph::Nauty.
    Moieties, if more than one, are ordered in lexicographic order.
END
    [ raw => hidden => {
        one_of => [
            [ 'infer-hydrogens' =>
                'infer hydrogen atom counts according to valency rules [default]' ],
            [ 'no-infer-hydrogens' =>
                'do not infer hydrogen atom counts' ]
        ],
        default => 'infer_hydrogens'
      }
    ],
    [ aroma => hidden => {
        one_of => [
            [ 'aromatise' => 'aromatise Kekule structures ' .
                             '(experimental)' ],
            [ 'no-aromatise' => 'do not attempt to aromatise [default]' ]
        ],
        default => 'no-aromatise'
      }
    ],
    [ kekul => hidden => {
        one_of => [
            [ 'kekulise' => 'kekulise simple aromatic structures ' .
                            '(experimental)' ],
            [ 'no-kekulise' => 'do not attempt to kekulise [default]' ]
        ],
        default => 'no-kekulise'
      }
    ],
    [],
    [ 'random-order',
      'instead of canonical, output SMILES in random order (useful for testing)' ],
    [],
    [ 'help', 'print usage message and exit', { shortcircuit => 1 } ],
);

if( $opt->help ) {
    print $usage->text;
    exit;
}

if( $opt->raw eq 'no_infer_hydrogens' ) {
    $parser_options{raw} = 1;
}

my $errors = 0;
while (<>) {
    chomp;
    my $additional_position = '';
    if( s/\t([^\t]*)$// ) {
        $additional_position = ' ' . $1;
    }

    my $parser = Chemistry::OpenSMILES::Parser->new;
    my @moieties;
    eval {
        @moieties = $parser->parse( $_, \%parser_options );
    };
    if( $@ ) {
        $@ =~ s/^[^:]+:\s*// if !index( $@, $0 );
        print STDERR "$0: $ARGV($.)$additional_position: $@";
        $errors++;
    }

    my @smiles_parts;
    for my $moiety (@moieties) {
        aromatise( $moiety ) if $opt->aroma eq 'aromatise';
        kekulise( $moiety )  if $opt->kekul eq 'kekulise';
        my @order;
        if( !$opt->random_order ) {
            # copy() makes a shallow copy without edge attributes, thus they
            # have to be added later:
            my $copy = $moiety->copy;
            for my $bond ($moiety->edges) {
                next unless $moiety->has_edge_attribute( @$bond, 'bond' );
                $copy->set_edge_attribute( @$bond,
                                           'bond',
                                           $moiety->get_edge_attribute( @$bond, 'bond' ) );
            }
            cis_trans_to_pseudoedges( $copy );
            chirality_to_pseudograph( $copy );

            @order = canonical_order( $copy, \&represent_vertex );
            my %order;
            for (0..$#order) {
                $order{$order[$_]} = $_;
            }

            # Drop cis/trans markers from the input graph and mark them
            # anew.
            for my $bond ($moiety->edges) {
                next unless is_cis_trans_bond( $moiety, @$bond );
                $moiety->delete_edge_attribute( @$bond, 'bond' );
            }
            mark_all_double_bonds( $moiety,
                                   sub {
                                        if( $copy->has_edge( $_[0], $_[3] ) &&
                                            $copy->has_edge_attribute( $_[0], $_[3], 'pseudo' ) ) {
                                            return $copy->get_edge_attribute( $_[0], $_[3], 'pseudo' );
                                        }
                                   },
                                   sub { return $order{$_[0]} } );
        } else {
            @order = shuffle $moiety->vertices;
        }
        my %order;
        for (0..$#order) {
            $order{$order[$_]} = $_;
        }

        eval {
            my $part =
                 write_SMILES( $moiety,
                               sub {
                                    my @sorted = sort { $order{$a} <=> $order{$b} }
                                                      keys %{$_[0]};
                                    return $_[0]->{shift @sorted};
                               } );

            # In a SMILES descriptor, one can substitute all '/' with '\'
            # and vice versa, retaining correct cis/trans settings.
            # Similar rule is explained in O'Boyle (2012), Rule H.
            if( $part =~ /([\/\\])/ && $1 eq '\\' ) {
                $part =~ tr/\/\\/\\\//;
            }
            push @smiles_parts, $part;
        };
        if( $@ ) {
            print STDERR "$0: $ARGV($.)$additional_position: $@";
            $errors++;
        }
    }

    $additional_position =~ s/^ /\t/;
    print join( '.', sort @smiles_parts ), $additional_position, "\n";
}

exit( $errors > 0 );