1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
|
package Chemistry::File::SDF;
our $VERSION = '0.24'; # VERSION
# $Id$
use base "Chemistry::File";
use Chemistry::Mol;
use Chemistry::File::MDLMol;
use strict;
use warnings;
=head1 NAME
Chemistry::File::SDF - MDL Structure Data File reader/writer
=head1 SYNOPSIS
use Chemistry::File::SDF;
# Simple interface (all at once)
# read all the molecules in the file
my @mols = Chemistry::Mol->read('myfile.sdf');
# assuming that the file includes a <PKA> data item...
print $mols[0]->attr("sdf/data")->{PKA};
# write a bunch of molecules to an SDF file
Chemistry::Mol->write('myfile.sdf', mols => \@mols);
# or write just one molecule
$mol->write('myfile.sdf');
# Low level interface (one at a time)
# create reader
my $reader = Chemistry::Mol->file('myfile.sdf');
$reader->open('<');
while (my $mol = $reader->read_mol($reader->fh)) {
# do something with $mol
}
=cut
# Register this package with Chemistry::Mol as the handler for the 'sdf' format.
Chemistry::Mol->register_format(sdf => __PACKAGE__);
=head1 DESCRIPTION
MDL SDF (V2000) reader and writer.
This module automatically registers the 'sdf' format with Chemistry::Mol.
The parser returns a list of Chemistry::Mol objects. SDF data can be accessed
through the $mol->attr method: the data items are stored as a hash reference
(field name => value) under the "sdf/data" attribute, as shown in the synopsis.
When a data item has a single line in the SDF file, its value is stored as a
string; otherwise it is stored as an array reference holding the lines. The
rest of the information on the line that holds the field name is ignored.
This module is part of the PerlMol project, L<https://github.com/perlmol>.
=cut
# Read the next SDF record from $fh and return it as one string.
#
# Lines are accumulated up to, but not including, the "$$$$" record
# delimiter (or the end of the file), and CRLF / CR line endings are
# normalised to LF.
#
# Parameters:
#   $self - invocant (not used here)
#   $fh   - input handle; must support the ->eof method
#   %opts - accepted for interface compatibility; not used here
#
# Returns nothing when $fh is already at end of file, and undef when the
# record is empty (the delimiter is the first line read).
sub slurp_mol {
    my ($self, $fh, %opts) = @_;
    return if $fh->eof;
    my $record;
    # Read into a lexical so the caller's global $_ is left untouched.
    while (defined(my $line = <$fh>)) {
        last if $line =~ /^\$\$\$\$/;
        $record .= $line;
    }
    # An empty record leaves $record undefined; skip the substitution then
    # to avoid an "uninitialized value" warning.
    $record =~ s/\r\n?/\n/g if defined $record; # normalize EOL
    return $record;
}
# Advance $fh past the next SDF record without keeping its contents.
#
# Parameters:
#   $self - invocant (not used here)
#   $fh   - input handle; must support the ->eof method
#   %opts - accepted for interface compatibility; not used here
#
# Returns nothing when $fh is already at end of file, 1 when a "$$$$"
# delimiter line was consumed, and 0 when the input ran out before a
# delimiter was seen.
sub skip_mol {
    my ($self, $fh, %opts) = @_;
    return if $fh->eof;
    # Read into a lexical so the caller's global $_ is left untouched.
    while (defined(my $line = <$fh>)) {
        return 1 if $line =~ /^\$\$\$\$/;
    }
    return 0;
}
# Read one molecule from $fh.
#
# The record text comes from $self->slurp_mol; it is parsed with
# Chemistry::File::MDLMol->parse_string, and the same text is then handed
# to $self->parse_data together with the resulting molecule.
#
# Returns the molecule, or nothing when slurp_mol yields a false value.
sub read_mol {
    my ($self, $fh, %opts) = @_;
    my $record = $self->slurp_mol($fh, %opts);
    return unless $record;
    my $mol = Chemistry::File::MDLMol->parse_string($record, %opts);
    $self->parse_data($mol, $record);
    return $mol;
}
# Extract the SDF data items from a raw record and attach them to $mol.
#
# The record text is split at every newline that is followed by ">"; the
# first chunk (everything before the first data item) is discarded. In each
# remaining chunk the first line is the header, whose field name is the text
# between "<" and the next ">", and the following lines are the value.
#
# Parameters:
#   $self       - invocant (not used here)
#   $mol        - object with an ->attr($name, $value) method
#   $mol_string - full text of one SDF record
#
# A value with exactly one line is stored as a string; any other number of
# lines is stored as an array reference. All items are collected in one hash
# that is stored under the "sdf/data" attribute. Items without a field name
# are skipped with a warning.
#
# Returns whatever $mol->attr returns.
sub parse_data {
    my ($self, $mol, $mol_string) = @_;
    my (@items) = split /\n>/, $mol_string;
    shift @items; # drop everything until first datum
    my %data_block;
    for my $item (@items) {
        my ($header, @data) = split /\n/, $item;
        # $header is undefined for an empty item; do not match against it.
        my ($field_name) = defined $header ? $header =~ /<(.*?)>/ : ();
        # Test definedness and length, not truth, so a field named "0" is
        # kept. warn and next are separate statements: joined by a comma,
        # next would run while building warn's argument list and the
        # warning would never be printed.
        unless (defined $field_name && length $field_name) {
            warn "SDF: no field name\n";
            next;
        }
        $data_block{$field_name} = @data == 1 ? $data[0] : \@data;
    }
    return $mol->attr("sdf/data", \%data_block);
}
# Serialise one or more molecules as SDF text.
#
# Parameters:
#   $self    - invocant (not used here)
#   $mol_ref - molecule to write when no "mols" option is given
#   %opts    - if $opts{mols} is true it must be an array reference of
#              molecules, and those are written instead of $mol_ref
#
# For every molecule the output holds the result of
# $mol->print(format => 'mdl'), then the formatted "sdf/data" attribute,
# then a "$$$$" delimiter line.
#
# Returns the concatenated text ('' when the list of molecules is empty).
sub write_string {
    my ($self, $mol_ref, %opts) = @_;
    my @mols = $opts{mols} ? @{ $opts{mols} } : ($mol_ref);
    my $terminator = '$$$$' . "\n";
    my $out = '';
    foreach my $mol (@mols) {
        my $molfile = $mol->print(format => 'mdl');
        my $data    = format_data($mol->attr('sdf/data'));
        $out .= $molfile . $data . $terminator;
    }
    return $out;
}
# Format a hash of SDF data items as the data section of an SDF record.
#
# Parameters:
#   $data - hash reference mapping field names to values; a value is either
#           a plain string (one data line) or an array reference (one data
#           line per element). A false $data yields ''.
#
# Fields are written in sorted key order. Each one is a "> <NAME>" header
# line, then its data lines, then one blank line. Every data line is
# newline-terminated, so multi-line values get the same blank-line
# terminator as single-line ones.
#
# Returns the formatted text.
sub format_data {
    my ($data) = @_;
    my $ret = '';
    return $ret unless $data;
    for my $field_name (sort keys %$data) {
        my $value = $data->{$field_name};
        # Normalise to a list of lines; an undefined value is written as a
        # single empty line.
        my @lines = ref $value     ? @$value
                  : defined $value ? ($value)
                  :                  ('');
        $ret .= "> <$field_name>\n";
        for my $line (@lines) {
            $ret .= "$line\n";
        }
        $ret .= "\n"; # blank line ends the data item
    }
    return $ret;
}
# Tell whether a file name looks like an SDF file.
#
# Returns 1 when $fname ends in ".sd" or ".sdf" (case-insensitive),
# otherwise 0. The invocant is not used.
sub file_is {
    my (undef, $filename) = @_;
    return $filename =~ /\.sdf?$/i ? 1 : 0;
}
# Match a file name against the SDF extensions.
#
# Returns the result of the pattern match itself: true when $fname ends in
# ".sd" or ".sdf" (case-insensitive), false otherwise. The invocant is not
# used.
sub name_is {
    my (undef, $filename) = @_;
    return $filename =~ /\.sdf?$/i;
}
# Tell whether a string looks like SDF content.
#
# Parameters:
#   $self - invocant (not used here)
#   $s    - text to inspect
#
# Returns 1 when $s contains the "$$$$" record delimiter, otherwise 0
# (including when $s is undefined).
sub string_is {
    my ($self, $s) = @_;
    # Match against the argument; a bare pattern would test the unrelated
    # global $_ instead.
    return defined $s && $s =~ /\$\$\$\$/ ? 1 : 0;
}
1;
=head1 CAVEATS
Note that by storing the SDF data as a hash, there can be only one field with
a given name. The SDF format description is not entirely clear in this regard.
Also note that SDF data field names are considered to be case-sensitive.
=head1 SOURCE CODE REPOSITORY
L<https://github.com/perlmol/Chemistry-File-MDLMol>
=head1 SEE ALSO
L<Chemistry::Mol>
The MDL file format specification.
L<http://www.mdl.com/downloads/public/ctfile/ctfile.pdf> or
Arthur Dalby et al., J. Chem. Inf. Comput. Sci, 1992, 32, 244-255.
=head1 AUTHOR
Ivan Tubert-Brohman <itub@cpan.org>
=head1 COPYRIGHT
Copyright (c) 2009 Ivan Tubert-Brohman. All rights reserved. This program is
free software; you can redistribute it and/or modify it under the same terms as
Perl itself.
=cut
|