File: cif2rdf

package info (click to toggle)
cod-tools 3.7.0+dfsg-1
links: PTS, VCS
area: main
in suites: bookworm
size: 154,792 kB
sloc: perl: 57,588; sh: 36,842; ansic: 6,402; xml: 1,982; yacc: 1,117; makefile: 727; python: 166
file content (415 lines) | stat: -rwxr-xr-x 16,870 bytes
#! /bin/sh
#!perl -w # --*- Perl -*--
eval 'exec perl -x $0 ${1+"$@"}'
    if 0;
#------------------------------------------------------------------------------
#$Author: antanas $
#$Date: 2022-08-22 15:05:56 +0300 (Mon, 22 Aug 2022) $
#$Revision: 9377 $
#$URL: svn+ssh://www.crystallography.net/home/coder/svn-repositories/cod-tools/tags/v3.7.0/scripts/cif2rdf $
#------------------------------------------------------------------------------
#*
#* Parse a CIF file and describe its properties in RDF format.
#*
#* USAGE:
#*    $0 --options input.cif inputs*.cif
#**

use strict;
use warnings;
use COD::CIF::Parser qw( parse_cif );
use COD::CIF::Data::CIF2COD qw( cif2cod );
use COD::CIF::Tags::CanonicalNames qw( canonicalize_all_names );
use COD::SOptions qw( getOptions );
use COD::SUsage qw( usage options );
use COD::RDF qw( rdf_n3 rdf_ntriples rdf_xml );
use COD::ErrorHandler qw( process_warnings
                          process_errors
                          process_parser_messages );
use COD::ToolsVersion qw( get_version_string );

# Column names given with '--columns'; stays undefined unless the option
# is used.
my $user_columns;

# CIF parser implementation to use: 'c' or 'perl'.
my $use_parser = 'c';

# Message levels that terminate the script when raised.
my ( $die_on_errors, $die_on_warnings, $die_on_notes ) = ( 1, 0, 0 );

# RDF carrier format of the output: 'xml', 'n3', 'turtle' or 'ntriples'.
my $output_format = 'xml';

# When true, input files are not read at all.
my $ignore_input = 0;

# Options handed over to cif2cod().
my %cif2cod_options = (
    require_only_doi             => 0,
    reformat_space_group         => 0,
    use_attached_hydrogens       => 1,
    use_datablocks_without_coord => 0,
);

# Options handed over to the RDF serialisation subroutines.
my $rdf_options = {
    url_prefix                   => 'http://www.crystallography.net/cod/',
    url_postfix                  => '.html',
    vocabulary_name              => 'cod',
    vocabulary_url_prefix        => 'http://www.crystallography.net/cod/doc/rdf/',
    replace_utf_code_points_from => undef,
    utf_code_point_format        => '&#x%04X;',  # HTML escapes by default
    split_author_names           => 1,
    decode                       => 0,           # strings are UTF already
};
#* OPTIONS:
#*   -C, --cod-number 1000000
#*                     Use the specified number, 1000000 in this example, as
#*                     a COD number for this structure; do not take the number
#*                     from the data block name.
#*
#*   -c, --always-continue
#*                     Continue processing and return successful return status
#*                     even if errors are diagnosed.
#*   -c-, --always-die
#*                     Stop and return error status if errors are diagnosed.
#*
#*   --use-attached-hydrogens
#*                     Include number of implicit hydrogens, specified using
#*                     _atom_site_attached_hydrogens tag, into the formula
#*                     sum (default).
#*   --no-use-attached-hydrogens,
#*   --dont-use-attached-hydrogens,
#*   --ignore-attached-hydrogens
#*                     Ignore number of implicit hydrogens, specified using
#*                     _atom_site_attached_hydrogens tag, in calculation of
#*                     the formula sum.
#*
#*   --reformat-space-group
#*                     Correct the formatting of Hermann-Mauguin symmetry
#*                     space group symbol.
#*   --no-reformat-space-group,
#*   --dont-reformat-space-group,
#*   --leave-space-group
#*                     Do not correct the formatting of Hermann-Mauguin
#*                     symmetry space group symbol (default).
#*
#*   --use-datablocks-without-coordinates,
#*   --use-all-datablocks
#*                     Do not filter out data blocks without coordinates.
#*   --no-use-datablocks-without-coordinates,
#*   --do-not-use-datablocks-without-coordinates,
#*   --dont-use-datablocks-without-coordinates,
#*   --skip-datablocks-without-coordinates
#*                     Filter out data blocks without coordinates (default).
#*
#*   --require-only-doi
#*                     Do not require all bibliographic details (authors,
#*                     journal name, title, year, volume and first page of
#*                     the publication) to be present if publication DOI is
#*                     specified.
#*   --require-full-bibliography
#*                     Require author names, journal name, volume, publication
#*                     title, year and first page to be present (default).
#*
#*   --columns file,flags,Robs
#*   --columns "file flags Robs"
#*                     Print only columns specified in this option.
#*
#*   --vocabulary-namespace cod
#*                     Use vocabulary namespace "cod" to prefix properties
#*                     native to the queried database.
#*
#*   --vocabulary-url-prefix http://www.crystallography.net/cod/doc/rdf/
#*                     Specify URL prefix for database's native namespace.
#*
#*   --prefix, --db-url-prefix http://www.crystallography.net/cod/
#*                     Specify URL prefix for all objects in the queried database.
#*
#*   --postfix, --db-url-postfix .html
#*                     Specify URL postfix for all objects in the queried database.
#*
#*   --split-author-names
#*                     Split author names in internal database representation at
#*                     semicolons (';') to produce RDF-parsable list of author names.
#*                     Each author in such list is marked with "author" property
#*                     instead of "authors", which is used for non-split value
#*                     (default).
#*   --no-split-author-names, --dont-split-author-names
#*                     Make no assumptions about internal database representation
#*                     of author list.
#*
#*   --xml-output
#*                     Output RDF in XML carrier format (default).
#*   --n3-output
#*                     Output RDF in N3 (Notation3) carrier format.
#*   --turtle-output
#*                     Output RDF in Turtle carrier format.
#*   --ntriples-output, --n-triples-output
#*                     Output RDF in N-Triples carrier format.
#*
#*   --replace-utf-codepoints-from
#*                     Replace Unicode code points starting at specified
#*                     point with carrier format-specific entities.
#*   --no-replace-utf-codepoints, --dont-replace-utf-code-points
#*                     Leave Unicode code points unescaped (default).
#*
#*   --html-utf-escapes
#*                     Escape Unicode code points using HTML hexadecimal
#*                     format corresponding to '&#x%04X;' formatted string
#*                     (default).
#*   --internal-utf-escapes
#*                     Escape Unicode code points using internal format
#*                     corresponding to '\\u%04X' formatted string.
#*   --utf-escape-format
#*                     Escape Unicode code points using format
#*                     corresponding to the provided formatted string.
#*
#*   --no-header
#*                     Skip header of corresponding format. May be useful
#*                     for generation of catenable output.
#*   --no-footer
#*                     Skip footer of corresponding format. May be useful
#*                     for generation of catenable output.
#*
#*   --only-header
#*                     Print header of corresponding format and exit.
#*   --only-footer
#*                     Print footer of corresponding format and exit.
#*
#*   --continue-on-errors
#*                     Do not terminate script if errors are raised.
#*   --die-on-errors,
#*   --no-continue-on-errors,
#*   --do-not-continue-on-errors,
#*   --dont-continue-on-errors,
#*   --exit-on-errors
#*                     Terminate script immediately if errors are raised (default).
#*   --continue-on-warnings
#*                     Do not terminate script if warnings are raised (default).
#*   --die-on-warnings
#*                     Terminate script immediately if warnings are raised.
#*   --continue-on-notes
#*                     Do not terminate script if notes are raised (default).
#*   --die-on-notes
#*                     Terminate script immediately if notes are raised.
#*
#*   --use-perl-parser
#*                     Use the Perl parser for parsing CIFs.
#*   --use-c-parser
#*                     Use the faster C parser for parsing CIFs (default).
#*
#*   --help, --usage
#*                     Output a short usage message (this message) and exit.
#*   --version
#*                     Output version information and exit.
#**
# Handler factories for the option table below.  Every call returns a new
# closure that performs the described assignment when it is invoked.

# Assign $value to key $key of %cif2cod_options.
my $set_cif2cod = sub {
    my ( $key, $value ) = @_;
    return sub { $cif2cod_options{$key} = $value };
};

# Assign $value to key $key of the hash referenced by $rdf_options.
my $set_rdf = sub {
    my ( $key, $value ) = @_;
    return sub { $rdf_options->{$key} = $value };
};

# Assign $value to the scalar referenced by $target.
my $set_scalar = sub {
    my ( $target, $value ) = @_;
    return sub { ${$target} = $value };
};

# Assign the same $value to all three 'die on ...' flags.
my $set_all_die_flags = sub {
    my ( $value ) = @_;
    return sub {
        $die_on_errors   = $value;
        $die_on_warnings = $value;
        $die_on_notes    = $value;
    };
};

# Set the header/footer printing flags and request that input be ignored.
my $print_only = sub {
    my ( $header, $footer ) = @_;
    return sub {
        $rdf_options->{'print_header'} = $header;
        $rdf_options->{'print_footer'} = $footer;
        $ignore_input = 1;
    };
};

@ARGV = getOptions(
    '-C,--cod-number' => \$cif2cod_options{cod_number},

    '--use-attached-hydrogens'      => $set_cif2cod->( 'use_attached_hydrogens', 1 ),
    '--dont-use-attached-hydrogens' => $set_cif2cod->( 'use_attached_hydrogens', 0 ),
    '--no-use-attached-hydrogens'   => $set_cif2cod->( 'use_attached_hydrogens', 0 ),
    '--ignore-attached-hydrogens'   => $set_cif2cod->( 'use_attached_hydrogens', 0 ),

    '--require-only-doi'          => $set_cif2cod->( 'require_only_doi', 1 ),
    '--require-full-bibliography' => $set_cif2cod->( 'require_only_doi', 0 ),

    '--reformat-space-group'      => $set_cif2cod->( 'reformat_space_group', 1 ),
    '--no-reformat-space-group'   => $set_cif2cod->( 'reformat_space_group', 0 ),
    '--dont-reformat-space-group' => $set_cif2cod->( 'reformat_space_group', 0 ),
    '--leave-space-group'         => $set_cif2cod->( 'reformat_space_group', 0 ),

    '--columns' => \$user_columns,

    '--split-author-names'      => $set_rdf->( 'split_author_names', 1 ),
    '--no-split-author-names'   => $set_rdf->( 'split_author_names', 0 ),
    '--dont-split-author-names' => $set_rdf->( 'split_author_names', 0 ),

    '--vocabulary-namespace'     => \$rdf_options->{'vocabulary_name'},
    '--vocabulary-url-prefix'    => \$rdf_options->{'vocabulary_url_prefix'},
    '--prefix,--db-url-prefix'   => \$rdf_options->{'url_prefix'},
    '--postfix,--db-url-postfix' => \$rdf_options->{'url_postfix'},

    '--xml-output'       => $set_scalar->( \$output_format, 'xml' ),
    '--n3-output'        => $set_scalar->( \$output_format, 'n3' ),
    '--turtle-output'    => $set_scalar->( \$output_format, 'turtle' ),
    '--ntriples-output'  => $set_scalar->( \$output_format, 'ntriples' ),
    '--n-triples-output' => $set_scalar->( \$output_format, 'ntriples' ),

    '--replace-utf-codepoints-from' =>
            \$rdf_options->{'replace_utf_code_points_from'},
    '--no-replace-utf-codepoints' =>
            $set_rdf->( 'replace_utf_code_points_from', undef ),
    '--dont-replace-utf-code-points' =>
            $set_rdf->( 'replace_utf_code_points_from', undef ),

    '--html-utf-escapes' =>
            $set_rdf->( 'utf_code_point_format', '&#x%04X;' ),
    '--internal-utf-escapes' =>
            $set_rdf->( 'utf_code_point_format', '\\u%04X' ),
    '--utf-escape-format' => \$rdf_options->{'utf_code_point_format'},

    '--no-header' => $set_rdf->( 'print_header', 0 ),
    '--no-footer' => $set_rdf->( 'print_footer', 0 ),

    '--only-header' => $print_only->( 1, 0 ),
    '--only-footer' => $print_only->( 0, 1 ),

    '--use-datablocks-without-coordinates' =>
            $set_cif2cod->( 'use_datablocks_without_coord', 1 ),
    '--use-all-datablocks' =>
            $set_cif2cod->( 'use_datablocks_without_coord', 1 ),

    '--do-not-use-datablocks-without-coordinates' =>
            $set_cif2cod->( 'use_datablocks_without_coord', 0 ),
    '--dont-use-datablocks-without-coordinates' =>
            $set_cif2cod->( 'use_datablocks_without_coord', 0 ),
    '--no-use-datablocks-without-coordinates' =>
            $set_cif2cod->( 'use_datablocks_without_coord', 0 ),
    '--skip-datablocks-without-coordinates' =>
            $set_cif2cod->( 'use_datablocks_without_coord', 0 ),

    '-c,--always-continue' => $set_all_die_flags->( 0 ),
    '-c-,--always-die'     => $set_all_die_flags->( 1 ),

    '--continue-on-errors'        => $set_scalar->( \$die_on_errors, 0 ),
    '--dont-continue-on-errors'   => $set_scalar->( \$die_on_errors, 1 ),
    '--die-on-errors'             => $set_scalar->( \$die_on_errors, 1 ),
    '--do-not-continue-on-errors' => $set_scalar->( \$die_on_errors, 1 ),
    '--no-continue-on-errors'     => $set_scalar->( \$die_on_errors, 1 ),
    '--exit-on-errors'            => $set_scalar->( \$die_on_errors, 1 ),

    '--continue-on-warnings' => $set_scalar->( \$die_on_warnings, 0 ),
    '--die-on-warnings'      => $set_scalar->( \$die_on_warnings, 1 ),

    '--continue-on-notes' => $set_scalar->( \$die_on_notes, 0 ),
    '--die-on-notes'      => $set_scalar->( \$die_on_notes, 1 ),

    '--use-perl-parser' => $set_scalar->( \$use_parser, 'perl' ),
    '--use-c-parser'    => $set_scalar->( \$use_parser, 'c' ),

    '--options'      => sub { options; exit },
    '--help,--usage' => sub { usage; exit },
    '--version'      => sub { print get_version_string(), "\n"; exit }
);

# Register the single vocabulary (namespace name => URL prefix) using the
# values that are in effect after option processing.
$rdf_options->{vocabularies} = {
    $rdf_options->{vocabulary_name} => $rdf_options->{vocabulary_url_prefix},
};

# Per-level termination flags in the form passed to the message
# processing subroutines.
my $die_on_error_level = {
    ERROR   => $die_on_errors,
    WARNING => $die_on_warnings,
    NOTE    => $die_on_notes
};

# Names of the data fields to keep: either the user-supplied list (split
# on single spaces or commas) or the default list provided by CIF2COD.
my @requested_field_names = defined $user_columns
    ? split( m/[ ,]/, $user_columns )
    : @COD::CIF::Data::CIF2COD::default_data_fields;

my %requested_data_fields;
@requested_data_fields{@requested_field_names} =
    (1) x @requested_field_names;

# Always include the 'file' field since it may be needed
# to construct the 'url' field value.
$requested_data_fields{'file'} = 1;

for my $stream ( \*STDOUT, \*STDERR ) {
    binmode( $stream, ':encoding(UTF-8)' );
}

if( $ignore_input ) {
    # Prints only header or footer, files are ignored
    @ARGV = ();
} elsif( !@ARGV ) {
    @ARGV = ( '-' );
}

# Records extracted from every data block of every input file, in the
# order in which they were encountered.
my @extracted;
for my $filename (@ARGV) {

    my $parser_options = { 'parser' => $use_parser, 'no_print' => 1 };

    # The second return value (the error count) is not used here: the
    # messages themselves are handed to process_parser_messages().
    my ( $data, undef, $messages ) = parse_cif( $filename, $parser_options );
    process_parser_messages( $messages, $die_on_error_level );

    # Nothing to extract if the parser returned no data structure.
    next if !defined $data;

    canonicalize_all_names( $data );

    foreach my $dataset (@{$data}) {

        my $dataname = 'data_' . $dataset->{name};

        # Report warnings raised while this data block is processed via
        # process_warnings(), tagged with the file and data block names.
        # A __WARN__ handler receives the message as its first argument;
        # '$_[0]' (rather than '@_') guarantees exactly one value for the
        # 'message' key of the hash constructor.
        local $SIG{__WARN__} = sub { process_warnings( {
                                       'message'       => $_[0],
                                       'program'       => $0,
                                       'filename'      => $filename,
                                       'add_pos'       => $dataname
                                     }, $die_on_error_level ) };

        my $extracted_dataset;
        eval {
            $extracted_dataset = cif2cod( $dataset, \%cif2cod_options );
        };
        if ($@) {
            process_errors( {
              'message'       => $@,
              'program'       => $0,
              'filename'      => $filename,
              'add_pos'       => $dataname
            }, $die_on_errors )
        }

        # No record was produced for this data block.
        next if !defined $extracted_dataset;

        # Remove fields that were not requested.
        delete @{$extracted_dataset}{
            grep { !exists $requested_data_fields{$_} }
                 keys %{$extracted_dataset}
        };

        # Forming URLs: a purely numeric 'file' value (at least two digits,
        # no leading zero) is turned into a database URL, anything else
        # falls back to a URL built from the input file name.  '\A' and
        # '\z' anchor the whole string, so a value with a trailing newline
        # is not accepted as a number.
        if( $extracted_dataset->{file} =~ m/\A[1-9][0-9]+\z/ ) {
            $extracted_dataset->{url} = $rdf_options->{url_prefix} .
                                        $extracted_dataset->{file} .
                                        $rdf_options->{url_postfix};
        } else {
            $extracted_dataset->{url} = "file://$filename";
        }

        # Removing unneeded fields
        delete @{$extracted_dataset}{qw( file text flags )};

        # Strip a leading and a trailing '-' (with adjacent whitespace)
        # from the formula fields that are present and defined.
        for my $field (qw( formula calcformula cellformula )) {
            next if !defined $extracted_dataset->{$field};
            $extracted_dataset->{$field} =~ s/^-\s*//;
            $extracted_dataset->{$field} =~ s/\s*-$//;
        }

        push @extracted, $extracted_dataset;
    }
}

# Serialisation subroutine for each supported output format; the 'n3' and
# 'turtle' formats share the same subroutine.
my %rdf_serialiser_for = (
    'xml'      => \&rdf_xml,
    'n3'       => \&rdf_n3,
    'turtle'   => \&rdf_n3,
    'ntriples' => \&rdf_ntriples,
);

# Nothing is printed for a format that is not in the table.
if( exists $rdf_serialiser_for{$output_format} ) {
    my $serialise = $rdf_serialiser_for{$output_format};
    my @rdf_text  = $serialise->( \@extracted, $rdf_options );
    print @rdf_text;
}