File: cif_molecule

package info (click to toggle)
cod-tools 3.7.0%2Bdfsg-1
links: PTS, VCS
area: main
in suites: bookworm
size: 154,792 kB
sloc: perl: 57,588; sh: 36,842; ansic: 6,402; xml: 1,982; yacc: 1,117; makefile: 727; python: 166
file content (2834 lines) | stat: -rwxr-xr-x 109,778 bytes
#! /bin/sh
#!perl -w # --*- Perl -*--
eval 'exec perl -x $0 ${1+"$@"}'
    if 0;
#------------------------------------------------------------------------------
#$Author: antanas $
#$Date: 2022-08-23 15:49:44 +0300 (Tue, 23 Aug 2022) $
#$Revision: 9388 $
#$URL: svn+ssh://www.crystallography.net/home/coder/svn-repositories/cod-tools/tags/v3.7.0/scripts/cif_molecule $
#------------------------------------------------------------------------------
#*
#* Restore molecules from a CIF file.
#*
#* USAGE:
#*    $0 --options input1.cif input*.cif
#**

# Note: this script assumes that atoms have unique labels in the input
# CIF file; most often these are labels given by the _atom_site_label
# tag. If the assumption of uniqueness does not hold, the script
# attempts by default to create unique labels itself, appending numeric
# prefixes to the duplicate labels.
#
# The uniqueness of the labels is assumed in checks for atoms at
# special positions, and most importantly in the code removing
# duplicate molecules.
#
# Although there is an option to switch off this diversification of
# labels, the algorithms employed in this script will most probably
# break and give incorrect results (e.g. some atoms, namely ones with
# duplicate labels, will be missing from the output). Thus, use option
# '--dont-uniquify-atoms' with caution.
#
# Atom identification.
# Atoms will be identified within this program using three components:
#
# a) the original label, as found in the input CIF (the "site_label",
# taken from the _atom_site_label data item). This label must be
# unique; it it is not, it will be uniquified by adding a serial
# number upon reading in;
#
# b) a rotation operator (unity operator if no rotation is applied);
# upon any rotation or when atoms are read in, their fractional
# coordinates are truncated modulo 1, i.e. moved to the first octant
# [0..1)x[0..1)x[0..1).
#
# c) a translation vector from the first octant to the actual atom
# position; translation names will use IUCr convention shift +5 (555
# is 0,0,0 translation). For larger translations, ":" character
# separator will be used, e.g. 10:5:-11.
#
# These three components, concatenated with underscores ("_"), will be
# used as unique atom names (the "name" key in the $atom_info hash).

use strict;
use File::Basename qw( basename );
use Clone qw( clone );
use COD::Algebra qw( gcd );
use COD::Algebra::Vector qw( distance vector_sub );
use COD::AtomBricks qw( build_bricks get_atom_index get_search_span );
use COD::AtomNeighbours qw(
    get_max_covalent_radius
    get_max_vdw_radius
    make_neighbour_list
);
use COD::AtomProperties;
use COD::CIF::Data qw( get_cell
                       get_space_group_number
                       get_symmetry_operators );
use COD::CIF::Data::AtomList qw( atom_array_from_cif
                                 atom_groups
                                 atoms_are_alternative
                                 datablock_from_atom_array
                                 generate_cod_molecule_data_block
                                 dump_atoms_as_cif );
use COD::CIF::Data::SymmetryGenerator qw( apply_shifts
                                          atoms_coincide
                                          chemical_formula_sum
                                          symop_apply
                                          symops_apply_modulo1
                                          test_bond
                                          test_bump
                                          translate_atom
                                          translation
                                          trim_polymer );
use COD::CIF::Parser qw( parse_cif );
use COD::CIF::Tags::CanonicalNames qw( canonicalize_all_names );
use COD::CIF::Tags::Manage qw( contains_data_item
                               exclude_tag
                               rename_tags
                               set_loop_tag
                               set_tag );
use COD::CIF::Tags::Merge qw( merge_datablocks );
use COD::CIF::Tags::Print qw( print_cif );
use COD::ErrorHandler qw( process_errors process_warnings
                          process_parser_messages report_message );
use COD::MorganFingerprints qw( make_morgan_fingerprint );
use COD::Spacegroups::Builder;
use COD::Spacegroups::SimpleBuilder;
use COD::Spacegroups::Lookup qw( make_symop_hash );
use COD::Spacegroups::Lookup::COD;
use COD::Spacegroups::Symop::Algebra qw( symop_mul
                                         symop_invert
                                         symop_is_unity
                                         symop_vector_mul );
use COD::Spacegroups::Symop::Parse qw( symop_from_string
                                       string_from_symop
                                       symop_string_canonical_form
                                       modulo_1 );
use COD::SOptions qw( getOptions );
use COD::SUsage qw( usage options );
use COD::ToolsVersion qw( get_version_string );
## use COD::Algebra::GaussJordan qw( gj_elimination_non_zero_elements );
use COD::Algebra::GaussJordanBigRat qw( gj_elimination_non_zero_elements );

no warnings 'recursion';

my $Id = '$Id: cif_molecule 9388 2022-08-23 12:49:44Z antanas $';

my $debug;
my $symdebug;
my $verbose = 0;
my $total_nbumps = 0;

my $sort_molecules = 1; # A flag indicating whether molecules should
                        # be sorted in the output (descending by atom
                        # number)

my $dump_atoms = 0;
my $format = "%8.6f";
my $continue_on_errors = 0;
my $covalent_sensitivity = 0.35;
my $audit = 1;
my $uniquify_atoms = 1;
my $exclude_zero_occupancies = 1; # Do not use atoms with zero occupancies
my $exclude_dummy_atoms = 1;      # Do not use atoms with the 'dum' calc flag

my $force_unit_occupancies = 0; # Forcibly set occupancies to 1.0.

# A fraction of covalent bond radii used to determine when atoms are
# too close and are considered a bump:

my $bump_distance_factor = 0.75;

# A fraction of vdW radii used to determine when atoms are too close
# and are considered as overlapping; used, for instance, to determine
# whether an atom group that is disordered around special position is
# mapped onto itself by a symmetry operator:

my $vdw_distance_factor = 1.2;


my $ignore_bumps = 0; # detect and warn about close atom "bumps"
                      # but do not stop processing.

# A span, in +/- unit cells, in which polymeric molecules (repeating
# units) will be constructed:

my $max_polymer_span = 4;

# A maximum allowed count of polymer example atoms: more than this
# amount of symmetry (translational) equivalent atoms, for each AU
# atom, will not be written to the output file:

my $max_polymer_atoms = 100;

my $cif_header_file; # Comments from the beginning of this file will be
                     # prepended to the output.

my $use_parser = "c"; # Used CIF parser

my $use_morgan_fingerprints = 0; # Use Morgan fingerprints to identify
                                 # duplicated moieties

my $use_atom_classes = 1; # Use COD AtomClassifier to sort atoms for
                          # generation of Morgan fingerprints

# Used for atom classification via AtomClassifier:
my $flat_planarity = 0.10;
my $classification_level = 3;
my $max_ring_size = 7; # maximum size of detected rings

my $use_one_output_datablock = 0; # Put all molecules, and all
                                  # disorder groups, into a single
                                  # data block in the output.

my $merge_disorder_groups = 0; # Put all alternative conformations
                               # into one data block.

my $preserve_stoichiometry = 0; # If true (1), apply symmetry
                                # operators from cosets of a point
                                # group in each molecule to all other
                                # molecules, to preserve molecular
                                # stoichiometry (charge balance,
                                # etc.).

my $largest_molecule_only = 0; # Output only the largest (having the
                               # greatest number of atoms) molecule.

my $output_geom_bond = 0; # Compute and output the _geom_bond_... data
                          # items (bond lengths, valencies, etc.)

my $expand_to_p1 = 0; # Do we want a full P1 unit cell that can be used
                      # to re-create the whole crystal using only the
                      # lattice translations?

# Random seed to be used for rand() function:

my $random_seed;

# If true, generates symmetry equivalent sites for disorder groups
# with negative indices.
my $use_special_position_disorder = 1;

my $special_position_operator_set = 0;

# The simpler and slower space group builder algorithm
# (COD::Spacegroup::SimpleBuilder) is mostly intended for debugging.
# Ideally, it should give results identical to the space group builder
# algorithm optimised for speed (COD::Spacegroup::SimpleBuilder). It
# is also expected that the optimised algorithm outperforms the simple one.
my $space_group_builder_type = 'optimised';
# 'optimised' => 'use COD::Spacegroups::Builder'
# 'simple'    => 'use COD::Spacegroups::SimpleBuilder'

my $die_on_errors   = 1;
my $die_on_warnings = 0;
my $die_on_notes    = 0;

my $machine_epsilon = get_machine_epsilon();

my %SYMOP_LOOKUP_HASH = make_symop_hash( [
                            \@COD::Spacegroups::Lookup::COD::table,
                            \@COD::Spacegroups::Lookup::COD::extra_settings
                        ] );

#* OPTIONS:
#*   --use-optimised-spacegroup-builder
#*                     Use the space group builder algorithm optimised
#*                     for speed  as implemented in the
#*                     COD::Spacegroups::Builder module (default).
#*   --use-simple-spacegroup-builder
#*                     Use the simpler and slower space group builder
#*                     algorithm as implemented in the
#*                     COD::Spacegroups::SimpleBuilder module.
#*
#*   -1, --one-datablock-output
#*                     Output all moieties to a single output data block.
#*
#*                     However, if the --split-disorder-groups option is
#*                     enabled all generated alternative conformations will
#*                     be put into separate data blocks starting with the
#*                     most likely one (disorder group occupancy wise) and
#*                     ending with the least likely one. In order to retrieve
#*                     only the most likely one, the --largest-molecule-only
#*                     option should be used in combination with the
#*                     --one-datablock-output option.
#*
#*   -1-, --multiple-datablocks-output
#*                     Separate each molecule and each example of an alternative
#*                     conformation into a separate data block (default).
#*
#*   -c, --covalent-sensitivity
#*                     Set a new covalent sensitivity value (default: 0.35).
#*
#*   -g, --geom-bond-output
#*                     Output _geom_bond_... data items (bond lengths,
#*                     valencies, etc.).
#*
#*   -g-, --no-geom-bond-output
#*                     Do not output _geom_bond_... information (default).
#*
#*   -h, --add-cif-header input_header.txt
#*                     Comments from the beginning of this file will be
#*                     prepended to the output.
#*
#*   -i, --ignore-bumps
#*                     Detect and warn about close atom "bumps" but do not
#*                     stop processing.
#*
#*   --dont-ignore-bumps, --no-ignore-bumps
#*                     Stop processing immediately if bumps are
#*                     detected (default).
#*
#*   -s, --sort-molecules
#*                     Sort molecules in descending order by their atom count
#*                     and overall occupancy before outputting them. Atom count
#*                     takes precedence over overall occupancy (default).
#*
#*   --dont-sort-molecules, --no-sort-molecules
#*                     Do not sort molecules, print them out in the order they
#*                     are detected.
#*
#*   --expand-to-P1, --P1-expand, --p1-expand
#*                     Expand all atoms to the P1 unit cell, so that the
#*                     translation operators can be used to restore the whole
#*                     crystal.
#*
#*   --dont-expand-to-P1, --no-expand-to-P1
#*   --dont-P1-expand, --no-p1-expand
#*                     Do not expand to P1, output only the minimal molecule
#*                     list (default).
#*
#*   --uniquify-atoms
#*                     Makes unique the labels of atoms (default).
#*
#*   --no-uniquify-atoms, --dont-uniquify-atoms
#*                     Do not makes unique labels for atoms,
#*                     exclude duplicates.
#*
#*   --use-morgan-fingerprints
#*                     Use Morgan fingerprints to identify and skip
#*                     duplicated moieties.
#*
#*   --no-use-morgan-fingerprints, --dont-use-morgan-fingerprints
#*                     Use atom labels to identify and skip duplicated
#*                     moieties. This method is default, however under
#*                     certain circumstances it leaves duplicate moieties,
#*                     as asymmetric unit can initially contain more than
#*                     one copy of a single moiety (default).
#*
#*   --use-atom-classes
#*                     Use COD atom classes, generated by AtomClassifier
#*                     module from 'atomclasses' repository, for the
#*                     generation of Morgan fingerprints. Requires the
#*                     external AtomClassifier module (default).
#*
#*   --no-use-atom-classes, --dont-use-atom-classes
#*                     Use atom chemical types for generation of Morgan
#*                     fingerprints instead of COD atom classes.
#*
#*   --bump-distance-factor 0.75
#*                     A fraction of covalent bond radii sum used to
#*                     determine when atoms are too close and are
#*                     considered a bump (default: 0.75).
#*
#*   --vdw-distance-factor 1.2
#*                     A factor for the vdW radii sum used to
#*                     determine when atoms are too close and are
#*                     considered a vdW clash (default: 1.2).
#*
#*   --continue-on-errors
#*                     Do not stop if errors such as unrecognised atoms are
#*                     encountered; the output may be incorrect and missing
#*                     some atoms if this option is used!
#*
#*   --dont-continue-on-errors, --no-continue-on-errors
#*                     Stop immediately when an error is encountered.
#*
#*   --exclude-zero-occupancies
#*                     Do not use atoms with 0 occupancies in calculations
#*                     (default).
#*
#*   --dont-exclude-zero-occupancies, --no-exclude-zero-occupancies
#*                     Use atoms with 0 occupancies in calculations.
#*
#*   --exclude-dummy-atoms
#*                     Do not use dummy atoms (marked by the 'dum' calc flag)
#*                     in calculations (default).
#*
#*   --dont-exclude-dummy-atoms, --no-exclude-dummy-atoms
#*                     Use dummy atoms (marked by the 'dum' calc flag)
#*                     in calculations. Dummy atoms can be used to mark
#*                     interesting positions within the unit cell
#*                     (e.g. geometric centers of coordinated atom rings),
#*                     but they are not considered as part of the molecule.
#*                     As a result, the occupancies of all output dummy atoms
#*                     are set to '.'. It should also be noted that dummy atoms
#*                     with non-numeric coordinates will still be excluded.
#*
#*   --preserve-stoichiometry
#*                     Apply necessary symmetry operators to preserve molecular
#*                     stoichiometry (charges, etc.).
#*
#*   --dont-preserve-stoichiometry, --no-preserve-stoichiometry
#*                     Do not apply any more symmetry operators than needed to
#*                     reconstruct covalently connected networks; may
#*                     break stoichiometry of salts and complexes (default).
#*
#*   --force-unit-occupancies
#*                     Set occupancies of all output atoms to 1.0. Unit
#*                     occupancies are only set when outputting the atoms
#*                     and do not affect the flow of the algorithm
#*                     (disorder group processing, molecule sorting, etc.).
#*                     Dummy atoms are excluded from the effects of this option
#*                     and are always output with the '.' occupancy.
#*
#*                     Some programs, notably Jumbo converter's cif2cml,
#*                     assume unresolved disorder and do not recognize
#*                     aromatic rings if occupancies are not unities.
#*                     Obviously, this flag has only sense in combination
#*                     with --split-disorder-groups.
#*
#*   --dont-force-unit-occupancies, --do-not-force-unit-occupancies,
#*   --no-force-unit-occupancies
#*                     Leave occupancies as they are (default).
#*
#*   --dump-atoms
#*                     Dump atoms (including symmetry-equivalent) in CIF
#*                     format for inspection with some graphics program.
#*
#*   --dont-dump-atoms, --no-dump-atoms
#*                     Do not dump atoms (default).
#*
#*   --max-polymer-span 4
#*                     A span, in +/- unit cells, in which polymeric
#*                     molecules (repeating units) will be constructed.
#*
#*   --max-polymer-atoms 100
#*                     A maximum allowed count of polymer example atoms:
#*                     more than this amount of symmetry (translational)
#*                     equivalent atoms, for each AU atom, will not be
#*                     written to the output:
#*
#*                     Using --max-polymer-span=0 --max-polymer-atoms=1
#*                     essentially switches off the polymer detection.
#*
#*   --split-disorder-groups, --dont-merge-disorder-groups
#*                     Put examples of disorder group conformations into
#*                     separate data blocks (default).
#*
#*   --merge-disorder-groups, --dont-split-disorder-groups
#*                     Put all disorder groups into one data block.
#*
#*   --use-special-disorder-symmetry
#*                     Generate symmetry equivalents for disorder groups
#*                     with negative indices (default).
#*
#*   --random-seed 123456
#*                     Use the provided seed to initialise the random
#*                     number generator. Use "" (empty string) as a seed
#*                     to revert back to the default seed.
#*
#*   --special-disorder-operator-set 0
#*   --special-disorder-operator-set random
#*                     Indicates which operator set to apply to atom groups
#*                     that are disordered around a special position. Can be
#*                     an integer (0, 1, 2, ...) or as special value "random",
#*                     in which case a random operator is selected for each
#*                     special position image.
#*
#*   --no-use-special-disorder-symmetry,
#*   --dont-use-special-disorder-symmetry,
#*   --do-not-use-special-disorder-symmetry,
#*                     Do not generate symmetry equivalents for disorder
#*                     groups with negative indices.
#*
#*   --largest, --largest-molecule-only
#*                     Output only the largest molecule. The largest molecule
#*                     is selected based on two criteria in the given order:
#*                     atom count and overall occupancy of the molecule.
#*                     When the combination of the --one-datablock-output and
#*                     --split-disorder-groups options is in effect the
#*                     molecule with the most likely disorder conformation
#*                     (occupancy wise) is returned.
#*
#*                     NOTE: if there is more than one disorder assembly
#*                     and the --split-disorder-groups option is in effect,
#*                     the conformation with the highest atom count might not
#*                     be generated at all. In this case, a molecule that best
#*                     fits the previously defined criteria out of the generated
#*                     conformation subset will be returned.
#*
#*   --all, --all-molecules
#*                     Output all molecules (default).
#*
#*   --use-perl-parser
#*   --use-c-parser
#*                     Specify parser to parse CIF files. C parser is default.
#*
#*   --symdebug
#*                     Print debug output for symmetry reconstruction.
#*   --no-symdebug
#*                     Do not print any symmetry debug output (default).
#*   --debug
#*                     Print some human-readable debug output.
#*   --no-debug
#*                     Suppress any debug output (default).
#*
#*   --format "%8.6f"
#*                     Use the specified format for output coordinate printout.
#*
#*   --audit
#*                     Print audit information to the generated CIF file (default).
#*   --no-audit
#*                     Do not print audit information to the generated CIF file.
#*
#*   --verbose
#*                     Print warning messages in long format.
#*   --no-verbose
#*                     Print warning messages in concise format (default).
#*
#*   --help, --usage
#*                     Output a short usage message (this message) and exit.
#*   --version
#*                     Output version information and exit.
#**
@ARGV = getOptions(
    '--use-simple-spacegroup-builder' =>
        sub { $space_group_builder_type = 'simple' },

    '--use-optimised-spacegroup-builder' =>
        sub { $space_group_builder_type = 'optimised' },

    "-1,--one-datablock-output" => sub { $use_one_output_datablock = 1; },
    "-1-,--multiple-datablocks-output" =>
        sub { $use_one_output_datablock = 0; },

    "--expand-to-P1,--P1-expand,--p1-expand" => sub { $expand_to_p1 = 1 },
    "--no-expand-to-P1,--no-P1-expand,--no-p1-expand" =>
        sub { $expand_to_p1 = 0 },
    "--dont-expand-to-P1,--dont-P1-expand,--dont-p1-expand" =>
        sub { $expand_to_p1 = 0 },
    "--do-not-expand-to-P1,--do-not-P1-expand,--do-not-p1-expand" =>
        sub { $expand_to_p1 = 0 },

    "--uniquify-atoms"      => sub { $uniquify_atoms = 1; },
    "--no-uniquify-atoms"   => sub { $uniquify_atoms = 0; },
    "--dont-uniquify-atoms" => sub { $uniquify_atoms = 0; },

    "--use-morgan-fingerprints" =>
        sub { $use_morgan_fingerprints = 1 },
    "--no-use-morgan-fingerprints" =>
        sub { $use_morgan_fingerprints = 0 },
    "--dont-use-morgan-fingerprints" =>
        sub { $use_morgan_fingerprints = 0 },

    "--use-atom-classes" => sub { $use_atom_classes = 1 },
    "--no-use-atom-classes" => sub { $use_atom_classes = 0 },
    "--dont-use-atom-classes" => sub { $use_atom_classes = 0 },

    "-c,--covalent-sensitivity" => \$covalent_sensitivity,

    "-g,--geom-bond-output"     => sub { $output_geom_bond = 1 },
    "-g-,--no-geom-bond-output" => sub { $output_geom_bond = 0 },

    "-h,--add-cif-header" => \$cif_header_file,

    "-i,--ignore-bumps"   => sub{ $ignore_bumps = 1 },
    "--no-ignore-bumps"   => sub{ $ignore_bumps = 0 },
    "--dont-ignore-bumps" => sub{ $ignore_bumps = 0 },

    "-s,--sort-molecules"   => sub{ $sort_molecules = 1 },
    "--no-sort-molecules"   => sub{ $sort_molecules = 0 },
    "--dont-sort-molecules" => sub{ $sort_molecules = 0 },

    "--exclude-zero-occupancies"    => sub { $exclude_zero_occupancies = 1; },
    "--no-exclude-zero-occupancies" => sub { $exclude_zero_occupancies = 0; },
    "--dont-exclude-zero-occupancies" => sub { $exclude_zero_occupancies = 0; },

    "--exclude-dummy-atoms"    => sub { $exclude_dummy_atoms = 1; },
    "--no-exclude-dummy-atoms" => sub { $exclude_dummy_atoms = 0; },
    "--dont-exclude-dummy-atoms" => sub { $exclude_dummy_atoms = 0; },

    "--preserve-stoichiometry" => sub { $preserve_stoichiometry = 1 },
    "--dont-preserve-stoichiometry, --no-preserve-stoichiometry" =>
        sub { $preserve_stoichiometry = 0 },

    "--bump-distance-factor" => \$bump_distance_factor,

    "--vdw-distance-factor" => \$vdw_distance_factor,

    "--max-polymer-span" => \$max_polymer_span,
    "--max-polymer-atoms" => \$max_polymer_atoms ,

    "--symdebug"    => sub { $symdebug = 1 },
    "--no-symdebug" => sub { $symdebug = 0 },

    "--debug"    => sub { $debug = 1 },
    "--no-debug" => sub { $debug = 0 },

    "--format" => \$format,

    "--force-unit-occupancies" => sub { $force_unit_occupancies = 1 },
    "--no-force-unit-occupancies" => sub { $force_unit_occupancies = 0 },
    "--dont-force-unit-occupancies" => sub { $force_unit_occupancies = 0 },
    "--do-not-force-unit-occupancies" => sub { $force_unit_occupancies = 0 },

    "--dump-atoms"      => sub{ $dump_atoms = 1 },
    "--dont-dump-atoms" => sub{ $dump_atoms = 0 },
    "--no-dump-atoms"   => sub{ $dump_atoms = 0 },

    "--split-disorder-groups,--dont-merge-disorder-groups," .
    "--do-not-merge-disorder-groups,--no-merge-disorder-groups"
        => sub { $merge_disorder_groups = 0 },
    "--merge-disorder-groups,--dont-split-disorder-groups" .
    "--do-not-split-disorder-groups,--no-split-disorder-groups"
        => sub { $merge_disorder_groups = 1 },

    "--random-seed" => \$random_seed,
    "--special-disorder-operator-set" => \$special_position_operator_set,

    "--use-special-disorder-symmetry"
        => sub { $use_special_position_disorder = 1 },
    "--no-use-special-disorder-symmetry," .
    "--dont-use-special-disorder-symmetry," .
    "--do-not-use-special-disorder-symmetry"
        => sub { $use_special_position_disorder = 0 },

    "--largest,--largest-molecule-only"
        => sub { $largest_molecule_only = 1 },
    "--all,--all-molecules"
        => sub { $largest_molecule_only = 0 },

    "--always-continue"                 => sub { $die_on_errors   = 0;
                                                 $die_on_warnings = 0;
                                                 $die_on_notes    = 0 },
    "-c-,--always-die"                  => sub { $die_on_errors   = 1;
                                                 $die_on_warnings = 1;
                                                 $die_on_notes    = 1 },

    "--continue-on-errors"          => sub { $die_on_errors = 0 },
    "--dont-continue-on-errors"     => sub { $die_on_errors = 1 },
    "--die-on-errors"               => sub { $die_on_errors = 1 },
    "--no-continue-on-errors"       => sub { $die_on_errors = 1 },

    "--continue-on-warnings" => sub { $die_on_warnings = 0 },
    "--die-on-warnings"      => sub { $die_on_warnings = 1 },

    "--continue-on-notes"    => sub { $die_on_notes = 0 },
    "--die-on-notes"         => sub { $die_on_notes = 1 },

    "--use-perl-parser"       => sub{ $use_parser = "perl" },
    "--use-c-parser"          => sub{ $use_parser = "c" },

    "--audit"                   => sub { $audit = 1; },
    "--no-audit"                => sub { $audit = 0; },

    "--verbose"                 => sub { $verbose = 1; },
    "--no-verbose"              => sub { $verbose = 0; },

    '--options'      => sub { options; exit },
    '--help,--usage' => sub { usage; exit },
    '--version'      => sub { print get_version_string(), "\n"; exit },

# The following options are left only for compatibility with historic
# version of the script:

# The '--remove-duplicate-molecules' is no longer necessary since the
# new algorithm (after changing order of molecule generation and
# disorder group representative generation) never produces duplicate
# molecules:

    "--remove-duplicate-molecules"      => sub { },
    "--no-remove-duplicate-molecules"   => sub { },
    "--dont-remove-duplicate-molecules" => sub { },
);

my $die_on_error_level = {
    ERROR   => $die_on_errors,
    WARNING => $die_on_warnings,
    NOTE    => $die_on_notes
};

# Initialise the Perl random number generator:

if( defined $random_seed && $random_seed ne "" ) {
    srand($random_seed);
}

# Covalent radii taken from Kitaigorodskij 1955, "Organicheskaja
# kristallochimija", p. 11.

#==============================================================================#
my %atom_radii = (
    "C" => [
        # bond order name, bond order, covalent radius in ångströms:
        [ "single",         1.0, 0.77 ],
        [ "one-and-a-half", 1.5, 0.70 ],
        [ "double",         2.0, 0.67 ],
        [ "triple",         3.0, 0.60 ],
    ],
    "Si" => [
        [ "single", 1.0, 1.17 ],
        [ "double", 2.0, 1.07 ],
        [ "triple", 3.0, 1.00 ],
    ],
    "Ge" => [
        [ "single", 1.0, 1.17 ],
        [ "double", 2.0, 1.07 ],
        [ "triple", 3.0, 1.00 ],
    ],
    "Sn" => [
        [ "single", 1.0, 1.22 ],
        [ "double", 2.0, 1.20 ],
    ],
    "O" => [
        [ "single", 1.0, 0.66 ],
        [ "double", 2.0, 0.55 ],
    ],
    "S" => [
        [ "single", 1.0, 1.04 ],
        [ "double", 2.0, 0.94 ],
    ],
    "Se" => [
        [ "single", 1.0, 1.17 ],
        [ "double", 2.0, 1.07 ],
    ],
    "Te" => [
        [ "single", 1.0, 1.37 ],
        [ "double", 2.0, 1.27 ],
    ],
    "B" => [
        [ "single", 1.0, 0.88 ],
        [ "double", 2.0, 0.76 ],
        [ "triple", 3.0, 0.68 ],
    ],
    "N" => [
        [ "single", 1.0, 0.70 ],
        [ "double", 2.0, 0.60 ],
        [ "triple", 3.0, 0.55 ],
    ],
    "P" => [
        [ "single", 1.0, 1.10 ],
        [ "double", 2.0, 1.00 ],
        [ "triple", 3.0, 0.93 ],
    ],
    "As" => [
        [ "single", 1.0, 1.21 ],
        [ "double", 2.0, 1.11 ],
    ],
    "Sb" => [
        [ "single", 1.0, 1.41 ],
        [ "double", 2.0, 1.31 ],
    ],
    "H" => [
        [ "single", 1.0, 0.30 ],
    ],
    "F" => [
        [ "single", 1.0, 0.64 ],
    ],
    "Cl" => [
        [ "single", 1.0, 1.00 ],
    ],
    "Br" => [
        [ "single", 1.0, 1.14 ],
    ],
    "I" => [
        [ "single", 1.0, 1.33 ],
    ],
    "Hg" => [
        [ "single", 1.0, 1.50 ],
    ],
);

#==============================================================================#
# Forward subroutine definitions:

sub symgen_atom( $$ );
sub symgen_all_atoms( $$$ );
sub find_molecules( $$$$$$ );
sub find_molecule( $$$$$$$$$ );

binmode STDOUT, ':encoding(UTF-8)';
binmode STDERR, ':encoding(UTF-8)';

my $cif_header;
eval {
    if( $cif_header_file ) {
        open( my $header, '<',"$cif_header_file" ) or die "ERROR, "
          . "could not open header file for input -- ". lcfirst($!) . "\n";

        $cif_header = "";
        while( <$header> ) {
            last unless /^#/;
            $cif_header .= $_;
        };

        close( $header ) or die "ERROR, "
           . "error while closing header file after reading -- "
           . lcfirst($!) . "\n";

        # The header must not contain CIF 2.0 magic code. For CIF 2.0
        # files the magic code is printed explicitly before the header.
        $cif_header =~ s/^#\\#CIF_2\.0[ \t]*\n//;
    }
};
if ($@) {
    process_errors( {
      'message'       => $@,
      'program'       => $0,
      'filename'      => $cif_header_file,
    }, $die_on_errors )
};

@ARGV = ("-") unless @ARGV;

# Choose an appropriate space group builder class as specified in the
# options:
sub make_spacegroup_builder
{
    my ($builder_type) = @_;

    return COD::Spacegroups::Builder->new
        if $builder_type eq 'optimised';
    return COD::Spacegroups::SimpleBuilder->new
        if $builder_type eq 'simple';
    die "unknown spacegroup builder type '$builder_type'" . "\n";
}

for my $filename (@ARGV) {

    my $options = { 'parser' => $use_parser, 'no_print' => 1 };
    my ( $data, $err_count, $messages ) = parse_cif( $filename, $options );
    process_parser_messages( $messages, $die_on_error_level );

    # Is this line necessary?
    # next if ( $err_count > 0 );

    if( !ref $data ||
        !@$data || !defined $data->[0] || !defined $data->[0]{name} ) {
        report_message( {
                'filename'  => $filename,
                'program'   => $0,
                'err_level' => 'WARNING',
                'message'   => 'file seems to be empty'
            }, $die_on_warnings );
        next;
    }

    canonicalize_all_names( $data );

    if( $cif_header ) {
        # Ensure that for CIF v2.0 the magic code comes
        # before the CIF comment header:
        if( grep { exists $_->{cifversion} &&
                          $_->{cifversion}{major} == 2 } @$data ) {
            print "#\\#CIF_2.0\n";
        }
        print $cif_header;
    }

    for my $dataset (@$data) {

        my $dataname = 'data_' . $dataset->{name};

        local $SIG{__WARN__} =  sub { process_warnings( {
                                       'message'       => @_,
                                       'program'       => $0,
                                       'filename'      => $filename,
                                       'add_pos'       => $dataname
                                     }, {
                                       WARNING => $die_on_warnings,
                                       NOTE    => $die_on_notes,
                                     } ) };

        my $values = $dataset->{values};
        my $sym_data;
        eval {
            # Extracts symmetry operators.
            # Raises warnings upon unrecognised symmetry information.
            # Raises die if unable to find symmetry information.
            $sym_data = get_symmetry_operators( $dataset );

            my $unity_operator_found = 0;
            for my $symop (@$sym_data) {
                if( symop_is_unity( symop_from_string( $symop ) ) ) {
                    $unity_operator_found = 1;
                    last;
                }
            }
            if( !$unity_operator_found ) {
                warn "WARNING, unity symmetry operation ('x,y,z') is not "
                   . "found in the symmetry operation list -- results may "
                   . "be incorrect\n";
            } elsif ( !symop_is_unity( symop_from_string( $sym_data->[0] ) ) ) {
                # TODO: the symmetry operation position is currently only
                # determined from the string position in a CIF loop.
                # Technically, the appropriate looped list key data item
                # (i.e. _space_group_symop_id) should also be examined
                warn "WARNING, unity symmetry operation ('x,y,z') is not "
                   . "the first symmetry operation in the symmetry operation "
                   . "list -- results may be incorrect\n";
            }
        };
        if ( $@ ) {
            process_errors( {
              'message'       => $@,
              'program'       => $0,
              'filename'      => $filename,
              'add_pos'       => $dataname
            }, $die_on_errors )
        }
        next if !defined $sym_data || !@{$sym_data};

        my $original_sg_number = get_space_group_number(
                                    $sym_data,
                                    \%SYMOP_LOOKUP_HASH,
                                    $dataset
                                 );

        my $unique_molecules;
        eval {
            $unique_molecules = get_molecules( $covalent_sensitivity,
                                               $sym_data,
                                               $dataset,
                                               \%COD::AtomProperties::atoms,
                                               $uniquify_atoms );
        };

        if ( $@ ) {
            process_errors( {
              'message'       => $@,
              'program'       => $0,
              'filename'      => $filename,
              'add_pos'       => $dataname
            }, $die_on_errors )
        }
        next if !defined $unique_molecules || !@{$unique_molecules};

        eval {
            if( $preserve_stoichiometry && $expand_to_p1 ) {
                warn "NOTE, option '--expand-to-P1' implies " .
                    "'--preserve-stoichiometry'" . "\n";
            }
            if( $preserve_stoichiometry && ! $expand_to_p1 ) {
                my $molecular_symmetry =
                    make_spacegroup_builder( $space_group_builder_type );

                print STDERR "Start building molecule symmetry groups:\n"
                    if $symdebug;

                foreach my $molecule (@$unique_molecules) {
                # Build molecule point group here...
                    my $sg =
                        make_spacegroup_builder( $space_group_builder_type );
                    print STDERR "\nProcessing molecule "
                        . "'$molecule->{chemical_formula_sum}':\n"
                        if( $symdebug );
                    my %original_atoms = ();
                    for my $atom (@{$molecule->{atoms}}) {
                        my $atom_label = $atom->{site_label};
                        if( exists $atom->{site_symops} ) {
                            $sg->insert_symops( $atom->{site_symops} );
                            do {
                                for (@{$atom->{site_symops}}) {
                                    print STDERR "<<<< inserting symop: ", string_from_symop($_),"\n";
                                }
                            } if $symdebug;
                        }
                        if( !exists $original_atoms{$atom_label} ) {
                            $original_atoms{$atom_label} = $atom;
                        } else {
                            my $symop1 = $original_atoms{$atom_label}{symop};
                            my $inverted_symop1 = symop_invert( $symop1 );
                            $sg->insert_symop( symop_mul( $atom->{symop},
                                                          $inverted_symop1 ));
                            do {
                                print( STDERR "<<<< inserting symop (inversion): ",
                                       string_from_symop(
                                           symop_mul(
                                               $atom->{symop},
                                               $inverted_symop1
                                           )
                                       ), "\n" );
                            } if $symdebug;
                        }
                    }
                    $molecule->{symmetry} = $sg;
                    $molecular_symmetry->insert_symops( $sg->all_symops_ref() );
                    if( $symdebug ) {
                        print STDERR "\nMolecule symmetry for molecule "
                            . "'$molecule->{chemical_formula_sum}':\n";
                        $sg->print( \*STDERR );
                        print STDERR "\nMolecule cluster symmetry after insertion:\n";
                        $molecular_symmetry->print( \*STDERR );
                    }
                }
                if( $symdebug ) {
                    print STDERR "\nMolecule cluster symmetry:\n";
                    $molecular_symmetry->print( \*STDERR );
                    print STDERR "\nFinished building molecule symmetry groups:\n";
                }

                my @stoichiometric_molecules;
                foreach my $molecule (@$unique_molecules) {
                    use COD::Spacegroups::Cosets qw( find_left_cosets
                                                     canonical_string_from_symop );

                    if( $symdebug ) {
                        print STDERR "\nMolecule cluster symmetry:\n";
                        $molecular_symmetry->print( \*STDERR );
                        print STDERR "\nMolecule symmetry for molecule "
                            . "'$molecule->{chemical_formula_sum}':\n";
                        $molecule->{symmetry}->print( \*STDERR );
                    }

                    my @cosets = find_left_cosets(
                        $molecular_symmetry->all_symops_ref(),
                        $molecule->{symmetry}->all_symops_ref()
                    );
                    if( $symdebug ) {
                        print STDERR "Cosets for '$molecule->{chemical_formula_sum}':\n";
                        ## serialiseRef( \@cosets, "", \*STDERR );
                        my $indent = "   ";
                        my $n = 1;
                        for my $coset (@cosets) {
                            print STDERR $indent, "Coset ", $n++, ": \n";
                            for my $symop (@$coset) {
                                print( STDERR $indent x 2,
                                       string_from_symop( $symop ), "\n" );
                            }
                        }
                    }
                    push( @stoichiometric_molecules, $molecule );
                    for my $coset (@cosets[1..$#cosets]) {
                        ## use COD::Serialise qw( serialiseRef ); serialiseRef( [@cosets[1..$#cosets]] );
                        my $symop = $coset->[0];
                        my $symop_key = canonical_string_from_symop( $symop );
                        my %additional_molecule = (
                            atoms =>
                                symop_apply_to_atoms( $molecule->{atoms},
                                                      $symop ),
                            chemical_formula_sum =>
                                $molecule->{chemical_formula_sum},
                            is_polymer => $molecule->{is_polymer},
                            polymer_dimension => $molecule->{polymer_dimension},
                            polymer_basis => $molecule->{polymer_basis},
                        );
                        push( @stoichiometric_molecules,
                              \%additional_molecule );
                        ## use COD::Serialise; serialiseRef( \%additional_molecule );
                    }
                }

                # Find molecular Z value:
                my %molecules;
                for my $molecule (@stoichiometric_molecules) {
                    my $molecule_key;
                    if( $use_morgan_fingerprints ) {
                        my $neighbours =
                            make_neighbour_list(
                                $molecule->{atoms},
                                $covalent_sensitivity,
                                $bump_distance_factor,
                                \%COD::AtomProperties::atoms,
                                1 );
                        $molecule_key =
                            make_morgan_fingerprint(
                                $neighbours,
                                $use_atom_classes,
                                $classification_level,
                                $max_ring_size,
                                $flat_planarity );
                    } else {
                        $molecule_key =
                            join( "\0", sort map {$_->{site_label}}
                                  @{$molecule->{atoms}} );
                    }
                    push( @{$molecules{$molecule_key}}, $molecule );
                }
                my $Z = gcd( map { int(@$_) } values %molecules );

                ## print STDERR ">>> Z = $Z\n";

                # Simplify molecular formula:

                if( $Z > 1 ) {
                    @stoichiometric_molecules = ();
                    for my $molecule_key (keys %molecules) {
                        my $N = int(@{$molecules{$molecule_key}});
                        for my $i (0 .. $N/$Z - 1) {
                            push( @stoichiometric_molecules,
                                  $molecules{$molecule_key}[$i] );
                        }
                    }
                }

                $unique_molecules = \@stoichiometric_molecules;
            }

            # Trim polymers
            for my $moiety (@$unique_molecules) {
                next if !$moiety->{is_polymer};
                $moiety->{atoms} = trim_polymer( $moiety->{atoms},
                                                 $max_polymer_span );
                ##print STDERR ">>>> trimmed: \$moiety_key = " .
                ##join( "\0", sort map {$_->{site_label}} @{$moiety->{atoms}} ) . "\n";
            }

            my $Z = 1;
            if( $use_one_output_datablock ) {
                my @all_atoms = map { @{$_->{atoms}} } @$unique_molecules;
                if( @all_atoms > 0 ) {
                    # Find molecular Z value, once more:
                    my %moieties;
                    for my $moiety (@$unique_molecules) {
                        my $moiety_key;
                        if( $use_morgan_fingerprints ) {
                            my $neighbours =
                                make_neighbour_list(
                                    $moiety->{atoms},
                                    $covalent_sensitivity,
                                    $bump_distance_factor,
                                    \%COD::AtomProperties::atoms,
                                    1 );
                            $moiety_key =
                                make_morgan_fingerprint(
                                    $neighbours,
                                    $use_atom_classes,
                                    $classification_level,
                                    $max_ring_size,
                                    $flat_planarity );
                        } else {
                            $moiety_key =
                                join( "\0", sort map {$_->{site_label}}
                                      @{$moiety->{atoms}} );
                            ##print STDERR ">>>> \$moiety_key = $moiety_key\n";
                        }
                        push( @{$moieties{$moiety_key}}, $moiety );
                    }

                    $Z = gcd( map { int(@$_) } values %moieties );
                }
            }

            # Merge all molecules to one if requested.
            if( $use_one_output_datablock ) {
                my @all_atoms = map { @{$_->{atoms}} } @$unique_molecules;
                if( @all_atoms > 0 ) {
                    my @all_bases;
                    for my $moiety (@$unique_molecules) {
                        next if !$moiety->{polymer_dimension};
                        push @all_bases,
                            basis_string_to_matrix( $moiety->{polymer_basis} );
                    }
                    # Once @all_bases matrix is full, deref components and
                    # calculate cumulative rank and basis.
                    my $m = [map { @{$_} } @all_bases];
                    my ( $rank, $basis ) = get_rank_and_basis( $m );
                    $unique_molecules = [{
                        atoms =>
                            \@all_atoms,
                        chemical_formula_sum =>
                            chemical_formula_sum( \@all_atoms, $Z ),
                        is_polymer => ((grep { $_->{is_polymer} == 1 }
                                               @$unique_molecules) > 0),
                        polymer_dimension => $rank,
                        polymer_basis => $basis
                    }];
                }
            }

            ## use COD::Serialise qw( serialiseRef ); serialiseRef( $unique_molecules );
            # Split init atoms into assemblies and groups, if requested.
            if( !$merge_disorder_groups ) {
                my @split_molecules;
                my $n = 1;
                for my $molecule (@$unique_molecules) {
                    ## print ">>> molecule No. ", $n++, "\n";
                    my $atom_list = $molecule->{atoms};
                    my $disorder_groups = atom_groups($atom_list);
                    ## print ">>> ngroups = ", int(@$disorder_groups), "\n";
                    ## use COD::Serialise qw( serialiseRef ); serialiseRef( $disorder_groups );
                    for my $disorder_representative (@$disorder_groups) {
                        push( @split_molecules,
                              {
                                  atoms =>
                                      $disorder_representative,
                                  chemical_formula_sum =>
                                      chemical_formula_sum
                                      ( $disorder_representative, $Z ),
                                  is_polymer => $molecule->{is_polymer},
                                  polymer_dimension =>
                                      $molecule->{polymer_dimension},
                                  polymer_basis => $molecule->{polymer_basis},
                              }
                        );
                    }
                }
                $unique_molecules = \@split_molecules;
            }

            # There is no need to sort the molecules if the single data block
            # output is required since:
            # a) there is only one molecule (no disorder);
            # b) there are several disorder configurations, but the
            #    best one (occupancy wise) is already at the beginning
            #    of the array
            if( !$use_one_output_datablock &&
                ( $sort_molecules || $largest_molecule_only ) ) {
                my @molecule_sum_occupancy;
                for (my $i = 0; $i < @{$unique_molecules}; $i++ ) {
                    $molecule_sum_occupancy[$i] = 0;
                    my $atoms = $unique_molecules->[$i]{'atoms'};
                    next if ( !defined $atoms->[0]{'atom_site_occupancy'} );
                    for my $atom (@{$atoms}) {
                        my $occupancy = (
                                      $atom->{'atom_site_occupancy'} eq '.' ||
                                      $atom->{'atom_site_occupancy'} eq '?' )
                                      ? 0 : $atom->{'atom_site_occupancy'};
                        $occupancy =~ s/[(][0-9]+[)]$//; # remove precision
                        $molecule_sum_occupancy[$i] += $occupancy;
                    }
                };

                my @sorted_indexes = sort {
                    @{$unique_molecules->[$b]{atoms}} <=>
                    @{$unique_molecules->[$a]{atoms} ||
                    $molecule_sum_occupancy[$b] <=>
                    $molecule_sum_occupancy[$a] }
                } 0..$#$unique_molecules;
                @{$unique_molecules} = @{$unique_molecules}[@sorted_indexes];
            }

            my $molecule_id = 0;
            my $dataset_name = $dataset->{name};
            foreach my $molecule (@$unique_molecules) {
                my $id;
                unless( ($use_one_output_datablock &&
                         $merge_disorder_groups) ||
                         $largest_molecule_only ) {
                    $id = $molecule_id;
                } else {
                    $id = undef;
                }

                if( $output_geom_bond ) {
                    $molecule->{bonds} = atom_bonds( $molecule->{atoms},
                                                     \%COD::AtomProperties::atoms,
                                                     $covalent_sensitivity );
                }

                print_molecule( $id, $audit, $molecule, $Id,
                                $dataset, $dataset_name, $filename,
                                $sym_data, $Z, $original_sg_number );

                last if $largest_molecule_only;

                $molecule_id++;
            }
        };

        if ( $@ ) {
            process_errors( {
              'message'       => $@,
              'program'       => $0,
              'filename'      => $filename,
              'add_pos'       => $dataname
            }, $die_on_errors )
        }
    }
}

#==============================================================================#
# Check whether an atom belongs to a group which is disordered around
# a special position. The $atom is a reference to a hash returned by
# atom_array_from_cif() subroutine.

sub is_disordered_around_special_position($)
{
    my ($atom) = @_;

    # "A minus prefix (e.g. "-1") is used to indicate sites disordered
    # about a special position"
    # (https://www.iucr.org/__data/iucr/cifdic_html/1/cif_core.dic/Iatom_site_disorder_group.html,
    # 2019-10-22):
    if( $atom &&  exists $atom->{group} &&
        $atom->{group} =~ /^-/ ) {
        return 1
    } else {
        return 0;
    }
}

#==============================================================================#
# Calculate distance between two atoms. The atoms are represented as
# references to a hash returned by atom_array_from_cif()
# subroutine. These hashes MUST contain 'coordinates_ortho' field
# (with Cartesian atom coordinates.

sub atom_distance($$)
{
    my ($atom1, $atom2) = @_;

    return
        distance(
            $atom1->{coordinates_ortho},
            $atom2->{coordinates_ortho},
        );
}

#==============================================================================#
# Test if two atoms are too close (i.e. if the "bump").

sub atoms_bump($$$$)
{
    my ($atom1, $atom2, $atom_properties, $distance_factor) = @_;

    my $distance = atom_distance( $atom1, $atom2 );

    do {
        local $, = " ";
        local $\ = "\n";
        print STDERR ">>>> checking bump: ",
        $atom1->{chemical_type}, $atom2->{chemical_type},
        $atom1->{site_label}, $atom2->{site_label}, $distance,
        $distance_factor;
    } if 0;

    return
        test_bump(
            $atom_properties,
            $atom1->{chemical_type},
            $atom2->{chemical_type},
            $atom1->{site_label},
            $atom2->{site_label},
            $distance,
            $distance_factor,
            "vdw_radius"
        );
}

#==============================================================================#
# Find all atoms sets that are disordered around a special position;
# determine symmetry (sub)group of each such special position and left
# cosets of each such symmetry group. Store references to space group
# operators, coset operator lists and unique operators needed to by
# applied to disordered atoms into each atom's record (hash).

sub determine_disordered_set_symmetry($$$$)
{
    my ( $atom_list, $symmetry_operators, $atom_properties,
         $distance_factor ) = @_;

    my %special_disorder_groups;

    for my $atom (@$atom_list) {
        if( is_disordered_around_special_position( $atom ) ) {
            my $disorder_group_key = $atom->{group};
            push( @{$special_disorder_groups{$disorder_group_key}},
                  $atom );
        }
    }

    for my $group_key (sort keys %special_disorder_groups) {
        my @group_atoms = @{$special_disorder_groups{$group_key}};
        my $unity_symop =
            [ [ 1, 0, 0, 0 ],
              [ 0, 1, 0, 0 ],
              [ 0, 0, 1, 0 ],
              [ 0, 0, 0, 1 ] ];
        my $group_symmetric_atoms =
            apply_shifts(
                symgen_all_atoms( \@group_atoms, [ $unity_symop ],
                                  { print_errors => 0 } )
            );

        my $max_vdw_radius = get_max_vdw_radius( $atom_properties );

        my $bricks = build_bricks( $group_symmetric_atoms,
                                   $max_vdw_radius * 2.5 );

        do {
            local $\ = "\n";
            local $, = " ";
            print STDERR int(@$group_symmetric_atoms);
            print STDERR "\$group_symmetric_atoms";
            for my $atom (@$group_symmetric_atoms) {
                print( STDERR substr($atom->{site_label}, 0, 1),
                       @{$atom->{coordinates_ortho}} );
            }
        } if 0;

        my @symgroup_generators;
        for my $symop (@$symmetry_operators) {
            do {
                local $\ = "\n";
                print STDERR int(@group_atoms);
                print STDERR "Group label ", $group_key;
            } if 0;
            for my $atom (@group_atoms) {
                my $symm_atom = symop_apply( $atom, $symop, {modulo_1 => 1} );

                do {
                    local $, = " ";
                    local $\ = "\n";
                    print STDERR substr($symm_atom->{site_label}, 0, 1),
                        @{$symm_atom->{coordinates_ortho}};
                } if 0;

                # Performed an optimised search of the neighbouring
                # atoms in "bricks":
                my $coordinates = $symm_atom->{coordinates_ortho};

                my ($i_init, $j_init, $k_init) =
                    get_atom_index( $bricks, @$coordinates );

                my ( $min_i, $max_i, $min_j, $max_j, $min_k, $max_k );

                eval {
                    ( $min_i, $max_i, $min_j, $max_j, $min_k, $max_k ) =
                        get_search_span( $bricks, $i_init, $j_init, $k_init );
                };
                if( $@ ) {
                    use COD::Serialise qw( serialiseRef );
                    serialiseRef( $coordinates );
                    serialiseRef( [ $i_init, $j_init, $k_init ] );
                    serialiseRef( $bricks );
                    die( $@ );
                }

                for my $i ($min_i .. $max_i) {
                for my $j ($min_j .. $max_j) {
                for my $k ($min_k .. $max_k) {
                    for my $other_atom ( @{$bricks->{atoms}[$i][$j][$k]} ) {
                        do {
                            print STDERR ">>> testing: ",
                                $symm_atom->{name}, " ",
                                $other_atom->{name}, " ",
                                "distance = ",
                                atom_distance($symm_atom, $other_atom ),
                                "\n";
                        } if 0;
                        if( atoms_bump( $symm_atom, $other_atom,
                                        $atom_properties, $distance_factor ) ) {
                            push( @symgroup_generators, $symop );
                            do {
                                print( STDERR ">>> bump: ",
                                       $symm_atom->{'site_label'}, " ",
                                       $other_atom->{'site_label'},
                                       "\n" );
                                print( STDERR ">>> pushing symop ",
                                       string_from_symop($symop), "\n" );
                            } if 0;
                        }
                    }
                }}}
            }
        }

        # Now the @symgroup_generators will contain, if any, those
        # symmetry operators that mapped the currently processed
        # disordered group onto itself. Let's build a subgroup
        # generated by those operators, it will the symmetry of the
        # site around which the atom group is disordered.
        if( @symgroup_generators ) {
            use COD::Spacegroups::Cosets qw( find_left_cosets );

            my $sg_builder =
                make_spacegroup_builder( $space_group_builder_type );

            $sg_builder->insert_symops( \@symgroup_generators );

            my $subgroup_operators = $sg_builder->all_symops_ref();

            do {
                local $" = "; ";
                local $\ = "\n";
                my @disorder_symops =
                    map { string_from_symop($_) } @$subgroup_operators;
                print STDERR ">>>> disorder symops: @disorder_symops";
            } if 0;

            my @cosets = find_left_cosets( $symmetry_operators,
                                           $subgroup_operators );

            my @permissible_operators;
            for my $coset (@cosets) {
                do {
                    local $" = "; ";
                    local $\ = "\n";
                    my @coset_symop_strings = map {string_from_symop($_)} @$coset;
                    print( STDERR ">>> coset ", ": ",
                           "@coset_symop_strings" );
                } if 0;
                for my $i (0..$#$coset) {
                    push( @{$permissible_operators[$i]}, $coset->[$i] );
                }
            }

            do {
                local $" = "; ";
                local $\ = "\n";
                for my $operators (@permissible_operators) {
                    my @operator_strings = map {string_from_symop($_)} @$operators;
                    print( STDERR ">>> operator set ", ": ",
                           "@operator_strings" );
                }
            } if 0;

            # Distribute the found operators into disorder group
            # atoms. Only those symmetry operators listed here should
            # be applied to the atoms that contain them:
            if( @permissible_operators ) {
                for my $atom (@group_atoms) {
                    die "permissible operators already defined!"
                        if exists $atom->{permissible_operators};
                    $atom->{permissible_operators} = \@permissible_operators;
                    $atom->{disorder_site_symmetry} = $subgroup_operators;
                }
            }
        } else {
            warn "WARNING, disorder group '$group_key' is not mapped " .
                "to itself by any non-unity symmetry operator\n";
        }
    }
}


#==============================================================================#
# This is the main function where other functions such as find_molecules are
# called.
# Accepts
#     covalent_sensitivity - a threshold for covalent sensitivity
#     filename             - CIF file name
#     sym_data             - symmetric data from the CIF file
#     atom_site_tag        - atom site label or atom site type symbol from the
#                            CIF file
#     values               - a hash where a data from the CIF file is stored
#
# Returns
#     unique_molecules     - an array of hashes
#                     %molecule = (
#                         atoms=>[\%atom_info1, \%atom_info2], #covalent bond
#                         chemical_formula_sum=>"C6 H6",
#                                 );

sub get_molecules
{
    my $covalent_sensitivity = shift;
    my $sym_data             = shift;
    my $dataset              = shift;
    my $atom_properties      = shift;
    my $uniquify_atoms       = shift;

    my $values = $dataset->{values};

    # Parse symmetry operators:
    my @sym_operators = map { symop_from_string($_) } @{$sym_data};

    # Create a list of symmetry operators:
    my $symop_list = { symops => [ map { symop_from_string($_) } @$sym_data ],
                       symop_ids => {} };
    for (my $i = 0; $i < @{$sym_data}; $i++)
    {
        $symop_list->{symop_ids}
                     {symop_string_canonical_form($sym_data->[$i])} = $i;
    }

    my $cif_atom_list_options = {
        uniquify_atom_names => 1,
        uniquify_atoms => $uniquify_atoms,
        exclude_dummy_atoms => $exclude_dummy_atoms,
        exclude_dummy_coordinates => 1,
        exclude_unknown_coordinates => 1,
        symop_list => $symop_list,
        modulo_1 => 1,
        atom_properties => $atom_properties,
        continue_on_errors => !$die_on_errors
    };

    # Extract atoms fract coordinates
    my $atom_list = atom_array_from_cif( $dataset, $cif_atom_list_options );
    return [] unless defined $atom_list;

    # atoms with zero occupancies are not initially filtered in the
    # 'atom_array_from_cif' subroutine due to some dummy atoms
    # potentially containing zero or equivalent ('.', '?') occupancies
    if ( $exclude_zero_occupancies ) {
        my @filtered_atom_list;
        for my $atom ( @$atom_list ) {
            my $has_zero_occupancy = 0;
            if ( exists $atom->{'atom_site_occupancy'} ) {
                if ( $atom->{'atom_site_occupancy'} eq '?' ||
                     $atom->{'atom_site_occupancy'} eq '.' ) {
                    $has_zero_occupancy = 1;
                } else {
                    my $occupancy = $atom->{'atom_site_occupancy'};
                    $occupancy =~ s/[(][0-9]+[)]$//; # remove precision
                    if ( $occupancy == 0.0 ) {
                        $has_zero_occupancy = 1;
                    }
                }
            }

            next if ( $has_zero_occupancy &&
                      ( !exists $atom->{'calc_flag'} ||
                        $atom->{'calc_flag'} ne 'dum' ) );

            push @filtered_atom_list, $atom;
        }
        $atom_list = \@filtered_atom_list;
    }

    if( !@$atom_list ) {
        warn "WARNING, no atoms suitable for processing were found -- "
           . "maybe all occupancies were unknown, zero, or "
           . "all atom types were unrecognised\n";
            return [];
    }

    my $max_covalent_radius = get_max_covalent_radius( $atom_properties );

    my @unique_molecules;
    my %seen_molecules;

    # If there are atom sets that are disordered around a special
    # position, determine their symmetry subgroups in the space group
    # and their symmetry subgroup cosets:

    determine_disordered_set_symmetry( $atom_list, \@sym_operators,
                                       $atom_properties,
                                       $vdw_distance_factor );

    # Apply necessary symmetry operators to all atoms. For atom sets
    # that are disordered around a special position, only symmetry
    # operators from atom set symmetry group cosets are applied, one
    # symop from each coset:
    my %initial_atom_names = map { $_->{name} => 1 } @$atom_list;

    my $unit_cell_atoms = symgen_all_atoms( $atom_list, \@sym_operators,
                                            {
                                                print_errors => 1,
                                                initial_atom_names =>
                                                    \%initial_atom_names
                                            } );

    my $symmetric_atoms = apply_shifts( $unit_cell_atoms );

    my @initial_atoms;
    if( $expand_to_p1 ) {
        @initial_atoms = @$unit_cell_atoms;
    } else {
        do {
            local $" = ", ";
            local $\ = "\n";
            my @atom_names = sort keys %initial_atom_names;
            print STDERR ">>> atom names: @atom_names";
        } if 0;
        foreach my $symmetric_atom ( @$symmetric_atoms ) {
            do {
                local $\ = "\n";
                print STDERR ">>> checking atom: $symmetric_atom->{name}";
            } if 0;
            push( @initial_atoms, $symmetric_atom )
                if exists $initial_atom_names{$symmetric_atom->{name}};
        }
    }

    if( $dump_atoms ) {
        dump_atoms_as_cif( 1, \@initial_atoms,
                           [ get_cell( $values ) ] );
    } else {

        my $bricks = build_bricks( $symmetric_atoms,
                                   $max_covalent_radius * 2 +
                                   $covalent_sensitivity );

        # Finds molecules
        my @current_ordered_molecules = find_molecules( $covalent_sensitivity,
                                                        $atom_properties,
                                                        $symmetric_atoms,
                                                        \@initial_atoms,
                                                        $bricks,
                                                        \%seen_molecules );

        push( @unique_molecules, @current_ordered_molecules );
    }

    # Calculates chemical formula sum
    foreach my $molecule (@unique_molecules) {
        $molecule->{chemical_formula_sum} =
            chemical_formula_sum( $molecule->{atoms} );
    }

    return \@unique_molecules;
}

#===============================================================#
# Applies symmetry operator to all atoms in a given list.
#
# The symop_apply_to_atoms subroutine accepts a reference to an array
# of hash references:
#
# $atom_list = [
#                 {
#                    site_label=>"C1",
#                    name=>"C1_2",
#                    chemical_type=>"C",
#                    coordinates_fract=>[1.0, 1.0, 1.0],
#                    unity_matrix_applied=>1
#                 }, # $atom_info hash
#                 $atom2_info,
#                 $atom3_info,
#                 $atom4_info
#              ]
#
# and a reference to an array - symmetry operator:
#
# my $symop = [
#     [ r11 r12 r13 t1 ]
#     [ r21 r22 r23 t1 ]
#     [ r31 r32 r33 t1 ]
#     [   0   0   0  1 ]
# ],
#
# Returns an list of the above-mentioned atom_info hashes.

sub symop_apply_to_atoms
{
    my($atom_list, $symop) = @_;

    my @sym_atoms = ();
    for my $atom (@$atom_list) {
        push( @sym_atoms,
            symop_apply( $atom, $symop,
                         { append_symop_to_label => $expand_to_p1 } ) );
    }

    return \@sym_atoms;
}

#===============================================================#
# Generate symmetry equivalents of an atom, exclude duplicates
# on special positions

sub symgen_atom($$)
{
    my ( $atom, $sym_operators ) = @_;

    my( $sym_atoms ) = symops_apply_modulo1( $atom, $sym_operators,
                                             { append_symop_to_label =>
                                               $expand_to_p1,
                                               use_special_position_disorder =>
                                               $use_special_position_disorder } );

    if( $sym_atoms &&
        ( !@{$sym_atoms} ||
          $sym_atoms->[0]{multiplicity_ratio} == 1 )) {
        return @$sym_atoms;
    } else {
        my @unique_atoms;
        my %to_be_deleted;
        for my $i (0..$#$sym_atoms-1) {
            for my $j ($i+1..$#$sym_atoms) {
                if( atoms_coincide( $sym_atoms->[$i],
                                    $sym_atoms->[$j],
                                    $sym_atoms->[$i]{f2o} )) {
                    $to_be_deleted{$sym_atoms->[$j]{name}} = 1;
                }
            }
        }
        for my $atom (@$sym_atoms) {
            if( !defined $to_be_deleted{$atom->{name}} ) {
                push( @unique_atoms, $atom );
            }
        }
        return @unique_atoms;
    }
}

#===============================================================#
# Generate symmetry equivalents of all atoms from a list, exclude
# duplicates on special positions. Check the multiplicity values
# provided in the original file.

sub symgen_all_atoms($$$)
{
    my ( $atoms, $sym_operators, $options ) = @_;

    my $print_errors = 1
        if $options && $options->{print_errors};

    my $initial_atom_names = $options->{initial_atom_names}
        if exists $options->{initial_atom_names};

    my @sym_atoms = ();

    my %disorder_group_operators;

    for my $atom (@{$atoms}) {
        my $atom_symops;
        if( exists $atom->{permissible_operators} ) {
            my $operator_set_index;
            if( $special_position_operator_set eq "random" ) {
                my $disorder_group = $atom->{group};
                if( exists $disorder_group_operators{$disorder_group} ) {
                    $atom_symops =
                        $disorder_group_operators{$disorder_group}
                } else {
                    # Pre-multiply the symmetry operator with a randomly
                    # selected operator that maps a disordered group into
                    # itself:
                    my @randomised_symops;
                    my $site_symops = $atom->{disorder_site_symmetry};
                    my $nsymops = int(@$site_symops);

                    for my $permissible_symop (
                        @{$atom->{permissible_operators}[0]}
                        ) {
                        my $symop_index = int(rand($nsymops));
                        my $site_symop = $site_symops->[$symop_index];
                        do {
                            use Data::Dumper;
                            print STDERR Dumper( $site_symop, $permissible_symop );
                        } if 0;
                        push( @randomised_symops,
                              symop_mul( $permissible_symop, $site_symop )
                            );
                    }
                    $disorder_group_operators{$disorder_group} =
                        \@randomised_symops;
                    $atom_symops = \@randomised_symops;
                }
            } else {
                $operator_set_index =
                    $special_position_operator_set %
                    int(@{$atom->{permissible_operators}});
                $atom_symops =
                    $atom->{permissible_operators}[$operator_set_index];
            }

        } else {
            $atom_symops = $sym_operators;
        }
        die unless defined $atom_symops;
        # If symgen_all_atoms() is called by the code that needs all
        # symmetry equivalents in the cell to reconstruct molecules,
        # it passes a hash of all initial atoms names as an optional
        # parameter. Molecules are reconstructed by the caller of
        # this function starting from atoms that have their names
        # listed in the $initial_atom_names = {} hash. Normally, they
        # are just _atom_site_label values from the CIF since, when a
        # unity operator is applied, the atom name is not changed by
        # symgen_atom(). However, if an atom belongs to a group that is
        # disordered around a special position, and if we choose to
        # apply some of the disorder site operators to this atom, the
        # unity operator will never be used for such atom. In this
        # case, its name in the initial atom list must be replaced so
        # that the molecule reconstruction code finds them:
        if( $initial_atom_names &&
            !symop_is_unity( $atom_symops->[0] ) ) {
            my $symop_string = canonical_string_from_symop( $atom_symops->[0] );
            # FIXME: the code that generates the $symop_id value MUST
            # be exactly the same as in symop_register_applied_symop()
            # of the SymmetryGenerator.pm, line 206 onwards
            # (rev. 7270). The code should be refactored so that the
            # symop_id generation happens just in one place (restore
            # SPOT) (S.G.).
            my $symop_id =
                $atom->{symop_list}{symop_ids}{$symop_string} + 1;

            my $old_atom_name = $atom->{site_label};
            my $new_atom_name = $atom->{site_label} . '_' . $symop_id . '_555';
            delete $initial_atom_names->{$old_atom_name};
            $initial_atom_names->{$new_atom_name} = 1;
        }
        push( @sym_atoms, symgen_atom( $atom, $atom_symops ) );
    }

    my $nr_multiplicity_ratios_found = 0;

    for my $atom (@{$atoms}) {
        my $multiplicity = $atom->{multiplicity};
        my $multiplicity_ratio = $atom->{multiplicity_ratio};

        if( exists $atom->{_atom_site_symmetry_multiplicity} &&
            $atom->{_atom_site_symmetry_multiplicity} ne '?' &&
            $atom->{_atom_site_symmetry_multiplicity} ne '.' &&
            $atom->{_atom_site_symmetry_multiplicity} !=
            $multiplicity ) {
            if( $atom->{_atom_site_symmetry_multiplicity} ==
                $multiplicity_ratio ) {
                $nr_multiplicity_ratios_found++;
            } else {
                if( $print_errors ) {
                    warn 'WARNING, the given multiplicity value of atom ' .
                         "'$atom->{name}' differs from the calculated value " .
                         "('$atom->{_atom_site_symmetry_multiplicity}' vs. " .
                         "'$multiplicity') -- the calculated value will be " .
                         'used' . "\n";
                }
            }
        }
    }

    if( $nr_multiplicity_ratios_found > 0 &&
        $print_errors ) {
        warn "WARNING, multiplicity ratios are given instead of "
           . "multiplicities for $nr_multiplicity_ratios_found atoms -- "
           . "taking calculated values\n";
    }

    return \@sym_atoms;
}

#===============================================================#
# Prints molecule to the CIF file.

# Accepts a hash
# %molecule = (
#               atoms=>[\%atom_info1, \%atom_info2, ...],
#               bonds=>[{atom1=>\%atom_info1, atom2=>\%atom_info2}], #covalent bond
#               chemical_formula_sum=>"\\'C6 H6\\'",
#             );
# ...
#
# @param $sym_data
#       Reference to an array of symmetry operations as returned
#       by the COD::CIF::Data::get_symmetry_operators() subroutine.
#       Currently not used.
# @param $Z
#       Molecular Z number.
# @param $original_sg_number
#       Space group IT number derived from the input crystal structure
#       (see the get_space_group_number() subroutine). May be undefined.
##

sub print_molecule
{
    my( $molecule_id, $audit, $molecule, $Id, $dataset, $dataset_name,
        $filename, $sym_data, $Z, $original_sg_number ) = @_;

    my $new_dataset = clone( $dataset );

    $new_dataset->{name} = $dataset_name;
    if( defined $molecule_id ) {
        $new_dataset->{name} .= '_molecule_' . $molecule_id;
    }

    my @data2copy = qw(
    _publ_author_name
    _publ_section_title
    _journal_issue
    _journal_name_full
    _journal_page_first
    _journal_page_last
    _journal_volume
    _journal_year

    _cell_length_a
    _cell_length_b
    _cell_length_c
    _cell_angle_alpha
    _cell_angle_beta
    _cell_angle_gamma

    _cell_measurement_pressure
    _cell_measurement.pressure
    _cell_measurement.pressure_esd
    _cell_measurement_pressure_gPa
    _cell_measurement_radiation
    _cell_measurement.radiation
    _cell_measurement.temp
    _cell_measurement_temperature
    _cell_measurement_temperature_C
    _cell_measurement.temp_esd
    _cell_measurement_wavelength
    _cell_measurement.wavelength
    _cell_measurement_wavelength_nm
    _cell_measurement_wavelength_pm

    _diffrn_ambient_environment
    _diffrn.ambient_environment
    _diffrn_ambient_pressure
    _diffrn.ambient_pressure
    _diffrn.ambient_pressure_esd
    _diffrn_ambient_pressure_gPa
    _diffrn_ambient_pressure_gt
    _diffrn.ambient_pressure_gt
    _diffrn_ambient_pressure_lt
    _diffrn.ambient_pressure_lt
    _diffrn.ambient_temp
    _diffrn.ambient_temp_details
    _diffrn_ambient_temperature
    _diffrn_ambient_temperature_C
    _diffrn_ambient_temperature_gt
    _diffrn_ambient_temperature_lt
    _diffrn.ambient_temp_esd
    _diffrn.ambient_temp_gt
    _diffrn.ambient_temp_lt

    _diffrn_radiation_collimation
    _diffrn_radiation.collimation
    _diffrn_radiation_detector
    _diffrn_radiation_detector_dtime
    _diffrn_radiation.diffrn_id
    _diffrn_radiation.div_x_source
    _diffrn_radiation.div_x_y_source
    _diffrn_radiation.div_y_source
    _diffrn_radiation_filter_edge
    _diffrn_radiation.filter_edge
    _diffrn_radiation_filter_edge_nm
    _diffrn_radiation_filter_edge_pm
    _diffrn_radiation_inhomogeneity
    _diffrn_radiation.inhomogeneity
    _diffrn_radiation_monochromator
    _diffrn_radiation.monochromator
    _diffrn_radiation_polarisn_norm
    _diffrn_radiation.polarisn_norm
    _diffrn_radiation_polarisn_ratio
    _diffrn_radiation.polarisn_ratio
    _diffrn_radiation.polarizn_source_norm
    _diffrn_radiation.polarizn_source_ratio
    _diffrn_radiation_probe
    _diffrn_radiation.probe
    _diffrn_radiation_source
    _diffrn_radiation_type
    _diffrn_radiation.type
    _diffrn_radiation_wavelength
    _diffrn_radiation_wavelength_id
    _diffrn_radiation_wavelength.id
    _diffrn_radiation.wavelength_id
    _diffrn_radiation_wavelength_nm
    _diffrn_radiation_wavelength_pm
    _diffrn_radiation_wavelength.wavelength
    _diffrn_radiation_wavelength_wt
    _diffrn_radiation_wavelength.wt
    _diffrn_radiation_xray_symbol
    _diffrn_radiation.xray_symbol

    _diffrn_reflns_theta_full
    _diffrn_reflns_resolution_full
    _diffrn_reflns_theta_max
    _diffrn_reflns_resolution_max
    _reflns_d_resolution_high
    _reflns.d_resolution_high
    _reflns_d_resolution_high_nm
    _reflns_d_resolution_high_pm
    _reflns_d_resolution_low
    _reflns.d_resolution_low
    _reflns_d_resolution_low_nm
    _reflns_d_resolution_low_pm
    _diffrn_reflns_limit_h_max
    _diffrn_reflns.limit_h_max
    _diffrn_reflns_limit_h_min
    _diffrn_reflns.limit_h_min
    _diffrn_reflns_limit_k_max
    _diffrn_reflns.limit_k_max
    _diffrn_reflns_limit_k_min
    _diffrn_reflns.limit_k_min
    _diffrn_reflns_limit_l_max
    _diffrn_reflns.limit_l_max
    _diffrn_reflns_limit_l_min
    _diffrn_reflns.limit_l_min

    _cod_duplicate_entry
    _[local]_cod_duplicate_entry
);

    my @data2rename = qw(
    _chemical_formula_analytical
    _chemical_formula.analytical
    _chemical_formula.entry_id
    _chemical_formula_iupac
    _chemical_formula.iupac
    _chemical_formula_moiety
    _chemical_formula.moiety
    _chemical_formula_structural
    _chemical_formula.structural
    _chemical_formula_sum
    _chemical_formula.sum
    _pd_proc_ls_prof_R_factor
    _pd_proc_ls_prof_wR_factor
    _refine_hist.R_factor_all
    _refine_hist.R_factor_obs
    _refine_hist.R_factor_R_free
    _refine_hist.R_factor_R_work
    _refine_ls_class_R_factor_all
    _refine_ls_class.R_factor_all
    _refine_ls_class_R_factor_gt
    _refine_ls_class.R_factor_gt
    _refine_ls_class_wR_factor_all
    _refine_ls_class.wR_factor_all
    _refine_ls_R_factor_all
    _refine.ls_R_factor_all
    _refine_ls_R_factor_gt
    _refine.ls_R_factor_gt
    _refine_ls_R_factor_obs
    _refine.ls_R_factor_obs
    _refine.ls_R_factor_R_free
    _refine.ls_R_factor_R_free_error
    _refine.ls_R_factor_R_free_error_details
    _refine.ls_R_factor_R_work
    _refine_ls_shell.R_factor_all
    _refine_ls_shell.R_factor_obs
    _refine_ls_shell.R_factor_R_free
    _refine_ls_shell.R_factor_R_free_error
    _refine_ls_shell.R_factor_R_work
    _refine_ls_shell.wR_factor_all
    _refine_ls_shell.wR_factor_obs
    _refine_ls_shell.wR_factor_R_free
    _refine_ls_shell.wR_factor_R_work
    _refine_ls_wR_factor_all
    _refine.ls_wR_factor_all
    _refine_ls_wR_factor_gt
    _refine_ls_wR_factor_obs
    _refine.ls_wR_factor_obs
    _refine_ls_wR_factor_ref
    _refine.ls_wR_factor_R_free
    _refine.ls_wR_factor_R_work
    _reflns_class_R_factor_all
    _reflns_class.R_factor_all
    _reflns_class_R_factor_gt
    _reflns_class.R_factor_gt
    _reflns_class_wR_factor_all
    _reflns_class.wR_factor_all
);

    # Copy the '_atom_type.symbol' and '_atom_type.oxidation_number'
    # data items only if both are simultaneously provided. Otherwise,
    # the oxidation numbers cannot be mapped to the corresponding atom
    # types or the oxidation numbers are not provided at all.
    if( ( contains_data_item( $new_dataset, '_atom_type_symbol' ) ||
          contains_data_item( $new_dataset, '_atom_type.symbol' ) ) &&
        ( contains_data_item( $new_dataset, '_atom_type_oxidation_number' ) ||
          contains_data_item( $new_dataset, '_atom_type.oxidation_number' ) ) )
    {
        push @data2copy, qw(
                _atom_type.symbol
                _atom_type_symbol
                _atom_type.oxidation_number
                _atom_type_oxidation_number
            )
    }

    my %data2copy = map { $_, $_ } @data2copy;

    my @tag_list = @{$new_dataset->{tags}};

    my $src_tag_prefix = '_[local]_cod_src';
    my %renamed_tags = rename_tags( $new_dataset,
                                    \@data2rename,
                                    $src_tag_prefix );

    my @tags_to_exclude = grep { !exists $data2copy{$_} &&
                                 !exists $renamed_tags{$_} }
                               @{$new_dataset->{tags}};
    foreach (@tags_to_exclude) {
        exclude_tag( $new_dataset, $_ );
    }

    if( $audit ) {
        my $id_value = $Id;
        $id_value =~ s/\s*\$\s*//g;
        set_tag( $new_dataset, '_audit_creation_method', $id_value );
    }

    set_tag( $new_dataset, '_chemical_formula_sum',
             $molecule->{chemical_formula_sum} );

    set_tag( $new_dataset, '_cod_data_source_file',
             basename( $filename ) );
    set_tag( $new_dataset, '_cod_data_source_block',
             $dataset_name );
    set_tag( $new_dataset, '_cell_formula_units_Z', $Z );
    set_tag( $new_dataset, '_space_group_name_H-M_alt', 'P 1' );

    set_loop_tag( $new_dataset, '_space_group_symop_operation_xyz',
                  undef, [ 'x, y, z' ] );

    if( defined $original_sg_number ) {
        set_tag( $new_dataset, '_cod_molecule_space_group_IT_number',
                 $original_sg_number );
    }

    if( $molecule->{is_polymer} ) {
        set_tag( $new_dataset, '_cod_molecule_is_polymer', 'yes' );
    }
    else {
        set_tag( $new_dataset, '_cod_molecule_is_polymer', 'no' );
    }

    if( $molecule->{polymer_dimension} && $max_polymer_span != 0 ) {
        set_tag( $new_dataset, '_cod_molecule_polymer_dimension',
                 $molecule->{polymer_dimension} );
        set_tag( $new_dataset, '_cod_molecule_polymer_basis',
                 $molecule->{polymer_basis} );
    }

    my @atoms = sort {
        length($a->{name}) == length($b->{name}) ?
        $a->{name} cmp $b->{name} :
        length($a->{name}) <=> length($b->{name})
    } @{$molecule->{atoms}};

    my $atoms_datablock = datablock_from_atom_array( \@atoms );
    merge_datablocks( $atoms_datablock, $new_dataset );

    my $cod_molecule_datablock = generate_cod_molecule_data_block( \@atoms );
    merge_datablocks( $cod_molecule_datablock, $new_dataset );

    if( $force_unit_occupancies &&
        exists $new_dataset->{values}{_atom_site_occupancy} ) {
        set_loop_tag( $new_dataset,
                      '_atom_site_occupancy',
                      '_atom_site_label',
                      [ map { exists $_->{calc_flag} && $_->{calc_flag} eq 'dum'
                                ? '.' : '1.0' } @atoms ] );
    }
    if( !$use_one_output_datablock ) {
        exclude_tag( $new_dataset, '_atom_site_disorder_assembly' );
        exclude_tag( $new_dataset, '_atom_site_disorder_group' );
    }

    # Forcing coordinate format
    for my $tag ( qw( _atom_site_fract_x
                      _atom_site_fract_y
                      _atom_site_fract_z ) ) {
        set_loop_tag( $new_dataset,
                      $tag,
                      '_atom_site_label',
                      [ map { $_ = sprintf $format, $_;
                              s/^\s+//; s/\s+$//; $_ }
                            @{$new_dataset->{values}{$tag}} ] );
    }

    # Printing _geom_bond_ output on request
    if( $output_geom_bond ) {
        if( exists $molecule->{bonds} ) {
            set_loop_tag( $new_dataset,
                          '_geom_bond_atom_site_label_1',
                          '_geom_bond_atom_site_label_1',
                          [ map { $_->{atom1}{name} }
                                @{$molecule->{bonds}} ] );
            set_loop_tag( $new_dataset,
                          '_geom_bond_atom_site_label_2',
                          '_geom_bond_atom_site_label_1',
                          [ map { $_->{atom2}{name} }
                                @{$molecule->{bonds}} ] );
            set_loop_tag( $new_dataset,
                          '_geom_bond_distance',
                          '_geom_bond_atom_site_label_1',
                          [ map { sprintf '%.5f', $_->{distance} }
                                @{$molecule->{bonds}} ] );
            set_loop_tag( $new_dataset,
                          '_geom_bond_valence',
                          '_geom_bond_atom_site_label_1',
                          [ map { $_->{order} }
                                @{$molecule->{bonds}} ] );
        } else {
            warn 'WARNING, bond data necessary to compute _geom_bond_ '
               . 'data items was not calculated' . "\n";
        }
    }

    print_cif( $new_dataset,
                    {
                        preserve_loop_order => 1,
                        keep_tag_order => 1
                    } );

    return;
}

#===============================================================#
# Finds all possible molecules in the CIF file. If two atoms are connected via
# then the algorithm states that there in no bond between these two atoms.

# The algorithm:
# 1. Takes an initial atom and tests if it has not been found in the other
#    molecule yet
# 2. If not, then begins to search for the other molecule:
# 2.1  Does modulo_1 for the initial atom
# 2.2  Finds a translation from initial atom to atom_modulo_1
# 2.3  Searches for all neighbours of atom_modulo_1
# 2.4  For each neighbour of atom_modulo_1 does 2.1 -- 2.4
# 2.5  atom_modulo_1 and all its neighbours translates according translation
#       vector. atom_modulo_1 now becomes initial atom. The others - accordingly
# 3. Stops and does the step 1 until there is no left any initial atom.


# Accepts
#     covalent_sensitivity - a threshold for covalent sensitivity
#     atom_properties(
#           H => {
#                     name => Hydrogen, #(chemical_type)
#                     period => 1,
#                     group => 1,
#                     block => s,
#                     atomic_number => "1",
#                     atomic_weight => 1.008,
#                     covalent_radius => 0.23,
#                     vdw_radius => 1.09,
#                     valency => [1],
#                     },
#          );
# symmetric_atoms and initial_atoms are arrays of
#                                 $atom_info = {
#                                             name=>"C1_2",
#                                             site_label=>"C1",
#                                             chemical_type=>"C",
#                                             coordinates_fract=>[1.0, 1.0,1.0],
#                                             coordinates_ortho=>[1.0, 1.0,1.0],
#                                             unity_matrix_applied=>1
#                                             }
# Returns an array of
# %molecule = (
#               atoms => [
#                   \%atom1_info, \%atom2_info, \%atom3_info, \%atom4_info
#               ],
#               bonds => [
#                   [ \%atom1_info, \%atom2_info ],
#                   [ \%atom1_info, \%atom3_info ],
#                   [ \%atom4_info, \%atom3_info ],
#               ] # covalent bond description
#               chemical_formula_sum => "C6 H6",
#             );

sub find_molecules($$$$$$)
{
    my $covalent_sensitivity = shift(@_);
    my $atom_properties      = shift(@_);
    my $symmetric_atoms      = shift(@_);
    my $initial_atoms        = shift(@_);
    my $bricks               = shift(@_);
    my $seen_molecules       = shift(@_);

    my @unique_molecules;
    my %used_atoms;
    my %used_originals;
    my %used_uc_atoms;
    my %checked_pairs;
    my $nbumps = 0;

    foreach my $initial_atom (@$initial_atoms)
    {
        next if exists $used_originals{$initial_atom->{cell_label}};
        print STDERR ">>>> starting new molecule\n" if $debug;

        ## if( ! $expand_to_p1 &&
        ##     $initial_atom->{cell_label} ne $initial_atom->{site_label} ) {
        ##     print STDERR
        ##         ">>>> site: $initial_atom->{site_label}, " .
        ##         "cell: $initial_atom->{cell_label}\n";
        ## }

        my( $molecule_atoms, $mol_nbumps, $mol_polymer_atoms ) =
            find_molecule( $covalent_sensitivity,
                           $atom_properties,
                           $symmetric_atoms,
                           \%used_atoms,
                           \%used_originals,
                           \%used_uc_atoms,
                           \%checked_pairs,
                           $initial_atom, $bricks );

        my @molecule_atoms = @$molecule_atoms;
        $nbumps += $mol_nbumps;

        if( !@molecule_atoms ) {
            warn "WARNING, found molecule with no atoms -- strange...\n";
            next;
        }

        # Calculate polymer dimension and basis.
        my $polymer_dimension;
        my $polymer_basis;

        if( $mol_polymer_atoms > 0 ) {
            my $polymer_vectors = {};
            for my $atom ( @molecule_atoms ) {
                my $site_label = $atom->{site_label};
                my $symop_id = $atom->{symop_id};
                if( !exists $polymer_vectors->{$site_label}{$symop_id} ) {
                    $polymer_vectors->{$site_label}{$symop_id} = [];
                }
                push( @{$polymer_vectors->{$site_label}{$symop_id}},
                      $atom->{translation} );
            }
            for my $site_label (sort keys %$polymer_vectors) {
                for my $symop_id (sort keys %{$polymer_vectors->{$site_label}} ) {
                    my @polymer_vectors =
                        @{$polymer_vectors->{$site_label}{$symop_id}};
                    next if @polymer_vectors < 2;
                    my $reference_vector = shift @polymer_vectors;
                    my ($polymer_dimension_now, $basis_now) =
                        get_rank_and_basis(
                            [ map { vector_sub( $_, $reference_vector ) }
                            @polymer_vectors ]
                        );
                    next if !defined $polymer_dimension_now;
                    if( !defined $polymer_dimension ||
                        $polymer_dimension < $polymer_dimension_now ) {
                        $polymer_dimension = $polymer_dimension_now;
                        $polymer_basis = $basis_now;
                    }
                }
            }
        }

        my %molecule = (
            atoms => \@molecule_atoms,
            chemical_formula_sum => '',
            is_polymer => ($mol_polymer_atoms > 0),
            polymer_dimension => $polymer_dimension,
            polymer_basis => $polymer_basis,
        );

        push( @unique_molecules, \%molecule );
    }

    if( !$verbose && $nbumps > 0 ) {
        warn "WARNING, $nbumps pair(s) of atoms are too close to "
           . "each other and are considered as bumps\n";
    }

    return @unique_molecules;
}

# ============================================================================ #

sub find_molecule($$$$$$$$$)
{
    my $covalent_sensitivity = shift(@_);
    my $atom_properties      = shift(@_);
    my $symmetric_atoms      = shift(@_);
    my $used_atoms           = shift(@_);
    my $used_originals       = shift(@_);
    my $used_uc_atoms        = shift(@_);
    my $checked_pairs        = shift(@_);
    my $current_atom         = shift(@_);
    my $bricks               = shift(@_);

    my @current_coords_fract_modulo_1 =
        map { modulo_1($_) } @{$current_atom->{coordinates_fract}};

    my $atom_in_unit_cell_coords_ortho =
        symop_vector_mul( $current_atom->{f2o}, \@current_coords_fract_modulo_1 );

    my $current_translation = translation( $current_atom->{coordinates_fract},
                                           \@current_coords_fract_modulo_1 );

    my @neighbors;

    do {
        no warnings;
        if( exists $used_atoms->
            {$current_atom->{site_label}}
            {$current_atom->{symop_id}}
            {$current_atom->{translation_id}} ) {
            print STDERR "<<<< atom labelled '$current_atom->{name}' " .
                "is already in some molecule, returning\n"
                if $debug;
            return ( \@neighbors, 0, 0 );
        }

        $used_atoms->{$current_atom->{site_label}}
            {$current_atom->{symop_id}}
            {$current_atom->{translation_id}} = $current_atom;
    }; # end no warnings

    $used_originals->{$current_atom->{cell_label}} =
        $current_atom->{cell_label};

    my $polymer_atoms = 0;

    do {
        no warnings;
        if( exists $used_uc_atoms->
            {$current_atom->{site_label}}
            {$current_atom->{symop_id}} ) {
            my $used_uc_atom = $used_uc_atoms->
                    {$current_atom->{site_label}}
                    {$current_atom->{symop_id}};
            print STDERR ">>> !!!! detected a used unit cell " .
                "label $current_atom->{name}/$current_atom->{symop_id}/" .
                "$current_atom->{translation_id} (${used_uc_atom}-th time)\n"
            if $debug;

            $polymer_atoms++;

            if( $used_uc_atoms->
                {$current_atom->{site_label}}
                {$current_atom->{symop_id}} > $max_polymer_atoms ) {
                my $message = "the maximum number of polymer atom " .
                    "repetitions $max_polymer_atoms was reached for " .
                    "atom '$current_atom->{site_label}' " .
                    "(symop id '$current_atom->{symop_id}') -- " .
                    "to get around this limit, please increase " .
                    "--max-polymer-atoms, to say, " .
                    "--max-polymer-atoms=" . (2 * $max_polymer_atoms) . " " .
                    "or decrease --max-polymer-span (e.g. " .
                    "--max-polymer-span=" . int($max_polymer_span/2) . ", " .
                    "but expect increased computation times and " .
                    "memory consumption)";
                if( !$die_on_errors ) {
                    warn "WARNING, $message\n";
                    return ( [], 0, $polymer_atoms );
                } else {
                    die "ERROR, $message\n";
                }
            }

            if( abs($current_atom->{translation}[0]) > $max_polymer_span ||
                abs($current_atom->{translation}[1]) > $max_polymer_span ||
                abs($current_atom->{translation}[2]) > $max_polymer_span ) {
                return ( [$current_atom], 0, $polymer_atoms );
            }
        }

        $used_uc_atoms->
            {$current_atom->{site_label}}
            {$current_atom->{symop_id}} ++;
    }; # end no warnings

    print STDERR
        ">>> considering atom $current_atom->{name} " .
            "(@{$atom_in_unit_cell_coords_ortho}) " .
        "($current_atom->{cell_label}/" .
        "$current_atom->{symop_id}/$current_atom->{translation_id})\n"
        if $debug;

    push( @neighbors, $current_atom );

    my ($i_init, $j_init, $k_init) =
        get_atom_index( $bricks, @{$atom_in_unit_cell_coords_ortho} );

    my ( $min_i, $max_i, $min_j, $max_j, $min_k, $max_k );

    eval {
        ( $min_i, $max_i, $min_j, $max_j, $min_k, $max_k ) =
            get_search_span( $bricks, $i_init, $j_init, $k_init );
    };
    if( $@ ) {
        use COD::Serialise qw( serialiseRef );
        serialiseRef( $atom_in_unit_cell_coords_ortho );
        serialiseRef( [ $i_init, $j_init, $k_init ] );
        serialiseRef( $bricks );
        die( $@ );
    }

    if( $debug ) {
        local $" = ", ";
        print STDERR
            ">>> now scanning its distinct neighbours " .
            "around @{$atom_in_unit_cell_coords_ortho}:\n";
    };

    my $nbumps = 0;

    ## foreach my $sym_atom (@$symmetric_atoms)
    for my $i ($min_i .. $max_i) {
    for my $j ($min_j .. $max_j) {
    for my $k ($min_k .. $max_k) {
        for my $sym_atom ( @{$bricks->{atoms}[$i][$j][$k]} ) {
            my $sym_atom_coords_ortho = $sym_atom->{coordinates_ortho};
            my $new_label = $current_atom->{name};
            my $sym_label = $sym_atom->{name};

            if( $current_atom->{name} eq $sym_atom->{name} ) {
            # We have found the same atom, no need to add bond or
            # neighbour
            next;
            }

            my $dist = distance( $atom_in_unit_cell_coords_ortho,
                                 $sym_atom_coords_ortho );

            do {
                local $" = ' ';
                print STDERR ">>> checking neighbour $sym_label " .
                    "(@{$sym_atom_coords_ortho}), " .
                    "d = $dist\n";
            } if $debug;

            my $is_bump = test_bump( $atom_properties,
                                     $current_atom->{chemical_type},
                                     $sym_atom->{chemical_type},
                                     $current_atom->{site_label},
                                     $sym_atom->{site_label},
                                     $dist, $bump_distance_factor );

            if( $is_bump &&
                !atoms_are_alternative( $current_atom, $sym_atom ) ) {
                if( not exists $checked_pairs->{$sym_label}{$new_label} ) {
                    my $message = "atoms '$current_atom->{name}' and " .
                        "'$sym_atom->{name}' are too close " .
                        "(distance = " .
                        sprintf( "%6.4f", $dist ) .
                        ") and are considered a bump";
                    if( $ignore_bumps ) {
                        if( $verbose || $total_nbumps < 5 ) {
                            warn "WARNING, $message\n";
                        }
                        $nbumps++;
                        $total_nbumps++;
                    } else {
                        die "ERROR, $message -- aborting calculations\n";
                    }
                }
            }

            $checked_pairs->{$sym_label}{$new_label} = 1;
            $checked_pairs->{$new_label}{$sym_label} = 1;

            my $is_bond = test_bond($atom_properties,
                                    $current_atom->{chemical_type},
                                    $sym_atom->{chemical_type},
                                    $dist,
                                    $covalent_sensitivity);

            if( $is_bond &&
                !atoms_are_alternative( $current_atom, $sym_atom ) ) {
                do {
                    use COD::Serialise qw( serialiseRef );
                    local $" = ' ';
                    print STDERR ">>> found bond:\n";
                    serialiseRef( { "translation" => $current_translation,
                                    "original atom" => $current_atom,
                                    "sym atom" => $sym_atom } );
                } if $debug;

                my $back_shifted_sym_atom =
                    translate_atom( $sym_atom, $current_translation );

                do {
                    use COD::Serialise qw( serialiseRef );
                    print ">>>> back-shifted atom:\n";
                    serialiseRef( { sym_atom => $sym_atom,
                                    backshifted => $back_shifted_sym_atom } );
                } if $debug;

                my( $neighbours, $mol_nbumps, $mol_polymer_atoms ) =
                    find_molecule( $covalent_sensitivity,
                                   $atom_properties,
                                   $symmetric_atoms,
                                   $used_atoms,
                                   $used_originals,
                                   $used_uc_atoms,
                                   $checked_pairs,
                                   $back_shifted_sym_atom,
                                   $bricks );

                push(@neighbors, @$neighbours);
                $nbumps += $mol_nbumps;
                $polymer_atoms += $mol_polymer_atoms;
            }
        }
    }}}

    print ">>> Finished checks;\n" if $debug;

    do {
        use COD::Serialise qw( serialiseRef );
        print ">>> Before translation:";
        serialiseRef( \@neighbors );
    } if $debug;

    return ( \@neighbors, $nbumps, $polymer_atoms );
}

#===========================================================================
# Return a list of chemical bonds (represented as atom pairs, each
# pair being two references to two %atom_info structures describing
# the bonded atoms).

sub atom_bonds
{
    my ($atoms, $atom_properties, $covalent_sensitivity) = @_;

    my $max_covalent_radius = get_max_covalent_radius( $atom_properties );

    my $bricks = build_bricks( $atoms,
                                           $max_covalent_radius * 2 +
                                           $covalent_sensitivity );

    my %used_atoms;
    my @bonds;

    for my $atom (@$atoms) {

        $used_atoms{$atom->{name}} = $atom;

        my $coordinates = $atom->{coordinates_ortho};

        my ($i_init, $j_init, $k_init) =
            get_atom_index( $bricks, @$coordinates );

        my ( $min_i, $max_i, $min_j, $max_j, $min_k, $max_k );

        eval {
            ( $min_i, $max_i, $min_j, $max_j, $min_k, $max_k ) =
                get_search_span( $bricks, $i_init, $j_init, $k_init );
        };
        if( $@ ) {
            use COD::Serialise qw( serialiseRef );
            serialiseRef( $coordinates );
            serialiseRef( [ $i_init, $j_init, $k_init ] );
            serialiseRef( $bricks );
            die( $@ );
        }

        ## foreach my $sym_atom (@$symmetric_atoms)
        for my $i ($min_i .. $max_i) {
        for my $j ($min_j .. $max_j) {
        for my $k ($min_k .. $max_k) {
            for my $neighbour ( @{$bricks->{atoms}[$i][$j][$k]} ) {

                next if exists $used_atoms{$neighbour->{name}};

                my $neighbour_coords = $neighbour->{coordinates_ortho};

                if( $atom == $neighbour ) {
                    # We have found the same atom, no need to add bond or
                    # neighbour
                    next;
                }

                my $distance = distance( $coordinates, $neighbour_coords );

                my $is_bond = test_bond($atom_properties,
                                        $atom->{chemical_type},
                                        $neighbour->{chemical_type},
                                        $distance,
                                        $covalent_sensitivity);

                if( $is_bond &&
                    !atoms_are_alternative( $atom, $neighbour ) ) {

                    do {
                        use COD::Serialise qw( serialiseRef );
                        local $" = ' ';
                        print STDERR ">>> found bond:\n";
                        serialiseRef( { "original atom" => $atom,
                                        "neighbour atom" => $neighbour } );
                    } if $debug;

                    my $bond_order =
                        get_bond_order( $distance,
                                        $atom->{chemical_type},
                                        $neighbour->{chemical_type},
                                        $atom_properties );

                    push( @bonds, {
                        atom1 => $atom,
                        atom2 => $neighbour,
                        distance => $distance,
                        order => $bond_order,
                    });
                }
            }
        }}}
    }

    return \@bonds;
}

#==============================================================================
# Use heuristics to guess bond order from its length:

sub get_bond_order
{
    my( $distance, $atom1_type, $atom2_type, $atom_properties ) = @_;

    if( exists $atom_radii{$atom1_type} && exists $atom_radii{$atom2_type} ) {
        my @atom1_radii = @{$atom_radii{$atom1_type}};
        my @atom2_radii = @{$atom_radii{$atom2_type}};
        my @lengths;
        for my $a1 (@atom1_radii) {
            for my $a2 (@atom2_radii) {
                if( $a1->[0] eq $a2->[0] ) {
                    push( @lengths, [ $a1->[0], $a1->[1],
                                      $a1->[2] + $a2->[2] ] );
                }
            }
        }
        @lengths = sort {$a->[2] <=> $b->[2]} @lengths;
        for my $length (@lengths) {
            if( $distance < $length->[2] ) {
                return $length->[1];
            }
        }
        return "?";
    } else {
        return "?";
    }
}


#==============================================================================
# Calculate the rank and basis of a matrix using Gauss-Jordan elimination.
# @param   matrix
# @retval  rank (integer), matrix basis (string)

sub get_rank_and_basis
{
    my( $m ) = @_;
    return 0 if @{$m} == 0; # no need to create row echelon form

    do {
        local $\ = "\n";
        for (@{$m}) {
            print STDERR join ' ', @{$_};
        }
    } if 0;

    my $reduced_row_echelon_matrix =
        gj_elimination_non_zero_elements( $m, 8 * $machine_epsilon );

    my @rre_semicolon_separated_vectors =
        reverse sort { $a cmp $b }
            map {
                join ';', map {sprintf '%g', $_} @{$_}
            } @{$reduced_row_echelon_matrix};

    # set of linearly independent vectors for the matrix:
    my $basis = join ' ', @rre_semicolon_separated_vectors;

    print STDERR ">>>> basis: $basis\n"
        if 0;

    return scalar( @{$reduced_row_echelon_matrix} ), $basis;
}


#==============================================================================
# Convert single-quoted basis string to matrix.
# @param   string
# @retval  matrix

sub basis_string_to_matrix
{
    my( $string ) = @_;
    $string =~ s/'//g; # remove single-quotes
    my @vectors = split /\s+/, $string;
    my $matrix = [map { [split /;/, $_] } @vectors];
    return $matrix;
}


#==============================================================================
# Find machine epsilon.
# @param   void
# @retval  scalar
sub get_machine_epsilon
{
    my $epsilon = 1.00;
    while ( $epsilon + 1.00 > 1.00 ) {
        $epsilon /= 2;
    }
    return $epsilon;
}