File: sdf_add_cod_data

package info (click to toggle)
cod-tools 2.3%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 114,852 kB
  • sloc: perl: 53,336; sh: 23,842; ansic: 6,318; xml: 1,982; yacc: 1,112; makefile: 716; python: 158; sql: 73
file content (129 lines) | stat: -rwxr-xr-x 4,246 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#! /bin/sh
#------------------------------------------------------------------------------
#$Author: andrius $
#$Date: 2018-08-28 17:09:31 +0300 (An, 28 rugp. 2018) $
#$Revision: 6403 $
#$URL: svn://www.crystallography.net/cod-tools/tags/v2.3/scripts/sdf_add_cod_data $
#------------------------------------------------------------------------------
#*
#* Append COD-specific meta-information to an SDF file in a format
#* suitable for PubChem.
#*
#* USAGE: $0 --options --cod-cif cod-input.cif file1.sdf
#* USAGE: $0 --options --cod-cif cod-input.cif file1.sdf > output.sdf
#* USAGE: $0 --options --cod-cif cod-input.cif < file1.sdf > output.sdf
#**

# Strict mode: -e aborts on a failing command, -u makes the expansion of
# an unset variable an error.
set -eu

# Initial value for the deprecated --tmp-dir option, taken from the
# environment. The "-" default keeps an unset TMPDIR from tripping -u.
TMP_DIR="${TMPDIR-}"

# COD_CIF: path of the COD CIF file, filled in by -C/--cod-cif.
# FILES:   positional (non-option) arguments collected by the option loop.
COD_CIF=""
FILES=""

#* OPTIONS:
#*   -C, --cod-cif 1000000.cif
#*                     Provide the original COD CIF to extract structure metadata.
#*
#*   --tmp-dir /tmp
#*                     Use the specified temporary directory (default is /tmp).
#*                     This option is deprecated and will be removed in
#*                     future releases.
#*
#*   --help, --usage
#*                     Output a short help message (this message) and exit.
#*   --version
#*                     Output version information and exit.
#**
# Command-line parsing.
#
# Options are consumed here; every remaining argument is collected in
# FILES as a shell-quoted word, and the positional parameters are rebuilt
# from that list once the loop is done, so that "$@" afterwards holds
# only the input file names.

# Name of this script as shown in diagnostics.
prog=$(basename "$0")

# Print $1 quoted for safe re-parsing by 'eval': the value is wrapped in
# single quotes and each embedded single quote is rewritten as '\''.
shell_quote() {
    printf '%s\n' "$1" | sed "s/'/'\\\\''/g; 1s/^/'/; \$s/\$/'/"
}

# Abort with a readable message when option $1 is the last argument
# ($2 is the number of arguments still left, including the option).
require_value() {
    if [ "$2" -lt 2 ]
    then
        echo "${prog}:: ERROR, option '$1' requires an argument." >&2
        exit 1
    fi
}

while [ $# -gt 0 ]
do
    case "$1" in
        -C|--cod-cif|--cod-ci|--cod-c|--cod|--co|--c)
            require_value "$1" $#
            COD_CIF="$2"
            shift
            ;;
        --tmp-dir|--tmp-di|--tmp-d|--tmp|--tm|--t)
            require_value "$1" $#
            TMP_DIR="$2"
            shift
            ;;
        --options|--option|--optio|--opti|--opt|--op|--o)
            echo "${prog}:: The '--options' option is a placeholder."
            echo "${prog}:: It should be replaced by one of the following options:"
            # Print the OPTIONS section of this script's own header
            # comments, with the comment marker stripped and the literal
            # text "$0" replaced by the script path.
            awk -v script="$0" '/#\* OPTIONS:/,/#\*\*/ {
                    sub("OPTIONS:", "")
                    sub("^ *#[*]?[*]?", "")
                    gsub("\\$0", script)
                    print $0
                }' "$0"
            exit
            ;;
        --help|--hel|--he|--h|--usage)
            # Print every documentation comment region of this script,
            # with the same marker stripping and "$0" substitution.
            awk -v script="$0" '/#\*/,/#\*\*/ {
                    sub("^ *#[*]?[*]?", "")
                    gsub("\\$0", script)
                    print $0
                }' "$0"
            exit
            ;;
        --version)
            # The version helper is looked up next to this script.
            "$(dirname "$0")/cod-tools-version"
            exit
            ;;
        -*)
            echo "${prog}:: ERROR, unknown option '$1'." >&2
            exit 1
            ;;
        *)
            # Quote before storing: the list is re-parsed by eval below.
            FILES="$FILES $(shell_quote "$1")"
            ;;
    esac
    shift
done

# Replace the positional parameters with the collected file names.
eval "set -- ${FILES}"

# Main body: copy the input SDF without its record terminator, then
# append PubChem data items built from the COD CIF, and terminate the
# record again.

# Base cif_values command line; each call appends the tag list expected
# by the trailing --tags option.
cif_values='cif_values --no-header --no-dataname --no-filename --dont-replace-spaces --tags'

# Run cif_values for the tag(s) and extra options given as arguments on
# the COD CIF file.
#   ${cif_values} is unquoted on purpose (command plus options).
#   ${COD_CIF:+"..."} passes the path as exactly one word, even when it
#   contains blanks or glob characters, and passes nothing at all when
#   no CIF was given (same as the bare expansion did in that case).
cod_cif_values() {
    ${cif_values} "$@" ${COD_CIF:+"$COD_CIF"}
}

# Echo the SDF input (named files, or stdin when none were given) minus
# the '$$$$' terminator lines; a new terminator is written at the end.
grep -v '^\$\$\$\$' ${1+"$@"}

(
    # Registry ID: the COD database code, or the source data block name
    # when the database code is reported as "?".
    DATABASE_ID=$(cod_cif_values _cod_database_code)
    if test "${DATABASE_ID}" = "?"
    then
        DATABASE_ID=$(cod_cif_values _cod_data_source_block)
    fi
    echo '> <PUBCHEM_EXT_DATASOURCE_REGID>'
    # NOTE(review): left unquoted as in the original, so the shell
    # collapses whitespace in the value; presumably intentional.
    echo ${DATABASE_ID}
    echo ""
    echo '> <PUBCHEM_SUBSTANCE_SYNONYM>'
    # Systematic and common names; lines starting with "?" are dropped
    # ("|| true" keeps grep's no-match status from aborting under -e),
    # blank runs are squeezed and surrounding whitespace is trimmed.
    (
        cod_cif_values _chemical_name_systematic
        cod_cif_values _chemical_name_common
    ) \
        | ( grep -E -v '^\s*\?' || true ) \
        | perl -0777 -pe 's/[ \t]+/ /g; s/^\s*|\s*$//g'
    echo ""
    echo ""
    echo '> <PUBCHEM_SUBSTANCE_COMMENT>'
    # Bibliographic reference, one field per line.  The first perl pass
    # tidies up "?" placeholders ("-?", "(?)", "?-", runs of "?"), grep
    # drops lines starting with "?" and the "DOI:?" line, and the last
    # perl pass joins the remaining lines with ", ".
    (
        cod_cif_values _publ_author_name --value-separator "; "
        echo "("$(cod_cif_values _journal_year)")"
        cod_cif_values _publ_section_title
        cod_cif_values _journal_name_full
        cod_cif_values _journal_volume
        cod_cif_values _journal_issue
        echo $(cod_cif_values _journal_page_first)-$(cod_cif_values \
            _journal_page_last)
        echo DOI:$(cod_cif_values _journal_paper_doi)
    ) \
        | perl -pe 's/-\?//; s/\(\?\)/?/; s/\?-/?/; s/\?+/?/g' \
        | ( grep -Ev '^\?|DOI:\?' || true ) \
        | perl -0777 -pe 's/\n(.)/, $1/g; s/[ \t]+/ /g'
    echo ""
    echo '> <PUBCHEM_EXT_DATASOURCE_URL>'
    echo 'http://www.crystallography.net/'
    echo ""
    echo '> <PUBCHEM_EXT_SUBSTANCE_URL>'
    echo "http://www.crystallography.net/cod/${DATABASE_ID}.html"
    echo ""
    echo '$$$$'
) \
| perl -0777 -pe 's/^> <.*?>\n\s*\n//mg' \
| cif-to-utf8 \
| perl -CS -MUnicode::Normalize -pe '
     # from http://ahinea.com/en/tech/accented-translate.html:
     # 2011.12.10
     # Decompose characters, then strip the combining marks.
     $_ = NFD($_); s/\pM//g;'