File: validate.cpp

package info (click to toggle)
gemmi 0.6.5%2Bds-3
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 5,836 kB
  • sloc: cpp: 54,719; python: 4,743; ansic: 3,972; sh: 384; makefile: 73; f90: 42; javascript: 12
file content (305 lines) | stat: -rw-r--r-- 10,932 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
// Copyright 2017-2020 Global Phasing Ltd.

#include "gemmi/cif.hpp"
#include "gemmi/ddl.hpp"
#include "gemmi/gz.hpp"
#include "gemmi/cifdoc.hpp"
#include "gemmi/numb.hpp"
#include "gemmi/dirwalk.hpp"  // for CifWalk
#include "validate_mon.h"  // for check_monomer_doc
#include <cstdio>
#include <iostream>
#include <stdexcept>  // for std::runtime_error

#ifdef ANALYZE_RULES
# include <tao/pegtl/analyze.hpp>
#endif

#define GEMMI_PROG validate
#include "options.h"

namespace cif = gemmi::cif;

namespace {

enum OptionIndex {
  Quiet=4, Fast, Stat, Context, Ddl, NoRegex, NoMandatory, NoUniqueKeys,
  Parents, Recurse, Monomer, Zscore, Ccd, AuditDate
};
const option::Descriptor Usage[] = {
  { NoOp, 0, "", "", Arg::None, "Usage: " EXE_NAME " [options] FILE [...]"
                                "\n\nOptions:" },
  CommonUsage[Help],
  CommonUsage[Version],
  CommonUsage[Verbose],
  { Quiet, 0, "q", "quiet", Arg::None, "  -q, --quiet  \tShow only errors." },
  { Fast, 0, "f", "fast", Arg::None, "  -f, --fast  \tSyntax-only check." },
  { Stat, 0, "s", "stat", Arg::None, "  -s, --stat  \tShow token statistics" },
  { Ddl, 0, "d", "ddl", Arg::Required, "  -d, --ddl=PATH  \tDDL for validation." },
  { Context, 0, "c", "context", Arg::None,
    "  -c, --context  \tCheck _pdbx_{category|item}_context.type." },
  { NoRegex, 0, "", "no-regex", Arg::None,
    "  --no-regex  \tSkip regex checking (when using DDL2)" },
  { NoMandatory, 0, "", "no-mandatory", Arg::None,
    "  --no-mandatory  \tSkip checking if mandatory tags are present." },
  { NoUniqueKeys, 0, "", "no-unique", Arg::None,
    "  --no-unique  \tSkip checking if category keys (DDL2) are unique." },
  { Parents, 0, "p", "", Arg::None,
    "  -p  \tCheck if parent items (DDL2) are present." },
  { Recurse, 0, "r", "recursive", Arg::None,
    "  -r, --recursive  \tRecurse directories and process all CIF files." },

  { NoOp, 0, "", "", Arg::None, "\nValidation specific to CCP4 monomer files:" },
  { Monomer, 0, "m", "monomer", Arg::None,
    "  -m, --monomer  \tRun checks specific to monomer dictionary." },
  { Zscore, 0, "", "z-score", Arg::Float,
    "  --z-score=Z  \tUse Z for validating _chem_comp_atom.[xyz] (default: 2.0)." },
  { Ccd, 0, "", "ccd", Arg::Required,
    "  --ccd=PATH  \tCCD file for comparison." },
  { AuditDate, 0, "", "audit-on", Arg::Required,
    "  --audit-on=DATE  \tCheck only if CCD component was remediated on DATE." },
  { 0, 0, 0, 0, 0, 0 }
};

// basic types, used for token statistics only
enum class ValueType : unsigned char {
  NotSet,
  Char,
  Numb,
  Dot,
  QuestionMark,
};

inline std::string value_type_to_str(ValueType v) {
  switch (v) {
    case ValueType::NotSet: return "n/a";
    case ValueType::Char: return "char";
    case ValueType::Numb: return "numb";
    case ValueType::Dot: return "'.'";
    case ValueType::QuestionMark: return "'?'";
  }
  return "";
}

// For now the infer_* functions are used only here, not sure where they belong
inline ValueType infer_value_type(const std::string& val) {
  assert(!val.empty());
  if (val == ".")
    return ValueType::Dot;
  if (val == "?")
    return ValueType::QuestionMark;
  if (cif::is_numb(val))
    return ValueType::Numb;
  return ValueType::Char;
}

std::string format_7zd(size_t k) {
  char buf[64];
  snprintf(buf, 63, "%7zu", k);
  return buf;
}

std::string token_stats(const cif::Document& d) {
  size_t nframes = 0, nvals = 0, nloops = 0, nlooptags = 0, nloopvals = 0;
  size_t vals_by_type[5] = {0};
  size_t looptags_by_type[5] = {0};
  for (const cif::Block& block : d.blocks) {
    for (const cif::Item& item : block.items) {
      if (item.type == cif::ItemType::Pair) {
        nvals++;
        ValueType vt = infer_value_type(item.pair[1]);
        vals_by_type[static_cast<int>(vt)]++;
      } else if (item.type == cif::ItemType::Frame) {
        nframes++;
      } else if (item.type == cif::ItemType::Loop) {
        nloops++;
        size_t width = item.loop.width();
        nlooptags += width;
        nloopvals += item.loop.values.size();
        for (size_t i = 0; i != width; ++i) {
          ValueType vt = ValueType::NotSet;
          // TODO: ConstColumn(const::Item*, ...)
          const cif::Column col(const_cast<cif::Item*>(&item), i);
          for (const std::string& v : col) {
            ValueType this_vt = infer_value_type(v);
            if (this_vt != vt) {
              // if we are here: vt != ValueType::Char
              if (vt == ValueType::NotSet || this_vt == ValueType::Numb) {
                vt = this_vt;
              } else if (this_vt == ValueType::Char) {
                vt = this_vt;
                break;
              }
            }
          }
          looptags_by_type[static_cast<int>(vt)]++;
        }
      }
    }
  }
  std::string info;
  gemmi::cat_to(info, format_7zd(d.blocks.size()), " block(s)\n");
  gemmi::cat_to(info, format_7zd(nframes), " frames\n");
  gemmi::cat_to(info, format_7zd(nvals), " non-loop items:");
  for (int i = 1; i != 5; ++i)
    gemmi::cat_to(info, "  ", value_type_to_str(static_cast<ValueType>(i)),
                  ':', vals_by_type[i]);
  gemmi::cat_to(info, '\n', format_7zd(nloops), " loops w/"
                "\n        ", format_7zd(nlooptags), " tags:");
  for (int i = 1; i != 5; ++i)
    gemmi::cat_to(info, "  ", value_type_to_str(static_cast<ValueType>(i)),
                  ':', looptags_by_type[i]);
  gemmi::cat_to(info, "\n        ", format_7zd(nloopvals), " values\n");
  return info;
}

// Empty loop is not a valid CIF syntax, but we parse it to accommodate
// some broken CIF files. Only validation shows an error.
void check_empty_loops(const cif::Block& block) {
  for (const cif::Item& item : block.items) {
    if (item.type == cif::ItemType::Loop) {
      if (item.loop.values.empty() && !item.loop.tags.empty())
        throw std::runtime_error("Empty loop in block " + block.name +
                                 ": " + item.loop.tags[0]);
    } else if (item.type == cif::ItemType::Frame) {
      check_empty_loops(item.frame);
    }
  }
}

bool column_contains(const cif::Column& column, const char* value) {
  for (const std::string& v : column)
    if (cif::as_string(v) == value)
      return true;
  return false;
}

bool process_file(const char* path, const cif::Ddl& dict,
                  const std::map<std::string, cif::Block>& ccd_map,
                  const std::vector<option::Option>& options) {
  bool ok = true;
  std::string msg;
  if (options[Verbose])
    std::cout << "Reading " << path << "..." << std::endl;
  try {
    if (options[Fast]) {
      ok = cif::check_syntax_any(gemmi::MaybeGzipped(path), &msg);
    } else {
      cif::Document doc = cif::read(gemmi::MaybeGzipped(path));
      for (const cif::Block& block : doc.blocks) {
        if (block.name == " ")
          std::cout << doc.source << ": missing block name (bare data_)\n";
        check_empty_loops(block);
      }
      if (options[Stat])
        msg = token_stats(doc);
      if (options[Ddl]) {
        dict.check_audit_conform(doc, std::cout);
        ok = dict.validate_cif(doc, std::cout);
      }

      if (options[Monomer] || options[Zscore] || options[Ccd]) {
        for (const cif::Block& block : doc.blocks) {
          if (block.name == "comp_list")
            continue;
          const std::string cc_name =
            block.name.substr(gemmi::starts_with(block.name, "comp_") ? 5 : 0);
          try {
            if (options[Monomer] || options[Zscore]) {
              double z_score = options[Zscore] ? std::atof(options[Zscore].arg) : 2.0;
              check_monomer(block, z_score);
            }
            if (options[Ccd]) {
              auto it = ccd_map.find(cc_name);
              if (it == ccd_map.end()) {
                std::cout << cc_name << " [ccd] monomer not found in CCD file(s)\n";
                continue;
              }
              const cif::Block& ccd_block = it->second;
              if (options[AuditDate]) {
                cif::Column col = const_cast<cif::Block&>(ccd_block)
                                   .find_values("_pdbx_chem_comp_audit.date");
                if (!column_contains(col, options[AuditDate].arg)) {
                  if (options[Verbose])
                    std::cout << cc_name << " [ccd] ignored - audit date does not match\n";
                  continue;
                }
              }
              compare_monomer_with_ccd(block, ccd_block, options[Verbose]);
            }
          } catch (const std::exception& e) {
            std::cerr << "Failed to interpret monomer: block " << block.name
                      << " from " << doc.source << '\n'
                      << e.what() << std::endl;
          }
        }
      }
    }
  } catch (std::runtime_error& e) {
    ok = false;
    msg = e.what();
  }
  if (!msg.empty())
    std::cout << msg << std::endl;

  if (options[Verbose])
    std::cout << (ok ? "OK" : "FAILED") << std::endl;
  return ok;
}

} // anonymous namespace


int GEMMI_MAIN(int argc, char **argv) {
#ifdef ANALYZE_RULES // for debugging only
  tao::pegtl::analyze<cif::rules::file>();
  tao::pegtl::analyze<cif::numb_rules::numb>();
#endif
  OptParser p(EXE_NAME);
  p.simple_parse(argc, argv, Usage);
  p.require_input_files_as_args();

  bool total_ok = true;
  cif::Ddl dict;
  dict.print_unknown_tags = !p.options[Quiet];
  dict.use_regex = !p.options[NoRegex];
  dict.use_context = p.options[Context];
  dict.use_parents = p.options[Parents];
  dict.use_mandatory = !p.options[NoMandatory];
  dict.use_unique_keys = !p.options[NoUniqueKeys];
  dict.print_extra_diagnostics = p.options[Verbose];
  if (p.options[Ddl]) {
    try {
      for (option::Option* ddl = p.options[Ddl]; ddl; ddl = ddl->next())
        dict.read_ddl(cif::read_file(ddl->arg), std::cout);
    } catch (std::runtime_error& e) {
      std::cerr << "Error when reading dictionary: " << e.what() << std::endl;
      return EXIT_FAILURE;
    }
  }
  std::map<std::string, cif::Block> ccd_map;
  if (p.options[Ccd]) {
    try {
      for (option::Option* ccd = p.options[Ccd]; ccd; ccd = ccd->next()) {
        cif::Document ccd_doc(cif::read(gemmi::MaybeGzipped(ccd->arg)));
        for (cif::Block& ccd_block : ccd_doc.blocks) {
          std::string name = ccd_block.name;
          ccd_map.emplace(name, std::move(ccd_block));
        }
      }
    } catch (std::runtime_error& e) {
      std::cerr << "Error when reading CCD file: " << e.what() << std::endl;
      return EXIT_FAILURE;
    }
  }
  for (int i = 0; i < p.nonOptionsCount(); ++i) {
    const char* path = p.nonOption(i);
    if (p.options[Recurse]) {
      for (const std::string& file : gemmi::CifWalk(path))
        total_ok = process_file(file.c_str(), dict, ccd_map, p.options) && total_ok;
    } else {
      total_ok = process_file(path, dict, ccd_map, p.options) && total_ok;
    }
  }
  return total_ok ? EXIT_SUCCESS : EXIT_FAILURE;
}