1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297
|
#!/usr/bin/python3
############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################
#script for testing SPAdes
#provide a path to .info file
import sys
import os
import shutil
import getopt
import re
import datetime
import argparse
import subprocess
import glob
import math
from traceback import print_exc
sys.path.append('./src/spades_pipeline/')
import process_cfg
sys.path.append('./src/test/teamcity/')
import teamcity_support
from teamcity_support import *
def parse_args_ut():
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("--config", "-c", help="teamcity.py info config OR", type=str)
parser.set_defaults(config="")
parser.add_argument("--config_dir", "-d", help="folder with configs, will be scanned recursively; not compatible with etalon contigs options", type=str)
parser.set_defaults(config_dir="")
parser.add_argument("--tmp_dir", "-t", help="folder for temporary QUAST results, will be cleared before and removed after the run (default ./.tc_tmp)", type=str)
parser.set_defaults(tmp_dir="./.tc_tmp")
parser.add_argument("--preserve_tmp_dir", "-p", help="do not clean or remove temporary dir", action='store_true')
parser.set_defaults(preserve_tmp_dir=False)
#all - update all; add all missibg metrics
#basic - update only basic metrics; add only basic missing metrics
#none - update only metrics that are already on config, add none
parser.add_argument("--add_metrics", "-a", help="add new metrics to the info file [all | basic | none] (default: basic). none: do not add new thresholds to config; basic: add new thresholds only if they are from list of basic ones; all: add all thesholds", type=str)
parser.set_defaults(add_metrics="basic")
parser.add_argument("--etalon_contigs_prefix", help="prefix to saved contigs that will be used to compute thresholds; latest contigs from the config storage by default", type=str)
parser.add_argument("--etalon_contigs_suffix", help="suffix to saved contigs that will be used to compute thresholds; empty by default", type=str)
parser.set_defaults(etalon_contigs_prefix="")
parser.set_defaults(etalon_contigs_suffix="")
parser.add_argument("--force_to_update", help="comma-separated list of metrics for which threshold will be overwritten even if the metric satisfies current value; e.g. 'max_n50,min_n50,max_mis'")
parser.set_defaults(force_to_update = "")
parser.add_argument("--overwrite_if_satisfies", dest="overwrite_if_satisfies", help="overwrite threshold even if metric satisfies current value", action='store_true')
parser.set_defaults(overwrite_if_satisfies=False)
args = parser.parse_args()
return args
BASIC_METRICS = ["N50", "LG50", "# misassemblies", "Genome fraction (%)", "# indels per 100 kbp", "# mismatches per 100 kbp", "# local misassemblies"]
def metric_in_list(metric, mlist):
return metric in mlist.split(',')
#use threshold update policy set by user
def shall_update_threshold(args, metric, entry):
return (args.add_metrics == "all") or (args.add_metrics == "basic" and metric in BASIC_METRICS) or entry.assess or metric_in_list(entry.config_name, args.force_to_update)
#calculate new metric treshold
def calculate_treshold(entry, value):
result = 0.0
coeff = -1 if entry.should_be_higher_than_theshold else 1
if entry.relative_delta:
result = value * (1.0 + coeff * entry.delta_value)
else:
result = value + coeff * entry.delta_value
if entry.is_int:
result = math.floor(result) if entry.should_be_higher_than_theshold else math.ceil(result)
result = max(0.0, result)
return int(result) if entry.is_int else result
#update metrics using given quast report
def process_quast_report(args, report, limit_map):
result_map = parse_report(report, limit_map)
to_update = {}
if args.overwrite_if_satisfies:
log.log("Will overwrite thresholds even when metric satisfies")
for metric in sorted(result_map.keys()):
if metric in limit_map and len(limit_map[metric]) > 0:
for entry in limit_map[metric]:
param_name = entry.config_name
if not shall_update_threshold(args, metric, entry):
log.log("Value for " + param_name + " will not be updated")
continue
#that metric shouold be higher than threshold (e.g. N50)
if entry.should_be_higher_than_theshold:
if result_map[metric] < entry.value or args.overwrite_if_satisfies or metric_in_list(param_name, args.force_to_update):
#either not satisies or overwrite anyways
to_update[param_name] = calculate_treshold(entry, result_map[metric])
log.log("New value for " + param_name + " is set to " + str(to_update[param_name]))
else:
log.log(param_name + " will not be updated, " + metric + " = " + str(result_map[metric]) + " >= " + str(entry.value))
#that metric shouold be less than threshold (e.g. misassemblies)
else:
if result_map[metric] > entry.value or args.overwrite_if_satisfies or metric_in_list(param_name, args.force_to_update):
#either not satisies or overwrite anyways
to_update[param_name] = calculate_treshold(entry, result_map[metric])
log.log("New value for " + param_name + " is set to " + str(to_update[param_name]))
else:
log.log(param_name + " will not be updated, " + metric + " = " + str(result_map[metric]) + " <= " + str(entry.value))
else:
log.log(metric + " = " + str(result_map[metric]))
return to_update
# Run QUAST and assess its report for a single contig file
def quast_run_and_update(dataset_info, fn, output_dir, name, prefix, opts):
if not os.path.exists(fn):
log.err("File not found " + fn)
return {}
if 'quast_params' not in dataset_info.__dict__:
log.log("quast_params are not set, will not run QUAST")
return {}
log.log("Processing " + fn)
qcode = run_quast(dataset_info, [fn], output_dir, opts)
if qcode != 0:
log.err("Failed to run QUAST!")
return {}
limit_map = construct_rnaquast_limit_map(dataset_info, prefix, True) if dataset_info.mode == "rna" else construct_quast_limit_map(dataset_info, prefix, True)
report_path = output_dir
if dataset_info.mode == "meta":
report_path = os.path.join(report_path, "combined_reference")
if dataset_info.mode == "rna":
report_path = os.path.join(report_path, "short_report.tsv")
else:
report_path = os.path.join(report_path, "report.tsv")
return process_quast_report(args, report_path, limit_map)
def get_prefix(var):
if var.startswith("sc_"):
return "sc_"
elif var.startswith("prelim_"):
return "prelim_"
return ""
#substitute and add params to config
def substitute_params(filename, var_dict):
log.log(" === Substituting values in " + filename + " === ")
lines = open(filename).readlines()
vars_in_file = process_cfg.vars_from_lines(lines)
for var, value in var_dict.items():
var_prefix = get_prefix(var)
assess_param = var_prefix + "assess"
if assess_param not in vars_in_file:
lines.extend(["\n", assess_param + " true", "\n"])
vars_in_file = process_cfg.vars_from_lines(lines)
if var not in vars_in_file:
log.log(var + " is not in config, adding")
assess_meta_info = vars_in_file[var_prefix + "assess"]
lines = lines[:assess_meta_info.line_num + 1] + ["\n", assess_meta_info.indent + str(var) + " " + str(value) + "\n"] + lines[assess_meta_info.line_num + 1:]
vars_in_file = process_cfg.vars_from_lines(lines)
else:
log.log("Substituting " + var)
meta = vars_in_file[var]
lines[meta.line_num] = meta.indent + str(var) + " " + str(value) + "\n"
file = open(filename, "w")
file.writelines(lines)
file.close()
#run for given config
def update_thresholds_for_config(args, config):
log.log(" >>>>>>>> Updating " + config)
dataset_info = load_info(config)
contigs = get_contigs_list(args, dataset_info)
full_etalon_contigs_prefix = args.etalon_contigs_prefix
etalon_contigs_suffix = args.etalon_contigs_suffix
if full_etalon_contigs_prefix == "":
if 'contig_storage' in dataset_info.__dict__:
full_etalon_contigs_prefix = os.path.join(dataset_info.contig_storage, "latest_")
etalon_contigs_suffix = ""
else:
log.err("Etalon contigs are not set and contigs storage is not found in info file")
return
log.log("Using etalon contigs prefix: " + full_etalon_contigs_prefix)
to_update = {}
for name, file_name, prefix, opts, ext in contigs:
if ext != "fasta" and ext != "fa":
continue
log.log("======= PROCESSING " + name.upper() + " =======")
if prefix != "":
prefix = prefix + "_"
to_update.update(quast_run_and_update(dataset_info, full_etalon_contigs_prefix + name + etalon_contigs_suffix + "." + ext, os.path.join(args.tmp_dir, "QUAST_RESULTS_" + dataset_info.name.upper() + "_" + name.upper()), name, prefix, opts))
substitute_params(config, to_update)
log.log(" <<<<<<<< Finished updating " + config)
#update threshold for all configs in folder
def update_thresholds_for_folder(args, config_dir):
configs = sorted(glob.glob(os.path.join(config_dir, "*.info")))
if len(configs) == 0:
return
for config in configs:
update_thresholds_for_config(args, config)
#recursively update threshold for all configs in folder
def update_thresholds_recursive(args, base_dir):
update_thresholds_for_folder(args, base_dir)
files = sorted(glob.glob(os.path.join(base_dir, "*")))
for f in files:
if os.path.isdir(f):
update_thresholds_recursive(args, f)
def check_args(args):
correct = True
if args.config == "" and args.config_dir == "":
log.err("Either config file or config dir should be set")
correct = False
if args.config != "" and args.config_dir != "":
log.err("Both config file or config dir cannot be set")
correct = False
if (args.etalon_contigs_prefix != "" or args.etalon_contigs_suffix) and args.config_dir != "":
log.err("Etalon contigs cannot be set when using multiple config files from config")
correct = False
return correct
### main ###
try:
if len(sys.argv) == 1:
command = 'python3 {} -h'.format(sys.argv[0])
subprocess.call(command, shell=True)
sys.exit(1)
sys.stderr = sys.stdout
exit_code = 0
args = parse_args_ut()
if not check_args(args):
log.err("Correct arguments and run again")
sys.exit(3)
if os.path.exists(args.tmp_dir) and not args.preserve_tmp_dir:
shutil.rmtree(args.tmp_dir)
if not os.path.exists(args.tmp_dir):
os.makedirs(args.tmp_dir)
if args.config != "":
update_thresholds_for_config(args, args.config)
else:
update_thresholds_recursive(args, args.config_dir)
if os.path.exists(args.tmp_dir) and not args.preserve_tmp_dir:
shutil.rmtree(args.tmp_dir)
except SystemExit:
raise
except:
log.err("The following unexpected error occured during the run:")
print_exc()
sys.exit(239)
|