1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
|
#!/usr/bin/python3
############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################
import os
import sys
import glob
sys.path.append(os.path.join(sys.path[0], "../../spades_pipeline/"))
sys.path.append(os.path.join(sys.path[0], "../../../ext/src/python_libs/"))
from process_cfg import *
if sys.version.startswith('2.'):
import pyyaml2 as pyyaml
elif sys.version.startswith('3.'):
import pyyaml3 as pyyaml
import support
########################################################################
# Command-line handling: the single argument is either a directory that is
# scanned for *.info files or the path of one .info file. This section
# defines `info_files` (the files to convert) and `working_dir` (where the
# .yaml files are written).
# sys.exit() is used instead of the interactive helper exit(), which is
# provided by the `site` module and is not meant for use in programs.
# Exit statuses are deliberately left at 0 so existing callers see no change.
if len(sys.argv) != 2:
    print("Converts datasets from .info format into new .yaml format (for each .info file output is modified .info file and new .yaml file)\n")
    print("Usage: " + sys.argv[0] + " <a DIRECTORY with datasets in .info format or a SINGLE .info FILE>")
    sys.exit(0)

input_dir_or_file = sys.argv[1]
if os.path.isdir(input_dir_or_file):
    working_dir = input_dir_or_file
    info_files = glob.glob(os.path.join(working_dir, "*.info"))
    if not info_files:
        print(".info files not found in " + working_dir + " directory!")
        sys.exit(0)
elif os.path.isfile(input_dir_or_file):
    info_files = [input_dir_or_file]
    # .yaml output goes next to the given .info file
    working_dir = os.path.dirname(input_dir_or_file)
else:
    print(input_dir_or_file + " is not exist!")
    sys.exit(0)
# aux function
def add_to_dataset(option, data, dataset_data, lib_type='pe'):
    """Record `data` for command-line style `option` in the matching library.

    `dataset_data` is indexed by library kind: element 0 is used when
    `lib_type` is 'pe', element 1 for any other value (mate-pairs). The
    selected record is a dict that is modified in place; nothing is returned.

    The key under which `data` is stored is whatever
    `support.get_data_type(option)` returns. Keys ending in 'reads'
    accumulate their values in a list; every other key holds a single value
    that is overwritten on each call.
    """
    data_type = support.get_data_type(option)
    is_paired_end = lib_type == 'pe'
    record = dataset_data[0 if is_paired_end else 1]

    # an empty record is new: give it its library type first
    if not record:
        record['type'] = 'paired-end' if is_paired_end else 'mate-pairs'

    if not data_type.endswith('reads'):
        # non-reads values are stored as plain values
        record[data_type] = data
    elif data_type in record:
        record[data_type].append(data)
    else:
        record[data_type] = [data]
def load_dataset_from_info_file(info_file):
    """Parse a .info file into a dict of key -> value strings.

    Each non-blank line is split on its first run of whitespace: the first
    token is the key and the rest of the line, stripped of surrounding
    whitespace, is the value (an empty string when the line holds only a
    key). Blank lines are skipped. If a key occurs more than once, the last
    occurrence wins.

    The file is closed before returning, and lines that start with
    whitespace are parsed correctly (the key is located by splitting rather
    than by assuming it starts at column 0).
    """
    content = dict()
    with open(info_file) as handle:
        for line in handle:
            fields = line.split(None, 1)
            if not fields:  # blank or whitespace-only line
                continue
            key = fields[0]
            value = fields[1].strip() if len(fields) > 1 else ""
            content[key] = value
    return content
# Convert every .info file: collect its read libraries, dump them to a .yaml
# file in `working_dir` and append a "reads" entry naming that .yaml file to
# the .info file. Files that already have a .yaml counterpart, or that
# already contain a "reads" entry, are skipped.
# Output goes through print(<single string>) and sys.stdout/sys.stderr.write
# so the code runs under both Python 2 and Python 3.
for info_file in info_files:
    yaml_file = os.path.join(working_dir, os.path.splitext(os.path.basename(info_file))[0] + ".yaml")
    if os.path.isfile(yaml_file):
        print("Skipping " + info_file + " because corresponding .yaml file already exists")
        continue

    # no newline here: the status line is completed by a later message
    sys.stdout.write("\tProcessing " + info_file + " ")
    dataset_data = [{}, {}]  # [paired-end record, mate-pairs record]
    content = load_dataset_from_info_file(info_file)
    if "reads" in content:
        print("\nSkipping " + info_file + " because it contains link to a .yaml file! (" + content["reads"] + ")")
        continue

    for k, v in content.items():
        if "_reads" not in k and "jumping_" not in k:
            continue  # not a reads entry
        lib_type = 'mp' if "jumping_" in k else 'pe'
        if v.startswith('"'):
            v = v[1:-1].strip()  # drop the surrounding quotes
        reads = v.split()
        if k.startswith("paired_reads"):
            if len(reads) == 1:
                # a single file is passed with the '--12' option
                add_to_dataset('--12', reads[0], dataset_data, lib_type)
            else:
                add_to_dataset('-1', reads[0], dataset_data, lib_type)
                add_to_dataset('-2', reads[1], dataset_data, lib_type)
        elif "single" in k:
            for read in reads:
                add_to_dataset('-s', read, dataset_data, lib_type)
        elif k == "jumping_first":
            add_to_dataset('-1', reads[0], dataset_data, lib_type)
        elif k == "jumping_second":
            add_to_dataset('-2', reads[0], dataset_data, lib_type)
        else:
            sys.stderr.write("\nError: reads are not paired and not single!\n")

    dataset_data = support.correct_dataset(dataset_data)
    print("...writing to .yaml: " + yaml_file + " and updating .info file with link to .yaml: " + info_file)
    # context managers make sure both files are flushed and closed
    with open(yaml_file, 'w') as yaml_handle:
        pyyaml.dump(dataset_data, yaml_handle)
    with open(info_file, 'a') as info_handle:
        info_handle.write("\n")
        info_handle.write("reads\t" + os.path.basename(yaml_file) + "\n")
|