File: datasets_info2yaml_converter.py

package info (click to toggle)
spades 3.13.1+dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 22,172 kB
  • sloc: cpp: 136,213; ansic: 48,218; python: 16,809; perl: 4,252; sh: 2,115; java: 890; makefile: 507; pascal: 348; xml: 303
file content (122 lines) | stat: -rwxr-xr-x 4,719 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/python3

############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################


import os
import sys
import glob

sys.path.append(os.path.join(sys.path[0], "../../spades_pipeline/"))
sys.path.append(os.path.join(sys.path[0], "../../../ext/src/python_libs/"))
from process_cfg import *
if sys.version.startswith('2.'):
    import pyyaml2 as pyyaml
elif sys.version.startswith('3.'):
    import pyyaml3 as pyyaml
import support

########################################################################

# Command-line handling. Exactly one argument is expected: either a directory
# that contains *.info files or the path of a single .info file.
# On success this section defines:
#   info_files  - list of .info file paths to convert
#   working_dir - directory where the .yaml files are written
if len(sys.argv) != 2:
    print("Converts datasets from .info format into new .yaml format (for each .info file output is modified .info file and new .yaml file)\n")
    print("Usage: " + sys.argv[0] + " <a DIRECTORY with datasets in .info format or a SINGLE .info FILE>")
    # sys.exit is used instead of the site-provided exit(), which is meant for
    # the interactive shell and is unavailable when site is not loaded.
    sys.exit(0)

input_dir_or_file = sys.argv[1]
if os.path.isdir(input_dir_or_file):
    working_dir = input_dir_or_file
    info_files = glob.glob(os.path.join(working_dir, "*.info"))
    if not info_files:
        # Nothing to convert; treated as a no-op rather than a failure.
        print(".info files not found in " + working_dir + " directory!")
        sys.exit(0)
elif os.path.isfile(input_dir_or_file):
    info_files = [input_dir_or_file]
    working_dir = os.path.dirname(input_dir_or_file)
else:
    print(input_dir_or_file + " is not exist!")
    # A missing input path is a real failure: report it through the exit status
    # so that calling scripts can detect it.
    sys.exit(1)

# aux function
def add_to_dataset(option, data, dataset_data, lib_type='pe'):
    """Store one value in the library record selected by lib_type.

    option       -- option string (this script passes '-1', '-2', '--12', '-s');
                    it is translated into a record key by support.get_data_type
    data         -- the value to store
    dataset_data -- indexable pair of dicts, modified in place:
                    index 0 is the paired-end record, index 1 the mate-pairs one
    lib_type     -- 'pe' selects the paired-end record; any other value
                    selects the mate-pairs record
    """
    key = support.get_data_type(option)

    paired_end = (lib_type == 'pe')
    record = dataset_data[0] if paired_end else dataset_data[1]

    # A still-empty record is new: tag it with its library type first.
    if not record:
        record['type'] = 'paired-end' if paired_end else 'mate-pairs'

    if not key.endswith('reads'):
        # Non-read values are stored as-is, replacing any previous value.
        record[key] = data
        return

    # Read entries accumulate in a list under their key.
    if key in record:
        record[key].append(data)
    else:
        record[key] = [data]


def load_dataset_from_info_file(info_file):
    """Parse a .info file into a dict mapping each key to its value string.

    Every non-blank line is split at the first run of whitespace: the first
    token is the key, the rest of the line (stripped) is the value. A line
    that holds only a key maps to an empty string. If a key occurs more than
    once, the last occurrence wins.

    info_file -- path of the .info file to read
    Returns a dict of str -> str.
    """
    content = dict()
    # The context manager guarantees the handle is closed even on errors.
    with open(info_file) as handle:
        for line in handle:
            # split(None, 1) ignores leading whitespace, so indented lines
            # yield the right key/value pair instead of a mis-sliced value.
            fields = line.split(None, 1)
            if not fields:
                continue  # blank line
            key = fields[0]
            value = fields[1].strip() if len(fields) > 1 else ""
            content[key] = value
    return content


# Main conversion loop. For every .info file: collect its read entries into a
# two-record dataset description, dump that description to a .yaml file of the
# same base name in working_dir, and append a "reads" link to the .info file.
# Output uses print(<single string>) / stream.write so that the script runs
# under both Python 2 and Python 3.
for info_file in info_files:
    yaml_file = os.path.join(working_dir, os.path.splitext(os.path.basename(info_file))[0] + ".yaml")
    if os.path.isfile(yaml_file):
        print("Skipping " + info_file + " because corresponding .yaml file already exists")
        continue
    # No newline here: the status message for this file continues the line.
    sys.stdout.write("\tProcessing " + info_file + " ")
    dataset_data = [{}, {}]  # [paired-end record, mate-pairs record]
    content = load_dataset_from_info_file(info_file)
    if "reads" in content:
        print("\nSkipping " + info_file + " because it contains link to a .yaml file! (" + content["reads"] + ")")
        continue
    for key, value in content.items():
        # Only read-related entries are converted; everything else is ignored.
        if "_reads" not in key and "jumping_" not in key:
            continue
        lib_type = 'mp' if "jumping_" in key else 'pe'
        # Values may be wrapped in double quotes; drop the first and last char.
        if value.startswith('"'):
            value = value[1:-1].strip()
        reads = value.split()
        if not reads:
            # Guard against an empty value, which would otherwise raise
            # IndexError on reads[0] below.
            sys.stderr.write("\nWarning: no reads specified for " + key + " in " + info_file + ", skipping this entry\n")
            continue
        if key.startswith("paired_reads"):
            if len(reads) == 1:
                # A single file for a paired library is passed as '--12'.
                add_to_dataset('--12', reads[0], dataset_data, lib_type)
            else:
                add_to_dataset('-1', reads[0], dataset_data, lib_type)
                add_to_dataset('-2', reads[1], dataset_data, lib_type)
        elif "single" in key:
            for read in reads:
                add_to_dataset('-s', read, dataset_data, lib_type)
        elif key == "jumping_first":
            add_to_dataset('-1', reads[0], dataset_data, lib_type)
        elif key == "jumping_second":
            add_to_dataset('-2', reads[0], dataset_data, lib_type)
        else:
            sys.stderr.write("\nError: reads are not paired and not single!\n")

    # NOTE(review): support.correct_dataset is defined outside this file;
    # presumably it normalizes the records (e.g. drops empty ones) - confirm.
    dataset_data = support.correct_dataset(dataset_data)
    print('...writing to .yaml: ' + yaml_file + " and updating .info file with link to .yaml: " + info_file)
    # open() replaces the Python 2-only file() builtin; the context managers
    # make sure both files are flushed and closed.
    with open(yaml_file, 'w') as yaml_handle:
        pyyaml.dump(dataset_data, yaml_handle)
    with open(info_file, 'a') as info:
        info.write("\n")
        info.write("reads\t" + os.path.basename(yaml_file) + "\n")