File: datasets_info_cleaner.py

#!/usr/bin/python3

############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################


import os
import sys
import glob

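# make the spades_pipeline modules (process_cfg, support) importable
# relative to this script's location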
sys.path.append(os.path.join(sys.path[0], "../../spades_pipeline/"))
from process_cfg import *
import support

########################################################################

if len(sys.argv) != 2:
    print("Cleans dataset .info files of superfluous information\n")
    print("Usage: " + sys.argv[0] + " <a DIRECTORY with datasets in .info format or a SINGLE .info FILE>")
    exit(0)

input_dir_or_file = sys.argv[1]
if os.path.isdir(input_dir_or_file):
    working_dir = input_dir_or_file
    info_files = glob.glob(os.path.join(working_dir, "*.info"))
    if len(info_files) == 0:
        print("no .info files found in the " + working_dir + " directory!")
        exit(0)
elif os.path.isfile(input_dir_or_file):
    info_files = [input_dir_or_file]
    working_dir = os.path.dirname(input_dir_or_file)
else:
    print(input_dir_or_file + " does not exist!")
    exit(0)

# aux function: stores an option's value in either the paired-end (record 0)
# or mate-pair (record 1) library entry of dataset_data; read-type options are
# accumulated in lists, other options are stored as plain strings
def add_to_dataset(option, data, dataset_data, lib_type='pe'):
    data_type = support.get_data_type(option)    
    if lib_type == 'pe':
        record_id = 0
    else: # mate-pairs
        record_id = 1

    if not dataset_data[record_id]: # setting default values for a new record
        if lib_type == 'pe':
            dataset_data[record_id]['type'] = 'paired-end'
        else:
            dataset_data[record_id]['type'] = 'mate-pairs'            
    if data_type.endswith('reads'): # reads are stored as lists
        if data_type in dataset_data[record_id]:
            dataset_data[record_id][data_type].append(data)
        else:
            dataset_data[record_id][data_type] = [data]
    else: # other values are stored as plain strings
        dataset_data[record_id][data_type] = data


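# parses an .info file into a dict: the first whitespace-separated token of each
# non-empty line becomes the key, the remainder of the line becomes its value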
def load_dataset_from_info_file(info_file):
    content = dict()
    for line in open(info_file):
        if line.strip():
            key = line.split()[0]
            value = line[len(key) + 1:].strip()
            content[key] = value
    return content


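# rewrite each .info file, keeping only the reads link, the single_cell flag and
# the reference_genome path; remaining RL/IS/delta/jump_* fields are written back
# with a "; " prefix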
for info_file in info_files:
    content = load_dataset_from_info_file(info_file)
    if "reads" in content:
        print("...updating .info file " + info_file)
        new_info = open(info_file, 'w')
        new_info.write("reads\t" + content["reads"] + "\n")
        if "single_cell" in content:
            new_info.write("single_cell\t" + content["single_cell"] + "\n")
        if "reference_genome" in content:
            new_info.write("reference_genome\t" + content["reference_genome"] + "\n")
        new_info.write("\n")
        for info_field in ["RL", "IS", "delta", "jump_rl", "jump_is", "jump_delta"]:
            if info_field in content:
                new_info.write("; " + info_field + "\t" + content[info_field] + "\n")
        new_info.close()
    else:
        print "\nSkipping", info_file, "because it doesn't contains link to a .yaml file!\n"