File: download_NCTC_pipeline.py

package info (click to toggle)
hinge 0.5.0-8
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 2,972 kB
  • sloc: cpp: 9,480; ansic: 8,826; python: 5,023; sh: 340; makefile: 10
file content (50 lines) | stat: -rw-r--r-- 1,252 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import json
import os
import sys
import subprocess

# Root of the local NCTC data tree; the default metadata index (NCTC.json)
# lives directly inside it, and downloads go into per-bacterium subfolders.
base_dir = '/data/pacbio_assembly/pb_data/NCTC/'

# Usage: <script> <bacterium_id> [<metadata.json>]
# The bacterium id is a key of the metadata dictionary, e.g. NCTC7972.
if len(sys.argv) < 2:
    sys.exit('usage: ' + sys.argv[0] + ' <bacterium_id> [<metadata.json>]')

bacterium_of_interest = sys.argv[1]

# An optional second argument overrides the default metadata file.
# NOTE(review): the original assigned the raw argument string to bact_dict,
# which made the lookup below fail with a TypeError; it is treated here as the
# path of an alternative JSON file with the same layout -- confirm with users.
if len(sys.argv) > 2:
    bact_dict_path = sys.argv[2]
else:
    bact_dict_path = base_dir + 'NCTC.json'

# Use a context manager so the file handle is closed right after parsing.
with open(bact_dict_path) as bact_dict_file:
    bact_dict = json.load(bact_dict_file)

# First 'Species' entry with every run of whitespace collapsed to a single
# underscore, so it can be used as a file name stem.
bact_name = "_".join(bact_dict[bacterium_of_interest]['Species'][0].split())

# Aspera (ascp) command prefix used to fetch one remote file; the remote path
# taken from the metadata is appended directly after 'vol1/'.
cmd_base = 'ascp -QT -l 1000m -i /data/pacbio_assembly/pb_data/asperaweb_id_dsa.openssh era-fasp@fasp.ega.ebi.ac.uk:vol1/'
dest_dir = base_dir+bacterium_of_interest+'/'

# Create the per-bacterium download folder (and any missing parents) without
# spawning a shell; an already existing folder is fine.
os.makedirs(dest_dir, exist_ok=True)

# Pre-split the prefix once: every element is a separate ascp argument and the
# last one is the remote location that each file path gets appended to.
ascp_prefix = cmd_base.split()

# 'file_paths' maps a run to a list of remote paths; only the lists are needed.
for file_list in bact_dict[bacterium_of_interest]['file_paths'].values():
    for file_path in file_list:
        cmd = cmd_base + file_path + ' ' + dest_dir
        print(cmd)

        # Pass an argument list instead of a shell string so that whitespace
        # or shell metacharacters in a metadata path cannot alter the command.
        ascp_args = ascp_prefix[:-1] + [ascp_prefix[-1] + file_path, dest_dir]
        try:
            status = subprocess.call(ascp_args)
        except OSError as err:
            # ascp missing or not executable: report it and keep going, as the
            # original shell-based call did not abort the script either.
            print('could not run ascp for {}: {}'.format(file_path, err))
            continue

        # Downloads stay best-effort, but a failure is no longer silent.
        if status != 0:
            print('download failed (exit status {}): {}'.format(status, file_path))

# Output path stem handed to dextract: <dest_dir><species name with underscores>.
dest_fasta_name = dest_dir+bact_name

# Every downloaded .bax.h5 file in the destination folder is an input.
bax_files = [x for x in os.listdir(dest_dir) if x.endswith('.bax.h5')]

# Build the command as an argument list (the output path is attached directly
# to -o) so that paths containing whitespace survive intact.
dextract_args = ['dextract', '-o' + dest_fasta_name]
dextract_args.extend(dest_dir + bax_file for bax_file in bax_files)

# Printed form of the command, one space between arguments.
dextract_cmd = ' '.join(dextract_args)
print(dextract_cmd)


def _remove_matching(directory, suffix):
    """Delete the non-hidden entries of *directory* whose names end with *suffix*.

    *directory* must end with a path separator. Hidden entries are skipped to
    mirror what a shell ``*<suffix>`` glob would match. A file that cannot be
    removed is reported and skipped rather than aborting the cleanup.
    """
    for name in os.listdir(directory):
        if name.startswith('.') or not name.endswith(suffix):
            continue
        try:
            os.remove(directory + name)
        except OSError as err:
            print('could not remove {}: {}'.format(directory + name, err))


try:
    subprocess.check_output(dextract_args)
except subprocess.CalledProcessError as err:
    # dextract ran but reported failure; keep the inputs for a retry.
    print('error: dextract exited with status {}'.format(err.returncode))
except OSError as err:
    # dextract could not be started at all (e.g. not installed / not on PATH).
    print('error: could not run dextract: {}'.format(err))
else:
    # Only clean up once the extraction has succeeded.
    print('dextract done. deleting .bax.h5 files')
    _remove_matching(dest_dir, '.bax.h5')
    print('removing .quiva files')
    _remove_matching(dest_dir, '.quiva')