File: get_data.py

package info (click to toggle)
hisat2 2.2.1-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 19,448 kB
  • sloc: cpp: 97,109; python: 11,075; perl: 7,279; sh: 2,328; ansic: 1,458; makefile: 532; javascript: 273; java: 116
file content (102 lines) | stat: -rwxr-xr-x 3,787 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/python3

import sys, os
from argparse import ArgumentParser, FileType

def _run_shell(cmd):
    """Echo *cmd* to stderr, run it through the shell, and report failures.

    Returns True when ``os.system`` reports status 0 and False otherwise.
    A failure is reported on stderr but never raised, so one unavailable
    download does not abort the remaining ones.
    """
    print(cmd, file=sys.stderr)
    status = os.system(cmd)
    if status != 0:
        print("Warning: command failed (status %d): %s" % (status, cmd),
              file=sys.stderr)
    return status == 0


def _enter_dir(*parts):
    """Create the directory ``os.path.join(*parts)`` if needed and chdir into it."""
    path = os.path.join(*parts)
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(path, exist_ok=True)
    os.chdir(path)


def _download_file(url, name):
    """Fetch *url* into the current directory unless *name* already exists.

    *name* must be the last path component of *url*, because that is the
    file name wget writes to by default.
    """
    if os.path.exists(name):
        return
    if not _run_shell("wget %s" % url):
        # A failed or interrupted transfer can leave a truncated file behind.
        # Remove it so the existence check above does not mistake it for a
        # finished download on the next run.
        if os.path.exists(name):
            os.remove(name)


def _download_archive(base_url, name):
    """Fetch and unpack ``<base_url>/<name>.tar.gz`` unless *name* exists.

    The existence check relies on the archive unpacking into an entry called
    *name* (presumably a directory; not verifiable from this script).
    """
    if os.path.exists(name):
        return
    tarball = name + ".tar.gz"
    try:
        # Only unpack when the download actually succeeded.
        if _run_shell("wget %s/%s" % (base_url, tarball)):
            _run_shell("tar xvzf %s" % tarball)
    finally:
        # The tarball is never kept, whether or not the steps above worked.
        if os.path.exists(tarball):
            os.remove(tarball)


def get_data(small = False):
    """Download the evaluation genome data, aligner indexes, and read sets.

    Files are fetched with ``wget`` (archives are unpacked with ``tar``) into
    ``data/``, ``indexes/``, ``reads/simulation/`` and ``reads/real/`` below
    the current working directory.  Anything that already exists is skipped,
    so the function can be re-run to fetch only what is missing.

    Args:
        small: When true, download only the chromosome-22 test set: the
            whole-genome files, the whole-genome indexes, the real read
            sets, and every read set whose name contains "20M" are skipped.

    Download failures are reported on stderr and do not stop the remaining
    downloads.  The working directory is restored before returning, even if
    an exception is raised.
    """
    data_root = "http://www.ccb.jhu.edu/software/hisat2/downloads/evaluation"

    # Remember where we started and always return there by absolute path:
    # chdir("..") would end up somewhere else if one of the target
    # directories is a symlink.
    start_dir = os.getcwd()
    try:
        # Download the reference human genome, SNPs, and gene annotations
        _enter_dir(start_dir, "data")
        genome_files = ["genome.fa", "genome.fa.fai", "genome.gtf",
                        "snpCommon.txt", "genome.snp", "genome.ss",
                        "genome.exon"]
        small_genome_files = ["22.fa", "22.fa.fai", "22.gtf", "22.snp",
                              "22.ss", "22.exon",
                              "22_20-21M.fa", "22_20-21M.fa.fai",
                              "22_20-21M.gtf", "22_20-21M.snp",
                              "22_20-21M.ss", "22_20-21M.exon"]
        if small:
            names = small_genome_files
        else:
            names = genome_files + small_genome_files
        for name in names:
            _download_file("%s/data/%s" % (data_root, name), name)

        # Download indexes
        _enter_dir(start_dir, "indexes")
        aligners = ["HISAT2", "HISAT", "Bowtie", "STAR", "GSNAP"]
        for genome in ["genome", "22", "22_20-21M"]:
            if small and genome == "genome":
                continue
            for aligner in aligners:
                # Whole-genome indexes carry no genome suffix.
                if genome == "genome":
                    aligner_dir = aligner
                else:
                    aligner_dir = aligner + "_" + genome
                _download_archive("%s/indexes" % data_root, aligner_dir)

        # Download simulated and real reads
        for read_type in ["simulation", "real"]:
            if small and read_type == "real":
                continue
            _enter_dir(start_dir, "reads", read_type)
            if read_type == "simulation":
                read_sets = ["1M_DNA_reads_22",
                             "1M_DNA_mismatch_reads_22",
                             "1M_DNA_snp_reads_22",
                             "1M_DNA_mismatch_snp_reads_22",
                             "1M_RNA_reads_22",
                             "1M_RNA_constant_reads_22",
                             "1M_RNA_mismatch_reads_22",
                             "1M_RNA_snp_reads_22",
                             "1M_RNA_mismatch_snp_reads_22",
                             "1M_RNA_reads_22_20-21M",
                             "20M_DNA_reads_genome",
                             "20M_DNA_snp_reads_genome",
                             "20M_RNA_reads_genome",
                             "20M_RNA_snp_reads_genome"]
            else:
                read_sets = ["108M_RNA_wgEncodeCshlLongRnaSeq",
                             "62M_RNA_SRR353653",
                             "80M_DNA_SRR345300",
                             "5M_DNA_NA12878D"]
            for read_set in read_sets:
                if small and "20M" in read_set:
                    continue
                _download_archive("%s/reads/%s" % (data_root, read_type),
                                  read_set)
    finally:
        os.chdir(start_dir)
    
    
if __name__ == "__main__":
    # Command-line entry point: the only option is -s/--small, which
    # restricts the download to the small test set.
    cli = ArgumentParser(description='Get reference genome, annotations, and indexes')
    cli.add_argument(
        '-s', '--small',
        dest='small',
        action='store_true',
        default=False,
        help='small testset',
    )
    options = cli.parse_args()
    get_data(options.small)