File: metatximeta.R

package info (click to toggle)
r-bioc-tximeta 1.16.1%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 860 kB
  • sloc: makefile: 2
file content (120 lines) | stat: -rw-r--r-- 4,637 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
hashit <- function(source, organism, release, catNC=FALSE, ...) {

  if (source == "GENCODE") {
    if (organism == "Homo sapiens") {
      fasta <- paste0("ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_",
                      release,"/gencode.v",release,".transcripts.fa.gz")
      gtf <- paste0("ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_",
                    release,"/gencode.v",release,".annotation.gtf.gz")
      genome <- "GRCh38"
    }
    if (organism == "Mus musculus") {
      fasta <- paste0("ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_",
                      release,"/gencode.v",release,".transcripts.fa.gz")
      gtf <- paste0("ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_",
                    release,"/gencode.v",release,".annotation.gtf.gz")
      genome <- "GRCm38"
    }
  }
  
  if (source == "Ensembl") {
    org <- sub(" ","_",tolower(organism))
    org_upper <- sub(" ","_",organism)
    genome <- if (organism == "Homo sapiens") {
                "GRCh38"
              } else if (organism == "Mus musculus") {
                if (as.numeric(release) >= 103) {
                  "GRCm39"
                } else {
                  "GRCm38"
                }
              } else if (organism == "Drosophila melanogaster") {
                if (as.numeric(release) >= 103) {
                  "BDGP6.32"
                } else if (as.numeric(release) >= 99) {
                  "BDGP6.28"
                } else if (as.numeric(release) >= 96) {
                  "BDGP6.22"
                } else {
                  "BDGP6"
                }
              } 
  
    fasta <- paste0("ftp://ftp.ensembl.org/pub/release-",
                    release,"/fasta/",org,"/cdna/",org_upper,".",genome,".cdna.all.fa.gz")
    if (catNC) {
      nc <- paste0("ftp://ftp.ensembl.org/pub/release-",
                   release,"/fasta/",org,"/ncrna/",org_upper,".",genome,".ncrna.fa.gz")
      fasta <- c(fasta, nc)
    }
    gtf <- paste0("ftp://ftp.ensembl.org/pub/release-",
                  release,"/gtf/",org,"/",org_upper,".",genome,".",release,".gtf.gz")

  }

  if (source == "RefSeq") {
    org_upper <- sub(" ","_",organism)
    chop <- function(x) as.numeric(sub(".*\\.p","",x))
    if (organism == "Homo sapiens") {
      genome <- "GRCh38"
      release_num <- chop(release) + 26
      gcf <- "GCF_000001405"
    } else if (organism == "Mus musculus") {
      genome <- "GRCm38"
      release_num <- chop(release) + 20
      gcf <- "GCF_000001635"
    }
    fasta <- paste0("ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/",org_upper,
                    "/all_assembly_versions/",gcf,".",release_num,"_",release,"/",
                    gcf,".",release_num,"_",release,"_rna.fna.gz")
    gtf <- paste0("ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/",org_upper,
                  "/all_assembly_versions/",gcf,".",release_num,"_",release,"/",
                  gcf,".",release_num,"_",release,"_genomic.gff.gz")
  }

  #message(fasta)
  #message(gtf)
  
  stopifnot(all(sapply(fasta, RCurl::url.exists)))
  stopifnot(RCurl::url.exists(gtf))
  if (catNC) {
    for (f in fasta) {
      download.file(f, basename(f), ...)
    }
    system(paste("cat",paste(basename(fasta),collapse=" "),"> transcripts.fa.gz"))
    for (f in fasta) {
      system(paste("rm -f",basename(f)))
    }
  } else {
    download.file(fasta, "transcripts.fa.gz", ...)
  }
  system("gunzip transcripts.fa.gz")
  system("compute_fasta_digest --reference transcripts.fa --out hash.json")
  sha256 <- jsonlite::fromJSON("hash.json")$seq_hash
  df <- data.frame(source, organism, release, genome, paste(fasta, collapse=" "), gtf, sha256)
  write.table(df, file="newrows.csv", append=TRUE, quote=FALSE, sep=",", row.names=FALSE, col.names=FALSE)
  system("rm -f transcripts.fa")
  system("rm -f hash.json")
  message("done!")
}

# download method
m <- "wget"

i <- 43
hashit("GENCODE", "Homo sapiens", i, method=m)
i <- 32
hashit("GENCODE", "Mus musculus", paste0("M",i), method=m)
#
i <- 109
hashit("Ensembl", "Homo sapiens", i, method=m)
hashit("Ensembl", "Mus musculus", i, method=m)
hashit("Ensembl", "Drosophila melanogaster", i, method=m)
#
hashit("Ensembl", "Homo sapiens", i, catNC=TRUE, method=m)
hashit("Ensembl", "Mus musculus", i, catNC=TRUE, method=m)
hashit("Ensembl", "Drosophila melanogaster", i, catNC=TRUE, method=m)
#
# RefSeq permalink transcriptomes are rarely updated
# hashit("RefSeq", "Homo sapiens", paste0("GRCh38.p",i), method=m)
# hashit("RefSeq", "Mus musculus", paste0("GRCm38.p",i), method=m)