File: JASPAR.R

package info (click to toggle)
r-bioc-tfbstools 1.20.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 920 kB
  • sloc: xml: 1,137; ansic: 590; asm: 54; sh: 13; makefile: 2
file content (84 lines) | stat: -rw-r--r-- 3,468 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
### -----------------------------------------------------------------
### make FlatFileDir from JASPAR database: including *.pfm and matrix_list.txt
### The matrix_list.txt has columns: ID, InformationContent, Name, Class, Tags
### Exported!
makeFlatFileDir <- function(JASPAR){
  outputDir <- "FlatFileDir"
  dir.create(outputDir)
  
  # Make the matrix_list.txt
  pfms <- getMatrixSet(JASPAR, opts=list(all=TRUE))
  pfmIDs <- ID(pfms)
  pfmICs <- sapply(pfms, function(x){sum(totalIC(toICM(x)))})
  pfmNames <- name(pfms)
  ## There are cases that have more than two classes
  pfmClasses <- sapply(matrixClass(pfms), paste, collapse="/")
  pfmTags <- lapply(pfms, function(x){sapply(x@tags, paste, 
                                              collapse="/")})
  pfmTags <- sapply(pfmTags, function(x){x[order(names(x))]})
  pfmTags <- sapply(pfmTags, function(x){paste(";", paste0(names(x), " \"", x, 
                                                "\" ", collapse="; "))})
  ans <- cbind(data.frame(IDs=pfmIDs,
                          ICs=pfmICs,
                          Names=pfmNames,
                          Classes=pfmClasses,
                          Tags=pfmTags
                          ))
  write.table(ans, file=file.path(outputDir, "matrix_list.txt"),
              quote=FALSE, sep="\t",
              col.names = FALSE, row.names = FALSE)
  
  # Make the *.pfm
  lapply(pfms, function(x){write.table(x@profileMatrix,
                                       file=file.path(outputDir,
                                                      paste0(x@ID, ".pfm")),
                                       quote=FALSE, sep="\t",
                                       col.names = FALSE, row.names = FALSE)})
  return("success")
}

### -----------------------------------------------------------------
### readJASPARMatrix: read the jaspar format PFM in txt file
### "individual" format:
### >MA0001.1 AGL3
### A  [ 0  3 79 40 66 48 65 11 65  0 ]
### C  [94 75  4  3  1  2  5  2  3  3 ]
### G  [ 1  0  3  4  1  0  5  3 28 88 ]
### T  [ 2 19 11 50 29 47 22 81  1  6 ]
### "all" format: multiple "individual" matrices and seperated with a blank line
### Exported
.processJASPARText <- function(text){
  ID <- sub("^>", "", strsplit(text[1], "\t")[[1]][1])
  name <- strsplit(text[1], "\t")[[1]][2]
  if(!identical(substr(text[2:5], 1, 1), DNA_BASES)){
    stop("The second to fifth lines of the file must start with",
         "`A`, `C`, `G`, `T`, respectively.")
  }
  profileMatrix <- do.call(rbind, strsplit(sub(" *]$", "", 
                                               sub("^(A|C|G|T)  \\[ *", "",
                                                   text[2:5])), " +"))
  mode(profileMatrix) <- "integer"
  rownames(profileMatrix) <- DNA_BASES
  ans <- PFMatrix(ID=ID, name=name, profileMatrix=profileMatrix)
}

readJASPARMatrix <- function(fn, type=c("individual", "all")){
  type <- match.arg(type)
  text <- readLines(fn)
  if(type == "individual"){
    if(length(text) != 5L){
      stop("The `individual` format is supposed to have 5 lines!")
    }
    ans <- .processJASPARText(text)
  }else{
    if(length(text) %% 6 != 0L){
      stop("The `all` format is supposed to have a number of lines",
           "mutipled by 6!")
    }
    text2 <- split(text, rep(1:(length(text)/6), rep(6, length(text)/6)))
    ans <- lapply(text2, .processJASPARText)
    names(ans) <- lapply(ans, name)
    ans <- do.call(PFMatrixList, ans)
  }
  return(ans)
}