File: mi.R

package info (click to toggle)
r-cran-bios2cor 2.2.2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,520 kB
  • sloc: makefile: 5
file content (128 lines) | stat: -rw-r--r-- 4,009 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# Package: Bios2cor 
# This file is part of Bios2cor R package.
# Bios2cor is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# See the GNU General Public License at:
# http://www.gnu.org/licenses/

mi <- function (align, gap_ratio = 0.2) {

    if ((gap_ratio < 0) | (gap_ratio > 1)) {
      stop("gap_ratio must be in the [0,1] range.")
    }

    diag <- 0	

    msa<-align
    MSA <- matrix(as.vector(unlist(msa)), ncol= length(msa[[1]]), byrow= TRUE)
    nb_pos <- length(MSA[1,]) #number of positions in the alignment
    nb_seq <- length(MSA[,1]) #number of sequences in the alignment
    colnames(MSA)<-c(1:nb_pos)
    pos_names <- colnames(MSA) 
	
    gap <- 1-gap_ratio      #gap value indicates the minimal ratio of aa to nb_seq in the MSA 
    if (gap < 1/nb_seq) {
      gap <- 1/nb_seq     # positions must have at leat ONE aa to be taken into account (removes gap column)
    }

    names<-c("A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y","-")
    AA<-lapply(1:length(MSA[1,]),function(i){t(table(c(MSA[,i],names),row.names=c(1:(length(MSA[,i])+21))))[1:length(MSA[,i]),-1]})

    names<-c("A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y")
    nb_aa <- length(names)
	
    # Determining valid positions with correct gap ratio (equal to 1 - gap argument)
    Valid_pos <- c()
    for(i in 1:nb_pos){
       mat_i <- AA[[i]] #matrix nb_seq*nb_aa
        S_i <- colSums(AA[[i]])
        Tot_i <- sum(S_i)
        if (Tot_i/nb_seq >= gap) {
           Valid_pos <- c(Valid_pos, i)
        }
    }
    nb_Valid_pos <- length(Valid_pos)
	
    MI <- matrix(0, ncol= nb_pos, nrow= nb_pos)
    freq_ij <- matrix(0, ncol= nb_aa, nrow= nb_aa, dimnames=list(names, names))
	
    # Calculating MI score for each valid position
    for(i in 1:nb_Valid_pos){
	pos_i <- Valid_pos[i] #current valid position
	mat_i <- AA[[pos_i]] #matrix nb_seq*nb_aa
	    
	message(paste("pos_i : ", pos_i, "\n"))
	    
	#amino acids frequency at position i in the alignment
	freq_i <- colSums(mat_i)/nb_seq
	    
	for(j in i:nb_Valid_pos){
	    pos_j <- Valid_pos[j]
	    mat_j <- AA[[pos_j]] #matrix nb_seq*nb_aa
		
	    #amino acids frequency at position j in the alignment
	    freq_j <- colSums(mat_j)/nb_seq #matrix 1*nb_aa

	    #amino acids frequency at positions i AND j in the alignment
	    freq_p <- ((t(mat_i))%*%mat_j)/nb_seq #matrix nb_aa*nb_aa
		
	    for (k in names) {
	        for (l in names) {
	    	    freq_ij[k,l]<-freq_i[k]*freq_j[l]
		}
	    }
		
	LOG<-(log(freq_p, base=400)-log(freq_ij, base=400))
	LOG<-replace(LOG, which(LOG=="-Inf"),0)
	LOG<-replace(LOG, which(LOG=="NaN"),0)
		
	MI[pos_i, pos_j]<-sum((freq_p)*(LOG))
	}
    }
	

    COV2<-matrix(0, ncol= nb_pos, nrow= nb_pos)
	
    # MI final value
    COV2 <- MI 
	
    # Setting columns and rows names before matrix reduction
    rownames(COV2)<-pos_names
    colnames(COV2)<-pos_names
	
    COV2 <- COV2+t(COV2) #Complete the second triangular part of the matrix
    diag(COV2) <- diag
	
    # Reduction of the final correlation matrix to the valid positions
    COV2 <- COV2[Valid_pos,]
    COV2 <- COV2[,Valid_pos]
	
    # Removing "Inf" and "NaN" values
    COV2[is.infinite(COV2)] <- 0
    COV2[is.na(COV2)] <- 0
	
    res <- list()
    res$score <- COV2
	
    # Compute and save matrix of Z_scores
    # Mean and stdev must be calculated on off diagonal elements  
    mean_up <- mean(COV2[upper.tri(COV2)])
    stdev <- sd(COV2[upper.tri(COV2)])
    
    COV2 <- (COV2-mean_up)/stdev
    diag(COV2) <- diag
       
    res$Zscore <- COV2
	
    return(res)

}