File: procset.r

package info (click to toggle)
r-bioc-arrayexpress 1.66.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 460 kB
  • sloc: makefile: 2
file content (172 lines) | stat: -rw-r--r-- 7,340 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
procset = function(files, procol){
  
  stopifnot(length(procol)==1)
  with(files,
       {
         #ph = try(read.AnnotatedDataFrame(sdrf, path = path, row.names = NULL, blank.lines.skip = TRUE, fill = TRUE, varMetadata.char = "$", quote="\""))
         #Read SDRF
         sdrf = basename(sdrf)
         samples = readPhenoData(sdrf,path);
         if(!inherits(samples, 'try-error')){
           derivedMatrixCol = getSDRFcolumn("DerivedArrayMatrix",varLabels(samples))
           derivedFileCol = getSDRFcolumn("DerivedArrayFile",varLabels(samples))	
         }
         else
           warning("Unable to read SDRF")
         
         if(length(processedFiles) > 1 && length(derivedFileCol) != 0){
           #stop("The processed files contain different numbers/subsets of reporters and cannot be automatically assembled.")
           # READ individual files for each assay
           proceset = readDerivedDataFiles(basename(processedFiles),procol,path)
           assayCol = derivedFileCol
           ADFrefCol = "Reporter.Name"
         }
         else if(length(processedFiles) == 1 && length(derivedMatrixCol) != 0){
           #Read Processed data file
           res = readDerivedDataMatrixFile(basename(processedFiles),procol,path)
           proceset = res$proceset
           assayREF = res$assayREF
           assayCol = grep(assayREF,varLabels(samples))
           ADFrefCol = res$ADFrefCol
         }		
         else
           stop("Unable to read processed data")
         
         
         #attach sample data
         dataSamples = gsub("\\.[a-z][a-z][a-z]$","",sampleNames(proceset),ignore.case=T)
         SDRFsamples = gsub("\\.[a-z][a-z][a-z]$","",pData(samples)[[assayCol]])
         
         if(all(dataSamples %in% SDRFsamples)){
           #if(all(dataSamples %in% rownames(pData(samples)))){
           rownames(pData(samples)) = SDRFsamples
           samples = samples[dataSamples,] #Reorder samples in phenoData to match those in data
           phenoData(proceset) = samples
         } else warning("Cannot attach phenoData")
         
         #read and attach feature data
         features= try(readFeatures(adf=basename(adf),path=path, procADFref=ADFrefCol))
         if(inherits(features, "try-error")){
           warning("ArrayExpress: Unable to attach feature data")
         }else{
           if(all(featureNames(proceset) %in% featureNames(features))){
             features = features[featureNames(proceset),]
             featureData(proceset) = features	
           } else warning("Cannot attach feature data")
         }
         
         #read and attach experiment meta data
         experimentData = try(readExperimentData(idf=basename(idf),path=path))
         if(inherits(experimentData, "try-error")){
           warning("ArrayExpress: Unable to read experiment data");
           experimentData = new("MIAME");
         }
         experimentData(proceset) = experimentData;
         
         if(!validObject(proceset)) 
           warning(validObject(proceset))
         else
           message(paste("\n",gsub(".sdrf.txt","",basename(sdrf))," processed data was successfully loaded into ",class(proceset),"\n"))
         
         return(proceset)
       }) ## with
}

readDerivedDataFiles = function(processedFiles,procol,path){
  
  for(procFile in processedFiles){
    cat("Reading processed File: ",procFile,"\n")
    procData <- try(read.table(file.path(path, basename(procFile)), sep = "\t", 
                               header = TRUE, # interpret header
                               row.names = 1, # header located in line 1
                               stringsAsFactors = FALSE, 
                               na.strings = c('NA','NULL','null')));
    
    procsel = matrix(as.numeric(as.matrix(procData[, procol == colnames(procData)])), nrow = nrow(procData))
    
    if(grep(procFile,processedFiles) == 1){
      E = matrix(procsel)
      rownames(E) = rownames(procData)
      
    }else
      E = cbind(E,procsel)
  }
  
  ###TEMP###
  colnames(E) = gsub("\\.txt","",processedFiles)
  ##########
  
  proceset = new("ExpressionSet", exprs = E)
  return(proceset)	
}

readDerivedDataMatrixFile = function(processedFile,procol,path){
  #Convention for ArrayDataMatrix file
  # First row contains assay refs (eg. SCAN REF)
  # Second row contains gene annotation values (eg. GEO:AFFYMETRIX_VALUE)
  # avoided using header=T because of automatic name 
  
  
  cat("Reading processed data matrix file, ",file.path(path,basename(processedFile)), "\n")
  #READ HEADER
  matrix.header <- try(read.table(file.path(path, basename(processedFile)), sep = "\t",
                                  header = FALSE, # read in and interprete header
                                  row.names = 1, # header located in row 1
                                  nrows = 2,     # num rows after header
                                  stringsAsFactors = FALSE));
  
  if (inherits(matrix.header, "try-error")) {
    stop("Cannot read header ", file.path(path, basename(processedFile)));
  }
  
  #READ a few rows to determine column classes for faster read.table
  proctot.class <- try(read.table(file.path(path, basename(processedFile)), sep = "\t",
                                  header = TRUE, # interprete header
                                  skip = 1,      # starting from second line
                                  row.names = 1, # header located in line 1
                                  nrows = 5,     # read in 5 lines
                                  stringsAsFactors = FALSE));        
  
  if (inherits(proctot.class, "try-error")) {
    stop("Cannot read classes ", file.path(path, basename(processedFile)));
  }
  
  classes <- sapply(proctot.class, class) # figure out classes
  
  #now read all table
  data.matrix <- try(read.table(file.path(path, basename(processedFile)), sep = "\t", 
                                header = TRUE, # interpret header
                                skip = 1,      # starting from second line
                                row.names = 1, # header located in line 1
                                stringsAsFactors = FALSE, 
                                na.strings = c('NA','NULL','null'),
                                colClasses = classes));
  
  if(inherits(data.matrix, 'try-error'))
    if(length(grep("duplicate",data.matrix)) != 0) 
      stop("The probe identifiers are not unique. The processed file cannot automatically be treated.")
  else stop("Cannot read the processed file automatically.")
  
  #Create matrix of data and construct an expressionSet
  datasel = matrix(as.numeric(as.matrix(data.matrix[, procol == matrix.header[2, ]])), nrow = nrow(data.matrix))
  #colnames(datasel) = colnames(proctot.header[, procol == proctot.header[1, ]])
  colnames(datasel) = matrix.header[, procol == matrix.header[2, ]][1,]
  rownames(datasel) = rownames(data.matrix)
  
  proceset = new("ExpressionSet", exprs = datasel)
  
  
  assayREF = rownames(matrix.header)[1]
  assayREF = gsub('\\sREF','',assayREF)
  
  #Choose ADF ref column in processed data matrix to link to from ADF
  ADFrefCol = rownames(matrix.header)[2]
  ADFrefCol = gsub(" ",".",gsub(" REF"," Name",ADFrefCol))
  
  res = list()
  res$proceset = proceset
  res$assayREF = assayREF
  res$ADFrefCol = ADFrefCol
  
  return(res)	
}