1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
|
procset = function(files, procol){
stopifnot(length(procol)==1)
with(files,
{
#ph = try(read.AnnotatedDataFrame(sdrf, path = path, row.names = NULL, blank.lines.skip = TRUE, fill = TRUE, varMetadata.char = "$", quote="\""))
#Read SDRF
sdrf = basename(sdrf)
samples = readPhenoData(sdrf,path);
if(!inherits(samples, 'try-error')){
derivedMatrixCol = getSDRFcolumn("DerivedArrayMatrix",varLabels(samples))
derivedFileCol = getSDRFcolumn("DerivedArrayFile",varLabels(samples))
}
else
warning("Unable to read SDRF")
if(length(processedFiles) > 1 && length(derivedFileCol) != 0){
#stop("The processed files contain different numbers/subsets of reporters and cannot be automatically assembled.")
# READ individual files for each assay
proceset = readDerivedDataFiles(basename(processedFiles),procol,path)
assayCol = derivedFileCol
ADFrefCol = "Reporter.Name"
}
else if(length(processedFiles) == 1 && length(derivedMatrixCol) != 0){
#Read Processed data file
res = readDerivedDataMatrixFile(basename(processedFiles),procol,path)
proceset = res$proceset
assayREF = res$assayREF
assayCol = grep(assayREF,varLabels(samples))
ADFrefCol = res$ADFrefCol
}
else
stop("Unable to read processed data")
#attach sample data
dataSamples = gsub("\\.[a-z][a-z][a-z]$","",sampleNames(proceset),ignore.case=T)
SDRFsamples = gsub("\\.[a-z][a-z][a-z]$","",pData(samples)[[assayCol]])
if(all(dataSamples %in% SDRFsamples)){
#if(all(dataSamples %in% rownames(pData(samples)))){
rownames(pData(samples)) = SDRFsamples
samples = samples[dataSamples,] #Reorder samples in phenoData to match those in data
phenoData(proceset) = samples
} else warning("Cannot attach phenoData")
#read and attach feature data
features= try(readFeatures(adf=basename(adf),path=path, procADFref=ADFrefCol))
if(inherits(features, "try-error")){
warning("ArrayExpress: Unable to attach feature data")
}else{
if(all(featureNames(proceset) %in% featureNames(features))){
features = features[featureNames(proceset),]
featureData(proceset) = features
} else warning("Cannot attach feature data")
}
#read and attach experiment meta data
experimentData = try(readExperimentData(idf=basename(idf),path=path))
if(inherits(experimentData, "try-error")){
warning("ArrayExpress: Unable to read experiment data");
experimentData = new("MIAME");
}
experimentData(proceset) = experimentData;
if(!validObject(proceset))
warning(validObject(proceset))
else
message(paste("\n",gsub(".sdrf.txt","",basename(sdrf))," processed data was successfully loaded into ",class(proceset),"\n"))
return(proceset)
}) ## with
}
readDerivedDataFiles = function(processedFiles,procol,path){
for(procFile in processedFiles){
cat("Reading processed File: ",procFile,"\n")
procData <- try(read.table(file.path(path, basename(procFile)), sep = "\t",
header = TRUE, # interpret header
row.names = 1, # header located in line 1
stringsAsFactors = FALSE,
na.strings = c('NA','NULL','null')));
procsel = matrix(as.numeric(as.matrix(procData[, procol == colnames(procData)])), nrow = nrow(procData))
if(grep(procFile,processedFiles) == 1){
E = matrix(procsel)
rownames(E) = rownames(procData)
}else
E = cbind(E,procsel)
}
###TEMP###
colnames(E) = gsub("\\.txt","",processedFiles)
##########
proceset = new("ExpressionSet", exprs = E)
return(proceset)
}
readDerivedDataMatrixFile = function(processedFile,procol,path){
#Convention for ArrayDataMatrix file
# First row contains assay refs (eg. SCAN REF)
# Second row contains gene annotation values (eg. GEO:AFFYMETRIX_VALUE)
# avoided using header=T because of automatic name
cat("Reading processed data matrix file, ",file.path(path,basename(processedFile)), "\n")
#READ HEADER
matrix.header <- try(read.table(file.path(path, basename(processedFile)), sep = "\t",
header = FALSE, # read in and interprete header
row.names = 1, # header located in row 1
nrows = 2, # num rows after header
stringsAsFactors = FALSE));
if (inherits(matrix.header, "try-error")) {
stop("Cannot read header ", file.path(path, basename(processedFile)));
}
#READ a few rows to determine column classes for faster read.table
proctot.class <- try(read.table(file.path(path, basename(processedFile)), sep = "\t",
header = TRUE, # interprete header
skip = 1, # starting from second line
row.names = 1, # header located in line 1
nrows = 5, # read in 5 lines
stringsAsFactors = FALSE));
if (inherits(proctot.class, "try-error")) {
stop("Cannot read classes ", file.path(path, basename(processedFile)));
}
classes <- sapply(proctot.class, class) # figure out classes
#now read all table
data.matrix <- try(read.table(file.path(path, basename(processedFile)), sep = "\t",
header = TRUE, # interpret header
skip = 1, # starting from second line
row.names = 1, # header located in line 1
stringsAsFactors = FALSE,
na.strings = c('NA','NULL','null'),
colClasses = classes));
if(inherits(data.matrix, 'try-error'))
if(length(grep("duplicate",data.matrix)) != 0)
stop("The probe identifiers are not unique. The processed file cannot automatically be treated.")
else stop("Cannot read the processed file automatically.")
#Create matrix of data and construct an expressionSet
datasel = matrix(as.numeric(as.matrix(data.matrix[, procol == matrix.header[2, ]])), nrow = nrow(data.matrix))
#colnames(datasel) = colnames(proctot.header[, procol == proctot.header[1, ]])
colnames(datasel) = matrix.header[, procol == matrix.header[2, ]][1,]
rownames(datasel) = rownames(data.matrix)
proceset = new("ExpressionSet", exprs = datasel)
assayREF = rownames(matrix.header)[1]
assayREF = gsub('\\sREF','',assayREF)
#Choose ADF ref column in processed data matrix to link to from ADF
ADFrefCol = rownames(matrix.header)[2]
ADFrefCol = gsub(" ",".",gsub(" REF"," Name",ADFrefCol))
res = list()
res$proceset = proceset
res$assayREF = assayREF
res$ADFrefCol = ADFrefCol
return(res)
}
|