1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
|
# A function that takes ArrayExpress MAGETAB files for a specific experiment and returns an equivalent R object representation (NChannelSet, ExpressionSet or AffyBatch)
#
# Author: iemam
# Maintainer: Jose Marugan
###############################################################################
ae2bioc = function(mageFiles, dataCols=NULL, drop=TRUE){
dataFiles <- NULL
sdrf <- NULL
idf <- NULL
adf <- NULL
path <- NULL
dataFiles = lapply(mageFiles$rawArchive, function(x){
if (!is_empty(x)){
return(basename(x))
}
})
if (!is_empty(mageFiles$mageTabFiles)){
sdrf = basename(mageFiles$mageTabFiles[grep(mageFiles$mageTabFiles, pattern = "sdrf.txt$")])
idf = basename(mageFiles$mageTabFiles[grep(mageFiles$mageTabFiles, pattern = "idf.txt$")])
}
else{
stop("No SDRF file found.")
}
adf = mageFiles$adf
path = mageFiles$path
notuse = grep(dataFiles, pattern = "info.txt$|idf.txt$|processed|sdrf.txt$|.log$|RData|class|txt.magetab")
if (length(notuse) != 0)
dataFiles = dataFiles[-notuse]
dataFiles = dataFiles[dataFiles != ""]
allDataFiles = dataFiles;
if(length(dataFiles)==0)
stop("ArrayExpress: Experiment has no raw files available. Consider using processed data instead by following procedure in the vignette")
#check for duplicates in sdrf
sdrfData<-read.delim(paste(path,sdrf,sep='/'), check.names=FALSE)
if(!'Array Data File' %in% colnames(sdrfData)){
stop("ArrayExpress: 'Array Data File' column not found in the SDRF file. Please make sure the assay is an array assay")
}
if(any(duplicated(sdrfData[, 'Array Data File']))){
message("Duplicates found in SDRF file")
#remove duplicates based on Array Data Files and update SDRF
sdrfData<-sdrfData[!duplicated(sdrfData[, 'Array Data File']), ]
write.table(sdrfData, file=sdrf, row.names=FALSE, quote=FALSE, sep='\t')
message("Removed duplicates in SDRF file")
}
#read sample annotations
ph = readPhenoData(sdrf,path)
if(inherits(ph, 'try-error')){
ph=NULL
stop("ArrayExpress: Parsing SDRF failed. Please make sure SDRF file ",sdrf," exists in ",path, " and is not corrupt.")
}
fullPhenoData = ph;
#Checks
arrayDataCol = getSDRFcolumn("ArrayDataFile",varLabels(ph))
# labelCol = getSDRFcolumn("label",varLabels(ph))
#ArrayDesign REF in SDRF
adr = unique(pData(ph)[,getSDRFcolumn("ArrayDesignREF",varLabels(ph))])
adr = adr[adr != ""]
if(!all(dataFiles %in% ph[[arrayDataCol]]))
warning("Some data files in the zip archive are missing from the SDRF. The object may not be built.")
if(length(adr)>1)
message("ArrayExpress: Experiment uses multiple Array Designs. A separate expressionSet will be created for each")
if((length(adr) == 0 || is.na(adr)) && length(grep(".cel",dataFiles, ignore.case = TRUE)) != 0)
warning("ArrayExpress: Cannot find the array design reference in the sdrf file. The object may not be built.")
if((length(adr) == 0 || is.na(adr)) && length(grep(".cel",dataFiles, ignore.case = TRUE)) == 0)
stop("ArrayExpress: Cannot find the array design reference in the sdrf file. The object cannot be built.")
#list of return R objects
robjs=list();
for (ad in adr){
#Subselect SDRF data for current ArrayDesign REF
if(length(adr)>1){
res=getPhenoDataPerAD(ad,fullPhenoData,allDataFiles)
dataFiles = unique(res$dataFiles)
ph = res$pheno
}
#read data files
green.only = isOneChannel(sdrf,path)
rawdata= try(readAEdata(path = path,files = dataFiles,dataCols=dataCols,green.only=green.only))
if(inherits(rawdata, "try-error"))
stop("ArrayExpress: Unable to read assay data")
#read and match array feature metadata to raw data
if(!inherits(rawdata,"FeatureSet")){
adfFile = adf[grep(ad,adf)]
features= try(readFeatures(adf=adfFile,path=path))
if(inherits(features, "try-error")){
warning("ArrayExpress: Unable to read feature data")
features = NULL;
}
}
#read experiment meta data
experimentData = try(readExperimentData(idf=idf,path=path))
if(inherits(experimentData, "try-error")){
warning("ArrayExpress: Unable to read experiment data");
experimentData = new("MIAME");
}
#Finally build ExpressionSet
#Attach pheno and feature data to oligo::FeatureSet
if(inherits(rawdata,"FeatureSet")){
raweset=rawdata
phenoData(raweset) = ph[sampleNames(rawdata)]
}
if(class(rawdata) == "RGList" | class(rawdata) == "EListRaw"){
#construct nchannelset
if(class(rawdata) == "RGList"){
assayData = if("Rb" %in% names(rawdata)) #FIXME: keep all
with(rawdata, assayDataNew(R = R, G = G, Rb = Rb, Gb = Gb)) #will not work if datacolumns where user specified
else
with(rawdata, assayDataNew(G = G, R = R))
raweset = new("NChannelSet",
assayData = assayData,
experimentData = experimentData)
}
#construct expressionSet
if(class(rawdata) == "EListRaw"){
assayData = with(rawdata, assayDataNew(E = E, Eb = Eb))
raweset = new("NChannelSet",
assayData = assayData,
experimentData = experimentData)
}
#Attach pheno data
if(!is.null(ph)){ #FIXME:?
#imagene doesnt have targets slot
ph = ph[rawdata$targets$FileName,]
phenoData(raweset) = ph
}
#Attach features
if(!is.null(rawdata$genes) && rawdata$source != "ae1"){
features2 = new("AnnotatedDataFrame",rawdata$genes)
featureData(raweset) = features2;
}
else
if(!is.null(features))
featureData(raweset) = features;
#consistency of order between features in featureData and assayData is established via previous sorting of feature columns
if(length(featureNames(assayData(raweset))) != length(featureNames(featureData(raweset))))
warning("Number of features in assayData and featureData are not equal. Check control features (NA) that might have been removed from either assayData or featureData.");
}
robjs[[ad]]=raweset
}
if(drop && length(robjs) == 1)
robjs = robjs[[1]]
return(robjs)
}
|