File: parseMAGE.r

package info (click to toggle)
r-bioc-arrayexpress 1.66.0-2
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 460 kB
sloc: makefile: 2
file content (539 lines) | stat: -rw-r--r-- 22,083 bytes
# TODO: Add comment
# 
# Author: iemam
###############################################################################

headers<-list(
		ae1=c("metaColumn","metaRow","row","column"),
		genepix=c("Block","Column","Row","X","Y"),
		arrayvision=c("Primary","Secondary"),
		agilent=c("Row","Col","PositionX","PositionY"),
		scanalyze=c("GRID","COL","ROW","LEFT","TOP","RIGHT","BOT"),
		scanarray=c('Array Column','Array Row','Spot Column','Spot Row','X','Y'),
		quantarray=c('Array Column', 'Array Row', 'Column', 'Row'),
		spotfinder=c("MC","MR","SC","SR","C","R"),
		mev=c("MC","MR","C","R","UID"),
		codelink=c("Logical_row","Logical_col","Center_X","Center_Y"),
		bluefuse=c("COL","ROW","SUBGRIDCOL","SUBGRIDROW"),
		UCSFSpot=c("Arr-colx","Arr-rowy","Spot-colx","Spot-rowy"),
		NimbleScanFeature=c("X","Y","PROBE_ID","X_PIXEL","Y_PIXEL"),
		NimblegenNASA=c("X_BC","Y_BC","Feature_ID","ProbID_BC"),
		imagene=c('Meta Column', 'Meta Row', 'Column', 'Row', 'Field', 'Gene ID'),
		ImaGene3=c("Meta_col","Meta_row","Sub_col","Sub_row","Name","Selected"),
		ImaGene7=c("Block","Column","Row","Ch1 XCoord","Ch1 YCoord", "Ch2 XCoord", "Ch2 YCoord"),
		ImaGeneFields=c("Field","Column","Row","XCoord","YCoord"),
		CSIRO_Spot=c("grid_c","grid_r","spot_c","spot_r","indexs")

#add illumina
)

isOneChannel = function(sdrf,path){
	ph = try(read.AnnotatedDataFrame(sdrf, path = path, row.names = NULL, blank.lines.skip = TRUE, fill = TRUE, varMetadata.char = "$", quote="\""))
	labelCol = getSDRFcolumn("label",varLabels(ph))
    if(length(labelCol)==0)
        return(TRUE) # assume 1-color if no label reported - we warn the user in readPhenoData
    else
	    return(length(unique(tolower(ph[[labelCol]])))==1)
}

readPhenoData = function(sdrf,path){
	
	message("ArrayExpress: Reading pheno data from SDRF")
	ph = try(read.AnnotatedDataFrame(sdrf, path = path, row.names = NULL, blank.lines.skip = TRUE, fill = TRUE, varMetadata.char = "$", quote="\""))

        arrayDataCol = getSDRFcolumn("ArrayDataFile",varLabels(ph))
        labelCol = getSDRFcolumn("label",varLabels(ph))

        if(length(arrayDataCol)==0)
                warning("ArrayExpress: Cannot find 'Array Data File' column in SDRF. Object might not be created correctly.")
        if(length(labelCol)==0)
                warning("ArrayExpress: Cannot find 'Label' column in SDRF. Object might not be created correctly.")

	ph = ph[gsub(" ", "", ph$Array.Data.File) != ""]
	sampleNames(ph) = ph$Array.Data.File
	ph@varMetadata['Array.Data.File','labelDescription'] = "Index"
	ph@varMetadata['Array.Data.File','channel'] = as.factor("_ALL_")
		
	#Remove empty rows from pheno data
	emptylines = which(sapply(seq_len(nrow(pData(ph))), function(i) all(pData(ph)[i,] == "",na.rm = TRUE)))
	if(length(emptylines) != 0)
		pData(ph) = pData(ph)[-emptylines,]
		
	phenoData = pData(ph)

	if(length(arrayDataCol)!=0 && length(labelCol)!=0){
		#filter out duplicated rows where multiple derived data files are available per one array data file
		ph=ph[!duplicated(phenoData[,c(arrayDataCol,labelCol)])]
	}
	
	#treat SDRF for two channel experiments
	if(length(labelCol)==1 && length(unique(tolower(ph[[labelCol]])))==2){
		
		arrayFilesNum = length(unique(ph[[arrayDataCol]]))
		
		si = pData(ph)[1:(arrayFilesNum*2),]
		lab = split(si,si[,"Label"])
		
		if(nrow(lab[[1]]) != nrow(lab[[2]])){
			stop("Number of CY3/CY5 is not equal")
		}
		
		#Reorder rows in each group (Cy3,Cy5) to the same order
		lab[[1]] = lab[[1]][order(lab[[1]][,arrayDataCol]),]
		lab[[2]] = lab[[2]][order(lab[[2]][,arrayDataCol]),]
		
		same = which(lapply(1:ncol(lab[[1]]), function(i) all(lab[[1]][i] == lab[[2]][i])) == TRUE)
		all = lab[[1]][same]
		gspe = lab[[1]][-same]
		colnames(gspe) = paste(colnames(gspe),names(lab)[1],sep = ".")
		rspe = lab[[2]][-same]
		colnames(rspe) = paste(colnames(rspe),names(lab)[2],sep = ".")
		
		metaData = data.frame(labelDescription = c(rep("_ALL_",ncol(all)),rep("G",ncol(gspe)),rep("R",ncol(rspe))))
		ph = new("AnnotatedDataFrame", data = cbind(all,gspe,rspe), varMetadata = metaData)
		
		arrayDataCol = getSDRFcolumn("ArrayDataFile",varLabels(ph))	
		rownames(pData(ph)) = gsub(".[a-z][a-z][a-z]$","",ph[[arrayDataCol]],ignore.case=T)
	}
		
		
#		if(!all(basename(files) %in% pData(ph)[,getSDRFcolumn("ArrayDataFile",varLabels(ph))]))
#			warning("Some data files in the zip archive are missing from the SDRF. The object may not be built.")
	return(ph)
}

readAEdata = function(path,files,dataCols,green.only){
	
	message("ArrayExpress: Reading data files")
	source = getDataFormat(path,files)

#process extra arguments passed to read.maimages
#		if (!is.null(dataCols)) {
#		if (length(rawcol) > 1 && !is(rawcol, "list")) 
#			stop("The argument 'rawcol' must be a list if multiple column name are given.")
#		if (length(rawcol) == 1 && !is(rawcol, "character")) 
#			stop("The argument 'rawcol' must be a character if one column name is given.")
#		if (is(rawcol, "list") && !("R" %in% names(rawcol) && "G" %in% names(rawcol))) 
#			stop("The names of the columns must contain R and G.")
#	}
	
	if(source == "affy"){
	  if (.Platform$OS.type == "windows"){
	    rawdata = try(oligo::read.celfiles(filenames = file.path(path,unique(files), fsep='\\')))
	  }
	  else{
	    rawdata = try(oligo::read.celfiles(filenames = file.path(path,unique(files))))
	  }
		
		if(inherits(rawdata, 'try-error')){
			stop("Unable to read cel files in ",path)
		}
		return(rawdata)
		
	}else if(source=="ae1"){
		#Old AE1 formatted data files
		if (is.null(dataCols)){
			dataCols= try(getDataColsForAE1(path,files))
			if(inherits(dataCols,'try-error')) return()
		}
		rawdata = try(read.maimages(files=files,path=path,source="generic",columns=dataCols,annotation=headers$ae1))
		if(!inherits(rawdata, 'try-error')) rawdata$source="ae1"

	}else if(source %in% c("agilent","arrayvision","bluefuse","genepix","bluefuse","imagene","quantarray")){
		#data format can be directly read by limma
		rawdata = try(read.maimages(files=files,path=path,source=source,columns=dataCols,green.only=green.only))
		
	}else if(!is.null(dataCols) && source=="unknown"){
		#read generic source given columns specified by user
		rawdata = try(read.maimages(files=files,path=path,source="generic",columns=dataCols,green.only=green.only))
		
	}else
		# TODO: add more formats (illumina)
		stop("Unable to recognize data file format")
		
	if(inherits(rawdata, 'try-error')){
		stop("Unable to read data files in",path)
	}
	
	#reorder rows of RGList to reflect the same order in ADF as Block Row/Block Column/Row/Column
	if(is.null(rawdata$genes))
		stop("Unable to read probe annotation from RGList")
	if(is.null(rawdata$source))
		stop("Unable to read source from RGList")
	
	rawdata<-switch(source,
			agilent = rawdata<-rawdata[with(rawdata$genes,order(Row,Col)),],
			genepix = rawdata<-rawdata[with(rawdata$genes,order(Block,Row,Column)),],
			ae1 = rawdata<-rawdata[with(rawdata$genes,order(metaRow,metaColumn,row,column)),],
			rawdata)
	#if generic user has to specify feature columns in data
	
	
	return(rawdata)
}

readFeatures<-function(adf,path,procADFref=NULL){
	
	message("ArrayExpress: Reading feature metadata from ADF")
		
	lines2skip = skipADFheader(adf,path,!is.null(procADFref))
	features = try(read.table(file.path(path, adf), row.names = NULL, blank.lines.skip = TRUE, fill = TRUE, sep="\t", na.strings=c('?','NA'), skip = lines2skip, header=TRUE, quote=""))
	
	if('Block.Column' %in% colnames(features) & 'Reporter.Name' %in% colnames(features)){
		ommittedRows = which(is.na(features[,'Block.Column']) | is.na(features[,'Reporter.Name']))
		if(length(ommittedRows)!=0){
			message("ArrayExpress: Ommitting NA rows from ADF")
			features = features[-ommittedRows,]
		}
	}
	

	
	if(!is.null(procADFref)){
		if(procADFref %in% colnames(features))
			rownames(features) = features[,procADFref]
		else{
			repCol = getSDRFcolumn("reporter",colnames(features))
			if(length(repCol) != 0)
				repCol= repCol[1]
			else
				repCol = getSDRFcolumn("composite",colnames(features))
			if(length(repCol) != 0)
				repCol = repCol[1]
			else
				repCol = NULL
			if(!is.null(repCol))
				rownames(features) = features[[repCol]]
		}
	}
	

	
	#Sort ADF features by columns Block row/Block column/Row/Column (only applicable for raw data exps, processed data is ordered by reporter/composite name)
	if("Block.Row" %in% colnames(features))
		features = features[with(features,order(Block.Row,Block.Column,Row,Column)),]
	
	
	#Row names of featureData must match row names of the matrix / matricies in assayData
#	ri1 = grep("reporter.identifier|reporter.name", colnames(adff), ignore.case=TRUE)
#	ri2 = grep("reporter.identifier|reporter.name", colnames(fn), ignore.case=TRUE)	
#	if(all(adff2[,ri1] == fn[,ri2])) 
#		featureData(eset) = new("AnnotatedDataFrame",adff2) 
#	else stop("Do not manage to map the reporter identifier between the annotation and the data files.\n")

	return(new("AnnotatedDataFrame",features))
}

## Assign experiment Data
## By Juok Cho
readExperimentData = function(idf, path){
	idffile = scan(file.path(path,idf),character(),sep = "\n",encoding="UTF-8")
	idf.data = list()
	for(g in idffile) { 
		e = unlist(strsplit(g,"\t"))
		key = e[1] 
		if(length(e)>1)
			values = e[2:length(e)]
		else
			values = NA
		idf.data[[key]] = values
	}
	
	## making key matches #
	## (Person Last Name, Person First Name, Person Mid Initials), Person Email, Person Phone, Person Address, Person Affiliation, 
	Person_Name = c(idf.data$"Person First Name", idf.data$"Person Last Name",idf.data$"Person Mid Initials") 
	Personal_contact = c(idf.data$"Person Email", idf.data$"Person Phone", idf.data$"Person Address")
	
	## making experimentData object #		
	SubmitterIndex = which(idf.data$"Person Roles"=="submitter")
	experimentData = new("MIAME", 
			name = as.character(paste(idf.data$"Person Last Name"[SubmitterIndex],", ",idf.data$"Person First Name"[SubmitterIndex], sep = "")), #performer
			lab = as.character(idf.data$"Person Affiliation"[SubmitterIndex]) , #Person Affiliation 
			contact = as.character(idf.data$"Person Email"[SubmitterIndex]), # Person Email(Person Phone, Person Address)
			title = as.character(idf.data$"Investigation Title") , #description #Investigation Title
			##abstract= "",	#not provided in the idf.data
			##url	= "",
			other = list(
					accession = gsub(".sdrf.txt","",idf.data$"SDRF File"), #Experiment Name
					identifier = gsub(".sdrf.txt","",idf.data$"SDRF File"), #Experiment Name
					##Experimental Factor Type
					experimentalFactor = c(idf.data$"Experimental Factor Type"), 
					##Experimental Design
					type = c(idf.data$"Experimental Design")
					#measurementType = experimentData(eset)@other$measurementType #from processed data.zip depending on user answer about QT type
			)
	)
	#experimentData(eset) = experimentData
	return(experimentData)	  
}

skipADFheader<-function(adf,path,proc=F){
	if(!proc)
		columns = list('Block Column','Block Row','Column','Row')
	else
		columns = list('Composite Element Name')
	
	con = file(file.path(path, adf), "r")	
	on.exit(close(con))
	
	Found = FALSE
	i = 0
	repeat {
		i = i+1
		txt <- readLines(con,n=1)
		if(!length(txt))
			stop("Failed to recognize ADF file format")
		Found = TRUE
		for(a in columns) 
			Found = Found && length(grep(a,txt))
		Found2 = length(grep("^Reporter[[:punct:]|[:blank:]]*Name",txt,ignore.case=TRUE))
		if(Found || Found2)
			break
	}
	return(i-1)
}

getPhenoDataPerAD<-function(ad,ph,dataFiles){
	phenoData = pData(ph)
	arrayDataCol = getSDRFcolumn("ArrayDataFile",varLabels(ph))
	arrayDesignRefCol = getSDRFcolumn("ArrayDesignREF",varLabels(ph))
	
	if(length(arrayDataCol)==0)
		stop("Cannot find 'Array.Data.File' column in SDRF. Experiment uses multiple array designs. Cannot distinguish arrays with similar array design.")
	
	if(length(arrayDesignRefCol)==0)
		stop("Cannot find 'Array.Design.REF' column in SDRF. Experiment uses multiple array designs. Cannot distinguish arrays with similar array design.")
	
	if(length(phenoData[phenoData[arrayDesignRefCol]==ad,arrayDataCol]) == 0)
		stop("Cannot find array data file names in the sdrf file. Experiment uses multiple array designs. Cannot distinguish arrays with similar array design.") 
	
	#Subselect data files for current ArrayDesign REF
	selectFiles = phenoData[phenoData[arrayDesignRefCol]==ad,arrayDataCol]
	
	
	if(!all(selectFiles %in% dataFiles)){
		stop("Some or all data files for ",ad," array are missing.")
	}
	
	#subselect phenoData Frame for files 
	ph = ph[phenoData[arrayDesignRefCol]==ad]
	phenoData = pData(ph)
	
	return(list(pheno=ph,dataFiles=selectFiles))
}

getDataFormat=function(path,files){
	
	if(length(grep(".cel",files, ignore.case = TRUE)) == length(files)){
		return("affy")
	}
	else{
		#Retrieve Column names from first data file
		allcnames = scan(file.path(path,files[1]),what = "",nlines = 200, sep = "\t",quiet=TRUE)
		allcnamesL = try(tolower(gsub("^\\s","",allcnames)),silent=TRUE)
		if(inherits(allcnamesL, 'try-error')){
			allcnames = scan(file.path(path,files[1]),what = "",nlines = 200, sep = "\t",quiet=TRUE,encoding="latin1")
			allcnamesL = try(tolower(gsub("^\\s","",allcnames)))
			if(inherits(allcnamesL, 'try-error')){
				allcnamesL = allcnames
			}
		}
		
		#Find source of data
		for(source in names(headers)){
			allthere<-tolower(headers[[source]]) %in% allcnamesL
			if(all(allthere))
				return(source)
		}
	}

	#Unable to detect source
	allcnames = scan(file.path(path,files[1]),what = "",nlines = 1, sep = "\n",quiet=TRUE)
	return("unknown")
	
	#stop("Unable to recognize data file format. First line:\n",allcnames)
	
}

getDataColsForAE1 = function(path,files){
	url2 = "http://sourceforge.net/p/tab2mage/code/HEAD/tree/trunk/Tab2MAGE/lib/ArrayExpress/Datafile/QT_list.txt?format=raw" 
	
	qt = try(read.table(url2, sep = "\t", quote = "",
					check.names = FALSE, fill = TRUE,
					comment.char = "#",               
					stringsAsFactors =  FALSE,
					encoding = "UTF-8")) ##read the QT file from the web
	if(inherits(qt, "try-error"))
		qt = try(read.table(
						file.path(system.file("doc", package = "ArrayExpress"),"QT_list.txt"),
						sep = "\t", quote = "",
						check.names = FALSE, fill = TRUE,
						comment.char = "#",               
						stringsAsFactors =  FALSE,
						encoding = "UTF-8")) ##read the QT file from the package
	
	
	scanners = grep(">>>",qt[,1],value = TRUE) ## list all the scanner names
	sl = grep(">>>",qt[,1]) ## list all the line numbers wherea scanner type starts
	scanners = gsub(">","",scanners)
	
	## Parsing the first line of the expression file
	allcnames = scan(file.path(path,files[1]),what = "",nlines = 1, sep = "\t",quiet=TRUE)
	
	## Looking for the right column to use
	scanname = allcnames[grep(":",allcnames)]
	if(length(grep("Database|Reporter",scanname)) != 0)
		scanname = scanname[-grep("Database|Reporter",scanname)] 
	
	##Feature is a problem because of Feature Extraction
	feature = grep("^Feature^",scanname)
	fe = grep("Feature Extraction",scanname)
	if(length(feature) != 0)
		scanname = scanname[-feature[!feature %in% fe]]
	
	##Ready to read data
	if(length(scanname) == 0) 
		stop(sprintf("No scanner name is given. It is not possible to handle such a case. Try to set the argument 'dataCols' by choosing among the following columns names: \n") ,
				sprintf("\"%s\" \n",allcnames))
	
	##Image Analysis Program
	software = unique(sapply(seq_len(length(scanname)), function(i) strsplit(scanname,":")[[i]][1]))     
	st = NULL

    for(x in software){
      if(length(grep(x, scanners)) != 0){
        st = x
        break
      } 
    }

	#if(length(grep(st, scanners)) == 0)
	if(is.null(st))
    	stop(sprintf("Scanner name is, ",software,". This scanner type is not valid. \nTry to set the argument 'dataCols' by choosing among the following columns names: \n", st),sprintf("\"%s\" \n",scanname))
	
	if(length(st) != 1)
		stop(sprintf("%s scanner names are given ( ",length(st)), sprintf("\"%s\" ",st), sprintf("). It is not possible to handle such a case. Try to set the argument 'dataCols' by choosing among the following columns names: \n") ,sprintf("\"%s\" \n",scanname))
	
	if(length(grep(st, scanners)) > 1)
		stop(sprintf("Scanner name can be '%s'. \nTry to set the argument 'dataCols' by choosing among the following columns names: \n", scanners[grep(st, scanners)]),sprintf("\"%s\" \n",scanname))
	
	gs = qt[((sl[grep(st,scanners)]+1):(sl[grep(st,scanners)+1]-1)),] ## extract the QTs of the specific scanner type
	foreground = gs[(gs[,4] == "MeasuredSignal" & (is.na(gs[,5]) | gs[,5] == 0)),c(1,7)] ## the colnames to use in the read.column
	background = gs[(gs[,4] == "MeasuredSignal" & (gs[,5] == 1)),c(1,7)] ## the colnames to use in the read.column
	
	foreground[,1] = paste(st,":",foreground[,1],sep = "")
	colnamesf = foreground[which(foreground[,1] %in% allcnames),]
	df = dim(colnamesf)        
	
	if(dim(background)[1] != 0){
		background[,1] = paste(st,":",background[,1],sep = "")
		colnamesb = background[which(background[,1] %in% allcnames),]
		db = dim(colnamesb)
	}else 
		db = 0
	
	if(length(files) != 1){
		if(!all(sapply(2:length(files), function(i) readLines(file.path(path,files[1]),1) == readLines(file.path(path,files[i]),1))))
			warning(sprintf("The files do not all have the same headings whereas the array design is the same. It may cause the object not being created."))
	}
	
	#Two channel data
	if(df[1] == 2){
		rawcol = if(db[1] == 2) 
					list(R = colnamesf[colnamesf[,2] == "Cy5",1], 
						 G = colnamesf[colnamesf[,2] == "Cy3",1],
						 Rb = colnamesb[colnamesb[,2] == "Cy5",1],
						 Gb = colnamesb[colnamesb[,2] == "Cy3",1])
			 	 else 
					list(R = colnamesf[colnamesf[,2] == "Cy5",1],
						 G = colnamesf[colnamesf[,2] == "Cy3",1])
		
		if(length(rawcol) == 0 || (0 %in% sapply(seq_len(length(rawcol)), function(i) length(rawcol[[i]]))))
			stop(sprintf("The known column names for this scanner are not in the heading of the files.\nTry to set the argument 'rawcol' by choosing among the following columns names: \n"),
					sprintf("\"%s\" \n",scanname))
	}
	## one channel data
	if(df[1] == 1){
		rawcol = if(db != 0)
					list(G = colnamesf[,1],
						 Gb = colnamesb[,1])
			 	 else
					 list(G = colnamesf[,1])
	}
	
	if(df[1] == 0)
		stop(sprintf("None of the columns names of the expression files is matching a valid known quantitation type.\nTry to set the argument 'rawcol' by choosing among the following columns names: \n"), sprintf("\"%s\" \n",scanname))
	if(df[1] > 2)
		stop(sprintf("There are too many columns that could be read in the files.\n Try to set the argument 'rawcol' by choosing among the following columns names: \n"),sprintf("\"%s\" \n",scanname))
	
	return(rawcol)		
}

## assign phenoData to Nchannelset
preparePhenoDataFor2channel = function(ph,files){
	#if(length(unique(tolower(ph[[labelCol]])))==2){
		
		arrayFilesNum = length(unique(ph[[arrayDataCol]]))
		
		si = pData(ph)[1:(arrayFilesNum*2),]
		lab = split(si,si[,"Label"])
		
		if(nrow(lab[[1]]) != nrow(lab[[2]])){
			stop("Number of CY3/CY5 is not equal")
		}
		
		#Reorder rows in each group (Cy3,Cy5) to the same order
		lab[[1]] = lab[[1]][order(lab[[1]][,arrayDataCol]),]
		lab[[2]] = lab[[2]][order(lab[[2]][,arrayDataCol]),]
		
		same = which(lapply(1:ncol(lab[[1]]), function(i) all(lab[[1]][i] == lab[[2]][i])) == TRUE)
		all = lab[[1]][same]
		gspe = lab[[1]][-same]
		colnames(gspe) = paste(colnames(gspe),names(lab)[1],sep = ".")
		rspe = lab[[2]][-same]
		colnames(rspe) = paste(colnames(rspe),names(lab)[2],sep = ".")
		
		metaData = data.frame(labelDescription = c(rep("_ALL_",ncol(all)),rep("G",ncol(gspe)),rep("R",ncol(rspe))))
		ph = new("AnnotatedDataFrame", data = cbind(all,gspe,rspe), varMetadata = metaData)
		
		arrayDataCol = getSDRFcolumn("ArrayDataFile",varLabels(ph))	
		rownames(pData(ph)) = gsub(".[a-z][a-z][a-z]$","",ph[[arrayDataCol]],ignore.case=T)
	#}
	return(ph)
}

getSDRFcolumn = function(col,headers){
	pattern<-switch(col,
			ArrayDataFile = "^Array[[:punct:]|[:blank:]]*Data[[:punct:]|[:blank:]]*File",
			ArrayDesignREF = "^Array[[:punct:]|[:blank:]]*Design[[:punct:]|[:blank:]]*REF",
			ArrayDataMatrixFile = "^Array[[:punct:]|[:blank:]]*Data[[:punct:]|[:blank:]]*Matrix[[:punct:]|[:blank:]]*File",
			label = "^Label$",
			factorValues = "^Factor[[:punct:]|[:blank:]]*Value",
			DerivedArrayMatrix = "^Derived[[:punct:]|[:blank:]]*Array[[:punct:]|[:blank:]]*Data[[:punct:]|[:blank:]]*Matrix[[:punct:]|[:blank:]]*File",
			DerivedArrayFile = "^Derived[[:punct:]|[:blank:]]*Array[[:punct:]|[:blank:]]*Data[[:punct:]|[:blank:]]*File",
			reporter = "^Reporter[[:punct:]|[:blank:]]*[Name | Identifier]",
			composite = "^Composite[[:punct:]|[:blank:]]*Element[[:punct:]|[:blank:]]*[Name | Identifier]")
	colIndex = grep(pattern,headers,ignore.case = TRUE)
	return(colIndex)
}

## remove all downloaded files
cleanupAE = function(mageFiles){
	path = mageFiles$path
	if (.Platform$OS.type == "windows"){
	  sep = '\\'
	}
	else{
	  sep = '/'
	}
	try(file.remove(file.path(path, basename(mageFiles$rawFiles), fsep = sep)))
	try(file.remove(file.path(path, basename(mageFiles$processedFiles), fsep = sep)))
	
	try(file.remove(file.path(path, basename(mageFiles$sdrf), fsep = sep)))
	try(file.remove(file.path(path, basename(mageFiles$idf), fsep = sep)))
	try(file.remove(file.path(path, basename(mageFiles$adf), fsep = sep)))
	try(file.remove(file.path(path, basename(mageFiles$rawArchive), fsep = sep)))
	try(file.remove(file.path(path, basename(mageFiles$processedArchive), fsep = sep)))
}