1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273
|
#------------------------------------------------------------------------------
# I envision ERD data creation this way
# * STEM_erd.data.creation - processes data from CSV files
# * STEM_erd.data.subsetting - load and further subset data
#
# * Consider SRD & ERD data together as a single functional step
# because they rely on the same parameters
# ==> subsetting should also be done together
#
# Kevin, we will need to talk about how to incorporate
# your work on ERD & NDVI data creation.
# In general we want this to extend to other SRD with
# temporally varying covariates.
#
# Kevin, I dont know where this fits
#------------------------------------------------------------------------------
# Load NDVI Data
#-----------------------------------------------------------------------------
## KFW
## vegetationIndexData <- read.csv(file="ebird_reference_data/processed_VI_Data_SRD.csv", na.strings = c("NA", "?"))
#vegetationIndexData <- matrix(scan("ebird_reference_data/custom_vi.txt"), 18880842, 6, byrow=TRUE)
#print(class(vegetationIndexData))
#print(nrow(vegetationIndexData))
#print(ncol(vegetationIndexData))
testRpyCall <- function(object) {
print(class(object))
print(object)
}
#------------------------------------------------------------------------------
erd.srd.data.subsetting <- function(
erd.data.filename, #erd.data.tag,
srd.pred.design.filename,
spatial.extent.list,
temporal.extent.list,
## KFW speciesList,
species.name,
predictor.names,
response.family,
srd.max.sample.size,
split.par.list,
response.truncation.cutoff=20){
#---------------------------------------------------------------------------
# Load SRD Data #------------------------------------------------------------------------------------------------------------
print(srd.pred.design.filename)
load(file=srd.pred.design.filename) # pred.data
#dim(pred.data$D.pred)
#dim(pred.data$locs)
# ----------------------------------------------------------------------
# Limit Spatial Extent by Point in Rectangle, Polygon, or ShapeFile
# ----------------------------------------------------------------------
xxx <- pred.data$locs$x
yyy <- pred.data$locs$y
region.index <- rep(TRUE, nrow(pred.data$D.pred) )
if (!is.null(spatial.extent.list)){
## KFW if (spatial.extent.list$type == "rectangle"){
if (spatial.extent.list[['type']] == "rectangle"){
ttt.index <- (
yyy > spatial.extent.list$lat.min &
yyy < spatial.extent.list$lat.max &
xxx > spatial.extent.list$lon.min &
xxx < spatial.extent.list$lon.max )
region.index <- (ttt.index & region.index)
}
## KFW if (spatial.extent.list$type == "polygon"){
if (spatial.extent.list[['type']] == "polygon"){
ttt.index <- point.in.polygon(
xxx = xxx,
yyy = yyy,
polygon.vertices =
spatial.extent.list$polygon.vertices)
region.index <- (ttt.index & region.index)
}
## KFW if (spatial.extent.list$type == "shapefile"){
if (spatial.extent.list[['type']] == "shapefile"){
ttt.index <- point.in.shapefile(
sites = data.frame(lon=xxx, lat=yyy),
shape.dir=spatial.extent.list$shape.dir,
shape.filename=spatial.extent.list$shape.filename,
att.selection.column.name=
spatial.extent.list$att.selection.column.name,
selected.shape.names=
spatial.extent.list$selected.shape.names)
region.index <- (ttt.index & region.index)
}
} # if (!is.null(spatial.extent.list)){
# ------------------
# Subset SRD
# ------------------
srd.data <-NULL
srd.data$D.pred <- pred.data$D.pred[region.index, ]
srd.data$locs <- pred.data$locs[region.index, ]
#dim(pred.data$D.pred)
#dim(pred.data$locs)
rm(pred.data)
## KFW gc()
# ------------------
# Subset SRD
# DMF 3.17.10
# ------------------
if (!is.null(srd.max.sample.size)) {
if (nrow(srd.data$D.pred) > srd.max.sample.size){
ttt.index <- sample(x=c(1:nrow(srd.data$D.pred)),
size=srd.max.sample.size)
srd.data$D.pred <- srd.data$D.pred[ttt.index, ]
srd.data$locs <- srd.data$locs[ttt.index, ]
}
}
# --------------------------
#-----------------------------------------------------------------------------
# Load ERD parameter data & Data
#
# Wish List for erd.data.creation
# ** combine data & parameters into a single list object
# ** save only index to identify train & test sets. There is no need to
# save entire thing twice.
#------------------------------------------------------------------------------
#erdParameterListFile <- paste(data.dir, erd.data.tag, "erd.data.par.list.RData", sep="")
#load(file=erdParameterListFile) #return.list
#D.erd.data.par.list <- return.list
#erdDataFile <- paste(data.dir, D.erd.data.par.list$erd.data.tag, "erd.data.RData", sep="")
load(file=erd.data.filename) # erdDataFile) #D
# ----------------------------------------------------------------------
# Limit Spatial Extent by Point in Rectangle, Polygon, or ShapeFile
# ----------------------------------------------------------------------
xxx <- D$LONGITUDE
yyy <- D$LATITUDE
region.index <- rep(TRUE, nrow(D) )
if (!is.null(spatial.extent.list)){
## KFW if (spatial.extent.list$type == "rectangle"){
if (spatial.extent.list[['type']] == "rectangle"){
ttt.index <- (
yyy > spatial.extent.list$lat.min &
yyy < spatial.extent.list$lat.max &
xxx > spatial.extent.list$lon.min &
xxx < spatial.extent.list$lon.max )
region.index <- (ttt.index & region.index)
}
## KFW if (spatial.extent.list$type == "polygon"){
if (spatial.extent.list[['type']] == "polygon"){
ttt.index <- point.in.polygon(
xxx = xxx,
yyy = yyy,
polygon.vertices =
spatial.extent.list$polygon.vertices)
region.index <- (ttt.index & region.index)
}
## KFW if (spatial.extent.list$type == "shapefile"){
if (spatial.extent.list[['type']] == "shapefile"){
ttt.index <- point.in.shapefile(
sites = data.frame(lon=xxx, lat=yyy),
shape.dir=spatial.extent.list$shape.dir,
shape.filename=spatial.extent.list$shape.filename,
att.selection.column.name=
spatial.extent.list$att.selection.column.name,
selected.shape.names=
spatial.extent.list$selected.shape.names)
region.index <- (ttt.index & region.index)
}
} # if (!is.null(spatial.extent.list)){
# -------------------------------------------------------------------
# -------------------------------------------------------------------
# Limit Temporal Extent
# -------------------------------------------------------------------
season.index <- rep(TRUE, nrow(D) )
if (!is.null(temporal.extent.list)){
# Convert JDATE + YEAR to single temporal index
begin.tindex <- temporal.extent.list$begin.year +
temporal.extent.list$begin.jdate/366
end.tindex <- temporal.extent.list$end.year +
temporal.extent.list$end.jdate/366
D.tindex <- D$YEAR + D$DAY/366
season.index <- D.tindex >= begin.tindex &
D.tindex <= end.tindex
}
# -------------------------------------------------------------------
# ST extent
# -------------------------------------------------------------------
regional.seasonal.index <- region.index & season.index
sum(regional.seasonal.index)
ebirdReferenceDataFrame <- D[regional.seasonal.index, ]
rm(D)
## KFW gc(D)
#------------------------------------------------------------------------------------------------------------
# Train:Test Set Split : maximum coverage, eq wt train-test split
#------------------------------------------------------------------------------------------------------------
D.split <- maximum.coverage.selection(locs=data.frame(x=ebirdReferenceDataFrame$LONGITUDE ,
y=ebirdReferenceDataFrame$LATITUDE),
jdates=ebirdReferenceDataFrame$DAY,
# vector length n
grid.cell.min.lat = split.par.list$grid.cell.min.lat,
grid.cell.min.lon = split.par.list$grid.cell.min.lon,
min.val.cell.locs = split.par.list$min.val.cell.locs,
fraction.training.data = split.par.list$fraction.training.data,
mfrac = split.par.list$mfrac,
plot.it=split.par.list$plot.it)
# -----------------------------------------------
train.index <- D.split$train.index
test.index <- D.split$test.index
rm(D.split)
## KFW response.index <- names(ebirdReferenceDataFrame) %in% speciesList[[1]][1]
response.index <- names(ebirdReferenceDataFrame) %in% species.name
predictors.index <- names(ebirdReferenceDataFrame) %in% predictor.names
train.data <- NULL
test.data <- NULL
# ---------------------
train.data$X <- ebirdReferenceDataFrame[train.index, predictors.index]
ttt.y <- ebirdReferenceDataFrame[, response.index]
train.data$y <- ttt.y[train.index]
rm(ttt.y)
train.data$locs$x <- ebirdReferenceDataFrame$LONGITUDE[train.index]
train.data$locs$y <- ebirdReferenceDataFrame$LATITUDE[train.index]
train.data$jdates <- ebirdReferenceDataFrame$DAY[train.index]
# ------------------------------------
test.data$X <- ebirdReferenceDataFrame[test.index,predictors.index]
ttt.y <- ebirdReferenceDataFrame[, response.index]
test.data$y <- ttt.y[test.index]
rm(ttt.y)
test.data$locs$x <- ebirdReferenceDataFrame$LONGITUDE[test.index]
test.data$locs$y <- ebirdReferenceDataFrame$LATITUDE[test.index]
test.data$jdates <- ebirdReferenceDataFrame$DAY[test.index]
# --------------------------------------------
rm(ebirdReferenceDataFrame)
# ----------------------------------------------------------------------
# ERD parameters
# ----------------------------------------------------------------------
#protocol.tag <- D.erd.data.par.list$protocol.tag
#c("traveling","stationary") #,"casual.obs")
#if (D.erd.data.par.list$response.type == "occurrence") {
# resp.family <- "bernoulli" # "gaussian" or "bernoulli"
# }
#if (D.erd.data.par.list$response.type == "abundance"){
# resp.family <- "gaussian" # "gaussian" or "bernoulli"
# }
# --------------------------------
if ( response.family =="bernoulli") {
# ------------------------------------
# Logical - where the predicted response is the
# probability of a positive/True class
# -------------------------------------
train.data$y <- as.factor( train.data$y > 0) # TRUE if positive
test.data$y <- as.factor( test.data$y > 0) # TRUE if positive
}
# Create Truncated Gaussian Response
if ( response.family == "gaussian"){
#
# truncate responses @ 10
#sum(train.data$y > response.truncation.cutoff)
train.data$y[train.data$y > response.truncation.cutoff] <- response.truncation.cutoff
test.data$y[test.data$y >response.truncation.cutoff] <- response.truncation.cutoff
}
# --------------------------------------------------------------------------
return.list <- list(
train.index = train.index,
test.index = test.index,
#protocol.tag=protocol.tag,
response.family=response.family,
train.data = train.data,
test.data = test.data,
srd.data = srd.data)
# --------------------------------------------------------------------------
return(return.list)
}
# --------------------------------------------------------------------------
|