A simple modeling workflow

Overview

Teaching: 20 min
Exercises: 20 min

Objectives

re-format data for model input

view the model

predict function

model evaluation

thresholds

5.0 Prepare occ & raster

library("raster")
library("dismo")

if(!file.exists("data/occ_raw.rdata")){
  occ_raw <- gbif(genus="Dasypus",species="novemcinctus",download=TRUE) 
  save(occ_raw,file = "data/occ_raw.rdata")
}else{
  load("data/occ_raw.rdata")
}
occ_clean <- subset(occ_raw,(!is.na(lat))&(!is.na(lon))) 
occ_unique <- occ_clean[!duplicated( occ_clean[c("lat","lon")]  ),]
occ_unique_specimen <- subset(occ_unique, basisOfRecord=="PRESERVED_SPECIMEN")
occ_final <- subset(occ_unique_specimen, year>=1950 & year <=2000)
coordinates(occ_final) <- ~ lon + lat
myCRS1 <- CRS("+init=epsg:4326") # WGS 84
crs(occ_final) <- myCRS1

if( !file.exists( paste0("data/bioclim/bio_10m_bil.zip")   )){
  utils::download.file(url="http://biogeo.ucdavis.edu/data/climate/worldclim/1_4/grid/cur/bio_10m_bil.zip",
                       destfile="data/bioclim/bio_10m_bil.zip"   ) 
  utils::unzip("data/bioclim/bio_10m_bil.zip",exdir="data/bioclim") 
}

clim_list <- list.files("data/bioclim/",pattern=".bil$",full.names = T)
clim <- raster::stack(clim_list) 

occ_buffer <- buffer(occ_final,width=4*10^5) #unit is meter
clim_mask <- mask(clim, occ_buffer)

5.1 Re-format data input for Maxent

The data input can either be spatial (i.e. spatial points + rasters) or *tabular (data frame).
Here is an example of using spatial data:

cat(class(clim_mask),"  ",  class(occ_final))

RasterBrick    SpatialPointsDataFrame

m0 <- maxent(x=clim_mask,p=occ_final)

Warning in .local(x, p, ...): 2 (0.39%) of the presence points have NA
predictor values

However, using spatial data input will make the model less under control. So if you want more control of the modeling process, it is recommended to prepare data in a tabular format. The ideal format is as following:
.

Here, we extract environmental conditions for occurrences (training, and testing) and background points, and merge them by row.

set.seed(1) 
bg <- sampleRandom(x=clim_mask,
                   size=10000,
                   na.rm=T, #removes the 'Not Applicable' points  
                   sp=T) # return spatial points 

set.seed(1) 

# randomly select 50% for training
selected <- sample(  1:nrow(occ_final),  nrow(occ_final)*0.5)

occ_train <- occ_final[selected,] # this is the selection to be used for model training
occ_test <- occ_final[-selected,] # this is the opposite of the selection which will be used for model testing


# extracting env conditions for training occ from the raster stack;
# a data frame is returned (i.e multiple columns)
env_occ_train <- extract(clim,occ_train)

# env conditions for testing occ
env_occ_test <- extract(clim,occ_test)

# extracting env conditions for background
env_bg <- extract(clim,bg)  

#combine the conditions by row
myPredictors <- rbind(env_occ_train,env_bg)

# change matrix to dataframe
myPredictors <- as.data.frame(myPredictors)
head(myPredictors)

  bio1 bio10 bio11 bio12 bio13 bio14 bio15 bio16 bio17 bio18 bio19 bio2
236   275   193  1309   158    44    35   453   160   418   160  118
197   220   166  1358   256    23    70   683    86   625   107  112
180   270    86  1232   129    68    15   350   244   263   324  127
250   285   209   751   178     1   106   517     8   393    69  105
121   248    -8   593    90    11    56   262    42   249    42  148
240   243   236  4149   633    97    50  1690   372   665  1690  106
  bio3 bio4 bio5 bio6 bio7 bio8 bio9
 57 3279  336  131  205  270  193
 63 2205  274   98  176  220  166
 39 7168  340   15  325  220  265
 57 3052  330  147  183  283  240
 34 9906  339  -93  432  222   -8
 87  304  301  180  121  236  238

Maxent reads a 1 as presence and 0 as background. Thus, we need to assign a 1 to the training environmental conditions and a 0 for the background.We create a set of rows with the same number as the training and testing data, and put the value of “1” for each cell and a “0” for background. We combine the “1”s and “0”s into a vector.

# repeat the number 1 as many times as the number of rows in p, 
# and repeat 0 for the rows of background points
myResponse <- c(rep(1,nrow(env_occ_train)),
                rep(0,nrow(env_bg))) 

# (rep(1,nrow(p)) creating the number of rows as the p data set to 
# have the number '1' as the indicator for presence; rep(0,nrow(a)) 
# creating the number of rows as the a data set to have the number
# '0' as the indicator for absence; the c combines these ones and 
# zeros into a new vector that can be added to the Maxent table data
# frame with the environmental attributes of the presence and absence locations

mod <- maxent(x=myPredictors,p=myResponse)

5.2 View the model

mod@lambdas

  [1] "bio1, 0.0, -17.0, 288.0"                            
  [2] "bio10, 0.0, 28.0, 323.0"                            
  [3] "bio11, 0.0, -115.0, 279.0"                          
  [4] "bio12, 0.0, 0.0, 7507.0"                            
  [5] "bio13, 0.0, 0.0, 789.0"                             
  [6] "bio14, 0.0, 0.0, 471.0"                             
  [7] "bio15, 0.26763106606745934, 0.0, 179.0"             
  [8] "bio16, 0.0, 0.0, 2220.0"                            
  [9] "bio17, 0.0, 0.0, 1446.0"                            
 [10] "bio18, 0.0, 0.0, 1862.0"                            
 [11] "bio19, 0.0, 0.0, 1833.0"                            
 [12] "bio2, 0.0, 49.0, 197.0"                             
 [13] "bio3, 0.0, 24.0, 92.0"                              
 [14] "bio4, 0.0, 153.0, 11700.0"                          
 [15] "bio5, 0.0, 91.0, 420.0"                             
 [16] "bio6, 0.0, -218.0, 238.0"                           
 [17] "bio7, 0.0, 67.0, 458.0"                             
 [18] "bio8, 0.0, -109.0, 318.0"                           
 [19] "bio9, 0.0, -93.0, 290.0"                            
 [20] "bio11^2, -0.9050993113644507, 0.0, 77841.0"         
 [21] "bio6^2, -0.23747649426508424, 0.0, 56644.0"         
 [22] "bio9^2, 0.4275995833412999, 0.0, 84100.0"           
 [23] "bio1*bio4, 0.4275531776699976, -133807.0, 1744554.0"
 [24] "bio10*bio4, 1.2031027271578127, 25020.0, 2573018.0" 
 [25] "bio11*bio6, -0.30462690571479045, -3264.0, 63800.0" 
 [26] "bio11*bio8, -0.3250871769514228, -19623.0, 80352.0" 
 [27] "bio12*bio15, 0.19172246501526033, 0.0, 293293.0"    
 [28] "bio13*bio15, 1.9606573117627615, 0.0, 65130.0"      
 [29] "bio14*bio15, 1.580649741721404, 0.0, 8100.0"        
 [30] "bio18*bio2, -1.2542634205271663, 0.0, 180614.0"     
 [31] "bio18*bio7, -0.33814584317352503, 0.0, 247450.0"    
 [32] "bio19*bio4, -1.6067710599902456, 0.0, 5057568.0"    
 [33] "bio3*bio4, -0.1353239541968972, 13464.0, 376200.0"  
 [34] "bio3*bio5, -0.18718091328019476, 5100.0, 30096.0"   
 [35] "bio4*bio8, 1.0999308089617332, -857939.0, 2533188.0"
 [36] "(116.5<bio1), 1.7050295973145897, 0.0, 1.0"         
 [37] "(264.5<bio10), 0.35291115992405353, 0.0, 1.0"       
 [38] "(391.5<bio12), 0.5268415414479714, 0.0, 1.0"        
 [39] "(464.5<bio12), 0.2809049348106028, 0.0, 1.0"        
 [40] "(702.5<bio4), 0.02142587052662699, 0.0, 1.0"        
 [41] "(181.5<bio6), -0.0035686507209030487, 0.0, 1.0"     
 [42] "(94.5<bio15), 0.38328645908332803, 0.0, 1.0"        
 [43] "(113.5<bio7), 0.27630610703640396, 0.0, 1.0"        
 [44] "(44.5<bio3), -0.24341708475932264, 0.0, 1.0"        
 [45] "(169.5<bio2), -0.8805244643493182, 0.0, 1.0"        
 [46] "(180.5<bio6), -0.18330370526478784, 0.0, 1.0"       
 [47] "(108.5<bio7), 0.5002170217141941, 0.0, 1.0"         
 [48] "(89.5<bio15), 0.26746743942617796, 0.0, 1.0"        
 [49] "(161.5<bio2), -0.5999534648855973, 0.0, 1.0"        
 [50] "(168.5<bio1), 3.879523444910368E-4, 0.0, 1.0"       
 [51] "(644.5<bio18), 0.4529722070472205, 0.0, 1.0"        
 [52] "(6.5<bio17), 0.3377788196645386, 0.0, 1.0"          
 [53] "(83.5<bio2), 0.7336888159816374, 0.0, 1.0"          
 [54] "(200.5<bio10), -0.1446986549956838, 0.0, 1.0"       
 [55] "(2375.0<bio12), 0.21713583667210642, 0.0, 1.0"      
 [56] "(9534.0<bio4), 0.30445606508477613, 0.0, 1.0"       
 [57] "(922.5<bio18), -0.8194849771088258, 0.0, 1.0"       
 [58] "(166.5<bio14), -0.35761122017187547, 0.0, 1.0"      
 [59] "(90.5<bio14), 0.012535558287089285, 0.0, 1.0"       
 [60] "(229.5<bio9), 0.09777880311125317, 0.0, 1.0"        
 [61] "(120.5<bio2), -0.11265232152875598, 0.0, 1.0"       
 [62] "(147.5<bio2), 0.06641891160954848, 0.0, 1.0"        
 [63] "'bio3, -0.6590942767069409, 84.5, 92.0"             
 [64] "(118.5<bio2), -0.019175055467032555, 0.0, 1.0"      
 [65] "(9.5<bio19), 0.1082764651308983, 0.0, 1.0"          
 [66] "`bio1, -2.62299662570326, -17.0, 174.5"             
 [67] "(77.5<bio11), 0.1224812869941741, 0.0, 1.0"         
 [68] "'bio7, 1.0737412378814233, 388.5, 458.0"            
 [69] "`bio17, -0.27527123085013905, 0.0, 42.5"            
 [70] "(280.5<bio8), -0.11244424237993984, 0.0, 1.0"       
 [71] "(-8.5<bio11), 0.001422384121425667, 0.0, 1.0"       
 [72] "`bio12, -2.229804910918043, 0.0, 550.0"             
 [73] "'bio11, 1.2088338560623306, 266.5, 279.0"           
 [74] "(91.5<bio14), 0.08522175616366341, 0.0, 1.0"        
 [75] "`bio15, 0.8558439229257911, 0.0, 18.5"              
 [76] "(77.5<bio3), -0.09865537583557168, 0.0, 1.0"        
 [77] "(1293.0<bio19), -0.1549899172584807, 0.0, 1.0"      
 [78] "(1046.0<bio18), -0.08883760942640118, 0.0, 1.0"     
 [79] "'bio6, -0.6530681835192078, 206.5, 238.0"           
 [80] "`bio18, -0.2316277231865191, 0.0, 68.5"             
 [81] "(269.5<bio8), 0.08085238645569892, 0.0, 1.0"        
 [82] "`bio12, -0.3914383332253453, 0.0, 551.5"            
 [83] "(358.5<bio5), -0.06183686404200746, 0.0, 1.0"       
 [84] "`bio1, -0.39386424700682987, -17.0, 175.5"          
 [85] "'bio15, 0.37506121489990235, 78.5, 179.0"           
 [86] "`bio15, 0.9011870410944306, 0.0, 19.5"              
 [87] "'bio15, 1.366947964804506, 77.5, 179.0"             
 [88] "(3279.5<bio4), -0.19794148295258748, 0.0, 1.0"      
 [89] "`bio10, 1.7863768328333707, 28.0, 224.5"            
 [90] "(283.5<bio8), -0.05669483888677688, 0.0, 1.0"       
 [91] "`bio1, -0.27054487968330077, -17.0, 179.5"          
 [92] "`bio1, -0.6186742117687162, -17.0, 180.5"           
 [93] "(355.5<bio5), -0.08548324725550764, 0.0, 1.0"       
 [94] "`bio17, -0.2424451986068934, 0.0, 11.5"             
 [95] "(2352.0<bio12), 0.018080484180326595, 0.0, 1.0"     
 [96] "(359.5<bio5), -0.029811208725023956, 0.0, 1.0"      
 [97] "`bio14, 0.17659628831660346, 0.0, 0.5"              
 [98] "`bio17, -0.29706599399834954, 0.0, 10.5"            
 [99] "`bio1, -0.9043074623818957, -17.0, 181.5"           
[100] "`bio3, 0.1325513115525726, 24.0, 48.5"              
[101] "(266.5<bio17), 0.031937862680527056, 0.0, 1.0"      
[102] "linearPredictorNormalizer, 6.840459818603078"       
[103] "densityNormalizer, 195.11664123760133"              
[104] "numBackgroundPoints, 1418"                          
[105] "entropy, 6.784636178667567"                         

5.3 Predict function

Running Maxent in R will not automatically make a projection to the data layers, unless you specify this using the parameter projectionlayers. However, we could make projections (to dataframes or raster layers) post hoc using the predict() function.

project model on raster layers (training layers)

# example 1, project to study area [raster]
ped1 <- predict(mod,clim_mask) # studyArea is the clipped rasters 
plot(ped1) # plot the continuous prediction

plot of chunk predict1

project model on raster layers (whole world maps)

# example 2, project to the world
ped2 <- predict(mod,clim)
plot(ped2)

plot of chunk predict2

example 3, project to a dataframe (training occurrences). This returns the predicion value assocaited with a set of condions. In this example, we use the training condition to extract a prediction for each point.

ped3 <- predict(mod,env_occ_train)
head(ped3)

[1] 0.3992549 0.3542653 0.7025179 0.6716762 0.6041868 0.4814589

5.4 Model evaluation

To evaluate models, we use the evaluate() function from the dismo package. Evaluation indices include AUC, TSS, Sensitivity, Specificity, etc.

Model evaluation, where p & a are dataframes (environmental conditions for presences and background points)

Evaluate model with training data

mod_eval_train <- dismo::evaluate(p=env_occ_train,
                                  a=env_bg,
                                  model=mod) 
print(mod_eval_train)

class          : ModelEvaluation 
n presences    : 303 
n absences     : 1122 
AUC            : 0.8896257 
cor            : 0.5906694 
max TPR+TNR at : 0.3927896 

Evaluate model with testing data

mod_eval_test <- dismo::evaluate(p=env_occ_test,
                                 a=env_bg,
                                 model=mod)  
print(mod_eval_test) 

class          : ModelEvaluation 
n presences    : 304 
n absences     : 1122 
AUC            : 0.8279007 
cor            : 0.5036505 
max TPR+TNR at : 0.393721 

compare training & testing AUC

cat( "the training AUC is: ",mod_eval_train@auc ,"\n" )

the training AUC is:  0.8896257

cat( "the testing AUC is: ", mod_eval_test@auc  ,"\n" )

the testing AUC is:  0.8279007

5.5 Thresholds

To threshold our continuous predictions of suitability into binary predictions we use the threshold function of the “dismo” package. To plot the binary prediction, we plot the predictions that are larger than the threshold.

Here we use threshold() function to obtain particular thresholds based on evaluation results from the previous step.

thd1 <- threshold(mod_eval_train,stat="no_omission") # 0% omission rate 
thd2 <- threshold(mod_eval_train,stat="spec_sens") # highest TSS
thd3 <- threshold(mod_eval_train,stat="sensitivity",sensitivity=0.9) # 10% omission rate, i.e. sensitivity=0.9
thd4 <- threshold(mod_eval_train,stat="sensitivity",sensitivity=0.95) # 5% omission rate, i.e. sensitivity=0.95

# plotting points that are higher than the previously calculated thresholded value
plot(ped1>=thd1) 

plot of chunk model_evaluation2

Challenge: train a Maxent model with dataframe as input, calculate the AUC

–load occurrences & raster layers
–build a xxx meter buffer around occurrences
–mask raster by the buffer of occurrences
–generate random samples from the masked raster using sampleRandom()
–extract() environmental conditions from raster by points
–re-format the environmental conditions as input for maxent
–train a maxent model
–evaluate() the model with testing environmental conditions

Solution

library("raster")
library("dismo")

# prepare spatial occ data
if(!file.exists("data/occ_raw.rdata")){
  occ_raw <- gbif(genus="Dasypus",species="novemcinctus",download=TRUE) 
  save(occ_raw,file = "data/occ_raw.rdata")
}else{
  load("data/occ_raw.rdata")
}
occ_clean <- subset(occ_raw,(!is.na(lat))&(!is.na(lon))) 
occ_unique <- occ_clean[!duplicated( occ_clean[c("lat","lon")]  ),]
occ_unique_specimen <- subset(occ_unique, basisOfRecord=="PRESERVED_SPECIMEN")
occ_final <- subset(occ_unique_specimen, year>=1950 & year <=2000)
coordinates(occ_final) <- ~ lon + lat
myCRS1 <- CRS("+init=epsg:4326") # WGS 84
crs(occ_final) <- myCRS1

# prepare raster data
if( !file.exists( paste0("data/bioclim/bio_10m_bil.zip")   )){
  utils::download.file(url="http://biogeo.ucdavis.edu/data/climate/worldclim/1_4/grid/cur/bio_10m_bil.zip",
                       destfile="data/bioclim/bio_10m_bil.zip"   ) 
  utils::unzip("data/bioclim/bio_10m_bil.zip",exdir="data/bioclim") 
}

# load rasters
clim_list <- list.files("data/bioclim/",pattern=".bil$",full.names = T)
clim <- raster::stack(clim_list) 

occ_buffer <- buffer(occ_final,width=4*10^5) #unit is meter
clim_mask <- mask(clim, occ_buffer)

# extract environmental conditions
set.seed(1) 
bg <- sampleRandom(x=clim_mask,
                   size=10000,
                   na.rm=T, #removes the 'Not Applicable' points  
                   sp=T) # return spatial points 

set.seed(1) 

# randomly select 50% for training
selected <- sample(  1:nrow(occ_final),  nrow(occ_final)*0.5)

occ_train <- occ_final[selected,] # this is the selection to be used for model training
occ_test <- occ_final[-selected,] # this is the opposite of the selection which will be used for model testing

# extracting env conditions
env_occ_train <- extract(clim,occ_train)
env_occ_test <- extract(clim,occ_test)

# extracting env conditions for background
env_bg <- extract(clim,bg)  

#combine the conditions by row
myPredictors <- rbind(env_occ_train,env_bg)

# change matrix to dataframe
myPredictors <- as.data.frame(myPredictors)

# repeat the number 1 as many times as the number of rows in p, and repeat 0 for the rows of background points
myResponse <- c(rep(1,nrow(env_occ_train)),
                rep(0,nrow(env_bg))) 

# training a maxent model with dataframes
mod <- dismo::maxent(x=myPredictors, ## env conditions
                     p=myResponse)   ## 1:presence or 0:absence

# evaluate model based on testing data
mod_eval_test <- dismo::evaluate(p=env_occ_test,
                                 a=env_bg,
                                 model=mod) 
mod_eval_test@auc

previous episode

next episode