####Setup ####
#all the packages needed for this tutorial are listed here
library(mgcv)
library(MASS)
library(stringr)
library(gamm4)
library(tidyr)
library(ggplot2)
library(viridis)
library(cowplot)
library(kableExtra)
library(docxtools)
library(knitr)
library(tibble)
library(dplyr)

#Set the default theme for ggplot objects to theme_bw()
theme_set(theme_bw())
theme_update(panel.grid = element_blank())

#### Code for part I: Introduction ####

#### Code for II: A review of Generalized Additive Models ####
# example of varying lambda

set.seed(12)

# generate some data
dat <- gamSim(1, n=100, dist="normal", scale=2)
dat$y <- dat$y - (dat$f1 + dat$f0 + dat$f3)
dat$x <- dat$x2
true <- data.frame(x = sort(dat$x),
                   y = dat$f2[order(dat$x)])

## REML fit
b <- gam(y~s(x, k=100), data=dat, method = "REML")

# lambda=0
b.0 <- gam(y~s(x, k=100), data=dat, sp=0)

# lambda=infinity
b.inf <- gam(y~s(x, k=100), data=dat, sp=1e10)

# merging predictions from the models together to make plotting easier
pdat <- with(dat, data.frame(x = seq(min(x), max(x), length = 200)))
p <- cbind(pdat, fit = predict(b, newdata = pdat))
p.0 <- cbind(pdat, fit = predict(b.0, newdata = pdat))
p.inf <- cbind(pdat, fit = predict(b.inf, newdata = pdat))
ylims <- range(p, p.0, p.inf)

lab.l <- labs(x = "x", y = "y")
dat.l <- geom_point(data = dat, aes(x = x, y = y), colour = "darkgrey")
true.l <- geom_line(data = true, aes(x = x, y = y), colour = "blue")
coord.l <- coord_cartesian(ylim = ylims)

#plotting models
p1 <- ggplot(p, aes(x = x, y = fit)) +
  dat.l + true.l +
  geom_line(colour = "darkred") + lab.l + coord.l

p2 <- ggplot(p.0, aes(x = x, y = fit)) +
  dat.l + true.l +
  geom_line(colour = "darkred") + lab.l + coord.l

p3 <- ggplot(p.inf, aes(x = x, y = fit)) +
  dat.l + true.l +
  geom_line(colour = "darkred") + lab.l + coord.l

plot_grid(p1, p2, p3, align = "hv", axis = "lrtb", ncol = 3, labels = "auto")

#Code for generating figure 3: examples of basis functions and splines ####
k = 6
plotting_data = data.frame(x = seq(0,1,length=100))

#This creates the basis functions for a thin-plate spline
#The absorb.cons=FALSE setting makes sure that smoothCon does not remove basis 
#functions that have a non-zero sum (in this case, the intercept). Absorbing 
#constraints would result in having less than k basis functions, which is why fitted 
#terms in mgcv often have less than k maximum EDF. 
tp_basis = smoothCon(s(x,bs="tp",k=k), data=plotting_data,
                     knots=NULL,absorb.cons=FALSE)[[1]]

#### Extract basis functions #### 
tp_basis_funcr = as.data.frame(tp_basis$X)
names(tp_basis_funcr) = paste("F",1:k, sep="")
tp_basis_funcr$x = plotting_data$x
tp_basis_funcr$model = "Thin plate spline"

spline_basis_funcr = gather(tp_basis_funcr,func,value, -x,-model )



##### Extract penalty matrices ####

tp_basis_P = as.data.frame(tp_basis$S[[1]])
tp_basis_P = tp_basis_P/max(tp_basis_P)
names(tp_basis_P) = paste("F",1:k, sep="")
tp_basis_P$basis_y = factor(paste("F",1:k, sep=""), 
                            levels= rev(paste("F",1:k, sep="")))
tp_basis_P$model = "Thin plate spline"
spline_basis_penalties = gather(tp_basis_P ,basis_x,value,
                                -basis_y,-model )

spline_basis_penalties$penalty_type = "Smoothness penalty"


##### Creating a random draw from the function ####
set.seed(6) #ensure we can reproduce this example
coef_sample = rmvn(n = 1, mu = rep(0, times = k),V = 4*ginv(tp_basis$S[[1]]))
coef_sample[(k-1):k] = c(-0.1,0.12) #randomly draw the null space terms from a normal distribution
random_basis = as.matrix(tp_basis_funcr[,1:6])%*%diag(coef_sample) 
random_basis = as.data.frame(random_basis)
names(random_basis) = names(tp_basis_funcr)[1:6]

tp_example_curve = tp_basis_funcr %>%
  select(-matches("F[1-9]")) %>% #remove the old basis functions
  bind_cols(random_basis) %>%#append the new basis functions multiplied by a random sample of coefficients
  mutate(Ftotal = rowSums(select(., matches("F[1-9]"))))%>%
  gather(key = `basis function`, value = value, matches("F[1-9]"))

  
bs_func_labels = tp_example_curve %>%
  group_by(`basis function`)%>%
  summarise(value = value[x==max(x)],
            x     = max(x))%>%
  mutate(coef = round(coef_sample,2),
         basis_label = paste("paste(",`basis function`,"%*%", coef, ")", sep=""))




#### Creating plots for smoothness and null-space penalties
basis_func_plot = ggplot(aes(x=x,y=value),data=spline_basis_funcr)+
  geom_line()+
  scale_x_continuous(breaks=seq(0,1,length=3),
                     labels=c("0","0.5","1"))+
  facet_wrap(~func)+
  theme_bw()+
  theme(strip.background = element_blank(), panel.grid = element_blank())

basis_penalty_plot = ggplot(aes(x=basis_x,y=basis_y,fill=value),
                            data=spline_basis_penalties)+
  geom_tile(color="black")+
  theme_bw()+
  scale_fill_gradient2("penalty",high = "#b2182b",low="#2166ac",midpoint = 0 )+
  theme(strip.background = element_blank())+
  labs(x="", y="")+
  coord_fixed()+
  scale_x_discrete(expand = c(0,0))+
  scale_y_discrete(expand = c(0,0))+
  theme(axis.ticks=element_blank(),
        panel.grid=element_blank())


basis_sample_plot = ggplot(data= tp_example_curve, aes(x, value))+
  geom_line(aes(y = Ftotal),size=2)+
  geom_line(aes(group = `basis function`,color= `basis function`),size=0.5)+ 
  geom_text(data= bs_func_labels,
            aes(label = basis_label, 
                color= `basis function`,
                y = value),
            parse=T, hjust = 0,nudge_x = 0.01)+
  scale_x_continuous(expand = c(0,0),
                     limits = c(0,1.15))+
  scale_color_viridis_d()+
  guides(color = "none")


top_row_plot = plot_grid(basis_func_plot,basis_penalty_plot,
                         ncol=2,
                         rel_widths = c(1.,1),labels= c("","b"))
  
full_plot = plot_grid(top_row_plot,basis_sample_plot,
                      nrow=2,labels= c("a", "c"),axis = "t",rel_heights = c(1,0.9))

full_plot

#### Code for III: What are hierarchical GAMs? ####

#The default CO2 plant variable is ordered;
#This recodes it to an unordered factor (see main text for why).
CO2 <- transform(CO2, Plant_uo=factor(Plant, ordered=FALSE))

#Loading simulated bird movement data
bird_move <- read.csv("../data/bird_move.csv")

CO2_vis_plot <- ggplot(CO2, aes(x=conc, y=uptake, group=Plant,color=Plant, lty=Plant)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = rep(c("red","blue","black"), times =4))+
  scale_linetype_manual(values = rep(1:4, each=3))+
  guides(color="none",linetype="none")+
  labs(x=expression(CO[2] ~ concentration ~ (mL ~ L^{-1})), y=expression(CO[2] ~ uptake ~ (mu*mol ~ m^{-2})))

bird_vis_plot <- ggplot(dplyr::filter(bird_move, count > 0),
                        aes(x=week, y=latitude, size=count))+
  facet_wrap(~ species) +
  geom_point() +
  scale_size(name = "Count", range = c(0.2, 3)) +
  labs(x = "Week", y = "Latitude") +
  theme(legend.position = "bottom")

plot_grid(CO2_vis_plot, bird_vis_plot, nrow=1, labels=c("a","b"),
          align = "hv", axis = "lrtb")
CO2_mod1 <- gam(log(uptake) ~ s(log(conc), k=5, bs="tp") +
                  s(Plant_uo, k=12, bs="re"),
                data=CO2, method="REML", family="gaussian")

plot(CO2_mod1, pages=1, seWithMean=TRUE)

# setup prediction data
CO2_mod1_pred <- with(CO2,
                      expand.grid(conc=seq(min(conc), max(conc), length=100),
                                  Plant_uo=levels(Plant_uo)))

# make the prediction, add this and a column of standard errors to the prediction
# data.frame. Predictions are on the log scale.
CO2_mod1_pred <- cbind(CO2_mod1_pred,
                       predict(CO2_mod1, CO2_mod1_pred, se.fit=TRUE, type="response"))

# make the plot. Note here the use of the exp() function to back-transform the
# predictions (which are for log-uptake) to the original scale
ggplot(data=CO2, aes(x=conc, y=uptake, group=Plant_uo)) +
  facet_wrap(~Plant_uo) +
  geom_point() +
  geom_line(aes(y=exp(fit)), data=CO2_mod1_pred) +
  geom_ribbon(aes(ymin=exp(fit - 2*se.fit), ymax=exp(fit + 2*se.fit), x=conc),
              data=CO2_mod1_pred, alpha=0.3, inherit.aes=FALSE) +
  labs(x=expression(CO[2] ~ concentration ~ (mL ~ L^{-1})),
       y=expression(CO[2] ~ uptake ~ (mu*mol ~ m^{-2})))
## # Note for specifying tensor products: you can either specify bs (basis) and
## # k (number of basis functions) as single values, which would assign the same
## # basis and k to each marginal value, or pass them as vectors, one value for each
## # distinct marginal smoother (see ?mgcv::te for details)
bird_mod1 <- gam(count ~ te(week, latitude, bs=c("cc", "tp"), k=c(10, 10)),
                 data=bird_move, method="REML", family="poisson",
                 knots = list(week = c(0, 52)))
#mgcv gam plot for the two-dimensional tensor product smoother for bird_mod1.
#scheme=2 displays the color scheme (rather than mgcv's default, which only
#shows contour lines)
plot(bird_mod1, pages=1, scheme=2, rug=FALSE)
box()
#add the predicted values from the model to bird_move
bird_move <- transform(bird_move, mod1 = predict(bird_mod1, type="response"))

ggplot(bird_move, aes(x=mod1, y=count)) +
  facet_wrap(~species) +
  geom_point() +
  geom_abline() +
  labs(x="Predicted count", y= "Observed count")
CO2_mod2 <- gam(log(uptake) ~ s(log(conc), k=5, m=2) + 
                  s(log(conc), Plant_uo, k=5,  bs="fs", m=2),
                data=CO2, method="REML")
plot(CO2_mod2, page=1, seWithMean=TRUE)
CO2_mod2_pred <- predict(CO2_mod2, se.fit=TRUE)
CO2 <- transform(CO2, mod2 = CO2_mod2_pred$fit, mod2_se = CO2_mod2_pred$se.fit)

ggplot(data=CO2, aes(x=conc, y=uptake, group=Plant_uo)) +
  facet_wrap(~Plant_uo) +
  geom_point() +
  geom_line(aes(y=exp(mod2))) +
  geom_ribbon(aes(ymin=exp(mod2-2*mod2_se),
                  ymax=exp(mod2+2*mod2_se)), alpha=0.25) +
  labs(x=expression(CO[2] ~ concentration ~ (mL ~ L^{-1})),
       y=expression(CO[2] ~ uptake ~ (mu*mol ~ m^{-2})))
bird_mod2 <- gam(count ~ te(week, latitude, bs=c("cc", "tp"),
                            k=c(10, 10), m=c(2, 2)) +
                   t2(week, latitude, species, bs=c("cc", "tp", "re"),
                      k=c(10, 10, 6), m=c(2, 2, 2), full=TRUE),
                 data=bird_move, method="REML", family="poisson", 
                 knots = list(week = c(0, 52)))
bird_move <- transform(bird_move, mod2 = predict(bird_mod2, type="response"))

bird_mod2_indiv <- ggplot(data=bird_move, aes(x=week, y=latitude, fill=mod2,color=mod2)) +
  geom_tile(size=0.25) +
  facet_wrap(~ species, ncol=6) +
  scale_fill_viridis("Count") +
  scale_color_viridis("Count") +
  scale_x_continuous(expand=c(0, 0), breaks=c(1, 26, 52)) +
  scale_y_continuous(expand=c(0, 0), breaks=c(0, 30, 60)) +
  labs(x = "Week", y = "Latitude") +
  theme(legend.position="right")

bird_mod2_indiv_fit <- ggplot(data=bird_move, aes(x=mod2, y=count)) +
  facet_wrap(~ species, ncol=6) +
  geom_point() +
  geom_abline() +
  labs(x="Predicted count (model 2)", y= "Observed count")

plot_grid(bird_mod2_indiv, bird_mod2_indiv_fit, ncol=1, align="vh", axis = "lrtb",
          labels=c("a","b"), rel_heights= c(1,1))
## CO2_mod3 <- gam(log(uptake) ~ s(log(conc), k=5, m=2, bs="tp") +
##                   s(log(conc), by=Plant_uo, k=5, m=1, bs="ts") +
##                   s(Plant_uo, bs="re", k=12),
##                 data=CO2, method="REML")
CO2_mod3 <- gam(log(uptake) ~ s(log(conc), k=5, m=2, bs="tp") +
                  s(log(conc), by= Plant_uo, k=5, m=1, bs="ts") +
                  s(Plant_uo, bs="re", k=12),
                data=CO2, method="REML")

op <- par(mfrow=c(2, 3), mar =c(4, 4, 1, 1))
plot(CO2_mod3, scale=0, select=1,  ylab="Global smoother", seWithMean=TRUE)
plot(CO2_mod3, scale=0, select=14, ylab="Intercept",     main=NA)
plot(CO2_mod3, scale=0, select=3,  ylab="Plant Qn1",     seWithMean=TRUE)
plot(CO2_mod3, scale=0, select=5,  ylab="Plant Qc1",     seWithMean=TRUE)
plot(CO2_mod3, scale=0, select=10, ylab="Plant Mn1",     seWithMean=TRUE)
plot(CO2_mod3, scale=0, select=13, ylab="Plant Mc1",     seWithMean=TRUE)
par(op)
bird_mod3 <- gam(count ~ species +
                   te(week, latitude, bs=c("cc", "tp"),
                      k=c(10, 10), m=c(2, 2)) +
                   te(week, latitude, by=species, bs= c("cc", "ts"),
                      k=c(10, 10), m=c(1, 1)),
                 data=bird_move, method="REML", family="poisson",
                 knots = list(week = c(0, 52)))
CO2_mod4 <- gam(log(uptake) ~ s(log(conc), Plant_uo, k=5, bs="fs", m=2),
                data=CO2, method="REML")

bird_mod4 <- gam(count ~ t2(week, latitude, species, bs=c("cc", "tp", "re"),
                            k=c(10, 10, 6), m=c(2, 2, 2)),
                 data=bird_move, method="REML", family="poisson",
                 knots = list(week = c(0, 52)))
CO2_mod5 <- gam(log(uptake) ~ s(log(conc), by=Plant_uo, k=5, bs="tp", m=2) +
                              s(Plant_uo, bs="re", k=12),
                data= CO2, method="REML")


bird_mod5 <- gam(count ~ species + 
                   te(week, latitude, by=species, bs= c("cc", "ts"), 
                      k=c(10, 10), m=c(2, 2)),
                 data=bird_move, method="REML", family="poisson",
                 knots = list(week = c(0, 52)))
#This is the raw 
AIC_table = AIC(CO2_mod1,CO2_mod2, CO2_mod3, CO2_mod4, CO2_mod5,
             bird_mod1, bird_mod2, bird_mod3, bird_mod4, bird_mod5)%>%
  rownames_to_column(var= "Model")%>%
  mutate(data_source = rep(c("CO2","bird_data"), each =5))%>%
  group_by(data_source)%>%
  mutate(deltaAIC = AIC - min(AIC))%>%
  ungroup()%>%
  dplyr::select(-data_source)%>%
  mutate_at(.vars = vars(df,AIC, deltaAIC), .funs = funs(round,.args = list(digits=0)))

#### Code for IV: Examples ####
zooplankton <- read.csv("../data/zooplankton_example.csv")%>%
  mutate(year_f = factor(year))

#This is what the data looks like:
str(zooplankton)
levels(zooplankton$taxon)
levels(zooplankton$lake)

# We'll now break it into testing and training data. The training data will be
# used to fit the model, and the testing data will be used to evaluate model fit.

#the first training and testing data set will be used to compare dynamics of
#plankton communities in Lake Mendota
zoo_train <- subset(zooplankton, year%%2==0 & lake=="Mendota")
zoo_test  <- subset(zooplankton, year%%2==1 & lake=="Mendota") 

#The second training and testing set will compare Daphnia mendotae dynamics among
#four lakes
daphnia_train <- subset(zooplankton, year%%2==0 & taxon=="D. mendotae")
daphnia_test  <- subset(zooplankton, year%%2==1 & taxon=="D. mendotae")

#This function calculates the root-mean-squared-error for out-of-sample data
get_RMSE <- function(fit, obs) sqrt(mean((fit-obs)^2))

zoo_comm_mod4 <- gam(density_adj ~ s(day, taxon,
                                     bs="fs",
                                     k=10,
                                     xt=list(bs="cc"))+
                                   s(taxon, year_f, bs="re"),
                     data=zoo_train,
                     knots = list(day =c(0, 365)),
                     family = Gamma(link ="log"), 
                     method = "REML",
                     drop.unused.levels = FALSE)
# Note that  s(taxon, bs="re") has to be explicitly included here, as the 
# day  by taxon smoother does not include an intercept
zoo_comm_mod5 <- gam(density_adj ~ s(day, by=taxon,
                                     k=10, bs="cc") + 
                                   s(taxon, bs="re") +
                                   s(taxon, year_f, bs="re"),
                     data=zoo_train,
                     knots = list(day =c(0, 365)),
                     family = Gamma(link ="log"), 
                     method = "REML",
                     drop.unused.levels = FALSE)
## gam.check(zoo_comm_mod4)
## gam.check(zoo_comm_mod5)
#individual components of gam.check: the results for k.check
round(k.check(zoo_comm_mod5),2)
#individual components of gam.check: residual plots
par(mfrow= c(1,2))
qq.gam(zoo_comm_mod4)
plot(log(fitted(zoo_comm_mod5)), residuals.gam(zoo_comm_mod5,type = "deviance"), xlab = "linear predictor",
     ylab = "residuals")
#Create synthetic data to use to compare predictions
zoo_plot_data <- expand.grid(day = 1:365, taxon = factor(levels(zoo_train$taxon)), year_f = 1980)

#extract predicted values and standard errors for both models. the exclude = "s(taxon,year_f)" 
#term indicates that predictions should be made excluding the effect of the
#taxon by year random effect (effectively setting making predictions averaging
#over year-taxon effects).
zoo_mod4_fit <- predict(zoo_comm_mod4, zoo_plot_data, se.fit = T, exclude = "s(taxon,year_f)")
zoo_mod5_fit <- predict(zoo_comm_mod5, zoo_plot_data, se.fit = T, exclude = "s(taxon,year_f)")

zoo_plot_data$mod4_fit <- as.numeric(zoo_mod4_fit$fit)
zoo_plot_data$mod5_fit <- as.numeric(zoo_mod5_fit$fit)

zoo_plot_data <- gather(zoo_plot_data, model, fit, mod4_fit, mod5_fit)
zoo_plot_data <- mutate(zoo_plot_data, se= c(as.numeric(zoo_mod4_fit$se.fit), as.numeric(zoo_mod5_fit$se.fit)),
                         upper = exp(fit + (2 * se)),
                         lower = exp(fit - (2 * se)),
                         fit   = exp(fit))

#Plot the model output, with means plus standard deviations for each model.
zoo_plot <- ggplot(zoo_plot_data) +
  facet_wrap(~taxon, nrow = 4,scales = "free_y")+
  geom_point(data= zoo_train, aes(x = day, y = density_adj),size=0.1)+
  geom_line(aes(x = day, y = fit, color = model))+
  geom_ribbon(aes(x=day,
                  ymin = lower,
                  ymax = upper,
                  fill = model),
              alpha=0.2)+
  labs(y = expression(atop(Population~density,("10 000"~individuals~m^{-2}))), x = "Day of Year") +
    scale_fill_brewer(name = "", palette = "Dark2",
                      labels = paste("Model", 4:5)) +
    scale_colour_brewer(name = "",
                        palette = "Dark2", labels = paste("Model", 4:5))+
  theme(legend.position = "top")

zoo_plot

#Getting the out of sample predictions for both models:

# we need to compare how well this model fits with a null model. here we'll use an
# intercept-only model
zoo_comm_mod0 <- gam(density_adj ~ s(taxon,bs="re"),
                     data=zoo_train,
                     knots = list(day =c(0, 365)),
                     family = Gamma(link ="log"), 
                     method = "REML",
                     drop.unused.levels = FALSE)

#Correlations between fitted and observed values for all species:
#\n is in variable titles to add a line break in the printed table.
zoo_test_summary = zoo_test %>%
  mutate(
    mod0 = predict(zoo_comm_mod0, ., type="response"),
    mod4 = predict(zoo_comm_mod4, ., type="response"),
    mod5 = predict(zoo_comm_mod5, ., type="response"))%>%
  group_by(taxon)%>%
  summarise(
    `Intercept only` = format(get_RMSE(mod0, density_adj), scientific = FALSE, digits=3),
    `Model 4` = format(get_RMSE(mod4, density_adj), scientific = FALSE, digits=3),
    `Model 5` = format(get_RMSE(mod5, density_adj), scientific = FALSE, digits=3))%>%
  mutate(taxon = cell_spec(taxon, italic = c(T,F,F,T,T,T,T,T))) #need to specify this to ensure that species names are italized in the table

zoo_daph_mod1 <- gam(density_adj ~ s(day, bs="cc", k=10)+
                       s(lake, bs="re") + 
                       s(lake, year_f,bs="re"),
                     data=daphnia_train,
                     knots=list(day =c(0, 365)),
                     family=Gamma(link ="log"),
                     method="REML",
                     drop.unused.levels = FALSE)
zoo_daph_mod2 <-
  gam(density_adj ~ s(day, bs="cc", k=10) + 
        s(day, lake, k=10, bs="fs", xt=list(bs="cc")) + 
        s(lake, year_f,bs="re"),
      data=daphnia_train, 
      knots=list(day=c(0, 365)), 
      family=Gamma(link ="log"),
      drop.unused.levels = FALSE,
      method="REML")
zoo_daph_mod3 <- gam(density_adj~s(day, bs="cc", k=10) +
                             s(day, by=lake, k=10, bs="cc")+
                             s(lake, bs="re") + 
                             s(lake, year_f,bs="re"),
                     data=daphnia_train,
                     knots=list(day =c(0, 365)),
                     family=Gamma(link ="log"),
                     method="REML",
                     drop.unused.levels = FALSE)
#Create synthetic data to use to compare predictions
daph_plot_data <- expand.grid(day = 1:365, lake = factor(levels(zoo_train$lake)),year_f = 1980)

#extract predicted values and standard errors for both models. the exclude = "s(taxon,year_f)" 
#term indicates that predictions should be made excluding the effect of the
#taxon by year random effect (effectively setting making predictions averaging
#over year-taxon effects).
daph_mod1_fit <- predict(zoo_daph_mod1, daph_plot_data, se.fit = TRUE, exclude = "s(lake,year_f)")
daph_mod2_fit <- predict(zoo_daph_mod2, daph_plot_data, se.fit = TRUE, exclude = "s(lake,year_f)")
daph_mod3_fit <- predict(zoo_daph_mod3, daph_plot_data, se.fit = TRUE, exclude = "s(lake,year_f)")

daph_plot_data$mod1_fit <- as.numeric(daph_mod1_fit$fit)
daph_plot_data$mod2_fit <- as.numeric(daph_mod2_fit$fit)
daph_plot_data$mod3_fit <- as.numeric(daph_mod3_fit$fit)

daph_plot_data <- gather(daph_plot_data, model, fit, mod1_fit, mod2_fit, mod3_fit)

daph_plot_data <- mutate(daph_plot_data, se = c(as.numeric(daph_mod1_fit$se.fit),
                                                as.numeric(daph_mod2_fit$se.fit),
                                                as.numeric(daph_mod3_fit$se.fit)),
                         upper = exp(fit + (2 * se)),
                         lower = exp(fit - (2 * se)),
                         fit   = exp(fit))


daph_plot <- ggplot(daph_plot_data, aes(x=day))+
  facet_wrap(~lake, nrow = 2)+
  geom_point(data= daphnia_train, aes(x = day, y = density_adj),size=0.1)+
  geom_ribbon(aes(x = day, ymin = lower, ymax = upper, fill = model), 
                alpha = 0.2) +
  geom_line(aes(x = day, y = fit, colour = model)) +

  labs(y = expression(atop(Population~density,("10 000"~individuals~m^{-2}))), x = "Day of Year") +
  scale_x_continuous(expand = c(0,0))+
    scale_fill_brewer(name = "", palette = "Dark2",
                      labels = paste("Model", 1:3)) +
    scale_colour_brewer(name = "",
                        palette = "Dark2", labels = paste("Model", 1:3))


daph_plot
# we need to compare how well this model fits with a null model. here we'll use an
# intercept-only model
zoo_daph_mod0 <- gam(density_adj~s(lake, bs="re"),
                     data=daphnia_train,
                     knots=list(day =c(0, 365)),
                     family=Gamma(link ="log"),
                     method="REML",
                     drop.unused.levels = FALSE)



# We'll look at the correlation between fitted and observed values for all species:

daph_test_summary <- daphnia_test %>%
  mutate(#get out-of-sample predicted fits
    mod0 = as.numeric(predict(zoo_daph_mod0,.,type="response")),
    mod1 = as.numeric(predict(zoo_daph_mod1,.,type="response")),
    mod2 = as.numeric(predict(zoo_daph_mod2,.,type="response")),
    mod3 = as.numeric(predict(zoo_daph_mod3,.,type="response")))%>%
  group_by(lake)%>%
  summarise(`Intercept only` = format(get_RMSE(mod0, density_adj), scientific = FALSE, digits=2),
            `Model 1` = format(get_RMSE(mod1, density_adj), scientific = FALSE, digits=2),
            `Model 2` = format(get_RMSE(mod2, density_adj), scientific = FALSE, digits=2),
            `Model 3` = format(get_RMSE(mod3, density_adj), scientific = FALSE, digits=2))%>%
  rename(Lake = lake)

#### Code for V: Computational and statistical issues when fitting HGAMs ####
#This code will generate the bias-variance tradeoff plot
set.seed(1)

#Calculate the numerical approximation of the 2nd derivative for a a function given x and y values 
#Assumes a constant step size (delta) for x along its path.
calc_2nd_deriv = function(x,y){
  deriv_val = (lag(y) + lead(y) - 2*y)/(x-lag(x))^2
  deriv_val
}

#Generate true regression functions that differ in their frequencies. 
#Higher frequencies correspond to more variable functions. 
freq_vals = c(1/4,1/2,1,2,4)
dat = crossing(x = seq(0,2*pi,length=150),freq = freq_vals)%>%
  mutate(y = sin(freq*x) +rnorm(n(), 0, 0.2),
         grp = paste("frequency = ",freq,sep= ""),
         grp = factor(grp,  levels = paste("frequency = ",freq_vals,sep= "")))

#Fit a type-4 function (shared smoothness) for the test data
mod1 = bam(y~s(x,k=30,grp, bs="fs"), data=dat)

#Fit a type-5 function (differing smoothness) for the test data
mod2 = bam(y~s(x,k=30,by=grp)+s(grp,bs="re"), data=dat)

#Extract fitted values for each model for all the test data
overfit_predict_data = crossing(x = seq(0,2*pi,length=500), freq = freq_vals)%>%
  mutate(grp = paste("frequency = ",freq,sep= ""),
         grp = factor(grp,  levels = paste("frequency = ",freq_vals,sep= "")),
         y = sin(freq*x))%>%
  mutate(fit1 = as.numeric(predict(mod1,newdata = .,type = "response")),
         fit2 = as.numeric(predict(mod2,newdata = .,type="response")))

#turn this into long-format data for plotting, and to make it easier to calculate derivatives
overfit_predict_data_long = overfit_predict_data %>%
  gather(model, value, y, fit1, fit2)%>%
  mutate(model = recode(model, y = "true value",fit1 = "model 4 fit",
                        fit2 = "model 5 fit"),
         model = factor(model, levels=  c("true value","model 4 fit",
                                          "model 5 fit")))

#estimate 2nd derivatives of each curve, then for each curve calculate the sum of
#squared 2nd derivatives of the true curve and the predictions for both models.
deriv_est_data = overfit_predict_data%>%
  group_by(grp)%>%
  arrange(grp, x)%>%
  mutate(fit1_deriv = calc_2nd_deriv(x,fit1),
         fit2_deriv = calc_2nd_deriv(x,fit2))%>%
  summarize(freq = freq[1], fit1_int = sum(fit1_deriv^2*(x-lag(x)),na.rm = T),
            fit2_int = sum(fit2_deriv^2*(x-lag(x)),na.rm = T))%>%
  ungroup()%>%
  mutate(sqr_2nd_deriv = -freq^3*(sin(4*pi*freq)-4*pi*freq)/4)%>%
  gather(key=model,value = obs_sqr_deriv,fit1_int,fit2_int)%>%
  mutate(model = factor(ifelse(model=="fit1_int", "model 4 fit",
                               "model 5 fit"),
                        levels = c("model 4 fit",
                                   "model 5 fit")))

#The derivative plots
deriv_plot =  ggplot(data=deriv_est_data, aes(x=sqr_2nd_deriv, y= obs_sqr_deriv,color= model))+
  geom_point()+
  scale_y_log10("Estimated wiggliness\nof fitted curves")+
  scale_x_log10("Wiggliness of true curve")+
  scale_color_brewer(name=NULL,palette= "Set1")+
  geom_abline(color="black")+
  theme(legend.position = "top")

fit_colors = c("black",RColorBrewer::brewer.pal(3, "Set1")[1:2])

overfit_vis_plot = ggplot(data=overfit_predict_data_long,aes(x=x,y= value,color=model))+
  geom_line()+
  scale_color_manual(values=fit_colors)+
  facet_grid(.~grp)+
  theme(legend.position = "top")
#Plot the overfit graphs together.
cowplot::plot_grid(overfit_vis_plot, deriv_plot, ncol=1, labels="auto",
                   align="hv", axis="lrtb")
#Note: this code takes quite a long time to run! It's fitting all 10 models.
#Run once if possible, then rely on the cached code. There's a reason it's split off from the rest of the chunks of code.

#This function extracts the number of penalties used in a model. For Gamma and Gaussian
#families, you need to remove the penalty (i.e. variance) associated with the scale term
get_n_pen  = function(model) {
  family = model$family[[1]]
  if(family %in% c("Gamma","gaussian")){
    capture.output({out_val = nrow(gam.vcomp(model))-1})
  }else{
    capture.output({out_val = nrow(gam.vcomp(model))})
  }
  return(out_val)
}

#Extract the number of coefficients from a fitted GAM
get_n_coef = function(model) length(coef(model))

#Get the number of inner and outer iterations needed to fit the final model
get_n_iter = function(model) model$outer.info$iter
get_n_out_iter = function(model) model$iter

#combine results into a single table
comp_resources = crossing(model_number = c("1","2","3","4","5"),
                       data_source = factor(c("CO2","bird_move"),
                                     levels = c("CO2","bird_move")),
                       time = 0, n_smooths = 0,
                       n_coef = 0)


#Fit each model to the example data sets, and calculate run time for them
comp_resources[1,"time"] = system.time(CO2_mod1 <- gam(log(uptake) ~ s(log(conc),k=5,m=2, bs="tp")+s(Plant_uo, k =12,  bs="re"),
                                                    data= CO2,method="REML",
                                                    control = list(keepData=TRUE)))[3]

comp_resources[2,"time"] = system.time(bird_mod1 <- gam(count ~ te(week,latitude, bs= c("cc", "tp"), k=c(10,10)),
                                                     data= bird_move, method="REML", family= poisson,
                                                     control = list(keepData=TRUE)))[3]

comp_resources[3,"time"] = system.time(CO2_mod2 <- gam(log(uptake) ~ s(log(conc),k=5,m=2, bs="tp")+
                                                       s(log(conc), Plant_uo, k=5, bs="fs",m=1),
                                                     data= CO2,method="REML",
                                                     control = list(keepData=TRUE)))[3]


comp_resources[4,"time"] = system.time(bird_mod2 <- gam(count ~ te(week,latitude, bs= c("cc", "tp"),
                                                                 k=c(10,10),m=c(2,2))+
                                                        te(week,latitude,species, bs= c("cc", "tp","re"),
                                                           k=c(10,10,6),m = c(1,1,1)),
                                                      data= bird_move, method="REML", family= poisson,
                                                      control = list(keepData=TRUE)))[3]


comp_resources[5,"time"] = system.time(
  CO2_mod3 <- gam(log(uptake) ~ s(log(conc),k=5,m=2, bs="tp")+
                    s(log(conc),by= Plant_uo, k =5,  bs="ts",m=1)+
                    s(Plant_uo,bs="re",k=12),
                  data= CO2,method="REML",
                  control = list(keepData=TRUE)))[3]



comp_resources[6,"time"] = system.time(
  bird_mod3 <- gam(count ~ te(week,latitude, bs= c("cc", "tp"),
                              k=c(10,10),m=c(2,2)) +
                     te(week,latitude, bs= c("cc", "tp"),
                        k=c(10,10),m=c(1,1),by= species),
                   data= bird_move, method="REML", family= poisson,
                   control = list(keepData=TRUE)))[3]


comp_resources[7,"time"] = system.time(
  CO2_mod4 <- gam(log(uptake) ~ s(log(conc), Plant_uo, k=5,  bs="fs",m=2),
                  data= CO2,method="REML",
                  control = list(keepData=TRUE)))[3]


comp_resources[8,"time"] = system.time(
  bird_mod4 <- gam(count ~ te(week,latitude,species, bs= c("cc", "tp","re"),
                              k=c(10,10,6),m = 2),
                   data= bird_move, method="REML", family= poisson,
                   control = list(keepData=TRUE))
)[3]


comp_resources[9,"time"] = system.time(
  CO2_mod5 <- gam(log(uptake) ~ s(log(conc),by= Plant_uo, k =5,  bs="tp",m=2)+
                    s(Plant_uo,bs="re",k=12), data= CO2,method="REML",
                  control = list(keepData=TRUE))

)[3]

comp_resources[10,"time"] = system.time(
  bird_mod5 <- gam(count ~ te(week,latitude,by=species, bs= c("cc", "tp"),
                              k=c(10,10),m = 2),
                   data= bird_move, method="REML", family= poisson,
                   control = list(keepData=TRUE))
)[3]

#combine all fitted models into a list
comp_resources$model = list(CO2_mod1, bird_mod1, CO2_mod2, bird_mod2,
                            CO2_mod3, bird_mod3,CO2_mod4, bird_mod4,
                            CO2_mod5, bird_mod5)

#Extract all of the information on computer time and resources needed for each model
comp_resources = comp_resources %>%
  group_by(model_number, data_source)%>%
  mutate(n_smooths = get_n_pen(model[[1]]),
         n_coef = get_n_coef(model[[1]]),
         n_iter = get_n_iter(model[[1]]),
         n_iter_out = get_n_out_iter(model[[1]]))


comp_resources_table =comp_resources %>%
  ungroup()%>%
  arrange(data_source,model_number)%>%
  transmute(data_source =data_source, model=model_number,
            `relative time` = time,`coefficients` = n_coef,
            `penalties` = n_smooths
            )%>%
  group_by(data_source) %>%
  mutate(`relative time` = `relative time`/`relative time`[1],#scales processing time relative to model 1
         `relative time` = ifelse(`relative time`<10, signif(`relative time`,1), signif(`relative time`, 2)) #rounds to illustrate differences in timing.
         )%>%
  ungroup() %>%
  dplyr::select( - data_source)

#This code calculates the timing it takes to fit the same model (with varying amounts of data) for 
#gam, bam, gamm, and gamm4. 

set.seed = 1 # ensures that each new model parameter set is an extension of the old one

n_x = 20
x = seq(-2,2, length=n_x)
n_steps = 7

#setting up blank data frame to put results in.
fit_timing_data = data_frame(n_groups = 2^(1:n_steps),
                             gam=rep(0,length=n_steps),
                             `bam (discrete = FALSE)`= 0,
                             `bam (discrete = TRUE)` = 0,
                             gamm = 0, gamm4 = 0)

fac_all = paste("g", 1:max(fit_timing_data$n_groups),sep = "")

model_coefs_all = data_frame(fac=fac_all)%>%
  mutate(int = rnorm(n(), 0,0.1),
         x2  = rnorm(n(),0,0.2),
         logit_slope = rnorm(n(),0, 0.2))

#for each number of observations, fit all the models and calculate run times for them
for(i in 1:n_steps){

  n_g =  fit_timing_data$n_groups[i]
  
  fac_current = fac_all[1:n_g]
  fac_current = factor(fac_current, levels=  unique(fac_current))

  model_coefs = model_coefs_all%>%
    filter(fac %in% fac_current)%>%
    mutate(fac = factor(fac, levels= unique(fac)))
  
  set.seed = 1 #ensures that each new data set is an extension of the old one
  
  model_data = crossing(fac=fac_current, x=x)%>%
    left_join(model_coefs)%>%
    mutate(base_func  = dnorm(x)*10,
           indiv_func = int + x^2*x2 + 2*(exp(x*logit_slope)/(1+exp(x*logit_slope))-0.5),
           y = base_func + indiv_func + rnorm(n()))
  
  fit_timing_data$gam[i] = system.time(gam(y~s(x,k=10, bs="cp") + s(x,fac, k=10, bs="fs", xt=list(bs="cp"), m=2),
                             data= model_data, method="REML"))[3]
  
  fit_timing_data$`bam (discrete = FALSE)`[i] = system.time(bam(y~s(x,k=10, bs="cp") + s(x,fac, k=10, bs="fs", xt=list(bs="cp"),m=2),
                             data= model_data, discrete = FALSE))[3]
  
  fit_timing_data$`bam (discrete = TRUE)`[i] = system.time(bam(y~s(x,k=10, bs="cp") + s(x,fac, k=10, bs="fs", xt=list(bs="cp"),m=2),
                                                                data= model_data,discrete=TRUE))[3]
  
  
  fit_timing_data$gamm[i] = system.time(gamm(y~s(x,k=10, bs="cp") + s(x,fac, k=10, bs="fs", xt=list(bs="cp"),m=2),
                               data= model_data))[3]
  
  
  fit_timing_data$gamm4[i] = system.time(gamm4(y~s(x,k=10, bs="cp") + s(x,fac, k=10, bs="fs", xt=list(bs="cp"),m=2),
                                 data= model_data))[3]
}

#Combine all of the timing data into long format, ready for plotting
fit_timing_long = fit_timing_data %>% 
  gather(model, timing, gam,`bam (discrete = FALSE)`, 
         `bam (discrete = TRUE)`, gamm, gamm4)%>%
  mutate(model =factor(model, levels = c("gam",
                                         "bam (discrete = FALSE)",
                                         "bam (discrete = TRUE)",
                                         "gamm", 
                                         "gamm4")))


timing_plot = ggplot(aes(n_groups, timing, color=model, linetype= model), 
                     data=fit_timing_long)+
  geom_line()+
  geom_point()+
  scale_color_manual(values = c("black", "#1b9e77","#1b9e77", "#d95f02", "#7570b3"))+
  scale_linetype_manual(values =c(1,1,2,1,1))+
  scale_y_log10("run time (seconds)", breaks = c(0.1,1,10,100), labels = c("0.1", "1","10", "100"))+
  scale_x_log10("number of groups", breaks = c(2,8,32,128))+
  guides(color = guide_legend(nrow = 2, byrow = TRUE))+
  theme(legend.position = "top")
timing_plot