What factors explain best the pattern in age-specific Catch-Per-Unit-Effort (CPUE) of Pleuronectes platessa (plaice) in the first quarter of 2015?
Since we have longitude, latitude, and age in our dataset we can test whether these variables have any effect. Lets start with longitude as it is commonly known that plaice is more frequent in the Western Baltic Sea.
Data import and transformation
library(tidyverse)
library(lubridate)
library(forcats)
library(modelr)
# Import the downloaded data (set your own working directory) --
cpue <- read_csv("data/CPUE per age per area_2017-11-20 06_48_16.csv")
# Transform the data (which is already tidy) -----------------
# Subset data and convert to long format --
# (so that CPUE is grouped for all age classes)
plaice <- cpue %>%
mutate(Area = factor(Area, levels = c(21,22,
23,24,25,26,27,28,29,30,31,32))) %>%
filter(Year == 2015, Quarter == 1, Species == "Pleuronectes platessa") %>%
select(Area, Quarter, Species, contains("Age")) %>%
gather(key = "Age", value = "CPUE", Age_0:Age_10) %>%
mutate(Age = factor(Age, levels = paste0("Age_", 0:10),
labels = c("0","1","2","3","4","5","6","7","8","9","10")))
# Create a tibble wiht central coordinates for each area (ICES subdivision) --
sd_coord <- tibble(
Area = factor(c(21,22,23,24,25,26,27,28,29,30,31,32)),
Lat = c(57,55,55.75,55,55.5,55.5,58,57.5,59.5,62,64.75,60),
Long = c(11.5,11,12.5,13.5,16,19.5,18,20,21,19.5,22.5,26)
)
# Merge this table into the cpue data --
plaice <- left_join(plaice, sd_coord, by = "Area")
plaice %>%
ggplot(aes(x = Long, y = CPUE)) +
geom_jitter(aes(colour = Age)) +
scale_colour_brewer(
palette = "Paired") +
guides(colour = "none") +
ggtitle("Plaice CPUE ~ Longitude (1.Quarter in 2015)")
Since a straight line seems less appropriate we will compare different model families including polynomials and transformation of Y.
plaice <- plaice %>%
mutate(CPUE_log = log(CPUE + 0.001))
m_lm <- lm(CPUE ~ Long, data = plaice)
m_poly2 <- lm(CPUE ~ poly(Long,2), data = plaice)
m_poly3 <- lm(CPUE ~ poly(Long,3), data = plaice)
m_log_y <- lm(CPUE_log ~ Long, data = plaice)
pred_dat <- plaice %>%
data_grid(Long = seq_range(Long, 20)) %>%
# generates an evenly spaced grid of data points with
spread_predictions(m_lm, m_poly2, m_poly3, m_log_y) %>%
gather(m_lm:m_log_y, key = "Model", value = "Pred") %>%
mutate(Model = factor(Model))
p1 <- pred_dat %>%
filter(Model != "m_log_y") %>%
ggplot(aes(x = Long, y = Pred)) +
geom_line(aes(linetype = Model)) +
geom_jitter(data = plaice, aes(y = CPUE, colour = Age)) +
scale_colour_brewer(palette = "Paired") +
ylab("CPUE") +
ggtitle("Models with untransformed data")
p2 <- pred_dat %>%
filter(Model == "m_log_y") %>%
ggplot(aes(x = Long, y = Pred)) +
geom_line(aes(linetype = Model)) +
geom_jitter(data = plaice, aes(y = CPUE_log, colour = Age)) +
scale_colour_brewer(palette = "Paired") +
ylab("log(CPUE)") +
ggtitle("Model with log-transformed CPUE data")
gridExtra::grid.arrange(p1, p2, ncol = 2)
Which model do you prefer? Which models are misfitting the data? Which problem becomes more apparent when log-transforming CPUE?
par(mfrow = c(4,4))
plot(m_lm, main = "m_lm")
plot(m_poly2, main = "m_poly2")
plot(m_poly3, main = "m_poly3")
plot(m_log_y, main = "m_log_y")
Too many zeros in the data can inflate the parameterization of coefficients.
→ In any case, we need to rephrase our research question!
plaice %>% group_by(Age, CPUE) %>%
filter(CPUE < 0.01) %>%
summarize(counts = n()) %>% print(n=40)
## # A tibble: 8 x 3
## # Groups: Age [?]
## Age CPUE counts
## <fct> <dbl> <int>
## 1 0 0 5
## 2 1 0 3
## 3 5 0 1
## 4 6 0 2
## 5 7 0 3
## 6 8 0 2
## 7 9 0 3
## 8 10 0 4
Lets remove age 0, 1 and > 6 and start with step 2 again
plaice <- plaice %>% filter(Age %in% 2:6) %>%
mutate(Age = fct_drop(Age))
m_lm <- lm(CPUE ~ Long, data = plaice)
m_poly2 <- lm(CPUE ~ poly(Long,2), data = plaice)
m_poly3 <- lm(CPUE ~ poly(Long,3), data = plaice)
m_log_y <- lm(CPUE_log ~ Long, data = plaice)
For reasons of convenience I named the subset also ‘plaice’ so I can simply run the previous code again without many modifications.
pred_dat <- plaice %>%
data_grid(Long = seq_range(Long, 20)) %>%
spread_predictions(m_lm, m_poly2, m_poly3, m_log_y) %>%
gather(m_lm:m_log_y, key = "Model", value = "Pred") %>%
mutate(Model = factor(Model))
p1 <- pred_dat %>%
filter(Model != "m_log_y") %>%
ggplot(aes(x = Long, y = Pred)) +
geom_line(aes(linetype = Model)) +
geom_jitter(data = plaice, aes(y = CPUE, colour = Age)) +
scale_colour_brewer(palette = "Paired") +
ylab("CPUE") +
ggtitle("Models with untransformed data")
p2 <- pred_dat %>%
filter(Model == "m_log_y") %>%
ggplot(aes(x = Long, y = Pred)) +
geom_line(aes(linetype = Model)) +
geom_jitter(data = plaice, aes(y = CPUE_log, colour = Age)) +
scale_colour_brewer(palette = "Paired") +
ylab("log(CPUE)") +
ggtitle("Model with log-transformed CPUE data")
gridExtra::grid.arrange(p1, p2, ncol = 2)
How do the models perform now?
par(mfrow = c(4,4))
plot(m_lm, main = "m_lm")
plot(m_poly2, main = "m_poly2")
plot(m_poly3, main = "m_poly3")
plot(m_log_y, main = "m_log_y")
m_log_y
shows the best residual distribution so we could focus on this model and move on the step 4; however, for some age classes this model greatly underestimates the CPUE.r <- plaice %>% spread_residuals(m_lm, m_poly2, m_poly3, m_log_y) %>%
ggplot() + geom_hline(yintercept = 0,
colour = "blue", size = 0.5)
r1_lat <- r + geom_point(aes(x = Lat, y = m_lm),
colour = "red", alpha = 0.5) + ggtitle("m_lm")
r2_lat <- r + geom_point(aes(x = Lat, y = m_poly2),
colour = "red", alpha = 0.5) + ggtitle("m_poly2")
r3_lat <- r + geom_point(aes(x = Lat, y = m_poly3),
colour = "red", alpha = 0.5) + ggtitle("m_poly3")
r4_lat <- r + geom_point(aes(x = Lat, y = m_log_y),
colour = "red", alpha = 0.5) + ggtitle("m_log_y")
r1_age <- r + geom_boxplot(aes(x = Age, y = m_lm))
r2_age <- r + geom_boxplot(aes(x = Age, y = m_poly2))
r3_age <- r + geom_boxplot(aes(x = Age, y = m_poly3))
r4_age <- r + geom_boxplot(aes(x = Age, y = m_log_y))
gridExtra::grid.arrange(grobs = list(
r1_lat,r2_lat,r3_lat,r4_lat,r1_age,r2_age,r3_age,r4_age), ncol = 4, nrow = 2)
Lets include the factor Age in all 4 models!
m_lm <- update(m_lm, .~. + Age)
m_poly2 <- update(m_poly2, .~. + Age)
m_poly3 <- update(m_poly3, .~. + Age)
m_log_y <- update(m_log_y, .~. + Age)
pred_dat <- plaice %>%
data_grid(Long = seq_range(Long, 20), Age = Age) %>%
spread_predictions(m_lm, m_poly2, m_poly3, m_log_y) %>%
gather(m_lm:m_log_y, key = "Model", value = "Pred") %>%
mutate(Model = factor(Model))
p <- plaice %>%
ggplot(aes(x = Long, y = CPUE)) +
geom_jitter(aes(colour = Age)) +
scale_colour_brewer(palette = "Paired") +
ylab("CPUE")
p1 <- p + geom_line(data = filter(pred_dat, Model == "m_lm"),
aes(y = Pred, group = Age, colour = Age)) + ggtitle("m_lm") +
theme(legend.position = c(1,1) ,
legend.justification = c(1.1,1.1))
p2 <- p + geom_line(data = filter(pred_dat, Model == "m_poly2"),
aes(y = Pred, group = Age, colour = Age)) + ggtitle("m_poly2") +
guides(colour = "none")
p3 <- p + geom_line(data = filter(pred_dat, Model == "m_poly3"),
aes(y = Pred, group = Age, colour = Age)) + ggtitle("m_poly3") +
guides(colour = "none")
p4 <- plaice %>%
ggplot(aes(x = Long, y = CPUE_log)) +
geom_jitter(aes(colour = Age)) +
scale_colour_brewer(palette = "Paired") +
ylab("CPUE") +
geom_line(data = filter(pred_dat, Model == "m_log_y"),
aes(y = Pred, group = Age, colour = Age)) + ggtitle("m_log_y") +
guides(colour = "none")
gridExtra::grid.arrange(grobs = list(p1, p2, p3, p4), ncol = 4)
Happy now?
Plotting the residual diagnostics of these models
par(mfrow = c(4,4))
plot(m_lm, main = "m_lm")
plot(m_poly2, main = "m_poly2")
plot(m_poly3, main = "m_poly3")
plot(m_log_y, main = "m_log_y")
Long
and Age
m_lm_int <- lm(CPUE ~ Long * Age, data = plaice)
m_poly2_int <- lm(CPUE ~ poly(Long,2) * Age, data = plaice)
m_poly3_int <- lm(CPUE ~ poly(Long,3) * Age, data = plaice)
m_log_y_int <- lm(CPUE_log ~ Long * Age, data = plaice)
pred_dat <- plaice %>%
data_grid(Long = seq_range(Long, 20), Age = Age) %>%
spread_predictions(m_lm_int, m_poly2_int, m_poly3_int, m_log_y_int) %>%
gather(m_lm_int:m_log_y_int, key = "Model", value = "Pred") %>%
mutate(Model = factor(Model))
p <- plaice %>%
ggplot(aes(x = Long, y = CPUE)) +
geom_jitter(aes(colour = Age)) +
scale_colour_brewer(palette = "Paired") +
ylab("CPUE")
p1 <- p + geom_line(data = filter(pred_dat, Model == "m_lm_int"),
aes(y = Pred, group = Age, colour = Age)) + ggtitle("m_lm_int") +
theme(legend.position = c(1,1) ,
legend.justification = c(1.1,1.1))
p2 <- p + geom_line(data = filter(pred_dat, Model == "m_poly2_int"),
aes(y = Pred, group = Age, colour = Age)) + ggtitle("m_poly2_int") +
guides(colour = "none")
p3 <- p + geom_line(data = filter(pred_dat, Model == "m_poly3_int"),
aes(y = Pred, group = Age, colour = Age)) + ggtitle("m_poly3_int") +
guides(colour = "none")
p4 <- plaice %>%
ggplot(aes(x = Long, y = CPUE_log)) +
geom_jitter(aes(colour = Age)) +
scale_colour_brewer(palette = "Paired") +
ylab("CPUE") +
geom_line(data = filter(pred_dat, Model == "m_log_y_int"),
aes(y = Pred, group = Age, colour = Age)) + ggtitle("m_log_y_int") +
guides(colour = "none")
gridExtra::grid.arrange(grobs = list(p1, p2, p3, p4), ncol = 4)
Better?
Plotting the residual diagnostics of these models
par(mfrow = c(4,4))
plot(m_lm_int, main = "m_lm_int")
plot(m_poly2_int, main = "m_poly2_int")
plot(m_poly3_int, main = "m_poly3_int")
plot(m_log_y_int, main = "m_log_y_int")
No clear pattern that suggests to include latitude in the model (in the mod_log_y model it seems residuals decrease with latitude but that is mainly caused by the 2 outliers).
mod_log_y
with Age
and Long
is the best, but the cubic model is also quite good.# Generate a null-model for the untransformed models --
m0 <- lm(CPUE ~ 1, data = plaice)
m_lm_1 <- lm(CPUE ~ Long, data = plaice)
m_lm_2 <- lm(CPUE ~ Long + Age, data = plaice)
m_lm_3 <- lm(CPUE ~ Long * Age, data = plaice)
m_poly2_1 <- lm(CPUE ~ poly(Long,2), data = plaice)
m_poly2_2 <- lm(CPUE ~ poly(Long,2) + Age, data = plaice)
m_poly2_3 <- lm(CPUE ~ poly(Long,2) * Age, data = plaice)
m_poly3_1 <- lm(CPUE ~ poly(Long,3), data = plaice)
m_poly3_2 <- lm(CPUE ~ poly(Long,3) + Age, data = plaice)
m_poly3_3 <- lm(CPUE ~ poly(Long,3) * Age, data = plaice)
# Generate a null-model for the log-models --
m_log_y_0 <-lm(CPUE_log ~ 1, data = plaice)
m_log_y_1 <-lm(CPUE_log ~ Long, data = plaice)
m_log_y_2 <-lm(CPUE_log ~ Long + Age, data = plaice)
m_log_y_3 <-lm(CPUE_log ~ Long * Age, data = plaice)
# Untransf. models --
AIC(m0, m_lm_1, m_lm_2, m_lm_3, m_poly2_1, m_poly2_2, m_poly2_3,
m_poly3_1, m_poly3_2, m_poly3_3)
## df AIC
## m0 2 262.6016
## m_lm_1 3 256.8791
## m_lm_2 7 254.4298
## m_lm_3 11 252.9204
## m_poly2_1 4 257.0557
## m_poly2_2 8 253.6051
## m_poly2_3 16 252.1079
## m_poly3_1 5 258.3897
## m_poly3_2 9 254.5438
## m_poly3_3 21 255.0540
# Log-models --
AIC(m_log_y_0, m_log_y_1, m_log_y_2, m_log_y_3)
## df AIC
## m_log_y_0 2 137.3974
## m_log_y_1 3 135.0501
## m_log_y_2 7 129.8373
## m_log_y_3 11 135.0000
m_log_y_2
summary(m_log_y_2)
##
## Call:
## lm(formula = CPUE_log ~ Long + Age, data = plaice)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7375 -0.6531 0.1546 1.4860 3.1414
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.5793 2.8565 3.003 0.00731 **
## Long -0.4436 0.1793 -2.474 0.02295 *
## Age3 1.1864 1.7805 0.666 0.51320
## Age4 0.0158 1.7805 0.009 0.99301
## Age5 -2.6477 1.7805 -1.487 0.15340
## Age6 -4.4044 1.7805 -2.474 0.02298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.815 on 19 degrees of freedom
## Multiple R-squared: 0.5046, Adjusted R-squared: 0.3742
## F-statistic: 3.871 on 5 and 19 DF, p-value: 0.01377
m_log_y_2
pred_dat <- plaice %>%
data_grid(Long = seq_range(Long, 20), Age = Age)
pred_dat$Pred <- predict(m_log_y_2, se = TRUE, newdata = pred_dat)$fit
pred_dat$Pred_se <- predict(m_log_y_2, se = TRUE, newdata = pred_dat)$se.fit
pred_dat_untrans <- pred_dat %>%
mutate(Pred = exp(Pred - 0.001), Pred_se = exp(Pred_se - 0.001))
p_fin1 <- plaice %>%
ggplot(aes(x = Long)) +
geom_ribbon(data = pred_dat,
aes(ymin = (Pred - Pred_se), ymax = (Pred + Pred_se),
group = Age, fill = Age), alpha = 0.5) +
geom_line(data = pred_dat,
aes(y = Pred, group = Age, colour = Age)) +
geom_jitter(aes(y = CPUE_log, colour = Age)) +
scale_colour_brewer(palette = "Paired") +
scale_fill_brewer(palette = "Paired") +
facet_grid(. ~ Age, labeller = label_both) +
guides(colour = "none") +
ylab("CPUE") + theme_bw() +
theme(panel.grid = element_blank()) +
ggtitle("Obs. and pred. (with s.e.) CPUE on the modelled log-scale")
p_fin2 <- plaice %>%
ggplot(aes(x = Long)) +
geom_ribbon(data = pred_dat_untrans,
aes(ymin = (Pred - Pred_se), ymax = (Pred + Pred_se),
group = Age, fill = Age), alpha = 0.5) +
geom_line(data = pred_dat_untrans,
aes(y = Pred, group = Age, colour = Age)) +
geom_jitter(aes(y = CPUE, colour = Age)) +
scale_colour_brewer(palette = "Paired") +
scale_fill_brewer(palette = "Paired") +
facet_grid(. ~ Age, labeller = label_both) +
guides(colour = "none") +
ylab("CPUE") + theme_bw() +
theme(panel.grid = element_blank()) +
ggtitle("Obs. and pred. (with s.e.) CPUE back transformed")
gridExtra::grid.arrange(p_fin1, p_fin2, nrow = 2)