--- title: "Getting Started" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{getting_started} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, echo = FALSE, include = FALSE, warning = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup, eval = TRUE, echo= FALSE, warning = FALSE, message = FALSE} library(MASS) library(purrr) library(TMB) library(dplyr) library(ggplot2) library(sf) library(rnaturalearth) ``` # Loading & viewing your data The example shown here will include a simulated spatial process giving rise to heterogeneity in fish length-at-age with a predominant north-south cline. ```{r, echo = FALSE, warning = FALSE, message = FALSE} ## based on code by G.D. Adams ## Specify data size set.seed(731) G= 50 ## number of years nsamples = 5000 group <- sample(1:G, nsamples, replace = TRUE) # Group for individual X N <- nsamples # Total number of samples ## Mu VBGM hyperparameters mu.Linf = 50 mu.k = 0.3 mut.t0 = -0.5 mu.parms <- c(mu.Linf, mu.k, mut.t0) sigma = 0.1*mu.Linf # Observation error ## Group level random effects sigma.group = c(0.3, 0.05, 0.2) rho = 0.3 # Correlation between group level parameters cor.group.mat = matrix(rho, 3, 3) diag(cor.group.mat) <- 1 cov.group.mat <- diag(sigma.group) %*% cor.group.mat %*% diag(sigma.group) # Get covariance ## Simulate parameters for groups---- # - Empty matrix and vectors to fill with parameters and data, respectively group.param.mat <- group.re.mat <- matrix(NA,G,3,byrow = T) # - Random effects colnames(group.re.mat) <- c("log.Linf.group.re", "log.k.group.re", "t0.group.re") # - On VBGF scale colnames(group.param.mat) <- c("Linf.group", "k.group", "t0.group") # - Simulate group level parameters for(i in 1:G){ # - Sim from mvnorm group.re.mat[i,] <- mvrnorm(1, rep(0,3), cov.group.mat) # - Convert to parameter space group.param.mat[i,1:2] <- mu.parms[1:2] * exp(group.re.mat[i,1:2]) # Log to natural scale group.param.mat[i,3] <- mu.parms[3] + group.re.mat[i,3] } group.param.mat <- group.param.mat %>% data.frame() %>% arrange(Linf.group) ## Simulate length-at-age data ---- ages = seq(from=1,to=20, by = 1) age = c() length = c() for(j in 1:N) { age[j] = sample(ages, 1) # Sample random age from age range length[j] = (group.param.mat[group[j],1] * (1 - exp(-group.param.mat[group[j],2]*(age[j]-group.param.mat[group[j],3])))) + rnorm(1,0,sigma) # Add normally distributed random error } # Assign data to data frame and fill spatial info dat <- data.frame(age = age, length = length, year = as.numeric(group)) dat <- dat[which(dat$length > 0),] # Make sure all lengths are positive dat <- dat %>% arrange(year, age, length) dat$long <- runif(nrow(dat),-180, -135) ## first phase of sampling: higher lengths @ higher latitudes sample_value <- function(group) { weights <- seq(50, 68, length.out = 50) sample(seq(50, 68, length.out = 50), 1, prob = weights^(2*group)) } dat$lat <- sapply(as.numeric(dat$year), sample_value) ## second phase of sampling: even weight by latitude bin # Break the latitude column into 20 equal bins simulated_data <- dat %>% mutate(lat_bin = cut(lat, breaks = 20)) # Calculate the weights for each bin # bin_counts <- simulated_data %>% # count(lat_bin) # Calculate the weights (inverse of bin counts) # bin_weights <- bin_counts %>% # mutate(weight = 1 / n) # Merge the weights back into the simulated data # simulated_data <- simulated_data %>% # left_join(bin_weights, by = "lat_bin") # Sample from the data using the calculated weights # sampled_data <- simulated_data %>% # sample_n(size = 4500, weight = weight, replace = TRUE) %>% # select(year, lat, long, age, length) # Plot the data p1 <- ggplot(simulated_data, aes(x = age, y = length, colour = year)) + geom_point(size = 2) + # scale_colour_manual(values=cols) + theme_minimal()+ theme(legend.position = 'none')+ labs(title = 'length at age observations') p2 <- ggplot(simulated_data, aes(x = long, y = lat, colour = year, size= length)) + geom_point(alpha = 0.5) + # scale_colour_manual(values=cols) + theme_minimal()+ labs(title = 'spatial length-at-age') # Create a dataframe with latitude and longitude columns df <- simulated_data %>% mutate(meanl = mean(length), .by = c('age')) %>% mutate(resid = length-meanl) # Convert the dataframe to an sf object sf_df <- st_as_sf(df, coords = c("long", "lat"), crs = 4326) # Obtain a map of the US us <- ne_countries(scale = "medium", returnclass = "sf") %>% filter(admin == "United States of America" | admin == 'Canada') # Perform spatial operation to remove points in the dataframe that overlap with the US polygon sf_df_clipped <- sf_df[!st_within(sf_df, st_union(us), sparse = FALSE), ] ## a few add'l steps to save this into data/ simulated_data <- sf_df_clipped %>% tidyr::extract(geometry, c('long', 'lat'), '\\((.*), (.*)\\)', convert = TRUE) %>% select(year, age, length, lat, long) # usethis::use_data(simulated_data,overwrite = TRUE,version = 2 ) ``` ```{r, echo = FALSE, include = FALSE} # Plot the US polygon and the points outside of it using ggplot2 p1 <- ggplot( ) + geom_point(aes(x = age, y = length, color = group), size = 2)+ scale_color_gradient2(low = "blue", mid = "grey90", high = "red", midpoint = 3) + guides(color = 'none')+ theme_minimal() p2 <- ggplot() + geom_sf(data = us, fill = NA, color = 'black') + geom_sf(data = sf_df_clipped, aes( color =resid, size = length), alpha = 0.9) + scale_y_continuous(limits = c(50,71)) + scale_x_continuous(limits = c(-185,-130))+ guides(size = 'none')+ theme_minimal() + scale_color_gradient2(low = "blue", mid = "grey90", high = "red", midpoint = 0) + labs(color = 'length residual') # p1 # p2 ``` The first step is to ensure your data are formatted correctly and to be aware of any sample size issues. This is accomplished via `check_data()`, which returns plots of the observations and residuals. ```{r, echo = TRUE, include = TRUE, warning = FALSE, message = FALSE} library(growthbreaks) data(simulated_data) ## from the package head(simulated_data) p <- check_data(simulated_data, showPlot = FALSE) ``` The first plot (`p[[1]]`) shows the input data, colored by year: ```{r, echo = FALSE, include = TRUE, warning = FALSE} p[[1]] ``` The second two plots `p[[2]];p[[3]]` are maps of the observations and simple residuals (observation minus age-specific mean). The red colors are the highest values. We'd expect these to look similar to one another. ```{r, echo = FALSE, include = TRUE, warning = FALSE} p[[2]] p[[3]] ``` Once you've completed that step you are ready to investigate potential breakpoints in length-at-age via `get_Breaks()`. This example will use the default option `axes = 0` which looks for spatial breakpoints only. The function is ignorant of any underlying structure in the data and is not fitting growth curves at this time. If you keep the default settings you will get back plots of the hypothesized breaks as well as a dataframe with the breakpoints. # Detecting Breakpoints Now we can pass our data to `get_Breaks()`. The `ages_to_use` argument allows you to specify a subset of your age observations for which you'd like to test for breakpoints. If you are unsure, you may choose to use age(s) that are well sampled in your data, or all ages. However, you will want to include at least some observations of small (young) fish, since discrepancies in size may be less obvious for fish at or near their asymptotic length. Here I am testing three ages and saving the output to a dataframe called `breakpoints`. ```{r, echo = TRUE, include = TRUE, warning = FALSE} breakpoints <- get_Breaks(dat = simulated_data, ages_to_use = c(5,10,15), sex = FALSE, axes = 0, showPlot = FALSE) breakpoints$breakpoints plot_Breaks(dat=simulated_data, breakpoints$breakpoints, showData = TRUE) ``` As we might have expected based on the raw observations, the algorithm detected a break at about $66^{\circ}$N, as well as an unexpected one at $~141^{\circ}$W. We can see this on the map as well as in the dataframe. The `count` column indicates the proportion of tested ages for which this breakpoint was detected (in this case, 1/3, suggesting that no breakpoints were detected for 2/3 ages). *You may choose to stop here. If you're interested in some of the automated checking and curve fitting functionality, proceed to the next step*. # Re-fitting growth curves at putative breaks The function `refit_Growth()` can be used to generate plots of the growth parameters and associated curves by splitting your data into the spatio/temporal zones ("strata") defined by `breakpoints`. This can allow you to make inferences about a) the magnitude and significance of differences in individual parameters and b) the resultant impact on our perception of length-at-age. *It is common for growth curves to be very similar especially when the number of strata is large (>3), so user judgment is required to discern the best use of the information provided here*. Run the function with the breakpoints as-is and inspect the outputs. ```{r, echo = TRUE, include = TRUE, warning = FALSE} fits1 <- refit_Growth(simulated_data, breakpoints$breakpoints, showPlot = FALSE) names(fits1$split_tables) ## description of the strata used ## split_tables contains your observed data broken up by strata ``` We can inspect the fitted curves in two ways: first, by visualizing the `$par_plot`. The red points & error bars indicate estimates for which the mean fell outside the confidence interval(s) for other strata. This can be used to infer which components of the growth curve are possibly contributing to detected differences among strata. It doesn't look like the second strata's $L_{\infty}$ nor $k$ values are very different than the other regions'. The AIC of this first model is given by `fits1$AIC`: `r fits1$AIC`. ```{r, echo = TRUE, include = TRUE, warning = FALSE} fits1$pars_plot ``` The second way to explore the results are by visualizing the fitted curves against the strata-specific observations contained in `$fits_plot`. Here we see that the second and third strata (all data points below $66^{\circ}$)N exhhibit very similar curves, and there are not many data points in the second strata to begin with. ```{r} fits1$fits_plot ``` # Updating and re-fitting the breakpoints Based on the exploration above, you may decide to manually combine the second and third strata, using just a single breakpoint north to south. In this case, we would do away with the breakpoint at $-141^{\circ}$)W, replacing it with `-Inf`. ```{r, echo = TRUE, include = TRUE, warning = FALSE} breakpoints$breakpoints$long <- -Inf fits2 <- refit_Growth(simulated_data, breakpoints$breakpoints, showPlot = FALSE) ## refit the curves ``` We can repeat the visualization and see that 1) the key parameter values are now more distinct from one another and 2) so are the curves.The AIC of this updated model is given by `fits2$AIC`: `r fits2$AIC`, for a $\Delta$AIC of `r fits1$AIC-fits2$AIC`. ```{r, echo = TRUE, include = TRUE, warning = FALSE} fits2$pars_plot fits2$fits_plot ```