---
title: "Getting Started"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{getting_started}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, echo = FALSE, include = FALSE, warning = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

```{r setup, eval = TRUE, echo= FALSE, warning = FALSE, message = FALSE}
library(MASS)
library(purrr)
library(TMB)
library(dplyr)
library(ggplot2)
library(sf)
library(rnaturalearth)
```

# Loading & viewing your data

The example shown here will include a simulated spatial process giving rise to heterogeneity in fish length-at-age with a predominant north-south cline.  

```{r, echo = FALSE, warning = FALSE, message = FALSE}
## based on code by G.D. Adams
## Specify data size
set.seed(731)
G= 50 ## number of years
nsamples = 5000
group <- sample(1:G, nsamples, replace = TRUE) # Group for individual X
N <- nsamples # Total number of samples


## Mu VBGM hyperparameters 
mu.Linf = 50
mu.k = 0.3 
mut.t0 = -0.5
mu.parms <- c(mu.Linf, mu.k, mut.t0)
sigma = 0.1*mu.Linf # Observation error


## Group level random effects
sigma.group = c(0.3, 0.05, 0.2)
rho = 0.3 # Correlation between group level parameters
cor.group.mat = matrix(rho, 3, 3)
diag(cor.group.mat) <- 1
cov.group.mat <- diag(sigma.group) %*% cor.group.mat %*% diag(sigma.group) # Get covariance


## Simulate parameters for groups----
# - Empty matrix and vectors to fill with parameters and data, respectively
group.param.mat <- group.re.mat <- matrix(NA,G,3,byrow = T)

# - Random effects
colnames(group.re.mat) <- c("log.Linf.group.re", "log.k.group.re", "t0.group.re")

# - On VBGF scale
colnames(group.param.mat) <- c("Linf.group", "k.group", "t0.group")


# - Simulate group level parameters
for(i in 1:G){
  # - Sim from mvnorm
  group.re.mat[i,] <- mvrnorm(1, rep(0,3), cov.group.mat) 
  
  # - Convert to parameter space
  group.param.mat[i,1:2] <- mu.parms[1:2] * exp(group.re.mat[i,1:2]) # Log to natural scale
  group.param.mat[i,3] <- mu.parms[3] + group.re.mat[i,3]
}

group.param.mat <- group.param.mat %>%
  data.frame() %>%
  arrange(Linf.group)


## Simulate length-at-age data ----
ages = seq(from=1,to=20, by = 1)
age = c()
length = c()
for(j in 1:N) {
  age[j] = sample(ages, 1) # Sample random age from age range
  length[j] = (group.param.mat[group[j],1] * (1 - exp(-group.param.mat[group[j],2]*(age[j]-group.param.mat[group[j],3])))) + rnorm(1,0,sigma) # Add normally distributed random error
}


# Assign data to data frame and fill spatial info
dat <- data.frame(age = age, length = length, year = as.numeric(group))
dat <- dat[which(dat$length > 0),] # Make sure all lengths are positive
dat <- dat %>% arrange(year, age, length)

dat$long <- runif(nrow(dat),-180, -135)

## first phase of sampling: higher lengths @ higher latitudes
sample_value <- function(group) {
  weights <- seq(50, 68, length.out = 50)
  sample(seq(50, 68, length.out = 50), 1, prob = weights^(2*group))
} 
dat$lat <- sapply(as.numeric(dat$year), sample_value) 

## second phase of sampling: even weight by latitude bin
# Break the latitude column into 20 equal bins
simulated_data <- dat %>%
  mutate(lat_bin = cut(lat, breaks = 20))

# Calculate the weights for each bin
# bin_counts <- simulated_data %>%
#   count(lat_bin)

# Calculate the weights (inverse of bin counts)
# bin_weights <- bin_counts %>%
#   mutate(weight = 1 / n)

# Merge the weights back into the simulated data
# simulated_data <- simulated_data %>%
#   left_join(bin_weights, by = "lat_bin")

# Sample from the data using the calculated weights
# sampled_data <- simulated_data %>%
#   sample_n(size = 4500, weight = weight, replace = TRUE) %>%
#   select(year, lat, long, age, length)

# Plot the data
 
p1 <- ggplot(simulated_data, aes(x = age, y = length, colour = year)) +
  geom_point(size = 2) + 
  # scale_colour_manual(values=cols) +
  theme_minimal()+
  theme(legend.position = 'none')+
  labs(title = 'length at age observations')

p2 <- ggplot(simulated_data, aes(x = long, y = lat, colour = year, size= length)) +
  geom_point(alpha = 0.5) + 
  # scale_colour_manual(values=cols) +
  theme_minimal()+
  labs(title = 'spatial length-at-age') 

# Create a dataframe with latitude and longitude columns
df <- simulated_data %>%
      mutate(meanl = mean(length), .by = c('age')) %>%
      mutate(resid = length-meanl) 

# Convert the dataframe to an sf object
sf_df <- st_as_sf(df, coords = c("long", "lat"), crs = 4326)

# Obtain a map of the US
us <- ne_countries(scale = "medium", returnclass = "sf") %>%
  filter(admin == "United States of America"  | admin == 'Canada')

# Perform spatial operation to remove points in the dataframe that overlap with the US polygon
sf_df_clipped <- sf_df[!st_within(sf_df, st_union(us), sparse = FALSE), ]

## a few add'l steps to save this into data/ 
simulated_data <- sf_df_clipped %>% 
  tidyr::extract(geometry, c('long', 'lat'), '\\((.*), (.*)\\)', convert = TRUE)  %>% 
  select(year, age, length, lat, long)

# usethis::use_data(simulated_data,overwrite = TRUE,version = 2 )
 
```
 
```{r, echo = FALSE, include = FALSE}
# Plot the US polygon and the points outside of it using ggplot2
p1 <- ggplot( ) +
  geom_point(aes(x = age, y = length, color = group), size = 2)+
  scale_color_gradient2(low = "blue", mid = "grey90", high = "red", midpoint = 3) +
  guides(color = 'none')+
  theme_minimal() 

p2 <- ggplot() +
  geom_sf(data = us, fill = NA, color = 'black') +
  geom_sf(data = sf_df_clipped, aes( color =resid, size = length), alpha = 0.9) +
  scale_y_continuous(limits = c(50,71)) +
  scale_x_continuous(limits = c(-185,-130))+
  guides(size = 'none')+
  theme_minimal() +
  scale_color_gradient2(low = "blue", mid = "grey90", high = "red", midpoint = 0) +
  labs(color = 'length residual')

# p1
# p2
```


The first step is to ensure your data are formatted correctly and to be aware of any sample size issues. This is accomplished via `check_data()`, which returns plots of the observations and residuals.

```{r, echo = TRUE, include = TRUE, warning = FALSE, message = FALSE}
library(growthbreaks)
data(simulated_data) ## from the package
head(simulated_data)
p <- check_data(simulated_data, showPlot = FALSE)
```

The first plot (`p[[1]]`) shows the input data, colored by year:

```{r, echo = FALSE, include = TRUE, warning = FALSE} 
p[[1]]
```

The second two plots `p[[2]];p[[3]]` are maps of the observations and simple residuals (observation minus age-specific mean). The red colors are the highest values. We'd expect these to look similar to one another.

```{r, echo = FALSE, include = TRUE, warning = FALSE} 
p[[2]]
p[[3]]
```


Once you've completed that step you are ready to investigate potential breakpoints in length-at-age via `get_Breaks()`. This example will use the default option `axes = 0` which looks for spatial breakpoints only. The function is ignorant of any underlying structure in the data and is not fitting growth curves at this time. If you keep the default settings you will get back plots of the hypothesized breaks as well as a dataframe with the breakpoints.

# Detecting Breakpoints 
Now we can pass our data to `get_Breaks()`.


The `ages_to_use` argument allows you to specify a subset of your age observations for which you'd like to test for breakpoints. If you are unsure, you may choose to use age(s) that are well sampled in your data, or all ages. However, you will want to include at least some observations of small (young) fish, since discrepancies in size may be less obvious for fish at or near their asymptotic length. Here I am testing three ages and saving the output to a dataframe called `breakpoints`.


```{r, echo = TRUE, include = TRUE, warning = FALSE} 
breakpoints <- get_Breaks(dat = simulated_data,
                          ages_to_use = c(5,10,15),
                          sex = FALSE, 
                          axes = 0, 
                          showPlot = FALSE)
breakpoints$breakpoints
plot_Breaks(dat=simulated_data, breakpoints$breakpoints, showData =  TRUE)
```
As we might have expected based on the raw observations, the algorithm detected a break at about $66^{\circ}$N, as well as an unexpected one at $~141^{\circ}$W. We can see this on the map as well as in the dataframe. The `count` column indicates the proportion of tested ages for which this breakpoint was detected (in this case, 1/3, suggesting that no breakpoints were detected for 2/3 ages). *You may choose to stop here. If you're interested in some of the automated checking and curve fitting functionality, proceed to the next step*.

# Re-fitting growth curves at putative breaks

The function `refit_Growth()` can be used to generate plots of the growth parameters and associated curves by splitting your data into the spatio/temporal zones ("strata") defined by `breakpoints`. This can allow you to make inferences about a) the magnitude and significance of differences in individual parameters and b) the resultant impact on our perception of length-at-age.

*It is common for growth curves to be very similar especially when the number of strata is large (>3), so user judgment is required to discern the best use of the information provided here*.

Run the function with the breakpoints as-is and inspect the outputs.

```{r, echo = TRUE, include = TRUE, warning = FALSE} 
fits1 <- refit_Growth(simulated_data, breakpoints$breakpoints, showPlot = FALSE)

names(fits1$split_tables) ## description of the strata used
## split_tables contains your observed data broken up by strata

```

We can inspect the fitted curves in two ways: first, by visualizing the `$par_plot`. The red points & error bars indicate estimates for which the mean fell outside the confidence interval(s) for other strata. This can be used to infer which components of the growth curve are possibly contributing to detected differences among strata. It doesn't look like the second strata's $L_{\infty}$ nor $k$ values are very different than the other regions'. 

The AIC of this first model is given by `fits1$AIC`: `r fits1$AIC`.

```{r, echo = TRUE, include = TRUE, warning = FALSE} 
fits1$pars_plot
```

The second way to explore the results are by visualizing the fitted curves against the strata-specific observations contained in `$fits_plot`. Here we see that the second and third strata (all data points below $66^{\circ}$)N exhhibit very similar curves, and there are not many data points in the second strata to begin with. 

```{r}
fits1$fits_plot
```

# Updating and re-fitting the breakpoints

Based on the exploration above, you may decide to manually combine the second and third strata, using just a single breakpoint north to south. In this case, we would do away with the breakpoint at $-141^{\circ}$)W, replacing it with `-Inf`. 


```{r, echo = TRUE, include = TRUE, warning = FALSE} 
breakpoints$breakpoints$long <- -Inf 
fits2 <- refit_Growth(simulated_data,
                      breakpoints$breakpoints,
                      showPlot = FALSE) ## refit the curves
```

We can repeat the visualization and see that 1) the key parameter values are now more distinct from one another and 2) so are the curves.The AIC of this updated model is given by `fits2$AIC`: `r fits2$AIC`, for a $\Delta$AIC of `r fits1$AIC-fits2$AIC`.

```{r, echo = TRUE, include = TRUE, warning = FALSE} 
fits2$pars_plot
fits2$fits_plot
```