Skip to content

Commit

Permalink
Moved regime means procedure to Rust and bumped version, vignette (#23)
Browse files Browse the repository at this point in the history
* added regime means and plotting

* roxygenise

* fixed regime means and removed Rd file

* moved regime-means to Rust and bumped version, updated vignette

* fixed latex
  • Loading branch information
alexhroom committed May 16, 2024
1 parent 24f7d52 commit 008169d
Show file tree
Hide file tree
Showing 9 changed files with 110 additions and 61 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: rshift
Type: Package
Title: Paleoecology Functions for Regime Shift Analysis
Version: 3.0.0
Version: 3.1.0
Authors@R: c(person("Alex H.", "Room",
role = c("aut", "cre", "cph"),
email = "alex.room@btinternet.com",
Expand Down
11 changes: 10 additions & 1 deletion R/extendr-wrappers.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# Generated by extendr: Do not edit by hand

# nolint start

#
# This file was created with the following call:
# .Call("wrap__make_rshift_wrappers", use_symbols = TRUE, package_name = "rshift")

#' @docType package
#' @usage NULL
#' @useDynLib rshift, .registration = TRUE
NULL
Expand All @@ -14,3 +16,10 @@ NULL
#' @param l The cut-off length of a regime; affects sensitivity
rust_rodionov <- function(vals, t_crit, l) .Call(wrap__rust_rodionov, vals, t_crit, l)

#' Calculates the mean for each regime in a regime shift analysis.
#' @param col The column we are measuring change on.
#' @param rsi The column containing RSI values.
rust_regime_means <- function(col, rsi) .Call(wrap__rust_regime_means, col, rsi)


# nolint end
20 changes: 3 additions & 17 deletions R/regime_means.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,13 @@
#' @param col The column we are measuring change on.
#' @param rsi The column containing RSI values.
#' @return A vector of the mean value for each regime.
#' @examples
#' @examples
#' regime_means(lake_RSI, "DCA1", "RSI")
#' @importFrom ggplot2 ggplot geom_col aes
#'
#' @export
regime_means <- function(data, col, rsi){
regime_means <- function(data, col, rsi){
col <- data[[col]]
rsi <- data[[rsi]]
current_regime <- c()
regime_means <- c()

for(i in seq_along(col)){
if(rsi[i] == 0){
current_regime <- c(current_regime, col[i])
}
else {
regime_means <- c(regime_means, rep(mean(current_regime), length(current_regime)))
current_regime <- c(col[i]) # reset regime
}
}
# calculate means for last regime
regime_means <- c(regime_means, rep(mean(current_regime), length(current_regime)))
regime_means <- rust_regime_means(col, rsi)
return(regime_means)
}
16 changes: 16 additions & 0 deletions man/rust_regime_means.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/rust/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/rust/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = 'rshift'
version = '3.0.0'
version = '3.1.0'
edition = '2021'

[lib]
Expand Down
36 changes: 35 additions & 1 deletion src/rust/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
use extendr_api::prelude::*;
use std::cmp::min;
use std::vec::Vec;

/// Calculate STARS RSI points and return to R as a vector
/// @param vals The column we are measuring change on
/// @param t_crit The critical value of a t-distribution at the desired p-value
/// @param l The cut-off length of a regime; affects sensitivity
#[extendr(use_try_from = true)]
fn rust_rodionov(vals: &[f64], t_crit: f64, l: usize) -> std::vec::Vec<f64> {
fn rust_rodionov(vals: &[f64], t_crit: f64, l: usize) -> Vec<f64> {
let mut results = vec![0.; l];

// calculate sigma^2_l
Expand Down Expand Up @@ -102,10 +103,43 @@ fn calculate_rsi(regime: &[f64], shift_boundary: &f64, is_down: bool, l: &f64, v
rsi
}


/// Calculates the mean for each regime in a regime shift analysis.
/// @param col The column we are measuring change on.
/// @param rsi The column containing RSI values.
#[extendr(use_try_from = true)]
fn rust_regime_means(col: &[f64], rsi: &[f64]) -> Vec<f64> {
let mut means: Vec<f64> = Vec::new();
let mut current_regime: Vec<f64> = Vec::new();
let mut regime_mean: f64;
let mut regime_length: usize;

for i in 0..col.len() {
if rsi[i] == 0. {
// add to current regime
current_regime.push(col[i]);
} else {
// new regime starts: calculate mean of last regime
regime_length = current_regime.len();
regime_mean = current_regime.drain(..).sum::<f64>() / regime_length as f64;
means.append(&mut [regime_mean].repeat(regime_length));
current_regime.push(col[i])
}
}
// calculate means for final regime
regime_length = current_regime.len();
regime_mean = current_regime.drain(..).sum::<f64>() / regime_length as f64;
means.append(&mut [regime_mean].repeat(regime_length));

means
}


// Macro to generate exports.
// This ensures exported functions are registered with R.
// See corresponding C code in `entrypoint.c`.
extendr_module! {
mod rshift;
fn rust_rodionov;
fn rust_regime_means;
}
82 changes: 43 additions & 39 deletions vignettes/STARSmanual.ltx
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
\usepackage{caption}
\usepackage{subcaption}

\title{\texttt{rshift} STARS manual - regime shift analysis for paleoecological data\\v3.0.0}
\title{\texttt{rshift} STARS manual - regime shift analysis for paleoecological data\\v3.1.0}

\author{A. H. Room, F. Franco-Gaviria, D. H. Urrego}

Expand All @@ -17,69 +17,69 @@
\maketitle

\section{Introduction}
STARS (Sequential T-test Analysis of Regime Shifts) is an algorithm which detects regime shifts in an ecosystem \citep{rodionov2004sequential} -
i.e., where the ecosystem undergoes a persistent change in some measure. Regime shifts can happen as a direct result of extrinsic factors like
STARS (Sequential T-test Analysis of Regime Shifts) is an algorithm which detects regime shifts in an ecosystem \citep{rodionov2004sequential} -
i.e., where the ecosystem undergoes a persistent change in some measure. Regime shifts can happen as a direct result of extrinsic factors like
fire or tidal disturbance, or by the breaking of a 'threshold factor' intrinsic to the ecosystem itself.\\
This document will illustrate the theory behind STARS, as well as a guide on how to easily deploy it in R
This document will illustrate the theory behind STARS, as well as a guide on how to easily deploy it in R
using the \texttt{rshift}\footnote{found at https://github.com/alexhroom/rshift} package.

\section{Theory}
STARS is based on the Student's t-test\footnote{which can be problematic; see section 4}; it goes along the data set,
comparing each new observation against every entry in the current 'regime'. If an entry deviates significantly from the
regime average, it becomes a hypothesised 'shift point', and the algorithm then tests if this shift is persistent.
If it is, then an RSI (regime shift index) is created to quantify the size of the shift, and the algorithm starts over
using the shift point as the start of a new regime. Otherwise, it rejects the shift point, adds it to the current regime,
STARS is based on the Student's t-test\footnote{which can be problematic; see section 4}; it goes along the data set,
comparing each new observation against every entry in the current 'regime'. If an entry deviates significantly from the
regime average, it becomes a hypothesised 'shift point', and the algorithm then tests if this shift is persistent.
If it is, then an RSI (regime shift index) is created to quantify the size of the shift, and the algorithm starts over
using the shift point as the start of a new regime. Otherwise, it rejects the shift point, adds it to the current regime,
and continues the search.\\
You will need a table with two columns; an ordinated proxy or variable (such as pollen, temperature, or charcoal),
You will need a table with two columns; an ordinated proxy or variable (such as pollen, temperature, or charcoal),
and some form of time series, like a list of depths for samples or a chronology. From there, it goes like this:
\begin{enumerate}
\item Firstly, we choose an '$l$' value. This is essentially your predicted length for each regime, and has knock-on effects for the sensitivity of the algorithm.
If you choose a bad value, don't worry - you can always change it and start over. 10-15 is usually a good starting point.
\item Firstly, we choose an '$l$' value. This is essentially your predicted length for each regime, and has knock-on effects for the sensitivity of the algorithm.
If you choose a bad value, don't worry - you can always change it and start over. 10-15 is usually a good starting point.
We also need a probability for statistical significance $p$ - usually 0.05 (a 5\% chance that a positive result is a fluke).
\item We then calculate the difference between mean values that would be significantly different - i.e., how much your value has to shift between
\item We then calculate the difference between mean values that would be significantly different - i.e., how much your value has to shift between
two adjacent observations for it to be considered a new regime. The formula for this is
$$\textrm{diff} = t_p(2l-2) \cdot \sqrt{2\sigma_l^2 / l}$$
There are a couple new things here: $t_p(2l-2)$ is the critical value for a two-tailed t-test, at probability level $p$,
with $2l-2$ degrees of freedom. We don't need to know what all these things mean just to use STARS - just find $2l-2$, google a t-table,
There are a couple new things here: $t_p(2l-2)$ is the critical value for a two-tailed t-test, at probability level $p$,
with $2l-2$ degrees of freedom. We don't need to know what all these things mean just to use STARS - just find $2l-2$, google a t-table,
and grab the value from it. \\
$\sigma_l^2$ is an average 'rolling variance' - the variance of each adjacent group of 10 values. To find this,
$\sigma_l^2$ is an average 'rolling variance' - the variance of each adjacent group of 10 values. To find this,
we calculate the variance for values 1 to 10, 2 to 11, 3 to 12, and so on until we reach the last ten, then find the mean.
\item We calculate the initial mean of your proxy for regime 1 $\overline{x_{R1}}$, and find the range of acceptable values in this regime,
which is the range from $\overline{x_{R1}} - \textrm{diff}$ to $\overline{x_{R1}} + \textrm{diff}$. Then, for each value starting with the $l + 1$'th value,
\item We calculate the initial mean of your proxy for regime 1 $\overline{x_{R1}}$, and find the range of acceptable values in this regime,
which is the range from $\overline{x_{R1}} - \textrm{diff}$ to $\overline{x_{R1}} + \textrm{diff}$. Then, for each value starting with the $l + 1$'th value,
check if it is outside the acceptable range. If it isn't, we change $\overline{x_{R1}}$ to the average of this value and the $l-1$ values before it.
\item If the value is outside the acceptable range, we consider this as our candidate shift point. Hereon, this candidate sample will be called $j$.
\item If the value is outside the acceptable range, we consider this as our candidate shift point. Hereon, this candidate sample will be called $j$.
We then calculate the Regime Shift Index (RSI) to test if it is a real shift point. The equation for this is:\\
\[RSI_{i,j} = \sum_{i = j}^{j+l-1}\frac{x^*_i}{l \cdot \sigma_l}.\]
I will explain the process of how to use this, but first it is important to note that here, the direction of the shift becomes important.
$x_i^*$ is equal to $x_i - (\overline{x_{R1}} + \textrm{diff})$ if the shift is up (i.e. $x_j > \overline{x_{R1}} + \textrm{diff}$), or
$\overline{x_{R1}} - \textrm{diff} - x_i$ if the shift is down. (so $x_j < \overline{x_{R1}} - \textrm{diff}$) $\sigma_l$ is trivially the
square root of $\sigma_l^2$, which you calculated in step 2. Furthermore, if the RSI value turns negative at any point during that sum,
I will explain the process of how to use this, but first it is important to note that here, the direction of the shift becomes important.
$x_i^*$ is equal to $x_i - (\overline{x_{R1}} + \textrm{diff})$ if the shift is up (i.e. $x_j > \overline{x_{R1}} + \textrm{diff}$), or
$\overline{x_{R1}} - \textrm{diff} - x_i$ if the shift is down. (so $x_j < \overline{x_{R1}} - \textrm{diff}$) $\sigma_l$ is trivially the
square root of $\sigma_l^2$, which you calculated in step 2. Furthermore, if the RSI value turns negative at any point during that sum,
stop calculating and go onto step 5.\\
Of course, looking at that equation one might be confused on what to do with it, so we will look at it in more detail:\\
$i$ is an index number; it stands for every value in the data between $j$ and $j + l - 1$. This means, starting from the j'th value of the data:\\
find $x_i$, the i'th value of your proxy/variable. Then, use it to calculate $x_i^*$. Divide this by $l \cdot \sigma_l$, and add it to a running total.
Repeat with $i + 1$, $i + 2$ and so on until you reach $j + l - 1$. Each time you add a new value to your total, check if it is positive or negative;
find $x_i$, the i'th value of your proxy/variable. Then, use it to calculate $x_i^*$. Divide this by $l \cdot \sigma_l$, and add it to a running total.
Repeat with $i + 1$, $i + 2$ and so on until you reach $j + l - 1$. Each time you add a new value to your total, check if it is positive or negative;
if negative, stop calculating and go onto step 5.
\item if RSI has gone negative, the test has failed. We consider RSI for $x_j$ to be 0. Add $x_j$ to the values for regime 1
(i.e. use it when calculating $\overline{x_{R1}}$, and go back to step 3. If it remained positive, then your final value is the RSI value for $x_j$,
\item if RSI has gone negative, the test has failed. We consider RSI for $x_j$ to be 0. Add $x_j$ to the values for regime 1
(i.e. use it when calculating $\overline{x_{R1}}$, and go back to step 3. If it remained positive, then your final value is the RSI value for $x_j$,
and j is the start point for a new regime.
\item Calculate the value of $\overline{x_{R2}}$, i.e. the mean of all the values you used for the RSI calculation. We then start the search for
regime 3 from $j + 1$, rather than from $l$ values along like before; this is so we still detect shifts even if regime 2 is a different length to $l$.
\item Calculate the value of $\overline{x_{R2}}$, i.e. the mean of all the values you used for the RSI calculation. We then start the search for
regime 3 from $j + 1$, rather than from $l$ values along like before; this is so we still detect shifts even if regime 2 is a different length to $l$.
Repeat steps 3 through 6 until you have processed all the data.
\end{enumerate}
Note that when discarding a value, the way the new average is calculated is quite tricky: if you're $l$ or more values into the regime, then
the new average is that value and the $l-1$ values before it. Otherwise (e.g. if you've started the regime 3 search from $j+1$) the current
value being looked at is already in the average, so does not need to be recalculated!
If you would like an example of this in action, see Rodionov's original paper \citep{rodionov2004sequential}, which gives an example for annual
value being looked at is already in the average, so does not need to be recalculated!
If you would like an example of this in action, see Rodionov's original paper \citep{rodionov2004sequential}, which gives an example for annual
PDO data for temperature in January.

\section{Using STARS with \texttt{rshift}}
Of course, all this grunt work is very boring. With modern technology, we can make computers do it for us instead!
Of course, all this grunt work is very boring. With modern technology, we can make computers do it for us instead!
There is a function for this built into the \texttt{rshift} R package.

\subsection{Using \texttt{Rodionov()}}
The \texttt{rshift} command for STARS is \texttt{Rodionov()}. It takes 7 variables, so in RStudio will look more like
\texttt{Rodionov(data, col, time, l, prob = 0.05, startrow = 1, merge = FALSE)}. These are the 7 inputs you can put into the function,
The \texttt{rshift} command for STARS is \texttt{Rodionov()}. It takes 7 variables, so in RStudio will look more like
\texttt{Rodionov(data, col, time, l, prob = 0.05, startrow = 1, merge = FALSE)}. These are the 7 inputs you can put into the function,
but only the first 4 are mandatory (the rest default to what's after the = sign).
\begin{itemize}
\item \texttt{data} - the dataset in R that you're using. This should NOT be in quote marks.
Expand All @@ -91,13 +91,13 @@ The other 3 variables are optional:
\begin{itemize}
\item \texttt{prob} - the significance probability. Defaults to $p = 0.05$.
\item \texttt{startrow} - where the algorithm should start from, if you want to skip the first few rows of the dataset. Defaults to 1.
\item \texttt{merge} - changes the result to be either a regime-shift only table (if FALSE), or an additional column to the original table (if TRUE).
If merge = FALSE (default), produces a 2-column table of 'time' (the time value for each regime shift) and 'RSI' (the RSI for each regime shift).
\item \texttt{merge} - changes the result to be either a regime-shift only table (if FALSE), or an additional column to the original table (if TRUE).
If merge = FALSE (default), produces a 2-column table of 'time' (the time value for each regime shift) and 'RSI' (the RSI for each regime shift).
If merge = TRUE, returns the original dataset with an extra RSI column, giving the RSI for each time unit - 0 for non-shift years.
\end{itemize}
So, as an example, we will reproduce the data from the Rodionov paper. Rodionov uses the Pacific Decadal Oscillation (PDO) as his dataset, a measure of climate
variability over the Pacific Ocean (see \citet{mantua2002pacific} for more details). This is available in the \texttt{rshift} package as the dataset \texttt{January\_PDO}.
To load it, ensure that `rshift' has been imported into your R session (with \texttt{library(rshift)}) and then run \texttt{data(January\_PDO)} to
To load it, ensure that `rshift' has been imported into your R session (with \texttt{library(rshift)}) and then run \texttt{data(January\_PDO)} to
load the data into your session. We then can run the STARS algorithm in the simplest way with the following line of code:

\begin{verbatim}
Expand All @@ -110,13 +110,17 @@ and it would return a list of points in the chronology where it has detected a r
\texttt{rshift} also contains \texttt{RSI\_graph}, a way of visualising the results of a STARS test.
Using our previous data, we run \texttt{Rodionov()} with \texttt{merge} set to TRUE,
save it as RSI\_data with the \texttt{<-} operator, then use \texttt{RSI\_graph()} to create a graph.
The parameter \texttt{mean\_lines} in \texttt{RSI\_graph()} add lines to the graph
indicating the mean value of each regime.

\begin{verbatim}
RSI_data <- Rodionov(January_PDO, "PDO", "Age", 10, merge = TRUE)
RSI_graph(RSI_data, "DCA1", "Age", "RSI"))
RSI_graph(RSI_data, "DCA1", "Age", "RSI", mean_lines = TRUE))
\end{verbatim}

It takes the data, variable column and chronology like before, but you must also specify the name of the column with RSI values in (which \texttt{Rodionov()} automatically creates as 'RSI'). \newpage
It takes the data, variable column and chronology like before,
but you must also specify the name of the column with RSI values in
(which \texttt{Rodionov()} automatically creates as 'RSI'). \newpage
\begin{figure}[!h]
\centering
\begin{subfigure}[b]{\textwidth}
Expand Down
Binary file modified vignettes/rshift_PDO_plot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 008169d

Please sign in to comment.