Working with Spatio-Temporal Data

This tutorial shows how to work with spatio-temporal data in CHAP models using established R packages. Rather than reinventing the wheel, we leverage the excellent tidyverse ecosystem.

Recommended Packages

Package	Purpose	Install
tsibble	Temporal data structures	`install.packages("tsibble")`
fabletools	Temporal aggregation, reconciliation	`install.packages("fabletools")`
sf	Spatial data and operations	`install.packages("sf")`
cubble	Spatio-temporal data wrangling	`install.packages("cubble")`

library(chapr)
library(dplyr)
library(tidyr)
library(tsibble)

Working with CHAP Example Data

Let’s start with the example data from the SDK:

data <- get_example_data('laos', 'M')
training_data <- data$training_data

# Examine the structure
training_data
#> # A tsibble: 1,057 x 12 [1M]
#> # Key:       location [7]
#>    time_period rainfall mean_temperature disease_cases population parent
#>          <mth>    <dbl>            <dbl>         <dbl>      <dbl> <chr> 
#>  1    2000 Jul   430.               23.4             0     58503. -     
#>  2    2000 Aug   322.               23.8             0     58503. -     
#>  3    2000 Sep   265.               22.7             0     58503. -     
#>  4    2000 Oct   103.               22.6             0     58503. -     
#>  5    2000 Nov    19.7              20.3             0     58503. -     
#>  6    2000 Dec    26.0              19.1             0     58503. -     
#>  7    2001 Jan    17.6              19.8             0     60157. -     
#>  8    2001 Feb     7.28             22.0             0     60157. -     
#>  9    2001 Mar   123.               22.6             0     60157. -     
#> 10    2001 Apr    29.6              27.5             0     60157. -     
#> # ℹ 1,047 more rows
#> # ℹ 6 more variables: location <chr>, Cases <dbl>, E <dbl>, month <dbl>,
#> #   ID_year <dbl>, ID_spat <chr>

The data is already a tsibble with: - time_period as the temporal index (monthly) - location as the spatial key - disease_cases as the target variable - Various covariates (rainfall, temperature, population)

Temporal Operations with tsibble

Lag Features

Create lagged versions of variables for time series modeling:

# Add lagged disease cases (1 and 2 months ago)
training_with_lags <- training_data |>
  group_by_key() |>
  mutate(
    cases_lag1 = lag(disease_cases, 1),
    cases_lag2 = lag(disease_cases, 2),
    rainfall_lag1 = lag(rainfall, 1)
  ) |>
  ungroup()

training_with_lags |>
  select(time_period, location, disease_cases, cases_lag1, cases_lag2) |>
  head(10)
#> # A tsibble: 10 x 5 [1M]
#> # Key:       location [1]
#>    time_period location disease_cases cases_lag1 cases_lag2
#>          <mth> <chr>            <dbl>      <dbl>      <dbl>
#>  1    2000 Jul Bokeo                0         NA         NA
#>  2    2000 Aug Bokeo                0          0         NA
#>  3    2000 Sep Bokeo                0          0          0
#>  4    2000 Oct Bokeo                0          0          0
#>  5    2000 Nov Bokeo                0          0          0
#>  6    2000 Dec Bokeo                0          0          0
#>  7    2001 Jan Bokeo                0          0          0
#>  8    2001 Feb Bokeo                0          0          0
#>  9    2001 Mar Bokeo                0          0          0
#> 10    2001 Apr Bokeo                0          0          0

Rolling Window Statistics

Calculate moving averages and other rolling statistics using the slider package:

# Rolling mean of disease cases (3-month window)
training_with_rolling <- training_data |>
  group_by_key() |>
  mutate(
    cases_ma3 = slider::slide_dbl(disease_cases, mean, .before = 2, .complete = TRUE),
    cases_max3 = slider::slide_dbl(disease_cases, max, .before = 2, .complete = TRUE)
  ) |>
  ungroup()

training_with_rolling |>
  select(time_period, location, disease_cases, cases_ma3, cases_max3) |>
  filter(location == "Bokeo") |>
  head(10)
#> # A tsibble: 10 x 5 [1M]
#> # Key:       location [1]
#>    time_period location disease_cases cases_ma3 cases_max3
#>          <mth> <chr>            <dbl>     <dbl>      <dbl>
#>  1    2000 Jul Bokeo                0        NA         NA
#>  2    2000 Aug Bokeo                0        NA         NA
#>  3    2000 Sep Bokeo                0         0          0
#>  4    2000 Oct Bokeo                0         0          0
#>  5    2000 Nov Bokeo                0         0          0
#>  6    2000 Dec Bokeo                0         0          0
#>  7    2001 Jan Bokeo                0         0          0
#>  8    2001 Feb Bokeo                0         0          0
#>  9    2001 Mar Bokeo                0         0          0
#> 10    2001 Apr Bokeo                0         0          0

Differencing

Remove trends with differencing:

training_diff <- training_data |>
  group_by_key() |>
  mutate(
    cases_diff = difference(disease_cases, 1),
    cases_diff12 = difference(disease_cases, 12)  # Seasonal difference
  ) |>
  ungroup()

training_diff |>
  select(time_period, location, disease_cases, cases_diff, cases_diff12) |>
  filter(location == "Bokeo") |>
  head(15)
#> # A tsibble: 15 x 5 [1M]
#> # Key:       location [1]
#>    time_period location disease_cases cases_diff cases_diff12
#>          <mth> <chr>            <dbl>      <dbl>        <dbl>
#>  1    2000 Jul Bokeo                0         NA           NA
#>  2    2000 Aug Bokeo                0          0           NA
#>  3    2000 Sep Bokeo                0          0           NA
#>  4    2000 Oct Bokeo                0          0           NA
#>  5    2000 Nov Bokeo                0          0           NA
#>  6    2000 Dec Bokeo                0          0           NA
#>  7    2001 Jan Bokeo                0          0           NA
#>  8    2001 Feb Bokeo                0          0           NA
#>  9    2001 Mar Bokeo                0          0           NA
#> 10    2001 Apr Bokeo                0          0           NA
#> 11    2001 May Bokeo                0          0           NA
#> 12    2001 Jun Bokeo                1          1           NA
#> 13    2001 Jul Bokeo                1          0            1
#> 14    2001 Aug Bokeo                1          0            1
#> 15    2001 Sep Bokeo                1          0            1

Handling Missing Values

tsibble provides tools for gap detection and filling:

# Check for gaps in the time series
has_gaps(training_data)
#> # A tibble: 7 × 2
#>   location              .gaps
#>   <chr>                 <lgl>
#> 1 Bokeo                 FALSE
#> 2 Champasak             FALSE
#> 3 LouangNamtha          FALSE
#> 4 Oudomxai              FALSE
#> 5 Savannakhet           FALSE
#> 6 Vientiane[prefecture] FALSE
#> 7 Xiangkhoang           FALSE

# Count gaps per location
count_gaps(training_data)
#> # A tibble: 0 × 4
#> # ℹ 4 variables: location <chr>, .from <mth>, .to <mth>, .n <int>

# If there were gaps, fill them:
filled_data <- training_data |>
  fill_gaps() |>
  group_by_key() |>
  tidyr::fill(disease_cases, .direction = "down") |>
  ungroup()

Temporal Aggregation with fabletools

Aggregate to Quarterly Data

library(fabletools)

# Convert monthly to quarterly
quarterly_data <- training_data |>
  index_by(quarter = yearquarter(time_period)) |>
  group_by(location) |>
  summarise(
    disease_cases = sum(disease_cases, na.rm = TRUE),
    rainfall = mean(rainfall, na.rm = TRUE),
    mean_temperature = mean(mean_temperature, na.rm = TRUE),
    population = mean(population, na.rm = TRUE)
  )

quarterly_data
#> # A tsibble: 357 x 6 [1Q]
#> # Key:       location [7]
#>    location quarter disease_cases rainfall mean_temperature population
#>    <chr>      <qtr>         <dbl>    <dbl>            <dbl>      <dbl>
#>  1 Bokeo    2000 Q3             0    339.              23.3     58503.
#>  2 Bokeo    2000 Q4             0     49.7             20.7     58503.
#>  3 Bokeo    2001 Q1             0     49.4             21.4     60157.
#>  4 Bokeo    2001 Q2             1    253.              24.8     60157.
#>  5 Bokeo    2001 Q3             3    337.              23.5     60157.
#>  6 Bokeo    2001 Q4             1     67.2             19.9     60157.
#>  7 Bokeo    2002 Q1             0     33.0             21.0     61812.
#>  8 Bokeo    2002 Q2             2    276.              24.6     61812.
#>  9 Bokeo    2002 Q3             1    370.              23.2     61812.
#> 10 Bokeo    2002 Q4             1    107.              20.0     61812.
#> # ℹ 347 more rows

Aggregate to Yearly Data

yearly_data <- training_data |>
  as_tibble() |>
  mutate(year = lubridate::year(time_period)) |>
  group_by(location, year) |>
  summarise(
    total_cases = sum(disease_cases, na.rm = TRUE),
    mean_rainfall = mean(rainfall, na.rm = TRUE),
    .groups = "drop"
  )

yearly_data
#> # A tibble: 98 × 4
#>    location  year total_cases mean_rainfall
#>    <chr>    <dbl>       <dbl>         <dbl>
#>  1 Bokeo     2000           0          194.
#>  2 Bokeo     2001           5          177.
#>  3 Bokeo     2002           4          197.
#>  4 Bokeo     2003          20          147.
#>  5 Bokeo     2004           0          185.
#>  6 Bokeo     2005           4          175.
#>  7 Bokeo     2006           6          173.
#>  8 Bokeo     2007           1          170.
#>  9 Bokeo     2008         218          205.
#> 10 Bokeo     2009         709          160.
#> # ℹ 88 more rows

Spatial Aggregation

Aggregate Across Locations

Sometimes you need national-level totals from regional data:

# Aggregate all locations to national level
national_data <- training_data |>
  as_tibble() |>
  group_by(time_period) |>
  summarise(
    total_cases = sum(disease_cases, na.rm = TRUE),
    total_population = sum(population, na.rm = TRUE),
    mean_rainfall = weighted.mean(rainfall, population, na.rm = TRUE),
    mean_temperature = weighted.mean(mean_temperature, population, na.rm = TRUE)
  )

national_data |> head(10)
#> # A tibble: 10 × 5
#>    time_period total_cases total_population mean_rainfall mean_temperature
#>          <mth>       <dbl>            <dbl>         <dbl>            <dbl>
#>  1    2000 Jul         159         2269893.        389.               24.2
#>  2    2000 Aug         135         2269893.        380.               24.4
#>  3    2000 Sep         124         2269893.        294.               23.6
#>  4    2000 Oct          87         2269893.        143.               23.4
#>  5    2000 Nov          31         2269893.         22.4              21.5
#>  6    2000 Dec          25         2269893.         12.3              21.3
#>  7    2001 Jan          11         2304144.         11.7              22.3
#>  8    2001 Feb          19         2304144.          8.11             23.1
#>  9    2001 Mar          37         2304144.        119.               24.0
#> 10    2001 Apr          36         2304144.         54.6              27.5

Hierarchical Aggregation

Create data at multiple levels for hierarchical forecasting:

# Assume we have a parent column for regional grouping
# Create hierarchical structure
hierarchical_data <- training_data |>
  as_tibble() |>
  group_by(time_period, parent) |>
  summarise(
    regional_cases = sum(disease_cases, na.rm = TRUE),
    regional_pop = sum(population, na.rm = TRUE),
    .groups = "drop"
  )

hierarchical_data |> head(10)
#> # A tibble: 10 × 4
#>    time_period parent regional_cases regional_pop
#>          <mth> <chr>           <dbl>        <dbl>
#>  1    2000 Jul -                 159     2269893.
#>  2    2000 Aug -                 135     2269893.
#>  3    2000 Sep -                 124     2269893.
#>  4    2000 Oct -                  87     2269893.
#>  5    2000 Nov -                  31     2269893.
#>  6    2000 Dec -                  25     2269893.
#>  7    2001 Jan -                  11     2304144.
#>  8    2001 Feb -                  19     2304144.
#>  9    2001 Mar -                  37     2304144.
#> 10    2001 Apr -                  36     2304144.

Working with Spatial Data (sf)

If your data includes geographic coordinates or boundaries:

library(sf)

# Example: Create spatial features from coordinates
locations_sf <- tibble(
  location = c("Bokeo", "Luangprabang", "Oudomxay"),
  lon = c(100.5, 102.1, 101.5),
  lat = c(20.2, 19.9, 20.7)
) |>
  st_as_sf(coords = c("lon", "lat"), crs = 4326)

# Spatial join with administrative boundaries
# admin_boundaries <- st_read("boundaries.geojson")
# joined <- st_join(locations_sf, admin_boundaries)

# Calculate distances between locations
# distances <- st_distance(locations_sf)

Spatio-Temporal Data with cubble

For more complex spatio-temporal analyses, the cubble package provides a unified structure:

library(cubble)

# Create a cubble from separate spatial and temporal data
cb <- as_cubble(
  data = training_data,
  key = location,
  index = time_period,
  coords = c(lon, lat)  # if coordinates available
)

# Switch between spatial and temporal views
cb_temporal <- cb |> face_temporal()
cb_spatial <- cb |> face_spatial()

# Useful for linking maps with time series plots

Feature Engineering for CHAP Models

Here’s a complete example of preparing features for a CHAP model:

prepare_features <- function(data) {
  data |>
    group_by_key() |>
    mutate(
      # Lag features
      cases_lag1 = lag(disease_cases, 1),
      cases_lag2 = lag(disease_cases, 2),
      cases_lag3 = lag(disease_cases, 3),
      rainfall_lag1 = lag(rainfall, 1),

      # Rolling statistics
      cases_ma3 = slider::slide_dbl(disease_cases, mean, .before = 2, .complete = TRUE),
      rainfall_ma3 = slider::slide_dbl(rainfall, mean, .before = 2, .complete = TRUE),

      # Seasonal features
      month = lubridate::month(time_period),

      # Year-over-year change
      cases_yoy = disease_cases - lag(disease_cases, 12)
    ) |>
    ungroup() |>
    # Remove rows with NA from lagging
    filter(!is.na(cases_lag3))
}

# Apply to training data
features <- prepare_features(training_data)
features |>
  select(time_period, location, disease_cases, cases_lag1, cases_ma3, month) |>
  head(10)
#> # A tsibble: 10 x 6 [1M]
#> # Key:       location [1]
#>    time_period location disease_cases cases_lag1 cases_ma3 month
#>          <mth> <chr>            <dbl>      <dbl>     <dbl> <dbl>
#>  1    2000 Oct Bokeo                0          0     0        10
#>  2    2000 Nov Bokeo                0          0     0        11
#>  3    2000 Dec Bokeo                0          0     0        12
#>  4    2001 Jan Bokeo                0          0     0         1
#>  5    2001 Feb Bokeo                0          0     0         2
#>  6    2001 Mar Bokeo                0          0     0         3
#>  7    2001 Apr Bokeo                0          0     0         4
#>  8    2001 May Bokeo                0          0     0         5
#>  9    2001 Jun Bokeo                1          0     0.333     6
#> 10    2001 Jul Bokeo                1          1     0.667     7

Using Features in a CHAP Model

Here’s a complete example using lag features, validated against the Laos dataset:

# Define a model that uses lag features
train_fn <- function(training_data, model_configuration = list()) {
  # Calculate mean and use lagged values for adjustment
  location_stats <- training_data |>
    as_tibble() |>
    group_by(location) |>
    summarise(
      mean_cases = mean(disease_cases, na.rm = TRUE),
      .groups = "drop"
    )

  list(location_stats = location_stats)
}

predict_fn <- function(historic_data, future_data, saved_model,
                       model_configuration = list()) {
  # Get the last observed value per location from historic data
  last_obs <- historic_data |>
    as_tibble() |>
    group_by(location) |>
    slice_max(time_period, n = 1) |>
    select(location, last_cases = disease_cases) |>
    ungroup()

  # Blend historical mean with last observation
  future_data |>
    left_join(saved_model$location_stats, by = "location") |>
    left_join(last_obs, by = "location") |>
    mutate(
      # Simple blend: 70% mean + 30% last observation
      prediction = 0.7 * mean_cases + 0.3 * coalesce(last_cases, mean_cases),
      samples = purrr::map(prediction, ~c(.x))
    ) |>
    select(-mean_cases, -last_cases, -prediction)
}

# Validate on Laos dataset
data <- get_example_data('laos', 'M')
result <- validate_model_io(train_fn, predict_fn, data)

result$success
#> [1] TRUE
result$n_predictions
#> [1] 21
result$n_samples
#> [1] 1

The validation confirms our model produces the correct output structure.

A More Complex Example with Rolling Features

For models using rolling window features (requires the slider package):

train_fn <- function(training_data, model_configuration = list()) {
  # Prepare features with rolling statistics
  features <- training_data |>
    group_by_key() |>
    mutate(
      cases_lag1 = lag(disease_cases, 1),
      cases_ma3 = slider::slide_dbl(disease_cases, mean, .before = 2, .complete = TRUE),
      rainfall_lag1 = lag(rainfall, 1)
    ) |>
    ungroup() |>
    filter(!is.na(cases_ma3))

  # Fit a simple linear model per location
  models <- features |>
    as_tibble() |>
    group_by(location) |>
    group_map(~ lm(disease_cases ~ cases_lag1 + cases_ma3 + rainfall_lag1, data = .x))

  names(models) <- unique(features$location)
  list(models = models, feature_means = features |>
         summarise(across(c(cases_lag1, cases_ma3, rainfall_lag1), mean, na.rm = TRUE)))
}

predict_fn <- function(historic_data, future_data, saved_model,
                       model_configuration = list()) {
  # Get the last known feature values from historic data
  last_features <- historic_data |>
    group_by_key() |>
    mutate(
      cases_lag1 = lag(disease_cases, 1),
      cases_ma3 = slider::slide_dbl(disease_cases, mean, .before = 2, .complete = TRUE),
      rainfall_lag1 = lag(rainfall, 1)
    ) |>
    slice_max(time_period, n = 1) |>
    ungroup() |>
    select(location, cases_lag1, cases_ma3, rainfall_lag1)

  # Generate predictions
  future_data |>
    left_join(last_features, by = "location") |>
    rowwise() |>
    mutate(
      pred = predict(saved_model$models[[location]], newdata = pick(everything())),
      samples = list(c(max(0, pred)))  # Ensure non-negative
    ) |>
    ungroup() |>
    select(-pred, -cases_lag1, -cases_ma3, -rainfall_lag1)
}

Summary

For spatio-temporal data in CHAP:

tsibble - Use for all temporal data structures and operations
fabletools - Use for temporal aggregation and hierarchical forecasting
sf - Use for spatial operations (joins, distances, boundaries)
cubble - Use for complex spatio-temporal analysis with linked views
slider - Use for rolling window calculations

These packages integrate seamlessly with the tidyverse and work well with the CHAP SDK’s tsibble-based data format.