Source code for chap_core.assessment.prediction_evaluator

from collections import defaultdict
from typing import Protocol, TypeVar, Iterable, Dict
from gluonts.model import SampleForecast
from gluonts.evaluation import Evaluator
from gluonts.model import Forecast
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import pandas as pd

from chap_core import get_temp_dir
from chap_core.assessment.dataset_splitting import (
    train_test_generator,
)
from chap_core.data.gluonts_adaptor.dataset import ForecastAdaptor
from chap_core.datatypes import TimeSeriesData, Samples, SamplesWithTruth
import logging

from chap_core.spatio_temporal_data.temporal_dataclass import DataSet


# Raise matplotlib's own log level to "warning" so its lower-severity
# messages are not emitted while this module produces plots.
plt.set_loglevel(level="warning")
# Module-level logger named after this module.
logger = logging.getLogger(__name__)


# Type variable constrained to TimeSeriesData subclasses; used in this module's
# type annotations.
# NOTE(review): the variable is spelled `FetureType` while the TypeVar is
# declared with the name "FeatureType". Static type checkers expect the two to
# match; renaming requires updating every annotation that uses this variable.
FetureType = TypeVar("FeatureType", bound=TimeSeriesData)


def without_disease(t):
    """
    Return ``t`` unchanged.

    This is an identity placeholder. In this module it is only used inside a
    type annotation to mark the type of future data passed to a predictor.

    Parameters
    ----------
    t
        Any object (in practice a type or type variable).

    Returns
    -------
    The same object that was passed in.
    """
    return t


class Predictor(Protocol):
    """Structural interface for objects that can produce forecasts via ``predict``."""

    def predict(
        self,
        historic_data: DataSet[FetureType],
        future_data: DataSet[without_disease(FetureType)],
    ) -> Samples:
        """
        Produce forecast samples for the periods covered by ``future_data``.

        Parameters
        ----------
        historic_data : DataSet
            Data observed up to the prediction start.
        future_data : DataSet
            Data for the periods to predict.

        Returns
        -------
        Samples
            The forecast samples.
            NOTE(review): callers in this module iterate the result with
            ``.items()`` as ``(location, samples)`` pairs, which suggests a
            per-location collection rather than a bare ``Samples`` -- confirm
            the annotation.
        """
        ...


class Estimator(Protocol):
    """Structural interface for objects that can be trained into a ``Predictor``."""

    def train(self, data: DataSet) -> Predictor:
        """
        Fit on ``data`` and return the resulting predictor.

        Parameters
        ----------
        data : DataSet
            The training data.

        Returns
        -------
        Predictor
            An object exposing ``predict(historic_data, future_data)``.
        """
        ...


def backtest(
    estimator: Estimator,
    data: DataSet,
    prediction_length,
    n_test_sets,
    stride=1,
    weather_provider=None,
) -> Iterable[DataSet]:
    """
    Train ``estimator`` once and yield its predictions merged with the truth for each test split.

    This is a generator: nothing (including splitting and training) happens
    until the result is iterated.

    Parameters
    ----------
    estimator : Estimator
        The estimator to train.
    data : DataSet
        The full dataset, split into a training part and test instances by
        ``train_test_generator``.
    prediction_length
        Passed to ``train_test_generator``.
    n_test_sets
        Passed to ``train_test_generator``.
    stride
        NOTE(review): accepted but currently not used anywhere in this
        function -- confirm whether it should be forwarded to
        ``train_test_generator``.
    weather_provider
        Passed to ``train_test_generator`` as ``future_weather_provider``.

    Yields
    ------
    DataSet
        For each test instance, the future truth merged with the predicted
        samples using ``SamplesWithTruth`` as the resulting dataclass.
    """
    train, test_generator = train_test_generator(
        data, prediction_length, n_test_sets, future_weather_provider=weather_provider
    )
    predictor = estimator.train(train)
    for historic_data, future_data, future_truth in test_generator:
        predictions = predictor.predict(historic_data, future_data)
        yield future_truth.merge(predictions, result_dataclass=SamplesWithTruth)


def evaluate_model(
    estimator: Estimator,
    data: DataSet,
    prediction_length=3,
    n_test_sets=4,
    report_filename=None,
    weather_provider=None,
):
    """
    Evaluate a model on a dataset on a held out test set, making multiple predictions on the test set
    using the same trained model

    Parameters
    ----------
    estimator : Estimator
        The estimator to train and evaluate
    data : DataSet
        The data to train and evaluate on
    prediction_length : int
        The number of periods to predict ahead
    n_test_sets : int
        The number of test sets to evaluate on
    report_filename : str, optional
        Path of the PDF that the forecast plots are written to. When ``None``,
        ``evaluation_report.pdf`` inside the directory returned by
        ``get_temp_dir()`` is used.
    weather_provider : optional
        Passed to ``train_test_generator`` as ``future_weather_provider``.

    Returns
    -------
    tuple
        Summary and individual evaluation results, as returned by the gluonts
        ``Evaluator`` (quantiles 0.1, 0.5 and 0.9, NaN forecasts allowed).

    Raises
    ------
    ValueError
        If no forecasts were produced, so there is nothing to evaluate.
    """
    logger.info(f"Evaluating {estimator} with {n_test_sets} test sets for {prediction_length} periods ahead")
    train, test_generator = train_test_generator(
        data, prediction_length, n_test_sets, future_weather_provider=weather_provider
    )
    predictor = estimator.train(train)

    # One truth frame per location: disease cases indexed by period.
    truth_data = {
        location: pd.DataFrame(
            data[location].disease_cases,
            index=data[location].time_period.to_period_index(),
        )
        for location in data.keys()
    }

    if report_filename is None:
        report_filename = str(get_temp_dir() / "evaluation_report.pdf")

    logger.info(f"Plotting forecasts to {report_filename}")
    # plot_forecasts is lazy; materialising it both writes the PDF report and
    # collects the (forecast, truth) pairs that are evaluated below.
    logger.info("Getting forecasts")
    forecasts_and_truths = list(plot_forecasts(predictor, test_generator, truth_data, report_filename))
    if not forecasts_and_truths:
        raise ValueError("No forecasts were produced, so there is nothing to evaluate")
    forecast_list, tss = zip(*forecasts_and_truths)

    logger.info("Evaluating")
    evaluator = Evaluator(quantiles=[0.1, 0.5, 0.9], num_workers=None, allow_nan_forecast=True)
    results = evaluator(tss, forecast_list)
    logger.info("Finished Evaluating")
    return results


def create_multiloc_timeseries(truth_data):
    """
    Convert per-location truth frames into a ``MultiLocationDiseaseTimeSeries``.

    Parameters
    ----------
    truth_data : dict
        Maps a location to a ``pd.DataFrame``. Each row is unpacked as
        ``(period, cases)``, i.e. the index value followed by exactly one
        column value.

    Returns
    -------
    MultiLocationDiseaseTimeSeries
        One ``DiseaseTimeSeries`` per location, built from one
        ``DiseaseObservation`` per row.
    """
    # Kept as a function-level import, as in the original; imported once
    # instead of on every loop iteration.
    from chap_core.assessment.representations import (
        DiseaseObservation,
        DiseaseTimeSeries,
        MultiLocationDiseaseTimeSeries,
    )

    multi_location_disease_time_series = MultiLocationDiseaseTimeSeries()
    for location, df in truth_data.items():
        observations = [
            DiseaseObservation(time_period=period, disease_cases=cases)
            for period, cases in df.itertuples(index=True, name="Pandas")
        ]
        multi_location_disease_time_series[location] = DiseaseTimeSeries(observations=observations)
    return multi_location_disease_time_series


def _get_forecast_generators(
    predictor: Predictor,
    test_generator: Iterable[tuple[DataSet, DataSet, DataSet]],
    truth_data: Dict[str, pd.DataFrame],
) -> tuple[list[Forecast], list[pd.DataFrame]]:
    """
    Get the forecast and truth data for a predictor and test generator.
    One entry is a combination of prediction start period and location

    Parameters
    ----------
    predictor : Predictor
        The predictor to evaluate
    test_generator : Iterable[tuple[DataSet, DataSet, DataSet]]
        The test generator to generate test data
    truth_data : dict[str, pd.DataFrame]
        The truth data for the locations

    Returns
    -------
    tuple[list[Forecast], list[pd.DataFrame]]
        Two parallel lists: the forecasts, and for each forecast the complete
        truth frame of its location.
    """
    tss = []
    forecast_list = []
    for historic_data, future_data, _ in test_generator:
        forecasts = predictor.predict(historic_data, future_data)
        for location, samples in forecasts.items():
            forecast = ForecastAdaptor.from_samples(samples)
            tss.append(truth_data[location])
            forecast_list.append(forecast)
    return forecast_list, tss


def _get_forecast_dict(predictor: Predictor, test_generator) -> dict[str, list[SampleForecast]]:
    """
    Collect the forecasts of every test instance, grouped by location.

    Parameters
    ----------
    predictor : Predictor
        The predictor used to forecast each test instance.
    test_generator
        Iterable of ``(historic_data, future_data, truth)`` triples; the third
        element is ignored here.

    Returns
    -------
    dict[str, list[SampleForecast]]
        Maps each location to its forecasts in test-instance order. The
        returned object is a ``defaultdict(list)``, so looking up an unknown
        location creates an empty list instead of raising ``KeyError``.

    Raises
    ------
    AssertionError
        If a test instance has a ``future_data`` with an empty period range.
    """
    forecast_dict = defaultdict(list)
    for historic_data, future_data, _ in test_generator:
        assert len(future_data.period_range) > 0, (
            f"Future data must have at least one period {historic_data.period_range}, {future_data.period_range}"
        )
        forecasts = predictor.predict(historic_data, future_data)
        for location, samples in forecasts.items():
            forecast_dict[location].append(ForecastAdaptor.from_samples(samples))
    return forecast_dict


def plot_forecasts(predictor, test_instance, truth, pdf_filename):
    """
    Plot every forecast against the truth into a PDF and yield the plotted pairs.

    This is a generator: the forecasts are computed and the PDF is opened on
    the first iteration, one page is written per yielded item, and the PDF is
    closed once the generator is exhausted or closed.

    Parameters
    ----------
    predictor
        Object whose ``predict(historic_data, future_data)`` returns forecasts
        per location.
    test_instance
        Iterable of ``(historic_data, future_data, truth)`` triples, passed on
        to ``_get_forecast_dict``.
    truth : dict
        Maps a location to its truth frame. If a location is missing, the
        lookup is retried with ``str(location)``.
    pdf_filename
        Path of the PDF file to write.

    Yields
    ------
    tuple
        ``(forecast, t)`` where ``t`` is the location's truth restricted to
        index values up to and including ``forecast.index[-1]``.

    Raises
    ------
    KeyError
        If a location is found in ``truth`` neither as-is nor as a string.
    """
    # Number of trailing truth rows drawn as context on each page.
    plotting_context = 52 * 6
    forecast_dict = _get_forecast_dict(predictor, test_instance)
    with PdfPages(pdf_filename) as pdf:
        for location, forecasts in forecast_dict.items():
            logger.info(f"Running on location {location}")
            try:
                _t = truth[location]
            except KeyError:
                # Retry with the string form of the location key.
                location = str(location)
                try:
                    _t = truth[location]
                except KeyError:
                    logger.error(
                        f"Location {repr(location)} not found in truth data which has locations {truth.keys()}"
                    )
                    raise
                logger.warning(
                    f"Had to convert location to string {location}, something has maybe gone wrong at some point with data types"
                )
            for forecast in forecasts:
                logger.info("Forecasts: ")
                if np.any(np.isnan(forecast.samples)):
                    logger.warning(f"Forecast {forecast} has NaN values: {forecast.samples}")
                t = _t[_t.index <= forecast.index[-1]]
                plt.figure(figsize=(8, 4))  # Set the figure size
                try:
                    forecast.plot(show_label=True)
                    plt.plot(t[-plotting_context:].to_timestamp())
                    plt.title(location)
                    plt.legend()
                    pdf.savefig()
                finally:
                    # Always release the figure, even if plotting or saving fails.
                    plt.close()
                yield forecast, t


def plot_predictions(predictions: DataSet[Samples], truth: DataSet, pdf_filename):
    """
    Write one PDF page per location showing the prediction together with the truth.

    Parameters
    ----------
    predictions : DataSet[Samples]
        Predicted samples per location.
    truth : DataSet
        Truth data per location; ``disease_cases`` is plotted, indexed by
        ``time_period.to_period_index()``. Every location in ``predictions``
        must be present here.
    pdf_filename
        Path of the PDF file to write.
    """
    # Number of trailing truth rows drawn as context on each page.
    context_length = 52 * 6
    truth_dict = {
        location: pd.DataFrame(
            truth[location].disease_cases,
            index=truth[location].time_period.to_period_index(),
        )
        for location in truth.keys()
    }
    with PdfPages(pdf_filename) as pdf:
        for location, samples in predictions.items():
            forecast = ForecastAdaptor.from_samples(samples)
            t = truth_dict[location]
            plt.figure(figsize=(8, 4))  # Set the figure size
            try:
                forecast.plot(show_label=True)
                plt.plot(t[-context_length:].to_timestamp())
                plt.title(location)
                plt.legend()
                pdf.savefig()
            finally:
                # Always release the figure, even if plotting or saving fails.
                plt.close()