Source code for chap_core.hpo.hpoModel

import logging
from typing import Any, Literal, Optional

from chap_core.spatio_temporal_data.temporal_dataclass import DataSet
from chap_core.database.model_templates_and_config_tables import ModelConfiguration
from chap_core.file_io.example_data_set import DataSetType

from .hpoModelInterface import HpoModelInterface
from .objective import Objective
from .searcher import Searcher
from .base import write_yaml

Direction = Literal["maximize", "minimize"]


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class HpoModel(HpoModelInterface):
    def __init__(
        self,
        searcher: Searcher,
        objective: Objective,
        direction: Direction = "minimize",
        model_configuration: Optional[dict[str, list]] = None,
    ):
        if direction not in ("maximize", "minimize"):
            raise ValueError("direction must be 'maximize' or 'minimize'")
        self._searcher = searcher
        self._objective = objective
        self._direction = direction
        self.base_configs = model_configuration
        self._best_config: Optional[dict[str, dict[str, Any]]] = None
        self._leaderboard: list[dict[str, Any]] = []
        self._predictor = None

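    # A sketch of the expected `model_configuration` search space, inferred
    # from how it is consumed below (option names mapped to lists of candidate
    # values); the names and values are hypothetical:
    #
    #   model_configuration = {
    #       "lr": [0.1, 0.01],
    #       "n_estimators": [50, 100],
    #   }
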
    def train(self, dataset: Optional[DataSetType]) -> Any:
        """
        Calls get_leaderboard to find the optimal configuration.
        Then trains the tuned model on the whole input dataset (train + validation).
        """
        # Computes the leaderboard; the return value is not needed here because
        # the best config is stored on self.
        self.get_leaderboard(dataset)
        template = self._objective.model_template
        # NOTE: it is unclear whether accessing the template through the objective
        # is the right design; an alternative is to pass the template to HpoModel
        # and let HpoModel call get_model().
        # TODO: validate config without "user_option_values"
        if self._best_config is None:
            raise ValueError("No best configuration found. Have you run get_leaderboard()?")
        logger.info(f"Validating best model configuration: {self._best_config}")
        config = ModelConfiguration.model_validate(self._best_config)
        logger.info(f"Validated best model configuration: {config}")
        estimator = template.get_model(config)
        self._predictor = estimator.train(dataset)
        return self._predictor

    def predict(self, historic_data: DataSet, future_data: DataSet) -> DataSet:
        if self._predictor is None:
            raise ValueError("No trained predictor available. Have you run train()?")
        return self._predictor.predict(historic_data, future_data)

    def get_leaderboard(self, dataset: Optional[DataSetType]) -> list[dict[str, Any]]:
        """
        Runs hyperparameter optimization over the search space.
        Returns a sorted list of configurations together with their scores.
        """
        best_score = float("inf") if self._direction == "minimize" else float("-inf")
        best_params: Optional[dict[str, Any]] = None
        self._leaderboard = []
        self._searcher.reset(self.base_configs)
        while True:
            params = self._searcher.ask()
            if params is None:
                break
            trial_number = None
            if params.get("_trial_id") is not None:  # set by TPESearcher
                trial_number = params.pop("_trial_id")
            # Maybe best to separate hpo_config and the other configs into two files?
            score = self._objective(params, dataset)
            if trial_number is not None:  # for parallel TPE search
                params["_trial_id"] = trial_number
                self._searcher.tell(params, score)
                params.pop("_trial_id")
            else:
                self._searcher.tell(params, score)
            self._leaderboard.append(
                {
                    "config": params,
                    "score": score,
                }
            )
            is_better = (score < best_score) if self._direction == "minimize" else (score > best_score)
            if is_better or best_params is None:
                best_score = score
                best_params = params
                # best_config = config  # vs. model_config, safe_load vs. model_validate
            logger.info(f"Tried {params} -> score={score}")
        self._best_config = {"user_option_values": best_params}
        logger.info(f"\nBest params: {best_params} | best score: {best_score}")
        self._leaderboard.sort(key=lambda conf: conf["score"], reverse=self._direction == "maximize")
        assert best_params == self._leaderboard[0]["config"], "best params is not the first in leaderboard"
        return self._leaderboard

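    # Searcher protocol assumed by the loop above (inferred from usage; not a
    # documented contract):
    #   reset(space)          -- (re)initialize the searcher from the search space
    #   ask() -> params|None  -- next candidate configuration, None when exhausted
    #   tell(params, score)   -- report the observed score back to the searcher
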
    @property
    def get_best_config(self):
        return self._best_config

    def write_best_config(self, output_yaml):
        if self._best_config is not None:
            write_yaml(output_yaml, self._best_config)
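
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of chap_core): it drives
# the ask/tell loop in get_leaderboard() with a hypothetical exhaustive grid
# searcher and a toy callable objective. train() would additionally need an
# Objective exposing `model_template`, so only get_leaderboard() is exercised.
if __name__ == "__main__":
    import itertools

    class _GridSearcher:
        """Enumerates the cross-product of the candidate value lists."""

        def reset(self, base_configs):
            names = list(base_configs or {})
            if not names:
                self._pending = []
                return
            self._pending = [
                dict(zip(names, values))
                for values in itertools.product(*(base_configs[name] for name in names))
            ]

        def ask(self):
            return self._pending.pop(0) if self._pending else None

        def tell(self, params, score):
            pass  # an informed searcher (e.g. TPE) would exploit the score here

    hpo = HpoModel(
        searcher=_GridSearcher(),
        objective=lambda params, dataset: params["lr"],  # toy score function
        direction="minimize",
        model_configuration={"lr": [0.1, 0.01], "n_estimators": [50, 100]},
    )
    leaderboard = hpo.get_leaderboard(dataset=None)
    print(leaderboard[0])  # best entry first, e.g. {'config': {...}, 'score': 0.01}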