from datetime import datetime
from typing import TYPE_CHECKING, Literal
import git
from chap_core.exceptions import InvalidModelException
from chap_core.external.external_model import logger
from chap_core.external.model_configuration import ModelTemplateConfigV2
from chap_core.models.external_chapkit_model import ExternalChapkitModelTemplate
from chap_core.models.model_template import ModelTemplate
import shutil
import uuid
import yaml
import logging
import os
from pathlib import Path
if TYPE_CHECKING:
from chap_core.models.external_model import ExternalModel
def _get_working_dir(model_path, base_working_dir, run_dir_type, model_name):
if run_dir_type == "use_existing" and not Path(model_path).exists():
logging.warning(
f"Model path {model_path} does not exist. Will create a directory for the run (using the name 'latest')"
)
run_dir_type = "latest"
if run_dir_type == "latest":
working_dir = base_working_dir / model_name / "latest"
# clear working dir
if working_dir.exists():
logger.info(f"Removing previous working dir {working_dir}")
shutil.rmtree(working_dir)
elif run_dir_type == "timestamp":
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
unique_identifier = timestamp + "_" + str(uuid.uuid4())[:8]
# timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S%f")
working_dir = base_working_dir / model_name / unique_identifier
# check that working dir does not exist
assert not working_dir.exists(), (
f"Working dir {working_dir} already exists. This should not happen if make_run_dir is True"
)
elif run_dir_type == "use_existing":
working_dir = Path(model_path)
else:
raise ValueError(f"Invalid run_dir_type: {run_dir_type}")
return run_dir_type, working_dir
def _get_model_code_base(model_path, base_working_dir, run_dir_type):
is_github = False
commit = None
if isinstance(model_path, str) and model_path.startswith("https://github.com"):
dir_name = model_path.split("/")[-1].replace(".git", "")
model_name = dir_name
if "@" in model_path:
model_path, commit = model_path.split("@")
is_github = True
else:
model_name = Path(model_path).name
run_dir_type, working_dir = _get_working_dir(model_path, base_working_dir, run_dir_type, model_name)
logger.info(f"Writing results to {working_dir}")
if is_github:
working_dir.mkdir(parents=True)
if commit:
# For specific commits, clone with --filter to minimize download
# then fetch only the specific commit
logger.info(f"Cloning repository with specific commit {commit}")
repo = git.Repo.clone_from(model_path, working_dir, filter="blob:none", no_checkout=True)
# Fetch only the specific commit with minimal history
repo.git.fetch("origin", commit, depth=1)
repo.git.checkout(commit)
else:
# For latest branch, use shallow clone with depth=1
logger.info(f"Cloning repository {model_path} (shallow clone)")
repo = git.Repo.clone_from(model_path, working_dir, depth=1)
elif run_dir_type == "use_existing":
logging.info("Not copying any model files, using existing directory")
else:
# copy contents of model_path to working_dir
logger.info(f"Copying files from {model_path} to {working_dir}")
shutil.copytree(
model_path,
working_dir,
ignore=lambda dir, contents: list({".venv", "venv"}.intersection(contents)),
dirs_exist_ok=True,
)
return working_dir
[docs]
def get_model_template_from_mlproject_file(mlproject_file, ignore_env=False, working_dir=None) -> ModelTemplate:
if working_dir is None:
working_dir = Path(mlproject_file).parent
else:
working_dir = Path(working_dir)
with open(mlproject_file, "r") as file:
config = yaml.load(file, Loader=yaml.FullLoader)
config = ModelTemplateConfigV2.model_validate(config)
model_template = ModelTemplate(config, working_dir, ignore_env)
return model_template
[docs]
def get_model_template_from_directory_or_github_url(
model_template_path,
base_working_dir=Path("runs/"),
ignore_env=False,
run_dir_type="timestamp",
is_chapkit_model: bool = False,
) -> ModelTemplate:
"""
Note: Preferably use ModelTemplate.from_directory_or_github_url instead of
using this function directly. This function may be depcrecated in the future.
Gets the model template and initializes a working directory with the code for the model.
model_path can be a local directory or github url
Parameters
----------
model_template_path : str
Path to the model. Can be a local directory or a github url
base_working_dir : Path, optional
Base directory to store the working directory, by default Path("runs/")
ignore_env : bool, optional
If True, will ignore the environment specified in the MLproject file, by default False
run_dir_type : Literal["timestamp", "latest", "use_existing"], optional
Type of run directory to create, by default "timestamp", which creates a new directory based on current timestamp for the run.
"latest" will create a new directory based on the model name, but will remove any existing directory with the same name.
"use_existing" will use the existing directory specified by the model path if that exists. If that does not exist, "latest" will be used.
"""
if is_chapkit_model:
logger.info("Model is chapkit model")
# For now, we assume that if a model template has a url on localhost it is
# a chapkit model
template = ExternalChapkitModelTemplate(model_template_path)
assert template.name is not None, template
return template
logger.info(
f"Getting model template from {model_template_path}. Ignore env: {ignore_env}. Base working dir: {base_working_dir}. Run dir type: {run_dir_type}"
)
working_dir = _get_model_code_base(model_template_path, base_working_dir, run_dir_type)
logger.info(f"Current directory is {os.getcwd()}, working dir is {working_dir.absolute()}")
assert os.path.isdir(working_dir), working_dir
assert os.path.isdir(os.path.abspath(working_dir)), working_dir
# assert that a config file exists
if not (working_dir / "MLproject").exists():
raise InvalidModelException("No MLproject file found in model directory")
template = get_model_template_from_mlproject_file(working_dir / "MLproject", ignore_env=ignore_env)
return template
[docs]
def get_model_from_directory_or_github_url(
model_template_path,
base_working_dir=Path("runs/"),
ignore_env=False,
run_dir_type: Literal["timestamp", "latest", "use_existing"] = "timestamp",
model_configuration_yaml: str = None,
) -> "ExternalModel":
"""
NOTE: This function is deprecated, can be removed in the future.
Gets the model and initializes a working directory with the code for the model.
model_path can be a local directory or github url
Parameters
----------
model_template_path : str
Path to the model. Can be a local directory or a github url
base_working_dir : Path, optional
Base directory to store the working directory, by default Path("runs/")
ignore_env : bool, optional
If True, will ignore the environment specified in the MLproject file, by default False
run_dir_type : Literal["timestamp", "latest", "use_existing"], optional
Type of run directory to create, by default "timestamp", which creates a new directory based on current timestamp for the run.
"latest" will create a new directory based on the model name, but will remove any existing directory with the same name.
"use_existing" will use the existing directory specified by the model path if that exists. If that does not exist, "latest" will be used.
model_configuration_yaml : str, optional
Path to the model configuration yaml file, by default None. This has to be a yaml that is compatible with the model configuration class given by the ModelTemplate.
"""
template = get_model_template_from_directory_or_github_url(
model_template_path, ignore_env=ignore_env, run_dir_type=run_dir_type
)
model_configuration = None
# config_class = template.get_config_class()
if model_configuration_yaml:
with open(model_configuration_yaml, "r") as file:
model_configuration = yaml.load(file, Loader=yaml.FullLoader)
# model_configuration = config_class.model_validate(model_configuration)
return template.get_model(model_configuration=model_configuration)