"""Utilities for ablation study configurations."""
from __future__ import annotations
import itertools as itt
import json
import logging
import pathlib
import time
from collections.abc import Iterable, Mapping, Sequence
from typing import Any, TypedDict
from uuid import uuid4
from ..training import SLCWATrainingLoop, training_loop_resolver
from ..typing import OneOrSequence
from ..utils import normalize_path, normalize_string, upgrade_to_sequence
__all__ = [
"ablation_pipeline",
"ablation_pipeline_from_config",
"prepare_ablation_from_config",
"prepare_ablation_from_path",
"prepare_ablation",
]
logger = logging.getLogger(__name__)
Mapping2D = Mapping[str, Mapping[str, Any]]
Mapping3D = Mapping[str, Mapping[str, Mapping[str, Any]]]
class SplitToPathDict(TypedDict):
"""A mapping of the split keys to the paths in which the triples are stored."""
training: str | pathlib.Path
validation: str | pathlib.Path
testing: str | pathlib.Path
def ablation_pipeline(
datasets: OneOrSequence[str | SplitToPathDict],
directory: str | pathlib.Path,
models: str | list[str],
losses: str | list[str],
optimizers: str | list[str],
training_loops: str | list[str],
*,
epochs: int | None = None,
create_inverse_triples: bool | list[bool] = False,
regularizers: None | str | list[str] = None,
negative_sampler: str | None = None,
evaluator: str | None = None,
stopper: str | None = "NopStopper",
model_to_model_kwargs: Mapping2D | None = None,
model_to_model_kwargs_ranges: Mapping2D | None = None,
model_to_loss_to_loss_kwargs: Mapping3D | None = None,
model_to_loss_to_loss_kwargs_ranges: Mapping3D | None = None,
model_to_optimizer_to_optimizer_kwargs: Mapping3D | None = None,
model_to_optimizer_to_optimizer_kwargs_ranges: Mapping3D | None = None,
model_to_negative_sampler_to_negative_sampler_kwargs: Mapping3D | None = None,
model_to_negative_sampler_to_negative_sampler_kwargs_ranges: Mapping3D | None = None,
model_to_training_loop_to_training_loop_kwargs: Mapping3D | None = None,
model_to_training_loop_to_training_kwargs: Mapping3D | None = None,
model_to_training_loop_to_training_kwargs_ranges: Mapping3D | None = None,
model_to_regularizer_to_regularizer_kwargs: Mapping3D | None = None,
model_to_regularizer_to_regularizer_kwargs_ranges: Mapping3D | None = None,
evaluator_kwargs: Mapping[str, Any] | None = None,
evaluation_kwargs: Mapping[str, Any] | None = None,
stopper_kwargs: Mapping[str, Any] | None = None,
n_trials: int | None = 5,
timeout: int | None = 3600,
metric: str | None = "hits@10",
direction: str | None = "maximize",
sampler: str | None = "random",
pruner: str | None = "nop",
metadata: Mapping | None = None,
save_artifacts: bool = True,
move_to_cpu: bool = True,
dry_run: bool = False,
best_replicates: int | None = None,
discard_replicates: bool = False,
create_unique_subdir: bool = False,
) -> None:
"""Run ablation study.
:param datasets: A single or a list of dataset specifications. Datasets can be specified either by name (referring
to a single built-in dataset) or as a dictionary with paths for training, validation, and testing.
:param directory: The directory in which the experimental artifacts will be saved.
:param models: A model name or list of model names.
:param losses: A loss function name or list of loss function names.
:param optimizers: An optimizer name or list of optimizer names.
:param training_loops: A training loop name or list of training loop names.
:param epochs: A quick way to set the ``num_epochs`` in the training kwargs.
:param create_inverse_triples: Either a boolean for a single entry or a list of booleans.
:param regularizers: A regularizer name, list of regularizer names, or None if no regularizer is desired.
:param negative_sampler: A negative sampler name, or None if no negative sampler is desired. Negative sampling is
used only in combination with :class:`pykeen.training.SLCWATrainingLoop`.
:param evaluator: The name of the evaluator to be used. Defaults to the rank-based evaluator.
:param stopper: The name of the stopper to be used. Defaults to 'NopStopper', which does not define a stopping
criterion.
:param model_to_model_kwargs: A mapping from model name to dictionaries of default keyword arguments for the
instantiation of that model.
:param model_to_model_kwargs_ranges: A mapping from model name to dictionaries of keyword argument ranges for that
model to be used in HPO.
:param model_to_loss_to_loss_kwargs: A mapping from model name to a mapping of loss name to a mapping of default
keyword arguments for the instantiation of that loss function. This is useful because some losses, such as
:class:`pykeen.losses.MarginRankingLoss`, have hyper-parameters.
:param model_to_loss_to_loss_kwargs_ranges: A mapping from model name to a mapping of loss name to a mapping of
keyword argument ranges for that loss to be used in HPO.
:param model_to_optimizer_to_optimizer_kwargs: A mapping from model name to a mapping of optimizer name to a mapping
of default keyword arguments for the instantiation of that optimizer. This is useful because optimizers have
hyper-parameters such as the learning rate.
:param model_to_optimizer_to_optimizer_kwargs_ranges: A mapping from model name to a mapping of optimizer name to a
mapping of keyword argument ranges for that optimizer to be used in HPO.
:param model_to_regularizer_to_regularizer_kwargs: A mapping from model name to a mapping of regularizer name to a
mapping of default keyword arguments for the instantiation of that regularizer. This is useful because regularizers
have hyper-parameters such as the regularization weight.
:param model_to_regularizer_to_regularizer_kwargs_ranges: A mapping from model name to a mapping of regularizer name
to a mapping of keyword argument ranges for that regularizer to be used in HPO.
:param model_to_negative_sampler_to_negative_sampler_kwargs: A mapping from model name to a mapping of negative
sampler name to a mapping of default keyword arguments for the instantiation of that negative sampler. This is
useful because negative samplers have hyper-parameters such as the number of negatives generated for each positive
training example.
:param model_to_negative_sampler_to_negative_sampler_kwargs_ranges: A mapping from model name to a mapping of
negative sampler name to a mapping of keyword argument ranges for that negative sampler to be used in HPO.
:param model_to_training_loop_to_training_loop_kwargs: A mapping from model name to a mapping of training loop name
to a mapping of default keyword arguments for the training loop.
:param model_to_training_loop_to_training_kwargs: A mapping from model name to a mapping of trainer name to a
mapping of default keyword arguments for the training procedure. This is useful because you can set the
hyper-parameters such as the number of training epochs and the batch size.
:param model_to_training_loop_to_training_kwargs_ranges: A mapping from model name to a mapping of trainer name to a
mapping of keyword argument ranges for that trainer to be used in HPO.
:param evaluator_kwargs: The keyword arguments passed to the evaluator.
:param evaluation_kwargs: The keyword arguments passed during evaluation.
:param stopper_kwargs: The keyword arguments passed to the stopper.
:param n_trials: Number of HPO trials.
:param timeout: The time (seconds) after which the ablation study will be terminated.
:param metric: The metric to optimize during HPO.
:param direction: Defines whether to 'maximize' or 'minimize' the metric during HPO.
:param sampler: The HPO sampler; defaults to random search.
:param pruner: Defines the approach for pruning trials. Per default no pruning is used, i.e., the pruner is set to
'nop'.
:param metadata: A mapping of metadata arguments, such as the name of the ablation study.
:param save_artifacts: Defines whether each trained model sampled during HPO should be saved.
:param move_to_cpu: Defines whether a replicate of the best model should be moved to CPU.
:param dry_run: Defines whether only the configurations for the individual experiments should be created, without
running them.
:param best_replicates: Defines how often the final model should be re-trained and evaluated based on the best
hyper-parameters, enabling measurement of the variance in performance.
:param discard_replicates: Defines whether the best model should be discarded after training and evaluation.
:param create_unique_subdir: Defines whether a unique sub-directory for the experimental artifacts should be
created. The sub-directory name is composed of the current date and time plus a unique id.
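
A minimal usage sketch (the hyper-parameter range below is purely illustrative, not a shipped default):

.. code-block:: python

    from pykeen.ablation import ablation_pipeline

    ablation_pipeline(
        datasets=["Nations"],
        directory="ablation_results",
        models=["TransE", "DistMult"],
        losses=["MarginRankingLoss"],
        optimizers=["Adam"],
        training_loops=["sLCWA"],
        epochs=50,
        # hypothetical HPO range for the embedding dimension
        model_to_model_kwargs_ranges={
            "TransE": {"embedding_dim": {"type": "int", "low": 32, "high": 128, "q": 32}},
        },
        n_trials=2,
        dry_run=True,  # only write the configurations, do not run them
    )

Datasets may also be given as dictionaries of split paths, e.g.
``{"training": "train.txt", "validation": "valid.txt", "testing": "test.txt"}`` (paths here are hypothetical).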
"""
directory = normalize_path(directory, *iter_unique_ids(disable=not create_unique_subdir))
directories = prepare_ablation(
datasets=datasets,
models=models,
losses=losses,
optimizers=optimizers,
training_loops=training_loops,
epochs=epochs,
create_inverse_triples=create_inverse_triples,
regularizers=regularizers,
model_to_model_kwargs=model_to_model_kwargs,
model_to_model_kwargs_ranges=model_to_model_kwargs_ranges,
model_to_loss_to_loss_kwargs=model_to_loss_to_loss_kwargs,
model_to_loss_to_loss_kwargs_ranges=model_to_loss_to_loss_kwargs_ranges,
model_to_optimizer_to_optimizer_kwargs=model_to_optimizer_to_optimizer_kwargs,
model_to_optimizer_to_optimizer_kwargs_ranges=model_to_optimizer_to_optimizer_kwargs_ranges,
negative_sampler=negative_sampler,
model_to_neg_sampler_to_neg_sampler_kwargs=model_to_negative_sampler_to_negative_sampler_kwargs,
model_to_neg_sampler_to_neg_sampler_kwargs_ranges=model_to_negative_sampler_to_negative_sampler_kwargs_ranges,
model_to_training_loop_to_training_loop_kwargs=model_to_training_loop_to_training_loop_kwargs,
model_to_training_loop_to_training_kwargs=model_to_training_loop_to_training_kwargs,
model_to_training_loop_to_training_kwargs_ranges=model_to_training_loop_to_training_kwargs_ranges,
model_to_regularizer_to_regularizer_kwargs=model_to_regularizer_to_regularizer_kwargs,
model_to_regularizer_to_regularizer_kwargs_ranges=model_to_regularizer_to_regularizer_kwargs_ranges,
evaluator=evaluator,
n_trials=n_trials,
timeout=timeout,
metric=metric,
direction=direction,
sampler=sampler,
pruner=pruner,
evaluator_kwargs=evaluator_kwargs,
evaluation_kwargs=evaluation_kwargs,
stopper=stopper,
stopper_kwargs=stopper_kwargs,
metadata=metadata,
directory=directory,
save_artifacts=save_artifacts,
)
_run_ablation_experiments(
directories=directories,
best_replicates=best_replicates,
dry_run=dry_run,
move_to_cpu=move_to_cpu,
discard_replicates=discard_replicates,
)
def _run_ablation_experiments(
directories: Sequence[tuple[str | pathlib.Path, str | pathlib.Path]],
best_replicates: int | None = None,
dry_run: bool = False,
move_to_cpu: bool = True,
discard_replicates: bool = False,
) -> None:
"""Run ablation experiments."""
if dry_run:
return
from pykeen.hpo import hpo_pipeline_from_path
for output_directory, rv_config_path in directories:
if isinstance(output_directory, str):
output_directory = pathlib.Path(output_directory).resolve()
hpo_pipeline_result = hpo_pipeline_from_path(rv_config_path)
hpo_pipeline_result.save_to_directory(output_directory)
if not best_replicates:
continue
best_pipeline_dir = output_directory.joinpath("best_pipeline")
best_pipeline_dir.mkdir(exist_ok=True, parents=True)
logger.info("Re-training best pipeline and saving artifacts in %s", best_pipeline_dir)
hpo_pipeline_result.replicate_best_pipeline(
replicates=best_replicates,
move_to_cpu=move_to_cpu,
save_replicates=not discard_replicates,
directory=best_pipeline_dir,
)
def iter_unique_ids(disable: bool = False) -> Iterable[str]:
"""Iterate unique id to append to a path."""
if disable:
return
datetime = time.strftime("%Y-%m-%d-%H-%M")
yield f"{datetime}_{uuid4()}"
def ablation_pipeline_from_config(
config: Mapping[str, Any],
directory: str,
*,
dry_run: bool = False,
best_replicates: int | None = None,
save_artifacts: bool = True,
move_to_cpu: bool = True,
discard_replicates: bool = False,
) -> None:
"""Generate a set of HPO configurations.
A sample file can be run with ``pykeen experiment ablation tests/resources/hpo_complex_nations.json``.
:param config: Dictionary defining the ablation studies.
:param directory: The directory in which the experimental artifacts will be saved.
:param dry_run: Defines whether only the configurations for the individual experiments should be created, without
running them.
:param best_replicates: Defines how often the final model should be re-trained and evaluated based on the best
hyper-parameters, enabling measurement of the variance in performance.
:param save_artifacts: Defines whether each trained model sampled during HPO should be saved.
:param move_to_cpu: Defines whether a replicate of the best model should be moved to CPU. We recommend setting this
flag to 'True' to avoid unnecessary GPU usage.
:param discard_replicates: Defines whether the best model should be discarded after training and evaluation.
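
A sketch of the expected configuration layout (keys mirror the keyword arguments of :func:`ablation_pipeline`; the
concrete values are only assumptions for illustration):

.. code-block:: python

    config = {
        "metadata": {"title": "Ablation over loss functions"},
        "datasets": ["Nations"],
        "models": ["TransE"],
        "losses": ["MarginRankingLoss", "BCEWithLogitsLoss"],
        "optimizers": ["Adam"],
        "training_loops": ["sLCWA"],
        "n_trials": 2,
    }
    ablation_pipeline_from_config(config, directory="ablation_results", dry_run=True)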
"""
ablation_pipeline(
**config,
directory=directory,
dry_run=dry_run,
best_replicates=best_replicates,
save_artifacts=save_artifacts,
move_to_cpu=move_to_cpu,
discard_replicates=discard_replicates,
)
def prepare_ablation_from_path(
path: str | pathlib.Path,
directory: str | pathlib.Path,
save_artifacts: bool,
) -> list[tuple[pathlib.Path, pathlib.Path]]:
"""Prepare a set of ablation study directories.
:param path: Path to configuration file defining the ablation studies.
:param directory: The directory in which the experimental artifacts (including the ablation configurations) will be
saved.
:param save_artifacts: Defines whether the output directories for the trained models sampled during HPO should be
created.
:returns: Pairs of output directories and HPO config paths inside those directories.
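
A short usage sketch, assuming a ``config.json`` (hypothetical path) that follows the same layout as expected by
:func:`prepare_ablation_from_config`:

.. code-block:: python

    pairs = prepare_ablation_from_path(
        path="config.json",
        directory="ablation_results",
        save_artifacts=True,
    )
    for output_directory, hpo_config_path in pairs:
        print(output_directory, hpo_config_path)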
"""
directory = normalize_path(directory, *iter_unique_ids())
with open(path) as file:
config = json.load(file)
return prepare_ablation_from_config(config=config, directory=directory, save_artifacts=save_artifacts)
def prepare_ablation_from_config(
config: Mapping[str, Any],
directory: str | pathlib.Path,
save_artifacts: bool,
) -> list[tuple[pathlib.Path, pathlib.Path]]:
"""Prepare a set of ablation study directories.
:param config: Dictionary defining the ablation studies.
:param directory: The directory in which the experimental artifacts (including the ablation configurations) will be
saved.
:param save_artifacts: Defines whether the output directories for the trained models sampled during HPO should be
created.
:returns: Pairs of output directories and HPO config paths inside those directories.
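
The configuration is expected to contain the three top-level sections ``metadata``, ``optuna``, and ``ablation``,
which are unpacked below; a minimal sketch (values are illustrative assumptions):

.. code-block:: python

    config = {
        "metadata": {"title": "My ablation study"},
        "optuna": {"n_trials": 2, "sampler": "random", "pruner": "nop"},
        "ablation": {
            "datasets": ["Nations"],
            "models": ["TransE"],
            "losses": ["MarginRankingLoss"],
            "optimizers": ["Adam"],
            "training_loops": ["sLCWA"],
        },
    }
    pairs = prepare_ablation_from_config(config, directory="ablation_results", save_artifacts=False)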
"""
metadata = config["metadata"]
optuna_config = config["optuna"]
ablation_config = config["ablation"]
return prepare_ablation(
**ablation_config,
**optuna_config,
metadata=metadata,
directory=directory,
save_artifacts=save_artifacts,
)
def path_to_str(x: object) -> str:
"""Convert path to string and error on everything which is not a path."""
if isinstance(x, pathlib.Path):
return x.as_posix()
raise TypeError(x)
def prepare_ablation( # noqa:C901
datasets: OneOrSequence[str | SplitToPathDict],
models: OneOrSequence[str],
losses: OneOrSequence[str],
optimizers: OneOrSequence[str],
training_loops: OneOrSequence[str],
directory: str | pathlib.Path,
*,
create_inverse_triples: OneOrSequence[bool] = False,
regularizers: OneOrSequence[None | str] = None,
epochs: int | None = None,
negative_sampler: str | None = None,
evaluator: str | None = None,
model_to_model_kwargs: Mapping2D | None = None,
model_to_model_kwargs_ranges: Mapping2D | None = None,
model_to_loss_to_loss_kwargs: Mapping3D | None = None,
model_to_loss_to_loss_kwargs_ranges: Mapping3D | None = None,
model_to_optimizer_to_optimizer_kwargs: Mapping3D | None = None,
model_to_optimizer_to_optimizer_kwargs_ranges: Mapping3D | None = None,
model_to_training_loop_to_training_loop_kwargs: Mapping3D | None = None,
model_to_neg_sampler_to_neg_sampler_kwargs: Mapping3D | None = None,
model_to_neg_sampler_to_neg_sampler_kwargs_ranges: Mapping3D | None = None,
model_to_training_loop_to_training_kwargs: Mapping3D | None = None,
model_to_training_loop_to_training_kwargs_ranges: Mapping3D | None = None,
model_to_regularizer_to_regularizer_kwargs: Mapping3D | None = None,
model_to_regularizer_to_regularizer_kwargs_ranges: Mapping3D | None = None,
n_trials: int | None = 5,
timeout: int | None = 3600,
metric: str | None = "hits@10",
direction: str | None = "maximize",
sampler: str | None = "random",
pruner: str | None = "nop",
evaluator_kwargs: Mapping[str, Any] | None = None,
evaluation_kwargs: Mapping[str, Any] | None = None,
stopper: str | None = "NopStopper",
stopper_kwargs: Mapping[str, Any] | None = None,
metadata: Mapping | None = None,
save_artifacts: bool = True,
) -> list[tuple[pathlib.Path, pathlib.Path]]:
"""Prepare an ablation directory.
:param datasets: A single or a list of dataset specifications. Datasets can be specified either by name (referring
to a single built-in dataset) or as a dictionary with paths for training, validation, and testing.
:param models: A model name or list of model names.
:param losses: A loss function name or list of loss function names.
:param optimizers: An optimizer name or list of optimizer names.
:param training_loops: A training loop name or list of training loop names.
:param epochs: A quick way to set the ``num_epochs`` in the training kwargs.
:param create_inverse_triples: Either a boolean for a single entry or a list of booleans.
:param regularizers: A regularizer name, list of regularizer names, or None if no regularizer is desired.
:param negative_sampler: A negative sampler name, or None if no negative sampler is desired. Negative sampling is
used only in combination with :class:`pykeen.training.SLCWATrainingLoop`.
:param evaluator: The name of the evaluator to be used. Defaults to the rank-based evaluator.
:param stopper: The name of the stopper to be used. Defaults to 'NopStopper', which does not define a stopping
criterion.
:param model_to_model_kwargs: A mapping from model name to dictionaries of default keyword arguments for the
instantiation of that model.
:param model_to_model_kwargs_ranges: A mapping from model name to dictionaries of keyword argument ranges for that
model to be used in HPO.
:param model_to_loss_to_loss_kwargs: A mapping from model name to a mapping of loss name to a mapping of default
keyword arguments for the instantiation of that loss function. This is useful because some losses, such as
:class:`pykeen.losses.MarginRankingLoss`, have hyper-parameters.
:param model_to_loss_to_loss_kwargs_ranges: A mapping from model name to a mapping of loss name to a mapping of
keyword argument ranges for that loss to be used in HPO.
:param model_to_optimizer_to_optimizer_kwargs: A mapping from model name to a mapping of optimizer name to a mapping
of default keyword arguments for the instantiation of that optimizer. This is useful because optimizers have
hyper-parameters such as the learning rate.
:param model_to_optimizer_to_optimizer_kwargs_ranges: A mapping from model name to a mapping of optimizer name to a
mapping of keyword argument ranges for that optimizer to be used in HPO.
:param model_to_regularizer_to_regularizer_kwargs: A mapping from model name to a mapping of regularizer name to a
mapping of default keyword arguments for the instantiation of that regularizer. This is useful because regularizers
have hyper-parameters such as the regularization weight.
:param model_to_regularizer_to_regularizer_kwargs_ranges: A mapping from model name to a mapping of regularizer name
to a mapping of keyword argument ranges for that regularizer to be used in HPO.
:param model_to_neg_sampler_to_neg_sampler_kwargs: A mapping from model name to a mapping of negative sampler name
to a mapping of default keyword arguments for the instantiation of that negative sampler. This is useful because
negative samplers have hyper-parameters such as the number of negatives generated for each positive training
example.
:param model_to_neg_sampler_to_neg_sampler_kwargs_ranges: A mapping from model name to a mapping of negative sampler
name to a mapping of keyword argument ranges for that negative sampler to be used in HPO.
:param model_to_training_loop_to_training_loop_kwargs: A mapping from model name to a mapping of training loop name
to a mapping of default keyword arguments for the training loop.
:param model_to_training_loop_to_training_kwargs: A mapping from model name to a mapping of trainer name to a
mapping of default keyword arguments for the training procedure. This is useful because you can set the
hyper-parameters such as the number of training epochs and the batch size.
:param model_to_training_loop_to_training_kwargs_ranges: A mapping from model name to a mapping of trainer name to a
mapping of keyword argument ranges for that trainer to be used in HPO.
:param evaluator_kwargs: The keyword arguments passed to the evaluator.
:param evaluation_kwargs: The keyword arguments passed during evaluation.
:param stopper_kwargs: The keyword arguments passed to the stopper.
:param n_trials: Number of HPO trials.
:param timeout: The time (seconds) after which the ablation study will be terminated.
:param metric: The metric to optimize during HPO.
:param direction: Defines whether to 'maximize' or 'minimize' the metric during HPO.
:param sampler: The HPO sampler; defaults to random search.
:param pruner: Defines the approach for pruning trials. Per default no pruning is used, i.e., the pruner is set to
'nop'.
:param metadata: A mapping of metadata arguments, such as the name of the ablation study.
:param directory: The directory in which the experimental artifacts will be saved.
:param save_artifacts: Defines whether each trained model sampled during HPO should be saved.
:returns: Pairs of output directories and HPO config paths inside those directories.
:raises ValueError: If the dataset is not specified correctly, i.e., dataset is not of type str, or a dictionary
containing the paths to the training, testing, and validation data.
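
The returned pairs can be fed directly into the HPO pipeline; a minimal sketch (the dataset and model choices are
illustrative):

.. code-block:: python

    from pykeen.hpo import hpo_pipeline_from_path

    pairs = prepare_ablation(
        datasets="Nations",
        models="TransE",
        losses="MarginRankingLoss",
        optimizers="Adam",
        training_loops="sLCWA",
        directory="ablation_results",
        n_trials=2,
    )
    for output_directory, hpo_config_path in pairs:
        result = hpo_pipeline_from_path(hpo_config_path)
        result.save_to_directory(output_directory)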
"""
directory = normalize_path(path=directory)
datasets = upgrade_to_sequence(datasets)
create_inverse_triples = upgrade_to_sequence(create_inverse_triples)
models = upgrade_to_sequence(models)
losses = upgrade_to_sequence(losses)
optimizers = upgrade_to_sequence(optimizers)
training_loops = upgrade_to_sequence(training_loops)
regularizers = upgrade_to_sequence(regularizers)
# note: for some reason, mypy does not properly recognize the tuple[T1, T2, T3] notation,
# but rather uses tuple[T1 | T2 | T3, ...]
it: Iterable[
tuple[
# dataset
str | SplitToPathDict,
# create inverse triples
bool,
# models, losses
str,
str,
# regularizers
str | None,
# optimizers, training loops
str,
str,
]
]
it = itt.product( # type: ignore
datasets,
create_inverse_triples,
models,
losses,
regularizers,
optimizers,
training_loops,
)
if not model_to_model_kwargs:
model_to_model_kwargs = {}
if not model_to_model_kwargs_ranges:
model_to_model_kwargs_ranges = {}
directories = []
for counter, (
dataset,
this_create_inverse_triples,
model,
loss,
regularizer,
optimizer,
training_loop,
) in enumerate(it):
dataset_name = normalize_string(dataset) if isinstance(dataset, str) else "user_data"
experiment_name = f"{counter:04d}_{dataset_name}_{normalize_string(model)}"
output_directory = directory.joinpath(experiment_name)
output_directory.mkdir(exist_ok=True, parents=True)
# TODO what happens if already exists?
_experiment_optuna_config = {
"n_trials": n_trials,
"timeout": timeout,
"metric": metric,
"direction": direction,
"sampler": sampler,
"pruner": pruner,
}
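# persist the Optuna study of this experiment in a SQLite database inside its output directory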
_experiment_optuna_config["storage"] = f"sqlite:///{output_directory.as_posix()}/optuna_results.db"
if save_artifacts:
save_model_directory = output_directory.joinpath("artifacts")
save_model_directory.mkdir(exist_ok=True, parents=True)
_experiment_optuna_config["save_model_directory"] = save_model_directory.as_posix()
hpo_config: dict[str, Any] = dict()
hpo_config["stopper"] = stopper
if stopper_kwargs is not None:
hpo_config["stopper_kwargs"] = stopper_kwargs
# TODO incorporate setting of random seed
# pipeline_kwargs=dict(
# random_seed=random_non_negative_int(),
# ),
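# helper closing over ``hpo_config`` and the current ``model``: copies the (model, component)-specific
# kwargs / kwargs_ranges from the given nested mapping into the HPO config, if any are defined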
def _set_arguments(config: Mapping3D | None, key: str, value: str) -> None:
"""Set argument and its values."""
d = {}
d[key] = {} if config is None else config.get(model, {}).get(value, {}) # noqa:B023
if d[key]:
hpo_config.update(d) # noqa:B023
# Add dataset to the HPO config
if isinstance(dataset, str):
hpo_config["dataset"] = dataset
elif isinstance(dataset, dict):
# Training, test, and validation paths are provided
hpo_config["training"] = dataset["training"]
hpo_config["testing"] = dataset["testing"]
hpo_config["validation"] = dataset["validation"]
else:
raise ValueError(
"Dataset must be either the dataset name, i.e., of type str, or a dictionary containing\n"
"the paths to the training, testing, and validation data.",
)
logger.info(f"Dataset: {dataset}")
hpo_config["dataset_kwargs"] = dict(create_inverse_triples=this_create_inverse_triples)
logger.info(f"Add inverse triples: {this_create_inverse_triples}")
hpo_config["model"] = model
hpo_config["model_kwargs"] = model_to_model_kwargs.get(model, {})
hpo_config["model_kwargs_ranges"] = model_to_model_kwargs_ranges.get(model, {})
logger.info(f"Model: {model}")
# Add loss function to the HPO config
hpo_config["loss"] = loss
_set_arguments(config=model_to_loss_to_loss_kwargs, key="loss_kwargs", value=loss)
_set_arguments(config=model_to_loss_to_loss_kwargs_ranges, key="loss_kwargs_ranges", value=loss)
logger.info(f"Loss functions: {loss}")
# Add regularizer to the HPO config
if regularizer is not None:
hpo_config["regularizer"] = regularizer
_set_arguments(
config=model_to_regularizer_to_regularizer_kwargs,
key="regularizer_kwargs",
value=regularizer,
)
_set_arguments(
config=model_to_regularizer_to_regularizer_kwargs_ranges,
key="regularizer_kwargs_ranges",
value=regularizer,
)
logger.info(f"Regularizer: {regularizer}")
# Add optimizer to the HPO config
hpo_config["optimizer"] = optimizer
_set_arguments(config=model_to_optimizer_to_optimizer_kwargs, key="optimizer_kwargs", value=optimizer)
_set_arguments(
config=model_to_optimizer_to_optimizer_kwargs_ranges,
key="optimizer_kwargs_ranges",
value=optimizer,
)
logger.info(f"Optimizer: {optimizer}")
# Add training approach to the HPO config
hpo_config["training_loop"] = training_loop
_set_arguments(
config=model_to_training_loop_to_training_loop_kwargs,
key="training_loop_kwargs",
value=training_loop,
)
_set_arguments(config=model_to_training_loop_to_training_kwargs, key="training_kwargs", value=training_loop)
_set_arguments(
config=model_to_training_loop_to_training_kwargs_ranges,
key="training_kwargs_ranges",
value=training_loop,
)
logger.info(f"Training loop: {training_loop}")
if issubclass(training_loop_resolver.lookup(training_loop), SLCWATrainingLoop):
negative_sampler = negative_sampler or "basic" # default to basic
_set_arguments(
config=model_to_neg_sampler_to_neg_sampler_kwargs,
key="negative_sampler_kwargs",
value=negative_sampler,
)
_set_arguments(
config=model_to_neg_sampler_to_neg_sampler_kwargs_ranges,
key="negative_sampler_kwargs_ranges",
value=negative_sampler,
)
logger.info(f"Negative sampler: {negative_sampler}")
# Add evaluation
hpo_config["evaluator"] = evaluator
if evaluator_kwargs:
hpo_config["evaluator_kwargs"] = evaluator_kwargs
hpo_config["evaluation_kwargs"] = evaluation_kwargs or {}
logger.info(f"Evaluator: {evaluator}")
if epochs is not None:
hpo_config.setdefault("training_kwargs", {}).setdefault("num_epochs", epochs)
rv_config = dict(
type="hpo",
metadata=metadata or {},
pipeline=hpo_config,
optuna=_experiment_optuna_config,
)
rv_config_path = output_directory.joinpath("hpo_config.json")
with rv_config_path.open("w") as file:
# paths need to be encoded as strings to make them JSON-serializable
json.dump(rv_config, file, indent=2, ensure_ascii=True, default=path_to_str)
directories.append((output_directory, rv_config_path))
return directories