"""Utility classes for constructing inductive datasets."""
from __future__ import annotations
import logging
import pathlib
from collections.abc import Iterable, Mapping
from dataclasses import dataclass
from typing import Any
from pystow.utils import download, name_from_url
from tabulate import tabulate
from ...constants import PYKEEN_DATASETS
from ...triples import CoreTriplesFactory, TriplesFactory
from ...utils import normalize_path
# Explicit public API: controls what is exported on star-import of this module.
__all__ = [
# Base class
"InductiveDataset",
# Mid-level classes
"EagerInductiveDataset",
"LazyInductiveDataset",
"DisjointInductivePathDataset",
"UnpackedRemoteDisjointInductiveDataset",
]
# Module-level logger, named after this module per the standard logging convention.
logger = logging.getLogger(__name__)
class InductiveDataset:
    """Contains transductive train and inductive inference/validation/test datasets."""

    #: A factory wrapping the training triples
    transductive_training: CoreTriplesFactory
    #: A factory wrapping the inductive inference triples that MIGHT or MIGHT NOT
    #: share indices with the transductive training
    inductive_inference: CoreTriplesFactory
    #: A factory wrapping the testing triples, that share indices with the INDUCTIVE INFERENCE triples
    inductive_testing: CoreTriplesFactory
    #: A factory wrapping the validation triples, that share indices with the INDUCTIVE INFERENCE triples
    inductive_validation: CoreTriplesFactory | None = None
    #: All datasets should take care of inverse triple creation
    create_inverse_triples: bool = True

    def _summary_rows(self):
        """Build one ``(label, num_entities, num_relations, num_triples)`` row per available factory.

        The inductive validation factory is optional (see class attribute); when it is ``None``,
        its row is omitted instead of raising an ``AttributeError``.
        """
        labeled_factories = (
            ("Transductive Training", self.transductive_training),
            ("Inductive Inference", self.inductive_inference),
            ("Inductive Testing", self.inductive_testing),
            ("Inductive Validation", self.inductive_validation),
        )
        return [
            (label, triples_factory.num_entities, triples_factory.num_relations, triples_factory.num_triples)
            for label, triples_factory in labeled_factories
            # skip the optional validation factory when it is absent
            if triples_factory is not None
        ]

    def summary_str(self, title: str | None = None, show_examples: int | None = 5, end: str = "\n") -> str:
        """Make a summary string of all of the factories.

        :param title: An optional title for the summary; defaults to the class name.
        :param show_examples: If positive, append this many labeled example triples taken
            from the transductive training factory.
        :param end: A string appended to the end of the summary (defaults to a newline).
        :returns: The formatted summary table, optionally followed by example triples.
        :raises AttributeError: If examples were requested but the training factory does not
            carry labeling information (i.e., is not a :class:`TriplesFactory`).
        """
        rows = self._summary_rows()
        n_triples = sum(count for *_, count in rows)
        rows.append(("Total", "-", "-", n_triples))
        t = tabulate(rows, headers=["Name", "Entities", "Relations", "Triples"])
        rv = f"{title or self.__class__.__name__} (create_inverse_triples={self.create_inverse_triples})\n{t}"
        if show_examples:
            if not isinstance(self.transductive_training, TriplesFactory):
                raise AttributeError(f"{self.transductive_training.__class__} does not have labeling information.")
            examples = tabulate(
                self.transductive_training.label_triples(self.transductive_training.mapped_triples[:show_examples]),
                # fixed: header capitalization was inconsistent ("tail" -> "Tail")
                headers=["Head", "Relation", "Tail"],
            )
            rv += "\n" + examples
        return rv + end

    def summarize(self, title: str | None = None, show_examples: int | None = 5, file=None) -> None:
        """Print a summary of the dataset.

        :param title: An optional title for the summary.
        :param show_examples: If positive, print this many example triples.
        :param file: The file-like object to print to; defaults to stdout.
        """
        print(self.summary_str(title=title, show_examples=show_examples), file=file)  # noqa:T201

    def __str__(self) -> str:  # noqa: D105
        return (
            f"{self.__class__.__name__}(Training num_entities={self.transductive_training.num_entities},"
            f" num_relations={self.transductive_training.num_relations})"
        )
@dataclass
class EagerInductiveDataset(InductiveDataset):
    """An eager inductive dataset, with all triples factories provided up-front."""

    #: A factory wrapping the training triples
    transductive_training: CoreTriplesFactory
    #: A factory wrapping the inductive inference triples
    inductive_inference: CoreTriplesFactory
    #: A factory wrapping the testing triples, which shares indices with the inductive inference triples
    inductive_testing: CoreTriplesFactory
    #: An optional factory wrapping the validation triples, which shares indices
    #: with the inductive inference triples
    inductive_validation: CoreTriplesFactory | None = None
    #: Whether inverse triples should be / have been created
    create_inverse_triples: bool = True
class LazyInductiveDataset(InductiveDataset):
    """An inductive dataset that has lazy loading."""

    #: The actual instance of the training factory, which is exposed to the user through `transductive_training`
    _transductive_training: TriplesFactory | None = None
    #: The actual instance of the inductive inference factory,
    #: which is exposed to the user through `inductive_inference`
    _inductive_inference: TriplesFactory | None = None
    #: The actual instance of the testing factory, which is exposed to the user through `inductive_testing`
    _inductive_testing: TriplesFactory | None = None
    #: The actual instance of the validation factory, which is exposed to the user through `inductive_validation`
    _inductive_validation: TriplesFactory | None = None
    #: The directory in which the cached data is stored
    cache_root: pathlib.Path

    @property
    def transductive_training(self) -> TriplesFactory:  # type:ignore # noqa: D401
        """The training triples factory."""
        if not self._loaded:
            self._load()
        assert self._transductive_training is not None
        return self._transductive_training

    @property
    def inductive_inference(self) -> TriplesFactory:  # type:ignore # noqa: D401
        """The inductive inference triples factory. MIGHT or MIGHT NOT share indices with the transductive train."""
        if not self._loaded:
            self._load()
        assert self._inductive_inference is not None
        return self._inductive_inference

    @property
    def inductive_testing(self) -> TriplesFactory:  # type:ignore # noqa: D401
        """The testing triples factory that share indices with the INDUCTIVE INFERENCE triples factory."""
        if not self._loaded:
            self._load()
        assert self._inductive_testing is not None
        return self._inductive_testing

    @property
    def inductive_validation(self) -> TriplesFactory | None:  # type:ignore # noqa: D401
        """The validation triples factory that shares indices with the INDUCTIVE INFERENCE triples factory."""
        if not self._loaded:
            self._load()
        # fixed: validation is optional by declaration (return type includes None), so do not
        # assert that it was loaded — datasets without a validation split return None here
        return self._inductive_validation

    @property
    def _loaded(self) -> bool:
        """Return whether the mandatory factories (training and inference) have been loaded."""
        return self._transductive_training is not None and self._inductive_inference is not None

    def _load(self) -> None:
        """Load the triples factories; to be implemented by subclasses."""
        raise NotImplementedError

    def _load_validation(self) -> None:
        """Load the validation triples factory; to be implemented by subclasses."""
        raise NotImplementedError

    def _help_cache(
        self,
        cache_root: None | str | pathlib.Path,
        version: str | None = None,
        sep_train_inference: bool = False,
    ) -> pathlib.Path:
        """Get the appropriate cache root directory.

        :param cache_root: If none is passed, defaults to a subfolder of the
            PyKEEN home directory defined in :data:`pykeen.constants.PYKEEN_HOME`.
            The subfolder is named based on the class inheriting from
            :class:`pykeen.datasets.base.Dataset`.
        :param version: accepts a string "v1" to "v4" to select among Teru et al inductive datasets
        :param sep_train_inference: a flag to store training and inference splits in different folders
        :returns: A path object for the calculated cache root directory
        """
        cache_root = normalize_path(
            cache_root, *self._cache_sub_directories(version=version), default=PYKEEN_DATASETS, mkdir=True
        )
        if sep_train_inference:
            # generate subfolders 'training' and 'inference'
            for name in ("training", "inference"):
                cache_root.joinpath(name).mkdir(parents=True, exist_ok=True)
        logger.debug("using cache root at %s", cache_root.as_uri())
        return cache_root

    def _cache_sub_directories(self, version: str | None) -> Iterable[str]:
        """Iterate over appropriate cache sub-directory.

        :param version: an optional inductive-split version ("v1" to "v4"), appended
            as an extra sub-directory level when given.
        """
        # TODO: use class-resolver normalize?
        yield self.__class__.__name__.lower()
        # add v1 / v2 / v3 / v4 for inductive splits if available
        if version:
            yield version
class DisjointInductivePathDataset(LazyInductiveDataset):
    """A disjoint inductive dataset specified by paths.

    Contains a lazy reference to a training, inductive inference, inductive testing, and inductive
    validation dataset. In this dataset, inductive inference is disjoint with the transductive train.
    """

    def __init__(
        self,
        transductive_training_path: str | pathlib.Path,
        inductive_inference_path: str | pathlib.Path,
        inductive_testing_path: str | pathlib.Path,
        # fixed: annotation previously contained a duplicated union member (str | str | pathlib.Path)
        inductive_validation_path: str | pathlib.Path,
        eager: bool = False,
        create_inverse_triples: bool = False,
        load_triples_kwargs: Mapping[str, Any] | None = None,
    ) -> None:
        """Initialize the dataset.

        :param transductive_training_path: Path to the transductive training triples file.
        :param inductive_inference_path: Path to the inductive inference triples file.
        :param inductive_testing_path: Path to the inductive testing triples file.
        :param inductive_validation_path: Path to the inductive validation triples file.
        :param eager: Should the data be loaded eagerly? Defaults to false.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        :param load_triples_kwargs: Arguments to pass through to :func:`TriplesFactory.from_path`
            and ultimately through to :func:`pykeen.triples.utils.load_triples`.
        """
        self.transductive_training_path = pathlib.Path(transductive_training_path)
        self.inductive_inference_path = pathlib.Path(inductive_inference_path)
        self.inductive_testing_path = pathlib.Path(inductive_testing_path)
        self.inductive_validation_path = pathlib.Path(inductive_validation_path)
        self.create_inverse_triples = create_inverse_triples
        self.load_triples_kwargs = load_triples_kwargs
        if eager:
            self._load()

    def _load(self) -> None:
        """Load all four triples factories from their paths, sharing label-to-id mappings appropriately."""
        self._transductive_training = TriplesFactory.from_path(
            path=self.transductive_training_path,
            create_inverse_triples=self.create_inverse_triples,
            load_triples_kwargs=self.load_triples_kwargs,
        )
        # important: inductive_inference shares the same RELATIONS with the transductive training graph
        self._inductive_inference = TriplesFactory.from_path(
            path=self.inductive_inference_path,
            create_inverse_triples=self.create_inverse_triples,
            relation_to_id=self._transductive_training.relation_to_id,
            load_triples_kwargs=self.load_triples_kwargs,
        )
        # inductive validation shares both ENTITIES and RELATIONS with the inductive inference graph
        self._inductive_validation = TriplesFactory.from_path(
            path=self.inductive_validation_path,
            entity_to_id=self._inductive_inference.entity_to_id,  # shares entity index with inductive inference
            relation_to_id=self._inductive_inference.relation_to_id,  # shares relation index with inductive inference
            # do not explicitly create inverse triples for validation; this is handled by the evaluation code
            create_inverse_triples=False,
            load_triples_kwargs=self.load_triples_kwargs,
        )
        # inductive testing shares both ENTITIES and RELATIONS with the inductive inference graph
        self._inductive_testing = TriplesFactory.from_path(
            path=self.inductive_testing_path,
            entity_to_id=self._inductive_inference.entity_to_id,  # share entity index with inductive inference
            relation_to_id=self._inductive_inference.relation_to_id,  # share relation index with inductive inference
            # do not explicitly create inverse triples for testing; this is handled by the evaluation code
            create_inverse_triples=False,
            load_triples_kwargs=self.load_triples_kwargs,
        )

    def __repr__(self) -> str:  # noqa: D105
        # fixed: adjacent f-string fragments previously produced a double space between fields
        return (
            f'{self.__class__.__name__}(training_path="{self.transductive_training_path}",'
            f' inductive_inference="{self.inductive_inference_path}",'
            f' inductive_test="{self.inductive_testing_path}",'
            f' inductive_validation="{self.inductive_validation_path}")'
        )
class UnpackedRemoteDisjointInductiveDataset(DisjointInductivePathDataset):
    """A dataset with all four of train, inductive_inference, inductive test, and inductive validation sets as URLs."""

    def __init__(
        self,
        transductive_training_url: str,
        inductive_inference_url: str,
        inductive_testing_url: str,
        inductive_validation_url: str,
        # generalized: _help_cache also accepts a pathlib.Path, so allow it here as well
        cache_root: str | pathlib.Path | None = None,
        force: bool = False,
        eager: bool = False,
        create_inverse_triples: bool = False,
        load_triples_kwargs: Mapping[str, Any] | None = None,
        download_kwargs: Mapping[str, Any] | None = None,
        version: str | None = None,
    ):
        """Initialize dataset.

        :param transductive_training_url: The URL of the training file
        :param inductive_inference_url: The URL of the inductive inference graph file
        :param inductive_testing_url: The URL of the inductive testing file
        :param inductive_validation_url: The URL of the inductive validation file
        :param cache_root:
            An optional directory to store the extracted files. Is none is given, the default PyKEEN directory is used.
            This is defined either by the environment variable ``PYKEEN_HOME`` or defaults to ``~/.data/pykeen``.
        :param force: If true, redownload any cached files
        :param eager: Should the data be loaded eagerly? Defaults to false.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        :param load_triples_kwargs: Arguments to pass through to :func:`TriplesFactory.from_path`
            and ultimately through to :func:`pykeen.triples.utils.load_triples`.
        :param download_kwargs: Keyword arguments to pass to :func:`pystow.utils.download`
        :param version: accepts a string "v1" to "v4" to select among Teru et al inductive datasets
        """
        # resolve (and create) the cache directory, with separate training/inference subfolders
        self.cache_root = self._help_cache(cache_root, version, sep_train_inference=True)
        self.transductive_training_url = transductive_training_url
        self.inductive_inference_url = inductive_inference_url
        self.inductive_testing_url = inductive_testing_url
        self.inductive_validation_url = inductive_validation_url
        # the training split lives in its own folder; everything that shares the inductive
        # inference index (inference, testing, validation) goes into the "inference" folder
        transductive_training_path = self.cache_root.joinpath("training", name_from_url(self.transductive_training_url))
        inductive_inference_path = self.cache_root.joinpath("inference", name_from_url(self.inductive_inference_url))
        inductive_testing_path = self.cache_root.joinpath("inference", name_from_url(self.inductive_testing_url))
        inductive_validation_path = self.cache_root.joinpath("inference", name_from_url(self.inductive_validation_url))
        # copy before mutation so the caller's mapping is left untouched
        download_kwargs = {} if download_kwargs is None else dict(download_kwargs)
        download_kwargs.setdefault("backend", "urllib")
        for url, path in [
            (self.transductive_training_url, transductive_training_path),
            (self.inductive_inference_url, inductive_inference_path),
            (self.inductive_testing_url, inductive_testing_path),
            (self.inductive_validation_url, inductive_validation_path),
        ]:
            # only hit the network when the file is missing or a re-download is forced
            if force or not path.is_file():
                download(url, path, **download_kwargs)
        super().__init__(
            transductive_training_path=transductive_training_path,
            inductive_inference_path=inductive_inference_path,
            inductive_testing_path=inductive_testing_path,
            inductive_validation_path=inductive_validation_path,
            eager=eager,
            create_inverse_triples=create_inverse_triples,
            load_triples_kwargs=load_triples_kwargs,
        )