"""Utility classes for constructing inductive datasets."""
from __future__ import annotations
import logging
import pathlib
from collections.abc import Iterable, Mapping
from dataclasses import dataclass
from typing import Any
from pystow.utils import download, name_from_url
from tabulate import tabulate
from ...constants import PYKEEN_DATASETS
from ...triples import CoreTriplesFactory, TriplesFactory
from ...utils import normalize_path
# Explicit public API: controls what is exported on star-import of this module.
__all__ = [
# Base class
"InductiveDataset",
# Mid-level classes
"EagerInductiveDataset",
"LazyInductiveDataset",
"DisjointInductivePathDataset",
"UnpackedRemoteDisjointInductiveDataset",
]
# Module-level logger, named after this module per the standard logging convention.
logger = logging.getLogger(__name__)
class InductiveDataset:
    """Contains transductive train and inductive inference/validation/test datasets."""

    #: A factory wrapping the training triples
    transductive_training: CoreTriplesFactory
    #: A factory wrapping the inductive inference triples that MIGHT or MIGHT NOT
    #: share indices with the transductive training
    inductive_inference: CoreTriplesFactory
    #: A factory wrapping the testing triples, that share indices with the INDUCTIVE INFERENCE triples
    inductive_testing: CoreTriplesFactory
    #: A factory wrapping the validation triples, that share indices with the INDUCTIVE INFERENCE triples
    inductive_validation: CoreTriplesFactory | None = None
    #: All datasets should take care of inverse triple creation
    create_inverse_triples: bool = True

    def _summary_rows(self):
        """Build one ``(label, num_entities, num_relations, num_triples)`` row per available factory.

        The inductive validation factory is optional (see class attribute); when it is ``None``,
        its row is omitted instead of raising an ``AttributeError``.
        """
        labeled_factories = (
            ("Transductive Training", self.transductive_training),
            ("Inductive Inference", self.inductive_inference),
            ("Inductive Testing", self.inductive_testing),
            ("Inductive Validation", self.inductive_validation),
        )
        return [
            (label, triples_factory.num_entities, triples_factory.num_relations, triples_factory.num_triples)
            for label, triples_factory in labeled_factories
            # skip the optional validation factory when it is absent
            if triples_factory is not None
        ]

    def summary_str(self, title: str | None = None, show_examples: int | None = 5, end: str = "\n") -> str:
        """Make a summary string of all of the factories.

        :param title: An optional title for the summary; defaults to the class name.
        :param show_examples: If positive, append this many labeled example triples taken
            from the transductive training factory.
        :param end: A string appended to the end of the summary (defaults to a newline).
        :returns: The formatted summary table, optionally followed by example triples.
        :raises AttributeError: If examples were requested but the training factory does not
            carry labeling information (i.e., is not a :class:`TriplesFactory`).
        """
        rows = self._summary_rows()
        n_triples = sum(count for *_, count in rows)
        rows.append(("Total", "-", "-", n_triples))
        t = tabulate(rows, headers=["Name", "Entities", "Relations", "Triples"])
        rv = f"{title or self.__class__.__name__} (create_inverse_triples={self.create_inverse_triples})\n{t}"
        if show_examples:
            if not isinstance(self.transductive_training, TriplesFactory):
                raise AttributeError(f"{self.transductive_training.__class__} does not have labeling information.")
            examples = tabulate(
                self.transductive_training.label_triples(self.transductive_training.mapped_triples[:show_examples]),
                # fixed: header capitalization was inconsistent ("tail" -> "Tail")
                headers=["Head", "Relation", "Tail"],
            )
            rv += "\n" + examples
        return rv + end

    def summarize(self, title: str | None = None, show_examples: int | None = 5, file=None) -> None:
        """Print a summary of the dataset.

        :param title: An optional title for the summary.
        :param show_examples: If positive, print this many example triples.
        :param file: The file-like object to print to; defaults to stdout.
        """
        print(self.summary_str(title=title, show_examples=show_examples), file=file)  # noqa:T201

    def __str__(self) -> str:  # noqa: D105
        return (
            f"{self.__class__.__name__}(Training num_entities={self.transductive_training.num_entities},"
            f" num_relations={self.transductive_training.num_relations})"
        )
@dataclass
class EagerInductiveDataset(InductiveDataset):
    """An eager inductive dataset, with all triples factories provided up-front."""

    #: A factory wrapping the training triples
    transductive_training: CoreTriplesFactory
    #: A factory wrapping the inductive inference triples
    inductive_inference: CoreTriplesFactory
    #: A factory wrapping the testing triples, which shares indices with the inductive inference triples
    inductive_testing: CoreTriplesFactory
    #: An optional factory wrapping the validation triples, which shares indices
    #: with the inductive inference triples
    inductive_validation: CoreTriplesFactory | None = None
    #: Whether inverse triples should be / have been created
    create_inverse_triples: bool = True
class LazyInductiveDataset(InductiveDataset):
    """An inductive dataset that has lazy loading."""

    #: The actual instance of the training factory, which is exposed to the user through `transductive_training`
    _transductive_training: TriplesFactory | None = None
    #: The actual instance of the inductive inference factory,
    #: which is exposed to the user through `inductive_inference`
    _inductive_inference: TriplesFactory | None = None
    #: The actual instance of the testing factory, which is exposed to the user through `inductive_testing`
    _inductive_testing: TriplesFactory | None = None
    #: The actual instance of the validation factory, which is exposed to the user through `inductive_validation`
    _inductive_validation: TriplesFactory | None = None
    #: The directory in which the cached data is stored
    cache_root: pathlib.Path

    @property
    def transductive_training(self) -> TriplesFactory:  # type:ignore # noqa: D401
        """The training triples factory."""
        if not self._loaded:
            self._load()
        assert self._transductive_training is not None
        return self._transductive_training

    @property
    def inductive_inference(self) -> TriplesFactory:  # type:ignore # noqa: D401
        """The inductive inference triples factory. MIGHT or MIGHT NOT share indices with the transductive train."""
        if not self._loaded:
            self._load()
        assert self._inductive_inference is not None
        return self._inductive_inference

    @property
    def inductive_testing(self) -> TriplesFactory:  # type:ignore # noqa: D401
        """The testing triples factory that share indices with the INDUCTIVE INFERENCE triples factory."""
        if not self._loaded:
            self._load()
        assert self._inductive_testing is not None
        return self._inductive_testing

    @property
    def inductive_validation(self) -> TriplesFactory | None:  # type:ignore # noqa: D401
        """The validation triples factory that shares indices with the INDUCTIVE INFERENCE triples factory."""
        if not self._loaded:
            self._load()
        # fixed: validation is optional by declaration (return type includes None), so do not
        # assert that it was loaded — datasets without a validation split return None here
        return self._inductive_validation

    @property
    def _loaded(self) -> bool:
        """Return whether the mandatory factories (training and inference) have been loaded."""
        return self._transductive_training is not None and self._inductive_inference is not None

    def _load(self) -> None:
        """Load the triples factories; to be implemented by subclasses."""
        raise NotImplementedError

    def _load_validation(self) -> None:
        """Load the validation triples factory; to be implemented by subclasses."""
        raise NotImplementedError

    def _help_cache(
        self,
        cache_root: None | str | pathlib.Path,
        version: str | None = None,
        sep_train_inference: bool = False,
    ) -> pathlib.Path:
        """Get the appropriate cache root directory.

        :param cache_root: If none is passed, defaults to a subfolder of the
            PyKEEN home directory defined in :data:`pykeen.constants.PYKEEN_HOME`.
            The subfolder is named based on the class inheriting from
            :class:`pykeen.datasets.base.Dataset`.
        :param version: accepts a string "v1" to "v4" to select among Teru et al inductive datasets
        :param sep_train_inference: a flag to store training and inference splits in different folders
        :returns: A path object for the calculated cache root directory
        """
        cache_root = normalize_path(
            cache_root, *self._cache_sub_directories(version=version), default=PYKEEN_DATASETS, mkdir=True
        )
        if sep_train_inference:
            # generate subfolders 'training' and 'inference'
            for name in ("training", "inference"):
                cache_root.joinpath(name).mkdir(parents=True, exist_ok=True)
        logger.debug("using cache root at %s", cache_root.as_uri())
        return cache_root

    def _cache_sub_directories(self, version: str | None) -> Iterable[str]:
        """Iterate over appropriate cache sub-directory.

        :param version: an optional inductive-split version ("v1" to "v4"), appended
            as an extra sub-directory level when given.
        """
        # TODO: use class-resolver normalize?
        yield self.__class__.__name__.lower()
        # add v1 / v2 / v3 / v4 for inductive splits if available
        if version:
            yield version
class DisjointInductivePathDataset(LazyInductiveDataset):
    """A disjoint inductive dataset specified by paths.

    Contains a lazy reference to a training, inductive inference, inductive testing, and inductive
    validation dataset. In this dataset, inductive inference is disjoint with the transductive train.
    """

    def __init__(
        self,
        transductive_training_path: str | pathlib.Path,
        inductive_inference_path: str | pathlib.Path,
        inductive_testing_path: str | pathlib.Path,
        # fixed: annotation previously contained a duplicated union member (str | str | pathlib.Path)
        inductive_validation_path: str | pathlib.Path,
        eager: bool = False,
        create_inverse_triples: bool = False,
        load_triples_kwargs: Mapping[str, Any] | None = None,
    ) -> None:
        """Initialize the dataset.

        :param transductive_training_path: Path to the transductive training triples file.
        :param inductive_inference_path: Path to the inductive inference triples file.
        :param inductive_testing_path: Path to the inductive testing triples file.
        :param inductive_validation_path: Path to the inductive validation triples file.
        :param eager: Should the data be loaded eagerly? Defaults to false.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        :param load_triples_kwargs: Arguments to pass through to :func:`TriplesFactory.from_path`
            and ultimately through to :func:`pykeen.triples.utils.load_triples`.
        """
        self.transductive_training_path = pathlib.Path(transductive_training_path)
        self.inductive_inference_path = pathlib.Path(inductive_inference_path)
        self.inductive_testing_path = pathlib.Path(inductive_testing_path)
        self.inductive_validation_path = pathlib.Path(inductive_validation_path)
        self.create_inverse_triples = create_inverse_triples
        self.load_triples_kwargs = load_triples_kwargs
        if eager:
            self._load()

    def _load(self) -> None:
        """Load all four triples factories from their paths, sharing label-to-id mappings appropriately."""
        self._transductive_training = TriplesFactory.from_path(
            path=self.transductive_training_path,
            create_inverse_triples=self.create_inverse_triples,
            load_triples_kwargs=self.load_triples_kwargs,
        )
        # important: inductive_inference shares the same RELATIONS with the transductive training graph
        self._inductive_inference = TriplesFactory.from_path(
            path=self.inductive_inference_path,
            create_inverse_triples=self.create_inverse_triples,
            relation_to_id=self._transductive_training.relation_to_id,
            load_triples_kwargs=self.load_triples_kwargs,
        )
        # inductive validation shares both ENTITIES and RELATIONS with the inductive inference graph
        self._inductive_validation = TriplesFactory.from_path(
            path=self.inductive_validation_path,
            entity_to_id=self._inductive_inference.entity_to_id,  # shares entity index with inductive inference
            relation_to_id=self._inductive_inference.relation_to_id,  # shares relation index with inductive inference
            # do not explicitly create inverse triples for validation; this is handled by the evaluation code
            create_inverse_triples=False,
            load_triples_kwargs=self.load_triples_kwargs,
        )
        # inductive testing shares both ENTITIES and RELATIONS with the inductive inference graph
        self._inductive_testing = TriplesFactory.from_path(
            path=self.inductive_testing_path,
            entity_to_id=self._inductive_inference.entity_to_id,  # share entity index with inductive inference
            relation_to_id=self._inductive_inference.relation_to_id,  # share relation index with inductive inference
            # do not explicitly create inverse triples for testing; this is handled by the evaluation code
            create_inverse_triples=False,
            load_triples_kwargs=self.load_triples_kwargs,
        )

    def __repr__(self) -> str:  # noqa: D105
        # fixed: adjacent f-string fragments previously produced a double space between fields
        return (
            f'{self.__class__.__name__}(training_path="{self.transductive_training_path}",'
            f' inductive_inference="{self.inductive_inference_path}",'
            f' inductive_test="{self.inductive_testing_path}",'
            f' inductive_validation="{self.inductive_validation_path}")'
        )
class UnpackedRemoteDisjointInductiveDataset(DisjointInductivePathDataset):
    """A dataset with all four of train, inductive_inference, inductive test, and inductive validation sets as URLs."""

    def __init__(
        self,
        transductive_training_url: str,
        inductive_inference_url: str,
        inductive_testing_url: str,
        inductive_validation_url: str,
        # generalized: _help_cache also accepts a pathlib.Path, so allow it here as well
        cache_root: str | pathlib.Path | None = None,
        force: bool = False,
        eager: bool = False,
        create_inverse_triples: bool = False,
        load_triples_kwargs: Mapping[str, Any] | None = None,
        download_kwargs: Mapping[str, Any] | None = None,
        version: str | None = None,
    ):
        """Initialize dataset.

        :param transductive_training_url: The URL of the training file
        :param inductive_inference_url: The URL of the inductive inference graph file
        :param inductive_testing_url: The URL of the inductive testing file
        :param inductive_validation_url: The URL of the inductive validation file
        :param cache_root:
            An optional directory to store the extracted files. Is none is given, the default PyKEEN directory is used.
            This is defined either by the environment variable ``PYKEEN_HOME`` or defaults to ``~/.data/pykeen``.
        :param force: If true, redownload any cached files
        :param eager: Should the data be loaded eagerly? Defaults to false.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        :param load_triples_kwargs: Arguments to pass through to :func:`TriplesFactory.from_path`
            and ultimately through to :func:`pykeen.triples.utils.load_triples`.
        :param download_kwargs: Keyword arguments to pass to :func:`pystow.utils.download`
        :param version: accepts a string "v1" to "v4" to select among Teru et al inductive datasets
        """
        # resolve (and create) the cache directory, with separate training/inference subfolders
        self.cache_root = self._help_cache(cache_root, version, sep_train_inference=True)
        self.transductive_training_url = transductive_training_url
        self.inductive_inference_url = inductive_inference_url
        self.inductive_testing_url = inductive_testing_url
        self.inductive_validation_url = inductive_validation_url
        # the training split lives in its own folder; everything that shares the inductive
        # inference index (inference, testing, validation) goes into the "inference" folder
        transductive_training_path = self.cache_root.joinpath("training", name_from_url(self.transductive_training_url))
        inductive_inference_path = self.cache_root.joinpath("inference", name_from_url(self.inductive_inference_url))
        inductive_testing_path = self.cache_root.joinpath("inference", name_from_url(self.inductive_testing_url))
        inductive_validation_path = self.cache_root.joinpath("inference", name_from_url(self.inductive_validation_url))
        # copy before mutation so the caller's mapping is left untouched
        download_kwargs = {} if download_kwargs is None else dict(download_kwargs)
        download_kwargs.setdefault("backend", "urllib")
        for url, path in [
            (self.transductive_training_url, transductive_training_path),
            (self.inductive_inference_url, inductive_inference_path),
            (self.inductive_testing_url, inductive_testing_path),
            (self.inductive_validation_url, inductive_validation_path),
        ]:
            # only hit the network when the file is missing or a re-download is forced
            if force or not path.is_file():
                download(url, path, **download_kwargs)
        super().__init__(
            transductive_training_path=transductive_training_path,
            inductive_inference_path=inductive_inference_path,
            inductive_testing_path=inductive_testing_path,
            inductive_validation_path=inductive_validation_path,
            eager=eager,
            create_inverse_triples=create_inverse_triples,
            load_triples_kwargs=load_triples_kwargs,
        )