# -*- coding: utf-8 -*-
"""Load the OGB datasets.
Run with ``python -m pykeen.datasets.ogb``
"""
from typing import ClassVar, Optional
import click
import numpy as np
from docdata import parse_docdata
from more_click import verbose_option
from .base import LazyDataset
from ..triples import TriplesFactory
# Public names exported by this module: the generic OGB loader base class and
# the two concrete OGB link-prediction datasets defined below.
__all__ = [
"OGBLoader",
"OGBBioKG",
"OGBWikiKG2",
]
class OGBLoader(LazyDataset):
    """Load from the Open Graph Benchmark (OGB).

    Concrete subclasses set :attr:`name` to the OGB dataset identifier and may
    override :meth:`_make_tf` to customize how an edge split is converted into
    labeled triples.
    """

    #: The name of the dataset to download
    name: ClassVar[str]

    def __init__(self, cache_root: Optional[str] = None, create_inverse_triples: bool = False):
        """Initialize the OGB loader.

        :param cache_root: An optional override for where data should be cached.
            If not specified, uses default PyKEEN location with :mod:`pystow`.
        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        """
        self.cache_root = self._help_cache(cache_root)
        self.create_inverse_triples = create_inverse_triples

    def _load(self) -> None:
        """Load the training, testing, and validation triples factories.

        Instantiates OGB's ``LinkPropPredDataset`` for :attr:`name`, reads its edge
        split, and converts the ``"train"``, ``"test"``, and ``"valid"`` parts with
        :meth:`_make_tf`. The testing and validation factories reuse the
        entity/relation mappings of the training factory.

        :raises ModuleNotFoundError: If the optional ``ogb`` package is not installed.
        """
        # ``ogb`` is an optional dependency, so it is imported lazily here.
        try:
            from ogb.linkproppred import LinkPropPredDataset
        except ImportError as e:
            raise ModuleNotFoundError(
                f"Need to `pip install ogb` to use pykeen.datasets.{self.__class__.__name__}.",
            ) from e

        dataset = LinkPropPredDataset(name=self.name, root=self.cache_root)
        edge_split = dataset.get_edge_split()
        self._training = self._make_tf(edge_split["train"])
        assert self._training is not None  # makes mypy happy
        self._testing = self._make_tf(
            edge_split["test"],
            entity_to_id=self._training.entity_to_id,
            relation_to_id=self._training.relation_to_id,
        )
        self._validation = self._make_tf(
            edge_split["valid"],
            entity_to_id=self._training.entity_to_id,
            relation_to_id=self._training.relation_to_id,
        )

    def _loaded_validation(self) -> bool:
        """Return whether validation is loaded, which here is the same as ``self._loaded``."""
        return self._loaded

    def _load_validation(self) -> None:
        """Do nothing, since :meth:`_load` already populates the validation split."""
        pass

    def _make_tf(self, x, entity_to_id=None, relation_to_id=None):
        """Convert one OGB edge split into a triples factory.

        :param x: A mapping with ``"head"``, ``"relation"``, and ``"tail"`` entries
            that can be stacked column-wise (presumably equal-length 1-D arrays of
            integer identifiers; confirm against the OGB data format).
        :param entity_to_id: An optional pre-existing entity label to ID mapping.
        :param relation_to_id: An optional pre-existing relation label to ID mapping.
        :returns: The result of :meth:`TriplesFactory.from_labeled_triples`.
        """
        # note: we do not use the built-in constants here, since those refer to OGB nomenclature
        #  (which happens to coincide with ours)
        triples = np.stack([x["head"], x["relation"], x["tail"]], axis=1)

        # FIXME these are already identifiers
        # ``np.str`` was only an alias of the builtin ``str`` and has been removed
        # from NumPy (1.24+); the builtin yields the same unicode dtype.
        triples = triples.astype(str)
        return TriplesFactory.from_labeled_triples(
            triples=triples,
            create_inverse_triples=self.create_inverse_triples,
            entity_to_id=entity_to_id,
            relation_to_id=relation_to_id,
        )
@parse_docdata
class OGBBioKG(OGBLoader):
    """The OGB BioKG dataset.

    .. seealso:: https://ogb.stanford.edu/docs/linkprop/#ogbl-biokg

    ---
    name: OGB BioKG
    citation:
        author: Hu
        year: 2020
        link: https://arxiv.org/abs/2005.00687
    statistics:
        entities: 45085
        relations: 51
        training: 4762677
        testing: 162870
        validation: 162886
        triples: 5088433
    """

    name = "ogbl-biokg"

    def _make_tf(self, x, entity_to_id=None, relation_to_id=None):
        """Convert one OGB BioKG edge split into a triples factory.

        Entity labels are built as ``"<type>:<id>"`` from the ``head_type``/``head``
        and ``tail_type``/``tail`` entries of the split, so that an identifier is
        qualified by its entity type.

        :param x: A mapping with ``"head_type"``, ``"head"``, ``"relation"``,
            ``"tail_type"``, and ``"tail"`` entries of equal length.
        :param entity_to_id: An optional pre-existing entity label to ID mapping.
        :param relation_to_id: An optional pre-existing relation label to ID mapping.
        :returns: The result of :meth:`TriplesFactory.from_labeled_triples`.
        """
        head_labels = _array(x, "head_type", "head")
        tail_labels = _array(x, "tail_type", "tail")
        # Cast the relation column explicitly rather than relying on NumPy's
        # implicit (deprecated) number-to-string promotion when stacking it
        # with the string label columns.
        relation_labels = np.asarray(x["relation"]).astype(str)
        # ``np.str`` has been removed from NumPy (1.24+); use the builtin ``str``.
        triples = np.stack([head_labels, relation_labels, tail_labels], axis=1).astype(str)
        return TriplesFactory.from_labeled_triples(
            triples=triples,
            create_inverse_triples=self.create_inverse_triples,
            entity_to_id=entity_to_id,
            relation_to_id=relation_to_id,
        )
def _array(df, entity_type_label, entity_label):
return np.array(
[f"{entity_type}:{entity}" for entity_type, entity in zip(df[entity_type_label], df[entity_label])],
dtype=np.str,
)
@parse_docdata
class OGBWikiKG2(OGBLoader):
    """The OGB WikiKG2 dataset.

    .. seealso:: https://ogb.stanford.edu/docs/linkprop/#ogbl-wikikg2

    ---
    name: OGB WikiKG2
    citation:
        author: Hu
        year: 2020
        link: https://arxiv.org/abs/2005.00687
        github: snap-stanford/ogb
    statistics:
        entities: 2500604
        relations: 535
        training: 16109182
        testing: 598543
        validation: 429456
        triples: 17137181
    """

    # The OGB identifier passed to ``LinkPropPredDataset`` by the base loader.
    name = "ogbl-wikikg2"
# NOTE: intentionally no docstring on the command -- click would surface it as
# the ``--help`` text.
@click.command()
@verbose_option
def _main():
    # Instantiate every concrete OGB dataset with default arguments and
    # call its ``summarize()`` method, in declaration order.
    for dataset_cls in (OGBBioKG, OGBWikiKG2):
        dataset = dataset_cls()
        dataset.summarize()


if __name__ == "__main__":
    _main()