# Source code for pykeen.datasets.openea

# -*- coding: utf-8 -*-

"""The OpenEA dataset family.

Get a summary with ``python -m pykeen.datasets.openea``
"""

import logging
import pathlib
from typing import Optional, Tuple, cast

import click
from docdata import parse_docdata
from more_click import verbose_option
from pystow.utils import download, read_zipfile_csv

from .base import LazyDataset
from ..triples import TriplesFactory
from ..typing import LABEL_HEAD, LABEL_RELATION, LABEL_TAIL, TorchRandomHint

# Public API of this module.
__all__ = [
    "OpenEA",
]

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

#: The graph pairs accepted by :class:`OpenEA`; the two sides of a pair are its parts when split at "_"
GRAPH_PAIRS = ("D_W", "D_Y", "EN_DE", "EN_FR")
#: The graph sizes accepted by :class:`OpenEA`
GRAPH_SIZES = ("15K", "100K")
#: The pairing versions accepted by :class:`OpenEA`
GRAPH_VERSIONS = ("V1", "V2")


@parse_docdata
class OpenEA(LazyDataset):
    """The OpenEA dataset family.

    ---
    name: OpenEA Family
    citation:
        author: Sun
        year: 2020
        link: http://www.vldb.org/pvldb/vol13/p2326-sun.pdf
    single: true
    statistics:
        entities: 15000
        relations: 248
        triples: 38265
        training: 30612
        testing: 3826
        validation: 3827
    """

    #: The link to the zip file
    FIGSHARE_LINK: str = "https://figshare.com/ndownloader/files/34234391"

    #: The hex digest for the zip file
    SHA512: str = "c1589f185f86e05c497de147b4d6c243c66775cb4b50c6b41ecc71b36cfafb4c9f86fbee94e1e78a7ee056dd69df1ce3fc210ae07dc64955ad2bfda7450545ef"  # noqa: E501

    def __init__(
        self,
        graph_pair: str = "D_W",
        side: str = "D",
        size: str = "15K",
        version: str = "V1",
        cache_root: Optional[str] = None,
        eager: bool = False,
        create_inverse_triples: bool = False,
        random_state: TorchRandomHint = 0,
        split_ratios: Tuple[float, float, float] = (0.8, 0.1, 0.1),
        force: bool = False,
    ):
        """
        Initialize the dataset.

        :param graph_pair:
            The graph-pair within the dataset family (cf. GRAPH_PAIRS).
        :param side:
            The side of the graph-pair, i.e., one of the two parts obtained by splitting
            ``graph_pair`` at the underscore (e.g., "D" or "W" for "D_W").
        :param size:
            The size of the graphs (either "15K" or "100K").
        :param version:
            The version of the pairing (either "V1" or "V2"). "V1" has lower connectivity in the graph than "V2".
        :param cache_root:
            The cache root.
        :param eager:
            Whether to directly load the dataset, or defer it to the first access of a relevant attribute.
        :param create_inverse_triples:
            Whether to create inverse triples.
        :param random_state:
            The random state used for splitting.
        :param split_ratios:
            The split ratios used for splitting the dataset into train / test / validation
            (the order in which the resulting parts are assigned).
        :param force:
            Whether to enforce re-download of existing files.

        :raises ValueError:
            If the graph pair, side, size, or version is invalid.
        """
        # Input validation.
        if graph_pair not in GRAPH_PAIRS:
            raise ValueError(f"Invalid graph pair: {graph_pair!r}. Allowed are: {GRAPH_PAIRS}")
        available_sides = graph_pair.split("_")
        if side not in available_sides:
            raise ValueError(f"side must be one of {available_sides}")
        if size not in GRAPH_SIZES:
            raise ValueError(f"size must be one of {GRAPH_SIZES}")
        if version not in GRAPH_VERSIONS:
            raise ValueError(f"version must be one of {GRAPH_VERSIONS}")

        # path *inside* the zip archive, hence always POSIX-style
        relative_path_base = pathlib.PurePosixPath(
            "OpenEA_dataset_v2.0",
            graph_pair + "_" + size + "_" + version,
        )
        # left side has files ending with 1, right side with 2
        one_or_two = "1" if side == available_sides[0] else "2"
        self._relative_path_relations = pathlib.PurePosixPath(relative_path_base, f"rel_triples_{one_or_two}")

        # For downloading
        self.force = force
        self.cache_root = self._help_cache(cache_root)

        # For splitting
        self.random_state = random_state
        self.ratios = split_ratios

        # Whether to create inverse triples
        self.create_inverse_triples = create_inverse_triples

        if eager:
            self._load()

    def _load(self) -> None:
        """Ensure the archive is available, read the selected relation triples, and split them.

        The resulting parts are stored in ``_training``, ``_testing`` and ``_validation``.
        """
        path = self.cache_root.joinpath("OpenEA_dataset_v2.0.zip")

        # ensure file is present; ``force`` triggers a download even if it already is
        if not path.is_file() or self.force:
            # lazy %-formatting: the message is only rendered if the record is emitted
            logger.info("Downloading file from Figshare (Link: %s)", self.__class__.FIGSHARE_LINK)
            download(url=self.__class__.FIGSHARE_LINK, path=path, hexdigests={"sha512": self.SHA512})

        # the file has no header row; columns are tab-separated head / relation / tail labels
        df = read_zipfile_csv(
            path=path,
            inner_path=str(self._relative_path_relations),
            header=None,
            names=[LABEL_HEAD, LABEL_RELATION, LABEL_TAIL],
            sep="\t",
            encoding="utf8",
            dtype=str,
        )

        # create triples factory
        tf = TriplesFactory.from_labeled_triples(
            triples=df.values,
            create_inverse_triples=self.create_inverse_triples,
            metadata={"path": path},
        )

        # split; the parts are assigned in the order training, testing, validation
        self._training, self._testing, self._validation = cast(
            Tuple[TriplesFactory, TriplesFactory, TriplesFactory],
            tf.split(
                ratios=self.ratios,
                random_state=self.random_state,
            ),
        )
        logger.info("[%s] done splitting data from %s", self.__class__.__name__, path)


@click.command()
@verbose_option
def _main():
    """Call ``summarize()`` on every graph pair / side / size / version combination."""
    # Enumerate all configurations up front, in the same order as nested loops
    # over pair -> side -> size -> version would visit them.
    configurations = [
        {"graph_pair": pair, "side": part, "size": scale, "version": release}
        for pair in GRAPH_PAIRS
        for part in pair.split("_")
        for scale in GRAPH_SIZES
        for release in GRAPH_VERSIONS
    ]
    for kwargs in configurations:
        dataset = OpenEA(**kwargs)
        dataset.summarize()


# Run the summary command when the module is executed as a script
# (``python -m pykeen.datasets.openea``).
if __name__ == "__main__":
    _main()