Source code for pykeen.datasets.hetionet

# -*- coding: utf-8 -*-

"""The Hetionet dataset.

Get a summary with ``python -m pykeen.datasets.hetionet``
"""

import click
from docdata import parse_docdata
from more_click import verbose_option

from .base import SingleTabbedDataset
from ..typing import TorchRandomHint

__all__ = [
    "Hetionet",
]

URL = "https://github.com/hetio/hetionet/raw/master/hetnet/tsv/hetionet-v1.0-edges.sif.gz"


[docs]@parse_docdata
class Hetionet(SingleTabbedDataset):
    """The Hetionet dataset from [himmelstein2017]_.

    In its publication [himmelstein2017]_, it is demonstrated to be useful for link prediction in drug repositioning
    and made publicly available through its `GitHub repository <https://github.com/hetio/hetionet>`_ in several formats.
    The link prediction algorithm showcased does not rely on embeddings, which leaves room for interesting comparison.
    One such comparison was made during the master's thesis of Lingling Xu [xu2019]_.
    ---
    name: Hetionet
    citation:
        author: Himmelstein
        year: 2017
        link: https://doi.org/10.7554/eLife.26726
        github: hetio/hetionet
    single: true
    statistics:
        entities: 45158
        relations: 24
        triples: 2250197
        training: 1800157
        testing: 225020
        validation: 225020
    """

    def __init__(
        self,
        random_state: TorchRandomHint = 0,
        **kwargs,
    ):
        """Initialize the `Hetionet <https://github.com/hetio/hetionet>`_ dataset from [himmelstein2017]_.

        :param random_state: The random seed to use in splitting the dataset. Defaults to 0.
        :param kwargs: keyword arguments passed to :class:`pykeen.datasets.base.SingleTabbedDataset`.
        """
        super().__init__(
            url=URL,
            random_state=random_state,
            **kwargs,
        )


@click.command()
@verbose_option
def _main():
    from pykeen.datasets import get_dataset

    ds = get_dataset(dataset=Hetionet)
    ds.summarize()


if __name__ == "__main__":
    _main()