# Source code for pykeen.datasets.pharmebinet

# -*- coding: utf-8 -*-

"""The `PharMeBINet <https://github.com/ckoenigs/PharMeBINet/>`_ dataset.

Get a summary with ``python -m pykeen.datasets.pharmebinet``.
"""

import click
from docdata import parse_docdata
from more_click import verbose_option

from .base import TarFileSingleDataset
from ..typing import TorchRandomHint

# Public API of this module: only the dataset class is exported.
__all__ = [
    "PharMeBINet",
]

# Download location of the dataset archive (a ``.tar.gz`` hosted on Zenodo);
# the file name indicates the TSV export dated 2022-08-19, v2.
RAW_URL = "https://zenodo.org/record/7011027/files/pharmebinet_tsv_2022_08_19_v2.tar.gz"


# NOTE: the block after ``---`` in the class docstring is structured metadata
# for the ``parse_docdata`` decorator; keep its indentation so that it stays a
# nested mapping (``citation`` and ``statistics`` have children).
@parse_docdata
class PharMeBINet(TarFileSingleDataset):
    """The PharMeBINet dataset from [koenigs2022]_.

    ---
    name: PharMeBINet
    citation:
        github: ckoenigs/PharMeBINet
        author: Königs
        year: 2022
        link: https://www.nature.com/articles/s41597-022-01510-3
    single: true
    statistics:
        entities: 2869407
        relations: 208
        triples: 15883653
        training: 12702210
        testing: 1587776
        validation: 1587777
    """

    def __init__(
        self,
        random_state: TorchRandomHint = 0,
        **kwargs,
    ):
        """Initialize the PharMeBINet dataset from [koenigs2022]_.

        :param random_state: An optional random state to make the training/testing/validation split reproducible.
        :param kwargs: keyword arguments passed to :class:`pykeen.datasets.base.TarFileSingleDataset`.
        """
        # Options for reading ``edges.tsv``; forwarded under ``read_csv_kwargs``
        # (presumably handed on to ``pandas.read_csv`` by the base class -- confirm).
        read_csv_kwargs = {
            # Only these three columns of the edge table are requested.
            "usecols": ["start_id", "type", "end_id"],
            "sep": "\t",
            # Force both endpoint identifier columns to be read as strings.
            "dtype": {"start_id": str, "end_id": str},
        }
        super().__init__(
            url=RAW_URL,
            relative_path="edges.tsv",
            random_state=random_state,
            read_csv_kwargs=read_csv_kwargs,
            **kwargs,
        )


@click.command()
@verbose_option
def _main():
    """Load the PharMeBINet dataset and print a summary of it."""
    # Imported inside the function rather than at module level
    # (presumably to avoid a circular import with ``pykeen.datasets`` -- confirm).
    from pykeen.datasets import get_dataset

    get_dataset(dataset=PharMeBINet).summarize()


if __name__ == "__main__":
    _main()