"""The PrimeKG dataset.
Get a summary with ``python -m pykeen.datasets.primekg``
"""
import click
from docdata import parse_docdata
from more_click import verbose_option
from .base import SingleTabbedDataset
from ..typing import TorchRandomHint
# Public names exported by this module.
__all__ = [
    "PrimeKG",
]

# Download location of the dataset file; passed as ``url`` to the base class by ``PrimeKG``.
URL = "https://dataverse.harvard.edu/api/access/datafile/6180620"
@parse_docdata
class PrimeKG(SingleTabbedDataset):
    """The Precision Medicine Knowledge Graph (PrimeKG) dataset from [chandak2022]_.

    ---
    name: PrimeKG
    citation:
        author: Chandak
        year: 2022
        link: https://doi.org/10.1101/2022.05.01.489928
        github: mims-harvard/PrimeKG
    single: true
    statistics:
        entities: 129375
        relations: 30
        triples: 8100498
        training: 6479992
        testing: 809999
        validation: 810000
    """

    def __init__(
        self,
        random_state: TorchRandomHint = 0,
        **kwargs,
    ):
        """Set up the PrimeKG dataset described in [chandak2022]_.

        :param random_state: Seed used when splitting the dataset; ``0`` unless overridden.
        :param kwargs: Extra keyword arguments forwarded unchanged to
            :class:`pykeen.datasets.base.SingleTabbedDataset`.
        """
        # Options for fetching the file from ``URL``.
        fetch_options = {"backend": "requests"}
        # The file is comma-separated and only these three columns are read.
        csv_options = {
            "usecols": ["x_name", "relation", "y_name"],
            "sep": ",",
        }
        super().__init__(
            url=URL,
            name="primekg.csv",
            random_state=random_state,
            download_kwargs=fetch_options,
            read_csv_kwargs=csv_options,
            **kwargs,
        )
@click.command()
@verbose_option
def _main():
    # Command-line entry point: obtain the PrimeKG dataset via ``get_dataset``
    # and call its ``summarize()`` method.
    # Documented with a comment rather than a docstring so that the help text
    # click derives for this command stays exactly as it was.
    from pykeen.datasets import get_dataset

    get_dataset(dataset=PrimeKG).summarize()
# When the module is executed as a script, run the click command defined above.
if __name__ == "__main__":
    _main()