Source code for pykeen.datasets.ckg

# -*- coding: utf-8 -*-

"""Clinical Knowledge Graph."""

import pathlib
import tarfile
from pathlib import Path
from typing import Iterable, Optional
from urllib.request import urlretrieve

import click
import pandas as pd
from docdata import parse_docdata
from more_click import verbose_option

from .base import TabbedDataset
from ..typing import TorchRandomHint

__all__ = [

URL = ""

[docs]@parse_docdata class CKG(TabbedDataset): """The Clinical Knowledge Graph (CKG) dataset from [santos2020]_. --- name: Clinical Knowledge Graph citation: author: Santos year: 2020 link: github: MannLabs/CKG single: true statistics: entities: 7617419 relations: 11 triples: 26691525 training: 21353220 testing: 2669152 validation: 2669153 """ def __init__( self, random_state: TorchRandomHint = 0, **kwargs, ): """Initialize the `CKG <>`_ dataset from [santos2020]_. :param random_state: The random seed to use in splitting the dataset. Defaults to 0. :param kwargs: keyword arguments passed to :class:`pykeen.datasets.base.TabbedDataset`. """ super().__init__( random_state=random_state, **kwargs, ) self.preloaded_path = self.cache_root.joinpath("preloaded.tsv.gz") def _get_path(self) -> Optional[pathlib.Path]: return self.preloaded_path def _get_df(self) -> pd.DataFrame: if self.preloaded_path.exists(): return pd.read_csv(self.preloaded_path, sep="\t", dtype=str) df = pd.concat(self._iterate_dataframes()) df.to_csv(self.preloaded_path, sep="\t", index=False) return df def _iterate_dataframes(self) -> Iterable[pd.DataFrame]: archive_path = self.cache_root / "data.tar.gz" if not archive_path.exists(): urlretrieve(URL, archive_path) # noqa:S310 with as tar_file: if tar_file is None: raise ValueError for tarinfo in tar_file: if not"data/imports/") or not".tsv"): continue path = Path( if"."): continue _inner_file = tar_file.extractfile(tarinfo) if _inner_file is None: raise ValueError(f"Unable to open inner file: {tarinfo}") with _inner_file as file: df = pd.read_csv(file, usecols=COLUMNS, sep="\t", dtype=str) df = df[COLUMNS] yield df
@click.command() @verbose_option def _main(): from pykeen.datasets import get_dataset d = get_dataset(dataset=CKG) d.summarize() if __name__ == "__main__": _main()