Source code for pykeen.datasets.ckg

# -*- coding: utf-8 -*-

"""Clinical Knowledge Graph."""

import tarfile
from pathlib import Path
from typing import Iterable, Optional
from urllib.request import urlretrieve

import click
import pandas as pd

from .base import TabbedDataset
from ..typing import TorchRandomHint

__all__ = [
    'CKG',
]

URL = 'https://md-datasets-public-files-prod.s3.eu-west-1.amazonaws.com/d1e8d3df-2342-468a-91a9-97a981a479ad'
COLUMNS = ['START_ID', 'TYPE', 'END_ID']


[docs]class CKG(TabbedDataset): """The Clinical Knowledge Graph (CKG) dataset from [santos2020]_. This dataset contains ~7.6 million nodes, 11 relations, and ~26 million triples. .. [santos2020] Santos, A., *et al* (2020). `Clinical Knowledge Graph Integrates Proteomics Data into Clinical Decision-Making <https://doi.org/10.1101/2020.05.09.084897>`_. *bioRxiv*, 2020.05.09.084897. """ def __init__( self, eager: bool = False, create_inverse_triples: bool = False, random_state: TorchRandomHint = 0, cache_root: Optional[str] = None, ): super().__init__( eager=eager, create_inverse_triples=create_inverse_triples, random_state=random_state, cache_root=cache_root, ) self.preloaded_path = self.cache_root / 'preloaded.tsv.gz' def _get_path(self) -> Optional[str]: return self.preloaded_path.as_posix() def _get_df(self) -> pd.DataFrame: if self.preloaded_path.exists(): return pd.read_csv(self.preloaded_path, sep='\t') df = pd.concat(self._iterate_dataframes()) df.to_csv(self.preloaded_path, sep='\t', index=False) return df def _iterate_dataframes(self) -> Iterable[pd.DataFrame]: archive_path = self.cache_root / 'data.tar.gz' if not archive_path.exists(): urlretrieve(URL, archive_path) # noqa:S310 with tarfile.TarFile.open(archive_path) as tar_file: for tarinfo in tar_file: if not tarinfo.name.startswith('data/imports/') or not tarinfo.name.endswith('.tsv'): continue path = Path(tarinfo.name) if path.name.startswith('.'): continue with tar_file.extractfile(tarinfo) as file: df = pd.read_csv(file, usecols=COLUMNS, sep='\t', dtype=str) df = df[COLUMNS] yield df
@click.command() def _main(): d = CKG() d.summarize() if __name__ == '__main__': _main()