Source code for pykeen.datasets.wikidata5m

# -*- coding: utf-8 -*-

"""The Wikidata5m dataset from [wang2019]_.

Wikidata5m is a million-scale knowledge graph dataset with aligned corpus.
This dataset integrates the Wikidata knowledge graph and Wikipedia pages.
Each entity in Wikidata5m is described by a corresponding Wikipedia page,
which enables the evaluation of link prediction over unseen entities.

- Website:
- Paper: https://arxiv.org/abs/1911.06136

Get a summary with ``python -m pykeen.datasets.wikidata5m``.
"""

import pathlib

from docdata import parse_docdata

from .base import TarFileRemoteDataset

# Public API of this module: only the dataset class is exported.
__all__ = [
    "Wikidata5M",
]


@parse_docdata
class Wikidata5M(TarFileRemoteDataset):
    """The Wikidata5M dataset from [wang2019]_.

    The class only configures :class:`TarFileRemoteDataset` with the URL of the
    transductive archive and the relative paths of the training, testing, and
    validation files.

    ---
    name: Wikidata5M
    statistics:
        entities: 4594149
        relations: 822
        training: 20614279
        testing: 4977
        validation: 4983
        triples: 20624239
    citation:
        author: Wang
        year: 2019
        arxiv: 1911.06136
        link: https://arxiv.org/abs/1911.06136
    """

    def __init__(self, create_inverse_triples: bool = False, **kwargs):
        """Initialize the Wikidata5M dataset.

        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        :param kwargs: keyword arguments passed to :class:`pykeen.datasets.base.TarFileRemoteDataset`.
        """
        # NOTE(review): TRANSDUCTIVE_URL is expected to be a module-level
        # constant; its definition is not visible in this excerpt -- confirm
        # it exists before relying on this class.
        super().__init__(
            url=TRANSDUCTIVE_URL,
            # The three split files are addressed relative to the archive.
            relative_training_path=pathlib.PurePath("wikidata5m_transductive_train.txt"),
            relative_testing_path=pathlib.PurePath("wikidata5m_transductive_test.txt"),
            relative_validation_path=pathlib.PurePath("wikidata5m_transductive_valid.txt"),
            create_inverse_triples=create_inverse_triples,
            **kwargs,
        )
if __name__ == "__main__":
    # Script entry point: delegate to the command line interface exposed by
    # the dataset class (used by ``python -m pykeen.datasets.wikidata5m``).
    Wikidata5M.cli()