# Source code for pykeen.datasets.dbpedia

# -*- coding: utf-8 -*-

"""The DBpedia datasets from [shi2017b]_.

- GitHub Repository: https://github.com/bxshi/ConMask
- Paper: https://arxiv.org/abs/1711.03438
"""

from .base import UnpackedRemoteDataset

# Names exported by ``from <this module> import *``; only the dataset class is public.
__all__ = [
    'DBpedia50',
]

# Common URL prefix under which the three DBpedia50 split files are hosted.
BASE = 'https://raw.githubusercontent.com/ZhenfengLei/KGDatasets/master/DBpedia50'
# One download URL per split, formed by appending the split's file name to BASE.
TRAIN_URL = BASE + '/train.txt'
VALID_URL = BASE + '/valid.txt'
TEST_URL = BASE + '/test.txt'


class DBpedia50(UnpackedRemoteDataset):
    """The DBpedia50 dataset from [shi2017b]_.

    The training, testing, and validation triples are fetched from the module-level
    ``TRAIN_URL``, ``TEST_URL``, and ``VALID_URL`` locations respectively.
    """

    def __init__(self, create_inverse_triples: bool = False, **kwargs):
        """Initialize the DBpedia50 small dataset from [shi2017b]_.

        :param create_inverse_triples: Should inverse triples be created? Defaults to false.
        :param kwargs: keyword arguments passed to :class:`pykeen.datasets.base.UnpackedRemoteDataset`.
            If ``stream`` is not given explicitly, it is set to ``False`` before being forwarded.
        """
        # GitHub's raw.githubusercontent.com service rejects requests that are streamable. This is
        # normally the default for all of PyKEEN's remote datasets, so just switch the default here.
        # ``setdefault`` keeps an explicit ``stream=...`` from the caller untouched.
        kwargs.setdefault('stream', False)
        super().__init__(
            training_url=TRAIN_URL,
            testing_url=TEST_URL,
            validation_url=VALID_URL,
            create_inverse_triples=create_inverse_triples,
            load_triples_kwargs={
                # as pointed out in https://github.com/pykeen/pykeen/issues/275#issuecomment-776412294,
                # the columns are not ordered properly.
                # NOTE(review): presumably [0, 2, 1] swaps the second and third columns of each
                # row into the order the triple loader expects -- confirm against the loader.
                'column_remapping': [0, 2, 1],
            },
            **kwargs,
        )


if __name__ == '__main__':
    # When executed as a script, build the dataset with its defaults and call its summary method.
    dataset = DBpedia50()
    dataset.summarize()