"""Dataset analysis utilities."""
import logging
from collections.abc import Collection
from typing import Callable, Optional, Union
import pandas as pd
import torch
from .base import Dataset
from ..constants import PYKEEN_DATASETS
from ..triples import analysis as triple_analysis
from ..typing import MappedTriples
# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)

# Public API of this module; ``get_relation_injectivity_df`` is intentionally(?) absent —
# NOTE(review): confirm whether it should be exported here.
__all__ = [
    "get_relation_count_df",
    "get_entity_count_df",
    "get_entity_relation_co_occurrence_df",
    "get_relation_functionality_df",
    # relation typing
    "get_relation_pattern_types_df",
    "get_relation_cardinality_types_df",
]

# constants
#: Name of the dataframe column holding the subset (e.g. training/validation/testing) a row came from.
SUBSET_COLUMN_NAME = "subset"
def _get_mapped_triples(dataset: Dataset, parts: Collection[str]) -> Collection[tuple[int, int, int]]:
return torch.cat([dataset.factory_dict[part].mapped_triples for part in parts], dim=0).tolist()
def _normalize_parts(dataset: Dataset, parts: Union[None, str, Collection[str]]) -> Collection[str]:
if parts is None:
parts = dataset.factory_dict.keys()
elif isinstance(parts, str):
parts = [parts]
# unique
return list(set(parts))
def _common(
    dataset: Dataset,
    triple_func: Callable[[MappedTriples], pd.DataFrame],
    merge_sides: bool = True,
    merge_subsets: bool = True,
    add_labels: bool = True,
) -> pd.DataFrame:
    """
    Execute a triple-level analysis function over every subset of a dataset and aggregate the counts.

    :param dataset:
        The dataset.
    :param triple_func:
        The analysis function applied to each subset's mapped triples.
    :param merge_sides:
        Whether to merge sides, i.e., entity positions: head vs. tail.
    :param merge_subsets:
        Whether to merge subsets, i.e., train/validation/test.
    :param add_labels:
        Whether to add entity / relation labels.
    :return:
        An aggregated dataframe.
    """
    # Run the analysis on each subset and tag the resulting rows with the subset name.
    frames = []
    for subset_name, triples_factory in dataset.factory_dict.items():
        part_df = triple_func(triples_factory.mapped_triples)
        part_df[SUBSET_COLUMN_NAME] = subset_name
        frames.append(part_df)
    df = pd.concat(frames, ignore_index=True)

    # Build the grouping key: ID columns always participate; position / subset columns only
    # when not merged away. Columns absent from the frame are skipped.
    candidate_keys = [
        triple_analysis.ENTITY_ID_COLUMN_NAME,
        triple_analysis.RELATION_ID_COLUMN_NAME,
    ]
    if not merge_sides:
        candidate_keys.append(triple_analysis.ENTITY_POSITION_COLUMN_NAME)
    if not merge_subsets:
        candidate_keys.append(SUBSET_COLUMN_NAME)
    group_key = [key for key in candidate_keys if key in df.columns]

    # Sum the counts within each group.
    df = df.groupby(by=group_key)[triple_analysis.COUNT_COLUMN_NAME].sum().reset_index()

    # Attach human-readable labels for whichever ID columns are present.
    if add_labels:
        if triple_analysis.ENTITY_ID_COLUMN_NAME in df.columns:
            df = triple_analysis.add_entity_labels(
                df=df,
                add_labels=add_labels,
                label_to_id=dataset.entity_to_id,
            )
        if triple_analysis.RELATION_ID_COLUMN_NAME in df.columns:
            df = triple_analysis.add_relation_labels(
                df=df,
                add_labels=add_labels,
                label_to_id=dataset.relation_to_id,
            )
    return df
def get_relation_count_df(
    dataset: Dataset,
    merge_subsets: bool = True,
    add_labels: bool = True,
) -> pd.DataFrame:
    """Create a dataframe with relation counts.

    :param dataset:
        The dataset.
    :param merge_subsets:
        Whether to merge subsets, i.e., train/validation/test.
    :param add_labels:
        Whether to add relation labels to the dataframe.
    :return:
        A dataframe with columns (relation_id, count, relation_label?, subset?)
    """
    # delegate to the shared aggregation helper with the relation-count analysis
    return _common(
        dataset=dataset,
        triple_func=triple_analysis.get_relation_counts,
        merge_subsets=merge_subsets,
        add_labels=add_labels,
    )
def get_entity_count_df(
    dataset: Dataset,
    merge_sides: bool = True,
    merge_subsets: bool = True,
    add_labels: bool = True,
) -> pd.DataFrame:
    """Create a dataframe with entity counts.

    :param dataset:
        The dataset.
    :param merge_sides:
        Whether to merge sides, i.e., entity positions: head vs. tail.
    :param merge_subsets:
        Whether to merge subsets, i.e., train/validation/test.
    :param add_labels:
        Whether to add entity / relation labels.
    :return:
        A dataframe with one row per entity.
    """
    # forward all aggregation options to the shared helper
    options = dict(
        merge_sides=merge_sides,
        merge_subsets=merge_subsets,
        add_labels=add_labels,
    )
    return _common(dataset=dataset, triple_func=triple_analysis.get_entity_counts, **options)
def get_entity_relation_co_occurrence_df(
    dataset: Dataset,
    merge_sides: bool = True,
    merge_subsets: bool = True,
    add_labels: bool = True,
) -> pd.DataFrame:
    """Create a dataframe of entity/relation co-occurrence.

    This information can be seen as a form of pseudo-typing, e.g. entity A is something which can be a head of
    `born_in`.

    :param dataset:
        The dataset.
    :param merge_sides:
        Whether to merge sides, i.e., entity positions: head vs. tail.
    :param merge_subsets:
        Whether to merge subsets, i.e., train/validation/test.
    :param add_labels:
        Whether to add entity / relation labels.
    :return:
        A dataframe of entity-relation pairs with their occurrence count.
    """
    # co-occurrence counting is delegated to the shared per-subset aggregation helper
    analysis_func = triple_analysis.entity_relation_co_occurrence
    return _common(
        dataset=dataset,
        triple_func=analysis_func,
        merge_sides=merge_sides,
        merge_subsets=merge_subsets,
        add_labels=add_labels,
    )
def get_relation_pattern_types_df(
    dataset: Dataset,
    *,
    min_support: int = 0,
    min_confidence: float = 0.95,
    drop_confidence: bool = False,
    parts: Optional[Collection[str]] = None,
    force: bool = False,
    add_labels: bool = True,
) -> pd.DataFrame:
    r"""
    Categorize relations based on patterns from RotatE [sun2019]_.

    The relation classifications are based upon checking whether the corresponding rules hold with sufficient support
    and confidence. By default, we do not require a minimum support, however, a relatively high confidence.

    The following four non-exclusive classes for relations are considered:

    - symmetry
    - anti-symmetry
    - inversion
    - composition

    This method generally follows the terminology of association rule mining. The patterns are expressed as

    .. math ::
        X_1 \land \cdot \land X_k \implies Y

    where $X_i$ is of the form $r_i(h_i, t_i)$, and some of the $h_i / t_i$ might re-occur in other atoms.
    The *support* of a pattern is the number of distinct instantiations of all variables for the left hand side.
    The *confidence* is the proportion of these instantiations where the right-hand side is also true.

    :param dataset:
        The dataset to investigate.
    :param min_support:
        A minimum support for patterns.
    :param min_confidence:
        A minimum confidence for the tested patterns.
    :param drop_confidence:
        Whether to drop the support/confidence information from the result frame, and also drop duplicates.
    :param parts:
        Only use certain parts of the dataset, e.g., train triples. Defaults to using all triples, i.e.
        {"training", "validation", "testing"}.
    :param force:
        Whether to enforce re-calculation even if a cached version is available.
    :param add_labels:
        Whether to add relation labels (if available).

    .. warning ::
        If you intend to use the relation categorization as input to your model, or hyper-parameter selection, do *not*
        include testing triples to avoid leakage!

    :return:
        A dataframe with columns {"relation_id", "pattern", "support"?, "confidence"?}.
    """
    # TODO: Merge with _common?
    parts = _normalize_parts(dataset, parts)
    mapped_triples = _get_mapped_triples(dataset, parts)
    # include hash over the selected triples in the cache-file name, so that different part
    # selections do not collide
    ph = triple_analysis.triple_set_hash(mapped_triples=mapped_triples)[:16]
    cache_path = PYKEEN_DATASETS.joinpath(dataset.__class__.__name__.lower(), f"relation_patterns_{ph}.tsv.xz")
    # re-use cached file if possible
    if not cache_path.is_file() or force:
        # the triples were already selected above; no need to re-concatenate them
        df = triple_analysis.relation_pattern_types(mapped_triples=mapped_triples)
        # save to file
        cache_path.parent.mkdir(exist_ok=True, parents=True)
        df.to_csv(cache_path, sep="\t", index=False)
        logger.info(f"Cached {len(df)} relational pattern entries to {cache_path.as_uri()}")
    else:
        df = pd.read_csv(cache_path, sep="\t")
        logger.info(f"Loaded {len(df)} precomputed relational patterns from {cache_path.as_uri()}")
    # Prune by support and confidence
    sufficient_support = df[triple_analysis.SUPPORT_COLUMN_NAME] >= min_support
    sufficient_confidence = df[triple_analysis.CONFIDENCE_COLUMN_NAME] >= min_confidence
    df = df[sufficient_support & sufficient_confidence]
    if drop_confidence:
        df = df[[triple_analysis.RELATION_ID_COLUMN_NAME, triple_analysis.PATTERN_TYPE_COLUMN_NAME]].drop_duplicates()
    return triple_analysis.add_relation_labels(
        df=df,
        add_labels=add_labels,
        label_to_id=dataset.relation_to_id,
    )
def get_relation_cardinality_types_df(
    *,
    dataset: Dataset,
    parts: Optional[Collection[str]] = None,
    add_labels: bool = True,
) -> pd.DataFrame:
    r"""
    Determine the relation cardinality types.

    The possible types are given in relation_cardinality_types.

    .. note ::
        In the current implementation, we have by definition

        .. math ::
            1 = \sum_{type} conf(relation, type)

    .. note ::
        These relation types are also mentioned in [wang2014]_. However, the paper does not provide any details on
        their definition, nor is any code provided. Thus, their exact procedure is unknown and may not coincide with
        this implementation.

    :param dataset:
        The dataset to investigate.
    :param parts:
        Only use certain parts of the dataset, e.g., train triples. Defaults to using all triples, i.e.
        {"training", "validation", "testing"}.
    :param add_labels:
        Whether to add relation labels (if available).
    :return:
        A dataframe with columns ( relation_id | relation_type )
    """
    # TODO: Consider merging with other analysis methods
    subset_names = _normalize_parts(dataset=dataset, parts=parts)
    triples = _get_mapped_triples(dataset=dataset, parts=subset_names)
    return triple_analysis.relation_cardinality_types(
        mapped_triples=triples,
        add_labels=add_labels,
        label_to_id=dataset.relation_to_id,
    )
def get_relation_injectivity_df(
    *,
    dataset: Dataset,
    parts: Optional[Collection[str]] = None,
    add_labels: bool = True,
) -> pd.DataFrame:
    """
    Calculate "soft" injectivity scores for each relation.

    :param dataset:
        The dataset to investigate.
    :param parts:
        Only use certain parts of the dataset, e.g., train triples. Defaults to using all triples, i.e.
        {"training", "validation", "testing"}.
    :param add_labels:
        Whether to add relation labels (if available).
    :return:
        A dataframe with one row per relation, its number of occurrences and head / tail injectivity scores.
    """
    # TODO: Consider merging with other analysis methods
    # normalize the subset selection, gather the corresponding triples, and delegate to the triple-level analysis
    return triple_analysis.relation_injectivity(
        mapped_triples=_get_mapped_triples(dataset=dataset, parts=_normalize_parts(dataset=dataset, parts=parts)),
        add_labels=add_labels,
        label_to_id=dataset.relation_to_id,
    )
def get_relation_functionality_df(
    *,
    dataset: Dataset,
    parts: Optional[Collection[str]] = None,
    add_labels: bool = True,
) -> pd.DataFrame:
    """
    Calculate the functionality and inverse functionality score per relation.

    The (inverse) functionality was proposed in [wang2018]_. It is defined as the number of unique head (tail) entities
    divided by the number of triples in which the relation occurs. Thus, its value range is [0, 1]. Smaller values
    indicate that entities usually have more than one outgoing (incoming) triple with the corresponding relation type.
    Hence, the score is related to the relation cardinality types.

    :param dataset:
        The dataset to investigate.
    :param parts:
        Only use certain parts of the dataset, e.g., train triples. Defaults to using all triples, i.e.
        {"training", "validation", "testing"}.
    :param add_labels:
        Whether to add relation labels (if available).
    :return:
        A dataframe with columns (relation_id | functionality | inverse functionality)

    .. [wang2018]
        Wang, Z., *et al.* (2018). `Cross-lingual Knowledge Graph Alignment via Graph Convolutional Networks
        <https://doi.org/10.18653/v1/D18-1032>`_. Proceedings of the 2018 Conference on Empirical Methods in
        Natural Language Processing, 349–357.
    """
    # TODO: Consider merging with other analysis methods
    parts = _normalize_parts(dataset=dataset, parts=parts)
    mapped_triples = _get_mapped_triples(dataset=dataset, parts=parts)
    # pass mapped_triples by keyword for consistency with the sibling analysis wrappers
    return triple_analysis.get_relation_functionality(
        mapped_triples=mapped_triples,
        add_labels=add_labels,
        label_to_id=dataset.relation_to_id,
    )