Source code for pykeen.nn.node_piece.representations

# -*- coding: utf-8 -*-

"""Representation modules for NodePiece."""

import logging
from typing import Callable, Optional, Sequence, Union

import torch
from class_resolver import HintOrType, OneOrManyHintOrType, OneOrManyOptionalKwargs, OptionalKwargs
from class_resolver.contrib.torch import aggregation_resolver

from .tokenization import Tokenizer, tokenizer_resolver
from ..representation import Representation
from ...triples import CoreTriplesFactory
from ...typing import MappedTriples, OneOrSequence
from ...utils import broadcast_upgrade_to_sequences

__all__ = [
    "TokenizationRepresentation",
    "NodePieceRepresentation",
]

logger = logging.getLogger(__name__)


class TokenizationRepresentation(Representation):
    """A module holding the result of tokenization."""

    #: the size of the token vocabulary, including the padding token
    vocabulary_size: int

    #: the token representations
    vocabulary: Representation

    #: the assigned tokens for each entity
    assignment: torch.LongTensor

    def __init__(
        self,
        assignment: torch.LongTensor,
        token_representation: HintOrType[Representation] = None,
        token_representation_kwargs: OptionalKwargs = None,
        **kwargs,
    ) -> None:
        """
        Initialize the tokenization.

        :param assignment: shape: `(n, num_chosen_tokens)`
            the token assignment.
        :param token_representation: shape: `(num_total_tokens, *shape)`
            the token representations
        :param token_representation_kwargs:
            additional keyword-based parameters
        :param kwargs:
            additional keyword-based parameters passed to super.__init__
        :raises ValueError:
            if there is a mismatch between the representation size and the vocabulary size
        """
        # needs to be lazily imported to avoid cyclic imports
        from .. import representation_resolver

        # fill padding (nn.Embedding cannot deal with negative indices)
        padding = assignment < 0

        # sometimes, assignment.max() does not cover all relations (e.g., inductive inference graphs
        # contain a subset of training relations) - for that, the padding index is the last index of the Representation
        self.vocabulary_size = (
            token_representation.max_id
            if isinstance(token_representation, Representation)
            else assignment.max().item() + 2  # exclusive (+1) and including padding (+1)
        )

        assignment[padding] = self.vocabulary_size - 1  # = assignment.max().item() + 1
        max_id, num_chosen_tokens = assignment.shape

        # resolve token representation
        token_representation = representation_resolver.make(
            token_representation,
            token_representation_kwargs,
            max_id=self.vocabulary_size,
        )
        super().__init__(max_id=max_id, shape=(num_chosen_tokens,) + token_representation.shape, **kwargs)

        # input validation
        if token_representation.max_id < self.vocabulary_size:
            raise ValueError(
                f"The token representations only contain {token_representation.max_id} representations, "
                f"but there are {self.vocabulary_size} tokens in use.",
            )
        elif token_representation.max_id > self.vocabulary_size:
            logger.warning(
                f"The token representations contain more representations ({token_representation.max_id}) "
                f"than tokens are used ({self.vocabulary_size}).",
            )

        # register as buffer
        self.register_buffer(name="assignment", tensor=assignment)

        # assign sub-module
        self.vocabulary = token_representation
    @classmethod
    def from_tokenizer(
        cls,
        tokenizer: Tokenizer,
        num_tokens: int,
        mapped_triples: MappedTriples,
        num_entities: int,
        num_relations: int,
        token_representation: HintOrType[Representation] = None,
        token_representation_kwargs: OptionalKwargs = None,
        **kwargs,
    ) -> "TokenizationRepresentation":
        """
        Create a tokenization by applying a tokenizer.

        :param tokenizer:
            the tokenizer instance.
        :param num_tokens:
            the number of tokens to select for each entity.
        :param mapped_triples:
            the ID-based triples
        :param num_entities:
            the number of entities
        :param num_relations:
            the number of relations
        :param token_representation:
            the pre-instantiated token representations, or a hint for creating them
        :param token_representation_kwargs:
            additional keyword-based parameters
        :param kwargs:
            additional keyword-based parameters passed to TokenizationRepresentation.__init__
        :return:
            A tokenization representation obtained by applying the tokenizer
        """
        # apply tokenizer
        vocabulary_size, assignment = tokenizer(
            mapped_triples=mapped_triples,
            num_tokens=num_tokens,
            num_entities=num_entities,
            num_relations=num_relations,
        )
        return TokenizationRepresentation(
            assignment=assignment,
            token_representation=token_representation,
            token_representation_kwargs=token_representation_kwargs,
            **kwargs,
        )
    def extra_repr(self) -> str:  # noqa: D102
        return "\n".join(
            (
                f"max_id={self.assignment.shape[0]},",
                f"num_tokens={self.assignment.shape[1]},",
                f"vocabulary_size={self.vocabulary_size},",
            )
        )
    def _plain_forward(
        self,
        indices: Optional[torch.LongTensor] = None,
    ) -> torch.FloatTensor:  # noqa: D102
        # get token IDs, shape: (*, num_chosen_tokens)
        token_ids = self.assignment
        if indices is not None:
            token_ids = token_ids[indices]

        # lookup token representations, shape: (*, num_chosen_tokens, *shape)
        return self.vocabulary(token_ids)
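
# Illustrative usage sketch (not part of the original module): wrapping a pre-computed token
# assignment in a TokenizationRepresentation. The assignment values and the embedding size of 8
# are assumptions for this example; by default, the token representation is resolved to an
# Embedding over the inferred vocabulary (including the padding token).
def _tokenization_representation_example() -> torch.FloatTensor:
    """Return token representations for two entities (illustrative sketch only)."""
    # three entities, two tokens each; -1 marks padding
    assignment = torch.as_tensor([[0, 1], [1, 2], [2, -1]])
    tokenization = TokenizationRepresentation(
        assignment=assignment,
        token_representation_kwargs=dict(shape=8),  # assumed embedding size for the example
    )
    # lookup for entities 0 and 2, shape: (2, num_tokens=2, 8)
    return tokenization(indices=torch.as_tensor([0, 2]))
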
class NodePieceRepresentation(Representation):
    r"""
    Basic implementation of node piece decomposition [galkin2021]_.

    .. math ::
        x_e = agg(\{T[t] \mid t \in tokens(e) \})

    where $T$ are token representations, $tokens$ selects a fixed number of $k$ tokens for each entity, and $agg$ is
    an aggregation function, which aggregates the individual token representations to a single entity representation.

    .. note ::
        This implementation currently only supports representation of entities by bag-of-relations.
    """

    #: the token representations
    token_representations: Sequence[TokenizationRepresentation]

    def __init__(
        self,
        *,
        triples_factory: CoreTriplesFactory,
        token_representations: OneOrManyHintOrType[Representation] = None,
        token_representations_kwargs: OneOrManyOptionalKwargs = None,
        tokenizers: OneOrManyHintOrType[Tokenizer] = None,
        tokenizers_kwargs: OneOrManyOptionalKwargs = None,
        num_tokens: OneOrSequence[int] = 2,
        aggregation: Union[None, str, Callable[[torch.FloatTensor, int], torch.FloatTensor]] = None,
        max_id: Optional[int] = None,
        shape: Optional[Sequence[int]] = None,
        **kwargs,
    ):
        """
        Initialize the representation.

        :param triples_factory:
            the triples factory
        :param token_representations:
            the token representation specification, or pre-instantiated representation module.
        :param token_representations_kwargs:
            additional keyword-based parameters
        :param tokenizers:
            the tokenizer to use, cf. `pykeen.nn.node_piece.tokenizer_resolver`.
        :param tokenizers_kwargs:
            additional keyword-based parameters passed to the tokenizer upon construction.
        :param num_tokens:
            the number of tokens for each entity.
        :param aggregation:
            aggregation of multiple token representations to a single entity representation. By default,
            this uses :func:`torch.mean`. If a string is provided, the module assumes that this refers to a top-level
            torch function, e.g. "mean" for :func:`torch.mean`, or "sum" for :func:`torch.sum`. An aggregation can
            also have trainable parameters, e.g., ``MLP(mean(MLP(tokens)))`` (cf. DeepSets from [zaheer2017]_). In
            this case, the module has to be created outside of this component.

            We could also have aggregations which result in differently shaped output, e.g. a concatenation of all
            token embeddings resulting in shape ``(num_tokens * d,)``. In this case, `shape` must be provided.

            The aggregation takes two arguments: the (batched) tensor of token representations, in shape
            ``(*, num_tokens, *dt)``, and the index along which to aggregate.
        :param shape:
            the shape of an individual representation. Only necessary if the aggregation results in a change of
            dimensions; this will only be the case if the aggregation is an *ad hoc* function.
        :param max_id:
            Only pass this to check if the number of entities in the triples factory is the same
        :param kwargs:
            additional keyword-based parameters passed to super.__init__
        :raises ValueError:
            if the shapes for any vocabulary entry in all token representations are inconsistent
        """
        if max_id:
            assert max_id == triples_factory.num_entities

        # normalize triples
        mapped_triples = triples_factory.mapped_triples
        if triples_factory.create_inverse_triples:
            # inverse triples are created afterwards implicitly
            mapped_triples = mapped_triples[mapped_triples[:, 1] < triples_factory.real_num_relations]

        token_representations, token_representations_kwargs, num_tokens = broadcast_upgrade_to_sequences(
            token_representations, token_representations_kwargs, num_tokens
        )

        # tokenize
        token_representations = [
            TokenizationRepresentation.from_tokenizer(
                tokenizer=tokenizer_inst,
                num_tokens=num_tokens_,
                token_representation=token_representation,
                token_representation_kwargs=token_representation_kwargs,
                mapped_triples=mapped_triples,
                num_entities=triples_factory.num_entities,
                num_relations=triples_factory.real_num_relations,
            )
            for tokenizer_inst, token_representation, token_representation_kwargs, num_tokens_ in zip(
                tokenizer_resolver.make_many(queries=tokenizers, kwargs=tokenizers_kwargs),
                token_representations,
                token_representations_kwargs,
                num_tokens,
            )
        ]

        # determine shape
        if shape is None:
            shapes = {t.vocabulary.shape for t in token_representations}
            if len(shapes) != 1:
                raise ValueError(f"Inconsistent token shapes: {shapes}")
            shape = list(shapes)[0]

        # super init; has to happen *before* any parameter or buffer is assigned
        super().__init__(max_id=triples_factory.num_entities, shape=shape, **kwargs)

        # assign module
        self.token_representations = torch.nn.ModuleList(token_representations)

        # assign default aggregation
        self.aggregation = aggregation_resolver.lookup(aggregation)
        self.aggregation_index = -(1 + len(shape))
    def extra_repr(self) -> str:  # noqa: D102
        aggregation_str = self.aggregation.__name__ if hasattr(self.aggregation, "__name__") else str(self.aggregation)
        return f"aggregation={aggregation_str}, "
    def _plain_forward(
        self,
        indices: Optional[torch.LongTensor] = None,
    ) -> torch.FloatTensor:  # noqa: D102
        return self.aggregation(
            torch.cat(
                [tokenization(indices=indices) for tokenization in self.token_representations],
                dim=self.aggregation_index,
            ),
            self.aggregation_index,
        )
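
# Illustrative usage sketch (not part of the original module): assembling a NodePieceRepresentation
# from a triples factory. The dataset choice (Nations) and the embedding size of 8 are assumptions
# for this example; with the defaults, a single relation-based tokenizer is used and the token
# representations are aggregated with torch.mean.
def _node_piece_representation_example() -> torch.FloatTensor:
    """Return entity representations for a small dataset (illustrative sketch only)."""
    from pykeen.datasets import Nations  # assumed example dataset

    triples_factory = Nations().training
    entity_representation = NodePieceRepresentation(
        triples_factory=triples_factory,
        token_representations_kwargs=dict(shape=8),  # assumed embedding size for the example
        num_tokens=2,
    )
    # mean-aggregated representations for all entities, shape: (num_entities, 8)
    return entity_representation()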