Source code for spekk.spec

"""Module containing the :class:`Spec` class — the most important component of the 
``spekk`` library."""

from functools import reduce
from typing import Dict, Optional, Sequence, Set, Union

import spekk.trees.core as trees
from spekk.trees import Tree, TreeLens, leaves, register_dispatch_fn, traverse, treedef
from spekk.trees.registry import Tree


def _is_spec_leaf(tree: Optional[Tree]):
    """A Spec-tree is a leaf if it is None or a sequence of strings.

    >>> _is_spec_leaf(None)
    True
    >>> _is_spec_leaf(["a", "b"])
    True

    Anything else is not a leaf, in the following case a list of list of strings:
    >>> _is_spec_leaf([["a"], ["c"]])
    False
    """
    if tree is None:
        return True
    if isinstance(tree, Sequence) and all(isinstance(x, str) for x in tree):
        return True
    if isinstance(tree, Spec):
        return _is_spec_leaf(tree.tree)
    return False


[docs] class Spec(TreeLens): """In a nested tree of arrays, a Spec describes the dimensions of the arrays. Spec is a subclass of :class:`TreeLens` which takes the ``tree`` as an argument when constructing an object. The tree of a Spec is a nested data-structure consisting of dictionaries and sequences, where the leaves are sequences of strings. An example of a Spec is as follows: >>> spec = Spec({"foo": ["a", "b"], "bar": ["b"]}) The above ``spec`` describes a dictionary of arrays. As data, it could look something like this: >>> import numpy as np >>> data = {"foo": np.ones([2, 3]), "bar": np.ones([3])} Note that the structure of the ``spec`` mirrors the structure of the data, but where each array has been replaced with a list of strings, representing the dimensions of the arrays. Note also that the second dimension of the ``"foo"`` array share the same name as the first dimension of the ``"bar"`` array, meaning that they are semantically the same dimension. This is better understood with a more concrete example: >>> spec = Spec({"image": ["batch", "width", "height", "channels"], ... "caption": ["batch", "tokens"]}) In the above example, both the ``"image"`` and the ``"caption"`` has the same ``"batch"`` dimension so we know that if we loop over the batch-items we must loop over both the images and captions. """
[docs] def is_leaf(self, tree: Optional[Tree] = "_NOT_GIVEN") -> bool: """Return True if this spec object represents the dimensions of an array (i.e.: not a nested data-structure of arrays). May optionally be called as a static method where ``self`` is a tree, or with an explicitly given tree. See also: func:`._is_spec_leaf`). """ if tree == "_NOT_GIVEN": # `None` has a semantic meaning if isinstance(self, Spec): return _is_spec_leaf(self.tree) else: # is_leaf was called as a static method return _is_spec_leaf(self) else: return _is_spec_leaf(tree)
[docs] def remove_dimension( self, dimension: Union[str, Sequence[str]], path: Sequence = (), ) -> "Spec": """Remove the given dimension from everywhere in the spec. >>> spec = Spec({"signal": ["transmits", "receivers"], ... "receiver": {"position": ["receivers"], "direction": []}}) >>> spec.remove_dimension("receivers") Spec({'signal': ['transmits'], 'receiver': {'position': [], 'direction': []}}) You can also remove multiple dimensions at once: >>> spec.remove_dimension(["transmits", "receivers"]) Spec({'signal': [], 'receiver': {'position': [], 'direction': []}}) """ state = self.get(path) if isinstance(dimension, (list, tuple, set)): for dim in dimension: state = state.remove_dimension(dim, path) return state for leaf in leaves(state.tree, self.is_leaf): if dimension in leaf.value: state = state.set([x for x in leaf.value if x != dimension], leaf.path) return state
[docs] def index_for(self, dimension: str, path: Sequence = ()) -> Tree: """Return the indices of the given dimension in the spec with the same structure as the spec. >>> spec = Spec({"signal": ["transmits", "receivers"], ... "receiver": {"position": ["receivers"], "direction": []}}) >>> spec.index_for("receivers") {'signal': 1, 'receiver': {'position': 0, 'direction': None}} """ state = self.get(path).tree for leaf in leaves(state, self.is_leaf): index = ( leaf.value.index(dimension) if (leaf.value is not None and dimension in leaf.value) else None ) state = trees.set(state, index, leaf.path) return state
@property def dimensions(self) -> Set[str]: """Return all dimensions in the spec. >>> spec = Spec({"signal": ["transmits", "receivers"], ... "receiver": {"position": ["receivers"], "direction": []}, ... "point_position": ["transmits", "points"]}) >>> sorted(spec.dimensions) ['points', 'receivers', 'transmits'] """ return reduce( lambda dims, leaf: dims.union(leaf.value if leaf.value is not None else []), leaves(self.tree, self.is_leaf), set(), )
[docs] def has_dimension(self, *dimensions: str) -> bool: """Return True if the spec has the given dimension(s). >>> spec = Spec({"signal": ["transmits", "receivers"], ... "receiver": {"position": ["receivers"], "direction": []}}) >>> spec.has_dimension("transmits", "receivers") True >>> spec.has_dimension("frames", "transmits", "receivers") False """ return all(dim in self.dimensions for dim in dimensions)
[docs] def add_dimension( self, dimension: str, path: Sequence = (), index: int = 0 ) -> "Spec": """Add the dimension to the list of dimensions at the specified path and at the specified index in the list. >>> spec = Spec({"foo": {"baz": ["a", "b"]}, "bar": ["b"]}) >>> spec.add_dimension("c", ["foo", "baz"], 0) Spec({'foo': {'baz': ['c', 'a', 'b']}, 'bar': ['b']}) >>> spec.add_dimension("c", ["foo", "baz"], 1) Spec({'foo': {'baz': ['a', 'c', 'b']}, 'bar': ['b']}) >>> spec.add_dimension("c", ["bar"], 0) Spec({'foo': {'baz': ['a', 'b']}, 'bar': ['c', 'b']}) """ current_dims = self.get(path) current_dims = current_dims.tree if current_dims is not None else [] if not self.is_leaf(current_dims): raise ValueError( f"The provided path does not lead to a dimensions definition. \ Dimensions must be a list of strings, but got {current_dims} at the path {path}." ) new_dims = [*current_dims[:index], dimension, *current_dims[index:]] return self.set(new_dims, path)
[docs] def replace(self, replacements: Tree) -> "Spec": """Update the spec by replacing subtrees with corresponding subtrees in the replacements tree. * A value of None in the replacements tree removes the subtree at the corresponding path. * A leaf (list of dimensions, see Spec.is_leaf) in the replacements tree always replaces the leaf (or subtree) at the corresponding path. * Keys present in the replacements tree but not in the spec are added to the spec at the corresponding path. >>> spec = Spec({"foo": {"baz": ["a", "b"]}, "bar": ["b"]}) Replacing a path with None removes the subtree at that path: >>> spec.replace({"foo": None}) Spec({'bar': ['b']}) Removing a subtree such that its parent becomes an empty collection also removes the parent: >>> spec.replace({"foo": {"baz": None}}) Spec({'bar': ['b']}) Replacing an existing path with a list of dimensions overwrites the path: >>> spec.replace({"foo": ["c"]}) Spec({'foo': ['c'], 'bar': ['b']}) Other than that, it is assumed that the ``replacements`` tree structure mirrors the spec structure: >>> spec.replace({"foo": {"baz": ["c"]}}) Spec({'foo': {'baz': ['c']}, 'bar': ['b']}) """ state = self.tree for replacement in traverse(replacements, self.is_leaf): if replacement.value is None: state = trees.remove(state, replacement.path) elif replacement.is_leaf or self.is_leaf( trees.get(state, replacement.path, None) ): state = trees.set(state, replacement.value, replacement.path) else: replacement_value = trees.filter( replacement.value, self.is_leaf, lambda tree: tree is not None, ) state = trees.update( state, # Current value takes presedence over replacement value in order to # preserve replace semantics. lambda current_value: trees.merge( replacement_value, current_value, "last" ), replacement.path, ) return Spec(state).prune_empty_branches()
[docs] def validate(self, data: Tree): """Validate that the data conforms to the spec, raising a :class:`~spekk.validation.ValidationError` if not. See also: :func:`~spekk.validation.validate` """ from spekk.util.validation import validate validate(self, data)
[docs] def size( self, data: Tree, dimension: Optional[str] = None, ) -> Union[int, Dict[str, int]]: """Get the size of dimensions (or a single dimension) in the data. >>> import numpy as np >>> spec = Spec({"signal": ["transmits", "receivers"], ... "receiver": {"position": ["receivers"], "direction": []}}) >>> data = {"signal": np.random.randn(10, 20), ... "receiver": {"position": np.random.randn(20, 3), ... "direction": np.random.randn(20, 3)}} >>> spec.size(data) == {'transmits': 10, 'receivers': 20} True >>> spec.size(data, "transmits") 10 >>> spec.size(data, "receivers") 20 """ from spekk import util if dimension is None: return {dim: self.size(data, dim) for dim in self.dimensions} if not self.has_dimension(dimension): raise ValueError(f"Spec does not contain the dimension {dimension}.") indices_tree = self.index_for(dimension) for leaf in leaves(indices_tree, lambda x: isinstance(x, int) or x is None): if leaf.value is not None and trees.has_path(data, leaf.path): # Assume that all data with the same dimension has the same size, so we # just return the first one we find. return util.shape(trees.get(data, leaf.path))[leaf.value]
def __fastmath_keys__(self): return trees.treedef(self.tree).keys() def __fastmath_children__(self): return trees.treedef(self.tree).values() def __fastmath_create__(self, keys: Sequence, children: Sequence): return Spec(trees.treedef(self.tree).create(keys, children)) def __hash__(self): return hash(self.tree) def __eq__(self, other) -> bool: """Return True if the specs are equal. >>> spec = Spec({"signal": ["transmits", "receivers"]}) >>> spec == Spec({"signal": ["transmits", "receivers"]}) True >>> spec == Spec({"signal": ["frames", "transmits", "receivers"]}) False >>> spec == Spec({"signal": ["transmits", "receivers"], "foo": ["bar"]}) False >>> spec == Spec({"foo": ["bar"]}) False """ if not isinstance(other, Spec): return False else: for subtree in traverse(other.tree, self.is_leaf): if subtree.is_leaf: if not self.has_subtree(subtree.path): return False if len(subtree.value) != len(self.get(subtree.path)): return False for d1, d2 in zip(subtree.value, self.get(subtree.path)): if isinstance(d2, Spec): d2 = d2.tree if d1 != d2: return False else: if ( treedef(self.get(subtree.path)).keys() != treedef(subtree.value).keys() ): return False return True def __len__(self): return len(self.tree) def __repr__(self): if self.tree is None: return "Spec()" return f"Spec({self.tree})"
register_dispatch_fn(lambda t: treedef(t.tree) if isinstance(t, Spec) else None) if __name__ == "__main__": import doctest doctest.testmod()