followthemoney.namespace

We like our abstractions like our offshore banks: leaky.

Entity ID namespaces are a security mechanism related to the Aleph search index.

Aleph allows the user (via mappings or the API) to create arbitrary entity IDs. Entity IDs that are controlled by the user and not the system are unusual. However, this makes it possible to generate bulk data outside Aleph, and then load entities into the system as a continuous :ref:`streams`.

The problem is that having user-controlled entity IDs increases the chance of conflict in the search index.

Namespacing works around this by making each entity ID consist of two parts: one controlled by the client, the other controlled by the system. The second part of the ID is called its signature::

entity_id.a40a29300ac6bb79dd2f911e77bbda7a3b502126

The signature is generated as hmac(entity_id, dataset_id). This guarantees that the combined ID is specific to a dataset, without needing an (expensive) index lookup of each ID first. It can also be generated on the client or the server without compromising isolation.

  1"""
  2*We like our abstractions like our offshore banks: leaky.*
  3
  4Entity ID namespaces are a security mechanism related to the Aleph search index.
  5
  6Aleph allows the user (via mappings or the API) to create arbitrary entity IDs.
  7Entity IDs that are controlled by the user and not the system are unusual.
  8However, this makes it possible to generate bulk data outside Aleph,
  9and then load entities into the system as a continuous :ref:`streams`.
 10
 11The problem is that having user controlled entity IDs increases the chance
 12of conflict in the search index.
 13
 14Namespacing works around this by making each entity ID consist of two parts:
 15one controlled by the client, the other controlled by the system. The second
 16part of the ID is called its `signature`::
 17
 18    entity_id.a40a29300ac6bb79dd2f911e77bbda7a3b502126
 19
 20The signature is generated as ``hmac(entity_id, dataset_id)``. This guarantees
 21that the combined ID is specific to a dataset, without needing an (expensive)
 22index look up of each ID first. It can also be generated on the client or
 23the server without compromising isolation.
 24"""
 25
 26import hmac
 27from typing import Any, Optional, Tuple, Union
 28
 29from followthemoney.types import registry
 30from followthemoney.proxy import E
 31from followthemoney.util import key_bytes, get_entity_id
 32
 33
 34class Namespace(object):
 35    """Namespaces are used to partition entity IDs into different units,
 36    which traditionally represent a dataset, collection or source.
 37
 38    See module docstring for details."""
 39
 40    SEP = "."
 41
 42    def __init__(self, name: Optional[str] = None) -> None:
 43        self.bname = key_bytes(name) if name else b""
 44        self.hmac = hmac.new(self.bname, digestmod="sha1")
 45
 46    @classmethod
 47    def parse(cls, entity_id: str) -> Tuple[Optional[str], Optional[str]]:
 48        """Split up an entity ID into the plain ID and the namespace
 49        signature. If either part is missing, return None instead."""
 50        clean_id = registry.entity.clean(entity_id)
 51        if clean_id is None:
 52            return (None, None)
 53        try:
 54            plain_id, checksum = clean_id.rsplit(cls.SEP, 1)
 55            return (plain_id, checksum)
 56        except ValueError:
 57            return (clean_id, None)
 58
 59    @classmethod
 60    def strip(cls, entity_id: str) -> Optional[str]:
 61        plain_id, _ = cls.parse(entity_id)
 62        return plain_id
 63
 64    def signature(self, entity_id: str) -> Optional[str]:
 65        """Generate a namespace-specific signature."""
 66        if not len(self.bname) or entity_id is None:
 67            return None
 68        digest = self.hmac.copy()
 69        digest.update(key_bytes(entity_id))
 70        return digest.hexdigest()
 71
 72    def sign(self, entity_id: str) -> Optional[str]:
 73        """Apply a namespace signature to an entity ID, removing any
 74        previous namespace marker."""
 75        parsed_id, _ = self.parse(entity_id)
 76        if not len(self.bname):
 77            return parsed_id
 78        if parsed_id is None:
 79            return None
 80        digest = self.signature(parsed_id)
 81        if digest is None:
 82            return None
 83        return self.SEP.join((parsed_id, digest))
 84
 85    def verify(self, entity_id: str) -> bool:
 86        """Check if the signature matches the current namespace."""
 87        parsed_id, digest = self.parse(entity_id)
 88        if digest is None or parsed_id is None:
 89            return False
 90        signature = self.signature(parsed_id)
 91        if signature is None:
 92            return False
 93        return hmac.compare_digest(digest, signature)
 94
 95    def apply(self, proxy: E, shallow: bool = False) -> E:
 96        """Rewrite an entity proxy so all IDs mentioned are limited to
 97        the namespace."""
 98        signed = proxy.clone()
 99        if proxy.id is not None:
100            signed.id = self.sign(proxy.id)
101        if not shallow:
102            for prop in proxy.iterprops():
103                if prop.type != registry.entity:
104                    continue
105                for value in signed.pop(prop):
106                    entity_id = get_entity_id(value)
107                    if entity_id is not None:
108                        signed.add(prop, self.sign(entity_id))
109        return signed
110
111    @classmethod
112    def make(cls, name: Union[str, "Namespace"]) -> "Namespace":
113        if isinstance(name, str):
114            return cls(name)
115        return name
116
117    def __eq__(self, other: Any) -> bool:
118        return bool(self.bname == other.bname)
119
120    def __repr__(self) -> str:
121        return "<Namespace(%r)>" % self.bname
class Namespace:
 35class Namespace(object):
 36    """Namespaces are used to partition entity IDs into different units,
 37    which traditionally represent a dataset, collection or source.
 38
 39    See module docstring for details."""
 40
 41    SEP = "."
 42
 43    def __init__(self, name: Optional[str] = None) -> None:
 44        self.bname = key_bytes(name) if name else b""
 45        self.hmac = hmac.new(self.bname, digestmod="sha1")
 46
 47    @classmethod
 48    def parse(cls, entity_id: str) -> Tuple[Optional[str], Optional[str]]:
 49        """Split up an entity ID into the plain ID and the namespace
 50        signature. If either part is missing, return None instead."""
 51        clean_id = registry.entity.clean(entity_id)
 52        if clean_id is None:
 53            return (None, None)
 54        try:
 55            plain_id, checksum = clean_id.rsplit(cls.SEP, 1)
 56            return (plain_id, checksum)
 57        except ValueError:
 58            return (clean_id, None)
 59
 60    @classmethod
 61    def strip(cls, entity_id: str) -> Optional[str]:
 62        plain_id, _ = cls.parse(entity_id)
 63        return plain_id
 64
 65    def signature(self, entity_id: str) -> Optional[str]:
 66        """Generate a namespace-specific signature."""
 67        if not len(self.bname) or entity_id is None:
 68            return None
 69        digest = self.hmac.copy()
 70        digest.update(key_bytes(entity_id))
 71        return digest.hexdigest()
 72
 73    def sign(self, entity_id: str) -> Optional[str]:
 74        """Apply a namespace signature to an entity ID, removing any
 75        previous namespace marker."""
 76        parsed_id, _ = self.parse(entity_id)
 77        if not len(self.bname):
 78            return parsed_id
 79        if parsed_id is None:
 80            return None
 81        digest = self.signature(parsed_id)
 82        if digest is None:
 83            return None
 84        return self.SEP.join((parsed_id, digest))
 85
 86    def verify(self, entity_id: str) -> bool:
 87        """Check if the signature matches the current namespace."""
 88        parsed_id, digest = self.parse(entity_id)
 89        if digest is None or parsed_id is None:
 90            return False
 91        signature = self.signature(parsed_id)
 92        if signature is None:
 93            return False
 94        return hmac.compare_digest(digest, signature)
 95
 96    def apply(self, proxy: E, shallow: bool = False) -> E:
 97        """Rewrite an entity proxy so all IDs mentioned are limited to
 98        the namespace."""
 99        signed = proxy.clone()
100        if proxy.id is not None:
101            signed.id = self.sign(proxy.id)
102        if not shallow:
103            for prop in proxy.iterprops():
104                if prop.type != registry.entity:
105                    continue
106                for value in signed.pop(prop):
107                    entity_id = get_entity_id(value)
108                    if entity_id is not None:
109                        signed.add(prop, self.sign(entity_id))
110        return signed
111
112    @classmethod
113    def make(cls, name: Union[str, "Namespace"]) -> "Namespace":
114        if isinstance(name, str):
115            return cls(name)
116        return name
117
118    def __eq__(self, other: Any) -> bool:
119        return bool(self.bname == other.bname)
120
121    def __repr__(self) -> str:
122        return "<Namespace(%r)>" % self.bname

Namespaces are used to partition entity IDs into different units, which traditionally represent a dataset, collection or source.

See module docstring for details.

Namespace(name: Optional[str] = None)
43    def __init__(self, name: Optional[str] = None) -> None:
44        self.bname = key_bytes(name) if name else b""
45        self.hmac = hmac.new(self.bname, digestmod="sha1")
SEP = '.'
bname
hmac
@classmethod
def parse(cls, entity_id: str) -> Tuple[Optional[str], Optional[str]]:
47    @classmethod
48    def parse(cls, entity_id: str) -> Tuple[Optional[str], Optional[str]]:
49        """Split up an entity ID into the plain ID and the namespace
50        signature. If either part is missing, return None instead."""
51        clean_id = registry.entity.clean(entity_id)
52        if clean_id is None:
53            return (None, None)
54        try:
55            plain_id, checksum = clean_id.rsplit(cls.SEP, 1)
56            return (plain_id, checksum)
57        except ValueError:
58            return (clean_id, None)

Split up an entity ID into the plain ID and the namespace signature. If either part is missing, return None instead.

@classmethod
def strip(cls, entity_id: str) -> Optional[str]:
60    @classmethod
61    def strip(cls, entity_id: str) -> Optional[str]:
62        plain_id, _ = cls.parse(entity_id)
63        return plain_id
def signature(self, entity_id: str) -> Optional[str]:
65    def signature(self, entity_id: str) -> Optional[str]:
66        """Generate a namespace-specific signature."""
67        if not len(self.bname) or entity_id is None:
68            return None
69        digest = self.hmac.copy()
70        digest.update(key_bytes(entity_id))
71        return digest.hexdigest()

Generate a namespace-specific signature.

def sign(self, entity_id: str) -> Optional[str]:
73    def sign(self, entity_id: str) -> Optional[str]:
74        """Apply a namespace signature to an entity ID, removing any
75        previous namespace marker."""
76        parsed_id, _ = self.parse(entity_id)
77        if not len(self.bname):
78            return parsed_id
79        if parsed_id is None:
80            return None
81        digest = self.signature(parsed_id)
82        if digest is None:
83            return None
84        return self.SEP.join((parsed_id, digest))

Apply a namespace signature to an entity ID, removing any previous namespace marker.

def verify(self, entity_id: str) -> bool:
86    def verify(self, entity_id: str) -> bool:
87        """Check if the signature matches the current namespace."""
88        parsed_id, digest = self.parse(entity_id)
89        if digest is None or parsed_id is None:
90            return False
91        signature = self.signature(parsed_id)
92        if signature is None:
93            return False
94        return hmac.compare_digest(digest, signature)

Check if the signature matches the current namespace.

def apply(self, proxy: ~E, shallow: bool = False) -> ~E:
 96    def apply(self, proxy: E, shallow: bool = False) -> E:
 97        """Rewrite an entity proxy so all IDs mentioned are limited to
 98        the namespace."""
 99        signed = proxy.clone()
100        if proxy.id is not None:
101            signed.id = self.sign(proxy.id)
102        if not shallow:
103            for prop in proxy.iterprops():
104                if prop.type != registry.entity:
105                    continue
106                for value in signed.pop(prop):
107                    entity_id = get_entity_id(value)
108                    if entity_id is not None:
109                        signed.add(prop, self.sign(entity_id))
110        return signed

Rewrite an entity proxy so all IDs mentioned are limited to the namespace.

@classmethod
def make( cls, name: Union[str, Namespace]) -> Namespace:
112    @classmethod
113    def make(cls, name: Union[str, "Namespace"]) -> "Namespace":
114        if isinstance(name, str):
115            return cls(name)
116        return name