followthemoney.namespace

We like our abstractions like our offshore banks: leaky.

Entity ID namespaces are a security mechanism related to the Aleph search index.

Aleph allows the user (via mappings or the API) to create arbitrary entity IDs. It is unusual for entity IDs to be controlled by the user rather than by the system. However, this makes it possible to generate bulk data outside Aleph and then load the entities into the system as a continuous stream (see :ref:`streams`).

The problem is that user-controlled entity IDs increase the chance of ID conflicts in the search index.

Namespacing works around this by making each entity ID consist of two parts: one controlled by the client, the other controlled by the system. The second part of the ID is called its signature:

entity_id.a40a29300ac6bb79dd2f911e77bbda7a3b502126

The signature is generated as hmac(entity_id, dataset_id). This guarantees that the combined ID is specific to a dataset, without needing an (expensive) index lookup for each ID first. The signature can also be generated on either the client or the server without compromising isolation.

  1"""
  2*We like our abstractions like our offshore banks: leaky.*
  3
  4Entity ID namespaces are a security mechanism related to the Aleph search index.
  5
  6Aleph allows the user (via mappings or the API) to create arbitrary entity IDs.
  7Entity IDs that are controlled by the user and not the system are unusual.
  8However, this makes it possible to generate bulk data outside Aleph,
  9and then load entities into the system as a continuous :ref:`streams`.
 10
 11The problem is that having user controlled entity IDs increases the chance
 12of conflict in the search index.
 13
 14Namespacing works around this by making each entity ID consist of two parts:
 15one controlled by the client, the other controlled by the system. The second
 16part of the ID is called its `signature`::
 17
 18    entity_id.a40a29300ac6bb79dd2f911e77bbda7a3b502126
 19
 20The signature is generated as ``hmac(entity_id, dataset_id)``. This guarantees
 21that the combined ID is specific to a dataset, without needing an (expensive)
 22index look up of each ID first. It can also be generated on the client or
 23the server without compromising isolation.
 24"""
 25import hmac
 26from typing import Any, Optional, Tuple, Union
 27
 28from followthemoney.types import registry
 29from followthemoney.proxy import E
 30from followthemoney.util import key_bytes, get_entity_id
 31
 32
 33class Namespace(object):
 34    """Namespaces are used to partition entity IDs into different units,
 35    which traditionally represent a dataset, collection or source.
 36
 37    See module docstring for details."""
 38
 39    SEP = "."
 40
 41    def __init__(self, name: Optional[str] = None) -> None:
 42        self.bname = key_bytes(name) if name else b""
 43        self.hmac = hmac.new(self.bname, digestmod="sha1")
 44
 45    @classmethod
 46    def parse(cls, entity_id: str) -> Tuple[Optional[str], Optional[str]]:
 47        """Split up an entity ID into the plain ID and the namespace
 48        signature. If either part is missing, return None instead."""
 49        clean_id = registry.entity.clean(entity_id)
 50        if clean_id is None:
 51            return (None, None)
 52        try:
 53            plain_id, checksum = clean_id.rsplit(cls.SEP, 1)
 54            return (plain_id, checksum)
 55        except ValueError:
 56            return (clean_id, None)
 57
 58    @classmethod
 59    def strip(cls, entity_id: str) -> Optional[str]:
 60        plain_id, _ = cls.parse(entity_id)
 61        return plain_id
 62
 63    def signature(self, entity_id: str) -> Optional[str]:
 64        """Generate a namespace-specific signature."""
 65        if not len(self.bname) or entity_id is None:
 66            return None
 67        digest = self.hmac.copy()
 68        digest.update(key_bytes(entity_id))
 69        return digest.hexdigest()
 70
 71    def sign(self, entity_id: str) -> Optional[str]:
 72        """Apply a namespace signature to an entity ID, removing any
 73        previous namespace marker."""
 74        parsed_id, _ = self.parse(entity_id)
 75        if not len(self.bname):
 76            return parsed_id
 77        if parsed_id is None:
 78            return None
 79        digest = self.signature(parsed_id)
 80        if digest is None:
 81            return None
 82        return self.SEP.join((parsed_id, digest))
 83
 84    def verify(self, entity_id: str) -> bool:
 85        """Check if the signature matches the current namespace."""
 86        parsed_id, digest = self.parse(entity_id)
 87        if digest is None or parsed_id is None:
 88            return False
 89        signature = self.signature(parsed_id)
 90        if signature is None:
 91            return False
 92        return hmac.compare_digest(digest, signature)
 93
 94    def apply(self, proxy: E, shallow: bool = False) -> E:
 95        """Rewrite an entity proxy so all IDs mentioned are limited to
 96        the namespace."""
 97        signed = proxy.clone()
 98        signed.id = self.sign(proxy.id)
 99        if not shallow:
100            for prop in proxy.iterprops():
101                if prop.type != registry.entity:
102                    continue
103                for value in signed.pop(prop):
104                    entity_id = get_entity_id(value)
105                    if entity_id is not None:
106                        signed.add(prop, self.sign(entity_id))
107        return signed
108
109    @classmethod
110    def make(cls, name: Union[str, "Namespace"]) -> "Namespace":
111        if isinstance(name, str):
112            return cls(name)
113        return name
114
115    def __eq__(self, other: Any) -> bool:
116        return bool(self.bname == other.bname)
117
118    def __repr__(self) -> str:
119        return "<Namespace(%r)>" % self.bname
class Namespace:
 34class Namespace(object):
 35    """Namespaces are used to partition entity IDs into different units,
 36    which traditionally represent a dataset, collection or source.
 37
 38    See module docstring for details."""
 39
 40    SEP = "."
 41
 42    def __init__(self, name: Optional[str] = None) -> None:
 43        self.bname = key_bytes(name) if name else b""
 44        self.hmac = hmac.new(self.bname, digestmod="sha1")
 45
 46    @classmethod
 47    def parse(cls, entity_id: str) -> Tuple[Optional[str], Optional[str]]:
 48        """Split up an entity ID into the plain ID and the namespace
 49        signature. If either part is missing, return None instead."""
 50        clean_id = registry.entity.clean(entity_id)
 51        if clean_id is None:
 52            return (None, None)
 53        try:
 54            plain_id, checksum = clean_id.rsplit(cls.SEP, 1)
 55            return (plain_id, checksum)
 56        except ValueError:
 57            return (clean_id, None)
 58
 59    @classmethod
 60    def strip(cls, entity_id: str) -> Optional[str]:
 61        plain_id, _ = cls.parse(entity_id)
 62        return plain_id
 63
 64    def signature(self, entity_id: str) -> Optional[str]:
 65        """Generate a namespace-specific signature."""
 66        if not len(self.bname) or entity_id is None:
 67            return None
 68        digest = self.hmac.copy()
 69        digest.update(key_bytes(entity_id))
 70        return digest.hexdigest()
 71
 72    def sign(self, entity_id: str) -> Optional[str]:
 73        """Apply a namespace signature to an entity ID, removing any
 74        previous namespace marker."""
 75        parsed_id, _ = self.parse(entity_id)
 76        if not len(self.bname):
 77            return parsed_id
 78        if parsed_id is None:
 79            return None
 80        digest = self.signature(parsed_id)
 81        if digest is None:
 82            return None
 83        return self.SEP.join((parsed_id, digest))
 84
 85    def verify(self, entity_id: str) -> bool:
 86        """Check if the signature matches the current namespace."""
 87        parsed_id, digest = self.parse(entity_id)
 88        if digest is None or parsed_id is None:
 89            return False
 90        signature = self.signature(parsed_id)
 91        if signature is None:
 92            return False
 93        return hmac.compare_digest(digest, signature)
 94
 95    def apply(self, proxy: E, shallow: bool = False) -> E:
 96        """Rewrite an entity proxy so all IDs mentioned are limited to
 97        the namespace."""
 98        signed = proxy.clone()
 99        signed.id = self.sign(proxy.id)
100        if not shallow:
101            for prop in proxy.iterprops():
102                if prop.type != registry.entity:
103                    continue
104                for value in signed.pop(prop):
105                    entity_id = get_entity_id(value)
106                    if entity_id is not None:
107                        signed.add(prop, self.sign(entity_id))
108        return signed
109
110    @classmethod
111    def make(cls, name: Union[str, "Namespace"]) -> "Namespace":
112        if isinstance(name, str):
113            return cls(name)
114        return name
115
116    def __eq__(self, other: Any) -> bool:
117        return bool(self.bname == other.bname)
118
119    def __repr__(self) -> str:
120        return "<Namespace(%r)>" % self.bname

Namespaces are used to partition entity IDs into different units, which traditionally represent a dataset, collection or source.

See module docstring for details.

Namespace(name: Optional[str] = None)
42    def __init__(self, name: Optional[str] = None) -> None:
43        self.bname = key_bytes(name) if name else b""
44        self.hmac = hmac.new(self.bname, digestmod="sha1")
SEP = '.'
bname
hmac
@classmethod
def parse(cls, entity_id: str) -> Tuple[Optional[str], Optional[str]]:
46    @classmethod
47    def parse(cls, entity_id: str) -> Tuple[Optional[str], Optional[str]]:
48        """Split up an entity ID into the plain ID and the namespace
49        signature. If either part is missing, return None instead."""
50        clean_id = registry.entity.clean(entity_id)
51        if clean_id is None:
52            return (None, None)
53        try:
54            plain_id, checksum = clean_id.rsplit(cls.SEP, 1)
55            return (plain_id, checksum)
56        except ValueError:
57            return (clean_id, None)

Split up an entity ID into the plain ID and the namespace signature. If either part is missing, return None instead.

@classmethod
def strip(cls, entity_id: str) -> Optional[str]:
59    @classmethod
60    def strip(cls, entity_id: str) -> Optional[str]:
61        plain_id, _ = cls.parse(entity_id)
62        return plain_id
def signature(self, entity_id: str) -> Optional[str]:
64    def signature(self, entity_id: str) -> Optional[str]:
65        """Generate a namespace-specific signature."""
66        if not len(self.bname) or entity_id is None:
67            return None
68        digest = self.hmac.copy()
69        digest.update(key_bytes(entity_id))
70        return digest.hexdigest()

Generate a namespace-specific signature.

def sign(self, entity_id: str) -> Optional[str]:
72    def sign(self, entity_id: str) -> Optional[str]:
73        """Apply a namespace signature to an entity ID, removing any
74        previous namespace marker."""
75        parsed_id, _ = self.parse(entity_id)
76        if not len(self.bname):
77            return parsed_id
78        if parsed_id is None:
79            return None
80        digest = self.signature(parsed_id)
81        if digest is None:
82            return None
83        return self.SEP.join((parsed_id, digest))

Apply a namespace signature to an entity ID, removing any previous namespace marker.

def verify(self, entity_id: str) -> bool:
85    def verify(self, entity_id: str) -> bool:
86        """Check if the signature matches the current namespace."""
87        parsed_id, digest = self.parse(entity_id)
88        if digest is None or parsed_id is None:
89            return False
90        signature = self.signature(parsed_id)
91        if signature is None:
92            return False
93        return hmac.compare_digest(digest, signature)

Check if the signature matches the current namespace.

def apply(self, proxy: ~E, shallow: bool = False) -> ~E:
 95    def apply(self, proxy: E, shallow: bool = False) -> E:
 96        """Rewrite an entity proxy so all IDs mentioned are limited to
 97        the namespace."""
 98        signed = proxy.clone()
 99        signed.id = self.sign(proxy.id)
100        if not shallow:
101            for prop in proxy.iterprops():
102                if prop.type != registry.entity:
103                    continue
104                for value in signed.pop(prop):
105                    entity_id = get_entity_id(value)
106                    if entity_id is not None:
107                        signed.add(prop, self.sign(entity_id))
108        return signed

Rewrite an entity proxy so all IDs mentioned are limited to the namespace.

@classmethod
def make( cls, name: Union[str, Namespace]) -> Namespace:
110    @classmethod
111    def make(cls, name: Union[str, "Namespace"]) -> "Namespace":
112        if isinstance(name, str):
113            return cls(name)
114        return name