followthemoney.namespace
We like our abstractions like our offshore banks: leaky.
Entity ID namespaces are a security mechanism related to the Aleph search index.
Aleph allows the user (via mappings or the API) to create arbitrary entity IDs.
Entity IDs that are controlled by the user and not the system are unusual.
However, this makes it possible to generate bulk data outside Aleph,
and then load entities into the system as a continuous :ref:`streams`.
The problem is that having user controlled entity IDs increases the chance of conflict in the search index.
Namespacing works around this by making each entity ID consist of two parts:
one controlled by the client, the other controlled by the system. The second
part of the ID is called its `signature`::

    entity_id.a40a29300ac6bb79dd2f911e77bbda7a3b502126
The signature is generated as ``hmac(entity_id, dataset_id)``. This guarantees
that the combined ID is specific to a dataset, without needing an (expensive)
index look up of each ID first. It can also be generated on the client or
the server without compromising isolation.
"""
*We like our abstractions like our offshore banks: leaky.*

Entity ID namespaces are a security mechanism related to the Aleph search index.

Aleph allows the user (via mappings or the API) to create arbitrary entity IDs.
Entity IDs that are controlled by the user and not the system are unusual.
However, this makes it possible to generate bulk data outside Aleph,
and then load entities into the system as a continuous :ref:`streams`.

The problem is that having user controlled entity IDs increases the chance
of conflict in the search index.

Namespacing works around this by making each entity ID consist of two parts:
one controlled by the client, the other controlled by the system. The second
part of the ID is called its `signature`::

    entity_id.a40a29300ac6bb79dd2f911e77bbda7a3b502126

The signature is generated as ``hmac(entity_id, dataset_id)``. This guarantees
that the combined ID is specific to a dataset, without needing an (expensive)
index look up of each ID first. It can also be generated on the client or
the server without compromising isolation.
"""
import hmac
from typing import Any, Optional, Tuple, Union

from followthemoney.types import registry
from followthemoney.proxy import E
from followthemoney.util import key_bytes, get_entity_id


class Namespace(object):
    """Namespaces are used to partition entity IDs into different units,
    which traditionally represent a dataset, collection or source.

    A signed ID has the form ``<plain_id><SEP><signature>``, where the
    signature is the hex digest of an HMAC-SHA1 keyed with the namespace
    name and computed over the plain ID. A namespace created without a name
    produces no signatures; ``sign`` then only strips an existing one.

    See module docstring for details."""

    # Separator placed between the plain entity ID and its signature.
    SEP = "."

    def __init__(self, name: Optional[str] = None) -> None:
        """Create a namespace keyed by ``name``. A falsy name yields an
        empty key, which disables signing."""
        self.bname = key_bytes(name) if name else b""
        # Keyed once here; ``signature`` copies this object for each ID
        # instead of re-keying a new HMAC every time.
        self.hmac = hmac.new(self.bname, digestmod="sha1")

    @classmethod
    def parse(cls, entity_id: str) -> Tuple[Optional[str], Optional[str]]:
        """Split up an entity ID into the plain ID and the namespace
        signature.

        Returns ``(None, None)`` when ``registry.entity.clean`` yields no
        ID, ``(clean_id, None)`` when the ID contains no separator, and
        ``(plain_id, signature)`` otherwise."""
        clean_id = registry.entity.clean(entity_id)
        if clean_id is None:
            return (None, None)
        try:
            # The split happens at the LAST separator, so for an unsigned
            # ID that itself contains SEP the final segment is reported as
            # the signature.
            plain_id, checksum = clean_id.rsplit(cls.SEP, 1)
            return (plain_id, checksum)
        except ValueError:
            # No separator present: unpacking a one-element list fails.
            return (clean_id, None)

    @classmethod
    def strip(cls, entity_id: str) -> Optional[str]:
        """Return only the plain-ID part of ``entity_id`` as determined by
        ``parse`` (``None`` if the ID could not be cleaned)."""
        plain_id, _ = cls.parse(entity_id)
        return plain_id

    def signature(self, entity_id: str) -> Optional[str]:
        """Generate a namespace-specific signature.

        Returns the hex digest of the namespace HMAC over ``entity_id``, or
        ``None`` when the namespace has no name or ``entity_id`` is
        ``None``."""
        if not self.bname or entity_id is None:
            return None
        digest = self.hmac.copy()
        digest.update(key_bytes(entity_id))
        return digest.hexdigest()

    def sign(self, entity_id: str) -> Optional[str]:
        """Apply a namespace signature to an entity ID, removing any
        previous namespace marker.

        For a namespace without a name, the parsed plain ID is returned
        unsigned. Returns ``None`` when the ID cannot be parsed."""
        parsed_id, _ = self.parse(entity_id)
        if not self.bname:
            return parsed_id
        if parsed_id is None:
            return None
        digest = self.signature(parsed_id)
        if digest is None:
            return None
        return self.SEP.join((parsed_id, digest))

    def verify(self, entity_id: str) -> bool:
        """Check if the signature matches the current namespace.

        Returns ``False`` when the ID has no signature part, cannot be
        parsed, or the namespace has no name."""
        parsed_id, digest = self.parse(entity_id)
        if digest is None or parsed_id is None:
            return False
        signature = self.signature(parsed_id)
        if signature is None:
            return False
        # ``hmac.compare_digest`` only accepts ``str`` arguments that are
        # pure ASCII and raises TypeError otherwise. The digest part comes
        # from the supplied ID, so compare as bytes to make a non-ASCII
        # value a plain mismatch instead of an exception.
        return hmac.compare_digest(
            digest.encode("utf-8"), signature.encode("utf-8")
        )

    def apply(self, proxy: E, shallow: bool = False) -> E:
        """Rewrite an entity proxy so all IDs mentioned are limited to
        the namespace.

        Works on a clone of ``proxy``. With ``shallow=True`` only the
        entity's own ID is signed; otherwise the values of every
        entity-typed property are re-signed as well."""
        signed = proxy.clone()
        signed.id = self.sign(proxy.id)
        if not shallow:
            for prop in proxy.iterprops():
                if prop.type != registry.entity:
                    continue
                # Take the current values off the clone and add them back
                # signed; values without a resolvable entity ID are not
                # re-added.
                for value in signed.pop(prop):
                    entity_id = get_entity_id(value)
                    if entity_id is not None:
                        signed.add(prop, self.sign(entity_id))
        return signed

    @classmethod
    def make(cls, name: Union[str, "Namespace"]) -> "Namespace":
        """Coerce ``name`` into a namespace: a string is wrapped in a new
        instance, any other value is returned unchanged."""
        if isinstance(name, str):
            return cls(name)
        return name

    def __eq__(self, other: Any) -> bool:
        """Two namespaces are equal when their key bytes are equal."""
        if not isinstance(other, Namespace):
            # Let Python fall back to its default handling instead of
            # failing on a missing ``bname`` attribute.
            return NotImplemented
        return bool(self.bname == other.bname)

    def __hash__(self) -> int:
        # Defining __eq__ alone would make instances unhashable; hash on
        # the same field that equality compares.
        return hash(self.bname)

    def __repr__(self) -> str:
        return "<Namespace(%r)>" % self.bname
class Namespace(object):
    """Namespaces are used to partition entity IDs into different units,
    which traditionally represent a dataset, collection or source.

    A signed ID has the form ``<plain_id><SEP><signature>``, where the
    signature is the hex digest of an HMAC-SHA1 keyed with the namespace
    name and computed over the plain ID. A namespace created without a name
    produces no signatures; ``sign`` then only strips an existing one.

    See module docstring for details."""

    # Separator placed between the plain entity ID and its signature.
    SEP = "."

    def __init__(self, name: Optional[str] = None) -> None:
        """Create a namespace keyed by ``name``. A falsy name yields an
        empty key, which disables signing."""
        self.bname = key_bytes(name) if name else b""
        # Keyed once here; ``signature`` copies this object for each ID
        # instead of re-keying a new HMAC every time.
        self.hmac = hmac.new(self.bname, digestmod="sha1")

    @classmethod
    def parse(cls, entity_id: str) -> Tuple[Optional[str], Optional[str]]:
        """Split up an entity ID into the plain ID and the namespace
        signature.

        Returns ``(None, None)`` when ``registry.entity.clean`` yields no
        ID, ``(clean_id, None)`` when the ID contains no separator, and
        ``(plain_id, signature)`` otherwise."""
        clean_id = registry.entity.clean(entity_id)
        if clean_id is None:
            return (None, None)
        try:
            # The split happens at the LAST separator, so for an unsigned
            # ID that itself contains SEP the final segment is reported as
            # the signature.
            plain_id, checksum = clean_id.rsplit(cls.SEP, 1)
            return (plain_id, checksum)
        except ValueError:
            # No separator present: unpacking a one-element list fails.
            return (clean_id, None)

    @classmethod
    def strip(cls, entity_id: str) -> Optional[str]:
        """Return only the plain-ID part of ``entity_id`` as determined by
        ``parse`` (``None`` if the ID could not be cleaned)."""
        plain_id, _ = cls.parse(entity_id)
        return plain_id

    def signature(self, entity_id: str) -> Optional[str]:
        """Generate a namespace-specific signature.

        Returns the hex digest of the namespace HMAC over ``entity_id``, or
        ``None`` when the namespace has no name or ``entity_id`` is
        ``None``."""
        if not self.bname or entity_id is None:
            return None
        digest = self.hmac.copy()
        digest.update(key_bytes(entity_id))
        return digest.hexdigest()

    def sign(self, entity_id: str) -> Optional[str]:
        """Apply a namespace signature to an entity ID, removing any
        previous namespace marker.

        For a namespace without a name, the parsed plain ID is returned
        unsigned. Returns ``None`` when the ID cannot be parsed."""
        parsed_id, _ = self.parse(entity_id)
        if not self.bname:
            return parsed_id
        if parsed_id is None:
            return None
        digest = self.signature(parsed_id)
        if digest is None:
            return None
        return self.SEP.join((parsed_id, digest))

    def verify(self, entity_id: str) -> bool:
        """Check if the signature matches the current namespace.

        Returns ``False`` when the ID has no signature part, cannot be
        parsed, or the namespace has no name."""
        parsed_id, digest = self.parse(entity_id)
        if digest is None or parsed_id is None:
            return False
        signature = self.signature(parsed_id)
        if signature is None:
            return False
        # ``hmac.compare_digest`` only accepts ``str`` arguments that are
        # pure ASCII and raises TypeError otherwise. The digest part comes
        # from the supplied ID, so compare as bytes to make a non-ASCII
        # value a plain mismatch instead of an exception.
        return hmac.compare_digest(
            digest.encode("utf-8"), signature.encode("utf-8")
        )

    def apply(self, proxy: E, shallow: bool = False) -> E:
        """Rewrite an entity proxy so all IDs mentioned are limited to
        the namespace.

        Works on a clone of ``proxy``. With ``shallow=True`` only the
        entity's own ID is signed; otherwise the values of every
        entity-typed property are re-signed as well."""
        signed = proxy.clone()
        signed.id = self.sign(proxy.id)
        if not shallow:
            for prop in proxy.iterprops():
                if prop.type != registry.entity:
                    continue
                # Take the current values off the clone and add them back
                # signed; values without a resolvable entity ID are not
                # re-added.
                for value in signed.pop(prop):
                    entity_id = get_entity_id(value)
                    if entity_id is not None:
                        signed.add(prop, self.sign(entity_id))
        return signed

    @classmethod
    def make(cls, name: Union[str, "Namespace"]) -> "Namespace":
        """Coerce ``name`` into a namespace: a string is wrapped in a new
        instance, any other value is returned unchanged."""
        if isinstance(name, str):
            return cls(name)
        return name

    def __eq__(self, other: Any) -> bool:
        """Two namespaces are equal when their key bytes are equal."""
        if not isinstance(other, Namespace):
            # Let Python fall back to its default handling instead of
            # failing on a missing ``bname`` attribute.
            return NotImplemented
        return bool(self.bname == other.bname)

    def __hash__(self) -> int:
        # Defining __eq__ alone would make instances unhashable; hash on
        # the same field that equality compares.
        return hash(self.bname)

    def __repr__(self) -> str:
        return "<Namespace(%r)>" % self.bname
Namespaces are used to partition entity IDs into different units, which traditionally represent a dataset, collection or source.
See module docstring for details.
@classmethod
def parse(cls, entity_id: str) -> Tuple[Optional[str], Optional[str]]:
    """Break an entity ID into ``(plain_id, signature)``.

    The ID is first passed through ``registry.entity.clean``. If that
    yields nothing, both parts are ``None``. If the cleaned ID holds no
    separator, it is returned whole with a ``None`` signature."""
    cleaned = registry.entity.clean(entity_id)
    if cleaned is None:
        return (None, None)
    try:
        # Split on the last separator only; unpacking raises ValueError
        # when the separator is absent.
        base, suffix = cleaned.rsplit(cls.SEP, 1)
    except ValueError:
        return (cleaned, None)
    return (base, suffix)
Split up an entity ID into the plain ID and the namespace signature. If either part is missing, return None instead.
def signature(self, entity_id: str) -> Optional[str]:
    """Compute this namespace's signature for ``entity_id``.

    Returns the hex digest of the namespace HMAC over the ID, or ``None``
    if the ID is ``None`` or the namespace key is empty."""
    if entity_id is None or not len(self.bname):
        return None
    # Work on a copy so the keyed HMAC held by the instance is never
    # advanced.
    mac = self.hmac.copy()
    mac.update(key_bytes(entity_id))
    return mac.hexdigest()
Generate a namespace-specific signature.
def sign(self, entity_id: str) -> Optional[str]:
    """Return ``entity_id`` re-signed for this namespace.

    Any signature already attached is dropped first. When the namespace
    key is empty, the bare ID is returned unsigned. ``None`` is returned
    if the ID cannot be parsed or no signature can be produced."""
    bare_id = self.parse(entity_id)[0]
    # Both guards return the bare ID: it is the intended result for an
    # empty key, and it is already None when parsing failed.
    if not len(self.bname) or bare_id is None:
        return bare_id
    suffix = self.signature(bare_id)
    if suffix is None:
        return None
    return self.SEP.join((bare_id, suffix))
Apply a namespace signature to an entity ID, removing any previous namespace marker.
def verify(self, entity_id: str) -> bool:
    """Check if the signature matches the current namespace.

    Returns ``False`` when the ID has no signature part, cannot be parsed,
    or no signature can be generated for it; otherwise the result of a
    constant-time comparison between the supplied and expected
    signatures."""
    parsed_id, digest = self.parse(entity_id)
    if digest is None or parsed_id is None:
        return False
    signature = self.signature(parsed_id)
    if signature is None:
        return False
    # ``hmac.compare_digest`` only accepts ``str`` arguments that are pure
    # ASCII and raises TypeError otherwise. The digest part comes from the
    # supplied ID, so compare as bytes to make a non-ASCII value a plain
    # mismatch instead of an exception.
    return hmac.compare_digest(
        digest.encode("utf-8"), signature.encode("utf-8")
    )
Check if the signature matches the current namespace.
def apply(self, proxy: E, shallow: bool = False) -> E:
    """Produce a copy of ``proxy`` whose IDs are signed for this namespace.

    The entity's own ID is always signed. Unless ``shallow`` is set, the
    values of every entity-typed property are replaced by their signed
    form as well. The work is done on a clone of ``proxy``."""
    result = proxy.clone()
    result.id = self.sign(proxy.id)
    if shallow:
        return result
    for prop in proxy.iterprops():
        if prop.type != registry.entity:
            continue
        # Take the existing values off the clone, then add back the signed
        # form of each one that resolves to an entity ID.
        for raw in result.pop(prop):
            ref_id = get_entity_id(raw)
            if ref_id is None:
                continue
            result.add(prop, self.sign(ref_id))
    return result
Rewrite an entity proxy so all IDs mentioned are limited to the namespace.