followthemoney.namespace
We like our abstractions like our offshore banks: leaky.
Entity ID namespaces are a security mechanism related to the Aleph search index.
Aleph allows the user (via mappings or the API) to create arbitrary entity IDs.
Entity IDs that are controlled by the user and not the system are unusual.
However, this makes it possible to generate bulk data outside Aleph,
and then load entities into the system as a continuous :ref:`streams`.
The problem is that having user controlled entity IDs increases the chance of conflict in the search index.
Namespacing works around this by making each entity ID consist of two parts:
one controlled by the client, the other controlled by the system. The second
part of the ID is called its `signature`::
entity_id.a40a29300ac6bb79dd2f911e77bbda7a3b502126
The signature is generated as ``hmac(entity_id, dataset_id)``. This guarantees
that the combined ID is specific to a dataset, without needing an (expensive)
index look up of each ID first. It can also be generated on the client or
the server without compromising isolation.
"""
*We like our abstractions like our offshore banks: leaky.*

Entity ID namespaces are a security mechanism related to the Aleph search index.

Aleph allows the user (via mappings or the API) to create arbitrary entity IDs.
Entity IDs that are controlled by the user and not the system are unusual.
However, this makes it possible to generate bulk data outside Aleph,
and then load entities into the system as a continuous :ref:`streams`.

The problem is that having user controlled entity IDs increases the chance
of conflict in the search index.

Namespacing works around this by making each entity ID consist of two parts:
one controlled by the client, the other controlled by the system. The second
part of the ID is called its `signature`::

    entity_id.a40a29300ac6bb79dd2f911e77bbda7a3b502126

The signature is generated as ``hmac(entity_id, dataset_id)``. This guarantees
that the combined ID is specific to a dataset, without needing an (expensive)
index look up of each ID first. It can also be generated on the client or
the server without compromising isolation.
"""

import hmac
from typing import Any, Optional, Tuple, Union

from followthemoney.types import registry
from followthemoney.proxy import E
from followthemoney.util import key_bytes, get_entity_id


class Namespace(object):
    """Namespaces are used to partition entity IDs into different units,
    which traditionally represent a dataset, collection or source.

    See module docstring for details."""

    # Separator between the plain ID and its signature in a signed ID.
    SEP = "."

    def __init__(self, name: Optional[str] = None) -> None:
        """Create a namespace keyed by ``name``.

        A missing or empty ``name`` yields a namespace with an empty key;
        such a namespace produces no signatures (see ``signature``)."""
        self.bname = key_bytes(name) if name else b""
        # Keyed HMAC template; ``signature`` works on a copy of it so the
        # template itself is never updated.
        self.hmac = hmac.new(self.bname, digestmod="sha1")

    @classmethod
    def parse(cls, entity_id: str) -> Tuple[Optional[str], Optional[str]]:
        """Split up an entity ID into the plain ID and the namespace
        signature.

        Returns ``(None, None)`` when the ID does not survive
        ``registry.entity.clean``, and ``(clean_id, None)`` when the cleaned
        ID contains no separator."""
        clean_id = registry.entity.clean(entity_id)
        if clean_id is None:
            return (None, None)
        try:
            # Split on the LAST separator only: the plain ID may itself
            # contain the separator character.
            plain_id, checksum = clean_id.rsplit(cls.SEP, 1)
        except ValueError:
            # Unpacking failed: there is no separator in the ID.
            return (clean_id, None)
        return (plain_id, checksum)

    @classmethod
    def strip(cls, entity_id: str) -> Optional[str]:
        """Return only the plain-ID part of ``entity_id`` (see ``parse``)."""
        plain_id, _ = cls.parse(entity_id)
        return plain_id

    def signature(self, entity_id: str) -> Optional[str]:
        """Generate a namespace-specific signature.

        Returns the hex digest of the namespace-keyed HMAC over the ID, or
        ``None`` when the namespace key is empty or ``entity_id`` is None."""
        if not self.bname or entity_id is None:
            return None
        digest = self.hmac.copy()
        digest.update(key_bytes(entity_id))
        return digest.hexdigest()

    def sign(self, entity_id: str) -> Optional[str]:
        """Apply a namespace signature to an entity ID, removing any
        previous namespace marker.

        With an empty namespace key the plain ID is returned unsigned.
        Returns ``None`` when the ID cannot be parsed."""
        parsed_id, _ = self.parse(entity_id)
        if not self.bname:
            return parsed_id
        if parsed_id is None:
            return None
        digest = self.signature(parsed_id)
        if digest is None:
            return None
        return self.SEP.join((parsed_id, digest))

    def verify(self, entity_id: str) -> bool:
        """Check if the signature matches the current namespace.

        Returns ``False`` when the ID has no signature part, cannot be
        parsed, or this namespace produces no signature."""
        parsed_id, digest = self.parse(entity_id)
        if digest is None or parsed_id is None:
            return False
        signature = self.signature(parsed_id)
        if signature is None:
            return False
        # compare_digest() only accepts ASCII-only ``str``. The checksum part
        # comes from the supplied ID, so compare as bytes: a non-ASCII
        # checksum is then a plain mismatch instead of a TypeError.
        return hmac.compare_digest(digest.encode("utf-8"), signature.encode("utf-8"))

    def apply(self, proxy: E, shallow: bool = False) -> E:
        """Rewrite an entity proxy so all IDs mentioned are limited to
        the namespace.

        Works on a clone of ``proxy``. The clone's own ID is re-signed;
        unless ``shallow`` is set, so is every value of each entity-typed
        property. Values for which ``get_entity_id`` yields None are dropped."""
        signed = proxy.clone()
        if proxy.id is not None:
            signed.id = self.sign(proxy.id)
        if not shallow:
            for prop in proxy.iterprops():
                if prop.type != registry.entity:
                    continue
                # Take the values off the clone and add them back signed.
                for value in signed.pop(prop):
                    entity_id = get_entity_id(value)
                    if entity_id is not None:
                        signed.add(prop, self.sign(entity_id))
        return signed

    @classmethod
    def make(cls, name: Union[str, "Namespace"]) -> "Namespace":
        """Coerce ``name`` to a namespace: strings are wrapped in a new
        instance, anything else is returned unchanged."""
        if isinstance(name, str):
            return cls(name)
        return name

    def __eq__(self, other: Any) -> bool:
        """Two namespaces are equal when their keys are equal."""
        if not isinstance(other, Namespace):
            # Let Python fall back to its default handling instead of
            # failing with AttributeError on unrelated objects.
            return NotImplemented
        return bool(self.bname == other.bname)

    def __hash__(self) -> int:
        # Defining __eq__ alone would make instances unhashable; hash on the
        # same key that equality compares.
        return hash(self.bname)

    def __repr__(self) -> str:
        return "<Namespace(%r)>" % self.bname
class Namespace(object):
    """Namespaces are used to partition entity IDs into different units,
    which traditionally represent a dataset, collection or source.

    See module docstring for details."""

    # Separator between the plain ID and its signature in a signed ID.
    SEP = "."

    def __init__(self, name: Optional[str] = None) -> None:
        """Create a namespace keyed by ``name``.

        A missing or empty ``name`` yields a namespace with an empty key;
        such a namespace produces no signatures (see ``signature``)."""
        self.bname = key_bytes(name) if name else b""
        # Keyed HMAC template; ``signature`` works on a copy of it so the
        # template itself is never updated.
        self.hmac = hmac.new(self.bname, digestmod="sha1")

    @classmethod
    def parse(cls, entity_id: str) -> Tuple[Optional[str], Optional[str]]:
        """Split up an entity ID into the plain ID and the namespace
        signature.

        Returns ``(None, None)`` when the ID does not survive
        ``registry.entity.clean``, and ``(clean_id, None)`` when the cleaned
        ID contains no separator."""
        clean_id = registry.entity.clean(entity_id)
        if clean_id is None:
            return (None, None)
        try:
            # Split on the LAST separator only: the plain ID may itself
            # contain the separator character.
            plain_id, checksum = clean_id.rsplit(cls.SEP, 1)
        except ValueError:
            # Unpacking failed: there is no separator in the ID.
            return (clean_id, None)
        return (plain_id, checksum)

    @classmethod
    def strip(cls, entity_id: str) -> Optional[str]:
        """Return only the plain-ID part of ``entity_id`` (see ``parse``)."""
        plain_id, _ = cls.parse(entity_id)
        return plain_id

    def signature(self, entity_id: str) -> Optional[str]:
        """Generate a namespace-specific signature.

        Returns the hex digest of the namespace-keyed HMAC over the ID, or
        ``None`` when the namespace key is empty or ``entity_id`` is None."""
        if not self.bname or entity_id is None:
            return None
        digest = self.hmac.copy()
        digest.update(key_bytes(entity_id))
        return digest.hexdigest()

    def sign(self, entity_id: str) -> Optional[str]:
        """Apply a namespace signature to an entity ID, removing any
        previous namespace marker.

        With an empty namespace key the plain ID is returned unsigned.
        Returns ``None`` when the ID cannot be parsed."""
        parsed_id, _ = self.parse(entity_id)
        if not self.bname:
            return parsed_id
        if parsed_id is None:
            return None
        digest = self.signature(parsed_id)
        if digest is None:
            return None
        return self.SEP.join((parsed_id, digest))

    def verify(self, entity_id: str) -> bool:
        """Check if the signature matches the current namespace.

        Returns ``False`` when the ID has no signature part, cannot be
        parsed, or this namespace produces no signature."""
        parsed_id, digest = self.parse(entity_id)
        if digest is None or parsed_id is None:
            return False
        signature = self.signature(parsed_id)
        if signature is None:
            return False
        # compare_digest() only accepts ASCII-only ``str``. The checksum part
        # comes from the supplied ID, so compare as bytes: a non-ASCII
        # checksum is then a plain mismatch instead of a TypeError.
        return hmac.compare_digest(digest.encode("utf-8"), signature.encode("utf-8"))

    def apply(self, proxy: "E", shallow: bool = False) -> "E":
        """Rewrite an entity proxy so all IDs mentioned are limited to
        the namespace.

        Works on a clone of ``proxy``. The clone's own ID is re-signed;
        unless ``shallow`` is set, so is every value of each entity-typed
        property. Values for which ``get_entity_id`` yields None are dropped."""
        signed = proxy.clone()
        if proxy.id is not None:
            signed.id = self.sign(proxy.id)
        if not shallow:
            for prop in proxy.iterprops():
                if prop.type != registry.entity:
                    continue
                # Take the values off the clone and add them back signed.
                for value in signed.pop(prop):
                    entity_id = get_entity_id(value)
                    if entity_id is not None:
                        signed.add(prop, self.sign(entity_id))
        return signed

    @classmethod
    def make(cls, name: Union[str, "Namespace"]) -> "Namespace":
        """Coerce ``name`` to a namespace: strings are wrapped in a new
        instance, anything else is returned unchanged."""
        if isinstance(name, str):
            return cls(name)
        return name

    def __eq__(self, other: Any) -> bool:
        """Two namespaces are equal when their keys are equal."""
        if not isinstance(other, Namespace):
            # Let Python fall back to its default handling instead of
            # failing with AttributeError on unrelated objects.
            return NotImplemented
        return bool(self.bname == other.bname)

    def __hash__(self) -> int:
        # Defining __eq__ alone would make instances unhashable; hash on the
        # same key that equality compares.
        return hash(self.bname)

    def __repr__(self) -> str:
        return "<Namespace(%r)>" % self.bname
Namespaces are used to partition entity IDs into different units, which traditionally represent a dataset, collection or source.
See module docstring for details.
@classmethod
def parse(cls, entity_id: str) -> Tuple[Optional[str], Optional[str]]:
    """Break an entity ID into ``(plain ID, namespace signature)``.

    The ID is first passed through ``registry.entity.clean``. If that yields
    None the result is ``(None, None)``; if the cleaned ID has no separator
    the signature slot is None."""
    normalized = registry.entity.clean(entity_id)
    if normalized is None:
        return (None, None)
    try:
        # Only the last separator counts; anything before it is the plain ID.
        head, tail = normalized.rsplit(cls.SEP, 1)
    except ValueError:
        # Nothing to unpack into two parts: the ID carries no signature.
        return (normalized, None)
    else:
        return (head, tail)
Split up an entity ID into the plain ID and the namespace signature. If either part is missing, return None instead.
def signature(self, entity_id: str) -> Optional[str]:
    """Compute this namespace's signature for ``entity_id``.

    Returns the hex digest of the namespace-keyed HMAC over the ID, or None
    if the namespace key is empty or no ID was given."""
    if not self.bname or entity_id is None:
        return None
    # Work on a copy so the keyed template stored on the instance stays
    # untouched.
    mac = self.hmac.copy()
    mac.update(key_bytes(entity_id))
    return mac.hexdigest()
Generate a namespace-specific signature.
def sign(self, entity_id: str) -> Optional[str]:
    """Return ``entity_id`` re-signed for this namespace.

    Any signature already attached to the ID is discarded first. With an
    empty namespace key the bare plain ID is returned; an ID that cannot be
    parsed gives None."""
    plain_id, _ = self.parse(entity_id)
    if plain_id is None or not self.bname:
        # Either nothing usable was parsed (plain_id is None) or there is no
        # key to sign with: in both cases the plain ID is the answer.
        return plain_id
    checksum = self.signature(plain_id)
    if checksum is None:
        return None
    return self.SEP.join([plain_id, checksum])
Apply a namespace signature to an entity ID, removing any previous namespace marker.
def verify(self, entity_id: str) -> bool:
    """Check if the signature matches the current namespace.

    Returns ``False`` when the ID has no signature part, cannot be parsed,
    or this namespace produces no signature."""
    parsed_id, digest = self.parse(entity_id)
    if digest is None or parsed_id is None:
        return False
    signature = self.signature(parsed_id)
    if signature is None:
        return False
    # compare_digest() only accepts ASCII-only ``str``. The checksum part
    # comes from the supplied ID, so compare as bytes: a non-ASCII checksum
    # is then a plain mismatch instead of a TypeError.
    # NOTE(review): parse() may already reject such IDs during cleaning;
    # confirm against registry.entity.clean.
    return hmac.compare_digest(digest.encode("utf-8"), signature.encode("utf-8"))
Check if the signature matches the current namespace.
def apply(self, proxy: E, shallow: bool = False) -> E:
    """Produce a copy of ``proxy`` whose IDs are signed for this namespace.

    The copy's own ID is always re-signed when present. Unless ``shallow``
    is set, the values of every entity-typed property are re-signed too;
    values for which ``get_entity_id`` yields None are dropped."""
    result = proxy.clone()
    if proxy.id is not None:
        result.id = self.sign(proxy.id)
    if shallow:
        # Only the entity's own ID was requested.
        return result
    for prop in proxy.iterprops():
        if prop.type != registry.entity:
            continue
        # Take the current values off the copy and add them back signed.
        for raw in result.pop(prop):
            ref_id = get_entity_id(raw)
            if ref_id is not None:
                result.add(prop, self.sign(ref_id))
    return result
Rewrite an entity proxy so all IDs mentioned are limited to the namespace.