followthemoney.types.common
from inspect import cleandoc
from itertools import product
from babel.core import Locale
from banal import ensure_list
from normality import stringify
from typing import Any, Dict, Optional, Sequence, Callable, TYPE_CHECKING, TypedDict

from followthemoney.rdf import Literal, Identifier
from followthemoney.util import get_locale
from followthemoney.util import gettext, sanitize_text

if TYPE_CHECKING:
    from followthemoney.proxy import EntityProxy

EnumValues = Dict[str, str]


class PropertyTypeToDict(TypedDict, total=False):
    label: str
    plural: str
    description: Optional[str]
    group: Optional[str]
    matchable: Optional[bool]
    pivot: Optional[bool]
    values: Optional[EnumValues]


class PropertyType(object):
    """Base class for all property types."""

    name: str = "any"
    """A machine-facing, variable safe name for the given type."""

    group: Optional[str] = None
    """Groups are used to invert all the properties of an entity that have a
    given type into a single list before indexing them. This way, in Aleph,
    you can query for ``countries:gb`` instead of having to make a set of filters
    like ``properties.jurisdiction:gb OR properties.country:gb OR ...``."""

    label: str = "Any"
    """A name for this type to be shown to users."""

    plural: str = "Any"
    """A plural name for this type which can be used in appropriate places in
    a user interface."""

    matchable: bool = True
    """Matchable types allow properties to be compared with each other in order to
    assess entity similarity. While it makes sense to compare names, countries or
    phone numbers, the same isn't true for raw JSON blobs or descriptive text
    snippets."""

    pivot: bool = False
    """Pivot property types are like a stronger form of :attr:`~matchable` types:
    they will be used when value-based lookups are used to find commonalities
    between entities. For example, pivot typed-properties are used to show all the
    other entities that mention the same phone number, email address or name as the
    one currently seen by the user."""

    max_size: Optional[int] = None
    """Some types have overall size limitations in place in order to avoid generating
    entities that are very large (upstream ElasticSearch has a 100MB document limit).
    Once the total size of all properties of this type has exceeded the given limit,
    an entity will refuse to add further values."""

    @property
    def docs(self) -> Optional[str]:
        """The class docstring with its indentation cleaned up, or ``None`` when
        the class has no docstring."""
        if not self.__doc__:
            return None

        return cleandoc(self.__doc__)

    def validate(
        self, value: str, fuzzy: bool = False, format: Optional[str] = None
    ) -> bool:
        """Returns a boolean to indicate if the given value is a valid instance of
        the type."""
        cleaned = self.clean(value, fuzzy=fuzzy, format=format)
        return cleaned is not None

    def clean(
        self,
        raw: Any,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """Create a clean version of a value of the type, suitable for storage
        in an entity proxy."""
        text = sanitize_text(raw)
        if text is None:
            return None
        return self.clean_text(text, fuzzy=fuzzy, format=format, proxy=proxy)

    def clean_text(
        self,
        text: str,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """Specific types can apply their own cleaning routines here (this is called
        by ``clean`` after the value has been converted to a string and null values
        have been filtered)."""
        return text

    def join(self, values: Sequence[str]) -> str:
        """Helper function for converting multi-valued FtM data into formats that
        allow only a single value per field (e.g. CSV). This is not fully reversible
        and should be used as a last option."""
        values = ensure_list(values)
        return "; ".join(values)

    def _specificity(self, value: str) -> float:
        """Type-specific specificity score; the base implementation is a constant
        ``1.0``."""
        return 1.0

    def specificity(self, value: Optional[str]) -> float:
        """Return a score for how specific the given value is. This can be used as a
        weighting factor in entity comparisons in order to rate matching property
        values by how specific they are. For example: a longer address is considered
        to be more specific than a short one, a full date more specific than just a
        year number, etc."""
        if not self.matchable or value is None:
            return 0.0
        return self._specificity(value)

    def compare_safe(self, left: Optional[str], right: Optional[str]) -> float:
        """Compare, but support None values on either side of the comparison."""
        left = stringify(left)
        right = stringify(right)
        if left is None or right is None:
            return 0.0
        return self.compare(left, right)

    def compare(self, left: str, right: str) -> float:
        """Comparisons are a float between 0 and 1. They can assume
        that the given data is cleaned, but not normalised."""
        if left.lower() == right.lower():
            return 1.0 * self.specificity(left)
        return 0.0

    def compare_sets(
        self,
        left: Sequence[str],
        right: Sequence[str],
        func: Callable[[Sequence[float]], float] = max,
    ) -> float:
        """Compare every pairing of values from the two sets and aggregate the
        scores with ``func`` (by default the highest-scored result is selected).
        Returns ``0.0`` when there is nothing to compare."""
        results = []
        for left_value, right_value in product(ensure_list(left), ensure_list(right)):
            results.append(self.compare(left_value, right_value))
        if not results:
            return 0.0
        return func(results)

    def country_hint(self, value: str) -> Optional[str]:
        """Determine if the given value allows us to infer a country that it may
        be related to (e.g. using a country prefix on a phone number or IBAN)."""
        return None

    def rdf(self, value: str) -> Identifier:
        """Return an RDF term to represent the given value - either a string
        literal, or a URI reference."""
        return Literal(value)

    def pick(self, values: Sequence[str]) -> Optional[str]:
        """Pick the best value to show to the user.

        The base type has no selection strategy and always raises
        ``NotImplementedError``."""
        # ``NotImplemented`` is a constant, not an exception: raising it fails
        # with a ``TypeError`` instead of signalling a missing implementation.
        raise NotImplementedError

    def node_id(self, value: str) -> Optional[str]:
        """Return an ID suitable to identify this entity as a typed node in a
        graph representation of some FtM data. It's usually the same as the
        RDF form."""
        return str(self.rdf(value))

    def node_id_safe(self, value: Optional[str]) -> Optional[str]:
        """Wrapper for node_id to handle None values."""
        if value is None:
            return None
        return self.node_id(value)

    def caption(self, value: str) -> Optional[str]:
        """Return a label for the given property value. This is often the same as the
        value, but for types like countries or languages, it would return the label,
        while other values like phone numbers can be formatted to be nicer to read."""
        return value

    def to_dict(self) -> PropertyTypeToDict:
        """Return a serialisable description of this data type."""
        data: PropertyTypeToDict = {
            "label": gettext(self.label),
            "plural": gettext(self.plural),
            "description": gettext(self.docs),
        }
        if self.group:
            data["group"] = self.group
        if self.matchable:
            data["matchable"] = True
        if self.pivot:
            data["pivot"] = True
        return data

    def __eq__(self, other: Any) -> bool:
        # Two property types are equal when they share the same ``name``.
        if not isinstance(other, PropertyType):
            return False
        return self.name == other.name

    def __hash__(self) -> int:
        # Hash on ``name`` so that it stays consistent with ``__eq__``.
        return hash(self.name)

    def __str__(self) -> str:
        return self.name

    def __repr__(self) -> str:
        return f"<{self.name}>"


class EnumType(PropertyType):
    """Enumerated type properties are used for types which have a defined set
    of possible values, like languages and countries."""

    def __init__(self) -> None:
        # Per-locale cache of code -> label mappings, filled lazily by ``names``.
        self._names: Dict[Locale, EnumValues] = {}
        # Accepted codes are the keys of the mapping for the locale that is
        # current when the type is constructed.
        self.codes = set(self.names)

    def _locale_names(self, locale: Locale) -> EnumValues:
        """Return the code -> label mapping for ``locale``; empty in this base
        class."""
        return {}

    @property
    def names(self) -> EnumValues:
        """Return a mapping from property values to their labels in the current
        locale."""
        current = get_locale()
        if current in self._names:
            return self._names[current]
        labels = self._locale_names(current)
        self._names[current] = labels
        return labels

    def validate(
        self, value: str, fuzzy: bool = False, format: Optional[str] = None
    ) -> bool:
        """Make sure that the given code value is one of the supported set."""
        return value is not None and str(value).lower().strip() in self.codes

    def clean_text(
        self,
        code: str,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """All code values are cleaned to be lowercase with leading and trailing
        whitespace removed. Codes outside the supported set yield ``None``."""
        normalized = code.lower().strip()
        return normalized if normalized in self.codes else None

    def caption(self, value: str) -> str:
        """Given a code value, return the label that should be shown to a user."""
        return self.names.get(value, value)

    def to_dict(self) -> PropertyTypeToDict:
        """When serialising the model to JSON, include all values."""
        description = super().to_dict()
        description["values"] = self.names
        return description
class PropertyType(object):
    """Base class for all property types."""

    name: str = "any"
    """A machine-facing, variable safe name for the given type."""

    group: Optional[str] = None
    """Groups are used to invert all the properties of an entity that have a
    given type into a single list before indexing them. This way, in Aleph,
    you can query for ``countries:gb`` instead of having to make a set of filters
    like ``properties.jurisdiction:gb OR properties.country:gb OR ...``."""

    label: str = "Any"
    """A name for this type to be shown to users."""

    plural: str = "Any"
    """A plural name for this type which can be used in appropriate places in
    a user interface."""

    matchable: bool = True
    """Matchable types allow properties to be compared with each other in order to
    assess entity similarity. While it makes sense to compare names, countries or
    phone numbers, the same isn't true for raw JSON blobs or descriptive text
    snippets."""

    pivot: bool = False
    """Pivot property types are like a stronger form of :attr:`~matchable` types:
    they will be used when value-based lookups are used to find commonalities
    between entities. For example, pivot typed-properties are used to show all the
    other entities that mention the same phone number, email address or name as the
    one currently seen by the user."""

    max_size: Optional[int] = None
    """Some types have overall size limitations in place in order to avoid generating
    entities that are very large (upstream ElasticSearch has a 100MB document limit).
    Once the total size of all properties of this type has exceeded the given limit,
    an entity will refuse to add further values."""

    @property
    def docs(self) -> Optional[str]:
        """The class docstring with its indentation cleaned up, or ``None`` when
        the class has no docstring."""
        if not self.__doc__:
            return None

        return cleandoc(self.__doc__)

    def validate(
        self, value: str, fuzzy: bool = False, format: Optional[str] = None
    ) -> bool:
        """Returns a boolean to indicate if the given value is a valid instance of
        the type."""
        cleaned = self.clean(value, fuzzy=fuzzy, format=format)
        return cleaned is not None

    def clean(
        self,
        raw: Any,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """Create a clean version of a value of the type, suitable for storage
        in an entity proxy."""
        text = sanitize_text(raw)
        if text is None:
            return None
        return self.clean_text(text, fuzzy=fuzzy, format=format, proxy=proxy)

    def clean_text(
        self,
        text: str,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """Specific types can apply their own cleaning routines here (this is called
        by ``clean`` after the value has been converted to a string and null values
        have been filtered)."""
        return text

    def join(self, values: Sequence[str]) -> str:
        """Helper function for converting multi-valued FtM data into formats that
        allow only a single value per field (e.g. CSV). This is not fully reversible
        and should be used as a last option."""
        values = ensure_list(values)
        return "; ".join(values)

    def _specificity(self, value: str) -> float:
        """Type-specific specificity score; the base implementation is a constant
        ``1.0``."""
        return 1.0

    def specificity(self, value: Optional[str]) -> float:
        """Return a score for how specific the given value is. This can be used as a
        weighting factor in entity comparisons in order to rate matching property
        values by how specific they are. For example: a longer address is considered
        to be more specific than a short one, a full date more specific than just a
        year number, etc."""
        if not self.matchable or value is None:
            return 0.0
        return self._specificity(value)

    def compare_safe(self, left: Optional[str], right: Optional[str]) -> float:
        """Compare, but support None values on either side of the comparison."""
        left = stringify(left)
        right = stringify(right)
        if left is None or right is None:
            return 0.0
        return self.compare(left, right)

    def compare(self, left: str, right: str) -> float:
        """Comparisons are a float between 0 and 1. They can assume
        that the given data is cleaned, but not normalised."""
        if left.lower() == right.lower():
            return 1.0 * self.specificity(left)
        return 0.0

    def compare_sets(
        self,
        left: Sequence[str],
        right: Sequence[str],
        func: Callable[[Sequence[float]], float] = max,
    ) -> float:
        """Compare every pairing of values from the two sets and aggregate the
        scores with ``func`` (by default the highest-scored result is selected).
        Returns ``0.0`` when there is nothing to compare."""
        results = []
        for left_value, right_value in product(ensure_list(left), ensure_list(right)):
            results.append(self.compare(left_value, right_value))
        if not results:
            return 0.0
        return func(results)

    def country_hint(self, value: str) -> Optional[str]:
        """Determine if the given value allows us to infer a country that it may
        be related to (e.g. using a country prefix on a phone number or IBAN)."""
        return None

    def rdf(self, value: str) -> Identifier:
        """Return an RDF term to represent the given value - either a string
        literal, or a URI reference."""
        return Literal(value)

    def pick(self, values: Sequence[str]) -> Optional[str]:
        """Pick the best value to show to the user.

        The base type has no selection strategy and always raises
        ``NotImplementedError``."""
        # ``NotImplemented`` is a constant, not an exception: raising it fails
        # with a ``TypeError`` instead of signalling a missing implementation.
        raise NotImplementedError

    def node_id(self, value: str) -> Optional[str]:
        """Return an ID suitable to identify this entity as a typed node in a
        graph representation of some FtM data. It's usually the same as the
        RDF form."""
        return str(self.rdf(value))

    def node_id_safe(self, value: Optional[str]) -> Optional[str]:
        """Wrapper for node_id to handle None values."""
        if value is None:
            return None
        return self.node_id(value)

    def caption(self, value: str) -> Optional[str]:
        """Return a label for the given property value. This is often the same as the
        value, but for types like countries or languages, it would return the label,
        while other values like phone numbers can be formatted to be nicer to read."""
        return value

    def to_dict(self) -> PropertyTypeToDict:
        """Return a serialisable description of this data type."""
        data: PropertyTypeToDict = {
            "label": gettext(self.label),
            "plural": gettext(self.plural),
            "description": gettext(self.docs),
        }
        if self.group:
            data["group"] = self.group
        if self.matchable:
            data["matchable"] = True
        if self.pivot:
            data["pivot"] = True
        return data

    def __eq__(self, other: Any) -> bool:
        # Two property types are equal when they share the same ``name``.
        if not isinstance(other, PropertyType):
            return False
        return self.name == other.name

    def __hash__(self) -> int:
        # Hash on ``name`` so that it stays consistent with ``__eq__``.
        return hash(self.name)

    def __str__(self) -> str:
        return self.name

    def __repr__(self) -> str:
        return f"<{self.name}>"
Base class for all property types.
Groups are used to invert all the properties of an entity that have a given type into a single list before indexing them. This way, in Aleph, you can query for `countries:gb` instead of having to make a set of filters like `properties.jurisdiction:gb OR properties.country:gb OR ...`.
A plural name for this type which can be used in appropriate places in a user interface.
Matchable types allow properties to be compared with each other in order to assess entity similarity. While it makes sense to compare names, countries or phone numbers, the same isn't true for raw JSON blobs or descriptive text snippets.
Pivot property types are like a stronger form of `matchable` types:
they will be used when value-based lookups are used to find commonalities
between entities. For example, pivot typed-properties are used to show all the
other entities that mention the same phone number, email address or name as the
one currently seen by the user.
Some types have overall size limitations in place in order to avoid generating entities that are very large (upstream ElasticSearch has a 100MB document limit). Once the total size of all properties of this type has exceeded the given limit, an entity will refuse to add further values.
def validate(
    self, value: str, fuzzy: bool = False, format: Optional[str] = None
) -> bool:
    """Check whether ``value`` is a valid instance of this type.

    The value counts as valid when ``clean``, called with the same ``fuzzy``
    and ``format`` options, returns something other than ``None``."""
    return self.clean(value, fuzzy=fuzzy, format=format) is not None
Returns a boolean to indicate if the given value is a valid instance of the type.
def clean(
    self,
    raw: Any,
    fuzzy: bool = False,
    format: Optional[str] = None,
    proxy: Optional["EntityProxy"] = None,
) -> Optional[str]:
    """Produce a cleaned form of ``raw`` that can be stored in an entity proxy.

    The raw value is first passed through ``sanitize_text``. If that yields
    ``None`` the result is ``None``; otherwise the text is handed to
    ``clean_text`` together with the given options."""
    sanitized = sanitize_text(raw)
    if sanitized is not None:
        return self.clean_text(sanitized, fuzzy=fuzzy, format=format, proxy=proxy)
    return None
Create a clean version of a value of the type, suitable for storage in an entity proxy.
def clean_text(
    self,
    text: str,
    fuzzy: bool = False,
    format: Optional[str] = None,
    proxy: Optional["EntityProxy"] = None,
) -> Optional[str]:
    """Hook for type-specific cleaning; this base version returns ``text`` as is.

    ``clean`` calls this once the value has been converted to a string and
    null values have been filtered out. The ``fuzzy``, ``format`` and ``proxy``
    arguments are accepted but not used here."""
    return text
Specific types can apply their own cleaning routines here (this is called by `clean` after the value has been converted to a string and null values have been filtered).
def join(self, values: Sequence[str]) -> str:
    """Collapse multi-valued FtM data into one ``"; "``-separated string.

    Intended for formats that allow only a single value per field (e.g. CSV).
    The result is not fully reversible, so treat this as a last option."""
    return "; ".join(ensure_list(values))
Helper function for converting multi-valued FtM data into formats that allow only a single value per field (e.g. CSV). This is not fully reversible and should be used as a last option.
def specificity(self, value: Optional[str]) -> float:
    """Score how specific ``value`` is, for use as a weighting factor when
    matching property values are rated in entity comparisons.

    For example, a longer address is considered more specific than a short
    one, and a full date more specific than just a year number. The score is
    ``0.0`` for non-matchable types and for ``None`` values; otherwise it is
    whatever ``_specificity`` returns."""
    if self.matchable and value is not None:
        return self._specificity(value)
    return 0.0
Return a score for how specific the given value is. This can be used as a weighting factor in entity comparisons in order to rate matching property values by how specific they are. For example: a longer address is considered to be more specific than a short one, a full date more specific than just a year number, etc.
def compare_safe(self, left: Optional[str], right: Optional[str]) -> float:
    """Variant of ``compare`` that tolerates ``None`` on either side.

    Both inputs are passed through ``stringify``; if either result is
    ``None`` the score is ``0.0``, otherwise ``compare`` decides."""
    lhs = stringify(left)
    rhs = stringify(right)
    if lhs is not None and rhs is not None:
        return self.compare(lhs, rhs)
    return 0.0
Compare, but support None values on either side of the comparison.
def compare(self, left: str, right: str) -> float:
    """Compare two values and return a score between 0 and 1.

    The inputs can be assumed to be cleaned, but not normalised. Values that
    differ after lower-casing score ``0.0``; equal values score the
    specificity of ``left``."""
    if left.lower() != right.lower():
        return 0.0
    return 1.0 * self.specificity(left)
Comparisons are a float between 0 and 1. They can assume that the given data is cleaned, but not normalised.
def compare_sets(
    self,
    left: Sequence[str],
    right: Sequence[str],
    func: Callable[[Sequence[float]], float] = max,
) -> float:
    """Compare every pairing of values from ``left`` and ``right`` and
    aggregate the scores with ``func`` (``max`` by default, which selects the
    highest-scored result). Returns ``0.0`` when there are no pairs."""
    scores = [
        self.compare(lhs, rhs)
        for lhs, rhs in product(ensure_list(left), ensure_list(right))
    ]
    return func(scores) if scores else 0.0
Compare two sets of values pairwise and aggregate the scores with func (by default, the highest-scored result is selected).
def country_hint(self, value: str) -> Optional[str]:
    """Infer a country that ``value`` may be related to, e.g. from a country
    prefix on a phone number or IBAN.

    This base implementation infers nothing and always returns ``None``."""
    return None
Determine if the given value allows us to infer a country that it may be related to (e.g. using a country prefix on a phone number or IBAN).
def rdf(self, value: str) -> Identifier:
    """Return the RDF term that represents ``value`` - either a string
    literal, or a URI reference. This implementation wraps the value in a
    ``Literal``."""
    term = Literal(value)
    return term
Return an RDF term to represent the given value - either a string literal, or a URI reference.
def pick(self, values: Sequence[str]) -> Optional[str]:
    """Pick the best value to show to the user.

    This implementation has no selection strategy and always raises
    ``NotImplementedError``."""
    # ``NotImplemented`` is a constant, not an exception: raising it fails
    # with a ``TypeError`` instead of signalling a missing implementation.
    raise NotImplementedError
Pick the best value to show to the user.
def node_id(self, value: str) -> Optional[str]:
    """Return an ID suitable to identify ``value`` as a typed node in a graph
    representation of some FtM data.

    Here it is the string form of the RDF term returned by ``rdf``."""
    term = self.rdf(value)
    return str(term)
Return an ID suitable to identify this entity as a typed node in a graph representation of some FtM data. It's usually the same as the RDF form.
def node_id_safe(self, value: Optional[str]) -> Optional[str]:
    """``None``-tolerant wrapper around ``node_id``: a ``None`` value yields
    ``None``, anything else is passed on to ``node_id``."""
    return None if value is None else self.node_id(value)
Wrapper for node_id to handle None values.
def to_dict(self) -> PropertyTypeToDict:
    """Build a serialisable description of this data type.

    The translated label, plural and description are always present.
    ``group``, ``matchable`` and ``pivot`` are included only when they are
    truthy on the type."""
    label = gettext(self.label)
    plural = gettext(self.plural)
    description = gettext(self.docs)
    data: PropertyTypeToDict = {
        "label": label,
        "plural": plural,
        "description": description,
    }
    if self.group:
        data["group"] = self.group
    if self.matchable:
        data["matchable"] = True
    if self.pivot:
        data["pivot"] = True
    return data
Return a serialisable description of this data type.
class EnumType(PropertyType):
    """Enumerated type properties are used for types which have a defined set
    of possible values, like languages and countries."""

    def __init__(self) -> None:
        # Per-locale cache of code -> label mappings, filled lazily by ``names``.
        self._names: Dict[Locale, EnumValues] = {}
        # Accepted codes are the keys of the mapping for the locale that is
        # current when the type is constructed.
        self.codes = set(self.names)

    def _locale_names(self, locale: Locale) -> EnumValues:
        """Return the code -> label mapping for ``locale``; empty in this base
        class."""
        return {}

    @property
    def names(self) -> EnumValues:
        """Return a mapping from property values to their labels in the current
        locale."""
        current = get_locale()
        if current in self._names:
            return self._names[current]
        labels = self._locale_names(current)
        self._names[current] = labels
        return labels

    def validate(
        self, value: str, fuzzy: bool = False, format: Optional[str] = None
    ) -> bool:
        """Make sure that the given code value is one of the supported set."""
        return value is not None and str(value).lower().strip() in self.codes

    def clean_text(
        self,
        code: str,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """All code values are cleaned to be lowercase with leading and trailing
        whitespace removed. Codes outside the supported set yield ``None``."""
        normalized = code.lower().strip()
        return normalized if normalized in self.codes else None

    def caption(self, value: str) -> str:
        """Given a code value, return the label that should be shown to a user."""
        return self.names.get(value, value)

    def to_dict(self) -> PropertyTypeToDict:
        """When serialising the model to JSON, include all values."""
        description = super().to_dict()
        description["values"] = self.names
        return description
Enumerated type properties are used for types which have a defined set of possible values, like languages and countries.
@property
def names(self) -> EnumValues:
    """Mapping from property values to their labels in the current locale.

    The mapping for each locale is built once via ``_locale_names`` and then
    served from the ``_names`` cache."""
    current = get_locale()
    if current in self._names:
        return self._names[current]
    labels = self._locale_names(current)
    self._names[current] = labels
    return labels
Return a mapping from property values to their labels in the current locale.
def validate(
    self, value: str, fuzzy: bool = False, format: Optional[str] = None
) -> bool:
    """Check that the given code value is one of the supported set.

    ``None`` is never valid. Any other value is converted to a string,
    lower-cased and stripped before being looked up in ``codes``."""
    return value is not None and str(value).lower().strip() in self.codes
Make sure that the given code value is one of the supported set.
def clean_text(
    self,
    code: str,
    fuzzy: bool = False,
    format: Optional[str] = None,
    proxy: Optional["EntityProxy"] = None,
) -> Optional[str]:
    """Normalise a code value: lower-case it and strip surrounding whitespace.

    Returns the normalised code when it is one of ``codes``, otherwise
    ``None``."""
    normalized = code.lower().strip()
    return normalized if normalized in self.codes else None
All code values are cleaned to be lowercase and leading and trailing whitespace is removed.
def to_dict(self) -> PropertyTypeToDict:
    """Serialise like the parent type, then add all enumerated values with
    their labels under the ``values`` key."""
    description = super().to_dict()
    description["values"] = self.names
    return description
When serialising the model to JSON, include all values.