followthemoney.types.common
from inspect import cleandoc
from itertools import product
from babel.core import Locale
from banal import ensure_list
from normality import stringify
from typing import Any, Dict, Optional, Sequence, Callable, TYPE_CHECKING, TypedDict

from followthemoney.rdf import Literal, Identifier
from followthemoney.util import get_locale
from followthemoney.util import gettext, sanitize_text

if TYPE_CHECKING:
    from followthemoney.proxy import EntityProxy

# Mapping of enum codes to their human-readable labels.
EnumValues = Dict[str, str]


class PropertyTypeToDict(TypedDict, total=False):
    label: str
    plural: str
    description: Optional[str]
    maxLength: int
    group: Optional[str]
    matchable: Optional[bool]
    pivot: Optional[bool]
    values: Optional[EnumValues]


class PropertyType:
    """Base class for all property types."""

    # NOTE: the class docstring above is read at runtime by ``docs`` and is
    # emitted by ``to_dict``; changing it changes the serialised description.

    name: str = "any"
    """A machine-facing, variable safe name for the given type."""

    group: Optional[str] = None
    """Groups are used to invert all the properties of an entity that have a
    given type into a single list before indexing them. This way, in Aleph,
    you can query for ``countries:gb`` instead of having to make a set of filters
    like ``properties.jurisdiction:gb OR properties.country:gb OR ...``."""

    label: str = "Any"
    """A name for this type to be shown to users."""

    plural: str = "Any"
    """A plural name for this type which can be used in appropriate places in
    a user interface."""

    matchable: bool = True
    """Matchable types allow properties to be compared with each other in order to
    assess entity similarity. While it makes sense to compare names, countries or
    phone numbers, the same isn't true for raw JSON blobs or descriptive text
    snippets."""

    pivot: bool = False
    """Pivot property types are like a stronger form of :attr:`~matchable` types:
    they will be used when value-based lookups are used to find commonalities
    between entities. For example, pivot typed-properties are used to show all the
    other entities that mention the same phone number, email address or name as the
    one currently seen by the user."""

    max_length: int = 250
    """The maximum length of a single value of this type. This is used to warn when
    adding individual values that may be malformed or too long to be stored in
    downstream databases with fixed column lengths. The unit is unicode codepoints
    (not bytes), the output of Python len()."""

    total_size: Optional[int] = None
    """Some types have overall size limitations in place in order to avoid generating
    entities that are very large (upstream ElasticSearch has a 100MB document limit).
    Once the total size of all properties of this type has exceeded the given limit,
    an entity will refuse to add further values."""

    @property
    def docs(self) -> Optional[str]:
        """The class docstring with indentation normalised, or ``None`` if the
        class has no docstring."""
        if not self.__doc__:
            return None

        return cleandoc(self.__doc__)

    def validate(
        self, value: str, fuzzy: bool = False, format: Optional[str] = None
    ) -> bool:
        """Returns a boolean to indicate if the given value is a valid instance of
        the type."""
        cleaned = self.clean(value, fuzzy=fuzzy, format=format)
        return cleaned is not None

    def clean(
        self,
        raw: Any,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """Create a clean version of a value of the type, suitable for storage
        in an entity proxy."""
        text = sanitize_text(raw)
        if text is None:
            return None
        return self.clean_text(text, fuzzy=fuzzy, format=format, proxy=proxy)

    def clean_text(
        self,
        text: str,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """Specific types can apply their own cleaning routines here (this is called
        by ``clean`` after the value has been converted to a string and null values
        have been filtered)."""
        return text

    def join(self, values: Sequence[str]) -> str:
        """Helper function for converting multi-valued FtM data into formats that
        allow only a single value per field (e.g. CSV). This is not fully reversible
        and should be used as a last option."""
        values = ensure_list(values)
        return "; ".join(values)

    def _specificity(self, value: str) -> float:
        """Type-specific scoring hook used by ``specificity``; the base
        implementation rates every value as ``1.0``."""
        return 1.0

    def specificity(self, value: Optional[str]) -> float:
        """Return a score for how specific the given value is. This can be used as a
        weighting factor in entity comparisons in order to rate matching property
        values by how specific they are. For example: a longer address is considered
        to be more specific than a short one, a full date more specific than just a
        year number, etc."""
        if not self.matchable or value is None:
            return 0.0
        return self._specificity(value)

    def compare_safe(self, left: Optional[str], right: Optional[str]) -> float:
        """Compare, but support None values on either side of the comparison."""
        left = stringify(left)
        right = stringify(right)
        if left is None or right is None:
            return 0.0
        return self.compare(left, right)

    def compare(self, left: str, right: str) -> float:
        """Comparisons are a float between 0 and 1. They can assume
        that the given data is cleaned, but not normalised."""
        if left.lower() == right.lower():
            return 1.0 * self.specificity(left)
        return 0.0

    def compare_sets(
        self,
        left: Sequence[str],
        right: Sequence[str],
        func: Callable[[Sequence[float]], float] = max,
    ) -> float:
        """Compare two sets of values and reduce the pairwise scores to one.

        Every value in ``left`` is compared against every value in ``right``
        using ``compare``. The resulting scores are handed to ``func``, which
        defaults to ``max`` and therefore selects the highest-scored result.
        When there are no pairs to compare, ``0.0`` is returned without
        calling ``func``."""
        results = [
            self.compare(le, ri)
            for le, ri in product(ensure_list(left), ensure_list(right))
        ]
        if not results:
            return 0.0
        return func(results)

    def country_hint(self, value: str) -> Optional[str]:
        """Determine if the given value allows us to infer a country that it may
        be related to (e.g. using a country prefix on a phone number or IBAN)."""
        return None

    def rdf(self, value: str) -> Identifier:
        """Return an RDF term to represent the given value - either a string
        literal, or a URI reference."""
        return Literal(value)

    def pick(self, values: Sequence[str]) -> Optional[str]:
        """Pick the best value to show to the user."""
        raise NotImplementedError

    def node_id(self, value: str) -> Optional[str]:
        """Return an ID suitable to identify this entity as a typed node in a
        graph representation of some FtM data. It's usually the same as the
        RDF form."""
        return str(self.rdf(value))

    def node_id_safe(self, value: Optional[str]) -> Optional[str]:
        """Wrapper for node_id to handle None values."""
        if value is None:
            return None
        return self.node_id(value)

    def caption(self, value: str) -> Optional[str]:
        """Return a label for the given property value. This is often the same as the
        value, but for types like countries or languages, it would return the label,
        while other values like phone numbers can be formatted to be nicer to read."""
        return value

    def to_dict(self) -> PropertyTypeToDict:
        """Return a serialisable description of this data type."""
        data: PropertyTypeToDict = {
            "label": gettext(self.label),
            "plural": gettext(self.plural),
            "description": gettext(self.docs),
            "maxLength": self.max_length,
        }
        # Optional keys are only emitted when they carry a truthy value.
        if self.group:
            data["group"] = self.group
        if self.matchable:
            data["matchable"] = True
        if self.pivot:
            data["pivot"] = True
        return data

    def __eq__(self, other: Any) -> bool:
        # Types are identified purely by their machine-facing name.
        if not isinstance(other, PropertyType):
            return False
        return self.name == other.name

    def __hash__(self) -> int:
        # Kept consistent with __eq__: hash on the name only.
        return hash(self.name)

    def __str__(self) -> str:
        return self.name

    def __repr__(self) -> str:
        return f"<{self.name}>"


class EnumType(PropertyType):
    """Enumerated type properties are used for types which have a defined set
    of possible values, like languages and countries."""

    def __init__(self) -> None:
        # Per-locale cache of code -> label mappings, filled lazily by ``names``.
        self._names: Dict[Locale, EnumValues] = {}
        # The set of valid codes is taken from the labels of whichever locale
        # is active at construction time.
        self.codes = set(self.names)

    def _locale_names(self, locale: Locale) -> EnumValues:
        """Hook for subclasses: return the code -> label mapping for ``locale``.
        The base implementation knows no values."""
        return {}

    @property
    def names(self) -> EnumValues:
        """Return a mapping from property values to their labels in the current
        locale."""
        locale = get_locale()
        if locale not in self._names:
            self._names[locale] = self._locale_names(locale)
        return self._names[locale]

    def validate(
        self, value: str, fuzzy: bool = False, format: Optional[str] = None
    ) -> bool:
        """Make sure that the given code value is one of the supported set."""
        if value is None:
            return False
        return str(value).lower().strip() in self.codes

    def clean_text(
        self,
        code: str,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """All code values are cleaned to be lowercase, and leading and trailing
        whitespace is removed. Codes outside the supported set yield ``None``."""
        code = code.lower().strip()
        if code not in self.codes:
            return None
        return code

    def caption(self, value: str) -> str:
        """Given a code value, return the label that should be shown to a user."""
        return self.names.get(value, value)

    def to_dict(self) -> PropertyTypeToDict:
        """When serialising the model to JSON, include all values."""
        data = super().to_dict()
        data["values"] = self.names
        return data
class PropertyType:
    """Base class for all property types."""

    # NOTE: the class docstring above is read at runtime by ``docs`` and is
    # emitted by ``to_dict``; changing it changes the serialised description.

    name: str = "any"
    """A machine-facing, variable safe name for the given type."""

    group: Optional[str] = None
    """Groups are used to invert all the properties of an entity that have a
    given type into a single list before indexing them. This way, in Aleph,
    you can query for ``countries:gb`` instead of having to make a set of filters
    like ``properties.jurisdiction:gb OR properties.country:gb OR ...``."""

    label: str = "Any"
    """A name for this type to be shown to users."""

    plural: str = "Any"
    """A plural name for this type which can be used in appropriate places in
    a user interface."""

    matchable: bool = True
    """Matchable types allow properties to be compared with each other in order to
    assess entity similarity. While it makes sense to compare names, countries or
    phone numbers, the same isn't true for raw JSON blobs or descriptive text
    snippets."""

    pivot: bool = False
    """Pivot property types are like a stronger form of :attr:`~matchable` types:
    they will be used when value-based lookups are used to find commonalities
    between entities. For example, pivot typed-properties are used to show all the
    other entities that mention the same phone number, email address or name as the
    one currently seen by the user."""

    max_length: int = 250
    """The maximum length of a single value of this type. This is used to warn when
    adding individual values that may be malformed or too long to be stored in
    downstream databases with fixed column lengths. The unit is unicode codepoints
    (not bytes), the output of Python len()."""

    total_size: Optional[int] = None
    """Some types have overall size limitations in place in order to avoid generating
    entities that are very large (upstream ElasticSearch has a 100MB document limit).
    Once the total size of all properties of this type has exceeded the given limit,
    an entity will refuse to add further values."""

    @property
    def docs(self) -> Optional[str]:
        """The class docstring with indentation normalised, or ``None`` if the
        class has no docstring."""
        if not self.__doc__:
            return None

        return cleandoc(self.__doc__)

    def validate(
        self, value: str, fuzzy: bool = False, format: Optional[str] = None
    ) -> bool:
        """Returns a boolean to indicate if the given value is a valid instance of
        the type."""
        cleaned = self.clean(value, fuzzy=fuzzy, format=format)
        return cleaned is not None

    def clean(
        self,
        raw: Any,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """Create a clean version of a value of the type, suitable for storage
        in an entity proxy."""
        text = sanitize_text(raw)
        if text is None:
            return None
        return self.clean_text(text, fuzzy=fuzzy, format=format, proxy=proxy)

    def clean_text(
        self,
        text: str,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """Specific types can apply their own cleaning routines here (this is called
        by ``clean`` after the value has been converted to a string and null values
        have been filtered)."""
        return text

    def join(self, values: Sequence[str]) -> str:
        """Helper function for converting multi-valued FtM data into formats that
        allow only a single value per field (e.g. CSV). This is not fully reversible
        and should be used as a last option."""
        values = ensure_list(values)
        return "; ".join(values)

    def _specificity(self, value: str) -> float:
        """Type-specific scoring hook used by ``specificity``; the base
        implementation rates every value as ``1.0``."""
        return 1.0

    def specificity(self, value: Optional[str]) -> float:
        """Return a score for how specific the given value is. This can be used as a
        weighting factor in entity comparisons in order to rate matching property
        values by how specific they are. For example: a longer address is considered
        to be more specific than a short one, a full date more specific than just a
        year number, etc."""
        if not self.matchable or value is None:
            return 0.0
        return self._specificity(value)

    def compare_safe(self, left: Optional[str], right: Optional[str]) -> float:
        """Compare, but support None values on either side of the comparison."""
        left = stringify(left)
        right = stringify(right)
        if left is None or right is None:
            return 0.0
        return self.compare(left, right)

    def compare(self, left: str, right: str) -> float:
        """Comparisons are a float between 0 and 1. They can assume
        that the given data is cleaned, but not normalised."""
        if left.lower() == right.lower():
            return 1.0 * self.specificity(left)
        return 0.0

    def compare_sets(
        self,
        left: Sequence[str],
        right: Sequence[str],
        func: Callable[[Sequence[float]], float] = max,
    ) -> float:
        """Compare two sets of values and reduce the pairwise scores to one.

        Every value in ``left`` is compared against every value in ``right``
        using ``compare``. The resulting scores are handed to ``func``, which
        defaults to ``max`` and therefore selects the highest-scored result.
        When there are no pairs to compare, ``0.0`` is returned without
        calling ``func``."""
        results = [
            self.compare(le, ri)
            for le, ri in product(ensure_list(left), ensure_list(right))
        ]
        if not results:
            return 0.0
        return func(results)

    def country_hint(self, value: str) -> Optional[str]:
        """Determine if the given value allows us to infer a country that it may
        be related to (e.g. using a country prefix on a phone number or IBAN)."""
        return None

    def rdf(self, value: str) -> Identifier:
        """Return an RDF term to represent the given value - either a string
        literal, or a URI reference."""
        return Literal(value)

    def pick(self, values: Sequence[str]) -> Optional[str]:
        """Pick the best value to show to the user."""
        raise NotImplementedError

    def node_id(self, value: str) -> Optional[str]:
        """Return an ID suitable to identify this entity as a typed node in a
        graph representation of some FtM data. It's usually the same as the
        RDF form."""
        return str(self.rdf(value))

    def node_id_safe(self, value: Optional[str]) -> Optional[str]:
        """Wrapper for node_id to handle None values."""
        if value is None:
            return None
        return self.node_id(value)

    def caption(self, value: str) -> Optional[str]:
        """Return a label for the given property value. This is often the same as the
        value, but for types like countries or languages, it would return the label,
        while other values like phone numbers can be formatted to be nicer to read."""
        return value

    def to_dict(self) -> PropertyTypeToDict:
        """Return a serialisable description of this data type."""
        data: PropertyTypeToDict = {
            "label": gettext(self.label),
            "plural": gettext(self.plural),
            "description": gettext(self.docs),
            "maxLength": self.max_length,
        }
        # Optional keys are only emitted when they carry a truthy value.
        if self.group:
            data["group"] = self.group
        if self.matchable:
            data["matchable"] = True
        if self.pivot:
            data["pivot"] = True
        return data

    def __eq__(self, other: Any) -> bool:
        # Types are identified purely by their machine-facing name.
        if not isinstance(other, PropertyType):
            return False
        return self.name == other.name

    def __hash__(self) -> int:
        # Kept consistent with __eq__: hash on the name only.
        return hash(self.name)

    def __str__(self) -> str:
        return self.name

    def __repr__(self) -> str:
        return f"<{self.name}>"
Base class for all property types.
Groups are used to invert all the properties of an entity that have a given type into a single list before indexing them. This way, in Aleph, you can query for ``countries:gb`` instead of having to make a set of filters like ``properties.jurisdiction:gb OR properties.country:gb OR ...``.
A plural name for this type which can be used in appropriate places in a user interface.
Matchable types allow properties to be compared with each other in order to assess entity similarity. While it makes sense to compare names, countries or phone numbers, the same isn't true for raw JSON blobs or descriptive text snippets.
Pivot property types are like a stronger form of ``matchable`` types: they will be used when value-based lookups are used to find commonalities between entities. For example, pivot-typed properties are used to show all the other entities that mention the same phone number, email address or name as the one currently seen by the user.
The maximum length of a single value of this type. This is used to warn when adding individual values that may be malformed or too long to be stored in downstream databases with fixed column lengths. The unit is unicode codepoints (not bytes), the output of Python len().
Some types have overall size limitations in place in order to avoid generating entities that are very large (upstream ElasticSearch has a 100MB document limit). Once the total size of all properties of this type has exceeded the given limit, an entity will refuse to add further values.
def validate(
    self, value: str, fuzzy: bool = False, format: Optional[str] = None
) -> bool:
    """Check whether ``value`` is an acceptable instance of this type.

    The value is run through ``clean`` with the given ``fuzzy`` and
    ``format`` options; it counts as valid exactly when cleaning yields
    something other than ``None``."""
    return self.clean(value, fuzzy=fuzzy, format=format) is not None
Returns a boolean to indicate if the given value is a valid instance of the type.
def clean(
    self,
    raw: Any,
    fuzzy: bool = False,
    format: Optional[str] = None,
    proxy: Optional["EntityProxy"] = None,
) -> Optional[str]:
    """Produce the storable form of ``raw`` for this type.

    The raw value is first passed through ``sanitize_text``; if that
    yields ``None`` the result is ``None``. Otherwise the sanitised text
    is handed to ``clean_text`` together with the ``fuzzy``, ``format``
    and ``proxy`` options."""
    sanitized = sanitize_text(raw)
    if sanitized is not None:
        return self.clean_text(sanitized, fuzzy=fuzzy, format=format, proxy=proxy)
    return None
Create a clean version of a value of the type, suitable for storage in an entity proxy.
def clean_text(
    self,
    text: str,
    fuzzy: bool = False,
    format: Optional[str] = None,
    proxy: Optional["EntityProxy"] = None,
) -> Optional[str]:
    """Hook for type-specific cleaning of an already stringified value.

    ``clean`` calls this once the value has been converted to a string and
    null values have been filtered out. The base implementation performs
    no cleaning and returns ``text`` unchanged."""
    return text
Specific types can apply their own cleaning routines here (this is called by ``clean`` after the value has been converted to a string and null values have been filtered).
def join(self, values: Sequence[str]) -> str:
    """Flatten multi-valued FtM data into one ``"; "``-separated string.

    Intended for formats that allow only a single value per field
    (e.g. CSV). The conversion is not fully reversible, so treat it as a
    last resort."""
    return "; ".join(ensure_list(values))
Helper function for converting multi-valued FtM data into formats that allow only a single value per field (e.g. CSV). This is not fully reversible and should be used as a last option.
def specificity(self, value: Optional[str]) -> float:
    """Score how specific ``value`` is, for use as a comparison weight.

    Matching property values can be rated by how specific they are: a
    longer address is more specific than a short one, a full date more
    specific than a bare year, and so on. Non-matchable types and
    ``None`` values always score ``0.0``; everything else is delegated to
    ``_specificity``."""
    if self.matchable and value is not None:
        return self._specificity(value)
    return 0.0
Return a score for how specific the given value is. This can be used as a weighting factor in entity comparisons in order to rate matching property values by how specific they are. For example: a longer address is considered to be more specific than a short one, a full date more specific than just a year number, etc.
def compare_safe(self, left: Optional[str], right: Optional[str]) -> float:
    """Variant of ``compare`` that tolerates ``None`` on either side.

    Both operands are passed through ``stringify`` first; if either comes
    back as ``None`` the score is ``0.0``, otherwise the result of
    ``compare`` on the two strings is returned."""
    lhs = stringify(left)
    rhs = stringify(right)
    if lhs is not None and rhs is not None:
        return self.compare(lhs, rhs)
    return 0.0
Compare, but support None values on either side of the comparison.
def compare(self, left: str, right: str) -> float:
    """Score the similarity of two values as a float between 0 and 1.

    Inputs may be assumed to be cleaned, but not normalised. The base
    implementation treats values as matching when they are equal
    ignoring case, and weights a match by the specificity of ``left``."""
    if left.lower() != right.lower():
        return 0.0
    return 1.0 * self.specificity(left)
Comparisons are a float between 0 and 1. They can assume that the given data is cleaned, but not normalised.
def compare_sets(
    self,
    left: Sequence[str],
    right: Sequence[str],
    func: Callable[[Sequence[float]], float] = max,
) -> float:
    """Compare two sets of values and reduce the pairwise scores to one.

    Every value in ``left`` is compared against every value in ``right``
    using ``compare``. The resulting scores are handed to ``func``, which
    defaults to ``max`` and therefore selects the highest-scored result.
    When there are no pairs to compare, ``0.0`` is returned without
    calling ``func``."""
    scores = [
        self.compare(le, ri)
        for le, ri in product(ensure_list(left), ensure_list(right))
    ]
    # Guard the empty case explicitly: reducers such as max() raise on an
    # empty sequence.
    if not scores:
        return 0.0
    return func(scores)
Compare every pairing of values from two sets and combine the scores with ``func``; by default, the highest-scored result is selected.
def country_hint(self, value: str) -> Optional[str]:
    """Infer a country that ``value`` may be related to, if possible.

    Types can derive this from the value itself, e.g. from the country
    prefix of a phone number or IBAN. The base implementation offers no
    hint and always returns ``None``."""
    return None
Determine if the given value allows us to infer a country that it may be related to (e.g. using a country prefix on a phone number or IBAN).
def rdf(self, value: str) -> Identifier:
    """Represent ``value`` as an RDF term.

    The term is either a string literal or a URI reference; the base
    implementation always wraps the value in a ``Literal``."""
    term = Literal(value)
    return term
Return an RDF term to represent the given value - either a string literal, or a URI reference.
def pick(self, values: Sequence[str]) -> Optional[str]:
    """Choose the single best value out of ``values`` to show to the user.

    Not implemented on the base type: calling it raises
    ``NotImplementedError``."""
    raise NotImplementedError
Pick the best value to show to the user.
def node_id(self, value: str) -> Optional[str]:
    """Build an ID that identifies ``value`` as a typed node in a graph
    representation of some FtM data.

    This is the string form of the RDF term returned by ``rdf``."""
    term = self.rdf(value)
    return str(term)
Return an ID suitable to identify this entity as a typed node in a graph representation of some FtM data. It's usually the same as the RDF form.
def node_id_safe(self, value: Optional[str]) -> Optional[str]:
    """``None``-tolerant wrapper around ``node_id``.

    ``None`` is passed through untouched; any other value is forwarded to
    ``node_id``."""
    return None if value is None else self.node_id(value)
Wrapper for node_id to handle None values.
def to_dict(self) -> PropertyTypeToDict:
    """Describe this data type as a serialisable dictionary.

    ``label``, ``plural`` and ``description`` are passed through
    ``gettext``; ``maxLength`` is always present. ``group``, ``matchable``
    and ``pivot`` are only included when they are truthy."""
    label = gettext(self.label)
    plural = gettext(self.plural)
    description = gettext(self.docs)
    data: PropertyTypeToDict = {
        "label": label,
        "plural": plural,
        "description": description,
        "maxLength": self.max_length,
    }
    # Optional keys, added in a fixed order so the output layout is stable.
    if self.group:
        data["group"] = self.group
    if self.matchable:
        data["matchable"] = True
    if self.pivot:
        data["pivot"] = True
    return data
Return a serialisable description of this data type.
class EnumType(PropertyType):
    """Enumerated type properties are used for types which have a defined set
    of possible values, like languages and countries."""

    def __init__(self) -> None:
        # Per-locale cache of code -> label mappings, filled lazily by ``names``.
        self._names: Dict[Locale, EnumValues] = {}
        # The set of valid codes is taken from the labels of whichever locale
        # is active at construction time.
        self.codes = set(self.names)

    def _locale_names(self, locale: Locale) -> EnumValues:
        """Hook for subclasses: return the code -> label mapping for ``locale``.
        The base implementation knows no values."""
        return {}

    @property
    def names(self) -> EnumValues:
        """Return a mapping from property values to their labels in the current
        locale."""
        locale = get_locale()
        if locale not in self._names:
            self._names[locale] = self._locale_names(locale)
        return self._names[locale]

    def validate(
        self, value: str, fuzzy: bool = False, format: Optional[str] = None
    ) -> bool:
        """Make sure that the given code value is one of the supported set."""
        if value is None:
            return False
        return str(value).lower().strip() in self.codes

    def clean_text(
        self,
        code: str,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """All code values are cleaned to be lowercase, and leading and trailing
        whitespace is removed. Codes outside the supported set yield ``None``."""
        code = code.lower().strip()
        if code not in self.codes:
            return None
        return code

    def caption(self, value: str) -> str:
        """Given a code value, return the label that should be shown to a user."""
        return self.names.get(value, value)

    def to_dict(self) -> PropertyTypeToDict:
        """When serialising the model to JSON, include all values."""
        data = super().to_dict()
        data["values"] = self.names
        return data
Enumerated type properties are used for types which have a defined set of possible values, like languages and countries.
@property
def names(self) -> EnumValues:
    """Mapping from property values to their labels in the current locale.

    Labels are looked up via ``_locale_names`` the first time a locale is
    seen and served from the ``_names`` cache afterwards."""
    current = get_locale()
    try:
        return self._names[current]
    except KeyError:
        labels = self._names[current] = self._locale_names(current)
        return labels
Return a mapping from property values to their labels in the current locale.
def validate(
    self, value: str, fuzzy: bool = False, format: Optional[str] = None
) -> bool:
    """Check that the given code is a member of the supported set.

    The value is converted to a string, lower-cased and stripped of
    surrounding whitespace before the lookup in ``codes``. ``None`` is
    never valid."""
    if value is None:
        return False
    normalised = str(value).lower().strip()
    return normalised in self.codes
Make sure that the given code value is one of the supported set.
def clean_text(
    self,
    code: str,
    fuzzy: bool = False,
    format: Optional[str] = None,
    proxy: Optional["EntityProxy"] = None,
) -> Optional[str]:
    """Normalise a code value and check it against the supported set.

    The code is lower-cased and stripped of leading and trailing
    whitespace. The normalised code is returned if it is one of ``codes``;
    otherwise the result is ``None``."""
    normalised = code.lower().strip()
    return normalised if normalised in self.codes else None
All code values are cleaned to be lowercase, and leading and trailing whitespace is removed.