followthemoney.types.common

View Source

  1from inspect import cleandoc
  2from itertools import product
  3from babel.core import Locale
  4from banal import ensure_list
  5from normality import stringify
  6from typing import Any, Dict, Optional, Sequence, Callable, TYPE_CHECKING, TypedDict
  7
  8from followthemoney.rdf import Literal, Identifier
  9from followthemoney.util import get_locale
 10from followthemoney.util import gettext, sanitize_text
 11
 12if TYPE_CHECKING:
 13    from followthemoney.proxy import EntityProxy
 14
 15EnumValues = Dict[str, str]
 16
 17
 18class PropertyTypeToDict(TypedDict, total=False):
 19    label: str
 20    plural: str
 21    description: Optional[str]
 22    maxLength: int
 23    group: Optional[str]
 24    matchable: Optional[bool]
 25    pivot: Optional[bool]
 26    values: Optional[EnumValues]
 27
 28
 29class PropertyType(object):
 30    """Base class for all property types."""
 31
 32    name: str = "any"
 33    """A machine-facing, variable safe name for the given type."""
 34
 35    group: Optional[str] = None
 36    """Groups are used to invert all the properties of an entity that have a
 37    given type into a single list before indexing them. This way, in Aleph,
 38    you can query for ``countries:gb`` instead of having to make a set of filters
 39    like ``properties.jurisdiction:gb OR properties.country:gb OR ...``."""
 40
 41    label: str = "Any"
 42    """A name for this type to be shown to users."""
 43
 44    plural: str = "Any"
 45    """A plural name for this type which can be used in appropriate places in
 46    a user interface."""
 47
 48    matchable: bool = True
 49    """Matchable types allow properties to be compared with each other in order to
 50    assess entity similarity. While it makes sense to compare names, countries or
 51    phone numbers, the same isn't true for raw JSON blobs or descriptive text
 52    snippets."""
 53
 54    pivot: bool = False
 55    """Pivot property types are like a stronger form of :attr:`~matchable` types:
 56    they will be used when value-based lookups are used to find commonalities
 57    between entities. For example, pivot-typed properties are used to show all the
 58    other entities that mention the same phone number, email address or name as the
 59    one currently seen by the user."""
 60
 61    max_length: int = 250
 62    """The maximum length of a single value of this type. This is used to warn when
 63    adding individual values that may be malformed or too long to be stored in
 64    downstream databases with fixed column lengths. The unit is unicode codepoints
 65    (not bytes), the output of Python len()."""
 66
 67    total_size: Optional[int] = None
 68    """Some types have overall size limitations in place in order to avoid generating
 69    entities that are very large (upstream ElasticSearch has a 100MB document limit).
 70    Once the total size of all properties of this type has exceeded the given limit,
 71    an entity will refuse to add further values."""
 72
 73    @property
 74    def docs(self) -> Optional[str]:
 75        if not self.__doc__:
 76            return None
 77
 78        return cleandoc(self.__doc__)
 79
 80    def validate(
 81        self, value: str, fuzzy: bool = False, format: Optional[str] = None
 82    ) -> bool:
 83        """Returns a boolean to indicate if the given value is a valid instance of
 84        the type."""
 85        cleaned = self.clean(value, fuzzy=fuzzy, format=format)
 86        return cleaned is not None
 87
 88    def clean(
 89        self,
 90        raw: Any,
 91        fuzzy: bool = False,
 92        format: Optional[str] = None,
 93        proxy: Optional["EntityProxy"] = None,
 94    ) -> Optional[str]:
 95        """Create a clean version of a value of the type, suitable for storage
 96        in an entity proxy."""
 97        text = sanitize_text(raw)
 98        if text is None:
 99            return None
100        return self.clean_text(text, fuzzy=fuzzy, format=format, proxy=proxy)
101
102    def clean_text(
103        self,
104        text: str,
105        fuzzy: bool = False,
106        format: Optional[str] = None,
107        proxy: Optional["EntityProxy"] = None,
108    ) -> Optional[str]:
109        """Specific types can apply their own cleaning routines here (this is called
110        by ``clean`` after the value has been converted to a string and null values
111        have been filtered)."""
112        return text
113
114    def join(self, values: Sequence[str]) -> str:
115        """Helper function for converting multi-valued FtM data into formats that
116        allow only a single value per field (e.g. CSV). This is not fully reversible
117        and should be used as a last option."""
118        values = ensure_list(values)
119        return "; ".join(values)
120
121    def _specificity(self, value: str) -> float:
122        return 1.0
123
124    def specificity(self, value: Optional[str]) -> float:
125        """Return a score for how specific the given value is. This can be used as a
126        weighting factor in entity comparisons in order to rate matching property
127        values by how specific they are. For example: a longer address is considered
128        to be more specific than a short one, a full date more specific than just a
129        year number, etc."""
130        if not self.matchable or value is None:
131            return 0.0
132        return self._specificity(value)
133
134    def compare_safe(self, left: Optional[str], right: Optional[str]) -> float:
135        """Compare, but support None values on either side of the comparison."""
136        left = stringify(left)
137        right = stringify(right)
138        if left is None or right is None:
139            return 0.0
140        return self.compare(left, right)
141
142    def compare(self, left: str, right: str) -> float:
143        """Comparisons are a float between 0 and 1. They can assume
144        that the given data is cleaned, but not normalised."""
145        if left.lower() == right.lower():
146            return 1.0 * self.specificity(left)
147        return 0.0
148
149    def compare_sets(
150        self,
151        left: Sequence[str],
152        right: Sequence[str],
153        func: Callable[[Sequence[float]], float] = max,
154    ) -> float:
155        """Compare two sets of values and select the highest-scored result."""
156        results = []
157        for le, ri in product(ensure_list(left), ensure_list(right)):
158            results.append(self.compare(le, ri))
159        if not len(results):
160            return 0.0
161        return func(results)
162
163    def country_hint(self, value: str) -> Optional[str]:
164        """Determine if the given value allows us to infer a country that it may
165        be related to (e.g. using a country prefix on a phone number or IBAN)."""
166        return None
167
168    def rdf(self, value: str) -> Identifier:
169        """Return an RDF term to represent the given value - either a string
170        literal, or a URI reference."""
171        return Literal(value)
172
173    def pick(self, values: Sequence[str]) -> Optional[str]:
174        """Pick the best value to show to the user."""
175        raise NotImplementedError
176
177    def node_id(self, value: str) -> Optional[str]:
178        """Return an ID suitable to identify this entity as a typed node in a
179        graph representation of some FtM data. It's usually the same as the
180        RDF form."""
181        return str(self.rdf(value))
182
183    def node_id_safe(self, value: Optional[str]) -> Optional[str]:
184        """Wrapper for node_id to handle None values."""
185        if value is None:
186            return None
187        return self.node_id(value)
188
189    def caption(self, value: str) -> Optional[str]:
190        """Return a label for the given property value. This is often the same as the
191        value, but for types like countries or languages, it would return the label,
192        while other values like phone numbers can be formatted to be nicer to read."""
193        return value
194
195    def to_dict(self) -> PropertyTypeToDict:
196        """Return a serialisable description of this data type."""
197        data: PropertyTypeToDict = {
198            "label": gettext(self.label),
199            "plural": gettext(self.plural),
200            "description": gettext(self.docs),
201            "maxLength": self.max_length,
202        }
203        if self.group:
204            data["group"] = self.group
205        if self.matchable:
206            data["matchable"] = True
207        if self.pivot:
208            data["pivot"] = True
209        return data
210
211    def __eq__(self, other: Any) -> bool:
212        if not isinstance(other, PropertyType):
213            return False
214        return self.name == other.name
215
216    def __hash__(self) -> int:
217        return hash(self.name)
218
219    def __str__(self) -> str:
220        return self.name
221
222    def __repr__(self) -> str:
223        return f"<{self.name}>"
224
225
226class EnumType(PropertyType):
227    """Enumerated type properties are used for types which have a defined set
228    of possible values, like languages and countries."""
229
230    def __init__(self) -> None:
231        self._names: Dict[Locale, EnumValues] = {}
232        self.codes = set(self.names.keys())
233
234    def _locale_names(self, locale: Locale) -> EnumValues:
235        return {}
236
237    @property
238    def names(self) -> EnumValues:
239        """Return a mapping from property values to their labels in the current
240        locale."""
241        locale = get_locale()
242        if locale not in self._names:
243            self._names[locale] = self._locale_names(locale)
244        return self._names[locale]
245
246    def validate(
247        self, value: str, fuzzy: bool = False, format: Optional[str] = None
248    ) -> bool:
249        """Make sure that the given code value is one of the supported set."""
250        if value is None:
251            return False
252        return str(value).lower().strip() in self.codes
253
254    def clean_text(
255        self,
256        code: str,
257        fuzzy: bool = False,
258        format: Optional[str] = None,
259        proxy: Optional["EntityProxy"] = None,
260    ) -> Optional[str]:
261        """All code values are cleaned to be lowercase, and leading and trailing
262        whitespace is removed."""
263        code = code.lower().strip()
264        if code not in self.codes:
265            return None
266        return code
267
268    def caption(self, value: str) -> str:
269        """Given a code value, return the label that should be shown to a user."""
270        return self.names.get(value, value)
271
272    def to_dict(self) -> PropertyTypeToDict:
273        """When serialising the model to JSON, include all values."""
274        data = super(EnumType, self).to_dict()
275        data["values"] = self.names
276        return data

EnumValues = typing.Dict[str, str]

class PropertyTypeToDict(typing.TypedDict): View Source

19class PropertyTypeToDict(TypedDict, total=False):
20    label: str
21    plural: str
22    description: Optional[str]
23    maxLength: int
24    group: Optional[str]
25    matchable: Optional[bool]
26    pivot: Optional[bool]
27    values: Optional[EnumValues]

label: str

plural: str

description: Optional[str]

maxLength: int

group: Optional[str]

matchable: Optional[bool]

pivot: Optional[bool]

values: Optional[Dict[str, str]]

class EnumType(PropertyType): View Source

227class EnumType(PropertyType):
228    """Enumerated type properties are used for types which have a defined set
229    of possible values, like languages and countries."""
230
231    def __init__(self) -> None:
232        self._names: Dict[Locale, EnumValues] = {}
233        self.codes = set(self.names.keys())
234
235    def _locale_names(self, locale: Locale) -> EnumValues:
236        return {}
237
238    @property
239    def names(self) -> EnumValues:
240        """Return a mapping from property values to their labels in the current
241        locale."""
242        locale = get_locale()
243        if locale not in self._names:
244            self._names[locale] = self._locale_names(locale)
245        return self._names[locale]
246
247    def validate(
248        self, value: str, fuzzy: bool = False, format: Optional[str] = None
249    ) -> bool:
250        """Make sure that the given code value is one of the supported set."""
251        if value is None:
252            return False
253        return str(value).lower().strip() in self.codes
254
255    def clean_text(
256        self,
257        code: str,
258        fuzzy: bool = False,
259        format: Optional[str] = None,
260        proxy: Optional["EntityProxy"] = None,
261    ) -> Optional[str]:
262        """All code values are cleaned to be lowercase, and leading and trailing
263        whitespace is removed."""
264        code = code.lower().strip()
265        if code not in self.codes:
266            return None
267        return code
268
269    def caption(self, value: str) -> str:
270        """Given a code value, return the label that should be shown to a user."""
271        return self.names.get(value, value)
272
273    def to_dict(self) -> PropertyTypeToDict:
274        """When serialising the model to JSON, include all values."""
275        data = super(EnumType, self).to_dict()
276        data["values"] = self.names
277        return data

Enumerated type properties are used for types which have a defined set of possible values, like languages and countries.

codes

names: Dict[str, str] View Source

238    @property
239    def names(self) -> EnumValues:
240        """Return a mapping from property values to their labels in the current
241        locale."""
242        locale = get_locale()
243        if locale not in self._names:
244            self._names[locale] = self._locale_names(locale)
245        return self._names[locale]

Return a mapping from property values to their labels in the current locale.

def validate( self, value: str, fuzzy: bool = False, format: Optional[str] = None) -> bool: View Source

247    def validate(
248        self, value: str, fuzzy: bool = False, format: Optional[str] = None
249    ) -> bool:
250        """Make sure that the given code value is one of the supported set."""
251        if value is None:
252            return False
253        return str(value).lower().strip() in self.codes

Make sure that the given code value is one of the supported set.

def clean_text( self, code: str, fuzzy: bool = False, format: Optional[str] = None, proxy: Optional[followthemoney.proxy.EntityProxy] = None) -> Optional[str]: View Source

255    def clean_text(
256        self,
257        code: str,
258        fuzzy: bool = False,
259        format: Optional[str] = None,
260        proxy: Optional["EntityProxy"] = None,
261    ) -> Optional[str]:
262        """All code values are cleaned to be lowercase, and leading and trailing
263        whitespace is removed."""
264        code = code.lower().strip()
265        if code not in self.codes:
266            return None
267        return code

All code values are cleaned to be lowercase, and leading and trailing whitespace is removed.

def caption(self, value: str) -> str: View Source

269    def caption(self, value: str) -> str:
270        """Given a code value, return the label that should be shown to a user."""
271        return self.names.get(value, value)

Given a code value, return the label that should be shown to a user.

def to_dict(self) -> PropertyTypeToDict: View Source

273    def to_dict(self) -> PropertyTypeToDict:
274        """When serialising the model to JSON, include all values."""
275        data = super(EnumType, self).to_dict()
276        data["values"] = self.names
277        return data

When serialising the model to JSON, include all values.

Inherited Members

PropertyType: name; group; label; plural; matchable; pivot; max_length; total_size; docs; clean; join; specificity; compare_safe; compare; compare_sets; country_hint; rdf; pick; node_id; node_id_safe