Source code for deepparse.parser.formatted_parsed_address
from typing import Dict, List, Tuple, Union
# Default address tags, listed in their default output order. Every tag in this list is initialised
# to ``None`` on a new ``FormattedParsedAddress`` and the list is the default ``fields`` value
# used by ``FormattedParsedAddress.to_dict``.
FIELDS = [
    "StreetNumber",
    "Unit",
    "StreetName",
    "Orientation",
    "Municipality",
    "Province",
    "PostalCode",
    "GeneralDelivery",
    "EOS",
]
class FormattedParsedAddress:
    """
    A parsed address as commonly known returned by an address parser.

    Args:
        address (dict): A dictionary where the key is an address, and the value is a list of tuples where
            the first elements are address components, and the second elements are the parsed address
            value. Also, the second tuple's address value can either be the tag of the components
            (e.g. StreetName) or a tuple (``x``, ``y``) where ``x`` is the tag and ``y`` is the
            probability (e.g. 0.9981) of the model prediction. Only the first key of the dictionary is used.

    Attributes:
        raw_address: The raw address (not parsed).
        address_parsed_components: The parsed address in a list of tuples where the first elements
            are the address components and the second elements are the tags (or ``(tag, probability)`` tuples).
        inferred_order: The tags of the parsed address, in order of first appearance and without duplicates.
        <Address tag>: All the possible address tag element of the model. For example, ``StreetName`` or
            ``StreetNumber``. A tag that does not appear in the parsed address is ``None``.

    Example:

        .. code-block:: python

            address_parser = AddressParser()
            parsed_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6")
            print(parsed_address.StreetNumber)  # 350
            print(parsed_address.PostalCode)  # G1L 1B6

            # Print the parsed address
            print(parsed_address)

    Note:
        Since an address component can be composed of multiple elements (e.g. Wolfe street), when the probability
        values are asked of the address parser, the address components don't keep it. It's only available through the
        ``address_parsed_components`` attribute.
    """

    def __init__(self, address: Dict) -> None:
        # Every known tag exists as an attribute, so an absent component reads as ``None``.
        for key in FIELDS:
            setattr(self, key, None)

        # Only the first entry of the dictionary is considered.
        self.raw_address = list(address)[0]
        self.address_parsed_components = address[self.raw_address]

        self._infer_tags_order()
        self._resolve_tagged_affectation(self.address_parsed_components)

    def __str__(self) -> str:
        """
        Return the unparsed address plus the parsed address components.
        """
        return (
            f"The unparsed address is '{self.raw_address}' and the parsed address is "
            f"'{self._formatted_parsed_address_components()}'"
        )

    def _formatted_parsed_address_components(self) -> str:
        """
        Return the parsed ``(component, tag)`` tuples as a single whitespace-separated string.
        """
        return " ".join(str(component_tuple) for component_tuple in self.address_parsed_components)

    def __repr__(self) -> str:
        """
        Return ``ClassName<Tag='value', ...>`` listing only the address tags that have a value.
        """
        values = [
            self._get_attr_repr(name)
            for name in self.__dict__
            if name not in ("raw_address", "address_parsed_components", "inferred_order")
        ]
        joined_values = ", ".join(v for v in values if v != "")
        return self.__class__.__name__ + "<" + joined_values + ">"

    def __eq__(self, other) -> bool:
        """
        Equal if all address components elements are equals. If attributes are not the same, it will return False.

        The comparison goes through every attribute of ``self`` (raw address, parsed components, inferred order
        and tags); ``other`` only needs to expose the same attributes with equal values.
        """
        for field, address_component in self.__dict__.items():
            try:
                other_address_component = getattr(other, field)
            except AttributeError:
                # Attribute not the same.
                return False
            if address_component != other_address_component:
                # An element is different.
                return False
        return True

    def format_address(
        self,
        fields: Union[List, None] = None,
        capitalize_fields: Union[List[str], None] = None,
        upper_case_fields: Union[List[str], None] = None,
        field_separator: Union[str, None] = None,
    ) -> str:
        """
        Method to format the address components in a specific order. We also filter the empty components (None).
        By default, the order is the one inferred from the parsed address and we filter the empty components.

        Args:
            fields (Union[list, None]): Optional argument to define the fields to order the address components of
                the address. If None, we will use the inferred order based on the address tags' appearance. For
                example, if the parsed address is ``(305, StreetNumber), (rue, StreetName), (des, StreetName),
                (Lilas, StreetName)``, the inferred order will be ``StreetNumber, StreetName``.
            capitalize_fields (Union[list, None]): Optional argument to define the capitalized fields for the
                formatted address. Only the first character of the whole component is capitalized (``str.capitalize``).
                If None, no fields are capitalized.
            upper_case_fields (Union[list, None]): Optional argument to define the upper-cased fields for the
                formatted address. If None, no fields are upper-cased.
            field_separator (Union[str, None]): Optional argument to define the field separator between address
                components. If None, the default field separator is ``" "``.

        Return:
            A string of the formatted address in the fields order.

        Raises:
            KeyError: If an element of ``fields``, ``capitalize_fields`` or ``upper_case_fields`` is not an
                attribute of the formatted parsed address.

        Examples:

            .. code-block:: python

                address_parser = AddressParser()
                parse_address = address_parser("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6")

                parse_address.format_address(field_separator=", ")
                # > 350, rue des lilas, ouest, quebec city, quebec, g1l 1b6

                parse_address.format_address(field_separator=", ", capitalize_fields=["StreetName", "Orientation"])
                # > 350, Rue des lilas, Ouest, quebec city, quebec, g1l 1b6

                parse_address.format_address(field_separator=", ", upper_case_fields=["PostalCode"])
                # > 350, rue des lilas, ouest, quebec city, quebec, G1L 1B6
        """
        if fields is None:
            fields = self.inferred_order
        self._validate_argument(fields)

        if capitalize_fields is None:
            capitalize_fields = []
        self._validate_argument(capitalize_fields)

        if upper_case_fields is None:
            upper_case_fields = []
        self._validate_argument(upper_case_fields)

        if field_separator is None:
            field_separator = " "

        formatted_components = []
        for field in fields:
            address_component = getattr(self, field)
            if address_component is None:
                # Empty components are filtered out.
                continue
            if field in capitalize_fields:
                address_component = address_component.capitalize()
            if field in upper_case_fields:
                address_component = address_component.upper()
            formatted_components.append(address_component)

        # Joining (rather than appending the separator and stripping it afterwards) never removes characters
        # that belong to the first or last component when they also appear in the separator.
        return field_separator.join(formatted_components)

    def to_dict(self, fields: Union[List, None] = None) -> dict:
        """
        Method to convert a parsed address into a dictionary where the keys are the address components, and the values
        are the value of those components. For example, the parsed address ``<StreetNumber> 305 <StreetName>
        rue des Lilas`` will be converted into the following dictionary:
        ``{'StreetNumber':'305', 'StreetName': 'rue des Lilas'}``.

        Args:
            fields (Union[list, None]): Optional argument to define the fields to extract from the address and the
                order of it. If None, will use the default order and value ``'StreetNumber, Unit, StreetName,
                Orientation, Municipality, Province, PostalCode, GeneralDelivery, EOS'``.

        Return:
            A dictionary where the keys are the selected (or default) fields and the values are the corresponding value
            of the address components (``None`` for a component without a value).
        """
        if fields is None:
            fields = FIELDS
        return {field: getattr(self, field) for field in fields}

    def to_list_of_tuples(self, fields: Union[List, None] = None) -> List[tuple]:
        """
        Method to convert a parsed address into a list of tuples where the first element of the tuples
        is the value of the components, and the second value is the name of the components.

        For example, the parsed address ``<StreetNumber> 305 <StreetName> rue des Lilas`` will be converted into the
        following list of tuples: ``[('305', 'StreetNumber'), ('rue des Lilas', 'StreetName')]``.

        Args:
            fields (Union[list, None]): Optional argument to define the fields to extract from the address and its
                order. If None, it will use the default order and value ``'StreetNumber, Unit, StreetName,
                Orientation, Municipality, Province, PostalCode, GeneralDelivery, EOS'``.

        Return:
            A list of tuples where the first element of the tuples are the value of the address components
            and the second values are the name of the address components.
        """
        dict_of_attr = self.to_dict(fields)
        return [(value, key) for key, value in dict_of_attr.items()]

    def to_pandas(self) -> Dict:
        """
        Method to convert a parsed address into a dictionary for pandas where the first key is the raw address and
        the following keys are the address components, and the values are the values of those components.
        For example, the parsed address ``<StreetNumber> 305 <StreetName> rue des Lilas`` will be converted into
        the following dictionary: ``{'Address': '305 rue des Lilas', 'StreetNumber':'305', 'StreetName':
        'rue des Lilas'}``. Components without a value are present with the value ``None``.

        Return:
            A dictionary of the raw address and all its parsed components.
        """
        return {"Address": self.raw_address, **self.to_dict()}

    def to_pickle(self) -> Tuple[str, List]:
        """
        Method to convert a parsed address into a tuple for pickle where the first element is the
        raw address and the second element is the list of ``(value, tag)`` tuples of the address components.
        For example, the parsed address ``<StreetNumber> 305 <StreetName> rue des Lilas``
        will be converted into the following tuple: ``('305 rue des Lilas', [('305', 'StreetNumber'),
        ('rue des Lilas', 'StreetName')])``. Components without a value are present with the value ``None``.

        Return:
            A tuple where the first element is the raw address (a string) and the second element is a list of
            tuple of the parsed addresses. The first element of each tuple is the address components and the second
            is the tag.
        """
        return self.raw_address, self.to_list_of_tuples()

    def _resolve_tagged_affectation(self, tagged_address: List[Tuple]) -> None:
        """
        Private method to resolve the parsing of the tagged address.

        Args:
            tagged_address: The tagged address as a list of tuples where the first elements are the address
                components and the second elements are the associated tags (or ``(tag, probability)`` tuples).
        """
        for address_component, tag in tagged_address:
            if isinstance(tag, tuple):  # when tag is also the tag and the probability of the tag
                tag = tag[0]

            current_value = getattr(self, tag)
            if current_value is None:
                # empty address components
                setattr(self, tag, address_component)
            else:
                # we merge the previous components with the new element
                setattr(self, tag, " ".join([current_value, address_component]))

    def _get_attr_repr(self, name: str) -> str:
        """
        Return ``name=repr(value)`` for the attribute ``name``, or an empty string when its value is ``None``.
        """
        value = getattr(self, name)
        if value is not None:
            return name + "=" + repr(value)
        return ""

    def _validate_argument(self, arg: List) -> None:
        """
        Raise a ``KeyError`` if an element of ``arg`` is not an attribute of the formatted parsed address.
        """
        for arg_element in arg:
            if not hasattr(self, arg_element):
                raise KeyError(arg_element + " not an attribute of the formatted parsed address.")

    def _infer_tags_order(self) -> None:
        """
        Private method to infer the order of the tags base on the address order tag.

        The result is stored in ``inferred_order``: each tag appears once, at the position of its first occurrence.
        """
        inferred_order = []
        for _, tag in self.address_parsed_components:
            if isinstance(tag, tuple):
                # The tag comes with its probability: keep only the tag name, otherwise every
                # ``(tag, probability)`` pair would be seen as a distinct (and non-string) tag.
                tag = tag[0]
            if tag not in inferred_order:
                inferred_order.append(tag)
        self.inferred_order = inferred_order