# Source code for deepparse.comparer.addresses_comparer

from dataclasses import dataclass
from typing import List, Tuple, Union, Dict

from .formatted_compared_addresses_raw import FormattedComparedAddressesRaw
from .formatted_compared_addresses_tags import FormattedComparedAddressesTags
from ..parser import AddressParser
from ..parser.formatted_parsed_address import FormattedParsedAddress


@dataclass(frozen=True)
class AddressesComparer:
    """
    Address comparer is used to compare addresses with each other and retrieve the differences between them. The
    addresses are parsed using an address parser based on one of the seq2seq pretrained networks, either with
    FastText or BPEmb.

    The address comparer can compare already parsed addresses. It first recomposes the raw addresses from the
    source parsing, then lets the address parser suggest its own tags; finally, it compares the tags of the source
    parsing with the tags of the newly parsed address.

    The address comparer is also able to compare raw addresses by first parsing the addresses using the
    address parser and then bringing out the differences among the parsed addresses.


    Args:
        parser (~deepparse.parser.address_parser.AddressParser): the AddressParser used to parse the addresses.
    """

    parser: AddressParser

    def __str__(self) -> str:
        model_type_formatted = self.parser.get_formatted_model_name()
        return f"Compare addresses with {model_type_formatted}AddressParser"

    __repr__ = __str__  # To call __str__ when list of address

    def compare_tags(
        self,
        addresses_tags_to_compare: Union[List[tuple], List[List[tuple]]],
        with_prob: Union[None, bool] = None,
    ) -> Union[List[FormattedComparedAddressesTags], FormattedComparedAddressesTags]:
        """
        Compare tags of a source parsing with the parsing from AddressParser. First, it reconstructs the
        raw address from the source parsing; then AddressParser generates its own tags for that raw address, and
        the two parsings are compared.

        Args:
            addresses_tags_to_compare (Union[List[tuple], List[List[tuple]]]): list of tuples that contain
                the tags for the address components from the source. Can compare multiple parsings if passed as a
                list of lists of tuples.
            with_prob (Union[None, bool]): An option flag to either or not include probabilities in the comparison
                report. The probabilities are not compared but only included in the report. The default value is
                ``None``, which means the flag is inferred from the input: probabilities are included as soon as
                the first component of at least one source parsing carries a ``(tag, probability)`` pair.

        Return:
            Either a :class:`~FormattedComparedAddressesTags` or a list of
            :class:`~FormattedComparedAddressesTags` when there is more than one comparison to make.

        Examples:

            .. code-block:: python

                first_parsed_address = [
                    ("350", "StreetNumber"),
                    ("rue des Lilas", "StreetName"),
                    ("Ouest Québec", "Municipality"),
                    ("Québec", "Province"),
                    ("G1L 1B6", "PostalCode")]
                second_parsed_address_with_prob = [
                    ('350', ('StreetNumber', 1.0)),
                    ('rue', ('StreetName', 0.9987)),
                    ('des', ('StreetName', 0.9993)),
                    ('Lilas', ('StreetName', 0.8176)),
                    ('Ouest', ('Orientation', 0.781)),
                    ('Quebec', ('Municipality', 0.9768)),
                    ('Quebec', ('Province', 1.0)),
                    ('G1L', ('PostalCode', 0.9993)),
                    ('1B6', ('PostalCode', 1.0))]

                address_parser = AddressParser(model_type="bpemb")
                addresses_comparer = AddressesComparer(address_parser)

                list_of_compared_addresses = addresses_comparer.compare_tags([first_parsed_address,
                                                                              second_parsed_address_with_prob])
                list_of_compared_addresses[0].comparison_report()
                list_of_compared_addresses[1].comparison_report()

        """

        # A single parsing is a flat list of tuples: wrap it so the rest of the method only handles the
        # "list of parsings" shape.
        if isinstance(addresses_tags_to_compare[0], tuple):
            addresses_tags_to_compare = [addresses_tags_to_compare]

        if with_prob is None:
            with_prob = any(self._check_if_with_prob(address) for address in addresses_tags_to_compare)

        # Rebuild each raw address by joining the text of its components with single spaces.
        raw_addresses = [" ".join(element[0] for element in address) for address in addresses_tags_to_compare]

        formatted_addresses = [
            FormattedParsedAddress({raw_address: address_tags})
            for raw_address, address_tags in zip(raw_addresses, addresses_tags_to_compare)
        ]

        deepparsed_formatted_addresses = [
            self.parser(raw_address, with_prob=with_prob) for raw_address in raw_addresses
        ]

        comparison_tuples = list(zip(formatted_addresses, deepparsed_formatted_addresses))

        origin_tuple = ("source", self._deepparse_origin())
        list_of_comparison_dict = self._format_comparisons_dict(comparison_tuples, origin_tuple, with_prob)

        formatted_comparisons = [
            FormattedComparedAddressesTags(**comparison_info) for comparison_info in list_of_comparison_dict
        ]
        # A lone comparison is returned bare rather than wrapped in a one-element list.
        return formatted_comparisons if len(formatted_comparisons) > 1 else formatted_comparisons[0]

    def compare_raw(
        self,
        raw_addresses_to_compare: Union[Tuple[str], List[Tuple[str]]],
        with_prob: Union[None, bool] = None,
    ) -> Union[List[FormattedComparedAddressesRaw], FormattedComparedAddressesRaw]:
        """
        Compare pairs of raw addresses together. It starts by parsing the addresses
        with the parser and then returns the differences between the parsed address components of the two
        addresses.

        Args:
            raw_addresses_to_compare (Union[Tuple[str], List[Tuple[str]]]):
                Either a pair of strings that represent the two raw addresses to compare, or a list of such pairs.
            with_prob (Union[None, bool]): An option flag to either or not include probabilities in the comparison
                report. The probabilities are not compared but only included in the report. The default value is
                ``None``, which is treated as ``True``.

        Return:
            Either a :class:`~FormattedComparedAddressesRaw` or a list of
            :class:`~FormattedComparedAddressesRaw` when given more than one comparison to make.

        Raises:
            ValueError: if one of the comparisons does not contain exactly two addresses.

        Examples:

            .. code-block:: python

                raw_address_original = "350 rue des Lilas Ouest Quebec Quebec G1L 1B6"
                raw_address_identical = "350 rue des Lilas Ouest Quebec Quebec G1L 1B6"
                raw_address_equivalent = "350  rue des Lilas Ouest Quebec Quebec G1L 1B6"
                raw_address_diff_streetNumber = "450 rue des Lilas Ouest Quebec Quebec G1L 1B6"

                address_parser = AddressParser(model_type="bpemb")
                addresses_comparer = AddressesComparer(address_parser)

                raw_addresses_multiples_comparisons = addresses_comparer.compare_raw([(raw_address_original,
                                                                                       raw_address_identical),
                                                                                      (raw_address_original,
                                                                                       raw_address_equivalent),
                                                                                      (raw_address_original,
                                                                                       raw_address_diff_streetNumber)])
                raw_addresses_multiples_comparisons[0].comparison_report()
                raw_addresses_multiples_comparisons[1].comparison_report()
                raw_addresses_multiples_comparisons[2].comparison_report()

        """
        # A single comparison is a flat pair of strings: wrap it so the rest of the method only handles the
        # "list of pairs" shape.
        if isinstance(raw_addresses_to_compare[0], str):
            raw_addresses_to_compare = [raw_addresses_to_compare]

        with_prob = True if with_prob is None else with_prob

        # Validate every pair before parsing anything, so a malformed pair fails fast instead of after the
        # parser has already been run on the preceding pairs.
        for addresses_to_compare in raw_addresses_to_compare:
            if len(addresses_to_compare) != 2:
                raise ValueError("You need to compare two addresses")

        list_of_deepparsed_addresses = [
            self.parser(addresses_to_compare, with_prob=with_prob)
            for addresses_to_compare in raw_addresses_to_compare
        ]

        # Both sides of a raw comparison come from the same parser, hence the same origin label twice.
        deepparse_origin = self._deepparse_origin()
        origin_tuple = (deepparse_origin, deepparse_origin)
        list_of_comparison_dict = self._format_comparisons_dict(list_of_deepparsed_addresses, origin_tuple, with_prob)

        formatted_comparisons = [
            FormattedComparedAddressesRaw(**comparison_info) for comparison_info in list_of_comparison_dict
        ]

        # A lone comparison is returned bare rather than wrapped in a one-element list.
        return formatted_comparisons if len(formatted_comparisons) > 1 else formatted_comparisons[0]

    def _deepparse_origin(self) -> str:
        """
        Return the origin label used for parsings produced by the address parser, built from the capitalized
        model type of the parser.
        """
        return "deepparse using " + self.parser.model_type.capitalize()

    @staticmethod
    def _format_comparisons_dict(comparison_tuples: List, origin_tuple: Tuple[str, str], with_prob: bool) -> List[Dict]:
        """
        Return, for each pair of addresses to compare, a dictionary that contains the two addresses of the pair,
        the origin name tuple and the ``with_prob`` flag.

        Args:
            comparison_tuples (List): the pairs of addresses to compare; only the first two elements of each
                pair are used.
            origin_tuple (Tuple[str, str]): the origin names of the first and second address, shared by every
                comparison.
            with_prob (bool): the probability flag, shared by every comparison.

        Return:
            A list with one dictionary per comparison, with the keys ``"first_address"``, ``"second_address"``,
            ``"origin"`` and ``"with_prob"``.
        """
        return [
            {
                "first_address": comparison_tuple[0],
                "second_address": comparison_tuple[1],
                "origin": origin_tuple,
                "with_prob": with_prob,
            }
            for comparison_tuple in comparison_tuples
        ]

    @staticmethod
    def _check_if_with_prob(list_of_tuple: List[Tuple]) -> bool:
        """
        Return ``True`` when the parsing seems to include probabilities. Only the first component is inspected:
        its tag must be a two-element container whose second element is a float.
        """
        first_tag = list_of_tuple[0][1]
        return len(first_tag) == 2 and isinstance(first_tag[1], float)