# Source code for deepparse.comparer.formatted_compared_addresses

# pylint: disable=superfluous-parens

import os
import sys
from abc import ABC, abstractmethod
from dataclasses import dataclass
from difflib import SequenceMatcher
from typing import List, Union, Tuple, Dict

from ..parser import FormattedParsedAddress


@dataclass(frozen=True)
class FormattedComparedAddresses(ABC):
    """
    Abstract base class that defines a comparison for addresses returned by the address comparer.

    Args:
        first_address(FormattedParsedAddress): A formatted parsed address that contains the parsing information
                                                for the first one.
        second_address(FormattedParsedAddress): A formatted parsed address that contains the parsing information
                                                for the second one.
        origin: (Tuple[str, str]): The origin of the parsing (ex : from source or a Deepparse pretrained model).
        with_prob (bool): Flag stored for the concrete subclasses; this base class never reads it
            (presumably whether the tags' probabilities are part of the report -- confirm against the subclasses).

    Example:

        .. code-block:: python

            address_comparer = AddressesComparer(AddressParser())
            raw_identical_comparison = address_comparer.compare_raw(
                                                        ("350 rue des Lilas Ouest Quebec city Quebec G1L 1B6",
                                                        "450 rue des Lilas Ouest Quebec city Quebec G1L 1B6"))
    """

    first_address: FormattedParsedAddress
    second_address: FormattedParsedAddress
    origin: Tuple[str, str]
    with_prob: bool

    @property
    def list_of_bool(self) -> List[tuple]:
        """
        A list that pairs every address component name found in either address with a boolean indicating if the
        component is the same for the two addresses.

        Return:
            A list of ``(address component name, bool)`` tuples, in order of first appearance of the names.
        """
        return self._bool_address_tags_are_the_same(
            [
                self.first_address.to_list_of_tuples(),
                self.second_address.to_list_of_tuples(),
            ]
        )

    @property
    def equivalent(self) -> bool:
        """
        Check if the parsing is the same for the two addresses.

        Return:
            A bool, ``True`` when every address component is the same for the two addresses.
        """
        return all(is_same for _, is_same in self.list_of_bool)

    @property
    def identical(self) -> bool:
        """
        Check if the parsing is the same for the two addresses and if the raw addresses are identical.

        Return:
            A bool.
        """
        return self.equivalent and self.first_address.raw_address == self.second_address.raw_address

    def comparison_report(self, nb_delimiters: Union[int, None] = None) -> None:
        """
        Print a formatted comparison report of the two addresses.

        Args:
            nb_delimiters (Union[int, None], optional): The number of ``=`` characters used for the delimiter lines
                printed before and after the report. When ``None`` (the default), the terminal width is used.
        """
        # The report is a single string: ``write`` emits it in one call, whereas ``writelines`` would iterate
        # over it and write it one character at a time.
        sys.stdout.write(self._comparison_report(nb_delimiters))

    def _comparison_report(self, nb_delimiters: Union[int, None]) -> str:
        """
        Builds a comparison_report with delimiters to make the comparison's beginning and end easier to spot.

        Args:
            nb_delimiters (Union[int, None]): The length of the delimiter lines. When ``None``, the terminal
                width is used, falling back to 80 columns if no terminal is attached.

        Return:
            str: The delimiter line, the core of the report, the delimiter line again and an empty line.
        """
        if nb_delimiters is None:
            # Get terminal size to adapt the output to the user
            try:
                nb_delimiters = os.get_terminal_size().columns
            except OSError:
                # Standard output is not a terminal (pipe, file redirection, CI, some notebooks):
                # use a conventional width instead of crashing the report.
                nb_delimiters = 80

        delimiter_line = "=" * nb_delimiters + "\n"
        return delimiter_line + self._comparison_report_builder() + delimiter_line + "\n"

    @abstractmethod
    def _comparison_report_builder(self) -> str:
        """
        Builds the core of a comparison report for the various comparisons. Since the procedure to make a tags
        comparison and the raw addresses comparison are different, the comparison report is not the same for the two.
        It is then implemented in each specific class.
        """

    @abstractmethod
    def _get_probs(self) -> Dict:
        """
        A method to get the tags from the parsing with their associated probabilities, it needs to be implemented in
        each class because they don't use the probabilities the same way.
        """

    @staticmethod
    def _get_color_diff(string_one: str, string_two: str, highlight: bool = False) -> str:
        """
        Compare two strings and determine the difference between the two. The differences are highlighted with a
        coloured scheme; if the first string has more elements than the second one, it will be noted in one colour;
        on the contrary, if the other string has something more, it will have a different colour notation.

        Args:
            string_one (str): The first string to compare.
            string_two (str): The second string to compare.
            highlight (bool, optional): If set to yes, the difference will be highlighted in colour instead of the
                character itself in colour. This might be used to have information where the discrepancies between
                two strings are spaces. The default is False.

        Notes:
            The method is colorblind-friendly, which means that the output will be
            in colours that minimize the risk that a user cannot see the difference as
            defined here https://davidmathlogic.com/colorblind/#%23D81B60-%231E88E5-%23FFC107-%23004D40.

            If both the strings share the same character, it will be written in white.
            If the first string has something more than the second one, it will be indicated in blue.
            If the second string has something more than the first one, it will be noted in yellow.

            It uses SequenceMatcher to convert the different codes into colour codes later.

        Return:
            str: The two strings joined, and the differences are noted in colour codes
        """
        # 48 selects the background colour (highlight), 38 the foreground colour of the characters.
        code_type = 48 if highlight else 38

        color_1 = "\033[{code_type};2;26;123;220m{text}\033[0m"  # blue
        color_2 = "\033[{code_type};2;255;194;10m{text}\033[0m"  # yellow

        white = "\033[38;2;255;255;255m{text}\033[0m"

        pieces = []
        opcodes = SequenceMatcher(a=string_one, b=string_two).get_opcodes()
        for operation, start_one, end_one, start_two, end_two in opcodes:
            only_in_one = string_one[start_one:end_one]
            only_in_two = string_two[start_two:end_two]
            if operation == "equal":
                pieces.append(white.format(text=only_in_one))
            elif operation == "delete":
                pieces.append(color_1.format(code_type=code_type, text=only_in_one))
            elif operation == "insert":
                pieces.append(color_2.format(code_type=code_type, text=only_in_two))
            elif operation == "replace":
                in_blue = color_1.format(code_type=code_type, text=only_in_one)
                in_yellow = color_2.format(code_type=code_type, text=only_in_two)
                # The segment that starts at the lowest index is written first (the first string on a tie).
                if start_one <= start_two:
                    pieces.append(in_blue + in_yellow)
                else:
                    pieces.append(in_yellow + in_blue)
        return "".join(pieces)

    def _get_tags_diff_color(
        self,
        name_one: str = "first address",
        name_two: str = "second address",
        verbose: bool = True,
    ) -> str:
        """
        Build the string, with colour codes, representing the differences between the two addresses for every
        address component that is not the same in both.

        Args:
            name_one (str, optional) : Name associated with first color. The default value is ``"first address"``,
                namely the first address of the two. We recommend using a whitespace characters between the words.
            name_two (str, optional) : Name associated with the second colour. The default value is
                ``"second address"``, namely the second address of the two.  We recommend using a whitespace
                characters between the words.
            verbose (bool, optional): If True, the output starts with a presentation of the colours and their
                meaning. The default value is ``True``.

        Return:
            str: The optional colours legend followed, for each differing address component, by its name and
            the coloured difference between the two addresses.
        """
        pieces = []
        if verbose:
            pieces.append("White: Shared\n")
            pieces.append("Blue: Belongs only to the " + name_one + "\n")
            pieces.append("Yellow: Belongs only to the " + name_two + "\n")
            pieces.append("\n")

        first_parsed_address = self.first_address.to_list_of_tuples()
        second_parsed_address = self.second_address.to_list_of_tuples()

        for address_component_name, is_same in self.list_of_bool:
            if is_same:
                # Only the components that differ are reported.
                continue

            result = self._get_color_diff(
                self._join_component_tags(first_parsed_address, address_component_name),
                self._join_component_tags(second_parsed_address, address_component_name),
            )

            pieces.append(address_component_name + ": \n")
            pieces.append(result + "\n")

        return "".join(pieces)

    @staticmethod
    def _join_component_tags(parsed_address: List[tuple], address_component_name: str) -> str:
        """
        Join with a whitespace all the tags of a parsed address associated with an address component name.

        Args:
            parsed_address (List[tuple]): The ``(tag, address component name)`` tuples of a parsed address.
            address_component_name (str): The address component name to keep the tags of.

        Return:
            str: The matching tags joined with a whitespace, tags that are ``None`` being skipped. The string is
            empty if the address has no tag for the component.
        """
        return " ".join(
            tag for (tag, tag_name) in parsed_address if tag_name == address_component_name and tag is not None
        )

    def _bool_address_tags_are_the_same(self, parsed_addresses: Union[List[List[tuple]], List[tuple]]) -> List[tuple]:
        """
        Compare the components between the addresses and, for each address component name, indicate if the tags
        associated with it are the same for all the addresses.

        Args:
            parsed_addresses (Union[List[List[tuple]], List[tuple]]): Contains the tags and the
            address components' names for the parsed addresses.

        Return:
            List[tuple]: List of ``(address component name, bool)`` tuples, the bool being ``True`` when the
            joined tags of the component are equal for all the addresses.
        """
        list_of_bool_and_tag = []
        for address_component_name in self._unique_addresses_component_names(parsed_addresses):
            joined_tags = [
                self._join_component_tags(parsed_address, address_component_name)
                for parsed_address in parsed_addresses
            ]
            # All the joined tags are equal when, at most, one distinct value remains.
            list_of_bool_and_tag.append((address_component_name, len(set(joined_tags)) <= 1))

        return list_of_bool_and_tag

    @staticmethod
    def _unique_addresses_component_names(parsed_addresses: List[List[tuple]]) -> List:
        """
        Retrieves all the unique address component names from the comparison, then returns it.

        Args:
            parsed_addresses (List[List[tuple]]): Contains the tags and the
            address components' names for the parsed addresses.

        Return:
            Returns a list of all the unique address component names, in order of first appearance.
        """
        # We don't use a set here since the order and report will change; a dict keeps the insertion order
        # while dropping the duplicates.
        return list(
            dict.fromkeys(
                address_component[1] for parsed_address in parsed_addresses for address_component in parsed_address
            )
        )