From 2a82beecff6d71cd93862383b33f7c7c821a93b0 Mon Sep 17 00:00:00 2001 From: Ari Date: Thu, 20 Jun 2024 16:46:04 -0400 Subject: [PATCH 1/2] updated np.float_ to np.float64 for current compatibility with Numpy 2.0.0 - was released this week --- distances/_discounted_levenshtein.py | 2 +- distances/_editex.py | 2 +- distances/_levenshtein.py | 2 +- distances/_typo.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/distances/_discounted_levenshtein.py b/distances/_discounted_levenshtein.py index d2d5ed3..62603fc 100644 --- a/distances/_discounted_levenshtein.py +++ b/distances/_discounted_levenshtein.py @@ -175,7 +175,7 @@ def _alignment_matrix( else: discount_from = [1, 1] - d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float_) + d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float64) if backtrace: trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8) for i in range(1, src_len + 1): diff --git a/distances/_editex.py b/distances/_editex.py index fa1b660..d34dd28 100644 --- a/distances/_editex.py +++ b/distances/_editex.py @@ -23,7 +23,7 @@ from typing import Any, Tuple, cast from unicodedata import normalize as unicode_normalize -from numpy import float_ as np_float +from numpy import float64 as np_float from numpy import zeros as np_zeros from ._distance import _Distance diff --git a/distances/_levenshtein.py b/distances/_levenshtein.py index c35f47d..42ee2bc 100644 --- a/distances/_levenshtein.py +++ b/distances/_levenshtein.py @@ -138,7 +138,7 @@ def _alignment_matrix( tar_len = len(tar) max_len = max(src_len, tar_len) - d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float_) + d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float64) if backtrace: trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8) for i in range(src_len + 1): diff --git a/distances/_typo.py b/distances/_typo.py index b7ea7d9..2b29db0 100644 --- a/distances/_typo.py +++ b/distances/_typo.py @@ -23,7 +23,7 @@ from math import log from typing import Any, Dict, Tuple, cast -from numpy import float_ as np_float +from numpy import float64 as np_float from numpy import zeros as np_zeros from ._distance import _Distance From 57b0b248da2fedbe9b53ec4fc20460f88ce711e0 Mon Sep 17 00:00:00 2001 From: Michiel Date: Wed, 26 Jun 2024 12:06:48 +0200 Subject: [PATCH 2/2] organised imports and moved to black formatting --- name_matching/distance_metrics.py | 201 ++-- name_matching/name_matcher.py | 441 ++++--- name_matching/run_nm.py | 290 +++-- name_matching/sparse_cosine.py | 99 +- name_matching/test/test_name_matcher.py | 1357 ++++++++++++++++------ name_matching/test/test_sparse_cosine.py | 417 ++++--- setup.py | 28 +- 7 files changed, 1822 insertions(+), 1011 deletions(-) diff --git a/name_matching/distance_metrics.py b/name_matching/distance_metrics.py index cc2146e..0d1a50b 100644 --- a/name_matching/distance_metrics.py +++ b/name_matching/distance_metrics.py @@ -1,34 +1,37 @@ -from distances import Indel, DiscountedLevenshtein, CormodeLZ, Tichy, IterativeSubString, BaulieuXIII, Clement, DiceAsymmetricI, KuhnsIII, Overlap, PearsonII, WeightedJaccard, WarrensIV, Bag, RougeL, RatcliffObershelp, NCDbz2, FuzzyWuzzyPartialString, FuzzyWuzzyTokenSort, FuzzyWuzzyTokenSet, Editex, Typo,LIG3, SSK, Levenshtein, DoubleMetaphone, RefinedSoundex, PhoneticDistance +import distances as nm_dist from collections import defaultdict -def make_distance_metrics(indel=False, - discounted_levenshtein=False, - tichy=False, - cormodel_z=False, - iterative_sub_string=False, - baulieu_xiii=False, - clement=False, - dice_asymmetrici=False, - kuhns_iii=False, - overlap=False, - pearson_ii=False, - weighted_jaccard=False, - warrens_iv=False, - bag=False, - rouge_l=False, - ratcliff_obershelp=False, - ncd_bz2=False, - fuzzy_wuzzy_partial_string=False, - fuzzy_wuzzy_token_sort=False, - fuzzy_wuzzy_token_set=False, - editex=False, - typo=False, - lig_3=False, - ssk=False, - refined_soundex=False, - double_metaphone=False) -> dict: + +def make_distance_metrics( + indel=False, + discounted_levenshtein=False, + tichy=False, + cormodel_z=False, + iterative_sub_string=False, + baulieu_xiii=False, + clement=False, + dice_asymmetrici=False, + kuhns_iii=False, + overlap=False, + pearson_ii=False, + weighted_jaccard=False, + warrens_iv=False, + bag=False, + rouge_l=False, + ratcliff_obershelp=False, + ncd_bz2=False, + fuzzy_wuzzy_partial_string=False, + fuzzy_wuzzy_token_sort=False, + fuzzy_wuzzy_token_set=False, + editex=False, + typo=False, + lig_3=False, + ssk=False, + refined_soundex=False, + double_metaphone=False, +) -> dict: r""" - A function which returns a dict containing the distance metrics that should be + A function which returns a dict containing the distance metrics that should be used during the fuzzy string matching Levenshtein edit distance @@ -70,69 +73,69 @@ def make_distance_metrics(indel=False, Parameters ---------- indel: bool - Boolean indicating whether the Indel method should be used during the - fuzzy name matching. The indel method is equal to a regular levenshtein - distance with a twice as high substitution weight + Boolean indicating whether the Indel method should be used during the + fuzzy name matching. The indel method is equal to a regular levenshtein + distance with a twice as high substitution weight default=False discounted_levenshtein: bool - Boolean indicating whether the DiscountedLevenshtein method should be used + Boolean indicating whether the DiscountedLevenshtein method should be used during the fuzzy name matching. Equal to the regular levenshtein distance, only errors later in the string are counted at a discounted rate. To for instance limit the importance of suffix differences default=False tichy: bool - Boolean indicating whether the Tichy method should be used during the + Boolean indicating whether the Tichy method should be used during the fuzzy name matching. This algorithm provides a shortest edit distance based on substring and add operations. default=False cormodel_z: bool - Boolean indicating whether the CormodeLZ method should be used during the - fuzzy name matching. The CormodeLZ distance between strings x and y, is the - minimum number of single characters or substrings of y or of the partially + Boolean indicating whether the CormodeLZ method should be used during the + fuzzy name matching. The CormodeLZ distance between strings x and y, is the + minimum number of single characters or substrings of y or of the partially built string which are required to produce x from left to right. default=False iterative_sub_string: bool - Boolean indicating whether the IterativeSubString method should be used - during the fuzzy name matching. A method that counts the similarities + Boolean indicating whether the IterativeSubString method should be used + during the fuzzy name matching. A method that counts the similarities between two strings substrings and subtracts the differences taking into account the winkler similarity between the string and the substring. default=False baulieu_xiii: bool - Boolean indicating whether the BaulieuXIII method should be used during + Boolean indicating whether the BaulieuXIII method should be used during the fuzzy name matching. The Baulieu XIII distance between two strings is given by the following formula: (|X \ Y| + |Y \ X|) / ( |X ∩ Y| + |X \ Y| + |Y \ X| + |X ∩ Y| ∙ (|X ∩ Y| - 4)^2) default=False clement: bool Boolean indicating whether the Clement method should be used during the - fuzzy name matching. The Clement distance between two strings is given + fuzzy name matching. The Clement distance between two strings is given by the following formula: (|X ∩ Y|/|X|)*(1-|X|/|N|) + (|(N \ X) \ Y|/|N \ X|) * (1-|N \ X|/|N|) default=False dice_asymmetrici: bool - Boolean indicating whether the DiceAsymmetricI method should be used during + Boolean indicating whether the DiceAsymmetricI method should be used during the fuzzy name matching. The Dice asymmetric similarity is given be |X ∩ Y|/|X| default=False kuhns_iii: bool - Boolean indicating whether the KuhnsIII method should be used during the + Boolean indicating whether the KuhnsIII method should be used during the fuzzy name matching default=False overlap: bool - Boolean indicating whether the Overlap method should be used during the + Boolean indicating whether the Overlap method should be used during the fuzzy name matching. The overlap distance is given by: |X ∩ Y|/min(|X|,|Y|) default=True pearson_ii: bool - Boolean indicating whether the PearsonII method should be used during the - fuzzy name matching. This algorithm is based on the Phi coefficient or the + Boolean indicating whether the PearsonII method should be used during the + fuzzy name matching. This algorithm is based on the Phi coefficient or the mean square contingency default=False weighted_jaccard: bool - Boolean indicating whether the WeightedJaccard method should be used during - the fuzzy name matching. This is the Jaccard distance only using a wheighing + Boolean indicating whether the WeightedJaccard method should be used during + the fuzzy name matching. This is the Jaccard distance only using a wheighing for the differences of 3. default=True warrens_iv: bool - Boolean indicating whether the WarrensIV method should be used during the + Boolean indicating whether the WarrensIV method should be used during the fuzzy name matching default=False bag: bool @@ -141,54 +144,54 @@ def make_distance_metrics(indel=False, a similarity tree structure. default=False rouge_l: bool - Boolean indicating whether the ROUGE-L method should be used during the + Boolean indicating whether the ROUGE-L method should be used during the fuzzy name matching. The ROGUE-L method is a measure that counts the longest substring between to strings default=False ratcliff_obershelp: bool - Boolean indicating whether the RatcliffObershelp method should be used + Boolean indicating whether the RatcliffObershelp method should be used during the fuzzy name matching. This method finds the longest common substring - and evaluates the longest common substrings to the right and the left of the + and evaluates the longest common substrings to the right and the left of the original longest common substring default=True ncd_bz2: bool - Boolean indicating whether the NCDbz2 method should be used during the - fuzzy name matching. Applies the Burrows-Wheeler transform to the strings and + Boolean indicating whether the NCDbz2 method should be used during the + fuzzy name matching. Applies the Burrows-Wheeler transform to the strings and subsequently returns the normalised compression distance. default=False fuzzy_wuzzy_partial_string: bool Boolean indicating whether the FuzzyWuzzyPartialString method should be used - during the fuzzy name matching. This methods takes the length of the longest - common substring and divides it over the minimum of the length of each of + during the fuzzy name matching. This methods takes the length of the longest + common substring and divides it over the minimum of the length of each of the two strings. default=False fuzzy_wuzzy_token_sort: bool - Boolean indicating whether the FuzzyWuzzyTokenSort method should be used + Boolean indicating whether the FuzzyWuzzyTokenSort method should be used during the fuzzy name matching. This tokenizes the words in the string and sorts them, subsequently a hamming distance is calculated default=True fuzzy_wuzzy_token_set: bool - Boolean indicating whether the FuzzyWuzzyTokenSet method should be used - during the fuzzy name matching. This method tokenizes the strings and - find the largest intersection of the two substrings and divides it over + Boolean indicating whether the FuzzyWuzzyTokenSet method should be used + during the fuzzy name matching. This method tokenizes the strings and + find the largest intersection of the two substrings and divides it over the length of the shortest string default=False editex: bool - Boolean indicating whether the Editex method should be used during the + Boolean indicating whether the Editex method should be used during the fuzzy name matching default=True typo: bool - Boolean indicating whether the Typo method should be used during the + Boolean indicating whether the Typo method should be used during the fuzzy name matching. The typo distance is calculated based on the distance on a keyboard between edits. default=False lig_3: bool - Boolean indicating whether the LIG3 method should be used during the fuzzy + Boolean indicating whether the LIG3 method should be used during the fuzzy name matching default=False ssk: bool - Boolean indicating whether the SSK method should be used during the fuzzy - name matching. The ssk algorithm looks at the string kernel generated by all + Boolean indicating whether the SSK method should be used during the fuzzy + name matching. The ssk algorithm looks at the string kernel generated by all the possible different subsequences present between the two strings. default=False refined_soundex: bool @@ -205,62 +208,68 @@ def make_distance_metrics(indel=False, """ distance_metrics = defaultdict(list) if indel: - distance_metrics['Levenshtein'].append(Indel()) + distance_metrics["Levenshtein"].append(nm_dist.Indel()) if discounted_levenshtein: - distance_metrics['Levenshtein'].append( - DiscountedLevenshtein()) + distance_metrics["Levenshtein"].append(nm_dist.DiscountedLevenshtein()) if cormodel_z: - distance_metrics['block'].append(CormodeLZ()) + distance_metrics["block"].append(nm_dist.CormodeLZ()) if tichy: - distance_metrics['block'].append(Tichy()) + distance_metrics["block"].append(nm_dist.Tichy()) if iterative_sub_string: - distance_metrics['Subsequence'].append( - IterativeSubString()) + distance_metrics["Subsequence"].append(nm_dist.IterativeSubString()) if baulieu_xiii: - distance_metrics['multiset'].append(BaulieuXIII()) + distance_metrics["multiset"].append(nm_dist.BaulieuXIII()) if clement: - distance_metrics['multiset'].append(Clement()) + distance_metrics["multiset"].append(nm_dist.Clement()) if dice_asymmetrici: - distance_metrics['multiset'].append(DiceAsymmetricI()) + distance_metrics["multiset"].append(nm_dist.DiceAsymmetricI()) if kuhns_iii: - distance_metrics['multiset'].append(KuhnsIII()) + distance_metrics["multiset"].append(nm_dist.KuhnsIII()) if overlap: - distance_metrics['multiset'].append(Overlap()) + distance_metrics["multiset"].append(nm_dist.Overlap()) if pearson_ii: - distance_metrics['multiset'].append(PearsonII()) + distance_metrics["multiset"].append(nm_dist.PearsonII()) if weighted_jaccard: - distance_metrics['multiset'].append(WeightedJaccard()) + distance_metrics["multiset"].append(nm_dist.WeightedJaccard()) if warrens_iv: - distance_metrics['multiset'].append(WarrensIV()) + distance_metrics["multiset"].append(nm_dist.WarrensIV()) if bag: - distance_metrics['multiset'].append(Bag()) + distance_metrics["multiset"].append(nm_dist.Bag()) if rouge_l: - distance_metrics['multiset'].append(RougeL()) + distance_metrics["multiset"].append(nm_dist.RougeL()) if ratcliff_obershelp: - distance_metrics['Subsequence'].append( - RatcliffObershelp()) + distance_metrics["Subsequence"].append(nm_dist.RatcliffObershelp()) if ncd_bz2: - distance_metrics['compression'].append(NCDbz2()) + distance_metrics["compression"].append(nm_dist.NCDbz2()) if fuzzy_wuzzy_partial_string: - distance_metrics['fuzzy'].append( - FuzzyWuzzyPartialString()) + distance_metrics["fuzzy"].append(nm_dist.FuzzyWuzzyPartialString()) if fuzzy_wuzzy_token_sort: - distance_metrics['fuzzy'].append(FuzzyWuzzyTokenSort()) + distance_metrics["fuzzy"].append(nm_dist.FuzzyWuzzyTokenSort()) if fuzzy_wuzzy_token_set: - distance_metrics['fuzzy'].append(FuzzyWuzzyTokenSet()) + distance_metrics["fuzzy"].append(nm_dist.FuzzyWuzzyTokenSet()) if editex: - distance_metrics['edit'].append(Editex()) + distance_metrics["edit"].append(nm_dist.Editex()) if typo: - distance_metrics['edit'].append(Typo()) + distance_metrics["edit"].append(nm_dist.Typo()) if lig_3: - distance_metrics['Levenshtein'].append(LIG3()) + distance_metrics["Levenshtein"].append(nm_dist.LIG3()) if ssk: - distance_metrics['Subsequence'].append(SSK()) + distance_metrics["Subsequence"].append(nm_dist.SSK()) if refined_soundex: - distance_metrics['phonetic'].append(PhoneticDistance( - transforms=RefinedSoundex(max_length=30), metric=Levenshtein(), encode_alpha=True)) + distance_metrics["phonetic"].append( + nm_dist.PhoneticDistance( + transforms=nm_dist.RefinedSoundex(max_length=30), + metric=nm_dist.Levenshtein(), + encode_alpha=True, + ) + ) if double_metaphone: - distance_metrics['phonetic'].append(PhoneticDistance( - transforms=DoubleMetaphone(max_length=30), metric=Levenshtein(), encode_alpha=True)) + distance_metrics["phonetic"].append( + nm_dist.PhoneticDistance( + transforms=nm_dist.DoubleMetaphone(max_length=30), + metric=nm_dist.Levenshtein(), + encode_alpha=True, + ) + ) return distance_metrics diff --git a/name_matching/name_matcher.py b/name_matching/name_matcher.py index ad2b97c..3030a44 100644 --- a/name_matching/name_matcher.py +++ b/name_matching/name_matcher.py @@ -1,10 +1,10 @@ -import unicodedata -import functools -import operator -import re import numpy as np import pandas as pd from tqdm import tqdm +from operator import iconcat +from functools import reduce +from unicodedata import normalize +from re import escape, sub from typing import Union, Tuple from itertools import compress from sklearn.feature_extraction.text import TfidfVectorizer @@ -14,10 +14,9 @@ class NameMatcher: - """ A class for the name matching of data based on the strings in a single column. The NameMatcher - first applies a cosine similarity on the ngrams of the strings to get an approximate match followed + first applies a cosine similarity on the ngrams of the strings to get an approximate match followed by a fuzzy matching based on a number of different algorithms. Parameters @@ -46,17 +45,17 @@ class NameMatcher: group is returned. default=1 legal_suffixes : bool - Boolean indicating whether the most common company legal terms should be excluded when calculating + Boolean indicating whether the most common company legal terms should be excluded when calculating the final score. The terms are still included in determining the best match. default=False common_words : bool or list - Boolean indicating whether the most common words from the matching data should be excluded + Boolean indicating whether the most common words from the matching data should be excluded when calculating the final score. The terms are still included in determining the best match. If common_words is given as a list, the words in the list are excluded from the calculation of the final score, downgrading matches that predominatly rely on these words. default=False cut_off_no_scoring_words: float - the cut off percentage of the occurrence of the most occurring word for which words are still included + the cut off percentage of the occurrence of the most occurring word for which words are still included in the no_scoring_words set default=0.01 lowercase : bool @@ -71,8 +70,8 @@ class NameMatcher: ascii characters default=True : bool preprocess_split - Indicating whether during the preprocessing an additional step should be taken in which only - the most common words out of a name are isolated and used in the matching process. The removing + Indicating whether during the preprocessing an additional step should be taken in which only + the most common words out of a name are isolated and used in the matching process. The removing of the common words is only done for the n-grams cosine matching part. default=False verbose : bool @@ -80,35 +79,42 @@ class NameMatcher: default=True distance_metrics: list A list of The distance metrics to be used during the fuzzy matching. For a list of possible distance - metrics see the distance_metrics.py file. By default the following metrics are used: overlap, weighted_jaccard, + metrics see the distance_metrics.py file. By default the following metrics are used: overlap, weighted_jaccard, ratcliff_obershelp, fuzzy_wuzzy_token_sort and editex. row_numbers : bool Bool indicating whether the row number should be used as match_index rather than the original index as - was the default case before version 0.8.8 + was the default case before version 0.8.8 default=False return_algorithms_score : bool Bool indicating whether the scores of all the algorithms should be returned instead of a combined score default=False """ - def __init__(self, - ngrams: tuple = (2, 3), - top_n: int = 50, - low_memory: bool = False, - number_of_rows: int = 5000, - number_of_matches: int = 1, - lowercase: bool = True, - punctuations: bool = True, - remove_ascii: bool = True, - legal_suffixes: bool = False, - common_words: Union[bool, list] = False, - cut_off_no_scoring_words: float = 0.01, - preprocess_split: bool = False, - verbose: bool = True, - distance_metrics: Union[list, tuple] = ['overlap', 'weighted_jaccard', 'ratcliff_obershelp', - 'fuzzy_wuzzy_token_sort', 'editex'], - row_numbers: bool = False, - return_algorithms_score: bool = False): + def __init__( + self, + ngrams: tuple = (2, 3), + top_n: int = 50, + low_memory: bool = False, + number_of_rows: int = 5000, + number_of_matches: int = 1, + lowercase: bool = True, + punctuations: bool = True, + remove_ascii: bool = True, + legal_suffixes: bool = False, + common_words: Union[bool, list] = False, + cut_off_no_scoring_words: float = 0.01, + preprocess_split: bool = False, + verbose: bool = True, + distance_metrics: Union[list, tuple] = [ + "overlap", + "weighted_jaccard", + "ratcliff_obershelp", + "fuzzy_wuzzy_token_sort", + "editex", + ], + row_numbers: bool = False, + return_algorithms_score: bool = False, + ): self._possible_matches = None self._preprocessed = False @@ -117,8 +123,8 @@ def __init__(self, self._number_of_rows = number_of_rows self._low_memory = low_memory - self._column = '' - self._column_matching = '' + self._column = "" + self._column_matching = "" self._verbose = verbose self._number_of_matches = number_of_matches @@ -129,7 +135,7 @@ def __init__(self, self._preprocess_punctuations = punctuations self._preprocess_ascii = remove_ascii self._postprocess_company_legal_id = legal_suffixes - + if isinstance(common_words, bool): self._postprocess_common_words = common_words self._word_set = set() @@ -137,33 +143,35 @@ def __init__(self, self._postprocess_common_words = False self._word_set = set(common_words) else: - raise TypeError('Please provide common_words as a list or a bool') + raise TypeError("Please provide common_words as a list or a bool") self._preprocess_split = preprocess_split self._cut_off = cut_off_no_scoring_words if self._postprocess_company_legal_id: self._word_set = self._make_no_scoring_words( - 'legal', self._word_set, self._cut_off) - + "legal", self._word_set, self._cut_off + ) + self._original_indexes = not row_numbers self._original_index = None self.set_distance_metrics(distance_metrics) self._vec = TfidfVectorizer( - lowercase=False, analyzer="char", ngram_range=(ngrams)) + lowercase=False, analyzer="char", ngram_range=(ngrams) + ) self._n_grams_matching = None def set_distance_metrics(self, metrics: list) -> None: """ A method to set which of the distance metrics should be employed during the - fuzzy matching. For very short explanations of most of the name matching + fuzzy matching. For very short explanations of most of the name matching algorithms please see the make_distance_metrics function in distance_matrics.py Parameters ---------- - metrics: list + metrics: list The list with the distance metrics to be used during the name matching. The distance metrics can be chosen from the list below: indel @@ -198,47 +206,56 @@ def set_distance_metrics(self, metrics: list) -> None: try: self._distance_metrics = make_distance_metrics(**input_metrics) except TypeError: - raise TypeError('Not all of the supplied distance metrics are available. Please check the' + - 'list of options in the make_distance_metrics function and adjust your list accordingly') + raise TypeError( + "Not all of the supplied distance metrics are available. Please check the" + + "list of options in the make_distance_metrics function and adjust your list accordingly" + ) self._num_distance_metrics = sum( - [len(x) for x in self._distance_metrics.values()]) + [len(x) for x in self._distance_metrics.values()] + ) - def _select_top_words(self, word: str, word_counts: pd.Series, occurrence_count: int) -> str: + def _select_top_words( + self, word: str, word_counts: pd.Series, occurrence_count: int + ) -> str: """Remove the top words from the string word based on an occurrence_count threshold Parameters ---------- - word: str + word: str the string from which the words should be removed word_counts: pd.Series the words which should be removed with their counts as result from a value_counts - occurrence_count: int - the multiplication factor of the minimum occurrences below which to select + occurrence_count: int + the multiplication factor of the minimum occurrences below which to select Returns ------- str The string word with the words with a too high word_counts removed """ - compressed_list = list(compress( - word, (word_counts[word] < occurrence_count*word_counts[word].min()).values)) - - return ' '.join(compressed_list) - - def _preprocess_reduce(self, - to_be_matched: pd.DataFrame, - occurrence_count: int = 3) -> pd.DataFrame: + compressed_list = list( + compress( + word, + (word_counts[word] < occurrence_count * word_counts[word].min()).values, + ) + ) + + return " ".join(compressed_list) + + def _preprocess_reduce( + self, to_be_matched: pd.DataFrame, occurrence_count: int = 3 + ) -> pd.DataFrame: """Preprocesses and copies the data to obtain the data with reduced strings. The strings have all words - removed which appear more than 3x as often as the least common word in the string and returns an adjusted - copy of the input + removed which appear more than 3x as often as the least common word in the string and returns an adjusted + copy of the input Parameters ---------- - to_be_matched: pd.DataFrame + to_be_matched: pd.DataFrame A dataframe from which the most common words should be removed occurrence_count: int The number of occurrence a word can occur more then the least common word in the string for which it will - still be included in the process + still be included in the process default=3 Returns @@ -246,21 +263,25 @@ def _preprocess_reduce(self, pd.DataFrame A dataframe that will contain the reduced strings """ - individual_words = to_be_matched[self._column_matching].str.split( - expand=True).stack() + individual_words = ( + to_be_matched[self._column_matching].str.split(expand=True).stack() + ) word_counts = individual_words.value_counts() to_be_matched_new = to_be_matched.copy() name = to_be_matched[self._column_matching].str.split() to_be_matched_new[self._column_matching] = name.apply( - lambda word: self._select_top_words(word, word_counts, occurrence_count)) + lambda word: self._select_top_words(word, word_counts, occurrence_count) + ) return to_be_matched_new - def load_and_process_master_data(self, - column: str, - df_matching_data: pd.DataFrame, - start_processing: bool = True, - transform: bool = True) -> None: + def load_and_process_master_data( + self, + column: str, + df_matching_data: pd.DataFrame, + start_processing: bool = True, + transform: bool = True, + ) -> None: """Load the matching data into the NameMatcher and start the preprocessing. Parameters @@ -269,43 +290,42 @@ def load_and_process_master_data(self, The column name of the dataframe which should be used for the matching df_matching_data: pd.DataFrame The dataframe which is used to match the data to. - start_processing : bool - A boolean indicating whether to start the preprocessing step after loading the matching data + start_processing : bool + A boolean indicating whether to start the preprocessing step after loading the matching data default: True - transform : bool - A boolean indicating whether or not the data should be transformed after the vectoriser is initialised + transform : bool + A boolean indicating whether or not the data should be transformed after the vectoriser is initialised default: True """ self._column = column - self._df_matching_data = df_matching_data + self._df_matching_data = df_matching_data self._original_index = df_matching_data.index if start_processing: self._process_matching_data(transform) - def _process_matching_data(self, - transform: bool = True) -> None: - """Function to process the matching data. First the matching data is preprocessed and assigned to - a variable within the NameMatcher. Next the data is used to initialise the TfidfVectorizer. + def _process_matching_data(self, transform: bool = True) -> None: + """Function to process the matching data. First the matching data is preprocessed and assigned to + a variable within the NameMatcher. Next the data is used to initialise the TfidfVectorizer. Parameters ---------- - transform : bool - A boolean indicating whether or not the data should be transformed after the vectoriser is initialised + transform : bool + A boolean indicating whether or not the data should be transformed after the vectoriser is initialised default: True """ - self._df_matching_data = self.preprocess( - self._df_matching_data, self._column) + self._df_matching_data = self.preprocess(self._df_matching_data, self._column) if self._postprocess_common_words: self._word_set = self._make_no_scoring_words( - 'common', self._word_set, self._cut_off) + "common", self._word_set, self._cut_off + ) self._vectorise_data(transform) self._preprocessed = True - def match_names(self, - to_be_matched: Union[pd.Series, pd.DataFrame], - column_matching: str) -> Union[pd.Series, pd.DataFrame]: - """Performs the name matching operation on the to_be_matched data. First it does the preprocessing of the - data to be matched as well as the matching data if this has not been performed. Subsequently based on + def match_names( + self, to_be_matched: Union[pd.Series, pd.DataFrame], column_matching: str + ) -> Union[pd.Series, pd.DataFrame]: + """Performs the name matching operation on the to_be_matched data. First it does the preprocessing of the + data to be matched as well as the matching data if this has not been performed. Subsequently based on ngrams a cosine similarity is computed between the matching data and the data to be matched, to the top n matches fuzzy matching algorithms are performed to determine the best match and the quality of the match @@ -319,63 +339,87 @@ def match_names(self, Returns ------- Union[pd.Series, pd.DataFrame] - A series or dataframe depending on the input containing the match index from the matching_data dataframe. - the name in the to_be_matched data, the name to which the datapoint was matched and a score between 0 + A series or dataframe depending on the input containing the match index from the matching_data dataframe. + the name in the to_be_matched data, the name to which the datapoint was matched and a score between 0 (no match) and 100(perfect match) to indicate the quality of the matches """ - if self._column == '': + if self._column == "": raise ValueError( - 'Please first load the master data via the method: load_and_process_master_data') + "Please first load the master data via the method: load_and_process_master_data" + ) if self._verbose: tqdm.pandas() - tqdm.write('preprocessing...\n') + tqdm.write("preprocessing...\n") self._column_matching = column_matching is_dataframe = True if isinstance(to_be_matched, pd.Series): is_dataframe = False to_be_matched = pd.DataFrame( - [to_be_matched.values], columns=to_be_matched.index.to_list()) + [to_be_matched.values], columns=to_be_matched.index.to_list() + ) if not self._preprocessed: self._process_matching_data() to_be_matched = self.preprocess(to_be_matched, self._column_matching) if self._verbose: - tqdm.write('preprocessing complete \n searching for matches...\n') + tqdm.write("preprocessing complete \n searching for matches...\n") - self._possible_matches = self._search_for_possible_matches( - to_be_matched) + self._possible_matches = self._search_for_possible_matches(to_be_matched) if self._preprocess_split: - self._possible_matches = np.hstack((self._search_for_possible_matches( - self._preprocess_reduce(to_be_matched)), self._possible_matches)) - + self._possible_matches = np.hstack( + ( + self._search_for_possible_matches( + self._preprocess_reduce(to_be_matched) + ), + self._possible_matches, + ) + ) + if self._verbose: - tqdm.write('possible matches found \n fuzzy matching...\n') - data_matches = to_be_matched.progress_apply(lambda x: self.fuzzy_matches( - self._possible_matches[to_be_matched.index.get_loc(x.name), :], x), axis=1) + tqdm.write("possible matches found \n fuzzy matching...\n") + data_matches = to_be_matched.progress_apply( + lambda x: self.fuzzy_matches( + self._possible_matches[to_be_matched.index.get_loc(x.name), :], x + ), + axis=1, + ) else: - data_matches = to_be_matched.apply(lambda x: self.fuzzy_matches( - self._possible_matches[to_be_matched.index.get_loc(x.name), :], x), axis=1) + data_matches = to_be_matched.apply( + lambda x: self.fuzzy_matches( + self._possible_matches[to_be_matched.index.get_loc(x.name), :], x + ), + axis=1, + ) if self._return_algorithms_score: return data_matches - + if self._number_of_matches == 1: - data_matches = data_matches.rename(columns={'match_name_0': 'match_name', - 'score_0': 'score', 'match_index_0': 'match_index'}) + data_matches = data_matches.rename( + columns={ + "match_name_0": "match_name", + "score_0": "score", + "match_index_0": "match_index", + } + ) if is_dataframe and self._original_indexes: - for col in data_matches.columns[data_matches.columns.str.contains('match_index')]: - data_matches[col] = self._original_index[data_matches[col].astype(int).fillna(0)] + for col in data_matches.columns[ + data_matches.columns.str.contains("match_index") + ]: + data_matches[col] = self._original_index[ + data_matches[col].astype(int).fillna(0) + ] if self._verbose: - tqdm.write('done') + tqdm.write("done") return data_matches - def fuzzy_matches(self, - possible_matches: np.array, - to_be_matched: pd.Series) -> pd.Series: - """ A method which performs the fuzzy matching between the data in the to_be_matched series as well + def fuzzy_matches( + self, possible_matches: np.array, to_be_matched: pd.Series + ) -> pd.Series: + """A method which performs the fuzzy matching between the data in the to_be_matched series as well as the indicated indexes of the matching_data points which are possible matching candidates. Parameters @@ -389,28 +433,34 @@ def fuzzy_matches(self, ------- pd.Series A series containing the match index from the matching_data dataframe. the name in the to_be_matched data, - the name to which the datapoint was matched and a score between 0 (no match) and 100(perfect match) to + the name to which the datapoint was matched and a score between 0 (no match) and 100(perfect match) to indicate the quality of the matches """ if len(possible_matches.shape) > 1: possible_matches = possible_matches[0] - indexes = np.array([[f'match_name_{num}', f'score_{num}', f'match_index_{num}'] - for num in range(self._number_of_matches)]).flatten() - match = pd.Series(index=np.append('original_name', indexes), dtype=object) - match['original_name'] = to_be_matched[self._column_matching] + indexes = np.array( + [ + [f"match_name_{num}", f"score_{num}", f"match_index_{num}"] + for num in range(self._number_of_matches) + ] + ).flatten() + match = pd.Series(index=np.append("original_name", indexes), dtype=object) + match["original_name"] = to_be_matched[self._column_matching] list_possible_matches = self._df_matching_data.iloc[ - possible_matches.flatten(), :][self._column].values + possible_matches.flatten(), : + ][self._column].values match_score = self._score_matches( - to_be_matched[self._column_matching], list_possible_matches) + to_be_matched[self._column_matching], list_possible_matches + ) if self._return_algorithms_score: return match_score ind = self._rate_matches(match_score) for num, col_num in enumerate(ind): - match[f'match_name_{num}'] = list_possible_matches[col_num] - match[f'match_index_{num}'] = possible_matches[col_num] + match[f"match_name_{num}"] = list_possible_matches[col_num] + match[f"match_index_{num}"] = possible_matches[col_num] match = self._adjust_scores(match_score[ind, :], match) @@ -419,9 +469,9 @@ def fuzzy_matches(self, return match - def _score_matches(self, - to_be_matched_instance: str, - possible_matches: list) -> np.array: + def _score_matches( + self, to_be_matched_instance: str, possible_matches: list + ) -> np.array: """A method to score a name to_be_matched_instance to a list of possible matches. The scoring is done based on all the metrics which are enabled. @@ -437,20 +487,19 @@ def _score_matches(self, np.array The score of each of the matches with respect to the different metrics which are assessed. """ - match_score = np.zeros( - (len(possible_matches), self._num_distance_metrics)) + match_score = np.zeros((len(possible_matches), self._num_distance_metrics)) idx = 0 for method_list in self._distance_metrics.values(): for method in method_list: match_score[:, idx] = np.array( - [method.sim(to_be_matched_instance, s) for s in possible_matches]) + [method.sim(to_be_matched_instance, s) for s in possible_matches] + ) idx = idx + 1 return match_score - def _rate_matches(self, - match_score: np.array) -> np.array: - """Converts the match scores from the score_matches method to a list of indexes of the best scoring + def _rate_matches(self, match_score: np.array) -> np.array: + """Converts the match scores from the score_matches method to a list of indexes of the best scoring matches limited to the _number_of_matches. Parameters @@ -471,14 +520,16 @@ def _rate_matches(self, idx = 0 for num, method_list in enumerate(self._distance_metrics.values()): method_grouped_results = np.reshape( - match_score[:, idx: idx + len(method_list)], (-1, len(method_list))) + match_score[:, idx : idx + len(method_list)], (-1, len(method_list)) + ) ind[num] = np.argmax(np.mean(method_grouped_results, axis=1)) idx = idx + len(method_list) elif self._number_of_matches == self._num_distance_metrics: ind = np.argmax(match_score, axis=1) else: - ind = np.argsort(np.mean(match_score, axis=1) - )[-self._number_of_matches:][::-1] + ind = np.argsort(np.mean(match_score, axis=1))[-self._number_of_matches :][ + ::-1 + ] return np.array(ind, dtype=int) @@ -498,7 +549,7 @@ def _get_alternative_names(self, match: pd.Series) -> list: alt_names = [] for num in range(self._number_of_matches): - alt_names.append(str(match[f'match_name_{num}'])) + alt_names.append(str(match[f"match_name_{num}"])) return alt_names @@ -519,11 +570,11 @@ def _process_words(self, org_name: str, alt_names: list) -> Tuple[str, list]: """ len_atl_names = len(alt_names) for word in self._word_set: - org_name = ' '.join( - re.sub(fr'\b{re.escape(word)}\b', '', org_name).split()) + org_name = " ".join(sub(rf"\b{escape(word)}\b", "", org_name).split()) for num in range(len_atl_names): - alt_names[num] = ' '.join( - re.sub(fr'\b{re.escape(word)}\b', '', alt_names[num]).split()) + alt_names[num] = " ".join( + sub(rf"\b{escape(word)}\b", "", alt_names[num]).split() + ) return org_name, alt_names @@ -543,13 +594,12 @@ def _adjust_scores(self, match_score: np.array, match: pd.Series) -> pd.Series: The series with the possible matches and adjusted scores """ for num in range(self._number_of_matches): - match[f'score_{num}'] = 100*np.mean(match_score[num, :]) + match[f"score_{num}"] = 100 * np.mean(match_score[num, :]) return match - def postprocess(self, - match: pd.Series) -> pd.Series: - """Postprocesses the scores to exclude certain specific company words or the most + def postprocess(self, match: pd.Series) -> pd.Series: + """Postprocesses the scores to exclude certain specific company words or the most common words. In this method only the scores are adjusted, the matches still stand. Parameters @@ -563,7 +613,7 @@ def postprocess(self, A new version of the input series with updated scores """ alt_names = self._get_alternative_names(match) - org_name = str(match['original_name']) + org_name = str(match["original_name"]) org_name, alt_names = self._process_words(org_name, alt_names) @@ -573,16 +623,15 @@ def postprocess(self, return match - def _vectorise_data(self, - transform: bool = True): + def _vectorise_data(self, transform: bool = True): """Initialises the TfidfVectorizer, which generates ngrams and weights them based on the occurrance. Subsequently the matching data will be used to fit the vectoriser and the matching data might also be send to the transform_data function depending on the transform boolean. Parameters ---------- - transform : bool - A boolean indicating whether or not the data should be transformed after the vectoriser is initialised + transform : bool + A boolean indicating whether or not the data should be transformed after the vectoriser is initialised default: True """ self._vec.fit(self._df_matching_data[self._column].values.flatten()) @@ -590,21 +639,19 @@ def _vectorise_data(self, self.transform_data() def transform_data(self): - """A method which transforms the matching data based on the ngrams transformer. After the + """A method which transforms the matching data based on the ngrams transformer. After the transformation (the generation of the ngrams), the data is normalised by dividing each row by the sum of the row. Subsequently the data is changed to a coo sparse matrix format with the column indices in ascending order. """ - ngrams = self._vec.transform( - self._df_matching_data[self._column].astype(str)) + ngrams = self._vec.transform(self._df_matching_data[self._column].astype(str)) for i, j in zip(ngrams.indptr[:-1], ngrams.indptr[1:]): - ngrams.data[i:j] = ngrams.data[i:j]/np.sum(ngrams.data[i:j]) + ngrams.data[i:j] = ngrams.data[i:j] / np.sum(ngrams.data[i:j]) self._n_grams_matching = ngrams.tocsc() if self._low_memory: self._n_grams_matching = self._n_grams_matching.tocoo() - def _search_for_possible_matches(self, - to_be_matched: pd.DataFrame) -> np.array: + def _search_for_possible_matches(self, to_be_matched: pd.DataFrame) -> np.array: """Generates ngrams from the data which should be matched, calculate the cosine simularity between these data and the matching data. Hereafter a top n of the matches is selected and returned. @@ -622,8 +669,9 @@ def _search_for_possible_matches(self, """ if self._n_grams_matching is None: raise RuntimeError( - """First the data needs to be transformed to be able to use the sparse cosine simularity. To""" + - """transform the data, run transform_data or run load_and_process_master_data with transform=True""") + """First the data needs to be transformed to be able to use the sparse cosine simularity. To""" + + """transform the data, run transform_data or run load_and_process_master_data with transform=True""" + ) if self._low_memory: results = np.zeros((len(to_be_matched), self._top_n)) @@ -631,26 +679,37 @@ def _search_for_possible_matches(self, for idx, row_name in enumerate(tqdm(input_data, disable=not self._verbose)): match_ngrams = self._vec.transform([row_name]) results[idx, :] = sparse_cosine_top_n( - matrix_a=self._n_grams_matching, matrix_b=match_ngrams, top_n=self._top_n, low_memory=self._low_memory, number_of_rows=self._number_of_rows, verbose=self._verbose) + matrix_a=self._n_grams_matching, + matrix_b=match_ngrams, + top_n=self._top_n, + low_memory=self._low_memory, + number_of_rows=self._number_of_rows, + verbose=self._verbose, + ) else: match_ngrams = self._vec.transform( - to_be_matched[self._column_matching].tolist()).tocsc() + to_be_matched[self._column_matching].tolist() + ).tocsc() results = sparse_cosine_top_n( - matrix_a=self._n_grams_matching, matrix_b=match_ngrams, top_n=self._top_n, low_memory=self._low_memory, number_of_rows=self._number_of_rows, verbose=self._verbose) + matrix_a=self._n_grams_matching, + matrix_b=match_ngrams, + top_n=self._top_n, + low_memory=self._low_memory, + number_of_rows=self._number_of_rows, + verbose=self._verbose, + ) return results - def preprocess(self, - df: pd.DataFrame, - column_name: str) -> pd.DataFrame: - """Preprocess a dataframe before applying a name matching algorithm. The preprocessing consists of + def preprocess(self, df: pd.DataFrame, column_name: str) -> pd.DataFrame: + """Preprocess a dataframe before applying a name matching algorithm. The preprocessing consists of removing special characters, spaces, converting all characters to lower case and removing the words given in the word lists Parameters ---------- df : DataFrame - The dataframe or series on which the preprocessing needs to be performed + The dataframe or series on which the preprocessing needs to be performed column_name : str The name of the column that is used for the preprocessing @@ -664,12 +723,15 @@ def preprocess(self, df.loc[:, column_name] = df[column_name].str.lower() if self._preprocess_punctuations: df.loc[:, column_name] = df[column_name].str.replace( - '[^\w\s]', '', regex=True) - df.loc[:, column_name] = df[column_name].str.replace( - ' ', ' ').str.strip() + "[^\w\s]", "", regex=True + ) + df.loc[:, column_name] = df[column_name].str.replace(" ", " ").str.strip() if self._preprocess_ascii: - df.loc[:, column_name] = df[column_name].apply(lambda string: unicodedata.normalize( - 'NFKD', str(string)).encode('ASCII', 'ignore').decode()) + df.loc[:, column_name] = df[column_name].apply( + lambda string: normalize("NFKD", str(string)) + .encode("ASCII", "ignore") + .decode() + ) return df @@ -684,14 +746,15 @@ def _preprocess_word_list(self, terms: dict) -> list: Returns ------- list - A list of preprocessed legal words + A list of preprocessed legal words """ if self._preprocess_punctuations: - return [re.sub(r'[^\w\s]', '', s).strip() for s in functools.reduce( - operator.iconcat, terms.values(), [])] + return [ + sub(r"[^\w\s]", "", s).strip() + for s in reduce(iconcat, terms.values(), []) + ] else: - return [s.strip() for s in functools.reduce( - operator.iconcat, terms.values(), [])] + return [s.strip() for s in reduce(iconcat, terms.values(), [])] def _process_legal_words(self, word_set: set) -> set: """Preprocess legal words and add them to the word_set @@ -718,9 +781,9 @@ def _process_common_words(self, word_set: set, cut_off: float) -> set: Parameters ------- word_set: str - the current word list which should be extended with additional words + the current word list which should be extended with additional words cut_off: float - the cut_off percentage of the occurrence of the most occurring word for which words are still included + the cut_off percentage of the occurrence of the most occurring word for which words are still included in the no_soring_words set Returns @@ -728,17 +791,21 @@ def _process_common_words(self, word_set: set, cut_off: float) -> set: Set The current word set with the most common words from the matching_data added """ - word_counts = self._df_matching_data[self._column].str.split( - expand=True).stack().value_counts() + word_counts = ( + self._df_matching_data[self._column] + .str.split(expand=True) + .stack() + .value_counts() + ) word_set = word_set.union( - set(word_counts[word_counts > np.max(word_counts)*cut_off].index)) + set(word_counts[word_counts > np.max(word_counts) * cut_off].index) + ) return word_set - def _make_no_scoring_words(self, - indicator: str, - word_set: set, - cut_off: float) -> set: + def _make_no_scoring_words( + self, indicator: str, word_set: set, cut_off: float + ) -> set: """A method to make a set of words which are not taken into account when scoring matches. Parameters @@ -747,9 +814,9 @@ def _make_no_scoring_words(self, indicator for which types of words should be excluded can be legal for legal suffixes or common for the most common words word_set: str - the current word list which should be extended with additional words + the current word list which should be extended with additional words cut_off: float - the cut_off percentage of the occurrence of the most occurring word for which words are still included + the cut_off percentage of the occurrence of the most occurring word for which words are still included in the no_soring_words set Returns @@ -757,9 +824,9 @@ def _make_no_scoring_words(self, Set The set of no scoring words """ - if indicator == 'legal': + if indicator == "legal": word_set = self._process_legal_words(word_set) - if indicator == 'common': + if indicator == "common": word_set = self._process_common_words(word_set, cut_off) return word_set diff --git a/name_matching/run_nm.py b/name_matching/run_nm.py index a7f1977..7e93779 100644 --- a/name_matching/run_nm.py +++ b/name_matching/run_nm.py @@ -1,18 +1,18 @@ +import pandas as pd from name_matching.name_matcher import NameMatcher from typing import Union, Tuple -import pandas as pd -import unicodedata +from unicodedata import normalize -def _match_names_check_data(data: Union[pd.Series, pd.DataFrame], - column: str, - group_column: str) -> pd.DataFrame: +def _match_names_check_data( + data: Union[pd.Series, pd.DataFrame], column: str, group_column: str +) -> pd.DataFrame: """ - Checks the input data of the name matching function to see whether the defined columns can + Checks the input data of the name matching function to see whether the defined columns can be found and makes a new column which will be used for the name matching ---------- data: Union[pd.DataFrame, pd.Series] - The first dataframe or series used for the name matching + The first dataframe or series used for the name matching column: str The column in which the name that should be matched can be found for data group_column_first: str @@ -26,36 +26,39 @@ def _match_names_check_data(data: Union[pd.Series, pd.DataFrame], """ if isinstance(data, pd.DataFrame): - if column == '': + if column == "": raise ValueError( - 'For one of the dataframes no column is given to perform the name matching on') + "For one of the dataframes no column is given to perform the name matching on" + ) if column not in data.columns: - raise ValueError( - 'Could not find one of the columns in the dataframe') - if (group_column != '') & (group_column not in data.columns): - raise ValueError( - 'Could not find one of the group_columns in the dataframe') - data['name_matching_data'] = data[column] + raise ValueError("Could not find one of the columns in the dataframe") + if (group_column != "") & (group_column not in data.columns): + raise ValueError("Could not find one of the group_columns in the dataframe") + data["name_matching_data"] = data[column] else: - if group_column != '': + if group_column != "": raise ValueError( - 'Grouping is only possible when a dataframe is used for both inputs') - data = pd.DataFrame(data, columns=['name_matching_data']) + "Grouping is only possible when a dataframe is used for both inputs" + ) + data = pd.DataFrame(data, columns=["name_matching_data"]) return data -def _match_names_preprocess_data(column: str, - data_first: pd.DataFrame, - data_second: pd.DataFrame, - case_sensitive: bool, - punctuation_sensitive: bool, - special_character_sensitive: bool) -> Tuple[pd.DataFrame, pd.DataFrame]: + +def _match_names_preprocess_data( + column: str, + data_first: pd.DataFrame, + data_second: pd.DataFrame, + case_sensitive: bool, + punctuation_sensitive: bool, + special_character_sensitive: bool, +) -> Tuple[pd.DataFrame, pd.DataFrame]: """ - Preprocess the data by making the names lower case, removing punctuations and special characters. + Preprocess the data by making the names lower case, removing punctuations and special characters. And convert the indexes of the second dataframe to a column. ---------- data: Union[pd.DataFrame, pd.Series] - The first dataframe or series used for the name matching + The first dataframe or series used for the name matching column: str The column in which the name that should be matched can be found for data group_column_first: str @@ -81,32 +84,36 @@ def _match_names_preprocess_data(column: str, data_first[column] = data_first[column].str.lower().str.strip() data_second[column] = data_second[column].str.lower().str.strip() if not punctuation_sensitive: - data_first[column] = data_first[column].str.replace('[^\w\s]', '', regex=True) - data_second[column] = data_second[column].str.replace( - '[^\w\s]', '', regex=True) + data_first[column] = data_first[column].str.replace("[^\w\s]", "", regex=True) + data_second[column] = data_second[column].str.replace("[^\w\s]", "", regex=True) if not special_character_sensitive: - data_first[column] = data_first[column].apply(lambda string: unicodedata.normalize( - 'NFKD', string).encode('ASCII', 'ignore').decode()) - data_second[column] = data_second[column].apply(lambda string: unicodedata.normalize( - 'NFKD', string).encode('ASCII', 'ignore').decode()) + data_first[column] = data_first[column].apply( + lambda string: normalize("NFKD", string).encode("ASCII", "ignore").decode() + ) + data_second[column] = data_second[column].apply( + lambda string: normalize("NFKD", string).encode("ASCII", "ignore").decode() + ) + + data_second = data_second.rename_axis("index").reset_index(drop=False) - data_second = data_second.rename_axis('index').reset_index(drop=False) - return data_first, data_second -def _match_names_combine_data(data_first: pd.DataFrame, - data_second: pd.DataFrame, - left_cols: list, - right_cols: list) -> pd.DataFrame: + +def _match_names_combine_data( + data_first: pd.DataFrame, + data_second: pd.DataFrame, + left_cols: list, + right_cols: list, +) -> pd.DataFrame: """ Perform a merge to match data based on whether the names are equal ---------- data_first: pd.DataFrame - The first dataframe or series used for the name matching + The first dataframe or series used for the name matching data_second: pd.DataFrame - The second dataframe or series used for the name matching + The second dataframe or series used for the name matching left_cols: list - A list of columns on which the first dataframe should be merged + A list of columns on which the first dataframe should be merged right_cols: list A list of columns on which the first dataframe should be merged @@ -117,29 +124,38 @@ def _match_names_combine_data(data_first: pd.DataFrame, dataframe is equal to the original index of data_first, the match index is the index in data_second for the matched name. """ - matches = pd.merge(data_first, data_second, how='left', - left_on=left_cols, right_on=right_cols, suffixes=['', '_matched']) - matches['score'] = 100 - matches = matches.dropna(subset=['index']) - matches = matches.rename(columns={'index':'match_index'}) - matches = matches[['match_index', 'score']] + matches = pd.merge( + data_first, + data_second, + how="left", + left_on=left_cols, + right_on=right_cols, + suffixes=["", "_matched"], + ) + matches["score"] = 100 + matches = matches.dropna(subset=["index"]) + matches = matches.rename(columns={"index": "match_index"}) + matches = matches[["match_index", "score"]] return matches -def _match_names_match_single(matcher: NameMatcher, - data_first: pd.DataFrame, - data_second: pd.DataFrame, - name_column: str) -> pd.DataFrame: + +def _match_names_match_single( + matcher: NameMatcher, + data_first: pd.DataFrame, + data_second: pd.DataFrame, + name_column: str, +) -> pd.DataFrame: """ Perform the name matching. First by doing a perfect string match with a merge statement, followed - by the fuzzy matching approach as done in NameMatcher. + by the fuzzy matching approach as done in NameMatcher. ---------- matcher: NameMatcher - The NameMatcher to be used for the name matching part + The NameMatcher to be used for the name matching part data_first: pd.DataFrame - The first dataframe or series used for the name matching + The first dataframe or series used for the name matching data_second: pd.DataFrame - The second dataframe or series used for the name matching + The second dataframe or series used for the name matching name_column: str The column in which the name that should be matched can be found for both dataframes @@ -151,34 +167,46 @@ def _match_names_match_single(matcher: NameMatcher, for the matched name. """ - matches = _match_names_combine_data(data_first, data_second, - [name_column], [name_column]) + matches = _match_names_combine_data( + data_first, data_second, [name_column], [name_column] + ) unmatched = data_first[~data_first.index.isin(matches.index)].copy() if len(unmatched) > 0: matcher.load_and_process_master_data(name_column, data_second, transform=True) - matches = pd.concat([matches,(matcher.match_names( - to_be_matched=unmatched, column_matching=name_column))]) + matches = pd.concat( + [ + matches, + ( + matcher.match_names( + to_be_matched=unmatched, column_matching=name_column + ) + ), + ] + ) return matches else: - print('All data matched with basic string matching') + print("All data matched with basic string matching") return matches -def _match_names_match_group(matcher: NameMatcher, - data_first: pd.DataFrame, - data_second: pd.DataFrame, - name_column: str, - group_column_first: str, - group_column_second: str) -> pd.DataFrame: + +def _match_names_match_group( + matcher: NameMatcher, + data_first: pd.DataFrame, + data_second: pd.DataFrame, + name_column: str, + group_column_first: str, + group_column_second: str, +) -> pd.DataFrame: """ Perform the name matching based on the subgroups as indicated by the group_column strings. First by doing - a perfect string match with a merge statement, followed by the fuzzy matching approach as done in NameMatcher. + a perfect string match with a merge statement, followed by the fuzzy matching approach as done in NameMatcher. ---------- matcher: NameMatcher - The NameMatcher to be used for the name matching part + The NameMatcher to be used for the name matching part data_first: pd.DataFrame - The first dataframe or series used for the name matching + The first dataframe or series used for the name matching data_second: pd.DataFrame - The second dataframe or series used for the name matching + The second dataframe or series used for the name matching name_column: str The column in which the name that should be matched can be found for both dataframes group_column_first: str @@ -194,36 +222,54 @@ def _match_names_match_group(matcher: NameMatcher, for the matched name. """ - matches = _match_names_combine_data(data_first, data_second, [ - name_column, group_column_first], [name_column, group_column_second]) + matches = _match_names_combine_data( + data_first, + data_second, + [name_column, group_column_first], + [name_column, group_column_second], + ) unmatched = data_first[~data_first.index.isin(matches.index)] if len(unmatched) > 0: matcher.load_and_process_master_data(name_column, data_second, transform=False) for group in data_first[group_column_first].unique(): - data_second_group = data_second[data_second[group_column_second] == group].copy() - matcher.load_and_process_master_data(name_column, - data_second_group, start_processing=False) + data_second_group = data_second[ + data_second[group_column_second] == group + ].copy() + matcher.load_and_process_master_data( + name_column, data_second_group, start_processing=False + ) matcher.transform_data() - matches = pd.concat([matches, matcher.match_names( - to_be_matched=unmatched[unmatched[group_column_first] == group].copy(), column_matching=name_column)]) + matches = pd.concat( + [ + matches, + matcher.match_names( + to_be_matched=unmatched[ + unmatched[group_column_first] == group + ].copy(), + column_matching=name_column, + ), + ] + ) else: - print('All data matched with basic string matching') + print("All data matched with basic string matching") return matches return matches -def match_names(data_first: Union[pd.DataFrame, pd.Series], - data_second: Union[pd.DataFrame, pd.Series], - column_first='', - column_second='', - group_column_first='', - group_column_second='', - case_sensitive=False, - punctuation_sensitive=False, - special_character_sensitive=False, - threshold=95, - **kwargs) -> pd.DataFrame: +def match_names( + data_first: Union[pd.DataFrame, pd.Series], + data_second: Union[pd.DataFrame, pd.Series], + column_first="", + column_second="", + group_column_first="", + group_column_second="", + case_sensitive=False, + punctuation_sensitive=False, + special_character_sensitive=False, + threshold=95, + **kwargs +) -> pd.DataFrame: """Function which performs name matching. First a simple merge on the data is performed to get the instances in which the name matches perfectly. Subsequently the matches are matched using the name matching algorithm as defined in name_matcher. @@ -233,23 +279,23 @@ def match_names(data_first: Union[pd.DataFrame, pd.Series], data_first: Union[pd.DataFrame, pd.Series] The first dataframe or series used for the name matching data_second: Union[pd.DataFrame, pd.Series] - The second dataframe or series used for the name matching, for matching the data to + The second dataframe or series used for the name matching, for matching the data to itself data_second should be equal to data first column_first: str - If data_first is a dataframe column_first should be the column in which the name + If data_first is a dataframe column_first should be the column in which the name that should be matched can be found for data_first default='' column_second: str - If data_second is a dataframe column_second should be the column in which the name + If data_second is a dataframe column_second should be the column in which the name that should be matched can be found for data_second default='' group_column_first: str - The name of the column that should be used to generate groups within the data_first + The name of the column that should be used to generate groups within the data_first dataframe. The matchig is then only performed for instances in which the groups are identical default='' group_column_second: str - The name of the column that should be used to generate groups within the data_second + The name of the column that should be used to generate groups within the data_second dataframe. The matchig is then only performed for instances in which the groups are identical default='' @@ -275,36 +321,56 @@ def match_names(data_first: Union[pd.DataFrame, pd.Series], ------- pd.DataFrame A dataframe containing the matched rows were the match score is above the threshold. The - dataframe consists of 4 columns; original_name: the original name from data_first after + dataframe consists of 4 columns; original_name: the original name from data_first after preprocessing, match_name_0: the name it is matched to from data_second after preprocessing, - score_0: the score of the match, match_index_0: the index of the match in data_second. The - match_index_0 can be used to join the data from both dataframes. + score_0: the score of the match, match_index_0: the index of the match in data_second. The + match_index_0 can be used to join the data from both dataframes. """ - if 'number_of_matches' in kwargs: + if "number_of_matches" in kwargs: raise ValueError( - 'The number of matches can only be changed by using a custom matching approach') + "The number of matches can only be changed by using a custom matching approach" + ) data_first = _match_names_check_data(data_first, column_first, group_column_first) - data_second = _match_names_check_data(data_second, column_second, group_column_second) + data_second = _match_names_check_data( + data_second, column_second, group_column_second + ) - name_column = 'name_matching_data' + name_column = "name_matching_data" - if ((group_column_first == '') & (group_column_second != '')) | ((group_column_second == '') & (group_column_first != '')): + if ((group_column_first == "") & (group_column_second != "")) | ( + (group_column_second == "") & (group_column_first != "") + ): raise ValueError( - 'For the grouping to work both the grouping column in the first as well as the second dataframe have to be indicated') + "For the grouping to work both the grouping column in the first as well as the second dataframe have to be indicated" + ) if (threshold > 100) | (threshold < 0): - raise ValueError('Please pick a threshold between 0 and 100') + raise ValueError("Please pick a threshold between 0 and 100") - data_first, data_second = _match_names_preprocess_data(name_column, data_first, - data_second, case_sensitive, punctuation_sensitive, special_character_sensitive) + data_first, data_second = _match_names_preprocess_data( + name_column, + data_first, + data_second, + case_sensitive, + punctuation_sensitive, + special_character_sensitive, + ) matcher = NameMatcher(**kwargs) - if group_column_first == '': - matches = _match_names_match_single(matcher, data_first, data_second, name_column) + if group_column_first == "": + matches = _match_names_match_single( + matcher, data_first, data_second, name_column + ) else: - matches = _match_names_match_group(matcher, data_first, data_second, - name_column, group_column_first, group_column_second) + matches = _match_names_match_group( + matcher, + data_first, + data_second, + name_column, + group_column_first, + group_column_second, + ) - return matches[matches['score'] > threshold] + return matches[matches["score"] > threshold] diff --git a/name_matching/sparse_cosine.py b/name_matching/sparse_cosine.py index 3891b60..ee02fd8 100644 --- a/name_matching/sparse_cosine.py +++ b/name_matching/sparse_cosine.py @@ -1,19 +1,20 @@ import numpy as np from tqdm import tqdm -# from numba import jit from scipy.sparse import csc_matrix, coo_matrix from typing import Union -# @jit(nopython=True, fastmath=True) -def _sparse_cosine_low_memory(matrix_row: np.array, - matrix_col: np.array, - matrix_data: np.array, - matrix_len: int, - vector_ind: np.array, - vector_data: np.array) -> np.array: + +def _sparse_cosine_low_memory( + matrix_row: np.array, + matrix_col: np.array, + matrix_data: np.array, + matrix_len: int, + vector_ind: np.array, + vector_data: np.array, +) -> np.array: """ A sparse cosine simularity calculation between a matrix and a vector. The sparse matrix should be sorted - in ascending order based on the matrix_col values. The vector should be sorted based on the indexes in + in ascending order based on the matrix_col values. The vector should be sorted based on the indexes in ascending order. Parameters @@ -46,17 +47,20 @@ def _sparse_cosine_low_memory(matrix_row: np.array, if ind == len(vector_ind): break if col == vector_ind[ind]: - res[matrix_row[mat_ind]] = res[matrix_row[mat_ind]] + \ - matrix_data[mat_ind] * vector_data[ind] + res[matrix_row[mat_ind]] = ( + res[matrix_row[mat_ind]] + matrix_data[mat_ind] * vector_data[ind] + ) return res -def _sparse_cosine_top_n_standard(matrix_a: csc_matrix, - matrix_b: csc_matrix, - number_of_rows_at_once: int, - top_n: int, - verbose: bool) -> np.array: +def _sparse_cosine_top_n_standard( + matrix_a: csc_matrix, + matrix_b: csc_matrix, + number_of_rows_at_once: int, + top_n: int, + verbose: bool, +) -> np.array: """ A function for sparse matrix multiplication followed by an argpartition to only take the top_n indexes. @@ -82,40 +86,51 @@ def _sparse_cosine_top_n_standard(matrix_a: csc_matrix, """ - results_arg = np.zeros( - (matrix_b.shape[0], top_n), dtype=np.float32) + results_arg = np.zeros((matrix_b.shape[0], top_n), dtype=np.float32) # Split up the matrice in a certain number of rows - for j in tqdm(range(0, matrix_b.shape[0], number_of_rows_at_once), disable=not verbose): + for j in tqdm( + range(0, matrix_b.shape[0], number_of_rows_at_once), disable=not verbose + ): number_of_rows_at_once_min = min( - [number_of_rows_at_once, matrix_b.shape[0]-j]) - matrix_b_temp = matrix_b[j:j+number_of_rows_at_once_min, :] + [number_of_rows_at_once, matrix_b.shape[0] - j] + ) + matrix_b_temp = matrix_b[j : j + number_of_rows_at_once_min, :] # Calculate the matrix dot product results_full = (matrix_a * (matrix_b_temp.T)).tocsc() # For each of the rows of the original matrix select the argpartition for i in range(number_of_rows_at_once_min): - results_full_temp = results_full.data[results_full.indptr[i]:results_full.indptr[i+1]] + results_full_temp = results_full.data[ + results_full.indptr[i] : results_full.indptr[i + 1] + ] # If there are more results then top_n only select the top_n results if len(results_full_temp) > top_n: - ind = results_full.indices[results_full.indptr[i]:results_full.indptr[i+1]] - results_arg[j + i, :] = ind[np.argpartition( - results_full_temp, -top_n)[-top_n:]] - + ind = results_full.indices[ + results_full.indptr[i] : results_full.indptr[i + 1] + ] + results_arg[j + i, :] = ind[ + np.argpartition(results_full_temp, -top_n)[-top_n:] + ] + # else just select all the results else: - results_arg[j + i, :len(results_full_temp) - ] = results_full.indices[results_full.indptr[i]:results_full.indptr[i+1]] + results_arg[j + i, : len(results_full_temp)] = results_full.indices[ + results_full.indptr[i] : results_full.indptr[i + 1] + ] return results_arg -def sparse_cosine_top_n(matrix_a: Union[csc_matrix, coo_matrix], - matrix_b: csc_matrix, - top_n: int, - low_memory: bool, - number_of_rows: int, - verbose: bool): + +def sparse_cosine_top_n( + matrix_a: Union[csc_matrix, coo_matrix], + matrix_b: csc_matrix, + top_n: int, + low_memory: bool, + number_of_rows: int, + verbose: bool, +): """ Calculates the top_n cosine matches between matrix_a and matrix_b. Takes into account the amount of memory that should be used based on the low_memory int @@ -131,7 +146,7 @@ def sparse_cosine_top_n(matrix_a: Union[csc_matrix, coo_matrix], low_memory: bool A bool indicating whether the low memory sparse cosine approach should be used number_of_rows: int - An int inidcating the number of rows which should be + An int inidcating the number of rows which should be processed at once when calculating the cosine simalarity verbose: bool A boolean indicating whether the progress should be printed @@ -144,11 +159,19 @@ def sparse_cosine_top_n(matrix_a: Union[csc_matrix, coo_matrix], """ if low_memory: matrix_b.sort_indices() - res = _sparse_cosine_low_memory(matrix_a.row, matrix_a.col, matrix_a.data, - matrix_a.shape[0], matrix_b.indices, matrix_b.data) + res = _sparse_cosine_low_memory( + matrix_a.row, + matrix_a.col, + matrix_a.data, + matrix_a.shape[0], + matrix_b.indices, + matrix_b.data, + ) top_n_adjusted = -np.min([top_n, len(res)]) return np.argpartition(res, top_n_adjusted, axis=0)[top_n_adjusted:] else: - return _sparse_cosine_top_n_standard(matrix_a, matrix_b, number_of_rows, top_n, verbose) \ No newline at end of file + return _sparse_cosine_top_n_standard( + matrix_a, matrix_b, number_of_rows, top_n, verbose + ) diff --git a/name_matching/test/test_name_matcher.py b/name_matching/test/test_name_matcher.py index a5ab6b3..25c7c44 100644 --- a/name_matching/test/test_name_matcher.py +++ b/name_matching/test/test_name_matcher.py @@ -9,36 +9,94 @@ import operator import re import name_matching.name_matcher as nm -from distances import Indel, DiscountedLevenshtein, CormodeLZ, Tichy, IterativeSubString, BaulieuXIII, Clement, DiceAsymmetricI, KuhnsIII, Overlap, PearsonII, WeightedJaccard, WarrensIV, Bag, RougeL, RatcliffObershelp, NCDbz2, FuzzyWuzzyPartialString, FuzzyWuzzyTokenSort, FuzzyWuzzyTokenSet, Editex, Typo,LIG3, SSK, Levenshtein, DoubleMetaphone, RefinedSoundex, PhoneticDistance +from distances import ( + Indel, + DiscountedLevenshtein, + CormodeLZ, + Tichy, + IterativeSubString, + BaulieuXIII, + Clement, + DiceAsymmetricI, + KuhnsIII, + Overlap, + PearsonII, + WeightedJaccard, + WarrensIV, + Bag, + RougeL, + RatcliffObershelp, + NCDbz2, + FuzzyWuzzyPartialString, + FuzzyWuzzyTokenSort, + FuzzyWuzzyTokenSet, + Editex, + Typo, + LIG3, + SSK, + Levenshtein, + DoubleMetaphone, + RefinedSoundex, + PhoneticDistance, +) @pytest.fixture def name_match(): package_dir = path.dirname(path.dirname(path.dirname(path.abspath(__file__)))) - data = pd.read_csv(path.join(package_dir, 'test','test_names.csv')) + data = pd.read_csv(path.join(package_dir, "test", "test_names.csv")) name_matcher = nm.NameMatcher() name_matcher.load_and_process_master_data( - 'company_name', data, start_processing=False, transform=False) + "company_name", data, start_processing=False, transform=False + ) return name_matcher + @pytest.fixture def original_name(): package_dir = path.dirname(path.dirname(path.dirname(path.abspath(__file__)))) - return pd.read_csv(path.join(package_dir, 'test','test_names.csv')) + return pd.read_csv(path.join(package_dir, "test", "test_names.csv")) @pytest.fixture def adjusted_name(): package_dir = path.dirname(path.dirname(path.dirname(path.abspath(__file__)))) - return pd.read_csv(path.join(package_dir, 'test','adjusted_test_names.csv')) + return pd.read_csv(path.join(package_dir, "test", "adjusted_test_names.csv")) @pytest.fixture def words(): - return ['fun', 'small', 'pool', 'fun', 'small', 'pool', 'sign', - 'small', 'pool', 'sign', 'sign', 'small', 'pool', 'sign', 'paper', - 'oppose', 'paper', 'oppose', 'brown', 'pig', 'fat', 'oppose', 'paper', - 'oppose', 'brown', 'pig', 'fat', 'snail'] + return [ + "fun", + "small", + "pool", + "fun", + "small", + "pool", + "sign", + "small", + "pool", + "sign", + "sign", + "small", + "pool", + "sign", + "paper", + "oppose", + "paper", + "oppose", + "brown", + "pig", + "fat", + "oppose", + "paper", + "oppose", + "brown", + "pig", + "fat", + "snail", + ] + def number_of_words_in_legal_list(preprocess: bool) -> int: """ @@ -57,128 +115,193 @@ def number_of_words_in_legal_list(preprocess: bool) -> int: """ if preprocess: - set_of_words = set([re.sub(r'[^\w\s]', '', s).strip() for s in functools.reduce( - operator.iconcat, terms_by_country.values(), [])]) - set_of_words.update([re.sub(r'[^\w\s]', '', s).strip() for s in functools.reduce( - operator.iconcat, terms_by_type.values(), [])]) + set_of_words = set( + [ + re.sub(r"[^\w\s]", "", s).strip() + for s in functools.reduce( + operator.iconcat, terms_by_country.values(), [] + ) + ] + ) + set_of_words.update( + [ + re.sub(r"[^\w\s]", "", s).strip() + for s in functools.reduce(operator.iconcat, terms_by_type.values(), []) + ] + ) else: - set_of_words = set([s.strip() for s in functools.reduce( - operator.iconcat, terms_by_country.values(), [])]) - set_of_words.update([s.strip() for s in functools.reduce(operator.iconcat, terms_by_type.values(), [])]) + set_of_words = set( + [ + s.strip() + for s in functools.reduce( + operator.iconcat, terms_by_country.values(), [] + ) + ] + ) + set_of_words.update( + [ + s.strip() + for s in functools.reduce(operator.iconcat, terms_by_type.values(), []) + ] + ) return len(set_of_words) -@pytest.mark.parametrize("method", - ["", - None, - 'no_method'] - ) + +@pytest.mark.parametrize("method", ["", None, "no_method"]) def test_make_distance_metrics_error(name_match, method): with pytest.raises(TypeError): name_match.set_distance_metrics([method]) -@pytest.mark.parametrize("method, result", - [['indel', Indel()], - ['discounted_levenshtein', DiscountedLevenshtein()], - ['tichy', Tichy()], - ['cormodeL_z', CormodeLZ()], - ['iterative_sub_string', IterativeSubString()], - ['baulieu_xiii', BaulieuXIII()], - ['clement', Clement()], - ['dice_asymmetricI', DiceAsymmetricI()], - ['kuhns_iii', KuhnsIII()], - ['overlap', Overlap()], - ['pearson_ii', PearsonII()], - ['weighted_jaccard', WeightedJaccard()], - ['warrens_iv', WarrensIV()], - ['bag', Bag()], - ['rouge_l', RougeL()], - ['ratcliff_obershelp', RatcliffObershelp()], - ['ncd_bz2', NCDbz2()], - ['fuzzy_wuzzy_partial_string', - FuzzyWuzzyPartialString()], - ['fuzzy_wuzzy_token_sort', FuzzyWuzzyTokenSort()], - ['fuzzy_wuzzy_token_set', FuzzyWuzzyTokenSet()], - ['editex', Editex()], - ['typo', Typo()], - ['lig_3', LIG3()], - ['ssk', SSK()], - ['refined_soundex', PhoneticDistance(transforms=RefinedSoundex( - max_length=30), metric=Levenshtein(), encode_alpha=True)], - ['double_metaphone', PhoneticDistance(transforms=DoubleMetaphone(max_length=30), metric=Levenshtein(), encode_alpha=True)]] - ) +@pytest.mark.parametrize( + "method, result", + [ + ["indel", Indel()], + ["discounted_levenshtein", DiscountedLevenshtein()], + ["tichy", Tichy()], + ["cormodeL_z", CormodeLZ()], + ["iterative_sub_string", IterativeSubString()], + ["baulieu_xiii", BaulieuXIII()], + ["clement", Clement()], + ["dice_asymmetricI", DiceAsymmetricI()], + ["kuhns_iii", KuhnsIII()], + ["overlap", Overlap()], + ["pearson_ii", PearsonII()], + ["weighted_jaccard", WeightedJaccard()], + ["warrens_iv", WarrensIV()], + ["bag", Bag()], + ["rouge_l", RougeL()], + ["ratcliff_obershelp", RatcliffObershelp()], + ["ncd_bz2", NCDbz2()], + ["fuzzy_wuzzy_partial_string", FuzzyWuzzyPartialString()], + ["fuzzy_wuzzy_token_sort", FuzzyWuzzyTokenSort()], + ["fuzzy_wuzzy_token_set", FuzzyWuzzyTokenSet()], + ["editex", Editex()], + ["typo", Typo()], + ["lig_3", LIG3()], + ["ssk", SSK()], + [ + "refined_soundex", + PhoneticDistance( + transforms=RefinedSoundex(max_length=30), + metric=Levenshtein(), + encode_alpha=True, + ), + ], + [ + "double_metaphone", + PhoneticDistance( + transforms=DoubleMetaphone(max_length=30), + metric=Levenshtein(), + encode_alpha=True, + ), + ], + ], +) def test_make_distance_metrics(name_match, method, result): name_match.set_distance_metrics([method]) assert type(name_match._distance_metrics.popitem()[1][0]) == type(result) -@pytest.mark.parametrize("kwargs_str, result_1, result_2, result_3, result_4", - [[{"ngrams": (4, 5)}, -1, False, (4, 5), 5000], - [{"low_memory": True}, -1, True, (2, 3), 5000], - [{"legal_suffixes": True}, 0, False, (2, 3), 5000], - [{"legal_suffixes": True, "number_of_rows": 8, - "ngrams": (1, 2, 3)}, 0, False, (1, 2, 3), 8], - ]) +@pytest.mark.parametrize( + "kwargs_str, result_1, result_2, result_3, result_4", + [ + [{"ngrams": (4, 5)}, -1, False, (4, 5), 5000], + [{"low_memory": True}, -1, True, (2, 3), 5000], + [{"legal_suffixes": True}, 0, False, (2, 3), 5000], + [ + {"legal_suffixes": True, "number_of_rows": 8, "ngrams": (1, 2, 3)}, + 0, + False, + (1, 2, 3), + 8, + ], + ], +) def test_initialisation(kwargs_str, result_1, result_2, result_3, result_4): name_match = nm.NameMatcher(**kwargs_str) number_of_words = 1 if result_1 > -1: - number_of_words = number_of_words_in_legal_list(name_match._preprocess_punctuations) + number_of_words = number_of_words_in_legal_list( + name_match._preprocess_punctuations + ) assert len(name_match._word_set) == number_of_words + result_1 assert name_match._low_memory == result_2 assert name_match._vec.ngram_range == result_3 assert name_match._number_of_rows == result_4 -@pytest.mark.parametrize("occ, result_1, result_2, result_3, result_4, result_5", - [[1, '', '', '', '', ''], - [2, 'Schiller', 'Sch-ster, an[', - 'Runolfsson, Tashirian Will', 'Hyats, S|nger', 'Ankunding-Harb-er'], - [3, 'Schiller', 'Sch-ster, Raynor an[ Hermann', - 'Runolfsson, Tashirian Will', 'Hyats, Durgan S|nger', 'Ankunding-Harb-er'], - ]) -def test_preprocess_reduce(name_match, adjusted_name, occ, result_1, result_2, result_3, result_4, result_5): - - name_match._column_matching = 'company_name' - new_names = name_match._preprocess_reduce( - adjusted_name, occurrence_count=occ) - assert new_names.loc[166, 'company_name'] == result_1 - assert new_names.loc[423, 'company_name'] == result_2 - assert new_names.loc[268, 'company_name'] == result_3 - assert new_names.loc[59, 'company_name'] == result_4 - assert new_names.loc[18, 'company_name'] == result_5 - - -@pytest.mark.parametrize("col, start_pro, transform", - [['company_name', False, False], - ['no_name', False, False], - ['company_name', True, False], - ['company_name', True, True], - ['company_name', True, True], - ]) +@pytest.mark.parametrize( + "occ, result_1, result_2, result_3, result_4, result_5", + [ + [1, "", "", "", "", ""], + [ + 2, + "Schiller", + "Sch-ster, an[", + "Runolfsson, Tashirian Will", + "Hyats, S|nger", + "Ankunding-Harb-er", + ], + [ + 3, + "Schiller", + "Sch-ster, Raynor an[ Hermann", + "Runolfsson, Tashirian Will", + "Hyats, Durgan S|nger", + "Ankunding-Harb-er", + ], + ], +) +def test_preprocess_reduce( + name_match, adjusted_name, occ, result_1, result_2, result_3, result_4, result_5 +): + + name_match._column_matching = "company_name" + new_names = name_match._preprocess_reduce(adjusted_name, occurrence_count=occ) + assert new_names.loc[166, "company_name"] == result_1 + assert new_names.loc[423, "company_name"] == result_2 + assert new_names.loc[268, "company_name"] == result_3 + assert new_names.loc[59, "company_name"] == result_4 + assert new_names.loc[18, "company_name"] == result_5 + + +@pytest.mark.parametrize( + "col, start_pro, transform", + [ + ["company_name", False, False], + ["no_name", False, False], + ["company_name", True, False], + ["company_name", True, True], + ["company_name", True, True], + ], +) def test_load_and_process_master_data(adjusted_name, col, start_pro, transform): name_matcher = nm.NameMatcher() name_matcher.load_and_process_master_data( column=col, df_matching_data=adjusted_name, start_processing=start_pro, - transform=transform) + transform=transform, + ) assert name_matcher._column == col - pd.testing.assert_frame_equal( - name_matcher._df_matching_data, adjusted_name) + pd.testing.assert_frame_equal(name_matcher._df_matching_data, adjusted_name) assert name_matcher._preprocessed == start_pro if transform & start_pro: assert type(name_matcher._n_grams_matching) == csc_matrix -@pytest.mark.parametrize("trans, common", - [[False, False], - [True, False], - [False, True], - [True, True], - ]) +@pytest.mark.parametrize( + "trans, common", + [ + [False, False], + [True, False], + [False, True], + [True, True], + ], +) def test_process_matching_data(name_match, trans, common): name_match._postprocess_common_words = common name_match._process_matching_data(transform=trans) @@ -194,126 +317,383 @@ def test_process_matching_data(name_match, trans, common): assert len(name_match._word_set) == 0 -@pytest.mark.parametrize("lower_case, punctuations, ascii, result_1, result_2, result_3", - [[False, False, False, 'Schumm PLC', 'Towne, Johnston and Murray', 'Ösinski-Schinner'], - [True, False, False, 'schumm plc', - 'towne, johnston and murray', 'ösinski-schinner'], - [False, True, False, 'Schumm PLC', - 'Towne Johnston and Murray', 'ÖsinskiSchinner'], - [False, False, True, 'Schumm PLC', - 'Towne, Johnston and Murray', 'Osinski-Schinner'], - [False, True, True, 'Schumm PLC', - 'Towne Johnston and Murray', 'OsinskiSchinner'], - [True, False, True, 'schumm plc', - 'towne, johnston and murray', 'osinski-schinner'], - [True, True, False, 'schumm plc', - 'towne johnston and murray', 'ösinskischinner'], - [True, True, True, 'schumm plc', - 'towne johnston and murray', 'osinskischinner'], - ]) -def test_preprocess(name_match, lower_case, punctuations, ascii, result_1, result_2, result_3): +@pytest.mark.parametrize( + "lower_case, punctuations, ascii, result_1, result_2, result_3", + [ + [ + False, + False, + False, + "Schumm PLC", + "Towne, Johnston and Murray", + "Ösinski-Schinner", + ], + [ + True, + False, + False, + "schumm plc", + "towne, johnston and murray", + "ösinski-schinner", + ], + [ + False, + True, + False, + "Schumm PLC", + "Towne Johnston and Murray", + "ÖsinskiSchinner", + ], + [ + False, + False, + True, + "Schumm PLC", + "Towne, Johnston and Murray", + "Osinski-Schinner", + ], + [ + False, + True, + True, + "Schumm PLC", + "Towne Johnston and Murray", + "OsinskiSchinner", + ], + [ + True, + False, + True, + "schumm plc", + "towne, johnston and murray", + "osinski-schinner", + ], + [ + True, + True, + False, + "schumm plc", + "towne johnston and murray", + "ösinskischinner", + ], + [ + True, + True, + True, + "schumm plc", + "towne johnston and murray", + "osinskischinner", + ], + ], +) +def test_preprocess( + name_match, lower_case, punctuations, ascii, result_1, result_2, result_3 +): name_match._preprocess_lowercase = lower_case name_match._preprocess_punctuations = punctuations name_match._preprocess_ascii = ascii - new_df = name_match.preprocess( - name_match._df_matching_data, 'company_name') - assert new_df.loc[0, 'company_name'] == result_1 - assert new_df.loc[2, 'company_name'] == result_2 - assert new_df.loc[432, 'company_name'] == result_3 - - -@pytest.mark.parametrize("low_memory, ngrams, result_1, result_2, result_3", - [[1, (5, 6), 0.00689, 0.00892, 0.02242], - [6, (2, 3), 0.01044, 0.01092, 0.035], - [8, (1, 2), 0.02729, 0.02783, 0.02324], - [0, (5, 6), 0.00689, 0.00892, 0.02242], - [0, (2, 3), 0.01044, 0.01092, 0.035], - [0, (1, 2), 0.02729, 0.02783, 0.02324], - ]) + new_df = name_match.preprocess(name_match._df_matching_data, "company_name") + assert new_df.loc[0, "company_name"] == result_1 + assert new_df.loc[2, "company_name"] == result_2 + assert new_df.loc[432, "company_name"] == result_3 + + +@pytest.mark.parametrize( + "low_memory, ngrams, result_1, result_2, result_3", + [ + [1, (5, 6), 0.00689, 0.00892, 0.02242], + [6, (2, 3), 0.01044, 0.01092, 0.035], + [8, (1, 2), 0.02729, 0.02783, 0.02324], + [0, (5, 6), 0.00689, 0.00892, 0.02242], + [0, (2, 3), 0.01044, 0.01092, 0.035], + [0, (1, 2), 0.02729, 0.02783, 0.02324], + ], +) def test_transform_data(name_match, low_memory, ngrams, result_1, result_2, result_3): name_match._low_memory = low_memory name_match._vec = TfidfVectorizer( - lowercase=False, analyzer="char", ngram_range=ngrams) + lowercase=False, analyzer="char", ngram_range=ngrams + ) name_match._process_matching_data(transform=False) name_match.transform_data() - assert name_match._n_grams_matching.data[10] == pytest.approx( - result_1, 0.001) - assert name_match._n_grams_matching.data[181] == pytest.approx( - result_2, 0.001) - assert name_match._n_grams_matching.data[1000] == pytest.approx( - result_3, 0.001) - - -@pytest.mark.parametrize("to_be_matched, possible_matches, metrics, result", - [('De Nederlandsche Bank', ['Nederlandsche Bank', 'De Nederlancsh Bank', 'De Nederlandse Bank', 'Bank de Nederlandsche'], ['weighted_jaccard'], 2), - ('De Nederlandsche Bank', ['Nederlandsche Bank', 'De Nederlancsh Bank', 'De Nederlandse Bank', 'Bank de Nederlandsche'], [ - 'weighted_jaccard', 'discounted_levenshtein'], 5), - ('De Nederlandsche Bank', ['Nederlandsche Bank', 'De Nederlancsh Bank', 'De Nederlandse Bank', 'Bank de Nederlandsche'], [ - 'weighted_jaccard', 'discounted_levenshtein', 'iterative_sub_string'], 7), - ('De Nederlandsche Bank', ['Nederlandsche Bank', 'De Nederlancsh Bank', 'De Nederlandse Bank', 'Bank de Nederlandsche'], [ - 'weighted_jaccard', 'overlap', 'iterative_sub_string'], 6), - ('De Nederlandsche Bank', ['Nederlandsche Bank', 'De Nederlancsh Bank', 'De Nederlandse Bank', 'Bank de Nederlandsche'], [ - 'weighted_jaccard', 'overlap', 'bag'], 11), - ('De Nederlandsche Bank', ['Nederlandsche Bank', 'De Nederlancsh Bank', - 'De Nederlandsche Bank', 'Bank de Nederlandsche'], ['weighted_jaccard'], 2), - ('De Nederlandsche Bank', ['Nederlandsche Bank', 'De Nederlancsh Bank', 'De Nederlandsche Bank', 'Bank de Nederlandsche'], [ - 'weighted_jaccard', 'discounted_levenshtein'], 4), - ('De Nederlandsche Bank', ['Nederlandsche Bank', 'De Nederlancsh Bank', 'De Nederlandsche Bank', 'Bank de Nederlandsche'], [ - 'weighted_jaccard', 'discounted_levenshtein', 'iterative_sub_string'], 6), - ('De Nederlandsche Bank', ['Nederlandsche Bank', 'De Nederlancsh Bank', 'De Nederlandsche Bank', 'Bank de Nederlandsche'], [ - 'weighted_jaccard', 'overlap', 'iterative_sub_string'], 6), - ('De Nederlandsche Bank', ['Nederlandsche Bank', 'De Nederlancsh Bank', 'De Nederlandsche Bank', 'Bank de Nederlandsche'], [ - 'weighted_jaccard', 'overlap', 'bag'], 6), - ('Schumm PLC', ['Torphy-Corkery', 'Hansen, Hoppe and Tillman', - 'Gerlach and Sons', 'Bank de Nederlandsche'], ['weighted_jaccard'], 2), - ('Schumm PLC', ['Torphy-Corkery', 'Hansen, Hoppe and Tillman', 'Gerlach and Sons', - 'Bank de Nederlandsche'], ['weighted_jaccard', 'discounted_levenshtein'], 4), - ('Schumm PLC', ['Torphy-Corkery', 'Hansen, Hoppe and Tillman', 'Gerlach and Sons', 'Bank de Nederlandsche'], [ - 'weighted_jaccard', 'discounted_levenshtein', 'iterative_sub_string'], 6), - ('Schumm PLC', ['Torphy-Corkery', 'Hansen, Hoppe and Tillman', 'Gerlach and Sons', - 'Bank de Nederlandsche'], ['weighted_jaccard', 'overlap', 'iterative_sub_string'], 8), - ('Schumm PLC', ['Torphy-Corkery', 'Hansen, Hoppe and Tillman', 'Gerlach and Sons', - 'Bank de Nederlandsche'], ['weighted_jaccard', 'overlap', 'bag'], 8) - ]) + assert name_match._n_grams_matching.data[10] == pytest.approx(result_1, 0.001) + assert name_match._n_grams_matching.data[181] == pytest.approx(result_2, 0.001) + assert name_match._n_grams_matching.data[1000] == pytest.approx(result_3, 0.001) + + +@pytest.mark.parametrize( + "to_be_matched, possible_matches, metrics, result", + [ + ( + "De Nederlandsche Bank", + [ + "Nederlandsche Bank", + "De Nederlancsh Bank", + "De Nederlandse Bank", + "Bank de Nederlandsche", + ], + ["weighted_jaccard"], + 2, + ), + ( + "De Nederlandsche Bank", + [ + "Nederlandsche Bank", + "De Nederlancsh Bank", + "De Nederlandse Bank", + "Bank de Nederlandsche", + ], + ["weighted_jaccard", "discounted_levenshtein"], + 5, + ), + ( + "De Nederlandsche Bank", + [ + "Nederlandsche Bank", + "De Nederlancsh Bank", + "De Nederlandse Bank", + "Bank de Nederlandsche", + ], + ["weighted_jaccard", "discounted_levenshtein", "iterative_sub_string"], + 7, + ), + ( + "De Nederlandsche Bank", + [ + "Nederlandsche Bank", + "De Nederlancsh Bank", + "De Nederlandse Bank", + "Bank de Nederlandsche", + ], + ["weighted_jaccard", "overlap", "iterative_sub_string"], + 6, + ), + ( + "De Nederlandsche Bank", + [ + "Nederlandsche Bank", + "De Nederlancsh Bank", + "De Nederlandse Bank", + "Bank de Nederlandsche", + ], + ["weighted_jaccard", "overlap", "bag"], + 11, + ), + ( + "De Nederlandsche Bank", + [ + "Nederlandsche Bank", + "De Nederlancsh Bank", + "De Nederlandsche Bank", + "Bank de Nederlandsche", + ], + ["weighted_jaccard"], + 2, + ), + ( + "De Nederlandsche Bank", + [ + "Nederlandsche Bank", + "De Nederlancsh Bank", + "De Nederlandsche Bank", + "Bank de Nederlandsche", + ], + ["weighted_jaccard", "discounted_levenshtein"], + 4, + ), + ( + "De Nederlandsche Bank", + [ + "Nederlandsche Bank", + "De Nederlancsh Bank", + "De Nederlandsche Bank", + "Bank de Nederlandsche", + ], + ["weighted_jaccard", "discounted_levenshtein", "iterative_sub_string"], + 6, + ), + ( + "De Nederlandsche Bank", + [ + "Nederlandsche Bank", + "De Nederlancsh Bank", + "De Nederlandsche Bank", + "Bank de Nederlandsche", + ], + ["weighted_jaccard", "overlap", "iterative_sub_string"], + 6, + ), + ( + "De Nederlandsche Bank", + [ + "Nederlandsche Bank", + "De Nederlancsh Bank", + "De Nederlandsche Bank", + "Bank de Nederlandsche", + ], + ["weighted_jaccard", "overlap", "bag"], + 6, + ), + ( + "Schumm PLC", + [ + "Torphy-Corkery", + "Hansen, Hoppe and Tillman", + "Gerlach and Sons", + "Bank de Nederlandsche", + ], + ["weighted_jaccard"], + 2, + ), + ( + "Schumm PLC", + [ + "Torphy-Corkery", + "Hansen, Hoppe and Tillman", + "Gerlach and Sons", + "Bank de Nederlandsche", + ], + ["weighted_jaccard", "discounted_levenshtein"], + 4, + ), + ( + "Schumm PLC", + [ + "Torphy-Corkery", + "Hansen, Hoppe and Tillman", + "Gerlach and Sons", + "Bank de Nederlandsche", + ], + ["weighted_jaccard", "discounted_levenshtein", "iterative_sub_string"], + 6, + ), + ( + "Schumm PLC", + [ + "Torphy-Corkery", + "Hansen, Hoppe and Tillman", + "Gerlach and Sons", + "Bank de Nederlandsche", + ], + ["weighted_jaccard", "overlap", "iterative_sub_string"], + 8, + ), + ( + "Schumm PLC", + [ + "Torphy-Corkery", + "Hansen, Hoppe and Tillman", + "Gerlach and Sons", + "Bank de Nederlandsche", + ], + ["weighted_jaccard", "overlap", "bag"], + 8, + ), + ], +) def test_score_matches(to_be_matched, possible_matches, metrics, result): name_match = nm.NameMatcher() name_match.set_distance_metrics(metrics) - assert np.argmax(name_match._score_matches( - to_be_matched, possible_matches)) == result - - -@pytest.mark.parametrize("number_of_matches, match_score, metrics, result", - [(1, np.array([[0.9, 0.3, 0.5, 0.2, 0.1]]), ['weighted_jaccard'], [0]), - (2, np.array([[0.9, 0.3, 0.5, 0.2, 0.1], [0.6, 0.7, 0.8, 0.4, 0.5]]), [ - 'weighted_jaccard', 'discounted_levenshtein'], [0, 1]), - (3, np.array([[0.9, 0.3, 0.5, 0.2, 0.1], [0.6, 0.7, 0.8, 0.4, 0.5], [1, 0.2, 0.3, 0.2, 0.1]]), [ - 'weighted_jaccard', 'discounted_levenshtein', 'iterative_sub_string'], [2, 1, 1]), - (2, np.array([[0.9, 0.3, 0.5, 0.2, 0.1], [0.6, 0.7, 0.8, 0.4, 0.5], [ - 1, 0.2, 0.3, 0.2, 0.1]]), ['tichy', 'overlap', 'bag'], [2, 1]), - (2, np.array([[0.9, 0.3, 0.5, 0.2, 0.1], [0.6, 0.7, 0.8, 0.4, 0.5]]), [ - 'overlap', 'bag'], [0, 2]), - (1, np.array([[0.9, 0.3, 0.5, 0.2, 0.1], [0.6, 0.7, 0.8, 0.4, 0.5], [ - 1, 0.2, 0.3, 0.2, 0.1]]), ['weighted_jaccard', 'overlap', 'iterative_sub_string'], [1]), - (2, np.array([[0.9, 0.3, 0.5, 0.2, 0.1], [0.6, 0.7, 0.8, 0.4, 0.5], [ - 1, 0.2, 0.3, 0.2, 0.1]]), ['weighted_jaccard', 'overlap', 'bag'], [1, 0]), - (1, np.array([[0.3, 0.3, 0.8, 0.2, 0.2]]), [ - 'weighted_jaccard'], [0]), - (3, np.array([[0.3, 0.3, 0.8, 0.2, 0.2], [0.3, 0.3, 0.8, 0.1, 0.1]]), [ - 'weighted_jaccard', 'discounted_levenshtein'], [0, 1]), - (2, np.array([[0.3, 0.3, 0.2, 0.1, 0.02], [0.1, 0.1, 0.2, 0.3, 0.02]]), [ - 'weighted_jaccard', 'iterative_sub_string'], [0, 0]), - (1, np.array([[0.3, 0.3, 0.2, 0.1, 0.02], [0.3, 0.3, 0.2, 0.3, 0.02]]), [ - 'overlap', 'iterative_sub_string'], [1]), - (1, np.array( - [[-0.5, -0.8, -0.3, -0.7, 0, 2]]), ['bag'], [0]), - (1, np.array( - [[-0.5, -0.8, -0.3, -0.7, 0, 2]]), ['BAG'], [0]), - (3, np.array([[10, 8, 7, 6, 12, 15, 14, 88]]), [ - 'weighted_jaccard'], [0]), - (2, np.array([[1, 0.3], [0.1, 0.4]]), [ - 'weighted_jaccard', 'discounted_levenshtein'], [0, 1]) - ]) + assert ( + np.argmax(name_match._score_matches(to_be_matched, possible_matches)) == result + ) + + +@pytest.mark.parametrize( + "number_of_matches, match_score, metrics, result", + [ + (1, np.array([[0.9, 0.3, 0.5, 0.2, 0.1]]), ["weighted_jaccard"], [0]), + ( + 2, + np.array([[0.9, 0.3, 0.5, 0.2, 0.1], [0.6, 0.7, 0.8, 0.4, 0.5]]), + ["weighted_jaccard", "discounted_levenshtein"], + [0, 1], + ), + ( + 3, + np.array( + [ + [0.9, 0.3, 0.5, 0.2, 0.1], + [0.6, 0.7, 0.8, 0.4, 0.5], + [1, 0.2, 0.3, 0.2, 0.1], + ] + ), + ["weighted_jaccard", "discounted_levenshtein", "iterative_sub_string"], + [2, 1, 1], + ), + ( + 2, + np.array( + [ + [0.9, 0.3, 0.5, 0.2, 0.1], + [0.6, 0.7, 0.8, 0.4, 0.5], + [1, 0.2, 0.3, 0.2, 0.1], + ] + ), + ["tichy", "overlap", "bag"], + [2, 1], + ), + ( + 2, + np.array([[0.9, 0.3, 0.5, 0.2, 0.1], [0.6, 0.7, 0.8, 0.4, 0.5]]), + ["overlap", "bag"], + [0, 2], + ), + ( + 1, + np.array( + [ + [0.9, 0.3, 0.5, 0.2, 0.1], + [0.6, 0.7, 0.8, 0.4, 0.5], + [1, 0.2, 0.3, 0.2, 0.1], + ] + ), + ["weighted_jaccard", "overlap", "iterative_sub_string"], + [1], + ), + ( + 2, + np.array( + [ + [0.9, 0.3, 0.5, 0.2, 0.1], + [0.6, 0.7, 0.8, 0.4, 0.5], + [1, 0.2, 0.3, 0.2, 0.1], + ] + ), + ["weighted_jaccard", "overlap", "bag"], + [1, 0], + ), + (1, np.array([[0.3, 0.3, 0.8, 0.2, 0.2]]), ["weighted_jaccard"], [0]), + ( + 3, + np.array([[0.3, 0.3, 0.8, 0.2, 0.2], [0.3, 0.3, 0.8, 0.1, 0.1]]), + ["weighted_jaccard", "discounted_levenshtein"], + [0, 1], + ), + ( + 2, + np.array([[0.3, 0.3, 0.2, 0.1, 0.02], [0.1, 0.1, 0.2, 0.3, 0.02]]), + ["weighted_jaccard", "iterative_sub_string"], + [0, 0], + ), + ( + 1, + np.array([[0.3, 0.3, 0.2, 0.1, 0.02], [0.3, 0.3, 0.2, 0.3, 0.02]]), + ["overlap", "iterative_sub_string"], + [1], + ), + (1, np.array([[-0.5, -0.8, -0.3, -0.7, 0, 2]]), ["bag"], [0]), + (1, np.array([[-0.5, -0.8, -0.3, -0.7, 0, 2]]), ["BAG"], [0]), + (3, np.array([[10, 8, 7, 6, 12, 15, 14, 88]]), ["weighted_jaccard"], [0]), + ( + 2, + np.array([[1, 0.3], [0.1, 0.4]]), + ["weighted_jaccard", "discounted_levenshtein"], + [0, 1], + ), + ], +) def test_rate_matches(number_of_matches, match_score, metrics, result): name_match = nm.NameMatcher() name_match._number_of_matches = number_of_matches @@ -329,37 +709,107 @@ def test_vectorise_data(name_match): assert len(name_match._vec.vocabulary_) > 0 -@pytest.mark.parametrize("match, number_of_matches, word_set, score, result", - [(pd.Series(['Nederandsche', 0, 2, 'De Nederlandsche Bank'], index=['match_name_0', 'score_0', 'match_index_0', 'original_name']), 1, set(['De', 'Bank', 'nl']), 0, 94.553), - (pd.Series(['Nederandsche', 0, 2, 'De Nederlandsche Bank'], index=[ - 'match_name_0', 'score_0', 'match_index_0', 'original_name']), 1, set(['komt', 'niet', 'voor']), 0, 69.713), - (pd.Series(['nederandsche', 0, 2, 'de nederand bank', 0.4, 3, 'De Nederlandsche Bank'], index=[ - 'match_name_0', 'score_0', 'match_index_0', 'match_name_1', 'score_1', 'match_index_1', 'original_name']), 1, set(['De', 'Bank', 'nl']), 1, 0.4), - (pd.Series(['nederandsche', 0, 2, 'de nederand bank', 0.4, 3, 'De Nederlandsche Bank'], index=[ - 'match_name_0', 'score_0', 'match_index_0', 'match_name_1', 'score_1', 'match_index_1', 'original_name']), 1, set(['De', 'Bank', 'nl']), 0, 86.031), - ]) +@pytest.mark.parametrize( + "match, number_of_matches, word_set, score, result", + [ + ( + pd.Series( + ["Nederandsche", 0, 2, "De Nederlandsche Bank"], + index=["match_name_0", "score_0", "match_index_0", "original_name"], + ), + 1, + set(["De", "Bank", "nl"]), + 0, + 94.553, + ), + ( + pd.Series( + ["Nederandsche", 0, 2, "De Nederlandsche Bank"], + index=["match_name_0", "score_0", "match_index_0", "original_name"], + ), + 1, + set(["komt", "niet", "voor"]), + 0, + 69.713, + ), + ( + pd.Series( + [ + "nederandsche", + 0, + 2, + "de nederand bank", + 0.4, + 3, + "De Nederlandsche Bank", + ], + index=[ + "match_name_0", + "score_0", + "match_index_0", + "match_name_1", + "score_1", + "match_index_1", + "original_name", + ], + ), + 1, + set(["De", "Bank", "nl"]), + 1, + 0.4, + ), + ( + pd.Series( + [ + "nederandsche", + 0, + 2, + "de nederand bank", + 0.4, + 3, + "De Nederlandsche Bank", + ], + index=[ + "match_name_0", + "score_0", + "match_index_0", + "match_name_1", + "score_1", + "match_index_1", + "original_name", + ], + ), + 1, + set(["De", "Bank", "nl"]), + 0, + 86.031, + ), + ], +) def test_postprocess(name_match, match, number_of_matches, word_set, score, result): name_match._number_of_matches = number_of_matches name_match._word_set = word_set new_match = name_match.postprocess(match) - assert new_match.loc[f'score_{score}'] == pytest.approx(result, 0.0001) - - -@pytest.mark.parametrize("indicator, punctuations, word_set, cut_off, result_1, result_2", - [('legal', False, set(), 0.01, 'plc.', 'bedrijf'), - ('legal', True, set(), 0.01, 'plc', 'bedrijf'), - ('legal', True, set(['bedrijf']), - 0.01, 'bedrijf', 'Group'), - ('common', True, set(), 0.01, 'Group', 'bedrijf'), - ('common', True, set(), 0.3, 'and', 'Group'), - ('common', True, set(['West']), - 0.3, 'West', 'bedrijf'), - ('someting', True, set(['key']), 0.01, 'key', 'val') - ]) -def test_make_no_scoring_words(name_match, indicator, punctuations, word_set, cut_off, result_1, result_2): + assert new_match.loc[f"score_{score}"] == pytest.approx(result, 0.0001) + + +@pytest.mark.parametrize( + "indicator, punctuations, word_set, cut_off, result_1, result_2", + [ + ("legal", False, set(), 0.01, "plc.", "bedrijf"), + ("legal", True, set(), 0.01, "plc", "bedrijf"), + ("legal", True, set(["bedrijf"]), 0.01, "bedrijf", "Group"), + ("common", True, set(), 0.01, "Group", "bedrijf"), + ("common", True, set(), 0.3, "and", "Group"), + ("common", True, set(["West"]), 0.3, "West", "bedrijf"), + ("someting", True, set(["key"]), 0.01, "key", "val"), + ], +) +def test_make_no_scoring_words( + name_match, indicator, punctuations, word_set, cut_off, result_1, result_2 +): name_match._preprocess_punctuations = punctuations - new_word_set = name_match._make_no_scoring_words( - indicator, word_set, cut_off) + new_word_set = name_match._make_no_scoring_words(indicator, word_set, cut_off) print(new_word_set) assert new_word_set.issuperset(set([result_1])) assert not new_word_set.issuperset(set([result_2])) @@ -371,18 +821,23 @@ def test_search_for_possible_matches_error(adjusted_name): name_matcher._search_for_possible_matches(adjusted_name) -@pytest.mark.parametrize("top_n, low_memory, number_of_rows, result_1, result_2", - [(10, True, 55, 469, 144), - (50, True, 112, 499, 6), - (100, True, 112, 499, 1), - (1, True, 112, 44, 144), - (10, False, 500, 469, 144), - (50, False, 1500, 499, 6), - (100, False, 500, 499, 1), - (1, False, 500, 44, 144) - ]) -def test_search_for_possible_matches(name_match, adjusted_name, top_n, low_memory, number_of_rows, result_1, result_2): - name_match._column_matching = 'company_name' +@pytest.mark.parametrize( + "top_n, low_memory, number_of_rows, result_1, result_2", + [ + (10, True, 55, 469, 144), + (50, True, 112, 499, 6), + (100, True, 112, 499, 1), + (1, True, 112, 44, 144), + (10, False, 500, 469, 144), + (50, False, 1500, 499, 6), + (100, False, 500, 499, 1), + (1, False, 500, 44, 144), + ], +) +def test_search_for_possible_matches( + name_match, adjusted_name, top_n, low_memory, number_of_rows, result_1, result_2 +): + name_match._column_matching = "company_name" name_match._low_memory = low_memory name_match._number_of_rows = number_of_rows name_match._top_n = top_n @@ -395,131 +850,207 @@ def test_search_for_possible_matches(name_match, adjusted_name, top_n, low_memor assert np.min(possible_match[144, :]) == result_2 -@pytest.mark.parametrize("common_words, num_matches, possible_matches, matching_series, result_0, result_1", - [(True, 3, np.array([29, 343, 126, 238, 445]), pd.Series( - ['Company and Sons'], index=['company_name']), 31.33, 31.77), - (False, 2, np.array([29, 343, 126, ]), pd.Series( - ['Company and Sons'], index=['company_name']), 71.28, 68.6), - (False, 2, np.array([29, 343]), pd.Series( - ['Company and Sons'], index=['company_name']), 71.28, 68.6), - (['Sons', 'and'], 3, np.array([29, 343, 126, 238, 445]), pd.Series( - ['Company and Sons'], index=['company_name']), 31.33, 31.77), - (False, 2, np.array([[29, 343], [0, 0]]), pd.Series( - ['Company and Sons'], index=['company_name']), 71.28, 68.6), - (False, 2, np.array([29, 343, 126, 238, 445]), pd.Series( - ['Company and Sons'], index=['company_name']), 71.28, 68.6) - ]) -def test_fuzzy_matches(name_match, common_words, num_matches, possible_matches, matching_series, result_0, result_1): - name_match._column_matching = 'company_name' +@pytest.mark.parametrize( + "common_words, num_matches, possible_matches, matching_series, result_0, result_1", + [ + ( + True, + 3, + np.array([29, 343, 126, 238, 445]), + pd.Series(["Company and Sons"], index=["company_name"]), + 31.33, + 31.77, + ), + ( + False, + 2, + np.array( + [ + 29, + 343, + 126, + ] + ), + pd.Series(["Company and Sons"], index=["company_name"]), + 71.28, + 68.6, + ), + ( + False, + 2, + np.array([29, 343]), + pd.Series(["Company and Sons"], index=["company_name"]), + 71.28, + 68.6, + ), + ( + ["Sons", "and"], + 3, + np.array([29, 343, 126, 238, 445]), + pd.Series(["Company and Sons"], index=["company_name"]), + 31.33, + 31.77, + ), + ( + False, + 2, + np.array([[29, 343], [0, 0]]), + pd.Series(["Company and Sons"], index=["company_name"]), + 71.28, + 68.6, + ), + ( + False, + 2, + np.array([29, 343, 126, 238, 445]), + pd.Series(["Company and Sons"], index=["company_name"]), + 71.28, + 68.6, + ), + ], +) +def test_fuzzy_matches( + name_match, + common_words, + num_matches, + possible_matches, + matching_series, + result_0, + result_1, +): + name_match._column_matching = "company_name" name_match._number_of_matches = num_matches name_match._postprocess_common_words = common_words if isinstance(common_words, list): name_match._word_set = set(common_words) elif common_words: - name_match._word_set = set(['Sons', 'and']) + name_match._word_set = set(["Sons", "and"]) else: name_match._word_set = set() match = name_match.fuzzy_matches(possible_matches, matching_series) - assert match['score_0'] == pytest.approx(result_0, 0.0001) - assert match['score_1'] == pytest.approx(result_1, 0.0001) - assert match['match_index_0'] in possible_matches - assert match['match_index_1'] in possible_matches + assert match["score_0"] == pytest.approx(result_0, 0.0001) + assert match["score_1"] == pytest.approx(result_1, 0.0001) + assert match["match_index_0"] in possible_matches + assert match["match_index_1"] in possible_matches def test_do_name_matching_split(name_match, adjusted_name): name_match._preprocess_split = True - result = name_match.match_names(adjusted_name.iloc[44, :], 'company_name') - assert np.any(result['match_index'] == 44) + result = name_match.match_names(adjusted_name.iloc[44, :], "company_name") + assert np.any(result["match_index"] == 44) def test_do_name_matching_series(name_match, adjusted_name): - result = name_match.match_names(adjusted_name.iloc[44, :], 'company_name') - assert np.any(result['match_index'] == 44) + result = name_match.match_names(adjusted_name.iloc[44, :], "company_name") + assert np.any(result["match_index"] == 44) + def test_do_name_matching_full(name_match, adjusted_name): - result = name_match.match_names(adjusted_name, 'company_name') - assert np.sum(result['match_index'] == result.index) == 491 - -@pytest.mark.parametrize("old_index, new_index, adjust, size_a, size_b, match_result", - [[10, 'new', False, 20, 20, 'new'], - [10, 'new', True, 20, 20, 10], - [10, 526, False, 20, 20, 526], - [10, 526, True, 20, 20, 10], - [4, 201, True, 20, 50, 4], - [8, 201, False, 20, 50, 201], - [8, 44, True, 50, 20, 8], - [4, 44, False, 50, 20, 44], - ]) -def test_do_name_matching_switch_index(original_name, old_index, new_index, adjust, size_a, size_b, match_result): + result = name_match.match_names(adjusted_name, "company_name") + assert np.sum(result["match_index"] == result.index) == 493 + + +@pytest.mark.parametrize( + "old_index, new_index, adjust, size_a, size_b, match_result", + [ + [10, "new", False, 20, 20, "new"], + [10, "new", True, 20, 20, 10], + [10, 526, False, 20, 20, 526], + [10, 526, True, 20, 20, 10], + [4, 201, True, 20, 50, 4], + [8, 201, False, 20, 50, 201], + [8, 44, True, 50, 20, 8], + [4, 44, False, 50, 20, 44], + ], +) +def test_do_name_matching_switch_index( + original_name, old_index, new_index, adjust, size_a, size_b, match_result +): name_match = nm.NameMatcher(row_numbers=adjust, verbose=False) adjusted_name = original_name.copy() - original_name = original_name.rename(index={old_index:new_index}) + original_name = original_name.rename(index={old_index: new_index}) name_match.load_and_process_master_data( - 'company_name', original_name.iloc[:size_a,:], start_processing=False, transform=False) - result = name_match.match_names(adjusted_name.iloc[:size_b,:], 'company_name') - assert result.loc[old_index, 'match_index'] == match_result + "company_name", + original_name.iloc[:size_a, :], + start_processing=False, + transform=False, + ) + result = name_match.match_names(adjusted_name.iloc[:size_b, :], "company_name") + assert result.loc[old_index, "match_index"] == match_result + def test_do_name_matching_error(adjusted_name): name_match = nm.NameMatcher() with pytest.raises(ValueError): - name_match.match_names(adjusted_name, 'company_name') + name_match.match_names(adjusted_name, "company_name") @pytest.mark.parametrize("verbose", [True, False]) def test_do_name_matching_print(capfd, name_match, adjusted_name, verbose): name_match._verbose = verbose - name_match.match_names(adjusted_name.iloc[:5].copy(), 'company_name') + name_match.match_names(adjusted_name.iloc[:5].copy(), "company_name") out, err = capfd.readouterr() if verbose: - assert out.find('preprocessing') > -1 - assert out.find('searching') > -1 - assert out.find('possible') > -1 - assert out.find('fuzzy') > -1 - assert out.find('done') > -1 + assert out.find("preprocessing") > -1 + assert out.find("searching") > -1 + assert out.find("possible") > -1 + assert out.find("fuzzy") > -1 + assert out.find("done") > -1 else: - assert out == '' - - -@pytest.mark.parametrize("word, occurrence_count, result", - [['fun snail pool', 2, 'snail'], - ['fun snail pool', 3, 'fun snail'], - ['fun snail pool', 1, ''], - ['fun small pool', 3, 'fun small pool'], - ['fun snail', 3, 'fun snail'], - ['fun small pool', 5, 'fun small pool']]) + assert out == "" + + +@pytest.mark.parametrize( + "word, occurrence_count, result", + [ + ["fun snail pool", 2, "snail"], + ["fun snail pool", 3, "fun snail"], + ["fun snail pool", 1, ""], + ["fun small pool", 3, "fun small pool"], + ["fun snail", 3, "fun snail"], + ["fun small pool", 5, "fun small pool"], + ], +) def test_select_top_words(word, words, occurrence_count, result): word_counts = pd.Series(words).value_counts() name_match = nm.NameMatcher() - new_word = name_match._select_top_words( - word.split(), word_counts, occurrence_count) + new_word = name_match._select_top_words(word.split(), word_counts, occurrence_count) assert new_word == result -@pytest.mark.parametrize("match, num_of_matches, result", - [[{'match_name_1': 'fun', 'match_name_2': 'dog', - 'match_name_0': 'cat'}, 3, ['cat', 'fun', 'dog']], - [{'match_name_1': 'fun', 'match_name_2': 'dog', - 'match_name_0': 'cat'}, 2, ['cat', 'fun']], - [{'match_name_1': 'fun', 'match_name_0': 'cat'}, - 2, ['cat', 'fun']], - [{'match_name_1': 'fun', 'match_name_2': 'dog', 'match_name_0': 'cat'}, 0, []]]) +@pytest.mark.parametrize( + "match, num_of_matches, result", + [ + [ + {"match_name_1": "fun", "match_name_2": "dog", "match_name_0": "cat"}, + 3, + ["cat", "fun", "dog"], + ], + [ + {"match_name_1": "fun", "match_name_2": "dog", "match_name_0": "cat"}, + 2, + ["cat", "fun"], + ], + [{"match_name_1": "fun", "match_name_0": "cat"}, 2, ["cat", "fun"]], + [{"match_name_1": "fun", "match_name_2": "dog", "match_name_0": "cat"}, 0, []], + ], +) def test_get_alternative_names(match, num_of_matches, result): name_match = nm.NameMatcher(number_of_matches=num_of_matches) res = name_match._get_alternative_names(pd.Series(match)) assert res == result -@pytest.mark.parametrize("preprocess_punctuations, output, input, x", - [[True, '_blame_', {'test': ['fun...', 'done'], 'num':['_.blame._']}, 2], - [True, 'done', {'test': ['fun. . . ', - 'done'], 'num':['_.blame._']}, 1], - [True, 'fun', { - 'test': ['fun. . . ', 'done'], 'num':['_.blame._']}, 0], - [False, 'fun. . .', { - 'test': ['fun. . . ', 'done'], 'num':['_.blame._']}, 0], - [False, 'fun. . .', { - 'num': ['_.blame._'], 'test': ['fun. . . ', 'done']}, 1] - ]) +@pytest.mark.parametrize( + "preprocess_punctuations, output, input, x", + [ + [True, "_blame_", {"test": ["fun...", "done"], "num": ["_.blame._"]}, 2], + [True, "done", {"test": ["fun. . . ", "done"], "num": ["_.blame._"]}, 1], + [True, "fun", {"test": ["fun. . . ", "done"], "num": ["_.blame._"]}, 0], + [False, "fun. . .", {"test": ["fun. . . ", "done"], "num": ["_.blame._"]}, 0], + [False, "fun. . .", {"num": ["_.blame._"], "test": ["fun. . . ", "done"]}, 1], + ], +) def test_preprocess_word_list(preprocess_punctuations, output, input, x): name_match = nm.NameMatcher(punctuations=preprocess_punctuations) res = name_match._preprocess_word_list(input) @@ -527,36 +1058,75 @@ def test_preprocess_word_list(preprocess_punctuations, output, input, x): assert res[x] == output -@pytest.mark.parametrize("num_matches, match_score, match, result, y", - [[3, np.array([[1, 1, 1], [1, 1, 1], [0, 0, 0]]), pd.Series(dtype=float), 100, 0], - [2, np.array([[1, 1], [0.4, 0.4], [0, 0]]), - pd.Series(dtype=float), 40, 1], - [1, np.array([[1, 1], [1, 1], [0, 0]]), - pd.Series(dtype=float), 100, 0] - ]) +@pytest.mark.parametrize( + "num_matches, match_score, match, result, y", + [ + [ + 3, + np.array([[1, 1, 1], [1, 1, 1], [0, 0, 0]]), + pd.Series(dtype=float), + 100, + 0, + ], + [2, np.array([[1, 1], [0.4, 0.4], [0, 0]]), pd.Series(dtype=float), 40, 1], + [1, np.array([[1, 1], [1, 1], [0, 0]]), pd.Series(dtype=float), 100, 0], + ], +) def test_adjust_scores(num_matches, match_score, match, result, y): name_match = nm.NameMatcher(number_of_matches=num_matches) match = name_match._adjust_scores(match_score, match) assert match.iloc[y] == result -@pytest.mark.parametrize("string, stringlist, result_1, result_2, y", - [['know sign first', ['know', 'know sign', 'know sign first'], 'know first', 'know first', 2], - ['know sign first', ['know', 'know sign', - 'know sign first'], 'know first', 'know', 1], - ['know sign first', ['know', 'know sign', - 'know sign first'], 'know first', 'know', 0], - ['know first', ['know', 'know', 'know'], - 'know first', 'know', 1], - ['pool sign small', ['sign small', - 'small pool sign', 'small'], '', '', 0], - ['pool sign small know', ['sign small', - 'small pool sign', 'small'], 'know', '', 0], - ['know pool sign small', ['sign small', - 'small pool sign', 'small'], 'know', '', 0], - ['pool sign small', ['sign small', - 'small pool know sign', 'small'], '', 'know', 1], - ]) +@pytest.mark.parametrize( + "string, stringlist, result_1, result_2, y", + [ + [ + "know sign first", + ["know", "know sign", "know sign first"], + "know first", + "know first", + 2, + ], + [ + "know sign first", + ["know", "know sign", "know sign first"], + "know first", + "know", + 1, + ], + [ + "know sign first", + ["know", "know sign", "know sign first"], + "know first", + "know", + 0, + ], + ["know first", ["know", "know", "know"], "know first", "know", 1], + ["pool sign small", ["sign small", "small pool sign", "small"], "", "", 0], + [ + "pool sign small know", + ["sign small", "small pool sign", "small"], + "know", + "", + 0, + ], + [ + "know pool sign small", + ["sign small", "small pool sign", "small"], + "know", + "", + 0, + ], + [ + "pool sign small", + ["sign small", "small pool know sign", "small"], + "", + "know", + 1, + ], + ], +) def test_process_words(words, string, stringlist, result_1, result_2, y): name_match = nm.NameMatcher() name_match._word_set = set(words) @@ -565,28 +1135,30 @@ def test_process_words(words, string, stringlist, result_1, result_2, y): assert stringlist[y] == result_2 -@pytest.mark.parametrize("word_set, cut_off, result_1, result_2", - [[set(), 0, 635, 'Group'], - [set(), 0, 635, 'and'], - [set(), 0.1, 7, 'Group'], - [set(), 0.1, 7, 'LLC'], - [set(), 0.12, 7, 'LLC'], - [set(), 0.2, 1, 'and'], - [set(['apple']), 1, 1, 'apple'], - [set(['apple']), 0, 636, 'apple'], - [set(['apple']), 0, 636, 'Group'] - ]) +@pytest.mark.parametrize( + "word_set, cut_off, result_1, result_2", + [ + [set(), 0, 635, "Group"], + [set(), 0, 635, "and"], + [set(), 0.1, 7, "Group"], + [set(), 0.1, 7, "LLC"], + [set(), 0.12, 7, "LLC"], + [set(), 0.2, 1, "and"], + [set(["apple"]), 1, 1, "apple"], + [set(["apple"]), 0, 636, "apple"], + [set(["apple"]), 0, 636, "Group"], + ], +) def test_process_common_words(name_match, word_set, cut_off, result_1, result_2): words = name_match._process_common_words(word_set, cut_off) assert result_2 in words assert len(words) == result_1 -@pytest.mark.parametrize("common_words, error", [[True, False], - [[], False], - [set(), False], - [dict(), True], - ["", True]]) +@pytest.mark.parametrize( + "common_words, error", + [[True, False], [[], False], [set(), False], [dict(), True], ["", True]], +) def test_common_words_type_error(common_words, error): if error: with pytest.raises(TypeError): @@ -599,33 +1171,44 @@ def test_common_words_type_error(common_words, error): assert name_matcher._word_set == set(common_words) -@pytest.mark.parametrize("common_words, legal_suffixes", [[['Cherry', 'Stream', 'Puzzle', 'Balloon', 'Candle', 'Mirror'], False], - [['Cherry', 'Stream', 'Puzzle', 'Balloon', 'Candle', 'Mirror'], True], - [['Cherry'], False], - [['Cherry'], True], - [['limited', 'gmbh'], False], - [['limited', 'gmbh'], True],]) +@pytest.mark.parametrize( + "common_words, legal_suffixes", + [ + [["Cherry", "Stream", "Puzzle", "Balloon", "Candle", "Mirror"], False], + [["Cherry", "Stream", "Puzzle", "Balloon", "Candle", "Mirror"], True], + [["Cherry"], False], + [["Cherry"], True], + [["limited", "gmbh"], False], + [["limited", "gmbh"], True], + ], +) def test_common_words_addition(original_name, common_words, legal_suffixes): - name_matcher = nm.NameMatcher(common_words=common_words, legal_suffixes=legal_suffixes) - name_matcher.load_and_process_master_data( - 'company_name', original_name, start_processing=False, transform=False) - name_matcher._process_matching_data(transform=False) - for word in common_words: - assert word in name_matcher._word_set - - -@pytest.mark.parametrize("word_set, preprocess, result_1, result_2, result_3", - [[set(), True, 0, 'company', True], - [set(), True, 0, '3ao', True], - [set(), True, 0, 'g.m.b.h.', False], - [set(), False, 0, '& company', True], - [set(), False, 0, '3ao', True], - [set(), False, 0, 'g.m.b.h.', True], - [set(['apple']), True, 1, 'apple', True], - [set(['apple']), False, 1, 'apple', True], - [set(['apple..']), True, 1, 'apple..', True], - [set(['apple..']), False, 1, 'apple..', True] - ]) + name_matcher = nm.NameMatcher( + common_words=common_words, legal_suffixes=legal_suffixes + ) + name_matcher.load_and_process_master_data( + "company_name", original_name, start_processing=False, transform=False + ) + name_matcher._process_matching_data(transform=False) + for word in common_words: + assert word in name_matcher._word_set + + +@pytest.mark.parametrize( + "word_set, preprocess, result_1, result_2, result_3", + [ + [set(), True, 0, "company", True], + [set(), True, 0, "3ao", True], + [set(), True, 0, "g.m.b.h.", False], + [set(), False, 0, "& company", True], + [set(), False, 0, "3ao", True], + [set(), False, 0, "g.m.b.h.", True], + [set(["apple"]), True, 1, "apple", True], + [set(["apple"]), False, 1, "apple", True], + [set(["apple.."]), True, 1, "apple..", True], + [set(["apple.."]), False, 1, "apple..", True], + ], +) def test_process_legal_words(word_set, preprocess, result_1, result_2, result_3): name_match = nm.NameMatcher() name_match._preprocess_punctuations = preprocess diff --git a/name_matching/test/test_sparse_cosine.py b/name_matching/test/test_sparse_cosine.py index 0758595..0321e4f 100644 --- a/name_matching/test/test_sparse_cosine.py +++ b/name_matching/test/test_sparse_cosine.py @@ -2,219 +2,286 @@ import pytest from scipy.sparse import csc_matrix -from name_matching.sparse_cosine import _sparse_cosine_top_n_standard, _sparse_cosine_low_memory, sparse_cosine_top_n +from name_matching.sparse_cosine import ( + _sparse_cosine_top_n_standard, + _sparse_cosine_low_memory, + sparse_cosine_top_n, +) + def assert_values_in_array(A1, A2): - assert len(A1) == len(A2) + assert len(A1) == len(A2) A1.sort() A2.sort() np.testing.assert_array_almost_equal(A1, A2, decimal=2) + @pytest.fixture def mat_a(): - return csc_matrix(np.array([[0. , 0. , 0. , 0. , 0. , 0.3, 0.2, 0.1, 0.3, 0.4], - [0. , 0. , 0. , 0. , 0.6, 0.5, 0. , 0. , 0. , 0. ], - [0. , 0. , 0.6, 0.1, 0. , 0.9, 0. , 0. , 0.5, 0. ], - [0. , 0.3, 0. , 0.4, 0. , 0.6, 0. , 0.1, 0. , 0. ], - [0. , 0. , 0. , 0. , 0. , 0. , 0.3, 0. , 0. , 0. ], - [0. , 0. , 0. , 0. , 0. , 0.9, 0. , 0. , 0.2, 0. ], - [0.7, 0. , 0. , 0.2, 0.3, 0.9, 0. , 0.3, 0. , 0.5], - [0.9, 0.9, 0. , 0.3, 0.9, 0.9, 0. , 0. , 0. , 0. ], - [0.9, 0.5, 0. , 0. , 0. , 0.5, 0.4, 0. , 0. , 0.7], - [0.1, 0. , 0. , 0.2, 0. , 0.4, 0. , 0.9, 0. , 0.7]])) + return csc_matrix( + np.array( + [ + [0.0, 0.0, 0.0, 0.0, 0.0, 0.3, 0.2, 0.1, 0.3, 0.4], + [0.0, 0.0, 0.0, 0.0, 0.6, 0.5, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.6, 0.1, 0.0, 0.9, 0.0, 0.0, 0.5, 0.0], + [0.0, 0.3, 0.0, 0.4, 0.0, 0.6, 0.0, 0.1, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.9, 0.0, 0.0, 0.2, 0.0], + [0.7, 0.0, 0.0, 0.2, 0.3, 0.9, 0.0, 0.3, 0.0, 0.5], + [0.9, 0.9, 0.0, 0.3, 0.9, 0.9, 0.0, 0.0, 0.0, 0.0], + [0.9, 0.5, 0.0, 0.0, 0.0, 0.5, 0.4, 0.0, 0.0, 0.7], + [0.1, 0.0, 0.0, 0.2, 0.0, 0.4, 0.0, 0.9, 0.0, 0.7], + ] + ) + ) + + @pytest.fixture def mat_b(): - return csc_matrix(np.array([[0. , 0. , 0.4, 0. , 0.2, 0. , 0. , 0.4, 0. , 0. ], - [0. , 0. , 0. , 0.3, 0. , 0. , 0.4, 0. , 0. , 0. ], - [0. , 0.9, 0.9, 0.9, 0. , 0.1, 0.2, 0.6, 0. , 0. ], - [0. , 0.4, 0. , 0. , 0. , 0. , 0.9, 0. , 0. , 0. ], - [0. , 0. , 0. , 0. , 0. , 0. , 0.1, 0. , 0.4, 0. ], - [0. , 0. , 0. , 0. , 1. , 0.6, 0.6, 0. , 0. , 0. ], - [0. , 0. , 0. , 0.3, 0.6, 0. , 0.9, 0. , 0. , 0. ], - [0. , 0. , 0. , 0.9, 0. , 0. , 0. , 0. , 0. , 0.9], - [0. , 0. , 0.8, 0. , 0. , 0. , 1. , 0. , 0. , 0. ], - [0. , 0. , 0.4, 0. , 0. , 0. , 0.8, 0.3, 0. , 0. ]])) + return csc_matrix( + np.array( + [ + [0.0, 0.0, 0.4, 0.0, 0.2, 0.0, 0.0, 0.4, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.3, 0.0, 0.0, 0.4, 0.0, 0.0, 0.0], + [0.0, 0.9, 0.9, 0.9, 0.0, 0.1, 0.2, 0.6, 0.0, 0.0], + [0.0, 0.4, 0.0, 0.0, 0.0, 0.0, 0.9, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.4, 0.0], + [0.0, 0.0, 0.0, 0.0, 1.0, 0.6, 0.6, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.3, 0.6, 0.0, 0.9, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.9, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9], + [0.0, 0.0, 0.8, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.4, 0.0, 0.0, 0.0, 0.8, 0.3, 0.0, 0.0], + ] + ) + ) + + @pytest.fixture def result_a_b(): - return np.array([[9., 3., 0., 7., 6., 1., 2., 0., 0., 0.], - [8., 4., 0., 9., 7., 6., 3., 2., 0., 0.], - [4., 5., 1., 0., 9., 6., 2., 8., 7., 3.], - [4., 0., 8., 7., 3., 0., 0., 0., 0., 0.], - [5., 2., 8., 4., 0., 0., 0., 0., 0., 0.], - [4., 9., 8., 5., 3., 2., 0., 7., 6., 1.], - [8., 4., 0., 1., 9., 7., 6., 3., 2., 0.], - [8., 0., 9., 7., 6., 3., 2., 0., 0., 0.], - [8., 4., 0., 2., 0., 0., 0., 0., 0., 0.], - [9., 6., 3., 8., 4., 0., 2., 0., 0., 0.]]) + return np.array( + [ + [9.0, 3.0, 0.0, 7.0, 6.0, 1.0, 2.0, 0.0, 0.0, 0.0], + [8.0, 4.0, 0.0, 9.0, 7.0, 6.0, 3.0, 2.0, 0.0, 0.0], + [4.0, 5.0, 1.0, 0.0, 9.0, 6.0, 2.0, 8.0, 7.0, 3.0], + [4.0, 0.0, 8.0, 7.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [5.0, 2.0, 8.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [4.0, 9.0, 8.0, 5.0, 3.0, 2.0, 0.0, 7.0, 6.0, 1.0], + [8.0, 4.0, 0.0, 1.0, 9.0, 7.0, 6.0, 3.0, 2.0, 0.0], + [8.0, 0.0, 9.0, 7.0, 6.0, 3.0, 2.0, 0.0, 0.0, 0.0], + [8.0, 4.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [9.0, 6.0, 3.0, 8.0, 4.0, 0.0, 2.0, 0.0, 0.0, 0.0], + ] + ) + + @pytest.fixture def result_a_b1(): - return np.array([[9.], - [8.], - [7.], - [8.], - [2.], - [7.], - [7.], - [9.], - [2.], - [8.]]) + return np.array( + [[9.0], [8.0], [7.0], [8.0], [2.0], [7.0], [7.0], [9.0], [2.0], [8.0]] + ) + + @pytest.fixture def result_a_b3(): - return np.array([[7., 2., 9.], - [4., 3., 8.], - [3., 7., 9.], - [4., 7., 8.], - [5., 2., 0.], - [6., 1., 7.], - [1., 8., 7.], - [6., 8., 9.], - [4., 8., 2.], - [4., 9., 8.]]) + return np.array( + [ + [7.0, 2.0, 9.0], + [4.0, 3.0, 8.0], + [3.0, 7.0, 9.0], + [4.0, 7.0, 8.0], + [5.0, 2.0, 0.0], + [6.0, 1.0, 7.0], + [1.0, 8.0, 7.0], + [6.0, 8.0, 9.0], + [4.0, 8.0, 2.0], + [4.0, 9.0, 8.0], + ] + ) + + @pytest.fixture def mat_c(): - return csc_matrix(np.array([[0.2, 0.5, 0.2, 0.1, 0.5, 0. ], - [0.2, 0.9, 0.3, 0.4, 0.4, 0.7], - [0. , 0. , 0.4, 0. , 0. , 0. ], - [0. , 0.5, 0. , 0.3, 0.8, 0. ], - [0.7, 0.9, 0. , 0.7, 0.9, 0.2], - [0.2, 0.1, 0.8, 0. , 0. , 0.1]])) + return csc_matrix( + np.array( + [ + [0.2, 0.5, 0.2, 0.1, 0.5, 0.0], + [0.2, 0.9, 0.3, 0.4, 0.4, 0.7], + [0.0, 0.0, 0.4, 0.0, 0.0, 0.0], + [0.0, 0.5, 0.0, 0.3, 0.8, 0.0], + [0.7, 0.9, 0.0, 0.7, 0.9, 0.2], + [0.2, 0.1, 0.8, 0.0, 0.0, 0.1], + ] + ) + ) + + @pytest.fixture def mat_d(): - return csc_matrix(np.array([[0.8, 0. , 0. , 0. , 0.1, 0. ], - [0. , 0. , 0. , 0.4, 0. , 0. ], - [0.3, 0.4, 0. , 0. , 0. , 0.7], - [0. , 0. , 0. , 0. , 0. , 0. ], - [0.1, 0.1, 0.4, 0.4, 0. , 0. ], - [0.8, 0. , 0.5, 0.8, 0.2, 0. ]])) + return csc_matrix( + np.array( + [ + [0.8, 0.0, 0.0, 0.0, 0.1, 0.0], + [0.0, 0.0, 0.0, 0.4, 0.0, 0.0], + [0.3, 0.4, 0.0, 0.0, 0.0, 0.7], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.1, 0.1, 0.4, 0.4, 0.0, 0.0], + [0.8, 0.0, 0.5, 0.8, 0.2, 0.0], + ] + ) + ) + + @pytest.fixture def result_c_d(): - return np.array([[3., 5., 4., 1., 0., 0.], - [4., 3., 1., 0., 0., 0.], - [3., 5., 4., 1., 0., 0.], - [0., 0., 0., 0., 0., 0.], - [2., 3., 5., 4., 1., 0.], - [3., 2., 5., 4., 1., 0.]]) - + return np.array( + [ + [3.0, 5.0, 4.0, 1.0, 0.0, 0.0], + [4.0, 3.0, 1.0, 0.0, 0.0, 0.0], + [3.0, 5.0, 4.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [2.0, 3.0, 5.0, 4.0, 1.0, 0.0], + [3.0, 2.0, 5.0, 4.0, 1.0, 0.0], + ] + ) + + @pytest.fixture def result_c_d1(): - return np.array([[4], - [4], - [1], - [0], - [4], - [4]]) - + return np.array([[4], [4], [1], [0], [4], [4]]) + + @pytest.fixture def result_c_d4(): - return np.array([[5., 4., 1., 0.], - [4., 3., 1., 0.], - [3., 4., 1., 0.], - [0., 0., 0., 0.], - [0., 4., 1., 5.], - [0., 4., 1., 5.]]) - - -@pytest.mark.parametrize("top_n, num_rows", - [(10, 10), - (10, 8), - (10, 7), - (10, 1)] - ) + return np.array( + [ + [5.0, 4.0, 1.0, 0.0], + [4.0, 3.0, 1.0, 0.0], + [3.0, 4.0, 1.0, 0.0], + [0.0, 0.0, 0.0, 0.0], + [0.0, 4.0, 1.0, 5.0], + [0.0, 4.0, 1.0, 5.0], + ] + ) + + +@pytest.mark.parametrize("top_n, num_rows", [(10, 10), (10, 8), (10, 7), (10, 1)]) def test_cosine_standard(top_n, num_rows, mat_a, mat_b, result_a_b): - np.testing.assert_array_equal(_sparse_cosine_top_n_standard(mat_a, mat_b, num_rows, top_n, False), result_a_b) - -@pytest.mark.parametrize("top_n, num_rows", - [(1, 10), - (1, 8), - (1, 7), - (1, 1)] - ) + np.testing.assert_array_equal( + _sparse_cosine_top_n_standard(mat_a, mat_b, num_rows, top_n, False), result_a_b + ) + + +@pytest.mark.parametrize("top_n, num_rows", [(1, 10), (1, 8), (1, 7), (1, 1)]) def test_cosine_standard1(top_n, num_rows, mat_a, mat_b, result_a_b1): - np.testing.assert_array_equal(_sparse_cosine_top_n_standard(mat_a, mat_b, num_rows, top_n, False), result_a_b1) - -@pytest.mark.parametrize("top_n, num_rows", - [(3, 10), - (3, 8), - (3, 7), - (3, 1)] - ) + np.testing.assert_array_equal( + _sparse_cosine_top_n_standard(mat_a, mat_b, num_rows, top_n, False), result_a_b1 + ) + + +@pytest.mark.parametrize("top_n, num_rows", [(3, 10), (3, 8), (3, 7), (3, 1)]) def test_cosine_standard3(top_n, num_rows, mat_a, mat_b, result_a_b3): - np.testing.assert_array_equal(_sparse_cosine_top_n_standard(mat_a, mat_b, num_rows, top_n, False), result_a_b3) - -@pytest.mark.parametrize("top_n, num_rows", - [(7, 10), - (6, 8), - (9, 7), - (6, 1)] - ) + results = _sparse_cosine_top_n_standard(mat_a, mat_b, num_rows, top_n, False) + for row_0, row_1 in zip(results, result_a_b3): + np.testing.assert_array_equal(np.sort(row_0), np.sort(row_1)) + + +@pytest.mark.parametrize("top_n, num_rows", [(7, 10), (6, 8), (9, 7), (6, 1)]) def test_cosine_standard_c(top_n, num_rows, mat_c, mat_d, result_c_d): - np.testing.assert_array_equal(_sparse_cosine_top_n_standard(mat_c, mat_d, num_rows, top_n, False)[:,:6], result_c_d) - -@pytest.mark.parametrize("top_n, num_rows", - [(4, 5), - (4, 4), - (4, 3), - (4, 1)] - ) + results = _sparse_cosine_top_n_standard(mat_c, mat_d, num_rows, top_n, False)[:, :6] + for row_0, row_1 in zip(results, result_c_d): + np.testing.assert_array_equal(np.sort(row_0), np.sort(row_1)) + + +@pytest.mark.parametrize("top_n, num_rows", [(4, 5), (4, 4), (4, 3), (4, 1)]) def test_cosine_standard_c4(top_n, num_rows, mat_c, mat_d, result_c_d4): - np.testing.assert_array_equal(_sparse_cosine_top_n_standard(mat_c, mat_d, num_rows, top_n, False), result_c_d4) - -@pytest.mark.parametrize("top_n, num_rows", - [(1, 10), - (1, 3), - (1, 2), - (1, 1)] - ) + results = _sparse_cosine_top_n_standard(mat_c, mat_d, num_rows, top_n, False) + for row_0, row_1 in zip(results, result_c_d4): + np.testing.assert_array_equal(np.sort(row_0), np.sort(row_1)) + + +@pytest.mark.parametrize("top_n, num_rows", [(1, 10), (1, 3), (1, 2), (1, 1)]) def test_cosine_standard_c1(top_n, num_rows, mat_c, mat_d, result_c_d1): - np.testing.assert_array_equal(_sparse_cosine_top_n_standard(mat_c, mat_d, num_rows, top_n, False), result_c_d1) + np.testing.assert_array_equal( + _sparse_cosine_top_n_standard(mat_c, mat_d, num_rows, top_n, False), result_c_d1 + ) + -@pytest.mark.parametrize("row", - [[1],[2],[3],[4],[5],[0]] - ) +@pytest.mark.parametrize("row", [[1], [2], [3], [4], [5], [0]]) def test_cosine_top_n_cd_low_memory(row, mat_a, mat_b): mat_a_co = csc_matrix(mat_a).tocoo() - low_memory_result = _sparse_cosine_low_memory(matrix_row = mat_a_co.row, matrix_col = mat_a_co.col, - matrix_data = mat_a_co.data, matrix_len = mat_a_co.shape[0], vector_ind = mat_b[row,:].tocsr().indices, - vector_data = mat_b[row,:].tocsr().data) - ordinary_result = (mat_a * (mat_b).T).todense()[:,row] - np.testing.assert_array_almost_equal(low_memory_result.reshape(-1,1), ordinary_result, decimal=3) - -@pytest.mark.parametrize("top_n, num_rows, row", - [(1, 10, 2), - (2, 3, 3), - (3, 2, 1), - (3, 0, 5), - (3, 3, 0), - (6, 2, 1), - (3, 0, 4), - (5, 0, 2), - (8, 1, 2)] - ) + low_memory_result = _sparse_cosine_low_memory( + matrix_row=mat_a_co.row, + matrix_col=mat_a_co.col, + matrix_data=mat_a_co.data, + matrix_len=mat_a_co.shape[0], + vector_ind=mat_b[row, :].tocsr().indices, + vector_data=mat_b[row, :].tocsr().data, + ) + ordinary_result = (mat_a * (mat_b).T).todense()[:, row] + np.testing.assert_array_almost_equal( + low_memory_result.reshape(-1, 1), ordinary_result, decimal=3 + ) + + +@pytest.mark.parametrize( + "top_n, num_rows, row", + [ + (1, 10, 2), + (2, 3, 3), + (3, 2, 1), + (3, 0, 5), + (3, 3, 0), + (6, 2, 1), + (3, 0, 4), + (5, 0, 2), + (8, 1, 2), + ], +) def test_cosine_top_n_cd(top_n, num_rows, row, mat_c, mat_d): if num_rows == 0: - assert_values_in_array(sparse_cosine_top_n(mat_c.tocoo(), mat_d[row,:].tocsr(), top_n, True, num_rows, False).reshape(1,-1), - _sparse_cosine_top_n_standard(mat_c, mat_d[row,:], num_rows + 1, top_n, False)) + assert_values_in_array( + sparse_cosine_top_n( + mat_c.tocoo(), mat_d[row, :].tocsr(), top_n, True, num_rows, False + ).reshape(1, -1), + _sparse_cosine_top_n_standard( + mat_c, mat_d[row, :], num_rows + 1, top_n, False + ), + ) else: - np.testing.assert_array_equal(sparse_cosine_top_n(mat_c, mat_d, top_n, False, num_rows, False), _sparse_cosine_top_n_standard(mat_c, mat_d, num_rows, top_n, False)) - - -@pytest.mark.parametrize("top_n, num_rows, row", - [(1, 10, 2), - (2, 3, 3), - (6, 2, 1), - (3, 0, 5), - (3, 3, 0), - (6, 2, 1), - (4, 0, 4), - (1, 0, 8), - (2, 0, 6), - (6, 0, 2), - (8, 1, 2)] - ) + np.testing.assert_array_equal( + sparse_cosine_top_n(mat_c, mat_d, top_n, False, num_rows, False), + _sparse_cosine_top_n_standard(mat_c, mat_d, num_rows, top_n, False), + ) + + +@pytest.mark.parametrize( + "top_n, num_rows, row", + [ + (1, 10, 2), + (2, 3, 3), + (6, 2, 1), + (3, 0, 5), + (3, 3, 0), + (6, 2, 1), + (4, 0, 4), + (1, 0, 8), + (2, 0, 6), + (6, 0, 2), + (8, 1, 2), + ], +) def test_cosine_top_n_ab(top_n, num_rows, row, mat_a, mat_b): if num_rows == 0: - assert_values_in_array(sparse_cosine_top_n(mat_a.tocoo(), mat_b[row,:].tocsr(), top_n, True, num_rows, False).reshape(1,-1), - _sparse_cosine_top_n_standard(mat_a, mat_b[row,:], num_rows + 1, top_n, False)) + assert_values_in_array( + sparse_cosine_top_n( + mat_a.tocoo(), mat_b[row, :].tocsr(), top_n, True, num_rows, False + ).reshape(1, -1), + _sparse_cosine_top_n_standard( + mat_a, mat_b[row, :], num_rows + 1, top_n, False + ), + ) else: - np.testing.assert_array_equal(sparse_cosine_top_n(mat_a, mat_b, top_n, False, num_rows, False), _sparse_cosine_top_n_standard(mat_a, mat_b, num_rows, top_n, False)) - \ No newline at end of file + np.testing.assert_array_equal( + sparse_cosine_top_n(mat_a, mat_b, top_n, False, num_rows, False), + _sparse_cosine_top_n_standard(mat_a, mat_b, num_rows, top_n, False), + ) diff --git a/setup.py b/setup.py index bd0937b..d73ef25 100644 --- a/setup.py +++ b/setup.py @@ -5,21 +5,17 @@ long_description = (this_directory / "README.md").read_text() setup( - name='name_matching', - version='0.8.10', - description='A package for the matching of company names', - author='Michiel Nijhuis', - author_email='m.nijhuis@dnb.nl', - project_urls = { - 'Documentation': 'https://name-matching.readthedocs.io/en/latest/index.html', - 'Source Code': 'https://github.com/DeNederlandscheBank/name_matching'}, - packages=['name_matching','distances'], - install_requires = [ - 'cleanco', - 'scikit-learn', - 'pandas', - 'numpy', - 'tqdm'], + name="name_matching", + version="0.8.11", + description="A package for the matching of company names", + author="Michiel Nijhuis", + author_email="m.nijhuis@dnb.nl", + project_urls={ + "Documentation": "https://name-matching.readthedocs.io/en/latest/index.html", + "Source Code": "https://github.com/DeNederlandscheBank/name_matching", + }, + packages=["name_matching", "distances"], + install_requires=["cleanco", "scikit-learn", "pandas", "numpy", "tqdm"], long_description=long_description, - long_description_content_type='text/markdown', + long_description_content_type="text/markdown", )