diff --git a/name_matching/name_matcher.py b/name_matching/name_matcher.py index 4dd19f3..568e0fd 100644 --- a/name_matching/name_matcher.py +++ b/name_matching/name_matcher.py @@ -125,7 +125,6 @@ def __init__(self, self._postprocess_company_legal_id = legal_suffixes self._postprocess_common_words = common_words - self._preprocess_common_words = pd.Series(dtype=float) self._preprocess_split = preprocess_split self._cut_off = cut_off_no_scoring_words @@ -134,6 +133,7 @@ def __init__(self, 'legal', self._word_set, self._cut_off) self._original_indexes = not row_numbers + self._original_index = None self.set_distance_metrics(distance_metrics) @@ -235,12 +235,10 @@ def _preprocess_reduce(self, individual_words = to_be_matched[self._column_matching].str.split( expand=True).stack() word_counts = individual_words.value_counts() - preprocess_common_words_inst = pd.concat([self._preprocess_common_words, - word_counts]).fillna(0) to_be_matched_new = to_be_matched.copy() name = to_be_matched[self._column_matching].str.split() to_be_matched_new[self._column_matching] = name.apply( - lambda word: self._select_top_words(word, preprocess_common_words_inst, occurrence_count)) + lambda word: self._select_top_words(word, word_counts, occurrence_count)) return to_be_matched_new @@ -265,7 +263,8 @@ def load_and_process_master_data(self, default: True """ self._column = column - self._df_matching_data = df_matching_data + self._df_matching_data = df_matching_data + self._original_index = df_matching_data.index if start_processing: self._process_matching_data(transform) @@ -327,8 +326,6 @@ def match_names(self, self._process_matching_data() to_be_matched = self.preprocess(to_be_matched, self._column_matching) - original_index = to_be_matched.index - if self._verbose: tqdm.write('preprocessing complete \n searching for matches...\n') @@ -338,7 +335,7 @@ def match_names(self, if self._preprocess_split: self._possible_matches = np.hstack((self._search_for_possible_matches( self._preprocess_reduce(to_be_matched)), self._possible_matches)) - + if self._verbose: tqdm.write('possible matches found \n fuzzy matching...\n') data_matches = to_be_matched.progress_apply(lambda x: self.fuzzy_matches( @@ -352,7 +349,7 @@ def match_names(self, 'score_0': 'score', 'match_index_0': 'match_index'}) if is_dataframe and self._original_indexes: for col in data_matches.columns[data_matches.columns.str.contains('match_index')]: - data_matches[col] = original_index[data_matches[col].astype(int).fillna(0)] + data_matches[col] = self._original_index[data_matches[col].astype(int).fillna(0)] if self._verbose: tqdm.write('done') @@ -384,7 +381,7 @@ def fuzzy_matches(self, indexes = np.array([[f'match_name_{num}', f'score_{num}', f'match_index_{num}'] for num in range(self._number_of_matches)]).flatten() - match = pd.Series(index=np.append('original_name', indexes), dtype=float) + match = pd.Series(index=np.append('original_name', indexes), dtype=object) match['original_name'] = to_be_matched[self._column_matching] list_possible_matches = self._df_matching_data.iloc[ possible_matches.flatten(), :][self._column].values diff --git a/name_matching/test/test_name_matcher.py b/name_matching/test/test_name_matcher.py index 1b86551..f19b8fe 100644 --- a/name_matching/test/test_name_matcher.py +++ b/name_matching/test/test_name_matcher.py @@ -433,20 +433,24 @@ def test_do_name_matching_full(name_match, adjusted_name): result = name_match.match_names(adjusted_name, 'company_name') assert np.sum(result['match_index'] == result.index) == 491 -def test_do_name_matching_full(name_match, adjusted_name): - new_index = np.random.choice(range(100000), size=len(adjusted_name), replace=False) - adjusted_name_random_index = adjusted_name.set_index(new_index) - result = name_match.match_names(adjusted_name_random_index, 'company_name') - assert np.sum(result['match_index'] == result.index) == 491 - -def test_do_name_matching_full(adjusted_name, original_name): - new_index = np.random.choice(range(100000), size=len(adjusted_name), replace=False) - adjusted_name_random_index = adjusted_name.set_index(new_index) - name_match = nm.NameMatcher(row_numbers=True) +@pytest.mark.parametrize("old_index, new_index, adjust, size_a, size_b, match_result", + [[10, 'new', False, 20, 20, 'new'], + [10, 'new', True, 20, 20, 10], + [10, 526, False, 20, 20, 526], + [10, 526, True, 20, 20, 10], + [4, 201, True, 20, 50, 4], + [8, 201, False, 20, 50, 201], + [8, 44, True, 50, 20, 8], + [4, 44, False, 50, 20, 44], + ]) +def test_do_name_matching_switch_index(original_name, old_index, new_index, adjust, size_a, size_b, match_result): + name_match = nm.NameMatcher(row_numbers=adjust, verbose=False) + adjusted_name = original_name.copy() + original_name = original_name.rename(index={old_index:new_index}) name_match.load_and_process_master_data( - 'company_name', original_name, start_processing=False, transform=False) - result = name_match.match_names(adjusted_name_random_index, 'company_name') - assert np.max(result['match_index']) <= len(adjusted_name_random_index) + 'company_name', original_name.iloc[:size_a,:], start_processing=False, transform=False) + result = name_match.match_names(adjusted_name.iloc[:size_b,:], 'company_name') + assert result.loc[old_index, 'match_index'] == match_result def test_do_name_matching_error(adjusted_name): name_match = nm.NameMatcher() @@ -526,7 +530,7 @@ def test_preprocess_word_list(preprocess_punctuations, output, input, x): def test_adjust_scores(num_matches, match_score, match, result, y): name_match = nm.NameMatcher(number_of_matches=num_matches) match = name_match._adjust_scores(match_score, match) - assert match[y] == result + assert match.iloc[y] == result @pytest.mark.parametrize("string, stringlist, result_1, result_2, y", diff --git a/setup.py b/setup.py index b1903e4..7d8533f 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name='name_matching', - version='0.8.8', + version='0.8.9', description='A package for the matching of company names', author='Michiel Nijhuis', author_email='m.nijhuis@dnb.nl',