Skip to content

Commit

Permalink
incorrect index bug fix
Browse files (browse the repository at this point in the history)
  • Loading branch information
mnijhuis-dnb committed Nov 3, 2023
1 parent 28dbd49 commit eca8a36
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 25 deletions.
17 changes: 7 additions & 10 deletions name_matching/name_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ def __init__(self,
self._postprocess_company_legal_id = legal_suffixes
self._postprocess_common_words = common_words

self._preprocess_common_words = pd.Series(dtype=float)
self._preprocess_split = preprocess_split
self._cut_off = cut_off_no_scoring_words

Expand All @@ -134,6 +133,7 @@ def __init__(self,
'legal', self._word_set, self._cut_off)

self._original_indexes = not row_numbers
self._original_index = None

self.set_distance_metrics(distance_metrics)

Expand Down Expand Up @@ -235,12 +235,10 @@ def _preprocess_reduce(self,
individual_words = to_be_matched[self._column_matching].str.split(
expand=True).stack()
word_counts = individual_words.value_counts()
preprocess_common_words_inst = pd.concat([self._preprocess_common_words,
word_counts]).fillna(0)
to_be_matched_new = to_be_matched.copy()
name = to_be_matched[self._column_matching].str.split()
to_be_matched_new[self._column_matching] = name.apply(
lambda word: self._select_top_words(word, preprocess_common_words_inst, occurrence_count))
lambda word: self._select_top_words(word, word_counts, occurrence_count))

return to_be_matched_new

Expand All @@ -265,7 +263,8 @@ def load_and_process_master_data(self,
default: True
"""
self._column = column
self._df_matching_data = df_matching_data
self._df_matching_data = df_matching_data
self._original_index = df_matching_data.index
if start_processing:
self._process_matching_data(transform)

Expand Down Expand Up @@ -327,8 +326,6 @@ def match_names(self,
self._process_matching_data()
to_be_matched = self.preprocess(to_be_matched, self._column_matching)

original_index = to_be_matched.index

if self._verbose:
tqdm.write('preprocessing complete \n searching for matches...\n')

Expand All @@ -338,7 +335,7 @@ def match_names(self,
if self._preprocess_split:
self._possible_matches = np.hstack((self._search_for_possible_matches(
self._preprocess_reduce(to_be_matched)), self._possible_matches))

if self._verbose:
tqdm.write('possible matches found \n fuzzy matching...\n')
data_matches = to_be_matched.progress_apply(lambda x: self.fuzzy_matches(
Expand All @@ -352,7 +349,7 @@ def match_names(self,
'score_0': 'score', 'match_index_0': 'match_index'})
if is_dataframe and self._original_indexes:
for col in data_matches.columns[data_matches.columns.str.contains('match_index')]:
data_matches[col] = original_index[data_matches[col].astype(int).fillna(0)]
data_matches[col] = self._original_index[data_matches[col].astype(int).fillna(0)]

if self._verbose:
tqdm.write('done')
Expand Down Expand Up @@ -384,7 +381,7 @@ def fuzzy_matches(self,

indexes = np.array([[f'match_name_{num}', f'score_{num}', f'match_index_{num}']
for num in range(self._number_of_matches)]).flatten()
match = pd.Series(index=np.append('original_name', indexes), dtype=float)
match = pd.Series(index=np.append('original_name', indexes), dtype=object)
match['original_name'] = to_be_matched[self._column_matching]
list_possible_matches = self._df_matching_data.iloc[
possible_matches.flatten(), :][self._column].values
Expand Down
32 changes: 18 additions & 14 deletions name_matching/test/test_name_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,20 +433,24 @@ def test_do_name_matching_full(name_match, adjusted_name):
result = name_match.match_names(adjusted_name, 'company_name')
assert np.sum(result['match_index'] == result.index) == 491

def test_do_name_matching_full(name_match, adjusted_name):
new_index = np.random.choice(range(100000), size=len(adjusted_name), replace=False)
adjusted_name_random_index = adjusted_name.set_index(new_index)
result = name_match.match_names(adjusted_name_random_index, 'company_name')
assert np.sum(result['match_index'] == result.index) == 491

def test_do_name_matching_full(adjusted_name, original_name):
new_index = np.random.choice(range(100000), size=len(adjusted_name), replace=False)
adjusted_name_random_index = adjusted_name.set_index(new_index)
name_match = nm.NameMatcher(row_numbers=True)
@pytest.mark.parametrize("old_index, new_index, adjust, size_a, size_b, match_result",
[[10, 'new', False, 20, 20, 'new'],
[10, 'new', True, 20, 20, 10],
[10, 526, False, 20, 20, 526],
[10, 526, True, 20, 20, 10],
[4, 201, True, 20, 50, 4],
[8, 201, False, 20, 50, 201],
[8, 44, True, 50, 20, 8],
[4, 44, False, 50, 20, 44],
])
def test_do_name_matching_switch_index(original_name, old_index, new_index, adjust, size_a, size_b, match_result):
name_match = nm.NameMatcher(row_numbers=adjust, verbose=False)
adjusted_name = original_name.copy()
original_name = original_name.rename(index={old_index:new_index})
name_match.load_and_process_master_data(
'company_name', original_name, start_processing=False, transform=False)
result = name_match.match_names(adjusted_name_random_index, 'company_name')
assert np.max(result['match_index']) <= len(adjusted_name_random_index)
'company_name', original_name.iloc[:size_a,:], start_processing=False, transform=False)
result = name_match.match_names(adjusted_name.iloc[:size_b,:], 'company_name')
assert result.loc[old_index, 'match_index'] == match_result

def test_do_name_matching_error(adjusted_name):
name_match = nm.NameMatcher()
Expand Down Expand Up @@ -526,7 +530,7 @@ def test_preprocess_word_list(preprocess_punctuations, output, input, x):
def test_adjust_scores(num_matches, match_score, match, result, y):
name_match = nm.NameMatcher(number_of_matches=num_matches)
match = name_match._adjust_scores(match_score, match)
assert match[y] == result
assert match.iloc[y] == result


@pytest.mark.parametrize("string, stringlist, result_1, result_2, y",
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setup(
name='name_matching',
version='0.8.8',
version='0.8.9',
description='A package for the matching of company names',
author='Michiel Nijhuis',
author_email='[email protected]',
Expand Down

0 comments on commit eca8a36

Please sign in to comment.