Fix incorrect index bug: map match_index to the master data's original index

DeNederlandscheBank · Nov 3, 2023 · eca8a36
1 parent 28dbd49
commit eca8a36
Show file tree

Hide file tree

Showing 3 changed files with 26 additions and 25 deletions.
diff --git a/name_matching/name_matcher.py b/name_matching/name_matcher.py
@@ -125,7 +125,6 @@ def __init__(self,
         self._postprocess_company_legal_id = legal_suffixes
         self._postprocess_common_words = common_words
 
-        self._preprocess_common_words = pd.Series(dtype=float)
         self._preprocess_split = preprocess_split
         self._cut_off = cut_off_no_scoring_words
 
@@ -134,6 +133,7 @@ def __init__(self,
                 'legal', self._word_set, self._cut_off)
 
         self._original_indexes = not row_numbers
+        self._original_index = None
 
         self.set_distance_metrics(distance_metrics)
 
@@ -235,12 +235,10 @@ def _preprocess_reduce(self,
         individual_words = to_be_matched[self._column_matching].str.split(
             expand=True).stack()
         word_counts = individual_words.value_counts()
-        preprocess_common_words_inst = pd.concat([self._preprocess_common_words,
-            word_counts]).fillna(0)
         to_be_matched_new = to_be_matched.copy()
         name = to_be_matched[self._column_matching].str.split()
         to_be_matched_new[self._column_matching] = name.apply(
-            lambda word: self._select_top_words(word, preprocess_common_words_inst, occurrence_count))
+            lambda word: self._select_top_words(word, word_counts, occurrence_count))
 
         return to_be_matched_new
 
@@ -265,7 +263,8 @@ def load_and_process_master_data(self,
             default: True
         """
         self._column = column
-        self._df_matching_data = df_matching_data
+        self._df_matching_data = df_matching_data        
+        self._original_index = df_matching_data.index
         if start_processing:
             self._process_matching_data(transform)
 
@@ -327,8 +326,6 @@ def match_names(self,
             self._process_matching_data()
         to_be_matched = self.preprocess(to_be_matched, self._column_matching)
 
-        original_index = to_be_matched.index
-
         if self._verbose:
             tqdm.write('preprocessing complete \n searching for matches...\n')
 
@@ -338,7 +335,7 @@ def match_names(self,
         if self._preprocess_split:
             self._possible_matches = np.hstack((self._search_for_possible_matches(
                 self._preprocess_reduce(to_be_matched)), self._possible_matches))
-
+        
         if self._verbose:
             tqdm.write('possible matches found   \n fuzzy matching...\n')
             data_matches = to_be_matched.progress_apply(lambda x: self.fuzzy_matches(
@@ -352,7 +349,7 @@ def match_names(self,
                                                         'score_0': 'score', 'match_index_0': 'match_index'})
         if is_dataframe and self._original_indexes:
             for col in data_matches.columns[data_matches.columns.str.contains('match_index')]:
-                data_matches[col] = original_index[data_matches[col].astype(int).fillna(0)]
+                data_matches[col] = self._original_index[data_matches[col].astype(int).fillna(0)]
 
         if self._verbose:
             tqdm.write('done')
@@ -384,7 +381,7 @@ def fuzzy_matches(self,
 
         indexes = np.array([[f'match_name_{num}', f'score_{num}', f'match_index_{num}']
                             for num in range(self._number_of_matches)]).flatten()
-        match = pd.Series(index=np.append('original_name', indexes), dtype=float)
+        match = pd.Series(index=np.append('original_name', indexes), dtype=object)
         match['original_name'] = to_be_matched[self._column_matching]
         list_possible_matches = self._df_matching_data.iloc[
             possible_matches.flatten(), :][self._column].values

diff --git a/name_matching/test/test_name_matcher.py b/name_matching/test/test_name_matcher.py
@@ -433,20 +433,24 @@ def test_do_name_matching_full(name_match, adjusted_name):
     result = name_match.match_names(adjusted_name, 'company_name')
     assert np.sum(result['match_index'] == result.index) == 491
 
-def test_do_name_matching_full(name_match, adjusted_name):
-    new_index = np.random.choice(range(100000), size=len(adjusted_name), replace=False)
-    adjusted_name_random_index = adjusted_name.set_index(new_index)
-    result = name_match.match_names(adjusted_name_random_index, 'company_name')
-    assert np.sum(result['match_index'] == result.index) == 491
-
-def test_do_name_matching_full(adjusted_name, original_name):
-    new_index = np.random.choice(range(100000), size=len(adjusted_name), replace=False)
-    adjusted_name_random_index = adjusted_name.set_index(new_index)
-    name_match = nm.NameMatcher(row_numbers=True)
+@pytest.mark.parametrize("old_index, new_index, adjust, size_a, size_b, match_result",
+                        [[10, 'new', False, 20, 20, 'new'],
+                         [10, 'new', True, 20, 20, 10],
+                         [10, 526, False, 20, 20, 526],
+                         [10, 526, True, 20, 20, 10],
+                         [4, 201, True, 20, 50, 4],
+                         [8, 201, False, 20, 50, 201],
+                         [8, 44, True, 50, 20, 8],
+                         [4, 44, False, 50, 20, 44],
+                        ])
+def test_do_name_matching_switch_index(original_name, old_index, new_index, adjust, size_a, size_b, match_result):
+    name_match = nm.NameMatcher(row_numbers=adjust, verbose=False)
+    adjusted_name = original_name.copy()
+    original_name = original_name.rename(index={old_index:new_index})
     name_match.load_and_process_master_data(
-        'company_name', original_name, start_processing=False, transform=False)
-    result = name_match.match_names(adjusted_name_random_index, 'company_name')
-    assert np.max(result['match_index']) <= len(adjusted_name_random_index)
+        'company_name', original_name.iloc[:size_a,:], start_processing=False, transform=False)
+    result = name_match.match_names(adjusted_name.iloc[:size_b,:], 'company_name')
+    assert result.loc[old_index, 'match_index'] == match_result
 
 def test_do_name_matching_error(adjusted_name):
     name_match = nm.NameMatcher()
@@ -526,7 +530,7 @@ def test_preprocess_word_list(preprocess_punctuations, output, input, x):
 def test_adjust_scores(num_matches, match_score, match, result, y):
     name_match = nm.NameMatcher(number_of_matches=num_matches)
     match = name_match._adjust_scores(match_score, match)
-    assert match[y] == result
+    assert match.iloc[y] == result
 
 
 @pytest.mark.parametrize("string, stringlist, result_1, result_2, y",

diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@
 
 setup(
    name='name_matching',
-   version='0.8.8',
+   version='0.8.9',
    description='A package for the matching of company names',
    author='Michiel Nijhuis',
    author_email='[email protected]',