Skip to content

Commit

Permalink
Merge pull request #33 from DeNederlandscheBank/dev
Browse files (browse the repository at this point in the history)
  • Loading branch information
mnijhuis-dnb authored Aug 15, 2024
2 parents 9a9f4db + 3078a7d commit cdbfb4f
Show file tree
Hide file tree
Showing 8 changed files with 838 additions and 512 deletions.
221 changes: 124 additions & 97 deletions name_matching/name_matcher.py

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions name_matching/run_nm.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,10 @@ def _match_names_preprocess_data(
data_first[column] = data_first[column].str.lower().str.strip()
data_second[column] = data_second[column].str.lower().str.strip()
if not punctuation_sensitive:
data_first[column] = data_first[column].str.replace("[^\w\s]", "", regex=True)
data_second[column] = data_second[column].str.replace("[^\w\s]", "", regex=True)
data_first[column] = data_first[column].str.replace(r"[^\w\s]", "", regex=True)
data_second[column] = data_second[column].str.replace(
r"[^\w\s]", "", regex=True
)
if not special_character_sensitive:
data_first[column] = data_first[column].apply(
lambda string: normalize("NFKD", string).encode("ASCII", "ignore").decode()
Expand Down
938 changes: 600 additions & 338 deletions name_matching/test/test_distance_metrics.py

Large diffs are not rendered by default.

51 changes: 25 additions & 26 deletions name_matching/test/test_name_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,12 +401,12 @@ def test_preprocess(
@pytest.mark.parametrize(
"low_memory, ngrams, result_1, result_2, result_3",
[
[1, (5, 6), 0.00689, 0.00892, 0.02242],
[1, (5, 6), 0.00689, 0.00892, 0.0293],
[6, (2, 3), 0.01044, 0.01092, 0.035],
[8, (1, 2), 0.02729, 0.02783, 0.02324],
[0, (5, 6), 0.00689, 0.00892, 0.02242],
[8, (1, 2), 0.02729, 0.02783, 0.0360],
[0, (5, 6), 0.00689, 0.00892, 0.0293],
[0, (2, 3), 0.01044, 0.01092, 0.035],
[0, (1, 2), 0.02729, 0.02783, 0.02324],
[0, (1, 2), 0.02729, 0.02783, 0.036],
],
)
def test_transform_data(name_match, low_memory, ngrams, result_1, result_2, result_3):
Expand Down Expand Up @@ -608,7 +608,7 @@ def test_score_matches(to_be_matched, possible_matches, metrics, result):
2,
np.array([[0.9, 0.3, 0.5, 0.2, 0.1], [0.6, 0.7, 0.8, 0.4, 0.5]]),
["weighted_jaccard", "discounted_levenshtein"],
[0, 1],
[0, 2],
),
(
3,
Expand All @@ -620,7 +620,7 @@ def test_score_matches(to_be_matched, possible_matches, metrics, result):
]
),
["weighted_jaccard", "discounted_levenshtein", "iterative_sub_string"],
[2, 1, 1],
[0, 2, 0],
),
(
2,
Expand All @@ -632,7 +632,7 @@ def test_score_matches(to_be_matched, possible_matches, metrics, result):
]
),
["tichy", "overlap", "bag"],
[2, 1],
[0, 0],
),
(
2,
Expand All @@ -650,7 +650,7 @@ def test_score_matches(to_be_matched, possible_matches, metrics, result):
]
),
["weighted_jaccard", "overlap", "iterative_sub_string"],
[1],
[0],
),
(
2,
Expand All @@ -662,30 +662,30 @@ def test_score_matches(to_be_matched, possible_matches, metrics, result):
]
),
["weighted_jaccard", "overlap", "bag"],
[1, 0],
[0, 2],
),
(1, np.array([[0.3, 0.3, 0.8, 0.2, 0.2]]), ["weighted_jaccard"], [0]),
(1, np.array([[0.3, 0.3, 0.8, 0.2, 0.2]]), ["weighted_jaccard"], [2]),
(
3,
np.array([[0.3, 0.3, 0.8, 0.2, 0.2], [0.3, 0.3, 0.8, 0.1, 0.1]]),
np.array([[0.3, 0.4, 0.8, 0.2, 0.2], [0.3, 0.3, 0.8, 0.1, 0.1]]),
["weighted_jaccard", "discounted_levenshtein"],
[0, 1],
[2, 1, 0],
),
(
2,
np.array([[0.3, 0.3, 0.2, 0.1, 0.02], [0.1, 0.1, 0.2, 0.3, 0.02]]),
["weighted_jaccard", "iterative_sub_string"],
[0, 0],
[0, 3],
),
(
1,
np.array([[0.3, 0.3, 0.2, 0.1, 0.02], [0.3, 0.3, 0.2, 0.3, 0.02]]),
["overlap", "iterative_sub_string"],
[1],
[0],
),
(1, np.array([[-0.5, -0.8, -0.3, -0.7, 0, 2]]), ["bag"], [0]),
(1, np.array([[-0.5, -0.8, -0.3, -0.7, 0, 2]]), ["BAG"], [0]),
(3, np.array([[10, 8, 7, 6, 12, 15, 14, 88]]), ["weighted_jaccard"], [0]),
(1, np.array([[-0.5, -0.8, -0.3, -0.7, 0, 2]]), ["bag"], [5]),
(1, np.array([[-0.5, -0.8, -0.3, -0.7, 0, 2]]), ["BAG"], [5]),
(3, np.array([[10, 8, 7, 6, 12, 15, 14, 88]]), ["weighted_jaccard"], [7, 5, 6]),
(
2,
np.array([[1, 0.3], [0.1, 0.4]]),
Expand All @@ -698,9 +698,8 @@ def test_rate_matches(number_of_matches, match_score, metrics, result):
name_match = nm.NameMatcher()
name_match._number_of_matches = number_of_matches
name_match.set_distance_metrics(metrics)
ind = name_match._rate_matches(match_score)
print(ind)
assert len(ind) == np.min([number_of_matches, match_score.shape[0]])
ind = name_match._rate_matches(match_score.T)
assert len(ind) == np.min([number_of_matches, match_score.shape[1]])
assert list(ind) == result


Expand Down Expand Up @@ -858,8 +857,8 @@ def test_search_for_possible_matches(
3,
np.array([29, 343, 126, 238, 445]),
pd.Series(["Company and Sons"], index=["company_name"]),
31.33,
31.77,
31.33,
),
(
False,
Expand Down Expand Up @@ -888,8 +887,8 @@ def test_search_for_possible_matches(
3,
np.array([29, 343, 126, 238, 445]),
pd.Series(["Company and Sons"], index=["company_name"]),
31.33,
31.77,
31.33,
),
(
False,
Expand Down Expand Up @@ -1138,15 +1137,15 @@ def test_process_words(words, string, stringlist, result_1, result_2, y):
@pytest.mark.parametrize(
"word_set, cut_off, result_1, result_2",
[
[set(), 0, 635, "Group"],
[set(), 0, 635, "and"],
[set(), 0, 640, "Group"],
[set(), 0, 640, "and"],
[set(), 0.1, 7, "Group"],
[set(), 0.1, 7, "LLC"],
[set(), 0.12, 7, "LLC"],
[set(), 0.2, 1, "and"],
[set(["apple"]), 1, 1, "apple"],
[set(["apple"]), 0, 636, "apple"],
[set(["apple"]), 0, 636, "Group"],
[set(["apple"]), 0, 641, "apple"],
[set(["apple"]), 0, 641, "Group"],
],
)
def test_process_common_words(name_match, word_set, cut_off, result_1, result_2):
Expand Down
116 changes: 76 additions & 40 deletions name_matching/test/test_run_nm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,66 +4,102 @@

import name_matching.run_nm as run_nm


@pytest.fixture
def original_name():
package_dir = path.dirname(path.dirname(path.dirname(path.abspath(__file__))))
return pd.read_csv(path.join(package_dir, 'test','test_names.csv'), index_col=0)
return pd.read_csv(path.join(package_dir, "test", "test_names.csv"), index_col=0)


@pytest.fixture
def adjusted_name():
package_dir = path.dirname(path.dirname(path.dirname(path.abspath(__file__))))
return pd.read_csv(path.join(package_dir, 'test','adjusted_test_names.csv'), index_col=0)
return pd.read_csv(
path.join(package_dir, "test", "adjusted_test_names.csv"), index_col=0
)

@pytest.mark.parametrize("series, column, group_column",
[[False, '', ''],
[False, 'column', ''],
[False, 'company_name', 'column'],
[True, 'company_name', 'column'],
[True, '', 'company_name']
])

@pytest.mark.parametrize(
"series, column, group_column",
[
[False, "", ""],
[False, "column", ""],
[False, "company_name", "column"],
[True, "company_name", "column"],
[True, "", "company_name"],
],
)
def test_match_names_check_data_errors(adjusted_name, series, column, group_column):
if series:
adjusted_name = adjusted_name['company_name']
adjusted_name = adjusted_name["company_name"]
with pytest.raises(ValueError):
run_nm._match_names_check_data(adjusted_name, column, group_column)

@pytest.mark.parametrize("series, column, group_column",
[[False, 'company_name', ''],
[True, 'company_name', '']
])

@pytest.mark.parametrize(
"series, column, group_column",
[[False, "company_name", ""], [True, "company_name", ""]],
)
def test_match_names_check_data(adjusted_name, series, column, group_column):
if series:
adjusted_name = adjusted_name['company_name']
adjusted_name = adjusted_name["company_name"]
data = run_nm._match_names_check_data(adjusted_name, column, group_column)
assert 'name_matching_data' in data
assert "name_matching_data" in data
assert type(data) == pd.DataFrame


@pytest.mark.parametrize("case_sensitive, punctuation_sensitive, special_character_sensitive, result_1, result_2",
[[True, True, True, 'Ösinski-Schinner', 'Osinski-Schinneg'],
[False, True, True, 'ösinski-schinner', 'osinski-schinneg'],
[True, False, True, 'ÖsinskiSchinner', 'OsinskiSchinneg'],
[True, True, False, 'Osinski-Schinner', 'Osinski-Schinneg'],
[False, False, True, 'ösinskischinner', 'osinskischinneg'],
[False, True, False, 'osinski-schinner', 'osinski-schinneg'],
[True, False, False, 'OsinskiSchinner', 'OsinskiSchinneg'],
[False, False, False, 'osinskischinner', 'osinskischinneg']
])
def test_match_names_preprocess_data(original_name, adjusted_name, case_sensitive, punctuation_sensitive, special_character_sensitive, result_1, result_2):
data_a, data_b = run_nm._match_names_preprocess_data('company_name', original_name, adjusted_name, case_sensitive, punctuation_sensitive, special_character_sensitive)
assert data_a['company_name'][432] == result_1
assert data_b['company_name'][432] == result_2
@pytest.mark.parametrize(
"case_sensitive, punctuation_sensitive, special_character_sensitive, result_1, result_2",
[
[True, True, True, "Ösinski-Schinner", "Osinski-Schinneg"],
[False, True, True, "ösinski-schinner", "osinski-schinneg"],
[True, False, True, "ÖsinskiSchinner", "OsinskiSchinneg"],
[True, True, False, "Osinski-Schinner", "Osinski-Schinneg"],
[False, False, True, "ösinskischinner", "osinskischinneg"],
[False, True, False, "osinski-schinner", "osinski-schinneg"],
[True, False, False, "OsinskiSchinner", "OsinskiSchinneg"],
[False, False, False, "osinskischinner", "osinskischinneg"],
],
)
def test_match_names_preprocess_data(
original_name,
adjusted_name,
case_sensitive,
punctuation_sensitive,
special_character_sensitive,
result_1,
result_2,
):
data_a, data_b = run_nm._match_names_preprocess_data(
"company_name",
original_name,
adjusted_name,
case_sensitive,
punctuation_sensitive,
special_character_sensitive,
)
assert data_a["company_name"][432] == result_1
assert data_b["company_name"][432] == result_2


def test_match_names_combine_data_1(original_name, adjusted_name):
adjusted_name.loc[89,'company_name'] = original_name.loc[89,'company_name']
original_name, adjusted_name = run_nm._match_names_preprocess_data('company_name', original_name, adjusted_name, False, False, False)
data_a = run_nm._match_names_combine_data(original_name, adjusted_name, 'company_name', 'company_name')
assert len(data_a) == 87
adjusted_name.loc[89, "company_name"] = original_name.loc[89, "company_name"]
original_name, adjusted_name = run_nm._match_names_preprocess_data(
"company_name", original_name, adjusted_name, False, False, False
)
data_a = run_nm._match_names_combine_data(
original_name, adjusted_name, "company_name", "company_name"
)
assert len(data_a) == 83
assert 342 not in data_a.index



def test_match_names_combine_data_2(original_name, adjusted_name):
original_name, adjusted_name = run_nm._match_names_preprocess_data('company_name', original_name, adjusted_name, False, False, False)
data_a = run_nm._match_names_combine_data(original_name, adjusted_name, 'company_name', 'company_name')
assert len(data_a) == 86
assert data_a.loc[341,'score'] == 100
original_name, adjusted_name = run_nm._match_names_preprocess_data(
"company_name", original_name, adjusted_name, False, False, False
)
data_a = run_nm._match_names_combine_data(
original_name, adjusted_name, "company_name", "company_name"
)
assert len(data_a) == 82
assert data_a.loc[341, "score"] == 100
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setup(
name="name_matching",
version="0.8.11",
version="0.8.12",
description="A package for the matching of company names",
author="Michiel Nijhuis",
author_email="[email protected]",
Expand Down
6 changes: 3 additions & 3 deletions test/adjusted_test_names.csv
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@
180,Lesch-Vandervort,IT
181,2dick Ltd,BE
182,Wae~chi and S{ns,FR
183,HillInc,BE
183,HillsInc,BE
184,"Wyman, Witling and Christiansen",FR
185,Herman-Zie-me,SP
186,"Schaefer, Haley7and Botsford",SP
Expand Down Expand Up @@ -392,7 +392,7 @@ Group",FR
389,Hxrtmana-Gaylord,BE
390,Steub}r9Bahringgr,LU
391,Paucek-O'Keefe,FR
392,"Sc,hinner Inc",DK
392,"Sc,hunner Inc",DK
393,"Altenwerth, Witting and Huelg",IT
394,Lueltgen-SchroedRr,SP
395,Moore PLC,BE
Expand Down Expand Up @@ -454,7 +454,7 @@ Group",FR
451,StantonPLC,SP
452,"Witking, Kxhic and Watdica",FR
453,Quigley PLC,SP
454,Lehner LLC,IT
454,Lehner and co LLC,IT
455,Hyppe-Koelpin,FR
456,Mitlhell GrWup,SP
457,Konopelski-Donnn[l},NL
Expand Down
10 changes: 5 additions & 5 deletions test/test_names.csv
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
71,Gaylord Inc,NL
72,"Koch, Casper and Batz",BE
73,O'Connell and Sons,UK
74,Wintheiser LLC,IT
74,Wintheiser New LLC,IT
75,Champlin PLC,DE
76,Dickinson-Cummerata,LU
77,Hettinger and Sons,NL
Expand Down Expand Up @@ -182,7 +182,7 @@
180,Lesch-Vandervort,IT
181,Blick Ltd,BE
182,Waelchi and Sons,FR
183,Hill Inc,BE
183,Hilltop Inc,BE
184,"Wyman, Witting and Christiansen",FR
185,Herman-Zieme,SP
186,"Schaefer, Haley and Botsford",SP
Expand Down Expand Up @@ -373,7 +373,7 @@
371,Medhurst-Cole,BE
372,Rutherford Inc,SP
373,Grady PLC,DE
374,Brown Group,DK
374,Brownish Group,DK
375,"Mayert, Tremblay and Mante",SP
376,Greenholt Ltd,LU
377,Lynch Ltd,DE
Expand All @@ -391,7 +391,7 @@
389,Hartmann-Gaylord,BE
390,Steuber-Bahringer,LU
391,Paucek-O'Keefe,FR
392,Schinner Inc,DK
392,Schinnerschine Inc,DK
393,"Altenwerth, Witting and Huels",IT
394,Luettgen-Schroeder,SP
395,Moore PLC,BE
Expand Down Expand Up @@ -453,7 +453,7 @@
451,Stanton PLC,SP
452,"Witting, Kuhic and Watsica",FR
453,Quigley PLC,SP
454,Lehner LLC,IT
454,Kehner LLC,IT
455,Hoppe-Koelpin,FR
456,Mitchell Group,SP
457,Konopelski-Donnelly,NL
Expand Down

0 comments on commit cdbfb4f

Please sign in to comment.