Skip to content

Commit

Permalink
Merge pull request #29 from DeNederlandscheBank/dev
Browse files Browse the repository at this point in the history
numpy 2.0.0 upgrade
  • Loading branch information
mnijhuis-dnb authored Jun 26, 2024
2 parents eabeff0 + 7fddeda commit 9a9f4db
Show file tree
Hide file tree
Showing 11 changed files with 1,826 additions and 1,015 deletions.
2 changes: 1 addition & 1 deletion distances/_discounted_levenshtein.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def _alignment_matrix(
else:
discount_from = [1, 1]

d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float_)
d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float64)
if backtrace:
trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
for i in range(1, src_len + 1):
Expand Down
2 changes: 1 addition & 1 deletion distances/_editex.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from typing import Any, Tuple, cast
from unicodedata import normalize as unicode_normalize

from numpy import float_ as np_float
from numpy import float64 as np_float
from numpy import zeros as np_zeros

from ._distance import _Distance
Expand Down
2 changes: 1 addition & 1 deletion distances/_levenshtein.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def _alignment_matrix(
tar_len = len(tar)
max_len = max(src_len, tar_len)

d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float_)
d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float64)
if backtrace:
trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
for i in range(src_len + 1):
Expand Down
2 changes: 1 addition & 1 deletion distances/_typo.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from math import log
from typing import Any, Dict, Tuple, cast

from numpy import float_ as np_float
from numpy import float64 as np_float
from numpy import zeros as np_zeros

from ._distance import _Distance
Expand Down
201 changes: 105 additions & 96 deletions name_matching/distance_metrics.py

Large diffs are not rendered by default.

441 changes: 254 additions & 187 deletions name_matching/name_matcher.py

Large diffs are not rendered by default.

290 changes: 178 additions & 112 deletions name_matching/run_nm.py

Large diffs are not rendered by default.

99 changes: 61 additions & 38 deletions name_matching/sparse_cosine.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
import numpy as np
from tqdm import tqdm
# from numba import jit
from scipy.sparse import csc_matrix, coo_matrix
from typing import Union

# @jit(nopython=True, fastmath=True)
def _sparse_cosine_low_memory(matrix_row: np.array,
matrix_col: np.array,
matrix_data: np.array,
matrix_len: int,
vector_ind: np.array,
vector_data: np.array) -> np.array:

def _sparse_cosine_low_memory(
matrix_row: np.array,
matrix_col: np.array,
matrix_data: np.array,
matrix_len: int,
vector_ind: np.array,
vector_data: np.array,
) -> np.array:
"""
A sparse cosine simularity calculation between a matrix and a vector. The sparse matrix should be sorted
in ascending order based on the matrix_col values. The vector should be sorted based on the indexes in
in ascending order based on the matrix_col values. The vector should be sorted based on the indexes in
ascending order.
Parameters
Expand Down Expand Up @@ -46,17 +47,20 @@ def _sparse_cosine_low_memory(matrix_row: np.array,
if ind == len(vector_ind):
break
if col == vector_ind[ind]:
res[matrix_row[mat_ind]] = res[matrix_row[mat_ind]] + \
matrix_data[mat_ind] * vector_data[ind]
res[matrix_row[mat_ind]] = (
res[matrix_row[mat_ind]] + matrix_data[mat_ind] * vector_data[ind]
)

return res


def _sparse_cosine_top_n_standard(matrix_a: csc_matrix,
matrix_b: csc_matrix,
number_of_rows_at_once: int,
top_n: int,
verbose: bool) -> np.array:
def _sparse_cosine_top_n_standard(
matrix_a: csc_matrix,
matrix_b: csc_matrix,
number_of_rows_at_once: int,
top_n: int,
verbose: bool,
) -> np.array:
"""
A function for sparse matrix multiplication followed by an argpartition to
only take the top_n indexes.
Expand All @@ -82,40 +86,51 @@ def _sparse_cosine_top_n_standard(matrix_a: csc_matrix,
"""

results_arg = np.zeros(
(matrix_b.shape[0], top_n), dtype=np.float32)
results_arg = np.zeros((matrix_b.shape[0], top_n), dtype=np.float32)

# Split up the matrice in a certain number of rows
for j in tqdm(range(0, matrix_b.shape[0], number_of_rows_at_once), disable=not verbose):
for j in tqdm(
range(0, matrix_b.shape[0], number_of_rows_at_once), disable=not verbose
):
number_of_rows_at_once_min = min(
[number_of_rows_at_once, matrix_b.shape[0]-j])
matrix_b_temp = matrix_b[j:j+number_of_rows_at_once_min, :]
[number_of_rows_at_once, matrix_b.shape[0] - j]
)
matrix_b_temp = matrix_b[j : j + number_of_rows_at_once_min, :]

# Calculate the matrix dot product
results_full = (matrix_a * (matrix_b_temp.T)).tocsc()

# For each of the rows of the original matrix select the argpartition
for i in range(number_of_rows_at_once_min):
results_full_temp = results_full.data[results_full.indptr[i]:results_full.indptr[i+1]]
results_full_temp = results_full.data[
results_full.indptr[i] : results_full.indptr[i + 1]
]

# If there are more results then top_n only select the top_n results
if len(results_full_temp) > top_n:
ind = results_full.indices[results_full.indptr[i]:results_full.indptr[i+1]]
results_arg[j + i, :] = ind[np.argpartition(
results_full_temp, -top_n)[-top_n:]]

ind = results_full.indices[
results_full.indptr[i] : results_full.indptr[i + 1]
]
results_arg[j + i, :] = ind[
np.argpartition(results_full_temp, -top_n)[-top_n:]
]

# else just select all the results
else:
results_arg[j + i, :len(results_full_temp)
] = results_full.indices[results_full.indptr[i]:results_full.indptr[i+1]]
results_arg[j + i, : len(results_full_temp)] = results_full.indices[
results_full.indptr[i] : results_full.indptr[i + 1]
]
return results_arg

def sparse_cosine_top_n(matrix_a: Union[csc_matrix, coo_matrix],
matrix_b: csc_matrix,
top_n: int,
low_memory: bool,
number_of_rows: int,
verbose: bool):

def sparse_cosine_top_n(
matrix_a: Union[csc_matrix, coo_matrix],
matrix_b: csc_matrix,
top_n: int,
low_memory: bool,
number_of_rows: int,
verbose: bool,
):
"""
Calculates the top_n cosine matches between matrix_a and matrix_b. Takes into account
the amount of memory that should be used based on the low_memory int
Expand All @@ -131,7 +146,7 @@ def sparse_cosine_top_n(matrix_a: Union[csc_matrix, coo_matrix],
low_memory: bool
A bool indicating whether the low memory sparse cosine approach should be used
number_of_rows: int
An int inidcating the number of rows which should be
An int inidcating the number of rows which should be
processed at once when calculating the cosine simalarity
verbose: bool
A boolean indicating whether the progress should be printed
Expand All @@ -144,11 +159,19 @@ def sparse_cosine_top_n(matrix_a: Union[csc_matrix, coo_matrix],
"""
if low_memory:
matrix_b.sort_indices()
res = _sparse_cosine_low_memory(matrix_a.row, matrix_a.col, matrix_a.data,
matrix_a.shape[0], matrix_b.indices, matrix_b.data)
res = _sparse_cosine_low_memory(
matrix_a.row,
matrix_a.col,
matrix_a.data,
matrix_a.shape[0],
matrix_b.indices,
matrix_b.data,
)

top_n_adjusted = -np.min([top_n, len(res)])

return np.argpartition(res, top_n_adjusted, axis=0)[top_n_adjusted:]
else:
return _sparse_cosine_top_n_standard(matrix_a, matrix_b, number_of_rows, top_n, verbose)
return _sparse_cosine_top_n_standard(
matrix_a, matrix_b, number_of_rows, top_n, verbose
)
Loading

0 comments on commit 9a9f4db

Please sign in to comment.