Merge pull request #29 from DeNederlandscheBank/dev

numpy 2.0.0 upgrade
DeNederlandscheBank · Jun 26, 2024 · 9a9f4db · 9a9f4db
2 parents eabeff0 + 7fddeda
commit 9a9f4db
Show file tree

Hide file tree

Showing 11 changed files with 1,826 additions and 1,015 deletions.
diff --git a/distances/_discounted_levenshtein.py b/distances/_discounted_levenshtein.py
@@ -175,7 +175,7 @@ def _alignment_matrix(
         else:
             discount_from = [1, 1]
 
-        d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float_)
+        d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float64)
         if backtrace:
             trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
         for i in range(1, src_len + 1):

diff --git a/distances/_editex.py b/distances/_editex.py
@@ -23,7 +23,7 @@
 from typing import Any, Tuple, cast
 from unicodedata import normalize as unicode_normalize
 
-from numpy import float_ as np_float
+from numpy import float64 as np_float
 from numpy import zeros as np_zeros
 
 from ._distance import _Distance

diff --git a/distances/_levenshtein.py b/distances/_levenshtein.py
@@ -138,7 +138,7 @@ def _alignment_matrix(
         tar_len = len(tar)
         max_len = max(src_len, tar_len)
 
-        d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float_)
+        d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float64)
         if backtrace:
             trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
         for i in range(src_len + 1):

diff --git a/distances/_typo.py b/distances/_typo.py
@@ -23,7 +23,7 @@
 from math import log
 from typing import Any, Dict, Tuple, cast
 
-from numpy import float_ as np_float
+from numpy import float64 as np_float
 from numpy import zeros as np_zeros
 
 from ._distance import _Distance

diff --git a/name_matching/distance_metrics.py b/name_matching/distance_metrics.py
diff --git a/name_matching/name_matcher.py b/name_matching/name_matcher.py
diff --git a/name_matching/run_nm.py b/name_matching/run_nm.py
diff --git a/name_matching/sparse_cosine.py b/name_matching/sparse_cosine.py
@@ -1,19 +1,20 @@
 import numpy as np
 from tqdm import tqdm
-# from numba import jit
 from scipy.sparse import csc_matrix, coo_matrix
 from typing import Union
 
-# @jit(nopython=True, fastmath=True)
-def _sparse_cosine_low_memory(matrix_row: np.array,
-                              matrix_col: np.array,
-                              matrix_data: np.array,
-                              matrix_len: int,
-                              vector_ind: np.array,
-                              vector_data: np.array) -> np.array:
+
+def _sparse_cosine_low_memory(
+    matrix_row: np.array,
+    matrix_col: np.array,
+    matrix_data: np.array,
+    matrix_len: int,
+    vector_ind: np.array,
+    vector_data: np.array,
+) -> np.array:
     """
     A sparse cosine simularity calculation between a matrix and a vector. The sparse matrix should be sorted
-    in ascending order based on the matrix_col values. The vector should be sorted based on the indexes in 
+    in ascending order based on the matrix_col values. The vector should be sorted based on the indexes in
     ascending order.
 
     Parameters
@@ -46,17 +47,20 @@ def _sparse_cosine_low_memory(matrix_row: np.array,
             if ind == len(vector_ind):
                 break
         if col == vector_ind[ind]:
-            res[matrix_row[mat_ind]] = res[matrix_row[mat_ind]] + \
-                matrix_data[mat_ind] * vector_data[ind]
+            res[matrix_row[mat_ind]] = (
+                res[matrix_row[mat_ind]] + matrix_data[mat_ind] * vector_data[ind]
+            )
 
     return res
 
 
-def _sparse_cosine_top_n_standard(matrix_a: csc_matrix,
-                                  matrix_b: csc_matrix,
-                                  number_of_rows_at_once: int,
-                                  top_n: int,
-                                  verbose: bool) -> np.array:
+def _sparse_cosine_top_n_standard(
+    matrix_a: csc_matrix,
+    matrix_b: csc_matrix,
+    number_of_rows_at_once: int,
+    top_n: int,
+    verbose: bool,
+) -> np.array:
     """
     A function for sparse matrix multiplication followed by an argpartition to
     only take the top_n indexes.
@@ -82,40 +86,51 @@ def _sparse_cosine_top_n_standard(matrix_a: csc_matrix,
 
     """
 
-    results_arg = np.zeros(
-        (matrix_b.shape[0], top_n), dtype=np.float32)
+    results_arg = np.zeros((matrix_b.shape[0], top_n), dtype=np.float32)
 
     # Split up the matrice in a certain number of rows
-    for j in tqdm(range(0, matrix_b.shape[0], number_of_rows_at_once), disable=not verbose):
+    for j in tqdm(
+        range(0, matrix_b.shape[0], number_of_rows_at_once), disable=not verbose
+    ):
         number_of_rows_at_once_min = min(
-            [number_of_rows_at_once, matrix_b.shape[0]-j])
-        matrix_b_temp = matrix_b[j:j+number_of_rows_at_once_min, :]
+            [number_of_rows_at_once, matrix_b.shape[0] - j]
+        )
+        matrix_b_temp = matrix_b[j : j + number_of_rows_at_once_min, :]
 
         # Calculate the matrix dot product
         results_full = (matrix_a * (matrix_b_temp.T)).tocsc()
 
         # For each of the rows of the original matrix select the argpartition
         for i in range(number_of_rows_at_once_min):
-            results_full_temp = results_full.data[results_full.indptr[i]:results_full.indptr[i+1]]
+            results_full_temp = results_full.data[
+                results_full.indptr[i] : results_full.indptr[i + 1]
+            ]
 
             # If there are more results then top_n only select the top_n results
             if len(results_full_temp) > top_n:
-                ind = results_full.indices[results_full.indptr[i]:results_full.indptr[i+1]]
-                results_arg[j + i, :] = ind[np.argpartition(
-                    results_full_temp, -top_n)[-top_n:]]
-
+                ind = results_full.indices[
+                    results_full.indptr[i] : results_full.indptr[i + 1]
+                ]
+                results_arg[j + i, :] = ind[
+                    np.argpartition(results_full_temp, -top_n)[-top_n:]
+                ]
+
             # else just select all the results
             else:
-                results_arg[j + i, :len(results_full_temp)
-                            ] = results_full.indices[results_full.indptr[i]:results_full.indptr[i+1]]
+                results_arg[j + i, : len(results_full_temp)] = results_full.indices[
+                    results_full.indptr[i] : results_full.indptr[i + 1]
+                ]
     return results_arg
 
-def sparse_cosine_top_n(matrix_a: Union[csc_matrix, coo_matrix], 
-                        matrix_b: csc_matrix, 
-                        top_n: int, 
-                        low_memory: bool,
-                        number_of_rows: int,
-                        verbose: bool):
+
+def sparse_cosine_top_n(
+    matrix_a: Union[csc_matrix, coo_matrix],
+    matrix_b: csc_matrix,
+    top_n: int,
+    low_memory: bool,
+    number_of_rows: int,
+    verbose: bool,
+):
     """
     Calculates the top_n cosine matches between matrix_a and matrix_b. Takes into account
     the amount of  memory that should be used based on the low_memory int
@@ -131,7 +146,7 @@ def sparse_cosine_top_n(matrix_a: Union[csc_matrix, coo_matrix],
     low_memory: bool
         A bool indicating whether the low memory sparse cosine approach should be used
     number_of_rows: int
-        An int inidcating the number of rows which should be 
+        An int inidcating the number of rows which should be
         processed at once when calculating the cosine simalarity
     verbose: bool
         A boolean indicating whether the progress should be printed
@@ -144,11 +159,19 @@ def sparse_cosine_top_n(matrix_a: Union[csc_matrix, coo_matrix],
     """
     if low_memory:
         matrix_b.sort_indices()
-        res = _sparse_cosine_low_memory(matrix_a.row, matrix_a.col, matrix_a.data,
-                        matrix_a.shape[0], matrix_b.indices, matrix_b.data)
+        res = _sparse_cosine_low_memory(
+            matrix_a.row,
+            matrix_a.col,
+            matrix_a.data,
+            matrix_a.shape[0],
+            matrix_b.indices,
+            matrix_b.data,
+        )
 
         top_n_adjusted = -np.min([top_n, len(res)])
 
         return np.argpartition(res, top_n_adjusted, axis=0)[top_n_adjusted:]
     else:
-        return _sparse_cosine_top_n_standard(matrix_a, matrix_b, number_of_rows, top_n, verbose)
+        return _sparse_cosine_top_n_standard(
+            matrix_a, matrix_b, number_of_rows, top_n, verbose
+        )