require py3.8+, modernize python code (dedupeio#1195)

ArVar · Jun 27, 2024 · 7d2c79b
1 parent 03cf52f
commit 7d2c79b
Show file tree

Hide file tree

Showing 22 changed files with 116 additions and 160 deletions.
diff --git a/benchmarks/benchmarks/canonical.py b/benchmarks/benchmarks/canonical.py
@@ -9,7 +9,7 @@
 
 def make_report(data, clustering):
     true_dupes = common.get_true_dupes(data)
-    predicted_dupes = set([])
+    predicted_dupes = set()
     for cluser_id, _ in clustering:
         for pair in combinations(cluser_id, 2):
             predicted_dupes.add(frozenset(pair))

diff --git a/benchmarks/benchmarks/canonical_gazetteer.py b/benchmarks/benchmarks/canonical_gazetteer.py
@@ -7,9 +7,9 @@
 
 def make_report(data, clustering):
     true_dupes = canonical_matching.get_true_dupes(data)
-    predicted_dupes = set(
+    predicted_dupes = {
         frozenset([a, b]) for a, result in clustering for b, score in result
-    )
+    }
     return common.Report.from_scores(true_dupes, predicted_dupes)
 
 

diff --git a/benchmarks/benchmarks/canonical_matching.py b/benchmarks/benchmarks/canonical_matching.py
@@ -15,7 +15,7 @@ def get_true_dupes(data):
 
 def make_report(data, clustering):
     true_dupes = get_true_dupes(data)
-    predicted_dupes = set(frozenset(pair) for pair, _ in clustering)
+    predicted_dupes = {frozenset(pair) for pair, _ in clustering}
     return common.Report.from_scores(true_dupes, predicted_dupes)
 
 

diff --git a/dedupe/_typing.py b/dedupe/_typing.py
@@ -1,5 +1,4 @@
 import os
-import sys
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -9,24 +8,21 @@
     Iterable,
     Iterator,
     List,
+    Literal,
     Mapping,
     MutableSequence,
+    Protocol,
     Sequence,
     Tuple,
     Type,
+    TypedDict,
     Union,
     runtime_checkable,
 )
 
 import numpy
 import numpy.typing
 
-if sys.version_info >= (3, 8):
-    from typing import Literal, Protocol, TypedDict
-else:
-    from typing_extensions import Literal, Protocol, TypedDict
-
-
 if TYPE_CHECKING:
     from dedupe.predicates import Predicate
 

diff --git a/dedupe/api.py b/dedupe/api.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 """
 dedupe provides the main user interface for the library the
 Dedupe class
@@ -29,15 +28,7 @@
 import dedupe.serializer as serializer
 
 if TYPE_CHECKING:
-    from typing import (
-        BinaryIO,
-        Collection,
-        Generator,
-        Iterable,
-        MutableMapping,
-        TextIO,
-        Union,
-    )
+    from typing import BinaryIO, Collection, Generator, Iterable, MutableMapping, TextIO
 
     import numpy.typing
 
@@ -75,7 +66,7 @@
 logger = logging.getLogger(__name__)
 
 
-class Matching(object):
+class Matching:
     """
     Base Class for Record Matching Classes
     """
@@ -682,9 +673,9 @@ def __init__(
             self.temp_dir = tempfile.TemporaryDirectory()
             self.db = self.temp_dir.name + "/blocks.db"
 
-        self.indexed_data: Union[
-            MutableMapping[int, RecordDict], MutableMapping[str, RecordDict]
-        ]
+        self.indexed_data: (
+            MutableMapping[int, RecordDict] | MutableMapping[str, RecordDict]
+        )
         self.indexed_data = {}  # type: ignore[assignment]
 
     def _close(self) -> None:
@@ -856,10 +847,10 @@ def blocks(self, data):
                                ORDER BY a.record_id"""
         )
 
-        pair_blocks: Union[
-            Iterable[tuple[int, Iterable[tuple[int, int]]]],
-            Iterable[tuple[str, Iterable[tuple[str, str]]]],
-        ]
+        pair_blocks: (
+            Iterable[tuple[int, Iterable[tuple[int, int]]]]
+            | Iterable[tuple[str, Iterable[tuple[str, str]]]]
+        )
 
         pair_blocks = itertools.groupby(pairs, lambda x: x[0])
 
@@ -1313,14 +1304,12 @@ def mark_pairs(self, labeled_pairs: TrainingData) -> None:
                 self.active_learner.mark(examples, y)
             except dedupe.predicates.NoIndexError as e:
                 raise UserWarning(
-                    (
-                        "The record\n"
-                        f"{e.failing_record}\n"
-                        "is not known to to the active learner. "
-                        "Make sure all `labeled_pairs` "
-                        "are in the data or training file "
-                        "of the `prepare_training()` method"
-                    )
+                    "The record\n"
+                    f"{e.failing_record}\n"
+                    "is not known to to the active learner. "
+                    "Make sure all `labeled_pairs` "
+                    "are in the data or training file "
+                    "of the `prepare_training()` method"
                 )
 
     def _checkTrainingPairs(self, labeled_pairs: TrainingData) -> None:

diff --git a/dedupe/blocking.py b/dedupe/blocking.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 from __future__ import annotations
 
 import logging
@@ -34,7 +33,7 @@ def index_list() -> IndexList:
     return defaultdict(list)
 
 
-class Fingerprinter(object):
+class Fingerprinter:
     """Takes in a record and returns all blocks that record belongs to"""
 
     def __init__(self, predicates: Iterable[dedupe.predicates.Predicate]) -> None:

diff --git a/dedupe/clustering.py b/dedupe/clustering.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 from __future__ import annotations
 
 import array

diff --git a/dedupe/convenience.py b/dedupe/convenience.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 from __future__ import annotations
 
 import collections
@@ -162,7 +161,7 @@ def console_label(deduper: dedupe.api.ActiveMatching) -> None:  # pragma: no cov
 
         for record in record_pair:
             for field in fields:
-                line = "%s : %s" % (field, record[field])
+                line = "{} : {}".format(field, record[field])
                 _print(line)
             _print()
         _print(f"{n_match}/10 positive, {n_distinct}/10 negative")

diff --git a/dedupe/core.py b/dedupe/core.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# -*- coding: utf-8 -*-
 from __future__ import annotations
 
 import collections
@@ -17,17 +16,7 @@
 from dedupe.backport import RLock
 
 if TYPE_CHECKING:
-    from typing import (
-        Any,
-        Generator,
-        Iterable,
-        Iterator,
-        Literal,
-        Optional,
-        Sequence,
-        Type,
-        Union,
-    )
+    from typing import Any, Generator, Iterable, Iterator, Literal, Sequence, Union
 
     from dedupe._typing import (
         Block,
@@ -50,7 +39,7 @@ class BlockingError(Exception):
     pass
 
 
-class ScoreDupes(object):
+class ScoreDupes:
     def __init__(
         self,
         featurizer: FeaturizerFunction,
@@ -71,7 +60,7 @@ def __init__(
 
     def __call__(self) -> None:
         while True:
-            record_pairs: Optional[RecordPairs] = self.records_queue.get()
+            record_pairs: RecordPairs | None = self.records_queue.get()
             if record_pairs is None:
                 break
 
@@ -198,7 +187,7 @@ def fillQueue(
             break
 
 
-class ScoreGazette(object):
+class ScoreGazette:
     def __init__(self, featurizer: FeaturizerFunction, classifier: Classifier):
         self.featurizer = featurizer
         self.classifier = classifier
@@ -238,8 +227,7 @@ def scoreGazette(
 
     score_records = ScoreGazette(featurizer, classifier)
 
-    for scored_pairs in imap(score_records, record_pairs):
-        yield scored_pairs
+    yield from imap(score_records, record_pairs)
 
     # The underlying processes in the pool should terminate when the
     # pool is garbage collected, but sometimes it takes a while
@@ -248,7 +236,7 @@ def scoreGazette(
     pool.join()
 
 
-class MockPool(object):
+class MockPool:
     def close(self) -> None:
         pass
 
@@ -273,7 +261,7 @@ def appropriate_imap(num_cores: int) -> tuple[MapLike, ClosableJoinable]:
     return imap, pool
 
 
-def peek(seq: Iterator[Any]) -> tuple[Optional[Any], Iterator[Any]]:
+def peek(seq: Iterator[Any]) -> tuple[Any | None, Iterator[Any]]:
     try:
         first = next(seq)
     except TypeError as e:
@@ -307,11 +295,11 @@ def Enumerator(start: int = 0) -> collections.defaultdict[Any, int]:
 
 
 @overload
-def sniff_id_type(ids: Sequence[tuple[int, int]]) -> Type[int]: ...
+def sniff_id_type(ids: Sequence[tuple[int, int]]) -> type[int]: ...
 
 
 @overload
-def sniff_id_type(ids: Sequence[tuple[str, str]]) -> tuple[Type[str], Literal[256]]: ...
+def sniff_id_type(ids: Sequence[tuple[str, str]]) -> tuple[type[str], Literal[256]]: ...
 
 
 def sniff_id_type(ids: Sequence[tuple[RecordID, RecordID]]) -> RecordIDDType:

diff --git a/dedupe/labeler.py b/dedupe/labeler.py
@@ -14,7 +14,7 @@
 import dedupe.training as training
 
 if TYPE_CHECKING:
-    from typing import Dict, Iterable, Literal, Mapping
+    from typing import Iterable, Literal, Mapping
 
     from dedupe._typing import (
         Data,
@@ -170,7 +170,7 @@ def remove(self, index: int) -> None:
     def _sample_indices(
         self, sample_size: int, max_cover: int
     ) -> Iterable[RecordIDPair]:
-        weights: Dict[RecordIDPair, float] = {}
+        weights: dict[RecordIDPair, float] = {}
         for predicate, covered in self.block_learner.comparison_cover.items():
             # each predicate gets to vote for every record pair it covers. the
             # strength of that vote is in inverse proportion to the number of
@@ -248,7 +248,7 @@ def __init__(
     def _index_predicates(self, candidates: TrainingExamples) -> None:
         blocker = self.block_learner.blocker
 
-        records = core.unique((record for pair in candidates for record in pair))
+        records = core.unique(record for pair in candidates for record in pair)
 
         for field in blocker.index_fields:
             unique_fields = {record[field] for record in records}