Skip to content

Commit

Permalink
require py3.8+, modernize python code (dedupeio#1195)
Browse files: browse the repository at this point in the history
  • Loading branch information
fgregg authored Jun 27, 2024
1 parent 03cf52f commit 7d2c79b
Show file tree
Hide file tree
Showing 22 changed files with 116 additions and 160 deletions.
2 changes: 1 addition & 1 deletion benchmarks/benchmarks/canonical.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

def make_report(data, clustering):
true_dupes = common.get_true_dupes(data)
predicted_dupes = set([])
predicted_dupes = set()
for cluser_id, _ in clustering:
for pair in combinations(cluser_id, 2):
predicted_dupes.add(frozenset(pair))
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/benchmarks/canonical_gazetteer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@

def make_report(data, clustering):
true_dupes = canonical_matching.get_true_dupes(data)
predicted_dupes = set(
predicted_dupes = {
frozenset([a, b]) for a, result in clustering for b, score in result
)
}
return common.Report.from_scores(true_dupes, predicted_dupes)


Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmarks/canonical_matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def get_true_dupes(data):

def make_report(data, clustering):
true_dupes = get_true_dupes(data)
predicted_dupes = set(frozenset(pair) for pair, _ in clustering)
predicted_dupes = {frozenset(pair) for pair, _ in clustering}
return common.Report.from_scores(true_dupes, predicted_dupes)


Expand Down
10 changes: 3 additions & 7 deletions dedupe/_typing.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os
import sys
from typing import (
TYPE_CHECKING,
Any,
Expand All @@ -9,24 +8,21 @@
Iterable,
Iterator,
List,
Literal,
Mapping,
MutableSequence,
Protocol,
Sequence,
Tuple,
Type,
TypedDict,
Union,
runtime_checkable,
)

import numpy
import numpy.typing

if sys.version_info >= (3, 8):
from typing import Literal, Protocol, TypedDict
else:
from typing_extensions import Literal, Protocol, TypedDict


if TYPE_CHECKING:
from dedupe.predicates import Predicate

Expand Down
41 changes: 15 additions & 26 deletions dedupe/api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
dedupe provides the main user interface for the library the
Dedupe class
Expand Down Expand Up @@ -29,15 +28,7 @@
import dedupe.serializer as serializer

if TYPE_CHECKING:
from typing import (
BinaryIO,
Collection,
Generator,
Iterable,
MutableMapping,
TextIO,
Union,
)
from typing import BinaryIO, Collection, Generator, Iterable, MutableMapping, TextIO

import numpy.typing

Expand Down Expand Up @@ -75,7 +66,7 @@
logger = logging.getLogger(__name__)


class Matching(object):
class Matching:
"""
Base Class for Record Matching Classes
"""
Expand Down Expand Up @@ -682,9 +673,9 @@ def __init__(
self.temp_dir = tempfile.TemporaryDirectory()
self.db = self.temp_dir.name + "/blocks.db"

self.indexed_data: Union[
MutableMapping[int, RecordDict], MutableMapping[str, RecordDict]
]
self.indexed_data: (
MutableMapping[int, RecordDict] | MutableMapping[str, RecordDict]
)
self.indexed_data = {} # type: ignore[assignment]

def _close(self) -> None:
Expand Down Expand Up @@ -856,10 +847,10 @@ def blocks(self, data):
ORDER BY a.record_id"""
)

pair_blocks: Union[
Iterable[tuple[int, Iterable[tuple[int, int]]]],
Iterable[tuple[str, Iterable[tuple[str, str]]]],
]
pair_blocks: (
Iterable[tuple[int, Iterable[tuple[int, int]]]]
| Iterable[tuple[str, Iterable[tuple[str, str]]]]
)

pair_blocks = itertools.groupby(pairs, lambda x: x[0])

Expand Down Expand Up @@ -1313,14 +1304,12 @@ def mark_pairs(self, labeled_pairs: TrainingData) -> None:
self.active_learner.mark(examples, y)
except dedupe.predicates.NoIndexError as e:
raise UserWarning(
(
"The record\n"
f"{e.failing_record}\n"
"is not known to to the active learner. "
"Make sure all `labeled_pairs` "
"are in the data or training file "
"of the `prepare_training()` method"
)
"The record\n"
f"{e.failing_record}\n"
"is not known to to the active learner. "
"Make sure all `labeled_pairs` "
"are in the data or training file "
"of the `prepare_training()` method"
)

def _checkTrainingPairs(self, labeled_pairs: TrainingData) -> None:
Expand Down
3 changes: 1 addition & 2 deletions dedupe/blocking.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import annotations

import logging
Expand Down Expand Up @@ -34,7 +33,7 @@ def index_list() -> IndexList:
return defaultdict(list)


class Fingerprinter(object):
class Fingerprinter:
"""Takes in a record and returns all blocks that record belongs to"""

def __init__(self, predicates: Iterable[dedupe.predicates.Predicate]) -> None:
Expand Down
1 change: 0 additions & 1 deletion dedupe/clustering.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import annotations

import array
Expand Down
3 changes: 1 addition & 2 deletions dedupe/convenience.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import annotations

import collections
Expand Down Expand Up @@ -162,7 +161,7 @@ def console_label(deduper: dedupe.api.ActiveMatching) -> None: # pragma: no cov

for record in record_pair:
for field in fields:
line = "%s : %s" % (field, record[field])
line = "{} : {}".format(field, record[field])
_print(line)
_print()
_print(f"{n_match}/10 positive, {n_distinct}/10 negative")
Expand Down
30 changes: 9 additions & 21 deletions dedupe/core.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import annotations

import collections
Expand All @@ -17,17 +16,7 @@
from dedupe.backport import RLock

if TYPE_CHECKING:
from typing import (
Any,
Generator,
Iterable,
Iterator,
Literal,
Optional,
Sequence,
Type,
Union,
)
from typing import Any, Generator, Iterable, Iterator, Literal, Sequence, Union

from dedupe._typing import (
Block,
Expand All @@ -50,7 +39,7 @@ class BlockingError(Exception):
pass


class ScoreDupes(object):
class ScoreDupes:
def __init__(
self,
featurizer: FeaturizerFunction,
Expand All @@ -71,7 +60,7 @@ def __init__(

def __call__(self) -> None:
while True:
record_pairs: Optional[RecordPairs] = self.records_queue.get()
record_pairs: RecordPairs | None = self.records_queue.get()
if record_pairs is None:
break

Expand Down Expand Up @@ -198,7 +187,7 @@ def fillQueue(
break


class ScoreGazette(object):
class ScoreGazette:
def __init__(self, featurizer: FeaturizerFunction, classifier: Classifier):
self.featurizer = featurizer
self.classifier = classifier
Expand Down Expand Up @@ -238,8 +227,7 @@ def scoreGazette(

score_records = ScoreGazette(featurizer, classifier)

for scored_pairs in imap(score_records, record_pairs):
yield scored_pairs
yield from imap(score_records, record_pairs)

# The underlying processes in the pool should terminate when the
# pool is garbage collected, but sometimes it takes a while
Expand All @@ -248,7 +236,7 @@ def scoreGazette(
pool.join()


class MockPool(object):
class MockPool:
def close(self) -> None:
pass

Expand All @@ -273,7 +261,7 @@ def appropriate_imap(num_cores: int) -> tuple[MapLike, ClosableJoinable]:
return imap, pool


def peek(seq: Iterator[Any]) -> tuple[Optional[Any], Iterator[Any]]:
def peek(seq: Iterator[Any]) -> tuple[Any | None, Iterator[Any]]:
try:
first = next(seq)
except TypeError as e:
Expand Down Expand Up @@ -307,11 +295,11 @@ def Enumerator(start: int = 0) -> collections.defaultdict[Any, int]:


@overload
def sniff_id_type(ids: Sequence[tuple[int, int]]) -> Type[int]: ...
def sniff_id_type(ids: Sequence[tuple[int, int]]) -> type[int]: ...


@overload
def sniff_id_type(ids: Sequence[tuple[str, str]]) -> tuple[Type[str], Literal[256]]: ...
def sniff_id_type(ids: Sequence[tuple[str, str]]) -> tuple[type[str], Literal[256]]: ...


def sniff_id_type(ids: Sequence[tuple[RecordID, RecordID]]) -> RecordIDDType:
Expand Down
6 changes: 3 additions & 3 deletions dedupe/labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import dedupe.training as training

if TYPE_CHECKING:
from typing import Dict, Iterable, Literal, Mapping
from typing import Iterable, Literal, Mapping

from dedupe._typing import (
Data,
Expand Down Expand Up @@ -170,7 +170,7 @@ def remove(self, index: int) -> None:
def _sample_indices(
self, sample_size: int, max_cover: int
) -> Iterable[RecordIDPair]:
weights: Dict[RecordIDPair, float] = {}
weights: dict[RecordIDPair, float] = {}
for predicate, covered in self.block_learner.comparison_cover.items():
# each predicate gets to vote for every record pair it covers. the
# strength of that vote is in inverse proportion to the number of
Expand Down Expand Up @@ -248,7 +248,7 @@ def __init__(
def _index_predicates(self, candidates: TrainingExamples) -> None:
blocker = self.block_learner.blocker

records = core.unique((record for pair in candidates for record in pair))
records = core.unique(record for pair in candidates for record in pair)

for field in blocker.index_fields:
unique_fields = {record[field] for record in records}
Expand Down
Loading

0 comments on commit 7d2c79b

Please sign in to comment.