Skip to content

Commit

Permalink
more direct set up data model (#1193)
Browse files Browse the repository at this point in the history
set up data models like

```python
[
  dedupe.variables.String("name"),
  dedupe.variables.Exact("address")
 ]
 ```

instead of 

```
[
  {"field": "name", "type": "String"},
  {"field": "address", "type": "Exact"},
 ]
```

Supersedes #1122 and #1121. Will close #1085.
  • Loading branch information
fgregg authored Jun 27, 2024
1 parent fc09b63 commit 8855842
Show file tree
Hide file tree
Showing 30 changed files with 386 additions and 467 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[flake8]
max-line-length=160
extend-ignore = E203
extend-ignore = E203
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,8 @@ repos:
hooks:
- id: isort
name: isort (python)
- repo: https://github.com/pycqa/flake8
rev: "7.1.0"
hooks:
- id: flake8
args: [--config=.flake8]
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# 3.0.0
- Developments in Python packaging made supporting the previous namespace approach for
variable plugins untenable. Since we had to redo the way we defined the data model,
we took the opportunity to explicitly instantiate variable objects.

# 2.0.6
- fixed bug that was preventing learning of index predicates in Dedupe mode

Expand Down
12 changes: 7 additions & 5 deletions benchmarks/benchmarks/canonical.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,17 +32,19 @@ def make_report(self, clustering):
return make_report(self.data, clustering)

def run(self, use_settings=False):
deduper: dedupe.StaticDedupe | dedupe.Dedupe

if use_settings and os.path.exists(self.settings_file):
with open(self.settings_file, "rb") as f:
deduper = dedupe.StaticDedupe(f)

else:
variables = [
{"field": "name", "type": "String"},
{"field": "name", "type": "Exact"},
{"field": "address", "type": "String"},
{"field": "cuisine", "type": "ShortString", "has missing": True},
{"field": "city", "type": "ShortString"},
dedupe.variables.String("name"),
dedupe.variables.Exact("name"),
dedupe.variables.String("address"),
dedupe.variables.ShortString("cuisine", has_missing=True),
dedupe.variables.ShortString("city"),
]

deduper = dedupe.Dedupe(variables, num_cores=5)
Expand Down
9 changes: 5 additions & 4 deletions benchmarks/benchmarks/canonical_gazetteer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,17 @@ def make_report(self, clustering):

def run(self, kwargs, use_settings=False):
data_1, data_2 = self.data
gazetteer: dedupe.StaticGazetteer | dedupe.Gazetteer

if use_settings and os.path.exists(self.settings_file):
with open(self.settings_file, "rb") as f:
gazetteer = dedupe.StaticGazetteer(f)
else:
variables = [
{"field": "name", "type": "String"},
{"field": "address", "type": "String"},
{"field": "cuisine", "type": "String"},
{"field": "city", "type": "String"},
dedupe.variables.String("name"),
dedupe.variables.String("address"),
dedupe.variables.String("cuisine"),
dedupe.variables.String("city"),
]

gazetteer = dedupe.Gazetteer(variables)
Expand Down
9 changes: 5 additions & 4 deletions benchmarks/benchmarks/canonical_matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,17 @@ def setup(self, kwargs):

def run(self, kwargs, use_settings=False):
data_1, data_2 = self.data
deduper: dedupe.StaticRecordLink | dedupe.RecordLink

if use_settings and os.path.exists(self.settings_file):
with open(self.settings_file, "rb") as f:
deduper = dedupe.StaticRecordLink(f)
else:
variables = [
{"field": "name", "type": "String"},
{"field": "address", "type": "String"},
{"field": "cuisine", "type": "String"},
{"field": "city", "type": "String"},
dedupe.variables.String("name"),
dedupe.variables.String("address"),
dedupe.variables.String("cuisine"),
dedupe.variables.String("city"),
]
deduper = dedupe.RecordLink(variables)
deduper.prepare_training(
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/benchmarks/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@ def get_true_dupes(data: dict) -> set:
sorted(data.items(), key=lambda x: x[1]["unique_id"]),
key=lambda x: x[1]["unique_id"],
):
pair = list(pair)
if len(pair) == 2:
a, b = pair
pair_l = list(pair)
if len(pair_l) == 2:
a, b = pair_l
duplicates.add(frozenset((a[0], b[0])))
return duplicates

Expand Down
15 changes: 15 additions & 0 deletions dedupe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,18 @@
training_data_link,
)
from dedupe.serializer import read_training, write_training # noqa: F401

__all__ = [
"Dedupe",
"Gazetteer",
"RecordLink",
"StaticDedupe",
"StaticGazetteer",
"StaticRecordLink",
"canonicalize",
"console_label",
"training_data_dedupe",
"training_data_link",
"read_training",
"write_training",
]
44 changes: 23 additions & 21 deletions dedupe/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,18 @@
TYPE_CHECKING,
Any,
Callable,
Collection,
Dict,
FrozenSet,
Iterable,
Iterator,
List,
Mapping,
MutableSequence,
Sequence,
Tuple,
Type,
Union,
runtime_checkable,
)

import numpy
Expand Down Expand Up @@ -72,6 +73,7 @@
LookupResults = Union[LookupResultsInt, LookupResultsStr]
JoinConstraint = Literal["one-to-one", "many-to-one", "many-to-many"]
Comparator = Callable[[Any, Any], Union[Union[int, float], Sequence[Union[int, float]]]]
CustomComparator = Callable[[Any, Any], Union[int, float]]
Scores = Union[numpy.memmap, numpy.ndarray]
Labels = List[Literal[0, 1]]
LabelsLike = Iterable[Literal[0, 1]]
Expand All @@ -81,28 +83,10 @@
ComparisonCover = Union[ComparisonCoverInt, ComparisonCoverStr]
PredicateFunction = Callable[[Any], FrozenSet[str]]

VariableDefinition = TypedDict(
"VariableDefinition",
{
"type": str,
"field": str,
"variable name": str,
"corpus": Iterable[Union[str, Collection[str]]],
"comparator": Callable[
[Any, Any], Union[int, float]
], # a custom comparator can only return a single float or int, not a sequence of numbers
"categories": List[str],
"interaction variables": List[str],
"has missing": bool,
"name": str,
},
total=False,
)


class TrainingData(TypedDict):
match: List[RecordDictPair]
distinct: List[RecordDictPair]
match: MutableSequence[RecordDictPair]
distinct: MutableSequence[RecordDictPair]


# Takes pairs of records and generates a (n_samples X n_features) array
Expand All @@ -127,6 +111,24 @@ def close(self) -> None: ...
def join(self) -> None: ...


class Variable(Protocol):
name: str
predicates: List["Predicate"]
has_missing: bool

def __len__(self) -> int: ...


@runtime_checkable
class FieldVariable(Variable, Protocol):
field: str
comparator: Comparator


class InteractionVariable(Variable, Protocol):
interaction_fields: List[str]


MapLike = Callable[[Callable[[Any], Any], Iterable], Iterable]

PathLike = Union[str, os.PathLike]
7 changes: 3 additions & 4 deletions dedupe/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import sqlite3
import tempfile
import warnings
from typing import TYPE_CHECKING, cast, overload
from typing import TYPE_CHECKING, Literal, cast, overload

import numpy
import sklearn.linear_model
Expand All @@ -27,7 +27,6 @@
import dedupe.labeler as labeler
import dedupe.predicates
import dedupe.serializer as serializer
from dedupe._typing import Literal

if TYPE_CHECKING:
from typing import (
Expand Down Expand Up @@ -70,7 +69,7 @@
Scores,
TrainingData,
TupleLinks,
VariableDefinition,
Variable,
)

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -1117,7 +1116,7 @@ class ActiveMatching(Matching):

def __init__(
self,
variable_definition: Collection[VariableDefinition],
variable_definition: Collection[Variable],
num_cores: int | None = None,
in_memory: bool = False,
**kwargs,
Expand Down
5 changes: 2 additions & 3 deletions dedupe/convenience.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,14 @@
import random
import sys
import warnings
from typing import Iterator, Tuple, overload
from typing import Iterator, Literal, Tuple, overload

import numpy

import dedupe
from dedupe._typing import (
DataInt,
DataStr,
Literal,
RecordDict,
RecordDictPair,
RecordID,
Expand Down Expand Up @@ -135,7 +134,7 @@ def console_label(deduper: dedupe.api.ActiveMatching) -> None: # pragma: no cov

finished = False
use_previous = False
fields = unique(var.field for var in deduper.data_model.primary_variables)
fields = unique(var.field for var in deduper.data_model.field_variables)

buffer_len = 1 # Max number of previous operations
unlabeled: list[RecordDictPair] = []
Expand Down
2 changes: 1 addition & 1 deletion dedupe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
Generator,
Iterable,
Iterator,
Literal,
Optional,
Sequence,
Type,
Expand All @@ -35,7 +36,6 @@
ClosableJoinable,
Data,
FeaturizerFunction,
Literal,
MapLike,
RecordID,
RecordIDDType,
Expand Down
Loading

0 comments on commit 8855842

Please sign in to comment.