From 03cf52f51a4d4a70974888c1eb49638ef01584f6 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 27 Jun 2024 07:38:20 -0400 Subject: [PATCH] Revert "Revert "more direct set up data model (#1193)"" This reverts commit 425eb203db51d105e9e831e119600e1a7634343d. --- .flake8 | 2 +- .pre-commit-config.yaml | 5 + CHANGELOG.md | 5 + benchmarks/benchmarks/canonical.py | 12 +- benchmarks/benchmarks/canonical_gazetteer.py | 9 +- benchmarks/benchmarks/canonical_matching.py | 9 +- benchmarks/benchmarks/common.py | 6 +- dedupe/__init__.py | 15 ++ dedupe/_typing.py | 44 +-- dedupe/api.py | 7 +- dedupe/convenience.py | 5 +- dedupe/core.py | 2 +- dedupe/datamodel.py | 170 +++++------- dedupe/predicates.py | 4 +- dedupe/training.py | 3 +- dedupe/variables/__init__.py | 26 +- dedupe/variables/base.py | 90 +++---- dedupe/variables/categorical_type.py | 22 +- dedupe/variables/exists.py | 10 +- dedupe/variables/interaction.py | 19 +- dedupe/variables/set.py | 15 +- dedupe/variables/string.py | 26 +- docs/Variable-definition.rst | 270 ++++++++----------- pyproject.toml | 5 +- tests/test_api.py | 17 +- tests/test_core.py | 19 +- tests/test_dedupe.py | 29 +- tests/test_labeler.py | 3 +- tests/test_serializer.py | 2 +- tests/test_training.py | 2 +- 30 files changed, 386 insertions(+), 467 deletions(-) diff --git a/.flake8 b/.flake8 index 0e85dce10..7350ce301 100644 --- a/.flake8 +++ b/.flake8 @@ -1,3 +1,3 @@ [flake8] max-line-length=160 -extend-ignore = E203 +extend-ignore = E203 \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 320205765..4896e74f6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,3 +8,8 @@ repos: hooks: - id: isort name: isort (python) + - repo: https://github.com/pycqa/flake8 + rev: "7.1.0" + hooks: + - id: flake8 + args: [--config=.flake8] diff --git a/CHANGELOG.md b/CHANGELOG.md index ef3a19bfc..8a9ee9c76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +# 3.0.0 +- Development in 
python packaging made supporting the previous namespace approach for + variable plugins untenable. Since we had to redo the way we defined the data model, + we took the opportunity to explicity instantiate variable objects. + # 2.0.6 - fixed bug that was preventing learning of index predicates in Dedupe mode diff --git a/benchmarks/benchmarks/canonical.py b/benchmarks/benchmarks/canonical.py index da075b9ba..6f23bb08d 100644 --- a/benchmarks/benchmarks/canonical.py +++ b/benchmarks/benchmarks/canonical.py @@ -32,17 +32,19 @@ def make_report(self, clustering): return make_report(self.data, clustering) def run(self, use_settings=False): + deduper: dedupe.StaticDedupe | dedupe.Dedupe + if use_settings and os.path.exists(self.settings_file): with open(self.settings_file, "rb") as f: deduper = dedupe.StaticDedupe(f) else: variables = [ - {"field": "name", "type": "String"}, - {"field": "name", "type": "Exact"}, - {"field": "address", "type": "String"}, - {"field": "cuisine", "type": "ShortString", "has missing": True}, - {"field": "city", "type": "ShortString"}, + dedupe.variables.String("name"), + dedupe.variables.Exact("name"), + dedupe.variables.String("address"), + dedupe.variables.ShortString("cuisine", has_missing=True), + dedupe.variables.ShortString("city"), ] deduper = dedupe.Dedupe(variables, num_cores=5) diff --git a/benchmarks/benchmarks/canonical_gazetteer.py b/benchmarks/benchmarks/canonical_gazetteer.py index bdbc51ba1..1f73b8e20 100644 --- a/benchmarks/benchmarks/canonical_gazetteer.py +++ b/benchmarks/benchmarks/canonical_gazetteer.py @@ -25,16 +25,17 @@ def make_report(self, clustering): def run(self, kwargs, use_settings=False): data_1, data_2 = self.data + gazetteer: dedupe.StaticGazetteer | dedupe.Gazetteer if use_settings and os.path.exists(self.settings_file): with open(self.settings_file, "rb") as f: gazetteer = dedupe.StaticGazetteer(f) else: variables = [ - {"field": "name", "type": "String"}, - {"field": "address", "type": "String"}, - 
{"field": "cuisine", "type": "String"}, - {"field": "city", "type": "String"}, + dedupe.variables.String("name"), + dedupe.variables.String("address"), + dedupe.variables.String("cuisine"), + dedupe.variables.String("city"), ] gazetteer = dedupe.Gazetteer(variables) diff --git a/benchmarks/benchmarks/canonical_matching.py b/benchmarks/benchmarks/canonical_matching.py index 471cd4988..b1c3c823b 100644 --- a/benchmarks/benchmarks/canonical_matching.py +++ b/benchmarks/benchmarks/canonical_matching.py @@ -42,16 +42,17 @@ def setup(self, kwargs): def run(self, kwargs, use_settings=False): data_1, data_2 = self.data + deduper: dedupe.StaticRecordLink | dedupe.RecordLink if use_settings and os.path.exists(self.settings_file): with open(self.settings_file, "rb") as f: deduper = dedupe.StaticRecordLink(f) else: variables = [ - {"field": "name", "type": "String"}, - {"field": "address", "type": "String"}, - {"field": "cuisine", "type": "String"}, - {"field": "city", "type": "String"}, + dedupe.variables.String("name"), + dedupe.variables.String("address"), + dedupe.variables.String("cuisine"), + dedupe.variables.String("city"), ] deduper = dedupe.RecordLink(variables) deduper.prepare_training( diff --git a/benchmarks/benchmarks/common.py b/benchmarks/benchmarks/common.py index afe993274..17e129f99 100644 --- a/benchmarks/benchmarks/common.py +++ b/benchmarks/benchmarks/common.py @@ -54,9 +54,9 @@ def get_true_dupes(data: dict) -> set: sorted(data.items(), key=lambda x: x[1]["unique_id"]), key=lambda x: x[1]["unique_id"], ): - pair = list(pair) - if len(pair) == 2: - a, b = pair + pair_l = list(pair) + if len(pair_l) == 2: + a, b = pair_l duplicates.add(frozenset((a[0], b[0]))) return duplicates diff --git a/dedupe/__init__.py b/dedupe/__init__.py index 726836a72..7ef7d4c77 100644 --- a/dedupe/__init__.py +++ b/dedupe/__init__.py @@ -13,3 +13,18 @@ training_data_link, ) from dedupe.serializer import read_training, write_training # noqa: F401 + +__all__ = [ + "Dedupe", + 
"Gazetteer", + "RecordLink", + "StaticDedupe", + "StaticGazetteer", + "StaticRecordLink", + "canonicalize", + "console_label", + "training_data_dedupe", + "training_data_link", + "read_training", + "write_training", +] diff --git a/dedupe/_typing.py b/dedupe/_typing.py index 65922f7e5..9de9eb5f7 100644 --- a/dedupe/_typing.py +++ b/dedupe/_typing.py @@ -4,17 +4,18 @@ TYPE_CHECKING, Any, Callable, - Collection, Dict, FrozenSet, Iterable, Iterator, List, Mapping, + MutableSequence, Sequence, Tuple, Type, Union, + runtime_checkable, ) import numpy @@ -72,6 +73,7 @@ LookupResults = Union[LookupResultsInt, LookupResultsStr] JoinConstraint = Literal["one-to-one", "many-to-one", "many-to-many"] Comparator = Callable[[Any, Any], Union[Union[int, float], Sequence[Union[int, float]]]] +CustomComparator = Callable[[Any, Any], Union[int, float]] Scores = Union[numpy.memmap, numpy.ndarray] Labels = List[Literal[0, 1]] LabelsLike = Iterable[Literal[0, 1]] @@ -81,28 +83,10 @@ ComparisonCover = Union[ComparisonCoverInt, ComparisonCoverStr] PredicateFunction = Callable[[Any], FrozenSet[str]] -VariableDefinition = TypedDict( - "VariableDefinition", - { - "type": str, - "field": str, - "variable name": str, - "corpus": Iterable[Union[str, Collection[str]]], - "comparator": Callable[ - [Any, Any], Union[int, float] - ], # a custom comparator can only return a single float or int, not a sequence of numbers - "categories": List[str], - "interaction variables": List[str], - "has missing": bool, - "name": str, - }, - total=False, -) - class TrainingData(TypedDict): - match: List[RecordDictPair] - distinct: List[RecordDictPair] + match: MutableSequence[RecordDictPair] + distinct: MutableSequence[RecordDictPair] # Takes pairs of records and generates a (n_samples X n_features) array @@ -127,6 +111,24 @@ def close(self) -> None: ... def join(self) -> None: ... +class Variable(Protocol): + name: str + predicates: List["Predicate"] + has_missing: bool + + def __len__(self) -> int: ... 
+ + +@runtime_checkable +class FieldVariable(Variable, Protocol): + field: str + comparator: Comparator + + +class InteractionVariable(Variable, Protocol): + interaction_fields: List[str] + + MapLike = Callable[[Callable[[Any], Any], Iterable], Iterable] PathLike = Union[str, os.PathLike] diff --git a/dedupe/api.py b/dedupe/api.py index 1e2e1f438..9d17bc0b0 100644 --- a/dedupe/api.py +++ b/dedupe/api.py @@ -14,7 +14,7 @@ import sqlite3 import tempfile import warnings -from typing import TYPE_CHECKING, cast, overload +from typing import TYPE_CHECKING, Literal, cast, overload import numpy import sklearn.linear_model @@ -27,7 +27,6 @@ import dedupe.labeler as labeler import dedupe.predicates import dedupe.serializer as serializer -from dedupe._typing import Literal if TYPE_CHECKING: from typing import ( @@ -70,7 +69,7 @@ Scores, TrainingData, TupleLinks, - VariableDefinition, + Variable, ) logger = logging.getLogger(__name__) @@ -1117,7 +1116,7 @@ class ActiveMatching(Matching): def __init__( self, - variable_definition: Collection[VariableDefinition], + variable_definition: Collection[Variable], num_cores: int | None = None, in_memory: bool = False, **kwargs, diff --git a/dedupe/convenience.py b/dedupe/convenience.py index fb24de259..26b886a4a 100644 --- a/dedupe/convenience.py +++ b/dedupe/convenience.py @@ -7,7 +7,7 @@ import random import sys import warnings -from typing import Iterator, Tuple, overload +from typing import Iterator, Literal, Tuple, overload import numpy @@ -15,7 +15,6 @@ from dedupe._typing import ( DataInt, DataStr, - Literal, RecordDict, RecordDictPair, RecordID, @@ -135,7 +134,7 @@ def console_label(deduper: dedupe.api.ActiveMatching) -> None: # pragma: no cov finished = False use_previous = False - fields = unique(var.field for var in deduper.data_model.primary_variables) + fields = unique(var.field for var in deduper.data_model.field_variables) buffer_len = 1 # Max number of previous operations unlabeled: list[RecordDictPair] = [] diff --git 
a/dedupe/core.py b/dedupe/core.py index a9ffd55b5..975c08469 100644 --- a/dedupe/core.py +++ b/dedupe/core.py @@ -22,6 +22,7 @@ Generator, Iterable, Iterator, + Literal, Optional, Sequence, Type, @@ -35,7 +36,6 @@ ClosableJoinable, Data, FeaturizerFunction, - Literal, MapLike, RecordID, RecordIDDType, diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index 49956bc6b..1b84b8aff 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -1,54 +1,80 @@ from __future__ import annotations import copyreg -import pkgutil import types +from collections.abc import Mapping from typing import TYPE_CHECKING, cast import numpy -import dedupe.variables -from dedupe.variables.base import FieldType as FieldVariable -from dedupe.variables.base import MissingDataType, Variable +from dedupe._typing import FieldVariable from dedupe.variables.interaction import InteractionType -for _, module, _ in pkgutil.iter_modules( # type: ignore - dedupe.variables.__path__, "dedupe.variables." -): - __import__(module) - if TYPE_CHECKING: - from typing import Generator, Iterable, Sequence + from typing import Collection, Generator, Iterable, Sequence from dedupe._typing import ( Comparator, + InteractionVariable, RecordDict, RecordDictPair, - VariableDefinition, + Variable, ) from dedupe.predicates import Predicate -VARIABLE_CLASSES = {k: v for k, v in FieldVariable.all_subclasses() if k} +class DataModel: + version = 2 -class DataModel(object): - version = 1 + def __init__(self, variable_definitions: Collection[Variable]): + for item in variable_definitions: + if isinstance(item, Mapping): + raise ValueError( + "It looks like you are trying to use a variable definition " + "composed of dictionaries. dedupe 3.0 uses variable objects " + 'directly. So instead of [{"field": "name", "type": "String"}] ' + 'we now do [dedupe.variables.String("name")].' 
+ ) - def __init__(self, variable_definitions: Iterable[VariableDefinition]): variable_definitions = list(variable_definitions) if not variable_definitions: raise ValueError("The variable definitions cannot be empty") - all_variables: list[Variable] - self.primary_variables, all_variables = typify_variables(variable_definitions) - self._derived_start = len(all_variables) + if not any(variable.predicates for variable in variable_definitions): + raise ValueError( + "At least one of the variable types needs to be a type" + "other than 'Custom'. 'Custom' types have no associated" + "blocking rules" + ) + + # This is a protocol check, not a class inheritance check + self.field_variables: list[FieldVariable] = [ + variable + for variable in variable_definitions + if isinstance(variable, FieldVariable) + ] + + # we need to keep track of ordering of variables because in + # order to calculate derived fields like interaction and missing + # data fields. + columns: list[Variable] = [] + for variable in self.field_variables: + if len(variable) == 1: + columns.append(variable) + elif len(variable) > 1: + assert hasattr(variable, "higher_vars") + columns.extend(variable.higher_vars) - all_variables += interactions(variable_definitions, self.primary_variables) - all_variables += missing(all_variables) + self._derived_start = len(columns) - self._missing_field_indices = missing_field_indices(all_variables) - self._interaction_indices = interaction_indices(all_variables) + # i'm not really satisfied with how we are dealing with interactions + # here. 
seems like there should be a cleaner path, but i don't see it + # today + columns += interactions(variable_definitions, self.field_variables) - self._len = len(all_variables) + self._missing_field_indices = missing_field_indices(columns) + self._interaction_indices = interaction_indices(columns) + + self._len = len(columns) + len(self._missing_field_indices) def __len__(self) -> int: return self._len @@ -63,7 +89,7 @@ def _field_comparators( ) -> Generator[tuple[str, Comparator, int, int], None, None]: start = 0 stop = 0 - for var in self.primary_variables: + for var in self.field_variables: stop = start + len(var) comparator = cast("Comparator", var.comparator) yield (var.field, comparator, start, stop) @@ -72,7 +98,7 @@ def _field_comparators( @property def predicates(self) -> set[Predicate]: predicates = set() - for var in self.primary_variables: + for var in self.field_variables: for predicate in var.predicates: predicates.add(predicate) return predicates @@ -132,100 +158,26 @@ def __getstate__(self): return d def __setstate__(self, d): - version = d.pop("version", None) + version = d.pop("object_version", None) if version is None and "_variables" in d: d["_len"] = len(d.pop("_variables")) d["primary_variables"] = d.pop("primary_fields") + elif version == 1: + d["field_variables"] = d.pop("primary_variables") self.__dict__ = d -def typify_variables( - variable_definitions: Iterable[VariableDefinition], -) -> tuple[list[FieldVariable], list[Variable]]: - primary_variables: list[FieldVariable] = [] - all_variables: list[Variable] = [] - only_custom = True - - for definition in variable_definitions: - try: - variable_type = definition["type"] - except TypeError: - raise TypeError( - "Incorrect variable specification: variable " - "specifications are dictionaries that must " - "include a type definition, ex. 
" - "{'field' : 'Phone', type: 'String'}" - ) - except KeyError: - raise KeyError( - "Missing variable type: variable " - "specifications are dictionaries that must " - "include a type definition, ex. " - "{'field' : 'Phone', type: 'String'}" - ) - - if variable_type != "Custom": - only_custom = False - - if variable_type == "Interaction": - continue - - if variable_type == "FuzzyCategorical" and "other fields" not in definition: - definition["other fields"] = [ # type: ignore - d["field"] - for d in variable_definitions - if ("field" in d and d["field"] != definition["field"]) - ] - - try: - variable_class = VARIABLE_CLASSES[variable_type] - except KeyError: - raise KeyError( - "Field type %s not valid. Valid types include %s" - % (definition["type"], ", ".join(VARIABLE_CLASSES)) - ) - - variable_object = variable_class(definition) - assert isinstance(variable_object, FieldVariable) - - primary_variables.append(variable_object) - - if hasattr(variable_object, "higher_vars"): - all_variables.extend(variable_object.higher_vars) - else: - variable_object = cast(Variable, variable_object) - all_variables.append(variable_object) - - if only_custom: - raise ValueError( - "At least one of the variable types needs to be a type" - "other than 'Custom'. 
'Custom' types have no associated" - "blocking rules" - ) - - return primary_variables, all_variables - - -def missing(variables: list[Variable]) -> list[MissingDataType]: - missing_variables = [] - for var in variables: - if var.has_missing: - missing_variables.append(MissingDataType(var.name)) - return missing_variables - - def interactions( - definitions: Iterable[VariableDefinition], primary_variables: list[FieldVariable] -) -> list[InteractionType]: + variables: Iterable[Variable], primary_variables: Iterable[FieldVariable] +) -> list[InteractionVariable]: field_d = {field.name: field for field in primary_variables} - interactions = [] - for definition in definitions: - if definition["type"] == "Interaction": - var = InteractionType(definition) - var.expandInteractions(field_d) - interactions.extend(var.higher_vars) + interactions: list[InteractionVariable] = [] + for variable in variables: + if isinstance(variable, InteractionType): + variable.expandInteractions(field_d) + interactions.extend(variable.higher_vars) return interactions diff --git a/dedupe/predicates.py b/dedupe/predicates.py index 2d180ee8d..1e07c672e 100644 --- a/dedupe/predicates.py +++ b/dedupe/predicates.py @@ -17,9 +17,9 @@ from dedupe.predicate_functions import * # noqa: F401, F403 if TYPE_CHECKING: - from typing import AbstractSet, Any, FrozenSet, Iterable, Mapping, Sequence + from typing import AbstractSet, Any, FrozenSet, Iterable, Literal, Mapping, Sequence - from dedupe._typing import Literal, PredicateFunction, RecordDict + from dedupe._typing import PredicateFunction, RecordDict from dedupe.index import Index diff --git a/dedupe/training.py b/dedupe/training.py index eb6fe0a04..98c9f28df 100644 --- a/dedupe/training.py +++ b/dedupe/training.py @@ -13,7 +13,7 @@ from . 
import blocking, branch_and_bound if TYPE_CHECKING: - from typing import Iterable, Sequence + from typing import Iterable, Literal, Sequence from ._typing import ( ComparisonCover, @@ -23,7 +23,6 @@ Data, DataInt, DataStr, - Literal, ) from ._typing import RecordDictPairs as TrainingExamples from ._typing import RecordID, RecordIDPair diff --git a/dedupe/variables/__init__.py b/dedupe/variables/__init__.py index b36383a61..39c339c68 100644 --- a/dedupe/variables/__init__.py +++ b/dedupe/variables/__init__.py @@ -1,3 +1,25 @@ -from pkgutil import extend_path +from .base import CustomType as Custom +from .categorical_type import CategoricalType as Categorical +from .exact import ExactType as Exact +from .exists import ExistsType as Exists +from .interaction import InteractionType as Interaction +from .latlong import LatLongType as LatLong +from .price import PriceType as Price +from .set import SetType as Set +from .string import ShortStringType as ShortString +from .string import StringType as String +from .string import TextType as Text -__path__ = extend_path(__path__, __name__) +__all__ = [ + "Custom", + "Categorical", + "Exact", + "Exists", + "Interaction", + "LatLong", + "Price", + "Set", + "ShortString", + "String", + "Text", +] diff --git a/dedupe/variables/base.py b/dedupe/variables/base.py index 109d3332c..71d6e722e 100644 --- a/dedupe/variables/base.py +++ b/dedupe/variables/base.py @@ -1,20 +1,21 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional from dedupe import predicates if TYPE_CHECKING: - from typing import Any, ClassVar, Generator, Iterable, Optional, Sequence, Type + from typing import Any, ClassVar, Iterable, Sequence, Type - from dedupe._typing import Comparator, PredicateFunction, VariableDefinition + from dedupe._typing import Comparator, CustomComparator, PredicateFunction + from dedupe._typing import Variable as VariableProtocol class Variable(object): name: str type: 
ClassVar[str] predicates: list[predicates.Predicate] - higher_vars: Sequence["Variable"] + higher_vars: Sequence["VariableProtocol"] def __len__(self) -> int: return 1 @@ -29,16 +30,8 @@ def __eq__(self, other: Any) -> bool: other_name: str = other.name return self.name == other_name - def __init__(self, definition: VariableDefinition): - if definition.get("has missing", False): - self.has_missing = True - try: - exists_pred = predicates.ExistsPredicate(definition["field"]) - self.predicates.append(exists_pred) - except KeyError: - pass - else: - self.has_missing = False + def __init__(self, has_missing: bool = False): + self.has_missing = has_missing def __getstate__(self) -> dict[str, Any]: odict = self.__dict__.copy() @@ -46,31 +39,13 @@ def __getstate__(self) -> dict[str, Any]: return odict - @classmethod - def all_subclasses( - cls, - ) -> Generator[tuple[Optional[str], Type["Variable"]], None, None]: - for q in cls.__subclasses__(): - yield getattr(q, "type", None), q - for p in q.all_subclasses(): - yield p - class DerivedType(Variable): type = "Derived" - def __init__(self, definition: VariableDefinition): - self.name = "(%s: %s)" % (str(definition["name"]), str(definition["type"])) - super(DerivedType, self).__init__(definition) - - -class MissingDataType(Variable): - type = "MissingData" - - def __init__(self, name: str): - self.name = "(%s: Not Missing)" % name - - self.has_missing = False + def __init__(self, name: str, var_type: str, **kwargs): + self.name = "(%s: %s)" % (str(name), str(var_type)) + super().__init__(**kwargs) class FieldType(Variable): @@ -80,13 +55,15 @@ class FieldType(Variable): _Predicate: Type[predicates.SimplePredicate] = predicates.SimplePredicate comparator: Comparator - def __init__(self, definition: VariableDefinition): - self.field = definition["field"] + def __init__( + self, field: str, name: Optional[str] = None, has_missing: bool = False + ): + self.field = field - if "variable name" in definition: - self.name = 
definition["variable name"] - else: + if name is None: self.name = "(%s: %s)" % (self.field, self.type) + else: + self.name = name self.predicates = [ self._Predicate(pred, self.field) for pred in self._predicate_functions @@ -96,30 +73,39 @@ def __init__(self, definition: VariableDefinition): self._index_predicates, self._index_thresholds, self.field ) - super(FieldType, self).__init__(definition) + self.has_missing = has_missing + if self.has_missing: + exists_pred = predicates.ExistsPredicate(self.field) + self.predicates.append(exists_pred) class CustomType(FieldType): type = "Custom" - def __init__(self, definition: VariableDefinition): - super(CustomType, self).__init__(definition) - - try: - self.comparator = definition["comparator"] # type: ignore[assignment] - except KeyError: - raise KeyError( - "For 'Custom' field types you must define " - "a 'comparator' function in the field " - "definition. " + def __init__( + self, + field: str, + comparator: CustomComparator, + name: Optional[str] = None, + **kwargs, + ): + super().__init__(field, **kwargs) + + if comparator is None: + raise ValueError( + "You must define a comparator function for the Custom class" ) + else: + self.comparator = comparator - if "variable name" not in definition: + if name is None: self.name = "(%s: %s, %s)" % ( self.field, self.type, self.comparator.__name__, ) + else: + self.name = name def indexPredicates( diff --git a/dedupe/variables/categorical_type.py b/dedupe/variables/categorical_type.py index b9d3ef66b..c2dc56768 100644 --- a/dedupe/variables/categorical_type.py +++ b/dedupe/variables/categorical_type.py @@ -1,9 +1,11 @@ from __future__ import annotations +from typing import Sequence + from categorical import CategoricalComparator from dedupe import predicates -from dedupe._typing import PredicateFunction, VariableDefinition +from dedupe._typing import PredicateFunction from dedupe.variables.base import DerivedType, FieldType @@ -11,26 +13,14 @@ class 
CategoricalType(FieldType): type = "Categorical" _predicate_functions: list[PredicateFunction] = [predicates.wholeFieldPredicate] - def _categories(self, definition: VariableDefinition) -> list[str]: - try: - categories = definition["categories"] - except KeyError: - raise ValueError('No "categories" defined') - - return categories - - def __init__(self, definition: VariableDefinition): - super(CategoricalType, self).__init__(definition) - - categories = self._categories(definition) + def __init__(self, field: str, categories: Sequence[str], **kwargs): + super().__init__(field, **kwargs) self.comparator = CategoricalComparator(categories) # type: ignore[assignment] self.higher_vars = [] for higher_var in self.comparator.dummy_names: # type: ignore[attr-defined] - dummy_var = DerivedType( - {"name": higher_var, "type": "Dummy", "has missing": self.has_missing} - ) + dummy_var = DerivedType(higher_var, "Dummy", has_missing=False) self.higher_vars.append(dummy_var) def __len__(self) -> int: diff --git a/dedupe/variables/exists.py b/dedupe/variables/exists.py index 46c36c292..00ca7eb46 100644 --- a/dedupe/variables/exists.py +++ b/dedupe/variables/exists.py @@ -4,7 +4,7 @@ from categorical import CategoricalComparator -from dedupe._typing import PredicateFunction, VariableDefinition +from dedupe._typing import PredicateFunction from dedupe.variables.base import DerivedType from dedupe.variables.categorical_type import CategoricalType @@ -13,16 +13,14 @@ class ExistsType(CategoricalType): type = "Exists" _predicate_functions: list[PredicateFunction] = [] - def __init__(self, definition: VariableDefinition): - super(CategoricalType, self).__init__(definition) + def __init__(self, field: str, **kwargs): + super().__init__(field, **kwargs) self.cat_comparator = CategoricalComparator([0, 1]) self.higher_vars = [] for higher_var in self.cat_comparator.dummy_names: - dummy_var = DerivedType( - {"name": higher_var, "type": "Dummy", "has missing": self.has_missing} - ) + 
dummy_var = DerivedType(higher_var, "Dummy", has_missing=self.has_missing) self.higher_vars.append(dummy_var) def comparator(self, field_1: Any, field_2: Any) -> list[int]: diff --git a/dedupe/variables/interaction.py b/dedupe/variables/interaction.py index b0370e667..e9fbbe591 100644 --- a/dedupe/variables/interaction.py +++ b/dedupe/variables/interaction.py @@ -1,24 +1,23 @@ from __future__ import annotations import itertools -from typing import Mapping +from typing import List, Mapping -from dedupe._typing import VariableDefinition -from dedupe.variables.base import FieldType as FieldVariable +from dedupe._typing import FieldVariable, InteractionVariable from dedupe.variables.base import Variable class InteractionType(Variable): type = "Interaction" - higher_vars: list["InteractionType"] + higher_vars: List[InteractionVariable] - def __init__(self, definition: VariableDefinition): - self.interactions = definition["interaction variables"] + def __init__(self, *args: str, **kwargs): + self.interactions = list(args) self.name = "(Interaction: %s)" % str(self.interactions) self.interaction_fields = self.interactions - super().__init__(definition) + super().__init__(**kwargs) def expandInteractions(self, field_model: Mapping[str, FieldVariable]) -> None: self.interaction_fields = self.atomicInteractions( @@ -42,14 +41,12 @@ def categorical(self, field_model: Mapping[str, FieldVariable]) -> None: if not hasattr(field_model[field], "higher_vars") ] - dummies = [field_model[field].higher_vars for field in categoricals] + dummies = [field_model[field].higher_vars for field in categoricals] # type: ignore[attr-defined] self.higher_vars = [] for combo in itertools.product(*dummies): var_names = [field.name for field in combo] + noncategoricals - higher_var = InteractionType( - {"has missing": self.has_missing, "interaction variables": var_names} - ) + higher_var = InteractionType(*var_names, has_missing=self.has_missing) self.higher_vars.append(higher_var) def 
atomicInteractions( diff --git a/dedupe/variables/set.py b/dedupe/variables/set.py index fddfa5c5e..8b8253e59 100644 --- a/dedupe/variables/set.py +++ b/dedupe/variables/set.py @@ -1,7 +1,8 @@ +from typing import Collection, Iterable, Optional + from simplecosine.cosine import CosineSetSimilarity from dedupe import predicates -from dedupe._typing import VariableDefinition from dedupe.variables.base import FieldType @@ -24,10 +25,12 @@ class SetType(FieldType): ) _index_thresholds = (0.2, 0.4, 0.6, 0.8) - def __init__(self, definition: VariableDefinition): - super(SetType, self).__init__(definition) + def __init__( + self, field: str, corpus: Optional[Iterable[Collection[str]]] = None, **kwargs + ): + super().__init__(field, **kwargs) - if "corpus" not in definition: - definition["corpus"] = [] + if corpus is None: + corpus = [] - self.comparator = CosineSetSimilarity(definition["corpus"]) # type: ignore[assignment] + self.comparator = CosineSetSimilarity(corpus) # type: ignore[assignment] diff --git a/dedupe/variables/string.py b/dedupe/variables/string.py index 4272dba09..9a2bc8ab3 100644 --- a/dedupe/variables/string.py +++ b/dedupe/variables/string.py @@ -1,11 +1,11 @@ -from typing import Sequence, Type +from typing import Iterable, Optional, Sequence, Type from affinegap import normalizedAffineGapDistance as affineGap from highered import CRFEditDistance from simplecosine.cosine import CosineTextSimilarity from dedupe import predicates -from dedupe._typing import PredicateFunction, VariableDefinition +from dedupe._typing import PredicateFunction from dedupe.variables.base import FieldType, indexPredicates crfEd = CRFEditDistance() @@ -36,8 +36,8 @@ class BaseStringType(FieldType): _Predicate = predicates.StringPredicate _predicate_functions: Sequence[PredicateFunction] = () - def __init__(self, definition: VariableDefinition): - super(BaseStringType, self).__init__(definition) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) 
self.predicates += indexPredicates( ( @@ -67,10 +67,12 @@ class ShortStringType(BaseStringType): ] _index_thresholds = (0.2, 0.4, 0.6, 0.8) - def __init__(self, definition: VariableDefinition): - super(ShortStringType, self).__init__(definition) + def __init__( + self, field: str, name: Optional[str] = None, crf: bool = False, **kwargs + ): + super().__init__(field, name=name, **kwargs) - if definition.get("crf", False) is True: + if crf: self.comparator = crfEd # type: ignore[assignment] else: self.comparator = affineGap # type: ignore[assignment] @@ -98,10 +100,10 @@ class TextType(BaseStringType): ] _index_thresholds = (0.2, 0.4, 0.6, 0.8) - def __init__(self, definition: VariableDefinition): - super(TextType, self).__init__(definition) + def __init__(self, field: str, corpus: Optional[Iterable[str]] = None, **kwargs): + super().__init__(field, **kwargs) - if "corpus" not in definition: - definition["corpus"] = [] + if corpus is None: + corpus = [] - self.comparator = CosineTextSimilarity(definition["corpus"]) # type: ignore[assignment] + self.comparator = CosineTextSimilarity(corpus) # type: ignore[assignment] diff --git a/docs/Variable-definition.rst b/docs/Variable-definition.rst index 73abcccc2..5e6d19023 100644 --- a/docs/Variable-definition.rst +++ b/docs/Variable-definition.rst @@ -3,31 +3,30 @@ Variable Definitions ==================== -Variable Types --------------- +Variables +--------- -A variable definition describes the records that you want to match. It is -a dictionary where the keys are the fields and the values are the -field specification. For example:- +A variable definition describes the records that you want to match. It is sequence +of Variable objects. For example:- .. 
code:: python + import dedupe.variables + [ - {'field': 'Site name', 'type': 'String'}, - {'field': 'Address', 'type': 'String'}, - {'field': 'Zip', 'type': 'ShortString', 'has missing': True}, - {'field': 'Phone', 'type': 'String', 'has missing': True} + dedupe.variables.String("Site Name"), + dedupe.variables.String("Address"), + dedupe.variables.ShortString("Zip", has_missing=True), + dedupe.variables.String("Phone", has_missing=True) ] -String Types -^^^^^^^^^^^^ +String +^^^^^^ -A ``String`` type field must declare the name of the record field to compare -a ``String`` type declaration. The ``String`` type expects fields to be of -class string. +The ``String`` takes the key of the record field to compare. -``String`` types are compared using string edit distance, specifically +``String`` variables are compared using string edit distance, specifically `affine gap string distance `__. This is a good metric for measuring fields that might have typos in them, such as "John" vs "Jon". @@ -36,44 +35,45 @@ For example:- .. code:: python - {'field': 'Address', type: 'String'} + dedupe.variables.String("Address") -ShortString Types -^^^^^^^^^^^^^^^^^ +ShortString +^^^^^^^^^^^ -A ``ShortString`` type field is just like ``String`` types except that dedupe +The ``ShortString`` variable is just like the ``String`` variable except that dedupe will not try to learn any :ref:`index blocking rules ` for these fields, which can speed up the training phase considerably. -Zip codes and city names are good candidates for this type. If in doubt, +Zip codes and city names are good candidates for this variable. If in doubt, always use ``String``. For example:- .. code:: python - {'field': 'Zipcode', type: 'ShortString'} + dedupe.variables.ShortString("Zipcode") .. _text-types-label: -Text Types -^^^^^^^^^^ +Text +^^^^ If you want to compare fields containing blocks of text e.g. product -descriptions or article abstracts, you should use this type. 
``Text`` type -fields are compared using the `cosine similarity metric +descriptions or article abstracts, you should use this variable. ``Text`` +variables are compared using the `cosine similarity metric `__. This is a measurement of the amount of words that two documents have in common. This measure can be made more useful as the overlap of rare words counts more than the overlap of common words. -Compare this to ``String`` and ``ShortString`` types: For strings containing -occupations, "yoga teacher" might be fairly similar to "yoga instructor" when -using the ``Text`` measurement, because they both contain the relatively -rare word of "yoga". However, if you compared these two strings using the -``String`` or ``ShortString`` measurements, they might be considered fairly -dis-similar, because the actual string edit distance between them is large. +Compare this to ``String`` and ``ShortString`` variables: For strings +containing occupations, "yoga teacher" might be fairly similar to +"yoga instructor" when using the ``Text`` measurement, because they +both contain the relatively rare word of "yoga". However, if you +compared these two strings using the ``String`` or ``ShortString`` +measurements, they might be considered fairly dissimilar, because the +actual string edit distance between them is large. If provided a sequence of example fields (i.e. a corpus) then dedupe will @@ -81,29 +81,27 @@ learn these weights for you. For example:- .. code:: python - { - 'field': 'Product description', - 'type': 'Text', - 'corpus' : [ - 'this product is great', - 'this product is great and blue' - ] - } + dedupe.variables.Text("Product description", + corpus=[ + 'this product is great', + 'this product is great and blue' + ] + ) If you don't want to adjust the measure to your data, just leave 'corpus' out of the variable definition entirely. .. 
code:: python - {'field': 'Product description', 'type': 'Text'} + dedupe.variables.Text("Product description") -Custom Types +Custom Variable ^^^^^^^^^^^^ -A ``Custom`` type field must have specify the field it wants to compare, a -type declaration of ``Custom``, and a comparator declaration. The comparator -must be a function that can take in two field values and return a number. +A ``Custom`` variable allows you to use a custom function for +comparing fields. The function must take two field values and return a +number. For example, a custom comparator: @@ -120,65 +118,53 @@ The corresponding variable definition: .. code:: python - { - 'field': 'Zip', - 'type': 'Custom', - 'comparator': same_or_not_comparator - } + dedupe.variables.Custom("Zip", comparator=same_or_not_comparator) -``Custom`` fields do not have any blocking rules associated with them. +``Custom`` variables do not have any blocking rules associated with them. Since dedupe needs blocking rules, a data model that only contains ``Custom`` fields will raise an error. LatLong ^^^^^^^ -A ``LatLong`` type field must have as the name of a field and a type -declaration of ``LatLong``. ``LatLong`` fields are compared using the `Haversine +``LatLong`` variables are compared using the `Haversine Formula `__. -A ``LatLong`` -type field must consist of tuples of floats corresponding to a latitude and a -longitude. +A ``LatLong`` variable field must consist of tuples of floats +corresponding to a latitude and a longitude. .. code:: python - {'field': 'Location', 'type': 'LatLong'} + dedupe.variables.LatLong("location") Set ^^^ -A ``Set`` type field is for comparing lists of elements, like keywords or -client names. ``Set`` types are very similar to :ref:`text-types-label`. They +``Set`` variables are for comparing lists of elements, like keywords or +client names. ``Set`` variables are very similar to :ref:`text-types-label`.
They use the same comparison function and you can also let dedupe learn which terms are common or rare by providing a corpus. Within a record, a ``Set`` -type field has to be hashable sequences like tuples or frozensets. +variable field has to be hashable sequences like tuples or frozensets. .. code:: python - { - 'field': 'Co-authors', - 'type': 'Set', - 'corpus' : [ - ('steve edwards'), - ('steve edwards', 'steve jobs') - ] - } + dedupe.variables.Set("Co-authors", + corpus=[ + ('steve edwards',), + ('steve edwards', 'steve jobs') + ]) or .. code:: python - {'field': 'Co-authors', 'type': 'Set'} + dedupe.variables.Set("Co-authors") Interaction ^^^^^^^^^^^ -An ``Interaction`` field multiplies the values of the multiple variables. -An ``Interaction`` variable is created with type declaration of -``Interaction`` and an ``interaction variables`` declaration. - -The ``interaction variables`` field must be a sequence of variable names of +An ``Interaction`` variable multiplies the values of multiple variables. +The arguments to an ``Interaction`` variable must be a sequence of variable names of other fields you have defined in your variable definition. `Interactions `__ @@ -187,10 +173,9 @@ are good when the effect of two predictors is not simply additive. .. code:: python [ - { 'field': 'Name', 'variable name': 'name', 'type': 'String' }, - { 'field': 'Zip', 'variable name': 'zip', 'type': 'Custom', - 'comparator' : same_or_not_comparator }, - {'type': 'Interaction', 'interaction variables': ['name', 'zip']} + dedupe.variables.String("Name", name="name"), + dedupe.variables.Custom("Zip", comparator=same_or_not_comparator, name="zip"), + dedupe.variables.Interaction("name", "zip") ] Exact @@ -200,7 +185,7 @@ Exact .. code:: python - {'field': 'city', 'type': 'Exact'} + dedupe.variables.Exact("city") Exists @@ -216,7 +201,7 @@ different cases: ..
code:: python - {'field': 'first_name', 'type': 'Exists'} + dedupe.variables.Exists("first_name") @@ -254,11 +239,7 @@ You would create a definition such as: .. code:: python - { - 'field': 'Business Type', - 'type': 'Categorical', - 'categories' : ['taxi', 'lawyer'] - } + dedupe.variables.Categorical("Business Type", categories=['taxi', 'lawyer']) Price ^^^^^ @@ -269,7 +250,7 @@ prices. The values of ``Price`` field must be a positive float. If the value is .. code:: python - {'field': 'cost', 'type': 'Price'} + dedupe.variables.Price("cost") Optional Variables ------------------ @@ -286,8 +267,8 @@ DateTime ``DateTime`` variables are useful for comparing dates and timestamps. This variable can accept strings or Python datetime objects as inputs. -The ``DateTime`` variable definition accepts a few optional arguments that -can help improve behavior if you know your field follows an unusual format: +The ``DateTime`` variable accepts a few optional arguments that can help +improve behavior if you know your field follows an unusual format: * :code:`fuzzy` - Use fuzzy parsing to automatically extract dates from strings like "It happened on June 2nd, 2018" (default :code:`True`) * :code:`dayfirst` - Ambiguous dates should be parsed as dd/mm/yy (default :code:`False`) @@ -297,34 +278,24 @@ Note that the ``DateTime`` variable defaults to mm/dd/yy for ambiguous dates. If both :code:`dayfirst` and :code:`yearfirst` are set to :code:`True`, then :code:`dayfirst` will take precedence. -For example, a sample ``DateTime`` variable definition, using the defaults: .. code:: python - { - 'field': 'time_of_sale', - 'type': 'DateTime', - 'fuzzy': True, - 'dayfirst': False, - 'yearfirst': False - } + import datetimetype -If you're happy with the defaults, you can simply define the :code:`field` -and :code:`type`: + datetimetype.DateTime("field") -.. code:: python +To install: - {'field': 'time_of_sale', 'type': 'DateTime'} +..
code:: console -Install the `dedupe-variable-datetime -`__ package for -``DateTime`` Type. For more info, see the `GitHub Repository -`__. + pip install dedupe-variable-datetime -Address Type -^^^^^^^^^^^^ -An ``Address`` variable should be used for United States addresses. It uses +Address +^^^^^^^ + +A ``USAddress`` variable should be used for United States addresses. It uses the `usaddress `__ package to split apart an address string into components like address number, street name, and street type and compares component to component. @@ -333,18 +304,22 @@ For example:- .. code:: python - {'field': 'address', 'type': 'Address'} + import addressvariable + + addressvariable.USAddress("address") + + +To install: +.. code:: console -Install the `dedupe-variable-address -`__ package for -``Address`` Type. For more info, see the `GitHub Repository -`__. + pip install dedupe-variable-address -Name Type -^^^^^^^^^ -A ``Name`` variable should be used for a field that contains American names, +Name +^^^^ + +A ``WesternName`` variable should be used for a field that contains American names, corporations and households. It uses the `probablepeople `__ package to split apart an name string into components like give name, surname, generational suffix, @@ -355,42 +330,15 @@ For example:- .. code:: python - {'field': 'name', 'type': 'Name'} - - -Install the `dedupe-variable-name -`__ package for ``Name`` -Type. For more info, see the `GitHub Repository -`__. - -Fuzzy Category -^^^^^^^^^^^^^^ - -A ``FuzzyCategorical`` variable should be used for when you for -categorical data that has variations. + import namevariable -Occupations are an example, where the you may have 'Attorney', 'Counsel', and -'Lawyer'. For this variable type, you need to supply a corpus of records that -contain your focal record and other field types. This corpus should either be -all the data you are trying to link or a representative sample. - -For example:- - -..
code:: python + namevariable.WesternName("field") - { - 'field': 'occupation', - 'type': 'FuzzyCategorical', - 'corpus' : [ - {'name' : 'Jim Doe', 'occupation' : 'Attorney'}, - {'name' : 'Jim Doe', 'occupation' : 'Lawyer'} - ] - } +To install: + +.. code:: console -Install the `dedupe-variable-fuzzycategory -`__ package for -the ``FuzzyCategorical`` Type. For more info, see the `GitHub Repository -`__. + pip install dedupe-variable-name Missing Data @@ -407,13 +355,13 @@ a ``None`` object. You should also use ``None`` to represent empty strings {'Name': None, 'Phone': '773-555-1123'} ] -If you want to model this missing data for a field, you can set ``'has -missing' : True`` in the variable definition. This creates a new, +If you want to model this missing data for a field, you can set +``has_missing=True`` in the variable definition. This creates a new, additional field representing whether the data was present or not and zeros out the missing data. -If there is missing data, but you did not declare ``'has -missing' : True`` then the missing data will simply be zeroed out and +If there is missing data, but you did not declare +``has_missing=True`` then the missing data will simply be zeroed out and no field will be created to account for missing data. This approach is called 'response augmented data' and is described in @@ -430,7 +378,7 @@ This approach makes a few assumptions that are usually not completely true: If you define an an interaction with a field that you declared to have -missing data, then ``has missing : True`` will also be set for the +missing data, then ``has_missing=True`` will also be set for the Interaction field. Longer example of a variable definition: @@ -438,12 +386,12 @@ Longer example of a variable definition: ..
code:: python [ - {'field': 'name', 'variable name' : 'name', 'type': 'String'}, - {'field': 'address', 'type': 'String'}, - {'field': 'city', 'variable name' : 'city', 'type': 'String'}, - {'field': 'zip', 'type': 'Custom', 'comparator' : same_or_not_comparator}, - {'field': 'cuisine', 'type': 'String', 'has missing': True} - {'type': 'Interaction', 'interaction variables' : ['name', 'city']} + dedupe.variables.String("name", name="name"), + dedupe.variables.String("address"), + dedupe.variables.String("city", name="city"), + dedupe.variables.Custom("zip", comparator=same_or_not_comparator), + dedupe.variables.String("cuisine", has_missing=True), + dedupe.variables.Interaction("name", "city") ] Multiple Variables comparing same field @@ -456,8 +404,8 @@ For example:- .. code:: python [ - {'field': 'name', 'type': 'String'}, - {'field': 'name', 'type': 'Text'} + dedupe.variables.String("name"), + dedupe.variables.Text("name") ] @@ -475,4 +423,4 @@ default edit distance. .. code:: python - {'field': 'name', 'type': 'String', 'crf': True} + dedupe.variables.String("name", crf=True) diff --git a/pyproject.toml b/pyproject.toml index 9cfe9afc9..0256eee68 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dedupe" description = "A python library for accurate and scaleable data deduplication and entity-resolution" -version = "2.0.24" +version = "3.0.0" readme = "README.md" requires-python = ">=3.7" license = {file = "LICENSE"} @@ -63,10 +63,11 @@ dedupe = ["py.typed"] [tool.mypy] plugins = "numpy.typing.mypy_plugin" -files = "dedupe" +files = ["dedupe"] show_error_codes = true ignore_missing_imports = true check_untyped_defs = true +implicit_reexport = false [tool.pytest.ini_options] minversion = "7.1" diff --git a/tests/test_api.py b/tests/test_api.py index 84ac9169a..4e6b92906 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -46,8 +46,8 @@ def icfi(x): class ActiveMatch(unittest.TestCase): def setUp(self): self.field_definition =
[ - {"field": "name", "type": "String"}, - {"field": "age", "type": "String"}, + dedupe.variables.String("name"), + dedupe.variables.String("age"), ] def test_initialize_fields(self): @@ -58,23 +58,26 @@ def test_initialize_fields(self): [], ) + with self.assertRaises(ValueError): + dedupe.api.ActiveMatching([{"field": "name", "type": "String"}]) + with self.assertRaises(ValueError): dedupe.api.ActiveMatching( - [{"field": "name", "type": "Custom", "comparator": lambda x, y: 1}], + [dedupe.variables.Custom("name", comparator=lambda x, y: 1)], ) with self.assertRaises(ValueError): dedupe.api.ActiveMatching( [ - {"field": "name", "type": "Custom", "comparator": lambda x, y: 1}, - {"field": "age", "type": "Custom", "comparator": lambda x, y: 1}, + dedupe.variables.Custom("name", comparator=lambda x, y: 1), + dedupe.variables.Custom("age", comparator=lambda x, y: 1), ], ) dedupe.api.ActiveMatching( [ - {"field": "name", "type": "Custom", "comparator": lambda x, y: 1}, - {"field": "age", "type": "String"}, + dedupe.variables.Custom("name", comparator=lambda x, y: 1), + dedupe.variables.String("age"), ], ) diff --git a/tests/test_core.py b/tests/test_core.py index 56d1ac010..d0bc8c94a 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -47,7 +47,7 @@ def setUp(self): ] ) - deduper = dedupe.Dedupe([{"field": "name", "type": "String"}]) + deduper = dedupe.Dedupe([dedupe.variables.String("name")]) self.data_model = deduper.data_model self.classifier = MockClassifier() @@ -104,7 +104,7 @@ def test_score_duplicates_with_zeros(self): class FieldDistances(unittest.TestCase): def test_exact_comparator(self): - deduper = dedupe.Dedupe([{"field": "name", "type": "Exact"}]) + deduper = dedupe.Dedupe([dedupe.variables.Exact("name")]) record_pairs = ( ({"name": "Shmoo"}, {"name": "Shmee"}), @@ -117,7 +117,7 @@ def test_exact_comparator(self): def test_comparator(self): deduper = dedupe.Dedupe( - [{"field": "type", "type": "Categorical", "categories": ["a", "b", "c"]}] + 
[dedupe.variables.Categorical("type", categories=["a", "b", "c"])] ) record_pairs = (({"type": "a"}, {"type": "b"}), ({"type": "a"}, {"type": "c"})) @@ -131,14 +131,11 @@ def test_comparator(self): def test_comparator_interaction(self): deduper = dedupe.Dedupe( [ - { - "field": "type", - "variable name": "type", - "type": "Categorical", - "categories": ["a", "b"], - }, - {"type": "Interaction", "interaction variables": ["type", "name"]}, - {"field": "name", "variable name": "name", "type": "Exact"}, + dedupe.variables.Categorical( + "type", categories=["a", "b"], name="type" + ), + dedupe.variables.Interaction("type", "name"), + dedupe.variables.Exact("name", name="name"), ] ) diff --git a/tests/test_dedupe.py b/tests/test_dedupe.py index e50af63cc..4a925e8b4 100644 --- a/tests/test_dedupe.py +++ b/tests/test_dedupe.py @@ -6,6 +6,7 @@ import numpy import dedupe +import dedupe.variables DATA = { 100: {"name": "Bob", "age": "50"}, @@ -37,9 +38,9 @@ def test_data_model(self): data_model = DataModel( [ - {"field": "a", "variable name": "a", "type": "String"}, - {"field": "b", "variable name": "b", "type": "String"}, - {"type": "Interaction", "interaction variables": ["a", "b"]}, + dedupe.variables.String(field="a", name="a"), + dedupe.variables.String(field="b", name="b"), + dedupe.variables.Interaction("a", "b"), ] ) @@ -47,14 +48,9 @@ def test_data_model(self): data_model = DataModel( [ - { - "field": "a", - "variable name": "a", - "type": "String", - "has missing": True, - }, - {"field": "b", "variable name": "b", "type": "String"}, - {"type": "Interaction", "interaction variables": ["a", "b"]}, + dedupe.variables.String(field="a", name="a", has_missing=True), + dedupe.variables.String(field="b", name="b"), + dedupe.variables.Interaction("a", "b"), ] ) @@ -62,14 +58,9 @@ def test_data_model(self): data_model = DataModel( [ - { - "field": "a", - "variable name": "a", - "type": "String", - "has missing": False, - }, - {"field": "b", "variable name": "b", "type": 
"String"}, - {"type": "Interaction", "interaction variables": ["a", "b"]}, + dedupe.variables.String(field="a", name="a", has_missing=False), + dedupe.variables.String(field="b", name="b"), + dedupe.variables.Interaction("a", "b"), ] ) diff --git a/tests/test_labeler.py b/tests/test_labeler.py index 30609ffae..8bbc2eab5 100644 --- a/tests/test_labeler.py +++ b/tests/test_labeler.py @@ -3,6 +3,7 @@ import pytest +import dedupe from dedupe import datamodel, labeler from dedupe._typing import RecordDictPair @@ -24,7 +25,7 @@ def freeze_record_pair(record_pair: RecordDictPair): class ActiveLearningTest(unittest.TestCase): def setUp(self): self.data_model = datamodel.DataModel( - [{"field": "name", "type": "String"}, {"field": "age", "type": "String"}] + [dedupe.variables.String("name"), dedupe.variables.String("age")] ) def test_AL(self): diff --git a/tests/test_serializer.py b/tests/test_serializer.py index ab8c0e471..7eb2d931e 100644 --- a/tests/test_serializer.py +++ b/tests/test_serializer.py @@ -53,7 +53,7 @@ def test_writeTraining(self): assert isinstance(loaded_training_pairs["distinct"][0][0]["bar"], frozenset) assert isinstance(loaded_training_pairs["distinct"][0][0]["baz"], tuple) - deduper = dedupe.Dedupe([{"field": "foo", "type": "String"}]) + deduper = dedupe.Dedupe([dedupe.variables.String("foo")]) deduper.classifier.cv = False encoded_file.seek(0) diff --git a/tests/test_training.py b/tests/test_training.py index b908dde0c..6b71f3aee 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -7,7 +7,7 @@ class TrainingTest(unittest.TestCase): def setUp(self): - field_definition = [{"field": "name", "type": "String"}] + field_definition = [dedupe.variables.String("name")] self.data_model = dedupe.Dedupe(field_definition).data_model self.training_pairs = { "match": [