From 03cf52f51a4d4a70974888c1eb49638ef01584f6 Mon Sep 17 00:00:00 2001
From: Forest Gregg <fgregg@datamade.us>
Date: Thu, 27 Jun 2024 07:38:20 -0400
Subject: [PATCH] Revert "Revert "more direct set up data model (#1193)""

This reverts commit 425eb203db51d105e9e831e119600e1a7634343d.
---
 .flake8                                      |   2 +-
 .pre-commit-config.yaml                      |   5 +
 CHANGELOG.md                                 |   5 +
 benchmarks/benchmarks/canonical.py           |  12 +-
 benchmarks/benchmarks/canonical_gazetteer.py |   9 +-
 benchmarks/benchmarks/canonical_matching.py  |   9 +-
 benchmarks/benchmarks/common.py              |   6 +-
 dedupe/__init__.py                           |  15 ++
 dedupe/_typing.py                            |  44 +--
 dedupe/api.py                                |   7 +-
 dedupe/convenience.py                        |   5 +-
 dedupe/core.py                               |   2 +-
 dedupe/datamodel.py                          | 170 +++++-------
 dedupe/predicates.py                         |   4 +-
 dedupe/training.py                           |   3 +-
 dedupe/variables/__init__.py                 |  26 +-
 dedupe/variables/base.py                     |  90 +++----
 dedupe/variables/categorical_type.py         |  22 +-
 dedupe/variables/exists.py                   |  10 +-
 dedupe/variables/interaction.py              |  19 +-
 dedupe/variables/set.py                      |  15 +-
 dedupe/variables/string.py                   |  26 +-
 docs/Variable-definition.rst                 | 270 ++++++++-----------
 pyproject.toml                               |   5 +-
 tests/test_api.py                            |  17 +-
 tests/test_core.py                           |  19 +-
 tests/test_dedupe.py                         |  29 +-
 tests/test_labeler.py                        |   3 +-
 tests/test_serializer.py                     |   2 +-
 tests/test_training.py                       |   2 +-
 30 files changed, 386 insertions(+), 467 deletions(-)

diff --git a/.flake8 b/.flake8
index 0e85dce10..7350ce301 100644
--- a/.flake8
+++ b/.flake8
@@ -1,3 +1,3 @@
 [flake8]
 max-line-length=160
-extend-ignore = E203
+extend-ignore = E203
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 320205765..4896e74f6 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,3 +8,8 @@ repos:
     hooks:
       - id: isort
         name: isort (python)
+  - repo: https://github.com/pycqa/flake8
+    rev: "7.1.0"
+    hooks:
+      - id: flake8
+        args: [--config=.flake8]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ef3a19bfc..8a9ee9c76 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,8 @@
+# 3.0.0
+- Developments in Python packaging made supporting the previous namespace approach for
+  variable plugins untenable. Since we had to redo the way we defined the data model,
+  we took the opportunity to explicitly instantiate variable objects.
+
 # 2.0.6
 - fixed bug that was preventing learning of index predicates in Dedupe mode
 
diff --git a/benchmarks/benchmarks/canonical.py b/benchmarks/benchmarks/canonical.py
index da075b9ba..6f23bb08d 100644
--- a/benchmarks/benchmarks/canonical.py
+++ b/benchmarks/benchmarks/canonical.py
@@ -32,17 +32,19 @@ def make_report(self, clustering):
         return make_report(self.data, clustering)
 
     def run(self, use_settings=False):
+        deduper: dedupe.StaticDedupe | dedupe.Dedupe
+
         if use_settings and os.path.exists(self.settings_file):
             with open(self.settings_file, "rb") as f:
                 deduper = dedupe.StaticDedupe(f)
 
         else:
             variables = [
-                {"field": "name", "type": "String"},
-                {"field": "name", "type": "Exact"},
-                {"field": "address", "type": "String"},
-                {"field": "cuisine", "type": "ShortString", "has missing": True},
-                {"field": "city", "type": "ShortString"},
+                dedupe.variables.String("name"),
+                dedupe.variables.Exact("name"),
+                dedupe.variables.String("address"),
+                dedupe.variables.ShortString("cuisine", has_missing=True),
+                dedupe.variables.ShortString("city"),
             ]
 
             deduper = dedupe.Dedupe(variables, num_cores=5)
diff --git a/benchmarks/benchmarks/canonical_gazetteer.py b/benchmarks/benchmarks/canonical_gazetteer.py
index bdbc51ba1..1f73b8e20 100644
--- a/benchmarks/benchmarks/canonical_gazetteer.py
+++ b/benchmarks/benchmarks/canonical_gazetteer.py
@@ -25,16 +25,17 @@ def make_report(self, clustering):
 
     def run(self, kwargs, use_settings=False):
         data_1, data_2 = self.data
+        gazetteer: dedupe.StaticGazetteer | dedupe.Gazetteer
 
         if use_settings and os.path.exists(self.settings_file):
             with open(self.settings_file, "rb") as f:
                 gazetteer = dedupe.StaticGazetteer(f)
         else:
             variables = [
-                {"field": "name", "type": "String"},
-                {"field": "address", "type": "String"},
-                {"field": "cuisine", "type": "String"},
-                {"field": "city", "type": "String"},
+                dedupe.variables.String("name"),
+                dedupe.variables.String("address"),
+                dedupe.variables.String("cuisine"),
+                dedupe.variables.String("city"),
             ]
 
             gazetteer = dedupe.Gazetteer(variables)
diff --git a/benchmarks/benchmarks/canonical_matching.py b/benchmarks/benchmarks/canonical_matching.py
index 471cd4988..b1c3c823b 100644
--- a/benchmarks/benchmarks/canonical_matching.py
+++ b/benchmarks/benchmarks/canonical_matching.py
@@ -42,16 +42,17 @@ def setup(self, kwargs):
 
     def run(self, kwargs, use_settings=False):
         data_1, data_2 = self.data
+        deduper: dedupe.StaticRecordLink | dedupe.RecordLink
 
         if use_settings and os.path.exists(self.settings_file):
             with open(self.settings_file, "rb") as f:
                 deduper = dedupe.StaticRecordLink(f)
         else:
             variables = [
-                {"field": "name", "type": "String"},
-                {"field": "address", "type": "String"},
-                {"field": "cuisine", "type": "String"},
-                {"field": "city", "type": "String"},
+                dedupe.variables.String("name"),
+                dedupe.variables.String("address"),
+                dedupe.variables.String("cuisine"),
+                dedupe.variables.String("city"),
             ]
             deduper = dedupe.RecordLink(variables)
             deduper.prepare_training(
diff --git a/benchmarks/benchmarks/common.py b/benchmarks/benchmarks/common.py
index afe993274..17e129f99 100644
--- a/benchmarks/benchmarks/common.py
+++ b/benchmarks/benchmarks/common.py
@@ -54,9 +54,9 @@ def get_true_dupes(data: dict) -> set:
         sorted(data.items(), key=lambda x: x[1]["unique_id"]),
         key=lambda x: x[1]["unique_id"],
     ):
-        pair = list(pair)
-        if len(pair) == 2:
-            a, b = pair
+        pair_l = list(pair)
+        if len(pair_l) == 2:
+            a, b = pair_l
             duplicates.add(frozenset((a[0], b[0])))
     return duplicates
 
diff --git a/dedupe/__init__.py b/dedupe/__init__.py
index 726836a72..7ef7d4c77 100644
--- a/dedupe/__init__.py
+++ b/dedupe/__init__.py
@@ -13,3 +13,18 @@
     training_data_link,
 )
 from dedupe.serializer import read_training, write_training  # noqa: F401
+
+__all__ = [
+    "Dedupe",
+    "Gazetteer",
+    "RecordLink",
+    "StaticDedupe",
+    "StaticGazetteer",
+    "StaticRecordLink",
+    "canonicalize",
+    "console_label",
+    "training_data_dedupe",
+    "training_data_link",
+    "read_training",
+    "write_training",
+]
diff --git a/dedupe/_typing.py b/dedupe/_typing.py
index 65922f7e5..9de9eb5f7 100644
--- a/dedupe/_typing.py
+++ b/dedupe/_typing.py
@@ -4,17 +4,18 @@
     TYPE_CHECKING,
     Any,
     Callable,
-    Collection,
     Dict,
     FrozenSet,
     Iterable,
     Iterator,
     List,
     Mapping,
+    MutableSequence,
     Sequence,
     Tuple,
     Type,
     Union,
+    runtime_checkable,
 )
 
 import numpy
@@ -72,6 +73,7 @@
 LookupResults = Union[LookupResultsInt, LookupResultsStr]
 JoinConstraint = Literal["one-to-one", "many-to-one", "many-to-many"]
 Comparator = Callable[[Any, Any], Union[Union[int, float], Sequence[Union[int, float]]]]
+CustomComparator = Callable[[Any, Any], Union[int, float]]
 Scores = Union[numpy.memmap, numpy.ndarray]
 Labels = List[Literal[0, 1]]
 LabelsLike = Iterable[Literal[0, 1]]
@@ -81,28 +83,10 @@
 ComparisonCover = Union[ComparisonCoverInt, ComparisonCoverStr]
 PredicateFunction = Callable[[Any], FrozenSet[str]]
 
-VariableDefinition = TypedDict(
-    "VariableDefinition",
-    {
-        "type": str,
-        "field": str,
-        "variable name": str,
-        "corpus": Iterable[Union[str, Collection[str]]],
-        "comparator": Callable[
-            [Any, Any], Union[int, float]
-        ],  # a custom comparator can only return a single float or int, not a sequence of numbers
-        "categories": List[str],
-        "interaction variables": List[str],
-        "has missing": bool,
-        "name": str,
-    },
-    total=False,
-)
-
 
 class TrainingData(TypedDict):
-    match: List[RecordDictPair]
-    distinct: List[RecordDictPair]
+    match: MutableSequence[RecordDictPair]
+    distinct: MutableSequence[RecordDictPair]
 
 
 # Takes pairs of records and generates a (n_samples X n_features) array
@@ -127,6 +111,24 @@ def close(self) -> None: ...
     def join(self) -> None: ...
 
 
+class Variable(Protocol):
+    name: str
+    predicates: List["Predicate"]
+    has_missing: bool
+
+    def __len__(self) -> int: ...
+
+
+@runtime_checkable
+class FieldVariable(Variable, Protocol):
+    field: str
+    comparator: Comparator
+
+
+class InteractionVariable(Variable, Protocol):
+    interaction_fields: List[str]
+
+
 MapLike = Callable[[Callable[[Any], Any], Iterable], Iterable]
 
 PathLike = Union[str, os.PathLike]
diff --git a/dedupe/api.py b/dedupe/api.py
index 1e2e1f438..9d17bc0b0 100644
--- a/dedupe/api.py
+++ b/dedupe/api.py
@@ -14,7 +14,7 @@
 import sqlite3
 import tempfile
 import warnings
-from typing import TYPE_CHECKING, cast, overload
+from typing import TYPE_CHECKING, Literal, cast, overload
 
 import numpy
 import sklearn.linear_model
@@ -27,7 +27,6 @@
 import dedupe.labeler as labeler
 import dedupe.predicates
 import dedupe.serializer as serializer
-from dedupe._typing import Literal
 
 if TYPE_CHECKING:
     from typing import (
@@ -70,7 +69,7 @@
         Scores,
         TrainingData,
         TupleLinks,
-        VariableDefinition,
+        Variable,
     )
 
 logger = logging.getLogger(__name__)
@@ -1117,7 +1116,7 @@ class ActiveMatching(Matching):
 
     def __init__(
         self,
-        variable_definition: Collection[VariableDefinition],
+        variable_definition: Collection[Variable],
         num_cores: int | None = None,
         in_memory: bool = False,
         **kwargs,
diff --git a/dedupe/convenience.py b/dedupe/convenience.py
index fb24de259..26b886a4a 100644
--- a/dedupe/convenience.py
+++ b/dedupe/convenience.py
@@ -7,7 +7,7 @@
 import random
 import sys
 import warnings
-from typing import Iterator, Tuple, overload
+from typing import Iterator, Literal, Tuple, overload
 
 import numpy
 
@@ -15,7 +15,6 @@
 from dedupe._typing import (
     DataInt,
     DataStr,
-    Literal,
     RecordDict,
     RecordDictPair,
     RecordID,
@@ -135,7 +134,7 @@ def console_label(deduper: dedupe.api.ActiveMatching) -> None:  # pragma: no cov
 
     finished = False
     use_previous = False
-    fields = unique(var.field for var in deduper.data_model.primary_variables)
+    fields = unique(var.field for var in deduper.data_model.field_variables)
 
     buffer_len = 1  # Max number of previous operations
     unlabeled: list[RecordDictPair] = []
diff --git a/dedupe/core.py b/dedupe/core.py
index a9ffd55b5..975c08469 100644
--- a/dedupe/core.py
+++ b/dedupe/core.py
@@ -22,6 +22,7 @@
         Generator,
         Iterable,
         Iterator,
+        Literal,
         Optional,
         Sequence,
         Type,
@@ -35,7 +36,6 @@
         ClosableJoinable,
         Data,
         FeaturizerFunction,
-        Literal,
         MapLike,
         RecordID,
         RecordIDDType,
diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py
index 49956bc6b..1b84b8aff 100644
--- a/dedupe/datamodel.py
+++ b/dedupe/datamodel.py
@@ -1,54 +1,80 @@
 from __future__ import annotations
 
 import copyreg
-import pkgutil
 import types
+from collections.abc import Mapping
 from typing import TYPE_CHECKING, cast
 
 import numpy
 
-import dedupe.variables
-from dedupe.variables.base import FieldType as FieldVariable
-from dedupe.variables.base import MissingDataType, Variable
+from dedupe._typing import FieldVariable
 from dedupe.variables.interaction import InteractionType
 
-for _, module, _ in pkgutil.iter_modules(  # type: ignore
-    dedupe.variables.__path__, "dedupe.variables."
-):
-    __import__(module)
-
 if TYPE_CHECKING:
-    from typing import Generator, Iterable, Sequence
+    from typing import Collection, Generator, Iterable, Sequence
 
     from dedupe._typing import (
         Comparator,
+        InteractionVariable,
         RecordDict,
         RecordDictPair,
-        VariableDefinition,
+        Variable,
     )
     from dedupe.predicates import Predicate
 
-VARIABLE_CLASSES = {k: v for k, v in FieldVariable.all_subclasses() if k}
 
+class DataModel:
+    version = 2
 
-class DataModel(object):
-    version = 1
+    def __init__(self, variable_definitions: Collection[Variable]):
+        for item in variable_definitions:
+            if isinstance(item, Mapping):
+                raise ValueError(
+                    "It looks like you are trying to use a variable definition "
+                    "composed of dictionaries. dedupe 3.0 uses variable objects "
+                    'directly. So instead of [{"field": "name", "type": "String"}] '
+                    'we now do [dedupe.variables.String("name")].'
+                )
 
-    def __init__(self, variable_definitions: Iterable[VariableDefinition]):
         variable_definitions = list(variable_definitions)
         if not variable_definitions:
             raise ValueError("The variable definitions cannot be empty")
-        all_variables: list[Variable]
-        self.primary_variables, all_variables = typify_variables(variable_definitions)
-        self._derived_start = len(all_variables)
+        if not any(variable.predicates for variable in variable_definitions):
+            raise ValueError(
+                "At least one of the variable types needs to be a type "
+                "other than 'Custom'. 'Custom' types have no associated "
+                "blocking rules"
+            )
+
+        # This is a protocol check, not a class inheritance check
+        self.field_variables: list[FieldVariable] = [
+            variable
+            for variable in variable_definitions
+            if isinstance(variable, FieldVariable)
+        ]
+
+        # we need to keep track of the ordering of variables in order to
+        # calculate derived fields like interaction and missing data
+        # fields.
+        columns: list[Variable] = []
+        for variable in self.field_variables:
+            if len(variable) == 1:
+                columns.append(variable)
+            elif len(variable) > 1:
+                assert hasattr(variable, "higher_vars")
+                columns.extend(variable.higher_vars)
 
-        all_variables += interactions(variable_definitions, self.primary_variables)
-        all_variables += missing(all_variables)
+        self._derived_start = len(columns)
 
-        self._missing_field_indices = missing_field_indices(all_variables)
-        self._interaction_indices = interaction_indices(all_variables)
+        # i'm not really satisfied with how we are dealing with interactions
+        # here. seems like there should be a cleaner path, but i don't see it
+        # today
+        columns += interactions(variable_definitions, self.field_variables)
 
-        self._len = len(all_variables)
+        self._missing_field_indices = missing_field_indices(columns)
+        self._interaction_indices = interaction_indices(columns)
+
+        self._len = len(columns) + len(self._missing_field_indices)
 
     def __len__(self) -> int:
         return self._len
@@ -63,7 +89,7 @@ def _field_comparators(
     ) -> Generator[tuple[str, Comparator, int, int], None, None]:
         start = 0
         stop = 0
-        for var in self.primary_variables:
+        for var in self.field_variables:
             stop = start + len(var)
             comparator = cast("Comparator", var.comparator)
             yield (var.field, comparator, start, stop)
@@ -72,7 +98,7 @@ def _field_comparators(
     @property
     def predicates(self) -> set[Predicate]:
         predicates = set()
-        for var in self.primary_variables:
+        for var in self.field_variables:
             for predicate in var.predicates:
                 predicates.add(predicate)
         return predicates
@@ -132,100 +158,26 @@ def __getstate__(self):
         return d
 
     def __setstate__(self, d):
-        version = d.pop("version", None)
+        version = d.pop("object_version", None)
         if version is None and "_variables" in d:
             d["_len"] = len(d.pop("_variables"))
             d["primary_variables"] = d.pop("primary_fields")
+        elif version == 1:
+            d["field_variables"] = d.pop("primary_variables")
 
         self.__dict__ = d
 
 
-def typify_variables(
-    variable_definitions: Iterable[VariableDefinition],
-) -> tuple[list[FieldVariable], list[Variable]]:
-    primary_variables: list[FieldVariable] = []
-    all_variables: list[Variable] = []
-    only_custom = True
-
-    for definition in variable_definitions:
-        try:
-            variable_type = definition["type"]
-        except TypeError:
-            raise TypeError(
-                "Incorrect variable specification: variable "
-                "specifications are dictionaries that must "
-                "include a type definition, ex. "
-                "{'field' : 'Phone', type: 'String'}"
-            )
-        except KeyError:
-            raise KeyError(
-                "Missing variable type: variable "
-                "specifications are dictionaries that must "
-                "include a type definition, ex. "
-                "{'field' : 'Phone', type: 'String'}"
-            )
-
-        if variable_type != "Custom":
-            only_custom = False
-
-        if variable_type == "Interaction":
-            continue
-
-        if variable_type == "FuzzyCategorical" and "other fields" not in definition:
-            definition["other fields"] = [  # type: ignore
-                d["field"]
-                for d in variable_definitions
-                if ("field" in d and d["field"] != definition["field"])
-            ]
-
-        try:
-            variable_class = VARIABLE_CLASSES[variable_type]
-        except KeyError:
-            raise KeyError(
-                "Field type %s not valid. Valid types include %s"
-                % (definition["type"], ", ".join(VARIABLE_CLASSES))
-            )
-
-        variable_object = variable_class(definition)
-        assert isinstance(variable_object, FieldVariable)
-
-        primary_variables.append(variable_object)
-
-        if hasattr(variable_object, "higher_vars"):
-            all_variables.extend(variable_object.higher_vars)
-        else:
-            variable_object = cast(Variable, variable_object)
-            all_variables.append(variable_object)
-
-    if only_custom:
-        raise ValueError(
-            "At least one of the variable types needs to be a type"
-            "other than 'Custom'. 'Custom' types have no associated"
-            "blocking rules"
-        )
-
-    return primary_variables, all_variables
-
-
-def missing(variables: list[Variable]) -> list[MissingDataType]:
-    missing_variables = []
-    for var in variables:
-        if var.has_missing:
-            missing_variables.append(MissingDataType(var.name))
-    return missing_variables
-
-
 def interactions(
-    definitions: Iterable[VariableDefinition], primary_variables: list[FieldVariable]
-) -> list[InteractionType]:
+    variables: Iterable[Variable], primary_variables: Iterable[FieldVariable]
+) -> list[InteractionVariable]:
     field_d = {field.name: field for field in primary_variables}
 
-    interactions = []
-    for definition in definitions:
-        if definition["type"] == "Interaction":
-            var = InteractionType(definition)
-            var.expandInteractions(field_d)
-            interactions.extend(var.higher_vars)
+    interactions: list[InteractionVariable] = []
+    for variable in variables:
+        if isinstance(variable, InteractionType):
+            variable.expandInteractions(field_d)
+            interactions.extend(variable.higher_vars)
     return interactions
 
 
diff --git a/dedupe/predicates.py b/dedupe/predicates.py
index 2d180ee8d..1e07c672e 100644
--- a/dedupe/predicates.py
+++ b/dedupe/predicates.py
@@ -17,9 +17,9 @@
 from dedupe.predicate_functions import *  # noqa: F401, F403
 
 if TYPE_CHECKING:
-    from typing import AbstractSet, Any, FrozenSet, Iterable, Mapping, Sequence
+    from typing import AbstractSet, Any, FrozenSet, Iterable, Literal, Mapping, Sequence
 
-    from dedupe._typing import Literal, PredicateFunction, RecordDict
+    from dedupe._typing import PredicateFunction, RecordDict
     from dedupe.index import Index
 
 
diff --git a/dedupe/training.py b/dedupe/training.py
index eb6fe0a04..98c9f28df 100644
--- a/dedupe/training.py
+++ b/dedupe/training.py
@@ -13,7 +13,7 @@
 from . import blocking, branch_and_bound
 
 if TYPE_CHECKING:
-    from typing import Iterable, Sequence
+    from typing import Iterable, Literal, Sequence
 
     from ._typing import (
         ComparisonCover,
@@ -23,7 +23,6 @@
         Data,
         DataInt,
         DataStr,
-        Literal,
     )
     from ._typing import RecordDictPairs as TrainingExamples
     from ._typing import RecordID, RecordIDPair
diff --git a/dedupe/variables/__init__.py b/dedupe/variables/__init__.py
index b36383a61..39c339c68 100644
--- a/dedupe/variables/__init__.py
+++ b/dedupe/variables/__init__.py
@@ -1,3 +1,25 @@
-from pkgutil import extend_path
+from .base import CustomType as Custom
+from .categorical_type import CategoricalType as Categorical
+from .exact import ExactType as Exact
+from .exists import ExistsType as Exists
+from .interaction import InteractionType as Interaction
+from .latlong import LatLongType as LatLong
+from .price import PriceType as Price
+from .set import SetType as Set
+from .string import ShortStringType as ShortString
+from .string import StringType as String
+from .string import TextType as Text
 
-__path__ = extend_path(__path__, __name__)
+__all__ = [
+    "Custom",
+    "Categorical",
+    "Exact",
+    "Exists",
+    "Interaction",
+    "LatLong",
+    "Price",
+    "Set",
+    "ShortString",
+    "String",
+    "Text",
+]
diff --git a/dedupe/variables/base.py b/dedupe/variables/base.py
index 109d3332c..71d6e722e 100644
--- a/dedupe/variables/base.py
+++ b/dedupe/variables/base.py
@@ -1,20 +1,21 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 
 from dedupe import predicates
 
 if TYPE_CHECKING:
-    from typing import Any, ClassVar, Generator, Iterable, Optional, Sequence, Type
+    from typing import Any, ClassVar, Iterable, Sequence, Type
 
-    from dedupe._typing import Comparator, PredicateFunction, VariableDefinition
+    from dedupe._typing import Comparator, CustomComparator, PredicateFunction
+    from dedupe._typing import Variable as VariableProtocol
 
 
 class Variable(object):
     name: str
     type: ClassVar[str]
     predicates: list[predicates.Predicate]
-    higher_vars: Sequence["Variable"]
+    higher_vars: Sequence["VariableProtocol"]
 
     def __len__(self) -> int:
         return 1
@@ -29,16 +30,8 @@ def __eq__(self, other: Any) -> bool:
         other_name: str = other.name
         return self.name == other_name
 
-    def __init__(self, definition: VariableDefinition):
-        if definition.get("has missing", False):
-            self.has_missing = True
-            try:
-                exists_pred = predicates.ExistsPredicate(definition["field"])
-                self.predicates.append(exists_pred)
-            except KeyError:
-                pass
-        else:
-            self.has_missing = False
+    def __init__(self, has_missing: bool = False):
+        self.has_missing = has_missing
 
     def __getstate__(self) -> dict[str, Any]:
         odict = self.__dict__.copy()
@@ -46,31 +39,13 @@ def __getstate__(self) -> dict[str, Any]:
 
         return odict
 
-    @classmethod
-    def all_subclasses(
-        cls,
-    ) -> Generator[tuple[Optional[str], Type["Variable"]], None, None]:
-        for q in cls.__subclasses__():
-            yield getattr(q, "type", None), q
-            for p in q.all_subclasses():
-                yield p
-
 
 class DerivedType(Variable):
     type = "Derived"
 
-    def __init__(self, definition: VariableDefinition):
-        self.name = "(%s: %s)" % (str(definition["name"]), str(definition["type"]))
-        super(DerivedType, self).__init__(definition)
-
-
-class MissingDataType(Variable):
-    type = "MissingData"
-
-    def __init__(self, name: str):
-        self.name = "(%s: Not Missing)" % name
-
-        self.has_missing = False
+    def __init__(self, name: str, var_type: str, **kwargs):
+        self.name = "(%s: %s)" % (str(name), str(var_type))
+        super().__init__(**kwargs)
 
 
 class FieldType(Variable):
@@ -80,13 +55,15 @@ class FieldType(Variable):
     _Predicate: Type[predicates.SimplePredicate] = predicates.SimplePredicate
     comparator: Comparator
 
-    def __init__(self, definition: VariableDefinition):
-        self.field = definition["field"]
+    def __init__(
+        self, field: str, name: Optional[str] = None, has_missing: bool = False
+    ):
+        self.field = field
 
-        if "variable name" in definition:
-            self.name = definition["variable name"]
-        else:
+        if name is None:
             self.name = "(%s: %s)" % (self.field, self.type)
+        else:
+            self.name = name
 
         self.predicates = [
             self._Predicate(pred, self.field) for pred in self._predicate_functions
@@ -96,30 +73,39 @@ def __init__(self, definition: VariableDefinition):
             self._index_predicates, self._index_thresholds, self.field
         )
 
-        super(FieldType, self).__init__(definition)
+        self.has_missing = has_missing
+        if self.has_missing:
+            exists_pred = predicates.ExistsPredicate(self.field)
+            self.predicates.append(exists_pred)
 
 
 class CustomType(FieldType):
     type = "Custom"
 
-    def __init__(self, definition: VariableDefinition):
-        super(CustomType, self).__init__(definition)
-
-        try:
-            self.comparator = definition["comparator"]  # type: ignore[assignment]
-        except KeyError:
-            raise KeyError(
-                "For 'Custom' field types you must define "
-                "a 'comparator' function in the field "
-                "definition. "
+    def __init__(
+        self,
+        field: str,
+        comparator: CustomComparator,
+        name: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(field, **kwargs)
+
+        if comparator is None:
+            raise ValueError(
+                "You must define a comparator function for the Custom class"
             )
+        else:
+            self.comparator = comparator
 
-        if "variable name" not in definition:
+        if name is None:
             self.name = "(%s: %s, %s)" % (
                 self.field,
                 self.type,
                 self.comparator.__name__,
             )
+        else:
+            self.name = name
 
 
 def indexPredicates(
diff --git a/dedupe/variables/categorical_type.py b/dedupe/variables/categorical_type.py
index b9d3ef66b..c2dc56768 100644
--- a/dedupe/variables/categorical_type.py
+++ b/dedupe/variables/categorical_type.py
@@ -1,9 +1,11 @@
 from __future__ import annotations
 
+from typing import Sequence
+
 from categorical import CategoricalComparator
 
 from dedupe import predicates
-from dedupe._typing import PredicateFunction, VariableDefinition
+from dedupe._typing import PredicateFunction
 from dedupe.variables.base import DerivedType, FieldType
 
 
@@ -11,26 +13,14 @@ class CategoricalType(FieldType):
     type = "Categorical"
     _predicate_functions: list[PredicateFunction] = [predicates.wholeFieldPredicate]
 
-    def _categories(self, definition: VariableDefinition) -> list[str]:
-        try:
-            categories = definition["categories"]
-        except KeyError:
-            raise ValueError('No "categories" defined')
-
-        return categories
-
-    def __init__(self, definition: VariableDefinition):
-        super(CategoricalType, self).__init__(definition)
-
-        categories = self._categories(definition)
+    def __init__(self, field: str, categories: Sequence[str], **kwargs):
+        super().__init__(field, **kwargs)
 
         self.comparator = CategoricalComparator(categories)  # type: ignore[assignment]
 
         self.higher_vars = []
         for higher_var in self.comparator.dummy_names:  # type: ignore[attr-defined]
-            dummy_var = DerivedType(
-                {"name": higher_var, "type": "Dummy", "has missing": self.has_missing}
-            )
+            dummy_var = DerivedType(higher_var, "Dummy", has_missing=False)
             self.higher_vars.append(dummy_var)
 
     def __len__(self) -> int:
diff --git a/dedupe/variables/exists.py b/dedupe/variables/exists.py
index 46c36c292..00ca7eb46 100644
--- a/dedupe/variables/exists.py
+++ b/dedupe/variables/exists.py
@@ -4,7 +4,7 @@
 
 from categorical import CategoricalComparator
 
-from dedupe._typing import PredicateFunction, VariableDefinition
+from dedupe._typing import PredicateFunction
 from dedupe.variables.base import DerivedType
 from dedupe.variables.categorical_type import CategoricalType
 
@@ -13,16 +13,14 @@ class ExistsType(CategoricalType):
     type = "Exists"
     _predicate_functions: list[PredicateFunction] = []
 
-    def __init__(self, definition: VariableDefinition):
-        super(CategoricalType, self).__init__(definition)
+    def __init__(self, field: str, **kwargs):
+        super(CategoricalType, self).__init__(field, **kwargs)  # skip CategoricalType.__init__, which requires `categories`
 
         self.cat_comparator = CategoricalComparator([0, 1])
 
         self.higher_vars = []
         for higher_var in self.cat_comparator.dummy_names:
-            dummy_var = DerivedType(
-                {"name": higher_var, "type": "Dummy", "has missing": self.has_missing}
-            )
+            dummy_var = DerivedType(higher_var, "Dummy", has_missing=self.has_missing)
             self.higher_vars.append(dummy_var)
 
     def comparator(self, field_1: Any, field_2: Any) -> list[int]:
diff --git a/dedupe/variables/interaction.py b/dedupe/variables/interaction.py
index b0370e667..e9fbbe591 100644
--- a/dedupe/variables/interaction.py
+++ b/dedupe/variables/interaction.py
@@ -1,24 +1,23 @@
 from __future__ import annotations
 
 import itertools
-from typing import Mapping
+from typing import List, Mapping
 
-from dedupe._typing import VariableDefinition
-from dedupe.variables.base import FieldType as FieldVariable
+from dedupe._typing import FieldVariable, InteractionVariable
 from dedupe.variables.base import Variable
 
 
 class InteractionType(Variable):
     type = "Interaction"
-    higher_vars: list["InteractionType"]
+    higher_vars: List[InteractionVariable]
 
-    def __init__(self, definition: VariableDefinition):
-        self.interactions = definition["interaction variables"]
+    def __init__(self, *args: str, **kwargs):
+        self.interactions = list(args)
 
         self.name = "(Interaction: %s)" % str(self.interactions)
         self.interaction_fields = self.interactions
 
-        super().__init__(definition)
+        super().__init__(**kwargs)
 
     def expandInteractions(self, field_model: Mapping[str, FieldVariable]) -> None:
         self.interaction_fields = self.atomicInteractions(
@@ -42,14 +41,12 @@ def categorical(self, field_model: Mapping[str, FieldVariable]) -> None:
             if not hasattr(field_model[field], "higher_vars")
         ]
 
-        dummies = [field_model[field].higher_vars for field in categoricals]
+        dummies = [field_model[field].higher_vars for field in categoricals]  # type: ignore[attr-defined]
 
         self.higher_vars = []
         for combo in itertools.product(*dummies):
             var_names = [field.name for field in combo] + noncategoricals
-            higher_var = InteractionType(
-                {"has missing": self.has_missing, "interaction variables": var_names}
-            )
+            higher_var = InteractionType(*var_names, has_missing=self.has_missing)
             self.higher_vars.append(higher_var)
 
     def atomicInteractions(
diff --git a/dedupe/variables/set.py b/dedupe/variables/set.py
index fddfa5c5e..8b8253e59 100644
--- a/dedupe/variables/set.py
+++ b/dedupe/variables/set.py
@@ -1,7 +1,8 @@
+from typing import Collection, Iterable, Optional
+
 from simplecosine.cosine import CosineSetSimilarity
 
 from dedupe import predicates
-from dedupe._typing import VariableDefinition
 from dedupe.variables.base import FieldType
 
 
@@ -24,10 +25,12 @@ class SetType(FieldType):
     )
     _index_thresholds = (0.2, 0.4, 0.6, 0.8)
 
-    def __init__(self, definition: VariableDefinition):
-        super(SetType, self).__init__(definition)
+    def __init__(
+        self, field: str, corpus: Optional[Iterable[Collection[str]]] = None, **kwargs
+    ):
+        super().__init__(field, **kwargs)
 
-        if "corpus" not in definition:
-            definition["corpus"] = []
+        if corpus is None:
+            corpus = []
 
-        self.comparator = CosineSetSimilarity(definition["corpus"])  # type: ignore[assignment]
+        self.comparator = CosineSetSimilarity(corpus)  # type: ignore[assignment]
diff --git a/dedupe/variables/string.py b/dedupe/variables/string.py
index 4272dba09..9a2bc8ab3 100644
--- a/dedupe/variables/string.py
+++ b/dedupe/variables/string.py
@@ -1,11 +1,11 @@
-from typing import Sequence, Type
+from typing import Iterable, Optional, Sequence, Type
 
 from affinegap import normalizedAffineGapDistance as affineGap
 from highered import CRFEditDistance
 from simplecosine.cosine import CosineTextSimilarity
 
 from dedupe import predicates
-from dedupe._typing import PredicateFunction, VariableDefinition
+from dedupe._typing import PredicateFunction
 from dedupe.variables.base import FieldType, indexPredicates
 
 crfEd = CRFEditDistance()
@@ -36,8 +36,8 @@ class BaseStringType(FieldType):
     _Predicate = predicates.StringPredicate
     _predicate_functions: Sequence[PredicateFunction] = ()
 
-    def __init__(self, definition: VariableDefinition):
-        super(BaseStringType, self).__init__(definition)
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
 
         self.predicates += indexPredicates(
             (
@@ -67,10 +67,12 @@ class ShortStringType(BaseStringType):
     ]
     _index_thresholds = (0.2, 0.4, 0.6, 0.8)
 
-    def __init__(self, definition: VariableDefinition):
-        super(ShortStringType, self).__init__(definition)
+    def __init__(
+        self, field: str, name: Optional[str] = None, crf: bool = False, **kwargs
+    ):
+        super().__init__(field, name=name, **kwargs)
 
-        if definition.get("crf", False) is True:
+        if crf:
             self.comparator = crfEd  # type: ignore[assignment]
         else:
             self.comparator = affineGap  # type: ignore[assignment]
@@ -98,10 +100,10 @@ class TextType(BaseStringType):
     ]
     _index_thresholds = (0.2, 0.4, 0.6, 0.8)
 
-    def __init__(self, definition: VariableDefinition):
-        super(TextType, self).__init__(definition)
+    def __init__(self, field: str, corpus: Optional[Iterable[str]] = None, **kwargs):
+        super().__init__(field, **kwargs)
 
-        if "corpus" not in definition:
-            definition["corpus"] = []
+        if corpus is None:
+            corpus = []
 
-        self.comparator = CosineTextSimilarity(definition["corpus"])  # type: ignore[assignment]
+        self.comparator = CosineTextSimilarity(corpus)  # type: ignore[assignment]
diff --git a/docs/Variable-definition.rst b/docs/Variable-definition.rst
index 73abcccc2..5e6d19023 100644
--- a/docs/Variable-definition.rst
+++ b/docs/Variable-definition.rst
@@ -3,31 +3,30 @@
 Variable Definitions
 ====================
 
-Variable Types
---------------
+Variables
+---------
 
-A variable definition describes the records that you want to match. It is
-a dictionary where the keys are the fields and the values are the
-field specification. For example:-
+A variable definition describes the records that you want to match. It is a
+sequence of Variable objects. For example:-
 
 .. code:: python
 
+    import dedupe.variables
+	  
     [
-        {'field': 'Site name', 'type': 'String'},
-        {'field': 'Address', 'type': 'String'},
-        {'field': 'Zip', 'type': 'ShortString', 'has missing': True},
-        {'field': 'Phone', 'type': 'String', 'has missing': True}
+        dedupe.variables.String("Site Name"),
+        dedupe.variables.String("Address"),
+	dedupe.variables.ShortString("Zip", has_missing=True),
+	dedupe.variables.String("Phone", has_missing=True)
     ]
 
 
-String Types
-^^^^^^^^^^^^
+String
+^^^^^^
 
-A ``String`` type field must declare the name of the record field to compare
-a ``String`` type declaration. The ``String`` type expects fields to be of
-class string.
+The ``String`` variable takes the key of the record field to compare.
 
-``String`` types are compared using string edit distance, specifically
+``String`` variables are compared using string edit distance, specifically
 `affine gap string distance <http://en.wikipedia.org/wiki/Gap_penalty#Affine>`__.
 This is a good metric for measuring fields that might have typos in them,
 such as "John" vs "Jon".
@@ -36,44 +35,45 @@ For example:-
 
 .. code:: python
 
-  {'field': 'Address', type: 'String'}
+  dedupe.variables.String("Address")
 
-ShortString Types
-^^^^^^^^^^^^^^^^^
+ShortString
+^^^^^^^^^^^
 
-A ``ShortString`` type field is just like ``String`` types except that dedupe
+The ``ShortString`` variable is just like the ``String`` variable except that dedupe
 will not try to learn any :ref:`index blocking rules <index-blocks-label>` for these fields, which can
 speed up the training phase considerably.
 
-Zip codes and city names are good candidates for this type. If in doubt,
+Zip codes and city names are good candidates for this variable. If in doubt,
 always use ``String``.
 
 For example:-
 
 .. code:: python
 
-  {'field': 'Zipcode', type: 'ShortString'}
+  dedupe.variables.ShortString("Zipcode")
 
 .. _text-types-label:
 
-Text Types
-^^^^^^^^^^
+Text
+^^^^
 
 If you want to compare fields containing blocks of text e.g. product
-descriptions or article abstracts, you should use this type. ``Text`` type
-fields are compared using the `cosine similarity metric
+descriptions or article abstracts, you should use this variable. ``Text``
+variables are compared using the `cosine similarity metric
 <http://en.wikipedia.org/wiki/Vector_space_model>`__.
 
 This is a measurement of the amount of words that two documents have in
 common. This measure can be made more useful as the overlap of rare words
 counts more than the overlap of common words.
 
-Compare this to ``String`` and ``ShortString`` types: For strings containing
-occupations, "yoga teacher" might be fairly similar to "yoga instructor" when
-using the ``Text`` measurement, because they both contain the relatively
-rare word of "yoga". However, if you compared these two strings using the
-``String`` or ``ShortString`` measurements, they might be considered fairly
-dis-similar, because the actual string edit distance between them is large.
+Compare this to ``String`` and ``ShortString`` variables: For strings
+containing occupations, "yoga teacher" might be fairly similar to
+"yoga instructor" when using the ``Text`` measurement, because they
+both contain the relatively rare word of "yoga". However, if you
+compared these two strings using the ``String`` or ``ShortString``
+measurements, they might be considered fairly dissimilar, because the
+actual string edit distance between them is large.
 
 
 If provided a sequence of example fields (i.e. a corpus) then dedupe will
@@ -81,29 +81,27 @@ learn these weights for you. For example:-
 
 .. code:: python
 
-   {
-    'field': 'Product description',
-    'type': 'Text', 
-    'corpus' : [
-            'this product is great',
-            'this product is great and blue'
-        ]
-   } 
+   dedupe.variables.Text("Product description",
+                         corpus=[
+                                 'this product is great',
+                                 'this product is great and blue'
+                                ]
+			)
 
 If you don't want to adjust the measure to your data, just leave 'corpus' out
 of the variable definition entirely.
 
 .. code:: python
 
-   {'field': 'Product description', 'type': 'Text'} 
+   dedupe.variables.Text("Product description") 
 
 
-Custom Types
+Custom
 ^^^^^^^^^^^^
 
-A ``Custom`` type field must have specify the field it wants to compare, a
-type declaration of ``Custom``, and a comparator declaration. The comparator
-must be a function that can take in two field values and return a number.
+A ``Custom`` variable allows you to use a custom function for
+comparing fields. The function must take two field values and return a
+number.
 
 For example, a custom comparator:
 
@@ -120,65 +118,53 @@ The corresponding variable definition:
 
 .. code:: python
 
-    {
-        'field': 'Zip',
-        'type': 'Custom', 
-        'comparator': same_or_not_comparator
-     }
+    dedupe.variables.Custom("Zip", comparator=same_or_not_comparator)
 
-``Custom`` fields do not have any blocking rules associated with them.
+``Custom`` variables do not have any blocking rules associated with them.
 Since dedupe needs blocking rules, a data model that only contains ``Custom``
 fields will raise an error.
 
 LatLong
 ^^^^^^^
 
-A ``LatLong`` type field must have as the name of a field and a type
-declaration of ``LatLong``. ``LatLong`` fields are compared using the `Haversine
+``LatLong`` variables are compared using the `Haversine
 Formula <http://en.wikipedia.org/wiki/Haversine_formula>`__. 
 
-A ``LatLong``
-type field must consist of tuples of floats corresponding to a latitude and a
-longitude.
+A ``LatLong`` variable field must consist of tuples of floats
+corresponding to a latitude and a longitude.
 
 .. code:: python
 
-    {'field': 'Location', 'type': 'LatLong'}
+    dedupe.variables.LatLong("location")
 
 Set
 ^^^
 
-A ``Set`` type field is for comparing lists of elements, like keywords or
-client names. ``Set`` types are very similar to :ref:`text-types-label`. They
+``Set`` variables are for comparing lists of elements, like keywords or
+client names. ``Set`` variables are very similar to :ref:`text-types-label`. They
 use the same comparison function and you can also let dedupe learn which
 terms are common or rare by providing a corpus. Within a record, a ``Set``
-type field has to be hashable sequences like tuples or frozensets.
+variable field has to be hashable sequences like tuples or frozensets.
 
 .. code:: python
 
-    {
-        'field': 'Co-authors',
-        'type': 'Set',
-        'corpus' : [
-                ('steve edwards'),
-                ('steve edwards', 'steve jobs')
-            ]
-     } 
+    dedupe.variables.Set("Co-authors",
+                         corpus=[
+                                 ('steve edwards',),
+                                 ('steve edwards', 'steve jobs')
+                                ])
 
 or
 
 .. code:: python
 
-    {'field': 'Co-authors', 'type': 'Set'}
+    dedupe.variables.Set("Co-authors")
 
 Interaction
 ^^^^^^^^^^^
 
-An ``Interaction`` field multiplies the values of the multiple variables.
-An ``Interaction`` variable is created with type declaration of
-``Interaction`` and an ``interaction variables`` declaration.
-
-The ``interaction variables`` field must be a sequence of variable names of
+An ``Interaction`` variable multiplies the values of multiple variables.
+The arguments to an ``Interaction`` variable must be the variable names of
 other fields you have defined in your variable definition.
 
 `Interactions <http://en.wikipedia.org/wiki/Interaction_%28statistics%29>`__
@@ -187,10 +173,9 @@ are good when the effect of two predictors is not simply additive.
 .. code:: python
 
     [
-        { 'field': 'Name', 'variable name': 'name', 'type': 'String' },
-        { 'field': 'Zip', 'variable name': 'zip', 'type': 'Custom', 
-      'comparator' : same_or_not_comparator },
-        {'type': 'Interaction', 'interaction variables': ['name', 'zip']}
+        dedupe.variables.String("Name", name="name"),
+        dedupe.variables.Custom("Zip", comparator=same_or_not_comparator, name="zip"),
+	dedupe.variables.Interaction("name", "zip")
     ]
 
 Exact
@@ -200,7 +185,7 @@ Exact
 
 .. code:: python
 
-    {'field': 'city', 'type': 'Exact'}
+    dedupe.variables.Exact("city")
 
 
 Exists
@@ -216,7 +201,7 @@ different cases:
 
 .. code:: python
 
-    {'field': 'first_name', 'type': 'Exists'} 
+    dedupe.variables.Exists("first_name")
 
 
 
@@ -254,11 +239,7 @@ You would create a definition such as:
 
 .. code:: python
 
-    {
-        'field': 'Business Type',
-        'type': 'Categorical',
-        'categories' : ['taxi', 'lawyer']
-    }
+    dedupe.variables.Categorical("Business Type", categories=['taxi', 'lawyer'])
 
 Price
 ^^^^^
@@ -269,7 +250,7 @@ prices. The values of ``Price`` field must be a positive float. If the value is
 
 .. code:: python
 
-    {'field': 'cost', 'type': 'Price'}
+    dedupe.variables.Price("cost")
 
 Optional Variables
 ------------------
@@ -286,8 +267,8 @@ DateTime
 ``DateTime`` variables are useful for comparing dates and timestamps. This
 variable can accept strings or Python datetime objects as inputs.
 
-The ``DateTime`` variable definition accepts a few optional arguments that
-can help improve behavior if you know your field follows an unusual format:
+The ``DateTime`` variable accepts a few optional arguments that can help
+improve behavior if you know your field follows an unusual format:
 
 * :code:`fuzzy` - Use fuzzy parsing to automatically extract dates from strings like "It happened on June 2nd, 2018" (default :code:`True`)
 * :code:`dayfirst` - Ambiguous dates should be parsed as dd/mm/yy (default :code:`False`)
@@ -297,34 +278,24 @@ Note that the ``DateTime`` variable defaults to mm/dd/yy for ambiguous dates.
 If both :code:`dayfirst` and :code:`yearfirst` are set to :code:`True`, then
 :code:`dayfirst` will take precedence.
 
-For example, a sample ``DateTime`` variable definition, using the defaults:
 
 .. code:: python
 
-    {
-        'field': 'time_of_sale',
-        'type': 'DateTime',
-        'fuzzy': True,
-        'dayfirst': False,
-        'yearfirst': False
-    }
+    import datetimetype
 
-If you're happy with the defaults, you can simply define the :code:`field`
-and :code:`type`:
+    datetimetype.DateTime("field")
 
-.. code:: python
+To install:
 
-    {'field': 'time_of_sale', 'type': 'DateTime'}
+.. code:: console
 
-Install the `dedupe-variable-datetime
-<https://pypi.python.org/pypi/dedupe-variable-datetime>`__ package for
-``DateTime`` Type. For more info, see the `GitHub Repository
-<https://github.com/dedupeio/dedupe-variable-datetime>`__.
+    pip install dedupe-variable-datetime
 
-Address Type
-^^^^^^^^^^^^
 
-An ``Address`` variable should be used for United States addresses. It uses
+Address
+^^^^^^^
+
+A ``USAddress`` variable should be used for United States addresses. It uses
 the `usaddress <https://usaddress.readthedocs.io/en/latest/>`__ package to
 split apart an address string into components like address number, street
 name, and street type and compares component to component.
@@ -333,18 +304,22 @@ For example:-
 
 .. code:: python
 
-    {'field': 'address', 'type': 'Address'}
+    import addressvariable
+	  
+    addressvariable.USAddress("address")
+
+
+To install:
 
+.. code:: console
 
-Install the `dedupe-variable-address
-<https://pypi.python.org/pypi/dedupe-variable-address>`__ package for
-``Address`` Type. For more info, see the `GitHub Repository
-<https://github.com/dedupeio/dedupe-variable-address>`__.
+    pip install dedupe-variable-address
 
-Name Type
-^^^^^^^^^
 
-A ``Name`` variable should be used for a field that contains American names,
+Name
+^^^^
+
+A ``WesternName`` variable should be used for a field that contains American names,
 corporations and households. It uses the `probablepeople
 <https://probablepeople.readthedocs.io/en/latest/>`__ package to split apart
 an name string into components like give name, surname, generational suffix,
@@ -355,42 +330,15 @@ For example:-
 
 .. code:: python
 
-    {'field': 'name', 'type': 'Name'}
-
-
-Install the `dedupe-variable-name
-<https://pypi.python.org/pypi/dedupe-variable-name>`__ package for ``Name``
-Type. For more info, see the `GitHub Repository
-<https://github.com/dedupeio/dedupe-variable-name>`__.
-
-Fuzzy Category
-^^^^^^^^^^^^^^
-
-A ``FuzzyCategorical`` variable should be used for when you for
-categorical data that has variations.
+    import namevariable
 
-Occupations are an example, where the you may have 'Attorney', 'Counsel', and
-'Lawyer'. For this variable type, you need to supply a corpus of records that
-contain your focal record and other field types. This corpus should either be
-all the data you are trying to link or a representative sample.
-
-For example:-
-
-.. code:: python
+    namevariable.WesternName("field")
 
-    {
-     'field': 'occupation',
-     'type': 'FuzzyCategorical',
-     'corpus' : [
-            {'name' : 'Jim Doe', 'occupation' : 'Attorney'},
-            {'name' : 'Jim Doe', 'occupation' : 'Lawyer'}
-        ]
-    }
+To install: 
+    
+.. code:: console
 
-Install the `dedupe-variable-fuzzycategory
-<https://pypi.python.org/pypi/dedupe-variable-fuzzycategory>`__ package for
-the ``FuzzyCategorical`` Type. For more info, see the `GitHub Repository
-<https://github.com/dedupeio/fuzzycategory>`__.
+    pip install dedupe-variable-name
 
 
 Missing Data 
@@ -407,13 +355,13 @@ a ``None`` object. You should also use ``None`` to represent empty strings
         {'Name': None, 'Phone': '773-555-1123'}
    ]
 
-If you want to model this missing data for a field, you can set ``'has
-missing' : True`` in the variable definition. This creates a new,
+If you want to model this missing data for a field, you can set
+``has_missing=True`` in the variable definition. This creates a new,
 additional field representing whether the data was present or not and
 zeros out the missing data.
 
-If there is missing data, but you did not declare ``'has
-missing' : True`` then the missing data will simply be zeroed out and
+If there is missing data, but you did not declare
+``has_missing=True`` then the missing data will simply be zeroed out and
 no field will be created to account for missing data.
 
 This approach is called 'response augmented data' and is described in
@@ -430,7 +378,7 @@ This approach makes a few assumptions that are usually not completely true:
 
 
 If you define an an interaction with a field that you declared to have
-missing data, then ``has missing : True`` will also be set for the
+missing data, then ``has_missing=True`` will also be set for the
 Interaction field.
 
 Longer example of a variable definition:
@@ -438,12 +386,12 @@ Longer example of a variable definition:
 .. code:: python
 
     [
-        {'field': 'name', 'variable name' : 'name', 'type': 'String'},
-        {'field': 'address', 'type': 'String'},
-        {'field': 'city', 'variable name' : 'city', 'type': 'String'},
-        {'field': 'zip', 'type': 'Custom', 'comparator' : same_or_not_comparator},
-        {'field': 'cuisine', 'type': 'String', 'has missing': True}
-        {'type': 'Interaction', 'interaction variables' : ['name', 'city']}
+        dedupe.variables.String("name", name="name"),
+        dedupe.variables.String("address"),
+        dedupe.variables.String("city", name="city"),
+        dedupe.variables.Custom("zip", comparator=same_or_not_comparator),
+        dedupe.variables.String("cuisine", has_missing=True),
+        dedupe.variables.Interaction("name", "city")
     ]
 
 Multiple Variables comparing same field
@@ -456,8 +404,8 @@ For example:-
 .. code:: python
 
     [
-        {'field': 'name', 'type': 'String'},
-        {'field': 'name', 'type': 'Text'}
+        dedupe.variables.String("name"),
+	dedupe.variables.Text("name")
     ]
 
 
@@ -475,4 +423,4 @@ default edit distance.
 
 .. code:: python
 
-    {'field': 'name', 'type': 'String', 'crf': True}
+    dedupe.variables.String("name", crf=True)
diff --git a/pyproject.toml b/pyproject.toml
index 9cfe9afc9..0256eee68 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "dedupe"
 description = "A python library for accurate and scaleable data deduplication and entity-resolution"
-version = "2.0.24"
+version = "3.0.0"
 readme = "README.md"
 requires-python = ">=3.7"
 license = {file = "LICENSE"}
@@ -63,10 +63,11 @@ dedupe = ["py.typed"]
 
 [tool.mypy]
 plugins = "numpy.typing.mypy_plugin"
-files = "dedupe"
+files = ["dedupe"]
 show_error_codes = true
 ignore_missing_imports = true
 check_untyped_defs = true
+implicit_reexport = false
 
 [tool.pytest.ini_options]
 minversion = "7.1"
diff --git a/tests/test_api.py b/tests/test_api.py
index 84ac9169a..4e6b92906 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -46,8 +46,8 @@ def icfi(x):
 class ActiveMatch(unittest.TestCase):
     def setUp(self):
         self.field_definition = [
-            {"field": "name", "type": "String"},
-            {"field": "age", "type": "String"},
+            dedupe.variables.String("name"),
+            dedupe.variables.String("age"),
         ]
 
     def test_initialize_fields(self):
@@ -58,23 +58,26 @@ def test_initialize_fields(self):
                 [],
             )
 
+        with self.assertRaises(ValueError):
+            dedupe.api.ActiveMatching([{"field": "name", "type": "String"}])
+
         with self.assertRaises(ValueError):
             dedupe.api.ActiveMatching(
-                [{"field": "name", "type": "Custom", "comparator": lambda x, y: 1}],
+                [dedupe.variables.Custom("name", comparator=lambda x, y: 1)],
             )
 
         with self.assertRaises(ValueError):
             dedupe.api.ActiveMatching(
                 [
-                    {"field": "name", "type": "Custom", "comparator": lambda x, y: 1},
-                    {"field": "age", "type": "Custom", "comparator": lambda x, y: 1},
+                    dedupe.variables.Custom("name", comparator=lambda x, y: 1),
+                    dedupe.variables.Custom("age", comparator=lambda x, y: 1),
                 ],
             )
 
         dedupe.api.ActiveMatching(
             [
-                {"field": "name", "type": "Custom", "comparator": lambda x, y: 1},
-                {"field": "age", "type": "String"},
+                dedupe.variables.Custom("name", comparator=lambda x, y: 1),
+                dedupe.variables.String("age"),
             ],
         )
 
diff --git a/tests/test_core.py b/tests/test_core.py
index 56d1ac010..d0bc8c94a 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -47,7 +47,7 @@ def setUp(self):
             ]
         )
 
-        deduper = dedupe.Dedupe([{"field": "name", "type": "String"}])
+        deduper = dedupe.Dedupe([dedupe.variables.String("name")])
         self.data_model = deduper.data_model
         self.classifier = MockClassifier()
 
@@ -104,7 +104,7 @@ def test_score_duplicates_with_zeros(self):
 
 class FieldDistances(unittest.TestCase):
     def test_exact_comparator(self):
-        deduper = dedupe.Dedupe([{"field": "name", "type": "Exact"}])
+        deduper = dedupe.Dedupe([dedupe.variables.Exact("name")])
 
         record_pairs = (
             ({"name": "Shmoo"}, {"name": "Shmee"}),
@@ -117,7 +117,7 @@ def test_exact_comparator(self):
 
     def test_comparator(self):
         deduper = dedupe.Dedupe(
-            [{"field": "type", "type": "Categorical", "categories": ["a", "b", "c"]}]
+            [dedupe.variables.Categorical("type", categories=["a", "b", "c"])]
         )
 
         record_pairs = (({"type": "a"}, {"type": "b"}), ({"type": "a"}, {"type": "c"}))
@@ -131,14 +131,11 @@ def test_comparator(self):
     def test_comparator_interaction(self):
         deduper = dedupe.Dedupe(
             [
-                {
-                    "field": "type",
-                    "variable name": "type",
-                    "type": "Categorical",
-                    "categories": ["a", "b"],
-                },
-                {"type": "Interaction", "interaction variables": ["type", "name"]},
-                {"field": "name", "variable name": "name", "type": "Exact"},
+                dedupe.variables.Categorical(
+                    "type", categories=["a", "b"], name="type"
+                ),
+                dedupe.variables.Interaction("type", "name"),
+                dedupe.variables.Exact("name", name="name"),
             ]
         )
 
diff --git a/tests/test_dedupe.py b/tests/test_dedupe.py
index e50af63cc..4a925e8b4 100644
--- a/tests/test_dedupe.py
+++ b/tests/test_dedupe.py
@@ -6,6 +6,7 @@
 import numpy
 
 import dedupe
+import dedupe.variables
 
 DATA = {
     100: {"name": "Bob", "age": "50"},
@@ -37,9 +38,9 @@ def test_data_model(self):
 
         data_model = DataModel(
             [
-                {"field": "a", "variable name": "a", "type": "String"},
-                {"field": "b", "variable name": "b", "type": "String"},
-                {"type": "Interaction", "interaction variables": ["a", "b"]},
+                dedupe.variables.String(field="a", name="a"),
+                dedupe.variables.String(field="b", name="b"),
+                dedupe.variables.Interaction("a", "b"),
             ]
         )
 
@@ -47,14 +48,9 @@ def test_data_model(self):
 
         data_model = DataModel(
             [
-                {
-                    "field": "a",
-                    "variable name": "a",
-                    "type": "String",
-                    "has missing": True,
-                },
-                {"field": "b", "variable name": "b", "type": "String"},
-                {"type": "Interaction", "interaction variables": ["a", "b"]},
+                dedupe.variables.String(field="a", name="a", has_missing=True),
+                dedupe.variables.String(field="b", name="b"),
+                dedupe.variables.Interaction("a", "b"),
             ]
         )
 
@@ -62,14 +58,9 @@ def test_data_model(self):
 
         data_model = DataModel(
             [
-                {
-                    "field": "a",
-                    "variable name": "a",
-                    "type": "String",
-                    "has missing": False,
-                },
-                {"field": "b", "variable name": "b", "type": "String"},
-                {"type": "Interaction", "interaction variables": ["a", "b"]},
+                dedupe.variables.String(field="a", name="a", has_missing=False),
+                dedupe.variables.String(field="b", name="b"),
+                dedupe.variables.Interaction("a", "b"),
             ]
         )
 
diff --git a/tests/test_labeler.py b/tests/test_labeler.py
index 30609ffae..8bbc2eab5 100644
--- a/tests/test_labeler.py
+++ b/tests/test_labeler.py
@@ -3,6 +3,7 @@
 
 import pytest
 
+import dedupe
 from dedupe import datamodel, labeler
 from dedupe._typing import RecordDictPair
 
@@ -24,7 +25,7 @@ def freeze_record_pair(record_pair: RecordDictPair):
 class ActiveLearningTest(unittest.TestCase):
     def setUp(self):
         self.data_model = datamodel.DataModel(
-            [{"field": "name", "type": "String"}, {"field": "age", "type": "String"}]
+            [dedupe.variables.String("name"), dedupe.variables.String("age")]
         )
 
     def test_AL(self):
diff --git a/tests/test_serializer.py b/tests/test_serializer.py
index ab8c0e471..7eb2d931e 100644
--- a/tests/test_serializer.py
+++ b/tests/test_serializer.py
@@ -53,7 +53,7 @@ def test_writeTraining(self):
         assert isinstance(loaded_training_pairs["distinct"][0][0]["bar"], frozenset)
         assert isinstance(loaded_training_pairs["distinct"][0][0]["baz"], tuple)
 
-        deduper = dedupe.Dedupe([{"field": "foo", "type": "String"}])
+        deduper = dedupe.Dedupe([dedupe.variables.String("foo")])
         deduper.classifier.cv = False
 
         encoded_file.seek(0)
diff --git a/tests/test_training.py b/tests/test_training.py
index b908dde0c..6b71f3aee 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -7,7 +7,7 @@
 
 class TrainingTest(unittest.TestCase):
     def setUp(self):
-        field_definition = [{"field": "name", "type": "String"}]
+        field_definition = [dedupe.variables.String("name")]
         self.data_model = dedupe.Dedupe(field_definition).data_model
         self.training_pairs = {
             "match": [