diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index c9a5271fc..bbd43adbc 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -5,6 +5,13 @@ on: [push, pull_request] jobs: lint: runs-on: ubuntu-latest + env: + FORCE_COLOR: 1 + PY_COLORS: 1 + TERM: xterm-color + MYPY_FORCE_COLOR: 1 + MYPY_FORCE_TERMINAL_WIDTH: 200 + PYTEST_ADDOPTS: --color=yes steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v2 @@ -13,7 +20,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip - pip install -e . --config-settings editable_mode=compat --use-pep517 + pip install -e . pip install -r requirements.txt - name: flake8 run: flake8 dedupe tests benchmarks/benchmarks @@ -27,6 +34,13 @@ jobs: if: always() run: mypy test: + env: + FORCE_COLOR: 1 + PY_COLORS: 1 + TERM: xterm-color + MYPY_FORCE_COLOR: 1 + MYPY_FORCE_TERMINAL_WIDTH: 200 + PYTEST_ADDOPTS: --color=yes timeout-minutes: 40 runs-on: ${{ matrix.os }} strategy: @@ -44,7 +58,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip - pip install -e . --config-settings editable_mode=compat --use-pep517 + pip install -e . - name: Install test dependencies run: pip install -r requirements.txt - name: pytest @@ -89,7 +103,7 @@ jobs: - name: Install dependencies run: | pip install --upgrade pip - pip install -e . --config-settings editable_mode=compat + pip install -e . python -m pip install -e ./benchmarks - name: Run on canonical on main run: python benchmarks/benchmarks/canonical.py @@ -98,7 +112,7 @@ jobs: with: clean: false - name: Install any new dependencies - run: pip install -e . --config-settings editable_mode=compat --use-pep517 + run: pip install -e . 
- name: Run on canonical with setting file created on main run: python benchmarks/benchmarks/canonical.py wheels: diff --git a/dedupe/__init__.py b/dedupe/__init__.py index cf75318a5..726836a72 100644 --- a/dedupe/__init__.py +++ b/dedupe/__init__.py @@ -1,7 +1,15 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- -from pkgutil import extend_path - -__path__ = extend_path(__path__, __name__) - -from dedupe._init import * # noqa +from dedupe.api import ( # noqa: F401 + Dedupe, + Gazetteer, + RecordLink, + StaticDedupe, + StaticGazetteer, + StaticRecordLink, +) +from dedupe.convenience import ( # noqa: F401 + canonicalize, + console_label, + training_data_dedupe, + training_data_link, +) +from dedupe.serializer import read_training, write_training # noqa: F401 diff --git a/dedupe/_init.py b/dedupe/_init.py deleted file mode 100644 index 726836a72..000000000 --- a/dedupe/_init.py +++ /dev/null @@ -1,15 +0,0 @@ -from dedupe.api import ( # noqa: F401 - Dedupe, - Gazetteer, - RecordLink, - StaticDedupe, - StaticGazetteer, - StaticRecordLink, -) -from dedupe.convenience import ( # noqa: F401 - canonicalize, - console_label, - training_data_dedupe, - training_data_link, -) -from dedupe.serializer import read_training, write_training # noqa: F401 diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index bb4335658..d005877ed 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -1,7 +1,7 @@ from __future__ import annotations import copyreg -import pkgutil +import sys import types from typing import TYPE_CHECKING, cast @@ -12,10 +12,10 @@ from dedupe.variables.base import MissingDataType, Variable from dedupe.variables.interaction import InteractionType -for _, module, _ in pkgutil.iter_modules( # type: ignore - dedupe.variables.__path__, "dedupe.variables." 
-): - __import__(module) +if sys.version_info >= (3, 8): + from importlib import metadata as importlib_metadata +else: + import importlib_metadata if TYPE_CHECKING: from typing import Generator, Iterable, Sequence @@ -28,7 +28,14 @@ ) from dedupe.predicates import Predicate -VARIABLE_CLASSES = {k: v for k, v in FieldVariable.all_subclasses() if k} + +def load_setuptools_entrypoints(group: str): + variables = [] + for dist in list(importlib_metadata.distributions()): + for ep in dist.entry_points: + if ep.group == group: + variables.append(ep.load()) + return variables class DataModel(object): @@ -145,6 +152,16 @@ def __setstate__(self, d): def typify_variables( variable_definitions: Iterable[VariableDefinition], ) -> tuple[list[FieldVariable], list[Variable]]: + + variable_types = {} + for variablename in dedupe.variables.__all__: + variable = getattr(dedupe.variables, variablename) + variable_types[variable.type] = variable + + plugin_vars = load_setuptools_entrypoints("dedupevariables") + for var in plugin_vars: + variable_types[var.type] = var + primary_variables: list[FieldVariable] = [] all_variables: list[Variable] = [] only_custom = True @@ -181,11 +198,11 @@ def typify_variables( ] try: - variable_class = VARIABLE_CLASSES[variable_type] + variable_class = variable_types[variable_type] except KeyError: raise KeyError( "Field type %s not valid. 
Valid types include %s" - % (definition["type"], ", ".join(VARIABLE_CLASSES)) + % (definition["type"], ", ".join(variable_types)) ) variable_object = variable_class(definition) diff --git a/dedupe/variables/__init__.py b/dedupe/variables/__init__.py index b36383a61..39950042e 100644 --- a/dedupe/variables/__init__.py +++ b/dedupe/variables/__init__.py @@ -1,3 +1,24 @@ -from pkgutil import extend_path +# flake8: noqa +from dedupe.variables.base import CustomType +from dedupe.variables.categorical_type import CategoricalType +from dedupe.variables.exact import ExactType +from dedupe.variables.exists import ExistsType +from dedupe.variables.interaction import InteractionType +from dedupe.variables.latlong import LatLongType +from dedupe.variables.price import PriceType +from dedupe.variables.set import SetType +from dedupe.variables.string import ShortStringType, StringType, TextType -__path__ = extend_path(__path__, __name__) +__all__ = [ + "CustomType", + "CategoricalType", + "ExactType", + "ExistsType", + "InteractionType", + "LatLongType", + "PriceType", + "SetType", + "ShortStringType", + "StringType", + "TextType", +] diff --git a/dedupe/variables/base.py b/dedupe/variables/base.py index f80b28faa..f0c82de64 100644 --- a/dedupe/variables/base.py +++ b/dedupe/variables/base.py @@ -5,7 +5,7 @@ from dedupe import predicates if TYPE_CHECKING: - from typing import Any, ClassVar, Generator, Iterable, Optional, Sequence, Type + from typing import Any, ClassVar, Iterable, Sequence, Type from dedupe._typing import Comparator, PredicateFunction, VariableDefinition @@ -47,15 +47,6 @@ def __getstate__(self) -> dict[str, Any]: return odict - @classmethod - def all_subclasses( - cls, - ) -> Generator[tuple[Optional[str], Type["Variable"]], None, None]: - for q in cls.__subclasses__(): - yield getattr(q, "type", None), q - for p in q.all_subclasses(): - yield p - class DerivedType(Variable): type = "Derived" diff --git a/pyproject.toml b/pyproject.toml index 
957c88500..7841e5afb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,11 @@ [project] name = "dedupe" description = "A python library for accurate and scaleable data deduplication and entity-resolution" -version = "2.0.20" +version = "3.0.0" readme = "README.md" requires-python = ">=3.7" license = {file = "LICENSE"} -keywords = [] +keywords = ["entity-resolution", "deduplication", "record-linkage", "dedupe", "clustering"] authors = [ { name = "Forest Gregg", email = "fgregg@datamade.us" }, ] @@ -51,13 +51,13 @@ MailingList = "https://groups.google.com/forum/#!forum/open-source-deduplication [build-system] -requires = ["setuptools==63", +requires = ["setuptools", "wheel", "cython"] build-backend = "setuptools.build_meta" [tool.setuptools] packages = ["dedupe", "dedupe.variables"] [tool.mypy] plugins = "numpy.typing.mypy_plugin"