Skip to content

Commit

Permalink
Merge branch 'master' into PYTHON-4636
Browse files Browse the repository at this point in the history
  • Loading branch information
NoahStapp committed Oct 1, 2024
2 parents b241f6d + c0f7810 commit 43a6a54
Show file tree
Hide file tree
Showing 51 changed files with 1,838 additions and 235 deletions.
3 changes: 3 additions & 0 deletions .evergreen/resync-specs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ do
atlas-data-lake-testing|data_lake)
cpjson atlas-data-lake-testing/tests/ data_lake
;;
bson-binary-vector|bson_binary_vector)
cpjson bson-binary-vector/tests/ bson_binary_vector
;;
bson-corpus|bson_corpus)
cpjson bson-corpus/tests/ bson_corpus
;;
Expand Down
3 changes: 3 additions & 0 deletions .evergreen/run-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,9 @@ if [ -n "$PERF_TEST" ]; then
python -m pip install simplejson
start_time=$(date +%s)
TEST_SUITES="perf"
# PYTHON-4769 Run perf_test.py directly; otherwise pytest's test collection negatively
# affects the benchmark results.
TEST_ARGS="test/performance/perf_test.py $TEST_ARGS"
fi

echo "Running $AUTH tests over $SSL with python $(which python)"
Expand Down
4 changes: 2 additions & 2 deletions bson/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1324,7 +1324,7 @@ def decode_iter(
elements = data[position : position + obj_size]
position += obj_size

yield _bson_to_dict(elements, opts) # type:ignore[misc, type-var]
yield _bson_to_dict(elements, opts) # type:ignore[misc]


@overload
Expand Down Expand Up @@ -1370,7 +1370,7 @@ def decode_file_iter(
raise InvalidBSON("cut off in middle of objsize")
obj_size = _UNPACK_INT_FROM(size_data, 0)[0] - 4
elements = size_data + file_obj.read(max(0, obj_size))
yield _bson_to_dict(elements, opts) # type:ignore[type-var, arg-type, misc]
yield _bson_to_dict(elements, opts) # type:ignore[arg-type, misc]


def is_valid(bson: bytes) -> bool:
Expand Down
152 changes: 146 additions & 6 deletions bson/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@
# limitations under the License.
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Tuple, Type, Union
import struct
from dataclasses import dataclass
from enum import Enum
from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union
from uuid import UUID

"""Tools for representing BSON binary data.
Expand Down Expand Up @@ -191,21 +194,75 @@ class UuidRepresentation:
"""


VECTOR_SUBTYPE = 9
"""**(BETA)** BSON binary subtype for densely packed vector data.
.. versionadded:: 4.10
"""


USER_DEFINED_SUBTYPE = 128
"""BSON binary subtype for any user defined structure.
"""


class BinaryVectorDtype(Enum):
    """**(BETA)** Element data types available to the vector subtype.

    Every member's value is a :class:`bytes` object of length one.

    :param FLOAT32: (0x27) Pack a list of :class:`float` as float32.
    :param INT8: (0x03) Pack a list of :class:`int` in [-128, 127] as signed int8.
    :param PACKED_BIT: (0x10) Pack a list of :class:`int` in [0, 255] as uint8.

    ``PACKED_BIT`` is a special case: the vector's own values can only be
    0 or 1, but they are packed together in groups of 8 into a byte.
    In Python, these bytes are displayed as ints in the range [0, 255].

    .. versionadded:: 4.10
    """

    INT8 = b"\x03"
    FLOAT32 = b"\x27"
    PACKED_BIT = b"\x10"


@dataclass
class BinaryVector:
    """**(BETA)** Vector of numbers along with metadata for binary interoperability.

    Two instances compare equal when their ``dtype``, ``padding`` and ``data``
    are all equal.

    .. versionadded:: 4.10
    """

    __slots__ = ("data", "dtype", "padding")

    def __init__(self, data: Sequence[float | int], dtype: BinaryVectorDtype, padding: int = 0):
        """
        :param data: Sequence of numbers representing the mathematical vector.
        :param dtype: The data type stored in binary
        :param padding: The number of bits in the final byte that are to be ignored
          when a vector element's size is less than a byte
          and the length of the vector is not a multiple of 8.
        """
        self.data = data
        self.dtype = dtype
        self.padding = padding

    # The attributes are assigned in ``__init__`` instead of being declared as
    # annotated class-level fields, so ``@dataclass`` sees zero fields. Its
    # generated ``__repr__`` would print ``BinaryVector()`` and its generated
    # ``__eq__`` would report every pair of instances as equal. The explicit
    # methods below take precedence over the generated ones.
    def __repr__(self) -> str:
        """Return a string showing the dtype, padding and data of this vector."""
        return f"BinaryVector(dtype={self.dtype}, padding={self.padding}, data={self.data})"

    def __eq__(self, other: object) -> bool:
        """Compare by ``dtype``, ``padding`` and ``data``.

        Returns ``NotImplemented`` for objects that are not a
        :class:`BinaryVector` so Python can try the reflected comparison.
        """
        if not isinstance(other, BinaryVector):
            return NotImplemented
        return (
            self.dtype == other.dtype
            and self.padding == other.padding
            and self.data == other.data
        )


class Binary(bytes):
"""Representation of BSON binary data.
This is necessary because we want to represent Python strings as
the BSON string type. We need to wrap binary data so we can tell
We want to represent Python strings as the BSON string type.
We need to wrap binary data so that we can tell
the difference between what should be considered binary data and
what should be considered a string when we encode to BSON.
Raises TypeError if `data` is not an instance of :class:`bytes`
or `subtype` is not an instance of :class:`int`.
**(BETA)** Subtype 9 provides a space-efficient representation of 1-dimensional vector data.
Its data is prepended with two bytes of metadata.
The first (dtype) describes its data type, such as float32 or int8.
The second (padding) prescribes the number of bits to ignore in the final byte.
This is relevant when the element size of the dtype is not a multiple of 8.
Raises TypeError if `subtype` is not an instance of :class:`int`.
Raises ValueError if `subtype` is not in [0, 256).
.. note::
Expand All @@ -218,7 +275,10 @@ class Binary(bytes):
to use
.. versionchanged:: 3.9
Support any bytes-like type that implements the buffer protocol.
Support any bytes-like type that implements the buffer protocol.
.. versionchanged:: 4.10
**(BETA)** Addition of vector subtype.
"""

_type_marker = 5
Expand Down Expand Up @@ -337,6 +397,86 @@ def as_uuid(self, uuid_representation: int = UuidRepresentation.STANDARD) -> UUI
f"cannot decode subtype {self.subtype} to {UUID_REPRESENTATION_NAMES[uuid_representation]}"
)

@classmethod
def from_vector(
    cls: Type[Binary],
    vector: Sequence[float | int],
    dtype: BinaryVectorDtype,
    padding: int = 0,
) -> Binary:
    """**(BETA)** Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers.

    To interpret the representation of the numbers, a data type must be included.
    See :class:`~bson.binary.BinaryVectorDtype` for available types and descriptions.

    The dtype and padding are prepended to the binary data's value.
    Both the two metadata bytes and the values are packed little-endian,
    independent of the byte order of the host.

    :param vector: List of values
    :param dtype: Data type of the values
    :param padding: For fractional bytes, number of bits to ignore at end of vector.
    :return: Binary packed data identified by dtype and padding.
    :raises ValueError: If a non-zero `padding` is given for INT8 or FLOAT32.
    :raises NotImplementedError: If `dtype` is not one of the supported types.

    .. versionadded:: 4.10
    """
    if dtype == BinaryVectorDtype.INT8:  # pack ints in [-128, 127] as signed int8
        format_str = "b"
        if padding:
            raise ValueError(f"padding does not apply to {dtype=}")
    elif dtype == BinaryVectorDtype.PACKED_BIT:  # pack ints in [0, 255] as unsigned uint8
        format_str = "B"
    elif dtype == BinaryVectorDtype.FLOAT32:  # pack floats as float32
        format_str = "f"
        if padding:
            raise ValueError(f"padding does not apply to {dtype=}")
    else:
        raise NotImplementedError("%s not yet supported" % dtype)

    metadata = struct.pack("<sB", dtype.value, padding)
    # An explicit "<" is required: without a byte-order character struct uses
    # the host's native order, so float32 values would be written big-endian
    # on big-endian machines and would not match the metadata's byte order.
    data = struct.pack(f"<{len(vector)}{format_str}", *vector)
    return cls(metadata + data, subtype=VECTOR_SUBTYPE)

def as_vector(self) -> BinaryVector:
    """**(BETA)** From the Binary, create a list of numbers, along with dtype and padding.

    The first two bytes are read as the dtype and padding; the remaining
    bytes are unpacked little-endian according to the dtype, independent of
    the byte order of the host.

    :return: BinaryVector
    :raises ValueError: If this Binary's subtype is not the vector subtype,
      if the dtype byte does not match a :class:`BinaryVectorDtype` member,
      or if a float32 payload's length is not a multiple of 4.

    .. versionadded:: 4.10
    """

    if self.subtype != VECTOR_SUBTYPE:
        raise ValueError(f"Cannot decode subtype {self.subtype} as a vector.")

    # Two bytes of metadata precede the values: dtype (1 byte), padding (1 byte).
    position = 0
    dtype, padding = struct.unpack_from("<sB", self, position)
    position += 2
    dtype = BinaryVectorDtype(dtype)
    n_bytes = len(self) - position

    # Every payload format below carries an explicit "<": without a byte-order
    # character struct would use the host's native order, so float32 values
    # would be misread on big-endian machines.
    if dtype == BinaryVectorDtype.INT8:
        # One signed byte per value.
        vector = list(struct.unpack_from(f"<{n_bytes}b", self, position))
        return BinaryVector(vector, dtype, padding)

    elif dtype == BinaryVectorDtype.FLOAT32:
        n_values = n_bytes // 4
        if n_bytes % 4:
            raise ValueError(
                "Corrupt data. N bytes for a float32 vector must be a multiple of 4."
            )
        vector = list(struct.unpack_from(f"<{n_values}f", self, position))
        return BinaryVector(vector, dtype, padding)

    elif dtype == BinaryVectorDtype.PACKED_BIT:
        # data packed as uint8
        unpacked_uint8s = list(struct.unpack_from(f"<{n_bytes}B", self, position))
        return BinaryVector(unpacked_uint8s, dtype, padding)

    else:
        raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name)

@property
def subtype(self) -> int:
"""Subtype of this binary data."""
Expand Down
2 changes: 1 addition & 1 deletion bson/decimal128.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def __init__(self, value: _VALUE_OPTIONS) -> None:
"from list or tuple. Must have exactly 2 "
"elements."
)
self.__high, self.__low = value # type: ignore
self.__high, self.__low = value
else:
raise TypeError(f"Cannot convert {value!r} to Decimal128")

Expand Down
2 changes: 1 addition & 1 deletion bson/json_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def __new__(
"JSONOptions.datetime_representation must be one of LEGACY, "
"NUMBERLONG, or ISO8601 from DatetimeRepresentation."
)
self = cast(JSONOptions, super().__new__(cls, *args, **kwargs)) # type:ignore[arg-type]
self = cast(JSONOptions, super().__new__(cls, *args, **kwargs))
if json_mode not in (JSONMode.LEGACY, JSONMode.RELAXED, JSONMode.CANONICAL):
raise ValueError(
"JSONOptions.json_mode must be one of LEGACY, RELAXED, "
Expand Down
2 changes: 1 addition & 1 deletion bson/son.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def __init__(
self.update(kwargs)

def __new__(cls: Type[SON[_Key, _Value]], *args: Any, **kwargs: Any) -> SON[_Key, _Value]:
instance = super().__new__(cls, *args, **kwargs) # type: ignore[type-var]
instance = super().__new__(cls, *args, **kwargs)
instance.__keys = []
return instance

Expand Down
8 changes: 8 additions & 0 deletions doc/api/bson/binary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@
.. autoclass:: UuidRepresentation
:members:

.. autoclass:: BinaryVectorDtype
:members:
:show-inheritance:

.. autoclass:: BinaryVector
:members:


.. autoclass:: Binary(data, subtype=BINARY_SUBTYPE)
:members:
:show-inheritance:
5 changes: 5 additions & 0 deletions doc/async-tutorial.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
Async Tutorial
==============

.. warning:: This API is currently in beta, meaning the classes, methods,
and behaviors described within may change before the full release.
If you come across any bugs during your use of this API,
please file a Jira ticket in the "Python Driver" project at https://jira.mongodb.org/browse/PYTHON.

.. code-block:: pycon
from pymongo import AsyncMongoClient
Expand Down
18 changes: 18 additions & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,24 @@
Changelog
=========

Changes in Version 4.10.0
-------------------------

- Added provisional **(BETA)** support for a new Binary BSON subtype (9) used for efficient storage and retrieval of vectors:
densely packed arrays of numbers, all of the same type.
This includes new methods :meth:`~bson.binary.Binary.from_vector` and :meth:`~bson.binary.Binary.as_vector`.
- Added C extension use to client metadata, for example: ``{"driver": {"name": "PyMongo|c", "version": "4.10.0"}, ...}``
- Fixed a bug where :class:`~pymongo.asynchronous.mongo_client.AsyncMongoClient` could deadlock.
- Fixed a bug where PyMongo could fail to import on Windows if ``asyncio`` is misconfigured.

Issues Resolved
...............

See the `PyMongo 4.10 release notes in JIRA`_ for the list of resolved issues
in this release.

.. _PyMongo 4.10 release notes in JIRA: https://jira.mongodb.org/secure/ReleaseNote.jspa?projectId=10004&version=40553

Changes in Version 4.9.0
-------------------------

Expand Down
5 changes: 3 additions & 2 deletions hatch.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@ features = ["docs","test"]
test = "sphinx-build -E -b doctest doc ./doc/_build/doctest"

[envs.typing]
features = ["encryption", "ocsp", "zstd", "aws"]
dependencies = ["mypy==1.2.0","pyright==1.1.290", "certifi", "typing_extensions"]
pre-install-commands = [
"pip install -q -r requirements/typing.txt",
]
[envs.typing.scripts]
check-mypy = [
"mypy --install-types --non-interactive bson gridfs tools pymongo",
Expand Down
21 changes: 9 additions & 12 deletions pymongo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,7 @@

from pymongo import _csot
from pymongo._version import __version__, get_version_string, version_tuple
from pymongo.asynchronous.mongo_client import AsyncMongoClient
from pymongo.common import MAX_SUPPORTED_WIRE_VERSION, MIN_SUPPORTED_WIRE_VERSION
from pymongo.common import MAX_SUPPORTED_WIRE_VERSION, MIN_SUPPORTED_WIRE_VERSION, has_c
from pymongo.cursor import CursorType
from pymongo.operations import (
DeleteMany,
Expand All @@ -105,18 +104,16 @@
from pymongo.synchronous.mongo_client import MongoClient
from pymongo.write_concern import WriteConcern

version = __version__
"""Current version of PyMongo."""

try:
from pymongo.asynchronous.mongo_client import AsyncMongoClient
except Exception as e:
# PYTHON-4781: Importing asyncio can fail on Windows.
import warnings as _warnings

def has_c() -> bool:
"""Is the C extension installed?"""
try:
from pymongo import _cmessage # type: ignore[attr-defined] # noqa: F401
_warnings.warn(f"Failed to import Async PyMongo: {e!r}", ImportWarning, stacklevel=2)

return True
except ImportError:
return False
version = __version__
"""Current version of PyMongo."""


def timeout(seconds: Optional[float]) -> ContextManager[None]:
Expand Down
3 changes: 1 addition & 2 deletions pymongo/_csot.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,13 @@ def __init__(self, timeout: Optional[float]):
self._timeout = timeout
self._tokens: Optional[tuple[Token[Optional[float]], Token[float], Token[float]]] = None

def __enter__(self) -> _TimeoutContext:
def __enter__(self) -> None:
timeout_token = TIMEOUT.set(self._timeout)
prev_deadline = DEADLINE.get()
next_deadline = time.monotonic() + self._timeout if self._timeout else float("inf")
deadline_token = DEADLINE.set(min(prev_deadline, next_deadline))
rtt_token = RTT.set(0.0)
self._tokens = (timeout_token, deadline_token, rtt_token)
return self

def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
if self._tokens:
Expand Down
2 changes: 1 addition & 1 deletion pymongo/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import re
from typing import List, Tuple, Union

__version__ = "4.10.0.dev0"
__version__ = "4.11.0.dev0"


def get_version_tuple(version: str) -> Tuple[Union[int, str], ...]:
Expand Down
Loading

0 comments on commit 43a6a54

Please sign in to comment.