From cba1b8fc245f213814843fa6e201b45e5801be95 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Sat, 10 Jun 2023 16:11:29 +0200 Subject: [PATCH] Update `TarArchiveOperations` and equip with tests - More sensible `__repr__` for `ArchiveOperations` - Document methods - More flexible handling of archive item names. `open()` and all other such methods can now accept `item.name` from `__iter__` results directly. Based on the assumption that all TAR archives use member names in POSIX notation. - Add unit tests with comprehensive coverage. Modelled after the `ZipArchiveOperations` tests in https://github.com/datalad/datalad-next/pull/407 but with more context manager use. --- datalad_next/archive_operations/__init__.py | 4 + datalad_next/archive_operations/tarfile.py | 41 +++++++-- .../archive_operations/tests/__init__.py | 0 .../archive_operations/tests/test_tarfile.py | 87 +++++++++++++++++++ 4 files changed, 124 insertions(+), 8 deletions(-) create mode 100644 datalad_next/archive_operations/tests/__init__.py create mode 100644 datalad_next/archive_operations/tests/test_tarfile.py diff --git a/datalad_next/archive_operations/__init__.py b/datalad_next/archive_operations/__init__.py index 36a98498..e0955080 100644 --- a/datalad_next/archive_operations/__init__.py +++ b/datalad_next/archive_operations/__init__.py @@ -71,6 +71,10 @@ def __init__(self, location: Any, *, cfg: ConfigManager | None = None): def __str__(self) -> str: return f'{self.__class__.__name__}({self._location})' + def __repr__(self) -> str: + return \ + f'{self.__class__.__name__}({self._location}, cfg={self._cfg!r})' + @property def cfg(self) -> ConfigManager: """ConfigManager given to the constructor, or the session default""" diff --git a/datalad_next/archive_operations/tarfile.py b/datalad_next/archive_operations/tarfile.py index 67ef1ae3..485f49a2 100644 --- a/datalad_next/archive_operations/tarfile.py +++ b/datalad_next/archive_operations/tarfile.py @@ -6,13 +6,17 @@ import logging import tarfile from contextlib import contextmanager -from pathlib import Path +from pathlib import ( + Path, + PurePath, +) from typing import ( Any, Generator, IO, ) +from datalad_next.config import ConfigManager # TODO we might just want to do it in reverse: # move the code of `iter_tar` in here and have it call # `TarArchiveOperations(path).__iter__()` instead. @@ -27,13 +31,15 @@ ) from . import ArchiveOperations -from datalad_next.config import ConfigManager lgr = logging.getLogger('datalad.ext.next.archive_operations.tarfile') class TarArchiveOperations(ArchiveOperations): - """ + """Handler for a TAR archive on a local file system + + Any methods that take an archive item/member name as an argument + accept a POSIX path string, or any `PurePath` instance. """ def __init__(self, location: Path, *, cfg: ConfigManager | None = None): """ @@ -55,24 +61,36 @@ def __init__(self, location: Path, *, cfg: ConfigManager | None = None): @property def tarfile(self) -> tarfile.TarFile: + """Returns `TarFile` instance, after creating it on-demand + + The instance is cached, and needs to be released by calling + ``.close()`` if called outside a context manager. + """ if self._tarfile is None: self._tarfile = tarfile.open(self._tarfile_path, 'r') return self._tarfile def close(self) -> None: + """Closes any opened TAR file handler""" if self._tarfile: self._tarfile.close() self._tarfile = None @contextmanager - def open(self, item: Any) -> IO: - """ + def open(self, item: str | PurePath) -> IO: + """Get a file-like for a TAR archive item + + Parameters + ---------- + item: str | PurePath + The identifier must be a POSIX path string, or a `PurePath` instance. """ - yield self.tarfile.extractfile(str(item)) + with self.tarfile.extractfile(_anyid2membername(item)) as fp: + yield fp - def __contains__(self, item: Any) -> bool: + def __contains__(self, item: str | PurePath) -> bool: try: - self.tarfile.getmember(item) + self.tarfile.getmember(_anyid2membername(item)) return True except KeyError: return False @@ -81,3 +99,10 @@ def __iter__(self) -> Generator[TarfileItem, None, None]: # if fp=True is needed, either `iter_tar()` can be used # directly, or `TarArchiveOperations.open` yield from iter_tar(self._tarfile_path, fp=False) + + +def _anyid2membername(item_id: str | PurePath) -> str: + if isinstance(item_id, PurePath): + return item_id.as_posix() + else: + return item_id diff --git a/datalad_next/archive_operations/tests/__init__.py b/datalad_next/archive_operations/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/datalad_next/archive_operations/tests/test_tarfile.py b/datalad_next/archive_operations/tests/test_tarfile.py new file mode 100644 index 00000000..72220e49 --- /dev/null +++ b/datalad_next/archive_operations/tests/test_tarfile.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import ( + Path, + PurePath, + PurePosixPath, +) +from typing import Generator + +import pytest + +from datalad_next.iter_collections.utils import FileSystemItemType + +from ..tarfile import TarArchiveOperations + + +@dataclass +class TestArchive: + path: Path + item_count: int + content: bytes + target_hash: dict[str, str] + + +@pytest.fixture(scope='session') +def structured_sample_tar_xz( + sample_tar_xz +) -> Generator[TestArchive, None, None]: + yield TestArchive( + path=sample_tar_xz, + item_count=6, + content=b'123\n', + target_hash={ + 'SHA1': 'b5dfcec4d1b6166067226fae102f7fbcf6bd1bd4', + 'md5': 'd700214df5487801e8ee23d31e60382a', + } + ) + + +def test_tararchive_basics(structured_sample_tar_xz: TestArchive): + spec = structured_sample_tar_xz + # this is intentionally a hard-coded POSIX relpath + member_name = 'test-archive/onetwothree.txt' + with TarArchiveOperations(spec.path) as archive_ops: + with archive_ops.open(member_name) as member: + assert member.read() == spec.content + with archive_ops.open(PurePosixPath(member_name)) as member: + assert member.read() == spec.content + with archive_ops.open(PurePath(member_name)) as member: + assert member.read() == spec.content + + +def test_tararchive_contain(structured_sample_tar_xz: TestArchive): + # this is intentionally a hard-coded POSIX relpath + member_name = 'test-archive/onetwothree.txt' + archive_ops = TarArchiveOperations(structured_sample_tar_xz.path) + # POSIX path str + assert member_name in archive_ops + # POSIX path as obj + assert PurePosixPath(member_name) in archive_ops + # platform path + assert PurePath(PurePosixPath(member_name)) in archive_ops + assert 'bogus' not in archive_ops + + +def test_tararchive_iterator(structured_sample_tar_xz: TestArchive): + spec = structured_sample_tar_xz + with TarArchiveOperations(spec.path) as archive_ops: + items = list(archive_ops) + assert len(items) == spec.item_count + for item in items: + assert item.name in archive_ops + + +def test_open(structured_sample_tar_xz: TestArchive): + spec = structured_sample_tar_xz + file_pointer = set() + with TarArchiveOperations(spec.path) as tf: + for item in tf: + if item.type == FileSystemItemType.file: + with tf.open(str(PurePosixPath(item.name))) as fp: + file_pointer.add(fp) + assert fp.read(len(spec.content)) == spec.content + # check the fp before we close the archive handler + for fp in file_pointer: + assert fp.closed is True