Skip to content

Commit

Permalink
Update TarArchiveOperations and equip with tests
Browse files Browse the repository at this point in the history
- More sensible `__repr__` for `ArchiveOperations`
- Document methods
- More flexible handling of archive item names. `open()` and all other
  such methods can now accept `item.name` from `__iter__` results
  directly. Based on the assumption that all TAR archives use member
  names in POSIX notation.
- Add unit tests with comprehensive coverage. Modelled after the
  `ZipArchiveOperations` tests in
  #407 but with more
  context manager use.
  • Loading branch information
mih committed Jun 11, 2023
1 parent a6940c9 commit cba1b8f
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 8 deletions.
4 changes: 4 additions & 0 deletions datalad_next/archive_operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ def __init__(self, location: Any, *, cfg: ConfigManager | None = None):
def __str__(self) -> str:
return f'{self.__class__.__name__}({self._location})'

def __repr__(self) -> str:
return \
f'{self.__class__.__name__}({self._location}, cfg={self._cfg!r})'

@property
def cfg(self) -> ConfigManager:
"""ConfigManager given to the constructor, or the session default"""
Expand Down
41 changes: 33 additions & 8 deletions datalad_next/archive_operations/tarfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,17 @@
import logging
import tarfile
from contextlib import contextmanager
from pathlib import Path
from pathlib import (
Path,
PurePath,
)
from typing import (
Any,
Generator,
IO,
)

from datalad_next.config import ConfigManager
# TODO we might just want to do it in reverse:
# move the code of `iter_tar` in here and have it call
# `TarArchiveOperations(path).__iter__()` instead.
Expand All @@ -27,13 +31,15 @@
)

from . import ArchiveOperations
from datalad_next.config import ConfigManager

lgr = logging.getLogger('datalad.ext.next.archive_operations.tarfile')


class TarArchiveOperations(ArchiveOperations):
"""
"""Handler for a TAR archive on a local file system
Any methods that take an archive item/member name as an argument
accept a POSIX path string, or any `PurePath` instance.
"""
def __init__(self, location: Path, *, cfg: ConfigManager | None = None):
"""
Expand All @@ -55,24 +61,36 @@ def __init__(self, location: Path, *, cfg: ConfigManager | None = None):

@property
def tarfile(self) -> tarfile.TarFile:
"""Returns `TarFile` instance, after creating it on-demand
The instance is cached, and needs to be released by calling
``.close()`` if called outside a context manager.
"""
if self._tarfile is None:
self._tarfile = tarfile.open(self._tarfile_path, 'r')
return self._tarfile

def close(self) -> None:
"""Closes any opened TAR file handler"""
if self._tarfile:
self._tarfile.close()
self._tarfile = None

@contextmanager
def open(self, item: Any) -> IO:
"""
def open(self, item: str | PurePath) -> IO:
"""Get a file-like for a TAR archive item
Parameters
----------
item: str | PurePath
The identifier must be a POSIX path string, or a `PurePath` instance.
"""
yield self.tarfile.extractfile(str(item))
with self.tarfile.extractfile(_anyid2membername(item)) as fp:
yield fp

def __contains__(self, item: Any) -> bool:
def __contains__(self, item: str | PurePath) -> bool:
try:
self.tarfile.getmember(item)
self.tarfile.getmember(_anyid2membername(item))
return True
except KeyError:
return False
Expand All @@ -81,3 +99,10 @@ def __iter__(self) -> Generator[TarfileItem, None, None]:
# if fp=True is needed, either `iter_tar()` can be used
# directly, or `TarArchiveOperations.open`
yield from iter_tar(self._tarfile_path, fp=False)


def _anyid2membername(item_id: str | PurePath) -> str:
if isinstance(item_id, PurePath):
return item_id.as_posix()
else:
return item_id
Empty file.
87 changes: 87 additions & 0 deletions datalad_next/archive_operations/tests/test_tarfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from __future__ import annotations

from dataclasses import dataclass
from pathlib import (
Path,
PurePath,
PurePosixPath,
)
from typing import Generator

import pytest

from datalad_next.iter_collections.utils import FileSystemItemType

from ..tarfile import TarArchiveOperations


@dataclass
class TestArchive:
path: Path
item_count: int
content: bytes
target_hash: dict[str, str]


@pytest.fixture(scope='session')
def structured_sample_tar_xz(
sample_tar_xz
) -> Generator[TestArchive, None, None]:
yield TestArchive(
path=sample_tar_xz,
item_count=6,
content=b'123\n',
target_hash={
'SHA1': 'b5dfcec4d1b6166067226fae102f7fbcf6bd1bd4',
'md5': 'd700214df5487801e8ee23d31e60382a',
}
)


def test_tararchive_basics(structured_sample_tar_xz: TestArchive):
spec = structured_sample_tar_xz
# this is intentionally a hard-coded POSIX relpath
member_name = 'test-archive/onetwothree.txt'
with TarArchiveOperations(spec.path) as archive_ops:
with archive_ops.open(member_name) as member:
assert member.read() == spec.content
with archive_ops.open(PurePosixPath(member_name)) as member:
assert member.read() == spec.content
with archive_ops.open(PurePath(member_name)) as member:
assert member.read() == spec.content


def test_tararchive_contain(structured_sample_tar_xz: TestArchive):
# this is intentionally a hard-coded POSIX relpath
member_name = 'test-archive/onetwothree.txt'
archive_ops = TarArchiveOperations(structured_sample_tar_xz.path)
# POSIX path str
assert member_name in archive_ops
# POSIX path as obj
assert PurePosixPath(member_name) in archive_ops
# platform path
assert PurePath(PurePosixPath(member_name)) in archive_ops
assert 'bogus' not in archive_ops


def test_tararchive_iterator(structured_sample_tar_xz: TestArchive):
spec = structured_sample_tar_xz
with TarArchiveOperations(spec.path) as archive_ops:
items = list(archive_ops)
assert len(items) == spec.item_count
for item in items:
assert item.name in archive_ops


def test_open(structured_sample_tar_xz: TestArchive):
spec = structured_sample_tar_xz
file_pointer = set()
with TarArchiveOperations(spec.path) as tf:
for item in tf:
if item.type == FileSystemItemType.file:
with tf.open(str(PurePosixPath(item.name))) as fp:
file_pointer.add(fp)
assert fp.read(len(spec.content)) == spec.content
# check the fp before we close the archive handler
for fp in file_pointer:
assert fp.closed is True

0 comments on commit cba1b8f

Please sign in to comment.