Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update TarArchiveOperations and equip with tests #415

Merged
merged 3 commits into from
Jun 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,9 @@ build: off
# init cannot use any components from the repo, because it runs prior to
# cloning it
init:
# enable external SSH access to CI worker
# needs APPVEYOR_SSH_KEY defined in project settings (or environment)
- sh: curl -sflL 'https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-ssh.sh' | bash -e -
# remove windows 260-char limit on path names
- cmd: powershell Set-Itemproperty -path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name LongPathsEnabled -value 1
# enable developer mode on windows
Expand All @@ -227,9 +230,6 @@ init:
- sh: export TMPDIR=~/DLTMP

install:
# enable external SSH access to CI worker on all other systems
# needs APPVEYOR_SSH_KEY defined in project settings (or environment)
- sh: tools/appveyor/enable-ssh-login
# place a debug setup helper at a convenient location
- cmd: copy tools\appveyor\env_setup.bat C:\\datalad_debug.bat
# Missing system software
Expand Down
6 changes: 5 additions & 1 deletion datalad_next/archive_operations/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Handler for operations on various archive types
All handlers implements the API defined by :class:`ArchiveOperations`.
All handlers implement the API defined by :class:`ArchiveOperations`.
Available handlers:
Expand Down Expand Up @@ -71,6 +71,10 @@ def __init__(self, location: Any, *, cfg: ConfigManager | None = None):
def __str__(self) -> str:
    """Return ``<ClassName>(<location>)`` as the informal description"""
    # go through the instance's class, so subclasses report their own name
    handler_name = self.__class__.__name__
    return f'{handler_name}({self._location})'

def __repr__(self) -> str:
    """Return ``<ClassName>(<location>, cfg=<repr of cfg>)``

    The location is rendered with ``str()`` formatting, the ``_cfg``
    attribute with ``repr()`` formatting.
    """
    handler_name = self.__class__.__name__
    return f'{handler_name}({self._location}, cfg={self._cfg!r})'

@property
def cfg(self) -> ConfigManager:
"""ConfigManager given to the constructor, or the session default"""
Expand Down
41 changes: 33 additions & 8 deletions datalad_next/archive_operations/tarfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,17 @@
import logging
import tarfile
from contextlib import contextmanager
from pathlib import Path
from pathlib import (
Path,
PurePath,
)
from typing import (
Any,
Generator,
IO,
)

from datalad_next.config import ConfigManager
# TODO we might just want to do it in reverse:
# move the code of `iter_tar` in here and have it call
# `TarArchiveOperations(path).__iter__()` instead.
Expand All @@ -27,13 +31,15 @@
)

from . import ArchiveOperations
from datalad_next.config import ConfigManager

lgr = logging.getLogger('datalad.ext.next.archive_operations.tarfile')


class TarArchiveOperations(ArchiveOperations):
"""
"""Handler for a TAR archive on a local file system
Any methods that take an archive item/member name as an argument
accept a POSIX path string, or any `PurePath` instance.
"""
def __init__(self, location: Path, *, cfg: ConfigManager | None = None):
"""
Expand All @@ -55,24 +61,36 @@ def __init__(self, location: Path, *, cfg: ConfigManager | None = None):

@property
def tarfile(self) -> tarfile.TarFile:
    """Return the cached `TarFile` instance, opening the archive on first use

    The handle is kept on the instance and reused by later accesses.
    When used outside a context manager, it has to be released by
    calling ``.close()``.
    """
    cached = self._tarfile
    if cached is not None:
        return cached
    # first access: open the archive for reading and remember the handle
    self._tarfile = tarfile.open(self._tarfile_path, 'r')
    return self._tarfile

def close(self) -> None:
    """Close the cached TAR file handle, if there is one"""
    handle = self._tarfile
    if not handle:
        # nothing is open, nothing to release
        return
    handle.close()
    # drop the reference to the closed handle
    self._tarfile = None

@contextmanager
def open(self, item: str | PurePath) -> IO:
    """Get a file-like for a TAR archive item

    Parameters
    ----------
    item: str | PurePath
      The identifier must be a POSIX path string, or a `PurePath` instance.

    Yields
    ------
    IO
      File-like for reading the item's content. It is closed when the
      context is left.

    Raises
    ------
    KeyError
      Raised by `TarFile.extractfile()` when the archive has no member
      matching ``item``.
    ValueError
      If ``item`` matches a member that has no file content to read
      (e.g., a directory).
    """
    member_name = _anyid2membername(item)
    # `extractfile()` returns `None` for existing members that are neither
    # a regular file nor a link. Report that explicitly, rather than
    # failing on `with None` with an unrelated error.
    fp = self.tarfile.extractfile(member_name)
    if fp is None:
        raise ValueError(
            f'{member_name!r} in {self._tarfile_path} is not a file-like '
            'archive member')
    with fp:
        yield fp

def __contains__(self, item: str | PurePath) -> bool:
    """Report whether the archive has a member matching ``item``

    Parameters
    ----------
    item: str | PurePath
      The identifier must be a POSIX path string, or a `PurePath` instance.
    """
    # normalize first: `getmember()` only works with member name strings
    member_name = _anyid2membername(item)
    try:
        # a missing member is signaled with `KeyError`
        self.tarfile.getmember(member_name)
    except KeyError:
        return False
    return True
Expand All @@ -81,3 +99,10 @@ def __iter__(self) -> Generator[TarfileItem, None, None]:
# if fp=True is needed, either `iter_tar()` can be used
# directly, or `TarArchiveOperations.open`
yield from iter_tar(self._tarfile_path, fp=False)


def _anyid2membername(item_id: str | PurePath) -> str:
if isinstance(item_id, PurePath):
return item_id.as_posix()
else:
return item_id
Empty file.
87 changes: 87 additions & 0 deletions datalad_next/archive_operations/tests/test_tarfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from __future__ import annotations

from dataclasses import dataclass
from pathlib import (
Path,
PurePath,
PurePosixPath,
)
from typing import Generator

import pytest

from datalad_next.iter_collections.utils import FileSystemItemType

from ..tarfile import TarArchiveOperations


@dataclass
class TestArchive:
    """Description of a sample archive and the values tests expect from it"""
    # This is a helper, not a test class. Without this marker pytest tries
    # to collect it (the name matches `Test*`) and emits a
    # `PytestCollectionWarning`, because it has an `__init__`.
    # Not annotated on purpose, so it does not become a dataclass field.
    __test__ = False

    # location of the archive file
    path: Path
    # number of items expected when iterating over the archive
    item_count: int
    # content expected when reading a file member
    content: bytes
    # expected checksums, keyed by hash algorithm name
    # NOTE(review): what these are computed over is not visible here
    target_hash: dict[str, str]


@pytest.fixture(scope='session')
def structured_sample_tar_xz(
    sample_tar_xz
) -> Generator[TestArchive, None, None]:
    """Provide a `TestArchive` description of the `sample_tar_xz` archive"""
    expected_hashes = {
        'SHA1': 'b5dfcec4d1b6166067226fae102f7fbcf6bd1bd4',
        'md5': 'd700214df5487801e8ee23d31e60382a',
    }
    archive_spec = TestArchive(
        path=sample_tar_xz,
        item_count=6,
        content=b'123\n',
        target_hash=expected_hashes,
    )
    yield archive_spec


def test_tararchive_basics(structured_sample_tar_xz: TestArchive):
    """Member content is readable via `str` and `PurePath` identifiers"""
    spec = structured_sample_tar_xz
    # this is intentionally a hard-coded POSIX relpath
    member_name = 'test-archive/onetwothree.txt'
    # the same member, addressed with each supported identifier type
    identifiers = (
        member_name,
        PurePosixPath(member_name),
        PurePath(member_name),
    )
    with TarArchiveOperations(spec.path) as archive_ops:
        for identifier in identifiers:
            with archive_ops.open(identifier) as member:
                assert member.read() == spec.content


def test_tararchive_contain(structured_sample_tar_xz: TestArchive):
    """Membership tests work with `str` and `PurePath` identifiers"""
    # this is intentionally a hard-coded POSIX relpath
    member_name = 'test-archive/onetwothree.txt'
    # use the context manager, so the TAR handle that the membership
    # tests open on demand is released again at the end of the test
    with TarArchiveOperations(structured_sample_tar_xz.path) as archive_ops:
        # POSIX path str
        assert member_name in archive_ops
        # POSIX path as obj
        assert PurePosixPath(member_name) in archive_ops
        # platform path
        assert PurePath(PurePosixPath(member_name)) in archive_ops
        assert 'bogus' not in archive_ops


def test_tararchive_iterator(structured_sample_tar_xz: TestArchive):
    """Iteration reports the expected number of items, all of them members"""
    spec = structured_sample_tar_xz
    with TarArchiveOperations(spec.path) as archive_ops:
        members = list(archive_ops)
        assert len(members) == spec.item_count
        # every reported item name must pass the membership test
        unknown = [m.name for m in members if m.name not in archive_ops]
        assert not unknown


def test_open(structured_sample_tar_xz: TestArchive):
    """File members start with the expected content; handles get closed"""
    spec = structured_sample_tar_xz
    seen_handles = set()
    with TarArchiveOperations(spec.path) as tf:
        for item in tf:
            if item.type != FileSystemItemType.file:
                # only file members have content to compare
                continue
            member_name = str(PurePosixPath(item.name))
            with tf.open(member_name) as fp:
                seen_handles.add(fp)
                assert fp.read(len(spec.content)) == spec.content
        # check the fp before we close the archive handler
        for handle in seen_handles:
            assert handle.closed is True