Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the class ZipArchiveOperations, which implements archive operations on zip-files #407

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions datalad_next/archive_operations/tests/test_zipfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from __future__ import annotations

from dataclasses import dataclass
from pathlib import (
Path,
PurePosixPath,
)
from typing import Generator

import pytest

from datalad_next.iter_collections.utils import FileSystemItemType

from ..zipfile import ZipArchiveOperations


@dataclass
class TestArchive:
    """Description of a sample archive and what the tests expect of it."""
    # The `Test` name prefix matches pytest's default class-collection
    # pattern. This is a plain data record, not a test case, so opt out of
    # collection explicitly. Without a type annotation this is a regular
    # class attribute and does not become a dataclass field.
    __test__ = False

    # location of the archive file
    path: Path
    # number of items that iterating over the archive is expected to yield
    item_count: int
    # expected byte content of a file member of the archive
    content: bytes
    # maps a hash algorithm name to a hex digest
    # NOTE(review): presumably the digests of `content` -- confirm
    target_hash: dict[str, str]


@pytest.fixture(scope='session')
def structured_sample_zip(sample_zip) -> Generator[TestArchive, None, None]:
    """Provide a ``TestArchive`` record for the ``sample_zip`` archive.

    The record combines the archive path given by the ``sample_zip`` fixture
    with the item count, member content, and hash values that are associated
    with that archive.
    """
    expected_hashes = {
        'SHA1': 'b5dfcec4d1b6166067226fae102f7fbcf6bd1bd4',
        'md5': 'd700214df5487801e8ee23d31e60382a',
    }
    archive_spec = TestArchive(
        path=sample_zip,
        item_count=4,
        content=b'zip-123\n',
        target_hash=expected_hashes,
    )
    yield archive_spec


def test_ziparchive_basics(structured_sample_zip: TestArchive):
    """A member can be opened via its ``str`` name and via a path object."""
    # this is intentionally a hard-coded POSIX relpath
    member_name = 'test-archive/onetwothree.txt'
    expected_content = structured_sample_zip.content
    with ZipArchiveOperations(structured_sample_zip.path) as archive_ops:
        # first the plain string, then the equivalent `PurePosixPath`
        for identifier in (member_name, PurePosixPath(member_name)):
            with archive_ops.open(identifier) as member:
                assert member.read() == expected_content


def test_ziparchive_contain(structured_sample_zip: TestArchive):
    """Membership tests work for ``str`` and ``PurePosixPath`` identifiers."""
    # this is intentionally a hard-coded POSIX relpath
    member_name = 'test-archive/onetwothree.txt'
    with ZipArchiveOperations(structured_sample_zip.path) as archive_ops:
        # an existing member is found under either identifier type
        for identifier in (member_name, PurePosixPath(member_name)):
            assert identifier in archive_ops
        # a name that is not in the archive is reported as absent
        assert 'bogus' not in archive_ops


def test_ziparchive_iterator(structured_sample_zip: TestArchive):
    """Iteration yields the expected number of items, all of them members."""
    spec = structured_sample_zip
    with ZipArchiveOperations(spec.path) as archive_ops:
        items = list(archive_ops)
        assert len(items) == spec.item_count
        for item in items:
            # zip archives append a '/' to a directory name, because this
            # representation is not supported in `PurePosixPath`-instances,
            # we have to use specially crafted strings here.
            item_name = str(item.name)
            if item.type == FileSystemItemType.directory:
                item_name += '/'
            assert item_name in archive_ops


def test_open(structured_sample_zip: TestArchive):
    """Every file member can be read, and its handle is closed afterwards."""
    spec = structured_sample_zip
    expected = spec.content
    seen_handles = set()
    with ZipArchiveOperations(spec.path) as zf:
        for item in zf:
            # only file members have content to read
            if item.type != FileSystemItemType.file:
                continue
            member_name = str(PurePosixPath(item.name))
            with zf.open(member_name) as fp:
                # keep a reference in order to inspect the handle after the
                # `open()` context was left
                seen_handles.add(fp)
                assert fp.read(len(expected)) == expected
        for handle in seen_handles:
            assert handle.closed is True
119 changes: 119 additions & 0 deletions datalad_next/archive_operations/zipfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
"""Archive operation handler for zipfiles"""

from __future__ import annotations

import logging
import zipfile
from contextlib import contextmanager
from pathlib import (
Path,
PurePosixPath,
)
from typing import (
Generator,
IO,
)
from zipfile import ZipInfo

from datalad_next.config import ConfigManager
# TODO we might just want to do it in reverse:
# move the code of `iter_zip` in here and have it call
# `ZipArchiveOperations(path).__iter__()` instead.
# However, the flexibility to have `iter_zip()` behave
# differently depending on parameters (fp=True/False)
# is nice, and `__iter__()` only has `self`, such that
# any customization would need to be infused in the whole
# class. Potentially cumbersome.
from datalad_next.iter_collections.zipfile import (
ZipfileItem,
iter_zip,
)
from . import ArchiveOperations


lgr = logging.getLogger('datalad.ext.next.archive_operations.zipfile')


class ZipArchiveOperations(ArchiveOperations):
    """Archive operations on a ZIP file.

    Archive members can be opened with ``open()``, tested for presence with
    the ``in`` operator, and enumerated by iterating over an instance.

    The underlying ``zipfile.ZipFile`` handle is only created on first access
    of the ``zipfile`` property. It is kept until ``close()`` is called.
    """
    def __init__(self,
                 location: Path,
                 *,
                 cfg: ConfigManager | None = None,
                 **kwargs):
        """
        Parameters
        ----------
        location: Path
          ZIP archive location
        cfg: ConfigManager, optional
          A config manager instance that is consulted for any supported
          configuration items
        **kwargs: dict
          Keyword arguments that are passed to zipfile.ZipFile-constructor
        """
        super().__init__(location, cfg=cfg)

        self.zipfile_kwargs = kwargs
        # Consider supporting file-like for `location`,
        # see zipfile.ZipFile(file_like_object)
        self._zipfile_path = location
        # created on demand by the `zipfile` property
        self._zipfile: zipfile.ZipFile | None = None

    @property
    def zipfile(self) -> zipfile.ZipFile:
        """The ``zipfile.ZipFile`` handle for the archive.

        The handle is created on first access, using the keyword arguments
        given to the constructor, and is reused until ``close()`` is called.
        """
        if self._zipfile is None:
            self._zipfile = zipfile.ZipFile(
                self._zipfile_path,
                **self.zipfile_kwargs
            )
        return self._zipfile

    def close(self) -> None:
        """Close the ``zipfile.ZipFile`` handle, if one was created.

        A later access of the ``zipfile`` property creates a new handle.
        """
        # drop the cached reference before closing, so that an exception
        # from `ZipFile.close()` cannot leave a stale handle on the instance
        handle, self._zipfile = self._zipfile, None
        if handle is not None:
            handle.close()

    @contextmanager
    def open(self,
             item: str | PurePosixPath | ZipInfo,
             **kwargs) -> Generator[IO, None, None]:
        """Context manager, returning an open file for a member of the archive.

        The file-like object will be closed when the context-handler
        exits.

        Parameters
        ----------
        item: str | PurePosixPath | zipfile.ZipInfo
          Name, path, or ZipInfo-instance that identifies an item in the zipfile
        kwargs: dict
          Keyword arguments that will be used for ZipFile.open()

        Yields
        ------
        IO
          A file-like object to read bytes from the item or to write bytes
          to the item.
        """
        with self.zipfile.open(_anyzipid2membername(item), **kwargs) as fp:
            yield fp

    def __contains__(self, item: str | PurePosixPath | ZipInfo) -> bool:
        """Report whether ``item`` identifies a member of the archive.

        Directory members carry a trailing ``/`` in a ZIP archive. A
        ``PurePosixPath`` cannot represent that, so directories need to be
        looked up via a ``str`` name that includes the trailing ``/``.
        """
        try:
            # `getinfo()` signals an unknown member with a `KeyError`
            self.zipfile.getinfo(_anyzipid2membername(item))
        except KeyError:
            return False
        return True

    def __iter__(self) -> Generator[ZipfileItem, None, None]:
        """Yield a ``ZipfileItem`` for each item reported by ``iter_zip()``.

        ``iter_zip()`` is given the archive path, not the handle that is
        cached by the ``zipfile`` property.
        """
        # if fp=True is needed, either `iter_zip()` can be used
        # directly, or `ZipArchiveOperations.open`
        yield from iter_zip(self._zipfile_path, fp=False)


def _anyzipid2membername(item: str | PurePosixPath | ZipInfo) -> str | ZipInfo:
if isinstance(item, ZipInfo):
return item
elif isinstance(item, PurePosixPath):
return item.as_posix()
return item
4 changes: 4 additions & 0 deletions datalad_next/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,7 @@
# file/dir/link types
sample_tar_xz,
)
from datalad_next.iter_collections.tests.test_iterzip import (
# session-scope, create a sample zip file
sample_zip,
)
Loading