Merge pull request #135 from yarikoptic/enh-dhub-tags

ENH: containers-add-dhub - add multiple images/tags/repositories from docker hub
datalad · Apr 13, 2021 · 3f7b7b4 · 3f7b7b4
2 parents bc33e4a + bfe6743
commit 3f7b7b4
Showing 1 changed file with 328 additions and 0 deletions.
diff --git a/tools/containers_add_dhub_tags.py b/tools/containers_add_dhub_tags.py
@@ -0,0 +1,328 @@
+"""Feed tagged Docker Hub images to datalad-containers-add.
+
+This command takes a set of Docker Hub repositories, looks up the
+tags, and calls `datalad containers-add ... dhub://REPO:TAG@digest`.  The
+output of datalad-container's Docker adapter is dumped to
+
+    images/REPO/TAG/ARCH-DATE-SHORTDIGEST/
+
+where SHORTDIGEST is the first 12 characters of .config.digest key of
+the manifest returned by Docker Hub for the image for the arch which was
+uploaded on the DATE. In addition, that image record and manifest are
+written to a sattelite to that directory .image.json and .manifest.json files.
+The step of adding the image is skipped if the path is already present locally.
+"""
+
+import fileinput
+import json
+import logging
+from pathlib import Path
+from pprint import pprint
+import re
+import requests
+
+from datalad.api import (
+    containers_add,
+    save,
+)
+
+lgr = logging.getLogger("containers_add_dhub_tags")
+
+REGISTRY_AUTH_URL = ("https://auth.docker.io/token?service=registry.docker.io"
+                     "&scope=repository:{repo}:pull")
+REGISTRY_ENDPOINT = "https://registry-1.docker.io/v2"
+DHUB_ENDPOINT = "https://hub.docker.com/v2"
+
+# TODO: wrap it up with feeding the repositories to consider
+# or if we just make it one repository at a time, then could become CLI options
+target_architectures = '.*'
+target_tags = '.*'
+# TODO: forget_tags = 'master' -- those for which we might not want to retain prior versions
+# or may be exclude them completely since too frequently changing etc?
+
+# TEST on busybox on just a few architectures and tags - it is tiny but has too many
+#target_architectures = '^(amd64|.*86)$'
+#target_tags = '(latest|1.32.0)'
+
+# TODO this could be a CLI option
+default_architecture = 'amd64'
+
+
+def clean_container_name(name):
+    """Transform `name` for use in datalad-containers-add.
+
+    Note that, although it probably doesn't matter in practice, this
+    transformation is susceptible to conflicts and ambiguity.
+    """
+    if name.startswith("_/"):
+        name = name[2:]
+    name = name.replace("_", "-")
+    # TODO: research feasibility to create "hierarchical" organization
+    # by using . as a separator.  Then we could have a "default"
+    # one and then various past instances in  sublevels of
+    # .version.architecture.date--shortdigest
+    return re.sub(r"[^0-9a-zA-Z-]", "--", name)
+
+
+def add_container(url, name, target):
+    lgr.info("Adding %s as %s", url, name)
+    # TODO: This would result in a commit for each image, which would
+    # be good to avoid.
+    #
+    # This containers_add() call also prevents doing things in
+    # parallel.
+    containers_add(
+        name=name, url=url, image=str(target),
+        # Pass update=True to let the image for an existing entry
+        # (particularly the one for the "latest" tag) be updated.
+        update=True)
+    return name
+
+
+def write_json(target, content):
+    lgr.info("Writing %s", target)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    target.write_text(json.dumps(content))
+    return target
+
+#
+# Registry -- requires authentication to query
+#
+from contextlib import contextmanager
+
+
+class RepoRegistry(object):
+    def __init__(self, repo):
+        resp_auth = requests.get(REGISTRY_AUTH_URL.format(repo=repo))
+        resp_auth.raise_for_status()
+        self.repo = repo
+        self._headers = {
+            "Authorization": "Bearer " + resp_auth.json()["token"],
+        }
+
+    def get(self, query, headers=None):
+        headers = headers or {}
+        headers.update(self._headers)
+        resp_man = requests.get(f"{REGISTRY_ENDPOINT}/{self.repo}/{query}",
+                                headers=headers)
+        resp_man.raise_for_status()
+        return resp_man.json()
+
+    def get_manifest(self, reference):
+        lgr.debug("Getting manifest for %s:%s", self.repo, reference)
+        # TODO: Can we check with HEAD first to see if the digest
+        # matches what we have locally?
+        return self.get(
+            f'manifests/{reference}',
+            # return the single (first, if multiple e.g. for a reference being a tag)
+            # manifest
+            headers={"Accept": "application/vnd.docker.distribution.manifest.v2+json"}
+        )
+
+#
+# HUB -- no authentication required
+#
+
+
+def walk_pages(url):
+    next_page = url
+    while next_page:
+        lgr.debug("GET %s", next_page)
+        response = requests.get(next_page)
+        response.raise_for_status()
+        data = response.json()
+        next_page = data.get("next")
+        yield from data.get("results", [])
+
+
+def get_repo_tag_images(repo):
+    url = f"{DHUB_ENDPOINT}/repositories/{repo}/tags"
+    for result in walk_pages(url):
+        images = result["images"]
+        # there could be records with images not having been uploaded,
+        # then it seems digest is not there and 'last_pushed' is None
+        for i, image in list(enumerate(images))[::-1]:
+            if 'digest' not in image:
+                assert not image.get('last_pushed')
+                images.pop(i)
+        yield result["name"], sorted(images, key=lambda i: i['digest'])
+
+
+def get_namespace_repos(name):
+    lgr.info("Getting repositories for %s...", name)
+    url = f"{DHUB_ENDPOINT}/repositories/{name}/"
+    for result in walk_pages(url):
+        assert name == result["namespace"]
+        yield f"{name}/{result['name']}"
+
+
+def parse_input(line):
+    line = line.strip()
+    lgr.debug("Processing input: %s", line)
+    if line.endswith("/"):
+        kind = "namespace"
+        name = line[:-1]
+    else:
+        kind = "repository"
+        if "/" in line:
+            name = line
+        else:
+            lgr.debug(
+                "Assuming official image and assigning library/ namespace")
+            name = "library/" + line
+    return name, kind
+
+
+def process_files(files):
+    failed = []
+    for line in fileinput.input(files):
+        name, kind = parse_input(line)
+        if kind == "namespace":
+            try:
+                repos = list(get_namespace_repos(name))
+            except requests.HTTPError as exc:
+                lgr.warning(
+                    "Failed to list repositories for %s (status %s). Skipping",
+                    name, exc.response.status_code)
+                failed.append(name)
+                continue
+        else:
+            repos = [name]
+
+        target_architectures_re = re.compile(target_architectures)
+        target_tags_re = re.compile(target_tags)
+        for repo in repos:
+            lgr.info("Working on %s", repo)
+            try:
+                registry = RepoRegistry(repo)
+                #pprint(list(zip(sorted(_all_tags['latest'], key=lambda r: r['digest']), sorted(_all_tags['1.32.0'],
+                # key=lambda r: r['digest']))))
+                tag_images = dict(get_repo_tag_images(repo))
+
+                # 'latest' tag is special in docker, it is the default one
+                # which might typically point to some other release/version.
+                # If we find that it is the case, we do not create a dedicated "latest"
+                # image/datalad container -- we just add container entry pointing to that
+                # one.  If there is no matching one -- we do get "latest"
+                latest_matching_tag = None
+                # NOTE: "master" is also often used to signal a moving target
+                # it might, or not, correspond to tagged release.  I guess we are just
+                # doomed to breed those
+                if target_tags_re.match('latest'):
+                    matching_tags = []
+                    for tag, images in tag_images.items():
+                        if tag == 'latest' or not target_tags_re.match(tag):
+                            lgr.debug("Skipping tag %(tag)s")
+                            continue
+
+                        if images == tag_images['latest']:
+                            matching_tags.append(tag)
+                    if len(matching_tags) >= 1:
+                        if len(matching_tags) > 1:
+                            lgr.info(
+                                "Multiple tags images match latest, taking the first: %s",
+                                ', '.join(matching_tags))
+                        latest_matching_tag = matching_tags[0]
+                        lgr.info("Taking %s as the one for 'latest'", latest_matching_tag)
+                else:
+                    # TODO: if there is no latest, we should at least establish the
+                    # convenient one for each tag
+                    pass
+                for tag, images in tag_images.items():
+                    if tag == 'latest' and latest_matching_tag:
+                        continue  # skip since we will handle it
+                    if not target_tags_re.match(tag):
+                        lgr.debug("Skipping tag %(tag)s")
+                        continue
+                    multiarch = len({i['architecture'] for i in images}) > 1
+                    for image in images:
+                        architecture = image['architecture']
+                        if not target_architectures_re.match(architecture):
+                            lgr.debug("Skipping architecture %(architecture)s", image)
+                            continue
+                        manifest = registry.get_manifest(image['digest'])
+                        digest = manifest["config"]["digest"]
+                        # yoh: if I got it right, it is actual image ID we see in docker images
+                        assert digest.startswith("sha256:")
+                        digest = digest[7:]
+                        digest_short = digest[:12]  # use short version in name
+                        last_pushed = image.get('last_pushed')
+                        if last_pushed:
+                            assert last_pushed.endswith('Z')
+                            # take only date
+                            last_pushed = last_pushed[:10].replace('-', '')
+                            assert len(last_pushed) == 8
+                        cleaner_repo = repo
+                        # this is how it looks on hub.docker.com URL
+                        if repo.startswith('library/'):
+                            cleaner_repo = "_/" + cleaner_repo[len('library/'):]
+                        image_name = f"{cleaner_repo}/{tag}/"
+                        if multiarch:
+                            image_name += f"{architecture}-"
+                        if last_pushed:
+                            # apparently not in all, e.g. no for repronim/neurodocker
+                            # may be None for those built on the hub?
+                            image_name += f"{last_pushed}-"
+                        image_name += f"{digest_short}"
+                        dl_container_name = clean_container_name(str(image_name))
+                        image_path = Path("images") / image_name
+                        url = f"dhub://{repo}:{tag}@{image['digest']}"
+                        save_paths = []
+                        if image_path.exists():
+                            lgr.info("%s already exists, skipping adding", str(image_path))
+                        else:
+                            save_paths.append(write_json(Path(str(image_path) + '.manifest.json'), manifest))
+                            save_paths.append(write_json(Path(str(image_path) + '.image.json'), image))
+                            add_container(url, dl_container_name, image_path)
+                            # TODO: either fix datalad-container for https://github.com/datalad/datalad-container/issues/98
+                            # or here, since we have manifest, we can datalad download-url, and add-archive-content
+                            # of the gzipped layers (but without untarring) - that should add datalad-archive
+                            # urls to individual layers in the "saved" version
+                            # TODO: make it in a single commit with add_container at least,
+                            # or one commit for the whole repo sweep
+                            save(path=save_paths, message=f"Added manifest and image records for {dl_container_name}")
+                        # TODO: ensure .datalad/config to have additional useful fields:
+                        #  architecture, os, and manually "updateurl" since not added for
+                        #  dhub:// ATM
+                        if tag == latest_matching_tag and architecture == default_architecture:
+                            # TODO remove section if exists, copy this one
+                            lgr.warning("Tracking of 'latest' is not yet implemented")
+            except requests.HTTPError as exc:
+                lgr.warning(
+                    "Failed processing %s. Skipping\n  status %s for %s",
+                    repo, exc.response.status_code, exc.response.url)
+                failed.append(name)
+                continue
+    return failed
+
+
+def main(args):
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument(
+        "-v", "--verbose", action="store_true")
+    parser.add_argument(
+        "files", metavar="FILE", nargs="*",
+        help=("File with list of names. "
+              "If a name doesn't contain a slash, "
+              "it's treated as an official image by prepending 'library/'. "
+              "A name ending with a slash is taken as a namespace, "
+              "and Docker Hub is queried to obtain a list of repositories "
+              "under that namespace (e.g., all the repositories of a user). "
+              "If not specified, the names are read from stdin."))
+    namespace = parser.parse_args(args[1:])
+
+    logging.basicConfig(
+        level=logging.DEBUG if namespace.verbose else logging.INFO,
+        format="%(message)s")
+
+    return process_files(namespace.files)
+
+
+if __name__ == "__main__":
+    import sys
+    failed = main(sys.argv)
+    sys.exit(len(failed) > 0)