Replace the www-authenticate dependency with internal function

The implementation is built on request's parsing utilities rather than elusive regexes. I find this better to grasp. The associated test includes all www-authenticate test cases, plus a set of additional ones that focus on multi-challenge header specifications. Closes #493
datalad · Oct 23, 2023 · ce6a9c5 · ce6a9c5
1 parent 6a2d65e
commit ce6a9c5
Show file tree

Hide file tree

Showing 5 changed files with 130 additions and 7 deletions.
diff --git a/changelog.d/20231023_064405_michael.hanke_www_auth.md b/changelog.d/20231023_064405_michael.hanke_www_auth.md
@@ -0,0 +1,8 @@
+### 🏠 Internal
+
+- The `www-authenticate` dependencies is dropped. The functionality is
+  replaced by a `requests`-based implementation of an alternative parser.
+  This trims the dependency footprint and facilitates Debian-packaging.
+  The previous test cases are kept and further extended.
+  Fixes https://github.com/datalad/datalad-next/issues/493 via
+  https://github.com/datalad/datalad-next/pull/495 (by @mih)
diff --git a/datalad_next/url_operations/http.py b/datalad_next/url_operations/http.py
@@ -9,11 +9,13 @@
 from typing import Dict
 import requests
 from requests_toolbelt import user_agent
-import www_authenticate
 
 import datalad
 
-from datalad_next.utils.requests_auth import DataladAuth
+from datalad_next.utils.requests_auth import (
+    DataladAuth,
+    parse_www_authenticate,
+)
 from . import (
     UrlOperations,
     UrlOperationsRemoteError,
@@ -233,7 +235,7 @@ def probe_url(self, url, timeout=10.0, headers=None):
             headers=headers,
         )
         if 'www-authenticate' in req.headers:
-            props['auth'] = www_authenticate.parse(
+            props['auth'] = parse_www_authenticate(
                 req.headers['www-authenticate'])
         props['is_redirect'] = True if req.history else False
         props['status_code'] = req.status_code

diff --git a/datalad_next/utils/requests_auth.py b/datalad_next/utils/requests_auth.py
@@ -7,7 +7,6 @@
 from typing import Dict
 from urllib.parse import urlparse
 import requests
-import www_authenticate
 
 from datalad_next.config import ConfigManager
 from datalad_next.utils import CredentialManager
@@ -16,7 +15,77 @@
 lgr = logging.getLogger('datalad.ext.next.utils.requests_auth')
 
 
-__all__ = ['DataladAuth', 'HTTPBearerTokenAuth']
+__all__ = ['DataladAuth', 'HTTPBearerTokenAuth', 'parse_www_authenticate']
+
+
+def parse_www_authenticate(hdr: str) -> dict:
+    """Parse HTTP www-authenticate header
+
+    This helper uses ``requests`` utilities to parse the ``www-authenticate``
+    header as represented in a ``requests.Response`` instance. The header may
+    contain any number of challenge specifications.
+
+    The implementation follows RFC7235, where a challenge parameters set is
+    specified as: either a comma-separated list of parameters, or a single
+    sequence of characters capable of holding base64-encoded information,
+    and parameters are name=value pairs, where the name token is matched
+    case-insensitively, and each parameter name MUST only occur once
+    per challenge.
+
+    Returns
+    -------
+    dict
+      Keys are casefolded challenge labels (e.g., 'basic', 'digest').
+      Values are: ``None`` (no parameter), ``str`` (a token68), or
+      ``dict`` (name/value mapping of challenge parameters)
+    """
+    plh = requests.utils.parse_list_header
+    pdh = requests.utils.parse_dict_header
+    challenges = {}
+    challenge = None
+    # challenges as well as their properties are in a single
+    # comma-separated list
+    for item in plh(hdr):
+        # parse the item into a key/value set
+        # the value will be `None` if this item was no mapping
+        k, v = pdh(item).popitem()
+        # split the key to check for a challenge spec start
+        key_split = k.split(' ', maxsplit=1)
+        if len(key_split) > 1 or v is None:
+            item_suffix = item[len(key_split[0]) + 1:]
+            challenge = [item[len(key_split[0]) + 1:]] if item_suffix else None
+            challenges[key_split[0].casefold()] = challenge
+        else:
+            # implementation logic assumes that the above conditional
+            # was triggered before we ever get here
+            assert challenge
+            challenge.append(item)
+
+    return {
+        challenge: _convert_www_authenticate_items(items)
+        for challenge, items in challenges.items()
+    }
+
+
+def _convert_www_authenticate_items(items: list) -> None | str | dict:
+    pdh = requests.utils.parse_dict_header
+    # according to RFC7235, items can be:
+    # either a comma-separated list of parameters
+    # or a single sequence of characters capable of holding base64-encoded
+    # information.
+    # parameters are name=value pairs, where the name token is matched
+    # case-insensitively, and each parameter name MUST only occur once
+    # per challenge.
+    if items is None:
+        return None
+    elif len(items) == 1 and pdh(items[0].rstrip('=')).popitem()[1] is None:
+        # this items matches the token68 appearance (no name value
+        # pair after potential base64 padding its removed
+        return items[0]
+    else:
+        return {
+            k.casefold(): v for i in items for k, v in pdh(i).items()
+        }
 
 
 class DataladAuth(requests.auth.AuthBase):
@@ -201,7 +270,7 @@ def handle_401(self, r, **kwargs):
             # www-authenticate with e.g. 403s
             return r
         # which auth schemes does the server support?
-        auth_schemes = www_authenticate.parse(r.headers['www-authenticate'])
+        auth_schemes = parse_www_authenticate(r.headers['www-authenticate'])
         ascheme, credname, cred = self._get_credential(r.url, auth_schemes)
 
         if cred is None or 'secret' not in cred:

diff --git a/datalad_next/utils/tests/test_parse_www_authenticate.py b/datalad_next/utils/tests/test_parse_www_authenticate.py
@@ -0,0 +1,45 @@
+
+from ..requests_auth import parse_www_authenticate
+
+
+challenges = (
+    # just challenge type
+    ('Negotiate',
+     [('negotiate', None)]),
+    # challenge and just a token, tolerate any base64 padding
+    ('Negotiate abcdef',
+     [('negotiate', 'abcdef')]),
+    ('Negotiate abcdef=',
+     [('negotiate', 'abcdef=')]),
+    ('Negotiate abcdef==',
+     [('negotiate', 'abcdef==')]),
+    # standard bearer
+    ('Bearer realm=example.com',
+     [('bearer', {'realm': 'example.com'})]),
+    # standard digest
+    ('Digest realm="example.com", qop="auth,auth-int", nonce="abcdef", '
+     'opaque="ghijkl"',
+     [('digest', {'realm': 'example.com', 'qop': 'auth,auth-int',
+                  'nonce': 'abcdef', 'opaque': 'ghijkl'})]),
+    # multi challenge
+    ('Basic speCial="paf ram", realm="basIC", '
+     'Bearer, '
+     'Digest realm="[email protected]", qop="auth, auth-int", '
+     'algorithm=MD5',
+     [('basic', {'special': 'paf ram', 'realm': 'basIC'}),
+      ('bearer', None),
+      ('digest', {'realm': "[email protected]", 'qop': "auth, auth-int",
+                  'algorithm': 'MD5'})]),
+    # same challenge, multiple times, last one wins
+    ('Basic realm="basIC", '
+     'Basic realm="complex"',
+     [('basic', {'realm': 'complex'})]),
+)
+
+
+def test_parse_www_authenticate():
+    for hdr, targets in challenges:
+        res = parse_www_authenticate(hdr)
+        for ctype, props in targets:
+            assert ctype in res
+            assert res[ctype] == props
diff --git a/setup.cfg b/setup.cfg
@@ -16,7 +16,6 @@ python_requires = >= 3.8
 install_requires =
     annexremote
     datalad >= 0.18.4
-    www-authenticate
     humanize
 packages = find_namespace:
 include_package_data = True