From b20168fcbf9746c91467c236bcd3dccc535718a3 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 23 Jul 2018 09:40:11 -0400 Subject: [PATCH 1/3] ENH: simple_with_archives - allow to follow by matching link text --- datalad_crawler/nodes/crawl_url.py | 3 +- .../pipelines/simple_with_archives.py | 67 +++++++++++++++++-- 2 files changed, 62 insertions(+), 8 deletions(-) diff --git a/datalad_crawler/nodes/crawl_url.py b/datalad_crawler/nodes/crawl_url.py index 2f6763c..db4d954 100644 --- a/datalad_crawler/nodes/crawl_url.py +++ b/datalad_crawler/nodes/crawl_url.py @@ -27,7 +27,8 @@ class crawl_url(object): """ def __init__(self, - url=None, matchers=None, + url=None, + matchers=None, input='url', failed=None, cache_redirects=True, diff --git a/datalad_crawler/pipelines/simple_with_archives.py b/datalad_crawler/pipelines/simple_with_archives.py index 132be1d..524ca2e 100644 --- a/datalad_crawler/pipelines/simple_with_archives.py +++ b/datalad_crawler/pipelines/simple_with_archives.py @@ -6,12 +6,27 @@ # copyright and license terms. # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## -"""A pipeline for crawling a crcns dataset""" +"""A pipeline for crawling basic websites and annexing their content + +It features: +- extraction of content from archives (see `a_href_match_` option) +- establishing 3-branch workflow with following branches: + - incoming - where all content is downloaded as is + - incoming-processed - where archives (if present) are extracted + (automatically) based on the content of incoming branch + - master - where incoming-processed is merged, so you could introduce + your additions/changes/fixups to files so they would later be automatically + merged with changes from the web which would propagate via + incoming-processed branch +""" # Import necessary nodes from ..nodes.crawl_url import crawl_url from ..nodes.misc import fix_url -from ..nodes.matches import a_href_match +from ..nodes.matches import ( + a_href_match, + a_text_match, +) from ..nodes.misc import find_files from ..nodes.misc import sub from ..nodes.annex import Annexificator @@ -21,12 +36,15 @@ # Possibly instantiate a logger if you would like to log # during pipeline creation from logging import getLogger -lgr = getLogger("datalad.crawler.pipelines.kaggle") +lgr = getLogger("datalad.crawler.pipelines.simple_with_archives") def pipeline(url=None, a_href_match_='.*/download/.*\.(tgz|tar.*|zip)', - tarballs=True, + a_href_match_follow=None, + a_text_match_follow=None, + tarballs=None, + fail_if_no_archives=True, datalad_downloader=False, use_current_dir=False, leading_dirs_depth=1, @@ -35,14 +53,40 @@ def pipeline(url=None, add_archive_leading_dir=False, annex=None, add_annex_to_incoming_pipeline=False, - incoming_pipeline=None): + incoming_pipeline=None, + archives_regex="\.(zip|tgz|tar(\..+)?)$"): """Pipeline to crawl/annex a simple web page with some tarballs on it If .gitattributes file in the repository already provides largefiles setting, none would be provided here to calls to git-annex. 
But if not -- README* and LICENSE* files will be added to git, while the rest to annex + + Parameters + ---------- + url : str + Top URL to crawl + a_href_match_: str, optional + Regular expression for HTML A href option to match to signal which files + to download + a_href_match_follow: str, optional + Regular expression for HTML A href option to follow/recurse into to look + for more URLs + a_text_match_follow: str, optional + Regular expression for HTML A text content to follow/recurse into to look + for more URLs + tarballs: bool, optional + old, use `fail_if_no_archives` + fail_if_no_archives: bool, optional + Fail if no archives were found + archives_regex: str, optional + Regular expression to define what files are archives and should be + extracted """ + if tarballs is not None: + # compatibility + fail_if_no_archives = tarballs + if not isinstance(leading_dirs_depth, int): leading_dirs_depth = int(leading_dirs_depth) @@ -64,7 +108,15 @@ def pipeline(url=None, if url: assert not incoming_pipeline - crawler = crawl_url(url) + + follow_matchers = [] + if a_href_match_follow: + follow_matchers.append(a_href_match(a_href_match_follow)) + if a_text_match_follow: + follow_matchers.append(a_text_match(a_text_match_follow)) + + crawler = crawl_url(url, matchers=follow_matchers) + incoming_pipeline = [ # Download all the archives found on the project page crawler, a_href_match(a_href_match_, min_count=1), @@ -81,6 +133,7 @@ def pipeline(url=None, # TODO: we could just extract archives processing setup into a separate pipeline template + return [ annex.switch_branch('incoming', parent='master'), [ @@ -90,7 +143,7 @@ def pipeline(url=None, [ # nested pipeline so we could skip it entirely if nothing new to be merged annex.merge_branch('incoming', strategy='theirs', commit=False), #, skip_no_changes=False), [ # Pipeline to augment content of the incoming and commit it to master - find_files("\.(zip|tgz|tar(\..+)?)$", fail_if_none=tarballs), # So we fail if none found -- there must be some! ;)), + find_files(archives_regex, fail_if_none=fail_if_no_archives), # So we fail if none found -- there must be some! 
;)), annex.add_archive_content( existing='archive-suffix', # Since inconsistent and seems in many cases no leading dirs to strip, keep them as provided From a1c18778e96e1b153daf8e6bfef718f095bac7ff Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 27 Jul 2018 22:21:42 -0400 Subject: [PATCH 2/3] ENH(BK): working on following the indexes etc --- datalad_crawler/nodes/crawl_url.py | 7 +-- datalad_crawler/nodes/matches.py | 51 +++++++++++++++---- datalad_crawler/pipeline.py | 22 ++++---- .../pipelines/simple_with_archives.py | 51 +++++++++++++------ .../tests/test_simple_with_archives.py | 40 +++++++++++++++ 5 files changed, 133 insertions(+), 38 deletions(-) diff --git a/datalad_crawler/nodes/crawl_url.py b/datalad_crawler/nodes/crawl_url.py index db4d954..bfef56d 100644 --- a/datalad_crawler/nodes/crawl_url.py +++ b/datalad_crawler/nodes/crawl_url.py @@ -65,7 +65,8 @@ def _visit_url(self, url, data): return # this is just a cruel first attempt lgr.debug("Visiting %s" % url) - + if 'initial_url' not in data: + data['initial_url'] = url try: retry = 0 orig_url = url @@ -97,12 +98,12 @@ def _visit_url(self, url, data): data_ = updated(data, zip(self._output, (page, url))) yield data_ - # now recurse if matchers were provided matchers = self._matchers if matchers: lgr.debug("Looking for more URLs at %s using %s", url, matchers) - for matcher in (matchers if isinstance(matchers, (list, tuple)) else [matchers]): + matchers = matchers if isinstance(matchers, (list, tuple)) else [matchers] + for matcher in matchers: for data_matched in matcher(data_): if 'url' not in data_matched: lgr.warning("Got data without a url from %s" % matcher) diff --git a/datalad_crawler/nodes/matches.py b/datalad_crawler/nodes/matches.py index 6fe2555..70c5ca3 100644 --- a/datalad_crawler/nodes/matches.py +++ b/datalad_crawler/nodes/matches.py @@ -41,9 +41,26 @@ class ExtractorMatch(object): """Generic matching extractor """ def __init__(self, query, input='response', output='match', pop_input=False, - allow_multiple=False, xpaths=None, csss=None, min_count=None, - max_count=None): - """""" + allow_multiple=False, xpaths=None, csss=None, + min_count=None, max_count=None, finalize_each=True): + """ + + Parameters + ---------- + query: + input: + output: + pop_input: + allow_multiple: + xpaths: + csss: + min_count: + max_count: + finalize_each: bool, optional + If set to True, it would call finalize upon each call of the node, + so min/max_count checks would be done per each call, instead of for + the entire duration of node use + """ # TODO: define arguments self.query = query # allow_multiple concerns only extraction of additional xpaths and csss @@ -55,11 +72,15 @@ def __init__(self, query, input='response', output='match', pop_input=False, self._pop_input = False self._min_count = min_count self._max_count = max_count + self._finalize_each = finalize_each + self.count = 0 + self._finalized = True def _select_and_extract(self, selector, query, data): # pragma: no cover raise NotImplementedError def __call__(self, data): + self._finalized = False input = data.pop(self._input) if self._pop_input else data[self._input] if not Response: @@ -76,7 +97,9 @@ def __call__(self, data): else: selector = Selector(text=input) - count = 0 + if 'url' in data: + # store URL for the page from where the match has happened + data['listing_url'] = data['url'] for entry, data_ in self._select_and_extract(selector, self.query, data): data_ = updated(data_, {self._output: entry.extract()}) # now get associated xpaths, css, etc @@ 
-95,22 +118,30 @@ def __call__(self, data): data_[key] = key_extracted # raise NotImplementedError("Don't know what to do yet with this one") else: - lgr.warn( + lgr.warning( "Got multiple selections for xpath query %s. " "Keeping only the first one: %s" % (repr(selector_), key_extracted[0])) data_[key] = key_extracted[0] else: data_[key] = key_extracted[0] - count += 1 + self.count += 1 yield data_ + if self._finalize_each: + self.finalize() - if self._min_count and count < self._min_count: + def finalize(self): + if self._finalized: # already done + return + if self._min_count and self.count < self._min_count: raise ValueError("Did not match required %d matches (got %d) using %s" - % (self._min_count, count, self)) + % (self._min_count, self.count, self)) - if self._max_count and count > self._max_count: + if self._max_count and self.count > self._max_count: raise ValueError("Matched more than %d matches (got %d) using %s" - % (self._max_count, count, self)) + % (self._max_count, self.count, self)) + self.count = 0 + self._finalized = True + class ScrapyExtractorMatch(ExtractorMatch): diff --git a/datalad_crawler/pipeline.py b/datalad_crawler/pipeline.py index a23c70e..5b7a225 100644 --- a/datalad_crawler/pipeline.py +++ b/datalad_crawler/pipeline.py @@ -90,18 +90,19 @@ class FinishPipeline(Exception): PIPELINE_TYPES = (list, tuple) -def reset_pipeline(pipeline): - """Given a pipeline, traverse its nodes and call .reset on them +def _call_recursively(pipeline, method_name): + """Given a pipeline, traverse its nodes and call their method - Note: it doesn't try to call reset if a node doesn't have it + Note: it doesn't try to call method if a node doesn't have it """ if pipeline: for node in pipeline: + method = getattr(node, method_name, None) if isinstance(node, PIPELINE_TYPES): - reset_pipeline(node) - elif hasattr(node, '__call__') and hasattr(node, 'reset'): - lgr.log(2, "Resetting node %s" % node) - node.reset() + _call_recursively(node, method_name) + elif method and hasattr(node, '__call__'): + lgr.log(2, "Calling %s of node %s", method_name, node) + method() def run_pipeline(*args, **kwargs): @@ -136,7 +137,7 @@ def _get_pipeline_opts(pipeline): return opts, pipeline -def xrun_pipeline(pipeline, data=None, stats=None, reset=True): +def xrun_pipeline(pipeline, data=None, stats=None, reset=True, finalize=True): """Yield results from the pipeline. 
""" @@ -150,7 +151,7 @@ def _log(msg, *args): if reset: _log("Resetting pipeline") - reset_pipeline(pipeline) + _call_recursively(pipeline, 'reset') # just for paranoids and PEP8-disturbed, since theoretically every node # should not change the data, so having default {} should be sufficient @@ -209,6 +210,9 @@ def _log(msg, *args): # or all subsequent or may be go back and only skip that generated result _log("got a signal that pipeline is 'finished'") + if finalize: + _log("Finalizing pipeline") + _call_recursively(pipeline, 'finalize') # TODO: this implementation is somewhat bad since all the output logic is # duplicated within xrun_pipeline_steps, but it is probably unavoidable because of # loop option diff --git a/datalad_crawler/pipelines/simple_with_archives.py b/datalad_crawler/pipelines/simple_with_archives.py index 524ca2e..c61d955 100644 --- a/datalad_crawler/pipelines/simple_with_archives.py +++ b/datalad_crawler/pipelines/simple_with_archives.py @@ -30,6 +30,7 @@ from ..nodes.misc import find_files from ..nodes.misc import sub from ..nodes.annex import Annexificator +from datalad.utils import assure_bool from datalad_crawler.consts import DATALAD_SPECIAL_REMOTE, ARCHIVES_SPECIAL_REMOTE from datalad.support.strings import get_replacement_dict @@ -40,21 +41,28 @@ def pipeline(url=None, + # parameters for initial crawling a_href_match_='.*/download/.*\.(tgz|tar.*|zip)', - a_href_match_follow=None, - a_text_match_follow=None, - tarballs=None, + a_href_follow=None, + a_text_follow=None, + rename=None, + # how do we establish the path while crawling following the links + path_buildup=None, # 'relurl' - relative to initial url, ?'a_text' - could construct from the "navigation bread crumbs" + # working with tarballs: + tarballs=None, # deprecated fail_if_no_archives=True, - datalad_downloader=False, + add_archive_leading_dir=False, use_current_dir=False, leading_dirs_depth=1, - rename=None, - backend='MD5E', - add_archive_leading_dir=False, + archives_regex="\.(zip|tgz|tar(\..+)?)$", + # heavy customizations so this could be used from other pipelines + datalad_downloader=False, annex=None, add_annex_to_incoming_pipeline=False, incoming_pipeline=None, - archives_regex="\.(zip|tgz|tar(\..+)?)$"): + # Additional repo specs + backend='MD5E' +): """Pipeline to crawl/annex a simple web page with some tarballs on it If .gitattributes file in the repository already provides largefiles @@ -68,10 +76,10 @@ def pipeline(url=None, a_href_match_: str, optional Regular expression for HTML A href option to match to signal which files to download - a_href_match_follow: str, optional + a_href_follow: str, optional Regular expression for HTML A href option to follow/recurse into to look for more URLs - a_text_match_follow: str, optional + a_text_follow: str, optional Regular expression for HTML A text content to follow/recurse into to look for more URLs tarballs: bool, optional @@ -81,11 +89,16 @@ def pipeline(url=None, archives_regex: str, optional Regular expression to define what files are archives and should be extracted + path_buildup: (None, 'relpath') + If not None, directs how to establish a path for the file out of url. + `relpath` - relative to the initial url """ if tarballs is not None: # compatibility fail_if_no_archives = tarballs + # This is messy and non-uniform ATM :-/ TODO: use constraints etc for typing of options? 
+ fail_if_no_archives = assure_bool(fail_if_no_archives) if not isinstance(leading_dirs_depth, int): leading_dirs_depth = int(leading_dirs_depth) @@ -110,18 +123,24 @@ def pipeline(url=None, assert not incoming_pipeline follow_matchers = [] - if a_href_match_follow: - follow_matchers.append(a_href_match(a_href_match_follow)) - if a_text_match_follow: - follow_matchers.append(a_text_match(a_text_match_follow)) + if a_href_follow: + follow_matchers.append(a_href_match(a_href_follow)) + if a_text_follow: + follow_matchers.append(a_text_match(a_text_follow)) crawler = crawl_url(url, matchers=follow_matchers) + from datalad_crawler.nodes.misc import debug incoming_pipeline = [ # Download all the archives found on the project page crawler, - a_href_match(a_href_match_, min_count=1), - fix_url, + a_href_match(a_href_match_, min_count=1, finalize_each=False), + debug(fix_url), ] + if path_buildup == 'relpath': + incoming_pipeline += [ + + ] + if rename: incoming_pipeline += [sub({'filename': get_replacement_dict(rename)})] incoming_pipeline += [annex] diff --git a/datalad_crawler/pipelines/tests/test_simple_with_archives.py b/datalad_crawler/pipelines/tests/test_simple_with_archives.py index f434284..6678e6a 100644 --- a/datalad_crawler/pipelines/tests/test_simple_with_archives.py +++ b/datalad_crawler/pipelines/tests/test_simple_with_archives.py @@ -91,3 +91,43 @@ def test_crawl_autoaddtext(ind, topurl, outd): ok_file_under_git(outd, "anothertext", annexed=False) ok_file_under_git(outd, "d/textfile", annexed=False) ok_file_under_git(outd, "d/tooshort", annexed=True) + + +@with_tree(tree={ + 'index.html': """ + sub1 + sub2 + +""", + 'd1': { + '1.dat': '' + }, + 'd2': { + '2.dat': 'text' +}}) +@serve_path_via_http +@with_tempfile +@known_failure_direct_mode #FIXME +def test_recurse_follow(ind, topurl, outd): + ds = create(outd, text_no_annex=True) + def crawl_init_(**kw): + return crawl_init( + dict(url=topurl, a_href_match_='.*\.dat', fail_if_no_archives=False, **kw) + , save=True + , template='simple_with_archives' + ) + + with chpwd(outd): # TODO -- dataset argument + crawl_init_() + # nothing is matched so it would blow somehow + with assert_raises(Exception): + crawl() + + crawl_init_(a_href_follow='.*/d[1]/?') + crawl() + + ok_clean_git(outd) + ok_file_under_git(outd, "d1/1.dat", annexed=True) + #ok_file_under_git(outd, "d/textfile", annexed=False) + #ok_file_under_git(outd, "d/tooshort", annexed=True) + From 199d05747451ef7d9d4ffacb146a762c0b0b9717 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Wed, 1 Aug 2018 09:41:49 -0400 Subject: [PATCH 3/3] BF: remove debug() helper around fix_url node --- datalad_crawler/pipelines/simple_with_archives.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datalad_crawler/pipelines/simple_with_archives.py b/datalad_crawler/pipelines/simple_with_archives.py index c61d955..8c977ea 100644 --- a/datalad_crawler/pipelines/simple_with_archives.py +++ b/datalad_crawler/pipelines/simple_with_archives.py @@ -134,7 +134,7 @@ def pipeline(url=None, incoming_pipeline = [ # Download all the archives found on the project page crawler, a_href_match(a_href_match_, min_count=1, finalize_each=False), - debug(fix_url), + fix_url, ] if path_buildup == 'relpath': incoming_pipeline += [
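
Usage note (not part of the patches themselves): the new `a_href_follow` / `a_text_follow` options let the `simple_with_archives` template recurse into index pages by matching either the href or the visible text of links before collecting downloads. The sketch below mirrors `test_recurse_follow` from the second patch; the URL and regular expressions are hypothetical, and it assumes the extension's `crawl` / `crawl_init` commands are exposed via `datalad.api`, as they are used in the test module.

    import os
    from datalad.api import create, crawl, crawl_init

    # create a fresh dataset and configure crawling from within it
    ds = create('my-dataset', text_no_annex=True)
    os.chdir('my-dataset')

    # store the pipeline spec in the dataset: download any *.dat file,
    # and also follow (recurse into) links whose href matches the given
    # pattern; a_text_follow could be used instead to follow links by
    # their visible text (e.g. 'Next page')
    crawl_init(
        dict(
            url='http://example.com/index.html',   # hypothetical top URL
            a_href_match_=r'.*\.dat',               # what to download
            a_href_follow=r'.*/sub[0-9]+/?',        # which links to recurse into
            fail_if_no_archives=False,              # no tarballs in this toy case
        ),
        save=True,
        template='simple_with_archives',
    )

    # run the configured pipeline; re-running later picks up new content
    crawl()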