Add exclude glob option in wget job (#10)
* Refine defaults needed for thresholding

* change defaults to make sure globs get the file only, not the full paths!

* Refine defaults

* add exclude flag in wget job

* remove TODO

* optional flag

* verify json

* fix this bad commit change
shitong01 authored and pymonger committed Sep 26, 2019
1 parent 7cc73a5 commit 2fd1d89
Showing 3 changed files with 56 additions and 22 deletions.
docker/hysds-io.json.lw-tosca-wget-glob: 10 additions & 2 deletions
@@ -14,12 +14,20 @@
     "from": "submitter"
   },
   {
-    "name": "glob",
+    "name": "include_glob",
     "type": "text",
     "from": "submitter",
-    "default": "*fn*_logra2a1*.tif,*logamp*.tif,*.dem*,*msk.tif",
+    "default": "*final*",
     "placeholder": "A comma seperated list of globs to download"
   },
+  {
+    "name": "exclude_glob",
+    "type": "text",
+    "from": "submitter",
+    "default": "*tiles*",
+    "optional": true,
+    "placeholder": "(Optional) A comma seperated list of globs to exclude from included list"
+  },
   {
     "name": "name",
     "type": "text",
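For reference, a submission that keeps these defaults would surface in the job's _context.json roughly as follows. This is a sketch, not output from the commit: the field names come from the params above, and all other context fields are omitted.

    import json

    # Sketch: the two glob fields as they would appear in _context.json for a
    # submission that keeps the defaults above (other context fields omitted).
    context = {
        "include_glob": "*final*",  # comma-separated globs to download
        "exclude_glob": "*tiles*",  # comma-separated globs to filter back out
    }
    print(json.dumps(context, indent=2))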
docker/job-spec.json.lw-tosca-wget-glob: 5 additions & 1 deletion
@@ -19,7 +19,11 @@
     "destination": "positional"
   },
   {
-    "name": "glob",
+    "name": "include_glob",
     "destination": "context"
   },
+  {
+    "name": "exclude_glob",
+    "destination": "context"
+  }
   ]
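Both params use the context destination, so the submitted values are written to _context.json in the job's work directory. A minimal sketch of how a job picks them up (this mirrors the __main__ block of wget.py below; it assumes _context.json exists in the current directory):

    import json

    # Minimal sketch: read the submitter's glob params from _context.json in
    # the job's work directory; wget.py (below) builds glob_dict only when
    # both keys are present.
    with open("_context.json") as f:
        context = json.load(f)

    glob_dict = None
    if "include_glob" in context and "exclude_glob" in context:
        glob_dict = {"include": context["include_glob"],
                     "exclude": context["exclude_glob"]}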
wget.py: 41 additions & 19 deletions
@@ -22,7 +22,7 @@
 logger = logging.getLogger("hysds")


-def wget_script(dataset=None, glob_list=None):
+def wget_script(dataset=None, glob_dict=None):
     """Return wget script."""

     # query
@@ -68,7 +68,7 @@ def wget_script(dataset=None, glob_list=None):
     scroll_id = scan_result['_scroll_id']

     # stream output a page at a time for better performance and lower memory footprint
-    def stream_wget(scroll_id, glob_list=None):
+    def stream_wget(scroll_id, glob_dict=None):
         #formatted_source = format_source(source)
         yield '#!/bin/bash\n#\n' + \
               '# query:\n#\n' + \
@@ -107,8 +107,8 @@ def stream_wget(scroll_id, glob_list=None):
                 cut_dirs = 6
             if '.s3-website' in url or 'amazonaws.com' in url:
                 files = get_s3_files(url)
-                if glob_list:
-                    files = glob_filter(files, glob_list)
+                if glob_dict:
+                    files = glob_filter(files, glob_dict)
                 for file in files:
                     yield 'echo "downloading %s"\n' % file
                     if 's1a_ifg' in url:
@@ -128,7 +128,7 @@ def stream_wget(scroll_id, glob_list=None):

     # malarout: interate over each line of stream_wget response, and write to a file which is later attached to the email.
     with open('wget_script.sh','w') as f:
-        for i in stream_wget(scroll_id, glob_list):
+        for i in stream_wget(scroll_id, glob_dict):
             f.write(i)

     # for gzip compressed use file extension .tar.gz and modifier "w:gz"
@@ -199,16 +199,38 @@ def make_product(rule_name, query):
     with open("{0}/{0}.dataset.json".format(name), "w") as fp:
         json.dump({"id": name, "version": "v0.1"}, fp)

-def glob_filter(names,pattern):
+def glob_filter(names, glob_dict):
     import fnmatch
     files = []
-    for pat in pattern:
-        matching = fnmatch.filter(names, "*" + pat)
-        files.extend(matching)
+    files_exclude = []
+    include_csv = glob_dict.get("include", None)
+    exclude_csv = glob_dict.get("exclude", None)
+
+    if include_csv:
+        pattern_list = [item.strip() for item in include_csv.split(',')]
+
+        for pat in pattern_list:
+            matching = fnmatch.filter(names, "*" + pat)
+            files.extend(matching)
+        print("Got the following files to include: %s" % str(files))
+
+    if exclude_csv:
+        pattern_list_exc = [item.strip() for item in exclude_csv.split(',')]
+
+        for pat in pattern_list_exc:
+            matching = fnmatch.filter(names, "*" + pat)
+            files_exclude.extend(matching)
+
+        files_exclude = list(set(files_exclude))
+        print("Got the following files to exclude: %s" % str(files_exclude))
+
     #unique list
-    retfiles_set = set(files)
+    files_final = [x for x in files if x not in files_exclude]
+    retfiles_set = set(files_final)
     print("Got the following files: %s" % str(retfiles_set))
-    return list(retfiles_set)
+    return list(files_final)


 if __name__ == "__main__":
     '''
@@ -228,15 +250,15 @@ def glob_filter(names,pattern):
     except:
         raise Exception('unable to parse _context.json from work directory')

-    globs = None
-    if "glob" in context:
-        glob_strs = context["glob"]
-        globs = [item.strip() for item in glob_strs.split(',')]
-        print("Got the following globs: %s" % str(globs))
+    glob_dict = None
+    if "include_glob" in context and "exclude_glob" in context:
+        glob_dict = {"include":context["include_glob"], "exclude": context["exclude_glob"]}

     # getting the script
-    wget_script(query, globs)
-    if emails == "unused":
-        make_product(rule_name, query)
+    wget_script(query, glob_dict)
+    if emails=="unused":
+        make_product(rule_name, query)
     else:
         # now email the query
         email(query, emails, rule_name)
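To make the new include/exclude behavior concrete, here is a self-contained usage sketch. The compact glob_filter below is an illustrative equivalent of the implementation added in wget.py, not the shipped code, and the file names are invented:

    import fnmatch

    # Illustrative equivalent of wget.py's new glob_filter: keep names that
    # match any include glob, then drop those that also match an exclude glob.
    def glob_filter(names, glob_dict):
        inc = [p.strip() for p in (glob_dict.get("include") or "").split(",") if p.strip()]
        exc = [p.strip() for p in (glob_dict.get("exclude") or "").split(",") if p.strip()]
        kept = [n for n in names if any(fnmatch.fnmatch(n, "*" + p) for p in inc)]
        return [n for n in kept if not any(fnmatch.fnmatch(n, "*" + p) for p in exc)]

    # Hypothetical S3 listing, filtered with the shipped defaults:
    names = ["S1-IFG_final_unw.tif", "S1-IFG_final_tiles/z1.png", "metadata.json"]
    print(glob_filter(names, {"include": "*final*", "exclude": "*tiles*"}))
    # -> ['S1-IFG_final_unw.tif']  (the tiles file matched "*final*" but is excluded)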
