Add exclude glob option in wget job (#10)
* Refine defaults needed for thresholding

* change defaults to make sure globs get the file only, not the full paths!

* Refine defaults

* add exclude flag in wget job

* remove TODO

* optional flag

* verify json

* fix this bad commit change
shitong01 authored and pymonger committed Sep 26, 2019
1 parent 7cc73a5 commit 2fd1d89
Showing 3 changed files with 56 additions and 22 deletions.
docker/hysds-io.json.lw-tosca-wget-glob: 10 additions & 2 deletions
@@ -14,12 +14,20 @@
     "from": "submitter"
   },
   {
-    "name": "glob",
+    "name": "include_glob",
     "type": "text",
     "from": "submitter",
-    "default": "*fn*_logra2a1*.tif,*logamp*.tif,*.dem*,*msk.tif",
+    "default": "*final*",
     "placeholder": "A comma seperated list of globs to download"
   },
+  {
+    "name": "exclude_glob",
+    "type": "text",
+    "from": "submitter",
+    "default": "*tiles*",
+    "optional": true,
+    "placeholder": "(Optional) A comma seperated list of globs to exclude from included list"
+  },
   {
     "name": "name",
     "type": "text",
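For reference, a submission that keeps these defaults would surface in the job's _context.json roughly as follows. This is a sketch, not output from the commit: the field names come from the params above, and all other context fields are omitted.

    import json

    # Sketch: the two glob fields as they would appear in _context.json for a
    # submission that keeps the defaults above (other context fields omitted).
    context = {
        "include_glob": "*final*",  # comma-separated globs to download
        "exclude_glob": "*tiles*",  # comma-separated globs to filter back out
    }
    print(json.dumps(context, indent=2))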
docker/job-spec.json.lw-tosca-wget-glob: 5 additions & 1 deletion
@@ -19,7 +19,11 @@
     "destination": "positional"
   },
   {
-    "name": "glob",
+    "name": "include_glob",
     "destination": "context"
   },
+  {
+    "name": "exclude_glob",
+    "destination": "context"
+  }
   ]
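Both params use the context destination, so the submitted values are written to _context.json in the job's work directory. A minimal sketch of how a job picks them up (this mirrors the __main__ block of wget.py below; it assumes _context.json exists in the current directory):

    import json

    # Minimal sketch: read the submitter's glob params from _context.json in
    # the job's work directory; wget.py (below) builds glob_dict only when
    # both keys are present.
    with open("_context.json") as f:
        context = json.load(f)

    glob_dict = None
    if "include_glob" in context and "exclude_glob" in context:
        glob_dict = {"include": context["include_glob"],
                     "exclude": context["exclude_glob"]}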
wget.py: 41 additions & 19 deletions
@@ -22,7 +22,7 @@
 logger = logging.getLogger("hysds")


-def wget_script(dataset=None, glob_list=None):
+def wget_script(dataset=None, glob_dict=None):
     """Return wget script."""

     # query
@@ -68,7 +68,7 @@ def wget_script(dataset=None, glob_list=None):
     scroll_id = scan_result['_scroll_id']

     # stream output a page at a time for better performance and lower memory footprint
-    def stream_wget(scroll_id, glob_list=None):
+    def stream_wget(scroll_id, glob_dict=None):
         #formatted_source = format_source(source)
         yield '#!/bin/bash\n#\n' + \
               '# query:\n#\n' + \
@@ -107,8 +107,8 @@ def stream_wget(scroll_id, glob_list=None):
                 cut_dirs = 6
             if '.s3-website' in url or 'amazonaws.com' in url:
                 files = get_s3_files(url)
-                if glob_list:
-                    files = glob_filter(files, glob_list)
+                if glob_dict:
+                    files = glob_filter(files, glob_dict)
                 for file in files:
                     yield 'echo "downloading %s"\n' % file
                     if 's1a_ifg' in url:
@@ -128,7 +128,7 @@ def stream_wget(scroll_id, glob_list=None):

     # malarout: interate over each line of stream_wget response, and write to a file which is later attached to the email.
     with open('wget_script.sh','w') as f:
-        for i in stream_wget(scroll_id, glob_list):
+        for i in stream_wget(scroll_id, glob_dict):
             f.write(i)

     # for gzip compressed use file extension .tar.gz and modifier "w:gz"
@@ -199,16 +199,38 @@ def make_product(rule_name, query):
     with open("{0}/{0}.dataset.json".format(name), "w") as fp:
         json.dump({"id": name, "version": "v0.1"}, fp)

-def glob_filter(names,pattern):
+def glob_filter(names, glob_dict):
     import fnmatch
     files = []
-    for pat in pattern:
-        matching = fnmatch.filter(names, "*" + pat)
-        files.extend(matching)
+    files_exclude = []
+    include_csv = glob_dict.get("include", None)
+    exclude_csv = glob_dict.get("exclude", None)
+
+    if include_csv:
+        pattern_list = [item.strip() for item in include_csv.split(',')]
+
+        for pat in pattern_list:
+            matching = fnmatch.filter(names, "*" + pat)
+            files.extend(matching)
+        print("Got the following files to include: %s" % str(files))
+
+    if exclude_csv:
+        pattern_list_exc = [item.strip() for item in exclude_csv.split(',')]
+
+        for pat in pattern_list_exc:
+            matching = fnmatch.filter(names, "*" + pat)
+            files_exclude.extend(matching)
+
+        files_exclude = list(set(files_exclude))
+        print("Got the following files to exclude: %s" % str(files_exclude))
+
     #unique list
-    retfiles_set = set(files)
+    files_final = [x for x in files if x not in files_exclude]
+    retfiles_set = set(files_final)
     print("Got the following files: %s" % str(retfiles_set))
-    return list(retfiles_set)
+    return list(files_final)


 if __name__ == "__main__":
     '''
@@ -228,15 +250,15 @@ def glob_filter(names,pattern):
     except:
         raise Exception('unable to parse _context.json from work directory')

-    globs = None
-    if "glob" in context:
-        glob_strs = context["glob"]
-        globs = [item.strip() for item in glob_strs.split(',')]
-        print("Got the following globs: %s" % str(globs))
+    glob_dict = None
+    if "include_glob" in context and "exclude_glob" in context:
+        glob_dict = {"include":context["include_glob"], "exclude": context["exclude_glob"]}

     # getting the script
-    wget_script(query, globs)
-    if emails == "unused":
-        make_product(rule_name, query)
+    wget_script(query, glob_dict)
+    if emails=="unused":
+        make_product(rule_name, query)
     else:
         # now email the query
         email(query, emails, rule_name)
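To make the new include/exclude behavior concrete, here is a self-contained usage sketch. The compact glob_filter below is an illustrative equivalent of the implementation added in wget.py, not the shipped code, and the file names are invented:

    import fnmatch

    # Illustrative equivalent of wget.py's new glob_filter: keep names that
    # match any include glob, then drop those that also match an exclude glob.
    def glob_filter(names, glob_dict):
        inc = [p.strip() for p in (glob_dict.get("include") or "").split(",") if p.strip()]
        exc = [p.strip() for p in (glob_dict.get("exclude") or "").split(",") if p.strip()]
        kept = [n for n in names if any(fnmatch.fnmatch(n, "*" + p) for p in inc)]
        return [n for n in kept if not any(fnmatch.fnmatch(n, "*" + p) for p in exc)]

    # Hypothetical S3 listing, filtered with the shipped defaults:
    names = ["S1-IFG_final_unw.tif", "S1-IFG_final_tiles/z1.png", "metadata.json"]
    print(glob_filter(names, {"include": "*final*", "exclude": "*tiles*"}))
    # -> ['S1-IFG_final_unw.tif']  (the tiles file matched "*final*" but is excluded)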
