"""Invoke tasks for building, testing, documenting, and releasing the webdataset package."""

import glob
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import textwrap
import time
import yaml
from invoke import task
VENV = "venv"
BIN = f"{VENV}/bin"
PYTHON3 = f"{BIN}/python3"
ACTIVATE = f". {BIN}/activate;"
PIP = f"{BIN}/pip"
PACKAGE = "webdataset"
DOCKER = "wdstest"
COMMANDS = []
MODULES = [os.path.splitext(fname)[0] for fname in glob.glob(f"{PACKAGE}/*.py")]
# Turn source paths into dotted module names, skipping private modules.
MODULES = [
    re.sub("/", ".", name)
    for name in MODULES
    if not os.path.basename(name).startswith("_")
]
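# For reference, MODULES then holds dotted names such as "webdataset.compat"
# (hypothetical example; the actual list depends on the files in the package).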

@task
def clean(c):
    "Remove temporary files."
    c.run("rm -rf build site dist __pycache__ */__pycache__ *.pyc */*.pyc")

@task(clean)
def cleanall(c):
    "Remove temporary files and the virtualenv."
    c.run(f"rm -rf {VENV}")

@task
def venv(c):
    "Build the virtualenv."
    c.run(f"test -d {VENV} || python3 -m venv {VENV}")
    c.run(f"{PIP} install --upgrade pip")
    c.run(f"{PIP} install -e '.[dev]'")
    c.run("pre-commit install || true")
    print("done")

@task
def ruff(c):
    "Run the ruff linter."
    c.run(f"{BIN}/ruff check .")

@task
def docsserve(c):
    "Serve the documentation locally and open it in a browser."
    c.run(f"{BIN}/mkdocs serve -o")

@task
def docspush(c):
    """Generate the documentation and push it to Github pages."""
    c.run("rm -rf site")
    c.run("mkdocs build")
    c.run("ghp-import -n -p site")
    c.run("rm -rf site")

def summarize_notebook(nb):
    """Summarize a notebook (given as its markdown conversion) using sgpt."""
    prompt = textwrap.dedent(
        """
        Here is a notebook in markdown format. Please summarize the purpose and
        contents of the notebook in a few sentences. The only markup you may use
        is `...` for quoting identifiers. Except for quoted identifiers, do not
        include any code or output in the summary. Do not use any other markup
        or markdown, just plain text.

        In your summary, focus on the use of the webdataset, wids, or wsds
        libraries (these are different libraries; talk only about the one
        actually used in the notebook) and on what the notebook illustrates
        about their use, rather than on the deep learning or processing problem
        used to illustrate the library. Mention the primary classes from those
        libraries used/exemplified by the notebook.

        Keep your summary brief, 1-3 sentences at most. Do not describe the
        contents of the notebook step-by-step.
        """
    )
    summary = os.popen(f"sgpt --no-md '{prompt}' < {nb}").read().strip()
    summary = textwrap.fill(summary, 80)
    return summary

def find_with_key(d, key):
    """Depth-first search a nested dict/list structure for the given key."""
    if isinstance(d, dict):
        if key in d:
            return d[key]
        for v in d.values():
            result = find_with_key(v, key)
            if result is not None:
                return result
    if isinstance(d, list):
        for v in d:
            result = find_with_key(v, key)
            if result is not None:
                return result
    return None
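# Usage sketch (hypothetical nav structure): given
#   cfg = {"nav": [{"Examples": [{"wds": ["examples/wds/foo.md"]}]}]}
# find_with_key(cfg, "Examples") returns [{"wds": ["examples/wds/foo.md"]}],
# i.e. the value under the first occurrence of the key.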

@task
def docsnbgen(c):
    """Convert the example notebooks to markdown docs pages, with summaries."""
    assert os.path.exists("./mkdocs.yml")
    assert os.path.exists("./docs")
    assert os.path.exists("./examples")
    with open("mkdocs.yml") as stream:
        structure = yaml.safe_load(stream)
    structure = find_with_key(structure, "Examples")
    for item in structure:
        if not isinstance(item, dict):
            continue
        k = list(item.keys())[0]
        v = item[k]
        odir = f"./docs/examples/{k}"
        os.makedirs(odir, exist_ok=True)
        for onav in v:
            if "index.md" in onav:
                continue
            output = f"./docs/{onav}"
            nb = "./examples/" + os.path.basename(output).replace(".md", ".ipynb")
            print(nb, "-->", output)
            # Skip notebooks whose generated markdown is already up to date.
            if os.path.exists(output) and os.path.getmtime(nb) < os.path.getmtime(output):
                continue
            c.run(
                f"{ACTIVATE}jupyter nbconvert --ClearOutputPreprocessor.enabled=True --inplace {nb}"
            )
            c.run(
                f"{ACTIVATE}jupyter nbconvert {nb} --to markdown --output-dir=docs/examples/{k}"
            )
            summary = summarize_notebook(output)
            summary_fname = output.replace(".md", ".summary.md")
            with open(summary_fname, "w") as stream:
                stream.write(summary)
            print()

        def mksection(summary_fname):
            """Format one summary file as a linked section for index.md."""
            with open(summary_fname) as stream:
                summary = stream.read().strip()
            section_name = os.path.basename(summary_fname).replace(".summary.md", "")
            capitalized_name = section_name.replace("-", " ").title()
            link = f"[{capitalized_name}](./{section_name})"
            return f"### {capitalized_name}\n\n{link}\n\n{summary}\n\n"

        summaries = [
            mksection(fname) for fname in glob.glob(f"docs/examples/{k}/*.summary.md")
        ]
        with open(f"docs/examples/{k}/index.md", "w") as stream:
            print("Writing", f"docs/examples/{k}/index.md")
            stream.write("\n\n".join(summaries))
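# The mkdocs.yml fragment docsnbgen consumes is assumed to look roughly like:
#   nav:
#     - Examples:
#         - wds:
#             - examples/wds/index.md
#             - examples/wds/some-notebook.md
# Each section key ("wds" above, a hypothetical name) becomes a docs/examples/<key>/
# directory holding the converted markdown, per-notebook summaries, and an index.md.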

@task
def nbrun(c):
    """Run selected notebooks with papermill+parameters; put output into examples/docs/output."""

    def nbprocess(c, nb, *args):
        """Run one notebook if its output is missing or older than the source."""
        out_file = f"docs/output/{nb}"
        if not os.path.exists(out_file) or os.path.getmtime(nb) > os.path.getmtime(out_file):
            c.run(
                f"../venv/bin/python -m papermill -l python {' '.join(args)} {nb} docs/output/_{nb}"
            )
            c.run(f"mv docs/output/_{nb} {out_file}")

    with c.cd("examples"):  # run everything from the examples directory
        c.run("rm -f *.log *.out.ipynb *.stripped.ipynb _temp.ipynb", pty=True)
        c.run("mkdir -p docs/output", pty=True)
        nbprocess(c, "generate-text-dataset.ipynb")
        nbprocess(c, "train-ocr-errors-hf.ipynb", "-p", "max_steps", "100")
        nbprocess(c, "train-resnet50-wds.ipynb", "-p", "max_steps", "10000")
        nbprocess(c, "train-resnet50-wids.ipynb", "-p", "max_steps", "10000")
        nbprocess(c, "train-resnet50-multiray-wds.ipynb", "-p", "max_steps", "1000")
        nbprocess(c, "train-resnet50-multiray-wids.ipynb", "-p", "max_steps", "1000")
        nbprocess(c, "tesseract-wds.ipynb")

@task
def quick(c):
    "Run only the quick tests."
    # Run `invoke venv` first if the virtualenv does not exist yet.
    c.run(f"{ACTIVATE}{PYTHON3} -m pytest -x tests -m quick")

@task
def test(c):
    "Run the tests."
    # Run `invoke venv` first if the virtualenv does not exist yet.
    c.run(f"{ACTIVATE}{PYTHON3} -m pytest -x tests")

@task
def testwids(c):
    "Run the wids tests."
    c.run(f"{ACTIVATE}{PYTHON3} -m pytest -x tests/test_wids*.py")

@task
def testdebug(c):
    "Run the tests with --pdb."
    c.run(f"{ACTIVATE}{PYTHON3} -m pytest -x --pdb tests")

@task
def testcov(c):
    "Run the tests and generate coverage.json and coverage.lcov."
    # Run `invoke venv` first if the virtualenv does not exist yet.
    c.run(
        f"{ACTIVATE}{PYTHON3} -m pytest ./tests --cov=wids "
        + "--cov=webdataset --cov-report=term-missing --cov-branch "
        + "--cov-report=json:coverage.json --cov-report=lcov:coverage.lcov"
    )

@task
def faqmake(c):
    "Create the FAQ.md file from github issues."
    from helpers.faq import faq_intro, generate_faq_entries_from_issues, wrap_long_lines

    generate_faq_entries_from_issues()
    # Named entries first, then issue-numbered entries, newest first.
    entries = sorted(glob.glob("faqs/[a-zA-Z]*.md"))
    entries += sorted(glob.glob("faqs/[0-9]*.md"), reverse=True)
    with open("FAQ.md", "w") as output:
        output.write(faq_intro)
        for fname in entries:
            with open(fname) as stream:
                text = stream.read()
            if "N/A" in text[:20]:
                continue
            text = text.strip()
            text = re.sub(r"[ \t]+$", "", text, flags=re.MULTILINE)
            text = wrap_long_lines(text)
            if len(text) < 10:
                continue
            text += "\n\n"
            if match := re.match(r"faqs/([0-9]+)\.md", fname):
                issue_number = int(match.group(1))
                text = f"Issue #{issue_number}\n\n{text}"
            output.write("-" * 78 + "\n\n")
            output.write(text.strip() + "\n\n")
    c.run("cp FAQ.md docs/FAQ.md")

# def twine_pypi_release(c):
#     "Manually push to PyPI via Twine."
#     c.run("rm -f dist/*")
#     c.run(f"{PYTHON3} -m build --sdist")
#     c.run(f"{PYTHON3} -m build --wheel")
#     c.run("twine check dist/*")
#     c.run("twine upload dist/*")

def update_version_numbers_locally(c):
    """Bump the patch version number locally with bump2version."""
    c.run("bump2version patch")

@task
def releasenotes(c):
    """Draft RELEASE_NOTES.md by summarizing commits since the last release."""
    # Get the most recent release tag using gh.
    last_tag = c.run("gh release list --limit 1 | cut -f1").stdout.strip()
    print("Last tag:", last_tag)
    # Summarize the commit messages between that tag and HEAD.
    cmd = f"git log {last_tag}..HEAD | "
    cmd += "sgpt --no-md 'summarize these commit messages into Python/github release notes'"
    notes = c.run(cmd).stdout
    with open("RELEASE_NOTES.md", "w") as stream:
        stream.write(notes)

def read_version():
    """Read the version number from pyproject.toml."""
    with open("pyproject.toml") as stream:
        for line in stream:
            if line.strip().startswith("version"):
                version = line.split("=")[1].strip()
                return version.replace('"', "")
    raise ValueError("no version found in pyproject.toml")
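# Assumes a PEP 621 style line in pyproject.toml such as:
#   version = "0.2.100"
# (hypothetical version number shown for illustration).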

@task
def release(c):
    "Tag the current version as a release on Github."
    assert os.path.exists("RELEASE_NOTES.md"), "run `invoke releasenotes` first"
    assert c.run("bump2version --tag patch").ok
    assert c.run("git push --follow-tags").ok
    version = read_version()
    tag = "v" + version
    assert c.run(f"gh release create {tag} -t {tag} --notes-file RELEASE_NOTES.md").ok
    assert c.run("rm RELEASE_NOTES.md").ok
    print(f"Release {version} created successfully.")