diff --git a/src/fetch-docs.py b/src/fetch-docs.py index 985212b..57b1f29 100755 --- a/src/fetch-docs.py +++ b/src/fetch-docs.py @@ -27,6 +27,9 @@ if __name__ == '__main__': # Use a Session for connection pooling session = requests.Session() + session.headers.update({ + "User-Agent": "https://github.com/nextstrain/docs.nextstrain.org (hello@nextstrain.org)", + }) class RemoteDoc: def __init__(self, source_url, dest_path): diff --git a/src/reference/data-formats.rst b/src/reference/data-formats.rst index 0f85b70..2db1329 100644 --- a/src/reference/data-formats.rst +++ b/src/reference/data-formats.rst @@ -2,6 +2,39 @@ Data formats ============ +.. contents:: Table of Contents + :local: + +TSV +=== + +Nextstrain strongly prefers using TSV files for metadata even though Augur commands support other delimiters as inputs. +If you are using other formats, we recommend using :doc:`augur curate passthru ` to convert them to TSV. + +Nextstrain tools and workflows produce `RFC 4180 CSV-like TSVs `__. + +When using `csvtk `__: + +* the ``--lazy`` (``-l``) option should not be necessary +* the ``fix-quotes``/``del-quotes`` commands should not be necessary + +When using `tsv-utils `__: + +* pass the inputs through ``csv2tsv --csv-delim $'\t'`` +* pass the final ``tsv-utils`` outputs through ``csvtk fix-quotes --tabs`` + +.. code-block:: bash + + csv2tsv --csv-delim $'\t' metadata.tsv \ + | tsv-select -H -f strain,date \ + | tsv-uniq -H -f strain \ + | csvtk fix-quotes --tabs > output.tsv + +See our internal `discussion on TSV standardization `__ for more details. + +JSON +==== + Nextstrain uses a few different kinds of `JSON `__ files at various stages in a typical build.