diff --git a/README.md b/README.md index 5ad8791..b046f3b 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,9 @@ Install this tool using `pip`: The `file` command analyzes the history of an individual file. -This assumes you have a JSON file that consists of an array of objects, and that has multiple versions stored away in the Git history, likely through [Git scraping](https://simonwillison.net/2020/Oct/9/git-scraping/). +The command assumes you have a JSON file that consists of an array of objects, and that has multiple versions stored away in the Git history, likely through [Git scraping](https://simonwillison.net/2020/Oct/9/git-scraping/). + +(CSV and other formats are supported too, see below.) Most basic usage is: @@ -58,9 +60,17 @@ Note that `id`, `item`, `version` and `commit` are reserved column names that ar There is one exception: if you have an `id` column and use `--id id` without specifying more than one ID column, your ìd` column will be used as the item ID but will not be renamed. +### CSV and TSV data + +If the data in your repository is a CSV or TSV file you can process it by adding the `--csv` option. This will attempt to detect which delimiter is used by the file, so the same option works for both comma- and tab-separated values. + + git-history file trees.db trees.csv --id TreeID --csv + ### Custom conversions using --convert -This tool expects each version of the stored file to be a JSON file that looks something like this: +If your data is not already either CSV/TSV or a flat JSON array, you can reshape it using the `--convert` option. + +The format needed by this tool is an array of dictionaries that looks like this: ```json [ @@ -77,7 +87,7 @@ This tool expects each version of the stored file to be a JSON file that looks s ] ``` -If your data does not fit this shape, you can still use this tool to analyze it by writing a snippet of Python code that converts each stored file content into a Python list of dictionaries. 
+If your data does not fit this shape, you can provide a snippet of Python code to convert the on-disk content of each stored file into a Python list of dictionaries. For example, if your stored files each look like this: @@ -104,13 +114,15 @@ json.loads(content)["incidents"] ``` (The `json` module is exposed to your custom function by default.) -You would run the tool like this: +You would then run the tool like this: git-convert file database.db incidents.json \ --id id \ --convert 'json.loads(content)["incidents"]' -If you need to import additional modules you can do so with `--import`. This example shows how you could read a CSV file that uses `;` as the delimiter: +The `content` variable is always a `bytes` object representing the content of the file at a specific moment in the repository's history. + +You can import additional modules using `--import`. This example shows how you could read a CSV file that uses `;` as the delimiter: git-history file trees.db ../sf-tree-history/Street_Tree_List.csv \ --repo ../sf-tree-history \ diff --git a/git_history/cli.py b/git_history/cli.py index 3e1e529..61e29fd 100644 --- a/git_history/cli.py +++ b/git_history/cli.py @@ -3,6 +3,7 @@ import hashlib import json import sqlite_utils +import textwrap from pathlib import Path @@ -49,9 +50,14 @@ def cli(): "ids", "--id", multiple=True, help="Columns (can be multiple) to use as an ID" ) @click.option("ignore", "--ignore", multiple=True, help="Columns to ignore") +@click.option( + "csv_", + "--csv", + is_flag=True, + help="Expect CSV/TSV data, not JSON", +) @click.option( "--convert", - default="json.loads(content)", help="Python code to read each file version content and return it as a list of dicts. 
Defaults to json.parse(content)", ) @click.option( @@ -74,11 +80,30 @@ def file( branch, ids, ignore, + csv_, convert, imports, ignore_duplicate_ids, ): "Analyze the history of a specific file and write it to SQLite" + if csv_ and convert: + raise click.ClickException("Cannot use both --csv and --convert") + + if csv_: + convert = textwrap.dedent( + """ + decoded = content.decode("utf-8") + dialect = csv.Sniffer().sniff(decoded[:512]) + reader = csv.DictReader(io.StringIO(decoded), dialect=dialect) + return list(reader) + """ + ) + imports = ["io", "csv"] + + if not convert: + convert = "json.loads(content)" + + # Clean up the provided code # If single line and no 'return', add the return if "\n" not in convert and not convert.strip().startswith("return "): convert = "return {}".format(convert) diff --git a/tests/test_git_history.py b/tests/test_git_history.py index b62ecbf..ec19ca8 100644 --- a/tests/test_git_history.py +++ b/tests/test_git_history.py @@ -45,6 +45,14 @@ def repo(tmpdir): ), "utf-8", ) + (repo_dir / "trees.csv").write_text( + "TreeID,name\n1,Sophia\n2,Charlie", + "utf-8", + ) + (repo_dir / "trees.tsv").write_text( + "TreeID\tname\n1\tSophia\n2\tCharlie", + "utf-8", + ) git_commit = [ "git", "-c", @@ -61,6 +69,8 @@ def repo(tmpdir): "items.json", "items-with-reserved-columns.json", "items-with-banned-columns.json", + "trees.csv", + "trees.tsv", ], cwd=str(repo_dir), ) @@ -306,3 +316,45 @@ def test_file_with_banned_columns(repo, tmpdir, specify_id): "}\n" "" ) + + +@pytest.mark.parametrize("file", ("trees.csv", "trees.tsv")) +def test_csv_tsv(repo, tmpdir, file): + runner = CliRunner() + db_path = str(tmpdir / "db.db") + with runner.isolated_filesystem(): + result = runner.invoke( + cli, + [ + "file", + db_path, + str(repo / file), + "--repo", + str(repo), + "--id", + "TreeID", + "--csv", + ], + catch_exceptions=False, + ) + assert result.exit_code == 0 + db = sqlite_utils.Database(db_path) + assert db.schema == ( + "CREATE TABLE [commits] (\n" + " 
[hash] TEXT PRIMARY KEY,\n" + " [commit_at] TEXT\n" + ");\n" + "CREATE TABLE [items] (\n" + " [id] TEXT PRIMARY KEY,\n" + " [TreeID] TEXT,\n" + " [name] TEXT\n" + ");\n" + "CREATE TABLE [item_versions] (\n" + " [item] TEXT REFERENCES [items]([id]),\n" + " [version] INTEGER,\n" + " [commit] TEXT REFERENCES [commits]([hash]),\n" + " [TreeID] TEXT,\n" + " [name] TEXT,\n" + " PRIMARY KEY ([item], [version])\n" + ");" + )