Integration Testing, Round 2 (#26)
* Integration test build matrix is now parameterized

- This should allow each independent test to control its own passing thresholds
- This will allow us to more flexibly adjust passing thresholds over time as the
  project improves

* Disables fail-fast flag for testing matrix

* Sets a lower passing threshold for bioconda tests to start

* Truncates logs when scripts run from a GitHub Workflow

- From experience, GitHub seems to struggle to buffer all of the logging
  information we dump to the console.
- To mitigate this, we suppress dumping the failed recipe file names to the
  console. This should dramatically shorten the amount of text being buffered.
- This is unfortunate for the time being, but it should also make it easier to
  navigate the output of `convert` and `rattler-bulk-build` (a sketch of the
  approach follows this list)
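
A minimal sketch of the approach, assuming an illustrative `render_report()` helper (the real change lives in `convert.py` and `rattler_bulk_build.py`, shown in the diff below):

```python
import json


def render_report(stats: dict[str, float], failed_recipes: list[str], truncate: bool) -> str:
    """Builds the final JSON report, dropping per-recipe failure lists when truncating."""
    report: dict = {"statistics": stats}
    if not truncate:
        # The long lists of failed recipe file names are what flood the CI console,
        # so include them only when truncation is off.
        report["recipes_with_errors"] = failed_recipes
    return json.dumps(report, indent=2)
```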

* Experimental timeout mechanism

* Improves timeout mechanism

- Timeouts are now tracked with the `subprocess.run()` timeout parameter instead
  of a UNIX signal workaround from StackOverflow (sketched below)
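
A minimal sketch of that pattern, with an illustrative `run_build()` wrapper (the actual implementation is in `rattler_bulk_build.py`, shown in the diff below):

```python
import subprocess

# Illustrative value; the bulk-build script below uses 120 seconds.
BUILD_TIMEOUT_SECONDS = 120


def run_build(cmd: str) -> int:
    """Runs a build command, mapping a hang onto a distinct exit code."""
    try:
        result = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            encoding="utf-8",
            check=False,
            timeout=BUILD_TIMEOUT_SECONDS,  # raises TimeoutExpired if the build hangs
        )
    except subprocess.TimeoutExpired:
        return 43  # matches the TIMEOUT code used by the script below
    return result.returncode
```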

* Reduced timeout

* Removes `ExitCode` enum

- Exit codes are now stored as ints; there is no way to predict what
  rattler-build may return to us
- Tweaks to minimum test passing metrics

* Adds timeout, disables bioconda_03 and 04

- I can't figure out why these integration tests cause so many issues for the
  GitHub runner, so it'll be a story for another time

* Fixes disabling tests

* Fixes minor typo

* Starts work on conda-forge integration test

Work is based on the branch used for #15
- Adds integration test case for conda-forge
- Adds new `scripts` directory for developer helper scripts
- Adds `randomly_select_recipes.py` utility that allows developers to randomly
  select `n` recipes from a GitHub organization hosting publicly accessible
  feedstock repositories

* Fixes issue with parsing raw bytes from the GET request

* Bumps CI minimum scores

* Test data now pulls from `conda-recipe-manager-test-data`

- Integration tests now pull data from the test data repo using the
  sparse-checkout option in the checkout action.

* Fixes typos
schuylermartin45 authored Apr 12, 2024
1 parent e22a369 commit dc5b370
Showing 9 changed files with 226 additions and 35 deletions.
37 changes: 34 additions & 3 deletions .github/workflows/ci.yaml
@@ -63,13 +63,44 @@ jobs:
## Integration tests ##
integration-rattler:
runs-on: ubuntu-latest
timeout-minutes: 45
name: Test on ${{ matrix.test-directory }}
strategy:
fail-fast: false
matrix:
test-directory:
- anaconda_recipes
- anaconda_recipes_01
- bioconda_recipes_01
- bioconda_recipes_02
- bioconda_recipes_03
- bioconda_recipes_04
include:
- test-directory: anaconda_recipes_01
convert-success: 0.80
rattler-success: 0.50
- test-directory: bioconda_recipes_01
convert-success: 0.55
rattler-success: 0.02
- test-directory: bioconda_recipes_02
convert-success: 0.55
rattler-success: 0.08
- test-directory: bioconda_recipes_03
convert-success: 0.55
rattler-success: 0.05
- test-directory: bioconda_recipes_04
convert-success: 0.55
rattler-success: 0.05
# 2,000 randomly selected conda-forge recipes
- test-directory: conda_forge_recipes_01
convert-success: 0.75
rattler-success: 0.08
steps:
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
with:
repository: conda-incubator/conda-recipe-manager-test-data
path: test_data
sparse-checkout: recipes_v0/${{ matrix.test-directory }}
- uses: ./.github/actions/setup-env
with:
python-version: "3.11"
@@ -78,5 +109,5 @@ jobs:
source $CONDA/bin/activate
conda activate conda-recipe-manager
conda install -y -c conda-forge rattler-build
conda-recipe-manager convert -m 0.80 -o recipe.yaml tests/test_aux_files/integration/${{ matrix.test-directory }}
conda-recipe-manager rattler-bulk-build -m 0.30 tests/test_aux_files/integration/${{ matrix.test-directory }} --render-only
conda-recipe-manager convert -t -m ${{ matrix.convert-success }} -o recipe.yaml test_data/recipes_v0/${{ matrix.test-directory }}
conda-recipe-manager rattler-bulk-build -t -m ${{ matrix.rattler-success }} test_data/recipes_v0/${{ matrix.test-directory }} --render-only
9 changes: 5 additions & 4 deletions Makefile
@@ -14,6 +14,7 @@ PYTHON3 := "$(CONDA_PREFIX)/bin/python3"
CONDA_ENV_NAME ?= conda-recipe-manager
SRC_DIR = conda_recipe_manager
TEST_DIR = tests/
SCRIPTS_DIR = scripts/

define BROWSER_PYSCRIPT
import os, webbrowser, sys
@@ -104,11 +105,11 @@ lint: ## runs the linter against the project
pylint --rcfile=.pylintrc $(SRC_DIR) $(TEST_DIR)

format: ## runs the code auto-formatter
isort --profile black --line-length=120 $(SRC_DIR) $(TEST_DIR)
black --line-length=120 $(SRC_DIR) $(TEST_DIR)
isort --profile black --line-length=120 $(SRC_DIR) $(TEST_DIR) $(SCRIPTS_DIR)
black --line-length=120 $(SRC_DIR) $(TEST_DIR) $(SCRIPTS_DIR)

format-docs: ## runs the docstring auto-formatter. Note this requires manually installing `docconvert`
docconvert --in-place --config .docconvert.json $(SRC_DIR) $(TEST_DIR)
docconvert --in-place --config .docconvert.json $(SRC_DIR) $(TEST_DIR) $(SCRIPTS_DIR)

analyze: ## runs static analyzer on the project
mypy --config-file=.mypy.ini --cache-dir=/dev/null $(SRC_DIR) $(TEST_DIR)
mypy --config-file=.mypy.ini --cache-dir=/dev/null $(SRC_DIR) $(TEST_DIR) $(SCRIPTS_DIR)
15 changes: 11 additions & 4 deletions conda_recipe_manager/commands/convert.py
@@ -164,8 +164,14 @@ def process_recipe(file: Path, path: Path, output: Optional[Path]) -> tuple[str,
default=DEFAULT_BULK_SUCCESS_PASS_THRESHOLD,
help="Sets a minimum passing success rate for bulk operations.",
)
@click.option(
"--truncate",
"-t",
is_flag=True,
help="Truncates logging. On large tests in a GitHub CI environment, this can eliminate log buffering issues.",
)
def convert(
path: Path, output: Optional[Path], min_success_rate: float
path: Path, output: Optional[Path], min_success_rate: float, truncate: bool
) -> None: # pylint: disable=redefined-outer-name
"""
Recipe conversion CLI utility. By default, recipes print to STDOUT. Messages always print to STDERR. Takes 1 file or
@@ -281,14 +287,15 @@ def convert(
}

final_output = {
"recipes_with_exceptions": recipes_with_except,
"recipes_with_errors": recipes_with_errors,
"recipes_with_warnings": recipes_with_warnings,
"exception_histogram": except_histogram,
"error_histogram": errors_histogram,
"warnings_histogram": warnings_histogram,
"statistics": stats,
}
if not truncate:
final_output["recipes_with_exceptions"] = recipes_with_except
final_output["recipes_with_errors"] = recipes_with_errors
final_output["recipes_with_warnings"] = recipes_with_warnings

print_out(json.dumps(final_output, indent=2))
sys.exit(ExitCode.SUCCESS if percent_recipe_success >= min_success_rate else ExitCode.MISSED_SUCCESS_THRESHOLD)
60 changes: 36 additions & 24 deletions conda_recipe_manager/commands/rattler_bulk_build.py
@@ -12,7 +12,6 @@
import sys
import time
from dataclasses import dataclass
from enum import IntEnum
from pathlib import Path
from typing import Final, cast

@@ -26,17 +25,16 @@
# "successfully"
DEFAULT_BULK_SUCCESS_PASS_THRESHOLD: Final[float] = 0.80
RATTLER_ERROR_REGEX = re.compile(r"Error:\s+.*")
# Timeout to halt operation
DEFAULT_RATTLER_BUILD_TIMEOUT: Final[int] = 120


class ExitCode(IntEnum):
"""
Error codes to return upon script completion
"""

SUCCESS = 0
NO_FILES_FOUND = 1
# In bulk operation mode, this indicates that the % success threshold was not met
MISSED_SUCCESS_THRESHOLD = 42
## Error codes (NOTE: there may be overlap with rattler-build) ##
SUCCESS: Final[int] = 0
NO_FILES_FOUND: Final[int] = 1
# In bulk operation mode, this indicates that the % success threshold was not met
MISSED_SUCCESS_THRESHOLD: Final[int] = 42
TIMEOUT: Final[int] = 43


@dataclass
@@ -45,7 +43,7 @@ class BuildResult:
Struct that contains the results, metadata, errors, etc of building a single recipe file.
"""

code: ExitCode
code: int
errors: list[str]


@@ -60,16 +58,23 @@ def build_recipe(file: Path, path: Path, args: list[str]) -> tuple[str, BuildRes
"""
cmd: list[str] = ["rattler-build", "build", "-r", str(file)]
cmd.extend(args)
output: Final[subprocess.CompletedProcess[str]] = subprocess.run(
" ".join(cmd),
encoding="utf-8",
capture_output=True,
shell=True,
check=False,
)
try:
output: Final[subprocess.CompletedProcess[str]] = subprocess.run(
" ".join(cmd),
encoding="utf-8",
capture_output=True,
shell=True,
check=False,
timeout=DEFAULT_RATTLER_BUILD_TIMEOUT,
)
except subprocess.TimeoutExpired:
return str(file.relative_to(path)), BuildResult(
code=TIMEOUT,
errors=["Recipe build dry-run timed out."],
)

return str(file.relative_to(path)), BuildResult(
code=ExitCode(output.returncode),
code=output.returncode,
errors=cast(list[str], RATTLER_ERROR_REGEX.findall(output.stderr)),
)

@@ -92,8 +97,14 @@ def build_recipe(file: Path, path: Path, args: list[str]) -> tuple[str, BuildRes
default=DEFAULT_BULK_SUCCESS_PASS_THRESHOLD,
help="Sets a minimum passing success rate for bulk operations.",
)
@click.option(
"--truncate",
"-t",
is_flag=True,
help="Truncates logging. On large tests in a GitHub CI environment, this can eliminate log buffering issues.",
)
@click.pass_context
def rattler_bulk_build(ctx: click.Context, path: Path, min_success_rate: float) -> None:
def rattler_bulk_build(ctx: click.Context, path: Path, min_success_rate: float, truncate: bool) -> None:
"""
Given a directory of feedstock repositories, performs multiple recipe builds using rattler-build.
All unknown options and arguments for this script are passed directly to `rattler-build build`.
@@ -108,7 +119,7 @@ def rattler_bulk_build(ctx: click.Context, path: Path, min_success_rate: float)

if not files:
print_err(f"No `recipe.yaml` files found in: {path}")
sys.exit(ExitCode.NO_FILES_FOUND)
sys.exit(NO_FILES_FOUND)

# Process recipes in parallel
thread_pool_size: Final[int] = mp.cpu_count()
@@ -123,7 +134,7 @@ def rattler_bulk_build(ctx: click.Context, path: Path, min_success_rate: float)
recipes_with_errors: list[str] = []
error_histogram: dict[str, int] = {}
for file, build_result in results.items():
if build_result.code == ExitCode.SUCCESS:
if build_result.code == SUCCESS:
total_success += 1
else:
total_errors += 1
@@ -149,10 +160,11 @@ def rattler_bulk_build(ctx: click.Context, path: Path, min_success_rate: float)
},
}
final_output = {
"recipes_with_build_error_code": recipes_with_errors,
"error_histogram": error_histogram,
"stats": stats,
}
if not truncate:
final_output["recipes_with_build_error_code"] = recipes_with_errors

print(json.dumps(final_output, indent=2))
sys.exit(ExitCode.SUCCESS if percent_success >= min_success_rate else ExitCode.MISSED_SUCCESS_THRESHOLD)
sys.exit(SUCCESS if percent_success >= min_success_rate else MISSED_SUCCESS_THRESHOLD)
2 changes: 2 additions & 0 deletions environment.yaml
@@ -21,4 +21,6 @@ dependencies:
- conda-build
- jsonschema
- types-jsonschema
- requests
- types-requests
- pre-commit
1 change: 1 addition & 0 deletions pyproject.toml
@@ -37,6 +37,7 @@ dependencies = [
"jinja2",
"pyyaml",
"jsonschema",
"requests",
]

[project.optional-dependencies]
1 change: 1 addition & 0 deletions recipe/meta.yaml
@@ -28,6 +28,7 @@ requirements:
- jinja2
- pyyaml
- jsonschema
- requests

test:
imports:
28 changes: 28 additions & 0 deletions scripts/README.md
@@ -0,0 +1,28 @@
# Scripts

## Overview

This directory contains 1-off development scripts related to this project.

They should not be packaged to be run by a user/consumer of this project.

## randomly_select_recipes.py

Given a list of feedstock repositories owned by a GitHub organization, randomly selects `NUM_RECIPES` recipe
files to dump into `OUT_DIR`.

### Dependencies
- `requests`
- Some manual work with `gh` to produce the input file

### Usage
```sh
./randomly_select_recipes.py [-e EXCLUDE_FILE] FILE NUM_RECIPES OUT_DIR
```
Where `-e EXCLUDE_FILE` is a list of repository names (one per line) to ignore when randomly selecting
recipes from the input list. This is useful for generating multiple non-overlapping sets of repository files.

For `conda-forge`, the input file used by this script was generated with:
```sh
gh repo list conda-forge -L 20000 > conda-forge-list.out
```
108 changes: 108 additions & 0 deletions scripts/randomly_select_recipes.py
@@ -0,0 +1,108 @@
#!/usr/bin/env python3
"""
File: randomly_select_recipes.py
Description: Helper script to randomly select and acquire recipe files from a GitHub org.
"""
import argparse
import csv
import multiprocessing as mp
import random
from pathlib import Path
from typing import Final, cast

import requests

# GET request timeout, in seconds
HTTP_GET_TIMEOUT: Final[float] = 15


def fetch_repo(org_repo: str, out_dir: Path) -> str:
    """
    Fetch a feedstock repo's recipe file and dump it to a corresponding location on disk.
    :param org_repo: String containing `org/repo`, which is what `gh repo list` returns
    :param out_dir: Path to the directory where files should be saved to
    :returns: The repository identifier, if successfully pulled and saved. Otherwise returns an empty string
    """
    url_options: Final[list[str]] = [
        f"https://raw.githubusercontent.com/{org_repo}/main/recipe/meta.yaml",
        f"https://raw.githubusercontent.com/{org_repo}/master/recipe/meta.yaml",
    ]

    slash_idx: Final[int] = org_repo.find("/")
    if slash_idx < 0:
        return ""
    repo: Final[str] = org_repo[slash_idx + 1 :]
    file_path: Final[Path] = out_dir / f"{repo}/recipe/meta.yaml"

    for url in url_options:
        try:
            response = requests.get(url, timeout=HTTP_GET_TIMEOUT)
            if response.status_code == 200:
                file_path.parent.mkdir(exist_ok=True, parents=True)
                file_path.write_text(response.text)
                return org_repo
        except requests.exceptions.RequestException:  # type: ignore[misc]
            continue
    return ""


def main() -> None:
    """
    Main execution point of the script
    """
    parser = argparse.ArgumentParser(
        description="Randomly pulls n number of recipe files from a list of repos from a GitHub organization"
    )
    parser.add_argument("--exclude", "-e", default="", type=str, help="File containing a list of repos to exclude")
    parser.add_argument(
        "file", type=Path, help="File containing the output of `gh repo list <org>`"  # type: ignore[misc]
    )
    parser.add_argument("num_recipes", type=int, help="Target number of recipes to select")
    parser.add_argument("out_dir", type=Path, help="Directory to place fetched recipe files in.")  # type: ignore[misc]
    args = parser.parse_args()

    # Keep the type checker happy
    exclude: Final[str] = cast(str, args.exclude)
    gh_list_file: Final[Path] = cast(Path, args.file)
    num_recipes: Final[int] = cast(int, args.num_recipes)
    out_dir: Final[Path] = cast(Path, args.out_dir)

    # Parse excluded repos
    # TODO: This list probably comes from `ls` and won't have the prefixed org name
    excluded_repos: set[str] = set()
    if exclude:
        with open(exclude, encoding="utf-8") as fd:
            for line in fd:
                excluded_repos.add(line.strip())

    # Parse the GitHub repo list
    all_repos: set[str] = set()
    with open(gh_list_file, encoding="utf-8") as fd:
        reader = csv.reader(fd, delimiter="\t", quotechar='"')
        for row in reader:
            if not row:
                continue
            all_repos.add(row[0])

    # Randomly select N valid repos
    allowed_repos: Final[set[str]] = all_repos - excluded_repos
    picked_repos: Final[set[str]] = (
        allowed_repos if num_recipes >= len(allowed_repos) else set(random.sample(sorted(allowed_repos), num_recipes))
    )

    print(f"Selected {len(picked_repos)} out of {num_recipes} requested repos...")
    print("Fetching...")

    # This method could be refined. But to be lazy and avoid authentication issues and extra dependencies, we make an
    # attempt to pull the raw files based on an assumed location.
    with mp.Pool(mp.cpu_count()) as pool:
        results = pool.starmap(fetch_repo, [(repo, out_dir) for repo in picked_repos])  # type: ignore[misc]

    unique_results: Final[set[str]] = set(results)
    if "" in unique_results:
        unique_results.remove("")
    print(f"Fetched {len(unique_results)} out of {len(picked_repos)} picked repositories...")


if __name__ == "__main__":
    main()
