Merge remote-tracking branch 'origin/main' into vllm_token_ids

ServiceNow · Jan 6, 2025 · c3e39db · c3e39db
2 parents 2dabb13 + c626df1
commit c3e39db
Show file tree

Hide file tree

Showing 117 changed files with 131 additions and 56 deletions.
diff --git a/examples/form_filler/scripts/prepare_test_assets.py b/examples/form_filler/scripts/prepare_test_assets.py
@@ -1,7 +1,7 @@
 import os
 import random
-from pathlib import Path
 import sys
+from pathlib import Path
 
 import hydra
 import yaml
@@ -14,8 +14,7 @@
 from ..user_simulator_agent import UserSimulatorTape
 from .run_formfiller_agent import run_formfiller_agent
 
-assets_folder = Path(__file__).parent.parent.parent.parent / "tests" / "res" / "form_filler"
-
+assets_folder = Path(__file__).parent.parent.parent.parent / "tests" / "examples" / "res" / "form_filler"
 
 input_tapes_for_teacher_path = assets_folder / "input_tapes_for_teacher.yaml"
 input_tapes_for_user_path = assets_folder / "input_tapes_for_user.yaml"

diff --git a/pyproject.toml b/pyproject.toml
@@ -56,3 +56,36 @@ markers = [
   "gpu: requires gpu",
   "multi_gpu: requires more than 1 gpu",
 ]
+
+[tool.tox]
+min_version = "4"
+requires = ["tox-uv>=1"]
+env_list = ["type", "3.10", "3.11", "3.12", "3.13"]
+skip_missing_interpreters = false
+
+[tool.tox.env_run_base]
+description = "Run test under {base_python}"
+deps = [
+  "pytest>=8"
+]
+commands = [
+  [
+    "pytest",
+    "-s",
+    "--color=yes",
+    "-m",
+    "not slow",
+    "tests/",
+    "--ignore=tests/finetune/",
+    "--ignore=tests/examples/",
+  ],
+]
+
+[tool.tox.env.type]
+description = "run type check on code base"
+deps = [
+  "mypy==1.11.2",
+  "types-cachetools>=5.5.0.20240820",
+  "types-chardet>=5.0.4.6",
+]
+commands = [["mypy", "tapeagents"]]
diff --git a/requirements.dev.txt b/requirements.dev.txt
@@ -2,6 +2,7 @@
 pytest==8.3.3
 flake8==6.0.0
 ruff==0.6.6
+tox==4.23.2  # pyproject.toml uses native [tool.tox] TOML tables, which need tox>=4.21
 # notebook
 ipykernel==6.29.5
 testbook==0.4.2

diff --git a/requirements.finetune.txt b/requirements.finetune.txt
@@ -4,6 +4,5 @@ deepspeed==0.15.1
 numpy==1.26.4
 peft==0.12.0
 tokenizers==0.20.1
-transformers==4.45.2
 wandb==0.19.1
 vllm==0.6.1
diff --git a/tests/res/data_science/final_tape.json → ...examples/res/data_science/final_tape.json b/tests/res/data_science/final_tape.json → ...examples/res/data_science/final_tape.json
diff --git a/tests/res/data_science/tapedata.sqlite → ...examples/res/data_science/tapedata.sqlite b/tests/res/data_science/tapedata.sqlite → ...examples/res/data_science/tapedata.sqlite
diff --git a/tests/res/delegate/start_tape.json → tests/examples/res/delegate/start_tape.json b/tests/res/delegate/start_tape.json → tests/examples/res/delegate/start_tape.json
diff --git a/tests/res/delegate/tape.json → tests/examples/res/delegate/tape.json b/tests/res/delegate/tape.json → tests/examples/res/delegate/tape.json
diff --git a/tests/res/delegate/tapedata.sqlite → tests/examples/res/delegate/tapedata.sqlite b/tests/res/delegate/tapedata.sqlite → tests/examples/res/delegate/tapedata.sqlite
diff --git a/tests/res/delegate_stack/llm.json → tests/examples/res/delegate_stack/llm.json b/tests/res/delegate_stack/llm.json → tests/examples/res/delegate_stack/llm.json
diff --git a/tests/res/delegate_stack/start_tape.json → ...amples/res/delegate_stack/start_tape.json b/tests/res/delegate_stack/start_tape.json → ...amples/res/delegate_stack/start_tape.json
diff --git a/tests/res/delegate_stack/tape1.json → tests/examples/res/delegate_stack/tape1.json b/tests/res/delegate_stack/tape1.json → tests/examples/res/delegate_stack/tape1.json
diff --git a/tests/res/delegate_stack/tape2.json → tests/examples/res/delegate_stack/tape2.json b/tests/res/delegate_stack/tape2.json → tests/examples/res/delegate_stack/tape2.json
diff --git a/tests/res/delegate_stack/tapedata.sqlite → ...amples/res/delegate_stack/tapedata.sqlite b/tests/res/delegate_stack/tapedata.sqlite → ...amples/res/delegate_stack/tapedata.sqlite
diff --git a/.../form_filler/input_tapes_for_teacher.yaml → .../form_filler/input_tapes_for_teacher.yaml b/.../form_filler/input_tapes_for_teacher.yaml → .../form_filler/input_tapes_for_teacher.yaml
diff --git a/...res/form_filler/input_tapes_for_user.yaml → ...res/form_filler/input_tapes_for_user.yaml b/...res/form_filler/input_tapes_for_user.yaml → ...res/form_filler/input_tapes_for_user.yaml
diff --git a/...form_filler/output_tapes_for_teacher.yaml → ...form_filler/output_tapes_for_teacher.yaml b/...form_filler/output_tapes_for_teacher.yaml → ...form_filler/output_tapes_for_teacher.yaml
diff --git a/...es/form_filler/output_tapes_for_user.yaml → ...es/form_filler/output_tapes_for_user.yaml b/...es/form_filler/output_tapes_for_user.yaml → ...es/form_filler/output_tapes_for_user.yaml
diff --git a/tests/res/form_filler/tapedata.sqlite → .../examples/res/form_filler/tapedata.sqlite b/tests/res/form_filler/tapedata.sqlite → .../examples/res/form_filler/tapedata.sqlite
diff --git a/...orm_filler/teacher_agent_test_config.yaml → ...orm_filler/teacher_agent_test_config.yaml b/...orm_filler/teacher_agent_test_config.yaml → ...orm_filler/teacher_agent_test_config.yaml
diff --git a/...ler/user_simulator_agent_test_config.yaml → ...ler/user_simulator_agent_test_config.yaml b/...ler/user_simulator_agent_test_config.yaml → ...ler/user_simulator_agent_test_config.yaml
diff --git a/tests/res/gaia_agent/tapedata.sqlite.gz → ...xamples/res/gaia_agent/tapedata.sqlite.gz b/tests/res/gaia_agent/tapedata.sqlite.gz → ...xamples/res/gaia_agent/tapedata.sqlite.gz
diff --git a/tests/res/gaia_agent/tapes/l1_task002.json → ...ples/res/gaia_agent/tapes/l1_task002.json b/tests/res/gaia_agent/tapes/l1_task002.json → ...ples/res/gaia_agent/tapes/l1_task002.json
diff --git a/tests/res/gaia_agent/tapes/l1_task003.json → ...ples/res/gaia_agent/tapes/l1_task003.json b/tests/res/gaia_agent/tapes/l1_task003.json → ...ples/res/gaia_agent/tapes/l1_task003.json
diff --git a/tests/res/gaia_agent/tapes/l1_task004.json → ...ples/res/gaia_agent/tapes/l1_task004.json b/tests/res/gaia_agent/tapes/l1_task004.json → ...ples/res/gaia_agent/tapes/l1_task004.json
diff --git a/tests/res/gaia_agent/tapes/l1_task006.json → ...ples/res/gaia_agent/tapes/l1_task006.json b/tests/res/gaia_agent/tapes/l1_task006.json → ...ples/res/gaia_agent/tapes/l1_task006.json
diff --git a/tests/res/gaia_agent/tapes/l1_task007.json → ...ples/res/gaia_agent/tapes/l1_task007.json b/tests/res/gaia_agent/tapes/l1_task007.json → ...ples/res/gaia_agent/tapes/l1_task007.json
diff --git a/tests/res/gaia_agent/tapes/l1_task008.json → ...ples/res/gaia_agent/tapes/l1_task008.json b/tests/res/gaia_agent/tapes/l1_task008.json → ...ples/res/gaia_agent/tapes/l1_task008.json
diff --git a/tests/res/gaia_agent/tapes/l1_task009.json → ...ples/res/gaia_agent/tapes/l1_task009.json b/tests/res/gaia_agent/tapes/l1_task009.json → ...ples/res/gaia_agent/tapes/l1_task009.json
diff --git a/tests/res/gaia_agent/tapes/l1_task010.json → ...ples/res/gaia_agent/tapes/l1_task010.json b/tests/res/gaia_agent/tapes/l1_task010.json → ...ples/res/gaia_agent/tapes/l1_task010.json
diff --git a/tests/res/gaia_agent/tapes/l1_task011.json → ...ples/res/gaia_agent/tapes/l1_task011.json b/tests/res/gaia_agent/tapes/l1_task011.json → ...ples/res/gaia_agent/tapes/l1_task011.json
diff --git a/tests/res/gaia_agent/tapes/l1_task012.json → ...ples/res/gaia_agent/tapes/l1_task012.json b/tests/res/gaia_agent/tapes/l1_task012.json → ...ples/res/gaia_agent/tapes/l1_task012.json
diff --git a/tests/res/gaia_agent/tapes/l1_task014.json → ...ples/res/gaia_agent/tapes/l1_task014.json b/tests/res/gaia_agent/tapes/l1_task014.json → ...ples/res/gaia_agent/tapes/l1_task014.json
diff --git a/tests/res/gaia_agent/tapes/l1_task015.json → ...ples/res/gaia_agent/tapes/l1_task015.json b/tests/res/gaia_agent/tapes/l1_task015.json → ...ples/res/gaia_agent/tapes/l1_task015.json
diff --git a/tests/res/gaia_agent/tapes/l1_task017.json → ...ples/res/gaia_agent/tapes/l1_task017.json b/tests/res/gaia_agent/tapes/l1_task017.json → ...ples/res/gaia_agent/tapes/l1_task017.json
diff --git a/tests/res/gaia_agent/tapes/l1_task018.json → ...ples/res/gaia_agent/tapes/l1_task018.json b/tests/res/gaia_agent/tapes/l1_task018.json → ...ples/res/gaia_agent/tapes/l1_task018.json
diff --git a/tests/res/gaia_agent/tapes/l1_task019.json → ...ples/res/gaia_agent/tapes/l1_task019.json b/tests/res/gaia_agent/tapes/l1_task019.json → ...ples/res/gaia_agent/tapes/l1_task019.json
diff --git a/tests/res/gaia_agent/tapes/l1_task020.json → ...ples/res/gaia_agent/tapes/l1_task020.json b/tests/res/gaia_agent/tapes/l1_task020.json → ...ples/res/gaia_agent/tapes/l1_task020.json
diff --git a/tests/res/gaia_agent/tapes/l1_task022.json → ...ples/res/gaia_agent/tapes/l1_task022.json b/tests/res/gaia_agent/tapes/l1_task022.json → ...ples/res/gaia_agent/tapes/l1_task022.json
diff --git a/tests/res/gaia_agent/tapes/l1_task023.json → ...ples/res/gaia_agent/tapes/l1_task023.json b/tests/res/gaia_agent/tapes/l1_task023.json → ...ples/res/gaia_agent/tapes/l1_task023.json
diff --git a/tests/res/gaia_agent/tapes/l1_task024.json → ...ples/res/gaia_agent/tapes/l1_task024.json b/tests/res/gaia_agent/tapes/l1_task024.json → ...ples/res/gaia_agent/tapes/l1_task024.json
diff --git a/tests/res/gaia_agent/tapes/l1_task025.json → ...ples/res/gaia_agent/tapes/l1_task025.json b/tests/res/gaia_agent/tapes/l1_task025.json → ...ples/res/gaia_agent/tapes/l1_task025.json
diff --git a/tests/res/gaia_agent/tapes/l1_task026.json → ...ples/res/gaia_agent/tapes/l1_task026.json b/tests/res/gaia_agent/tapes/l1_task026.json → ...ples/res/gaia_agent/tapes/l1_task026.json
diff --git a/tests/res/gaia_agent/tapes/l1_task027.json → ...ples/res/gaia_agent/tapes/l1_task027.json b/tests/res/gaia_agent/tapes/l1_task027.json → ...ples/res/gaia_agent/tapes/l1_task027.json
diff --git a/tests/res/gaia_agent/tapes/l1_task029.json → ...ples/res/gaia_agent/tapes/l1_task029.json b/tests/res/gaia_agent/tapes/l1_task029.json → ...ples/res/gaia_agent/tapes/l1_task029.json
diff --git a/tests/res/gaia_agent/tapes/l1_task030.json → ...ples/res/gaia_agent/tapes/l1_task030.json b/tests/res/gaia_agent/tapes/l1_task030.json → ...ples/res/gaia_agent/tapes/l1_task030.json
diff --git a/tests/res/gaia_agent/tapes/l1_task032.json → ...ples/res/gaia_agent/tapes/l1_task032.json b/tests/res/gaia_agent/tapes/l1_task032.json → ...ples/res/gaia_agent/tapes/l1_task032.json
diff --git a/tests/res/gaia_agent/tapes/l1_task033.json → ...ples/res/gaia_agent/tapes/l1_task033.json b/tests/res/gaia_agent/tapes/l1_task033.json → ...ples/res/gaia_agent/tapes/l1_task033.json
diff --git a/tests/res/gaia_agent/tapes/l1_task034.json → ...ples/res/gaia_agent/tapes/l1_task034.json b/tests/res/gaia_agent/tapes/l1_task034.json → ...ples/res/gaia_agent/tapes/l1_task034.json
diff --git a/tests/res/gaia_agent/tapes/l1_task035.json → ...ples/res/gaia_agent/tapes/l1_task035.json b/tests/res/gaia_agent/tapes/l1_task035.json → ...ples/res/gaia_agent/tapes/l1_task035.json
diff --git a/tests/res/gaia_agent/tapes/l1_task036.json → ...ples/res/gaia_agent/tapes/l1_task036.json b/tests/res/gaia_agent/tapes/l1_task036.json → ...ples/res/gaia_agent/tapes/l1_task036.json
diff --git a/tests/res/gaia_agent/tapes/l1_task037.json → ...ples/res/gaia_agent/tapes/l1_task037.json b/tests/res/gaia_agent/tapes/l1_task037.json → ...ples/res/gaia_agent/tapes/l1_task037.json
diff --git a/tests/res/gaia_agent/tapes/l1_task040.json → ...ples/res/gaia_agent/tapes/l1_task040.json b/tests/res/gaia_agent/tapes/l1_task040.json → ...ples/res/gaia_agent/tapes/l1_task040.json
diff --git a/tests/res/gaia_agent/tapes/l1_task041.json → ...ples/res/gaia_agent/tapes/l1_task041.json b/tests/res/gaia_agent/tapes/l1_task041.json → ...ples/res/gaia_agent/tapes/l1_task041.json
diff --git a/tests/res/gaia_agent/tapes/l1_task043.json → ...ples/res/gaia_agent/tapes/l1_task043.json b/tests/res/gaia_agent/tapes/l1_task043.json → ...ples/res/gaia_agent/tapes/l1_task043.json
diff --git a/tests/res/gaia_agent/tapes/l1_task044.json → ...ples/res/gaia_agent/tapes/l1_task044.json b/tests/res/gaia_agent/tapes/l1_task044.json → ...ples/res/gaia_agent/tapes/l1_task044.json
diff --git a/tests/res/gaia_agent/tapes/l1_task045.json → ...ples/res/gaia_agent/tapes/l1_task045.json b/tests/res/gaia_agent/tapes/l1_task045.json → ...ples/res/gaia_agent/tapes/l1_task045.json
diff --git a/tests/res/gaia_agent/tapes/l1_task047.json → ...ples/res/gaia_agent/tapes/l1_task047.json b/tests/res/gaia_agent/tapes/l1_task047.json → ...ples/res/gaia_agent/tapes/l1_task047.json
diff --git a/tests/res/gaia_agent/tapes/l1_task048.json → ...ples/res/gaia_agent/tapes/l1_task048.json b/tests/res/gaia_agent/tapes/l1_task048.json → ...ples/res/gaia_agent/tapes/l1_task048.json
diff --git a/tests/res/gaia_agent/tapes/l1_task050.json → ...ples/res/gaia_agent/tapes/l1_task050.json b/tests/res/gaia_agent/tapes/l1_task050.json → ...ples/res/gaia_agent/tapes/l1_task050.json
diff --git a/tests/res/gaia_agent/tapes/l1_task051.json → ...ples/res/gaia_agent/tapes/l1_task051.json b/tests/res/gaia_agent/tapes/l1_task051.json → ...ples/res/gaia_agent/tapes/l1_task051.json
diff --git a/tests/res/gaia_agent/tapes/l1_task052.json → ...ples/res/gaia_agent/tapes/l1_task052.json b/tests/res/gaia_agent/tapes/l1_task052.json → ...ples/res/gaia_agent/tapes/l1_task052.json
diff --git a/tests/res/gaia_agent/web_cache.jsonl → ...s/examples/res/gaia_agent/web_cache.jsonl b/tests/res/gaia_agent/web_cache.jsonl → ...s/examples/res/gaia_agent/web_cache.jsonl
diff --git a/tests/res/intro_notebook/failed_tape.json → ...mples/res/intro_notebook/failed_tape.json b/tests/res/intro_notebook/failed_tape.json → ...mples/res/intro_notebook/failed_tape.json
diff --git a/tests/res/intro_notebook/tapedata.sqlite → ...amples/res/intro_notebook/tapedata.sqlite b/tests/res/intro_notebook/tapedata.sqlite → ...amples/res/intro_notebook/tapedata.sqlite
diff --git a/tests/res/intro_notebook/tool_cache.jsonl → ...mples/res/intro_notebook/tool_cache.jsonl b/tests/res/intro_notebook/tool_cache.jsonl → ...mples/res/intro_notebook/tool_cache.jsonl
diff --git a/tests/res/intro_notebook/web_cache.jsonl → ...amples/res/intro_notebook/web_cache.jsonl b/tests/res/intro_notebook/web_cache.jsonl → ...amples/res/intro_notebook/web_cache.jsonl
diff --git a/tests/res/llama_agent/start_tape.json → .../examples/res/llama_agent/start_tape.json b/tests/res/llama_agent/start_tape.json → .../examples/res/llama_agent/start_tape.json
diff --git a/tests/res/llama_agent/tape.json → tests/examples/res/llama_agent/tape.json b/tests/res/llama_agent/tape.json → tests/examples/res/llama_agent/tape.json
diff --git a/tests/res/llama_agent/tapedata.sqlite → .../examples/res/llama_agent/tapedata.sqlite b/tests/res/llama_agent/tapedata.sqlite → .../examples/res/llama_agent/tapedata.sqlite
diff --git a/tests/res/llama_agent/traces.json → tests/examples/res/llama_agent/traces.json b/tests/res/llama_agent/traces.json → tests/examples/res/llama_agent/traces.json
diff --git a/tests/res/optimize/config.yaml → tests/examples/res/optimize/config.yaml b/tests/res/optimize/config.yaml → tests/examples/res/optimize/config.yaml
diff --git a/...bo_9a4540619ef903566aa0a958d20e8463.jsonl → ...bo_9a4540619ef903566aa0a958d20e8463.jsonl b/...bo_9a4540619ef903566aa0a958d20e8463.jsonl → ...bo_9a4540619ef903566aa0a958d20e8463.jsonl
diff --git a/tests/res/optimize/tape.json → tests/examples/res/optimize/tape.json b/tests/res/optimize/tape.json → tests/examples/res/optimize/tape.json
diff --git a/tests/res/tape_improver/final_tape.json → ...xamples/res/tape_improver/final_tape.json b/tests/res/tape_improver/final_tape.json → ...xamples/res/tape_improver/final_tape.json
diff --git a/tests/res/tape_improver/tapedata.sqlite → ...xamples/res/tape_improver/tapedata.sqlite b/tests/res/tape_improver/tapedata.sqlite → ...xamples/res/tape_improver/tapedata.sqlite
diff --git a/tests/res/workarena/guided/tapedata.sqlite → ...ples/res/workarena/guided/tapedata.sqlite b/tests/res/workarena/guided/tapedata.sqlite → ...ples/res/workarena/guided/tapedata.sqlite
diff --git a/...icenow.multi-chart-min-max-retrieval.json → ...icenow.multi-chart-min-max-retrieval.json b/...icenow.multi-chart-min-max-retrieval.json → ...icenow.multi-chart-min-max-retrieval.json
diff --git a/..._workarena.servicenow.order-ipad-pro.json → ..._workarena.servicenow.order-ipad-pro.json b/..._workarena.servicenow.order-ipad-pro.json → ..._workarena.servicenow.order-ipad-pro.json
diff --git a/...workarena.servicenow.create-incident.json → ...workarena.servicenow.create-incident.json b/...workarena.servicenow.create-incident.json → ...workarena.servicenow.create-incident.json
diff --git a/...ena.servicenow.knowledge-base-search.json → ...ena.servicenow.knowledge-base-search.json b/...ena.servicenow.knowledge-base-search.json → ...ena.servicenow.knowledge-base-search.json
diff --git a/tests/test_examples.py → tests/examples/test_examples.py b/tests/test_examples.py → tests/examples/test_examples.py
@@ -8,20 +8,23 @@
 import tempfile
 from pathlib import Path
 
-import transformers
 import yaml
 from omegaconf import DictConfig
 
-from tapeagents.finetune.data import load_samples
 from tapeagents.io import load_tapes
 from tests.make_test_data import run_test_in_tmp_dir
 
-sys.path.append(str(Path(__file__).parent.parent.resolve()))  # allow to import from examples
+sys.path.append(str(Path(__file__).parent.parent.parent.resolve()))  # allow to import from examples
 
 from examples.data_science import data_science
 from examples.delegate import ExampleTape, FindIrregularVerbs
-from examples.delegate_stack import ExampleTape as ExampleTapeStack
-from examples.delegate_stack import Linguist, make_analyze_text_chain
+from examples.delegate_stack import (
+    ExampleTape as ExampleTapeStack,
+)
+from examples.delegate_stack import (
+    Linguist,
+    make_analyze_text_chain,
+)
 from examples.form_filler.environment import FormFillerEnvironment
 from examples.form_filler.scripts.prepare_test_assets import (
     get_teacher_agent,
@@ -34,16 +37,13 @@
 from examples.gaia_agent.agent import GaiaAgent
 from examples.gaia_agent.environment import GaiaEnvironment
 from examples.gaia_agent.tape import GaiaTape
-from examples.gsm8k_tuning.finetune_student import get_training_samples_from_tapes
-from examples.gsm8k_tuning.math_agent import MathAgent, MathTape
 from examples.llama_agent import LLAMAChatBot
 from examples.optimize.optimize import make_agentic_rag_agent, make_env
-from examples.rl_gsm8k.orchestrate_rl import CoTMathAgent, RLMathTape, extract_tape_training_samples
 from examples.tape_improver import tape_improver
 from examples.workarena.agent import WorkArenaAgent
 from examples.workarena.steps import WorkArenaTape
 from tapeagents.config import DB_DEFAULT_FILENAME
-from tapeagents.core import AgentStep, LLMCall, TrainingText
+from tapeagents.core import AgentStep, TrainingText
 from tapeagents.dialog_tape import DialogTape
 from tapeagents.environment import EmptyEnvironment
 from tapeagents.llms import LLM, ReplayLLM, TrainableLLM
@@ -224,7 +224,7 @@ def test_form_filler():
     os.environ["TAPEAGENTS_MOCK_DATE"] = "2024-12-09"
     assets_dir = str(Path(__file__).parent / "res" / "form_filler")
     forms_path = str(
-        Path(__file__).parent.parent / "examples" / "form_filler" / "assets" / "forms" / "train" / "FlyCorp"
+        Path(__file__).parent.parent.parent / "examples" / "form_filler" / "assets" / "forms" / "train" / "FlyCorp"
     )
     env = FormFillerEnvironment.from_spec(forms_path)
 
@@ -273,7 +273,7 @@ def test_tape_improver():
 
 
 def test_optimize():
-    with run_test_in_tmp_dir("optimize"):
+    with run_test_in_tmp_dir("tests/examples/res/optimize"):
         with open("config.yaml") as f:
             cfg = DictConfig(yaml.safe_load(f))
         agent = make_agentic_rag_agent(cfg)
@@ -283,41 +283,6 @@ def test_optimize():
         assert replay_success, "Failed to replay tape"
 
 
-def test_gsm8k_tuning_tapes_generation():
-    run_dir = f"{res_path}/gsm8k_tuning"
-    llm = mock_llm(run_dir)
-    agent = MathAgent.create(llm)
-    tapes = load_tapes(MathTape, os.path.join(run_dir, "tapes"), file_extension=".json")
-    logger.info(f"Validate {len(tapes)} tapes")
-    fails = replay_tapes(agent, tapes, reuse_observations=True)
-    assert fails == 0, f"{fails} failed tapes"
-
-
-def test_gsm8k_tuning_samples_prep():
-    run_dir = f"{res_path}/gsm8k_tuning"
-    training_samples = load_samples(f"{run_dir}/training_samples.jsonl")
-    new_training_samples = get_training_samples_from_tapes(f"{run_dir}/tapes/")
-    assert training_samples == new_training_samples
-
-
-def test_rl_gsm8k_data():
-    run_dir = f"{res_path}/rl_gsm8k"
-    tapes = load_tapes(RLMathTape, run_dir, file_extension=".json")
-    llm = mock_llm(run_dir)
-    llm.tokenizer = transformers.AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
-    agent = CoTMathAgent.create(llm)
-    cfg = DictConfig({"dataset_name": "math", "finetune": {"seq_length": 1024}})
-    training_samples = []
-    for tape in tapes:
-        for step in tape:
-            if llm_call_data := step.metadata.other.get("llm_call"):
-                step.metadata.other["llm_call"] = LLMCall(**llm_call_data)
-        _, training_sample, _ = extract_tape_training_samples(tape, agent, "train", cfg)
-        training_samples.append(training_sample[0])
-    new_training_samples = load_samples(f"{run_dir}/training_samples.jsonl")
-    assert training_samples == new_training_samples
-
-
 if __name__ == "__main__":
     test_llama_agent()
     test_llama_agent_traces()
@@ -329,6 +294,3 @@ def test_rl_gsm8k_data():
     test_data_science()
     test_form_filler()
     test_tape_improver()
-    test_gsm8k_tuning_tapes_generation()
-    test_gsm8k_tuning_samples_prep()
-    test_rl_gsm8k_data()
diff --git a/tests/test_notebooks.py → tests/examples/test_notebooks.py b/tests/test_notebooks.py → tests/examples/test_notebooks.py
@@ -3,7 +3,8 @@
 from pathlib import Path
 
 import testbook
-from make_test_data import run_test_in_tmp_dir
+
+from tests.make_test_data import run_test_in_tmp_dir
 
 res_dir = f"{pathlib.Path(__file__).parent.resolve()}/res"
 
@@ -12,7 +13,7 @@ def test_intro_notebook():
     intro_notebook_path = Path("intro.ipynb").resolve()
     assets_path = Path("assets").resolve()
     with testbook.testbook(intro_notebook_path) as tb:
-        with run_test_in_tmp_dir("intro_notebook") as test_data_dir:
+        with run_test_in_tmp_dir("tests/examples/res/intro_notebook") as test_data_dir:
             shutil.copytree(assets_path, Path("assets"))
             sqlite_path = Path(test_data_dir) / "tapedata.sqlite"
             tb.inject(
@@ -31,5 +32,6 @@ def test_intro_notebook():
             )
             tb.execute()
 
+
 if __name__ == "__main__":
     test_intro_notebook()
diff --git a/.../res/conf/deepspeed/accelerate_local.yaml → .../res/conf/deepspeed/accelerate_local.yaml b/.../res/conf/deepspeed/accelerate_local.yaml → .../res/conf/deepspeed/accelerate_local.yaml
diff --git a/tests/res/conf/finetune/data/test.yaml → ...finetune/res/conf/finetune/data/test.yaml b/tests/res/conf/finetune/data/test.yaml → ...finetune/res/conf/finetune/data/test.yaml
diff --git a/tests/res/conf/finetune/test.yaml → tests/finetune/res/conf/finetune/test.yaml b/tests/res/conf/finetune/test.yaml → tests/finetune/res/conf/finetune/test.yaml
diff --git a/tests/res/conf/job/stub.yaml → tests/finetune/res/conf/job/stub.yaml b/tests/res/conf/job/stub.yaml → tests/finetune/res/conf/job/stub.yaml
diff --git a/tests/res/finetune/data/dev/data.jsonl → ...finetune/res/finetune/data/dev/data.jsonl b/tests/res/finetune/data/dev/data.jsonl → ...finetune/res/finetune/data/dev/data.jsonl
diff --git a/tests/res/finetune/data/train/data.jsonl → ...netune/res/finetune/data/train/data.jsonl b/tests/res/finetune/data/train/data.jsonl → ...netune/res/finetune/data/train/data.jsonl
diff --git a/tests/res/finetune/results/bf16.json → ...s/finetune/res/finetune/results/bf16.json b/tests/res/finetune/results/bf16.json → ...s/finetune/res/finetune/results/bf16.json
diff --git a/tests/res/finetune/results/deepspeed.json → ...etune/res/finetune/results/deepspeed.json b/tests/res/finetune/results/deepspeed.json → ...etune/res/finetune/results/deepspeed.json
diff --git a/tests/res/finetune/results/fp16.json → ...s/finetune/res/finetune/results/fp16.json b/tests/res/finetune/results/fp16.json → ...s/finetune/res/finetune/results/fp16.json
diff --git a/...etune/results/gradient_checkpointing.json → ...etune/results/gradient_checkpointing.json b/...etune/results/gradient_checkpointing.json → ...etune/results/gradient_checkpointing.json
diff --git a/.../res/finetune/results/half_precision.json → .../res/finetune/results/half_precision.json b/.../res/finetune/results/half_precision.json → .../res/finetune/results/half_precision.json
diff --git a/tests/res/finetune/results/lora.json → ...s/finetune/res/finetune/results/lora.json b/tests/res/finetune/results/lora.json → ...s/finetune/res/finetune/results/lora.json
diff --git a/tests/res/finetune/results/multi_gpu.json → ...etune/res/finetune/results/multi_gpu.json b/tests/res/finetune/results/multi_gpu.json → ...etune/res/finetune/results/multi_gpu.json
diff --git a/tests/res/finetune/results/regular.json → ...inetune/res/finetune/results/regular.json b/tests/res/finetune/results/regular.json → ...inetune/res/finetune/results/regular.json
diff --git a/tests/res/finetune/results/resumption.json → ...tune/res/finetune/results/resumption.json b/tests/res/finetune/results/resumption.json → ...tune/res/finetune/results/resumption.json
diff --git a/tests/res/gsm8k_tuning/tapedata.sqlite → ...finetune/res/gsm8k_tuning/tapedata.sqlite b/tests/res/gsm8k_tuning/tapedata.sqlite → ...finetune/res/gsm8k_tuning/tapedata.sqlite
diff --git a/...es/gsm8k_tuning/tapes/task0_attempt1.json → ...es/gsm8k_tuning/tapes/task0_attempt1.json b/...es/gsm8k_tuning/tapes/task0_attempt1.json → ...es/gsm8k_tuning/tapes/task0_attempt1.json
diff --git a/...es/gsm8k_tuning/tapes/task1_attempt1.json → ...es/gsm8k_tuning/tapes/task1_attempt1.json b/...es/gsm8k_tuning/tapes/task1_attempt1.json → ...es/gsm8k_tuning/tapes/task1_attempt1.json
diff --git a/...es/gsm8k_tuning/tapes/task2_attempt1.json → ...es/gsm8k_tuning/tapes/task2_attempt1.json b/...es/gsm8k_tuning/tapes/task2_attempt1.json → ...es/gsm8k_tuning/tapes/task2_attempt1.json
diff --git a/...es/gsm8k_tuning/tapes/task3_attempt1.json → ...es/gsm8k_tuning/tapes/task3_attempt1.json b/...es/gsm8k_tuning/tapes/task3_attempt1.json → ...es/gsm8k_tuning/tapes/task3_attempt1.json
diff --git a/...s/res/gsm8k_tuning/training_samples.jsonl → ...e/res/gsm8k_tuning/training_samples.jsonl b/...s/res/gsm8k_tuning/training_samples.jsonl → ...e/res/gsm8k_tuning/training_samples.jsonl
diff --git a/tests/res/rl_gsm8k/tapedata.sqlite → tests/finetune/res/rl_gsm8k/tapedata.sqlite b/tests/res/rl_gsm8k/tapedata.sqlite → tests/finetune/res/rl_gsm8k/tapedata.sqlite
diff --git a/tests/res/rl_gsm8k/tapes.json → tests/finetune/res/rl_gsm8k/tapes.json b/tests/res/rl_gsm8k/tapes.json → tests/finetune/res/rl_gsm8k/tapes.json
diff --git a/tests/res/rl_gsm8k/training_samples.jsonl → ...etune/res/rl_gsm8k/training_samples.jsonl b/tests/res/rl_gsm8k/training_samples.jsonl → ...etune/res/rl_gsm8k/training_samples.jsonl
diff --git a/tests/res/tokenizer/starcoder/merges.txt → ...netune/res/tokenizer/starcoder/merges.txt b/tests/res/tokenizer/starcoder/merges.txt → ...netune/res/tokenizer/starcoder/merges.txt
diff --git a/...kenizer/starcoder/special_tokens_map.json → ...kenizer/starcoder/special_tokens_map.json b/...kenizer/starcoder/special_tokens_map.json → ...kenizer/starcoder/special_tokens_map.json
diff --git a/tests/res/tokenizer/starcoder/tokenizer.json → ...ne/res/tokenizer/starcoder/tokenizer.json b/tests/res/tokenizer/starcoder/tokenizer.json → ...ne/res/tokenizer/starcoder/tokenizer.json
diff --git a/...tokenizer/starcoder/tokenizer_config.json → ...tokenizer/starcoder/tokenizer_config.json b/...tokenizer/starcoder/tokenizer_config.json → ...tokenizer/starcoder/tokenizer_config.json
diff --git a/tests/res/tokenizer/starcoder/vocab.json → ...netune/res/tokenizer/starcoder/vocab.json b/tests/res/tokenizer/starcoder/vocab.json → ...netune/res/tokenizer/starcoder/vocab.json
diff --git a/tests/run_finetune.py → tests/finetune/run_finetune.py b/tests/run_finetune.py → tests/finetune/run_finetune.py
diff --git a/tests/test_dataloaders.py → tests/finetune/test_dataloaders.py b/tests/test_dataloaders.py → tests/finetune/test_dataloaders.py
diff --git a/tests/finetune/test_example.py b/tests/finetune/test_example.py
@@ -0,0 +1,79 @@
+import logging
+import os
+import sys
+from pathlib import Path
+
+import transformers
+from omegaconf import DictConfig
+
+from tapeagents.io import load_tapes
+
+sys.path.append(str(Path(__file__).parent.parent.parent.resolve()))  # repo root (tests/finetune/ -> tests/ -> root): allow to import from examples
+
+from examples.gsm8k_tuning.finetune_student import get_training_samples_from_tapes
+from examples.gsm8k_tuning.math_agent import MathAgent, MathTape
+from examples.rl_gsm8k.orchestrate_rl import (
+    CoTMathAgent,
+    RLMathTape,
+    extract_tape_training_samples,
+)
+from tapeagents.core import LLMCall
+from tapeagents.finetune.data import load_samples
+from tapeagents.llms import LLM, ReplayLLM, TrainableLLM
+from tapeagents.orchestrator import replay_tapes
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+res_path = Path(__file__).parent.resolve() / "res"
+
+
+def mock_llm(run_dir: str) -> LLM:
+    """Return a ReplayLLM derived from a fixed TrainableLLM configuration.
+
+    Args:
+        run_dir: Directory handed to ``ReplayLLM.from_llm`` (presumably where the
+            recorded LLM calls live -- confirm against ``ReplayLLM``).
+
+    Returns:
+        The object produced by ``ReplayLLM.from_llm`` for the configured model.
+    """
+    sampling_parameters = {"temperature": 0.7, "max_tokens": 512}
+    base_llm = TrainableLLM(
+        base_url="https://api.together.xyz",
+        model_name="meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
+        tokenizer_name="meta-llama/Meta-Llama-3-70B-Instruct",
+        parameters=sampling_parameters,
+    )
+    replay_llm = ReplayLLM.from_llm(base_llm, run_dir)
+    return replay_llm
+
+
+def test_gsm8k_tuning_tapes_generation():
+    """Replay the stored gsm8k_tuning tapes with a MathAgent and require zero failures."""
+    data_dir = f"{res_path}/gsm8k_tuning"
+    # Agent is built on the replay LLM returned by mock_llm for this data directory.
+    agent = MathAgent.create(mock_llm(data_dir))
+    tapes_dir = os.path.join(data_dir, "tapes")
+    tapes = load_tapes(MathTape, tapes_dir, file_extension=".json")
+    logger.info(f"Validate {len(tapes)} tapes")
+    failed = replay_tapes(agent, tapes, reuse_observations=True)
+    assert failed == 0, f"{failed} failed tapes"
+
+
+def test_gsm8k_tuning_samples_prep():
+    """Check that samples rebuilt from the tapes equal the stored training_samples.jsonl."""
+    data_dir = f"{res_path}/gsm8k_tuning"
+    stored_samples = load_samples(f"{data_dir}/training_samples.jsonl")
+    rebuilt_samples = get_training_samples_from_tapes(f"{data_dir}/tapes/")
+    assert stored_samples == rebuilt_samples
+
+
+def test_rl_gsm8k_data():
+    """Re-extract training samples from the rl_gsm8k tapes and compare with the stored ones."""
+    data_dir = f"{res_path}/rl_gsm8k"
+    tapes = load_tapes(RLMathTape, data_dir, file_extension=".json")
+    llm = mock_llm(data_dir)
+    # The tokenizer on the replay LLM is replaced before the agent is created.
+    llm.tokenizer = transformers.AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
+    agent = CoTMathAgent.create(llm)
+    cfg = DictConfig({"dataset_name": "math", "finetune": {"seq_length": 1024}})
+    extracted_samples = []
+    for tape in tapes:
+        for step in tape:
+            # Turn a truthy stored "llm_call" mapping into an LLMCall object in place.
+            raw_call = step.metadata.other.get("llm_call")
+            if raw_call:
+                step.metadata.other["llm_call"] = LLMCall(**raw_call)
+        _, tape_samples, _ = extract_tape_training_samples(tape, agent, "train", cfg)
+        # Only the first sample extracted from each tape takes part in the comparison.
+        extracted_samples.append(tape_samples[0])
+    stored_samples = load_samples(f"{data_dir}/training_samples.jsonl")
+    assert extracted_samples == stored_samples
+
+
+if __name__ == "__main__":
+    # Script entry point: run every check in this module in definition order.
+    for check in (
+        test_gsm8k_tuning_tapes_generation,
+        test_gsm8k_tuning_samples_prep,
+        test_rl_gsm8k_data,
+    ):
+        check()
diff --git a/tests/test_finetune.py → tests/finetune/test_finetune.py b/tests/test_finetune.py → tests/finetune/test_finetune.py
diff --git a/tests/make_test_data.py b/tests/make_test_data.py
@@ -18,7 +18,7 @@ def run_test_in_tmp_dir(test_name: str):
     """Copy test resources to a temporary directory and run the test there"""
     cur_dir = os.getcwd()
     tmpdir = tempfile.mkdtemp()
-    test_data_dir = Path(f"tests/res/{test_name}").resolve()
+    test_data_dir = Path(f"{test_name}").resolve()
     os.chdir(tmpdir)
     shutil.copytree(test_data_dir, tmpdir, dirs_exist_ok=True)
     # force creation of SQLite tables