From d23be088d20c7603e4f8a5a7d49374d3b760660f Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 17 Jan 2025 14:54:35 -0800 Subject: [PATCH 1/9] run od on video in parallel --- vision_agent/tools/tools.py | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 32efaa06..21e45acf 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -314,18 +314,29 @@ def _apply_object_detection( # inner method to avoid circular importing issues. # Process each segment and collect detections detections_per_segment: List[Any] = [] - for segment_index, segment in enumerate(segments): - segment_detections = process_segment( - segment_frames=segment, - od_model=od_model, - prompt=prompt, - fine_tune_id=fine_tune_id, - chunk_length=chunk_length, - image_size=image_size, - segment_index=segment_index, - object_detection_tool=_apply_object_detection, - ) - detections_per_segment.append(segment_detections) + with ThreadPoolExecutor() as executor: + futures = { + executor.submit( + process_segment, + segment_frames=segment, + od_model=od_model, + prompt=prompt, + fine_tune_id=fine_tune_id, + chunk_length=chunk_length, + image_size=image_size, + segment_index=segment_index, + object_detection_tool=_apply_object_detection, + ): segment_index + for segment_index, segment in enumerate(segments) + } + + for future in as_completed(futures): + segment_index = futures[future] + detections_per_segment.append((segment_index, future.result())) + + detections_per_segment = [ + x[1] for x in sorted(detections_per_segment, key=lambda x: x[0]) + ] merged_detections = merge_segments(detections_per_segment) post_processed = post_process(merged_detections, image_size) From ec8379318e34bcd8f70e9f55e84ad8928c497e63 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 17 Jan 2025 15:56:19 -0800 Subject: [PATCH 2/9] fix prompts for tool testing --- vision_agent/tools/planner_tools.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/vision_agent/tools/planner_tools.py b/vision_agent/tools/planner_tools.py index 7f98682a..58e42f13 100644 --- a/vision_agent/tools/planner_tools.py +++ b/vision_agent/tools/planner_tools.py @@ -10,12 +10,7 @@ from PIL import Image import vision_agent.tools as T -from vision_agent.agent.agent_utils import ( - DefaultImports, - extract_code, - extract_json, - extract_tag, -) +from vision_agent.agent.agent_utils import DefaultImports, extract_json, extract_tag from vision_agent.agent.vision_agent_planner_prompts_v2 import ( CATEGORIZE_TOOL_REQUEST, FINALIZE_PLAN, @@ -36,6 +31,9 @@ from vision_agent.utils.sim import get_tool_recommender TOOL_FUNCTIONS = {tool.__name__: tool for tool in T.TOOLS} +TOOL_LOAD_DOCS = T.get_tool_documentation( + [T.load_image, T.extract_frames_and_timestamps] +) CONFIG = Config() _LOGGER = logging.getLogger(__name__) @@ -179,6 +177,7 @@ def run_tool_testing( cleaned_tool_docs.append(tool_doc) tool_docs = cleaned_tool_docs tool_docs_str = "\n".join([e["doc"] for e in tool_docs]) + tool_docs_str += "\n" + TOOL_LOAD_DOCS # type: ignore prompt = TEST_TOOLS.format( tool_docs=tool_docs_str, @@ -217,8 +216,15 @@ def run_tool_testing( examples=EXAMPLES, media=str(image_paths), ) - code = extract_code(lmm.generate(prompt, media=image_paths)) # type: ignore - code = process_code(code) + response = cast(str, lmm.generate(prompt, media=image_paths)) + code = extract_tag(response, "code") + if code is None: + code = response + + try: + code = process_code(code) + except Exception as e: + _LOGGER.error(f"Error processing code: {e}") tool_output = code_interpreter.exec_isolation( DefaultImports.prepend_imports(code) ) From 7b0f2b4a63f6ae410b1831abf2729eda6c52903f Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 17 Jan 2025 15:56:52 -0800 Subject: [PATCH 3/9] add update messages --- vision_agent/agent/types.py | 1 + vision_agent/agent/vision_agent_planner_v2.py | 5 +++-- vision_agent/tools/__init__.py | 8 ++++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/vision_agent/agent/types.py b/vision_agent/agent/types.py index ea44afa3..f173804b 100644 --- a/vision_agent/agent/types.py +++ b/vision_agent/agent/types.py @@ -33,6 +33,7 @@ class AgentMessage(BaseModel): Literal["interaction_response"], Literal["conversation"], Literal["planner"], + Literal["planner_update"], Literal["coder"], ] content: str diff --git a/vision_agent/agent/vision_agent_planner_v2.py b/vision_agent/agent/vision_agent_planner_v2.py index db656a06..dfb4dbba 100644 --- a/vision_agent/agent/vision_agent_planner_v2.py +++ b/vision_agent/agent/vision_agent_planner_v2.py @@ -218,8 +218,8 @@ def execute_code_action( f"[bold cyan]Code Execution Output ({end - start:.2f}s):[/bold cyan] [yellow]{escape(obs)}[/yellow]" ) - count = 1 - while not execution.success and count <= 3: + count = 0 + while not execution.success and count < 3: prompt = FIX_BUG.format(chat_history=get_planning(chat), code=code, error=obs) response = cast(str, model.chat([{"role": "user", "content": prompt}])) new_code = extract_tag(response, "code") @@ -513,6 +513,7 @@ def generate_plan( code = extract_tag(response, "execute_python") finalize_plan = extract_tag(response, "finalize_plan") finished = finalize_plan is not None + self.update_callback({"role": "planner_update", "content": response}) if self.verbose: _CONSOLE.print( diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index e10178ef..69c839ba 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -23,6 +23,9 @@ TOOLS_INFO, UTIL_TOOLS, UTILITIES_DOCSTRING, + agentic_object_detection, + agentic_sam2_instance_segmentation, + agentic_sam2_video_tracking, claude35_text_extraction, closest_box_distance, closest_mask_distance, @@ -30,6 +33,7 @@ countgd_sam2_instance_segmentation, countgd_sam2_video_tracking, countgd_visual_prompt_object_detection, + custom_object_detection, depth_anything_v2, detr_segmentation, document_extraction, @@ -63,10 +67,6 @@ video_temporal_localization, vit_image_classification, vit_nsfw_classification, - custom_object_detection, - agentic_object_detection, - agentic_sam2_instance_segmentation, - agentic_sam2_video_tracking, ) __new_tools__ = [ From 1c8d83157cc356a6b65777d72de55807492ee114 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 17 Jan 2025 16:57:54 -0800 Subject: [PATCH 4/9] update default fps to 5 --- vision_agent/tools/tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 21e45acf..9b4dfbf1 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -2657,7 +2657,7 @@ def save_image(image: np.ndarray, file_path: str) -> None: def save_video( - frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 1 + frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 5 ) -> str: """'save_video' is a utility function that saves a list of frames as a mp4 video file on disk. From 32a82dca2479d99638707ad925892508ef1adcd4 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 17 Jan 2025 16:58:07 -0800 Subject: [PATCH 5/9] better variable name --- vision_agent/tools/planner_tools.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vision_agent/tools/planner_tools.py b/vision_agent/tools/planner_tools.py index 58e42f13..75cd7f9f 100644 --- a/vision_agent/tools/planner_tools.py +++ b/vision_agent/tools/planner_tools.py @@ -31,7 +31,7 @@ from vision_agent.utils.sim import get_tool_recommender TOOL_FUNCTIONS = {tool.__name__: tool for tool in T.TOOLS} -TOOL_LOAD_DOCS = T.get_tool_documentation( +LOAD_TOOLS_DOCSTRING = T.get_tool_documentation( [T.load_image, T.extract_frames_and_timestamps] ) @@ -177,7 +177,7 @@ def run_tool_testing( cleaned_tool_docs.append(tool_doc) tool_docs = cleaned_tool_docs tool_docs_str = "\n".join([e["doc"] for e in tool_docs]) - tool_docs_str += "\n" + TOOL_LOAD_DOCS # type: ignore + tool_docs_str += "\n" + LOAD_TOOLS_DOCSTRING # type: ignore prompt = TEST_TOOLS.format( tool_docs=tool_docs_str, From e12d6b76a4ec729cbf7c5ec343c1d60bba8adda3 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 17 Jan 2025 17:51:37 -0800 Subject: [PATCH 6/9] fix rounds --- vision_agent/tools/tools.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 9b4dfbf1..dd1b742e 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -222,7 +222,7 @@ def sam2( ret = _sam2(image, detections, image_size) _display_tool_trace( sam2.__name__, - {}, + {"detections": detections}, ret["display_data"], ret["files"], ) @@ -401,7 +401,7 @@ def _owlv2_object_detection( { "label": bbox["label"], "bbox": normalize_bbox(bbox["bounding_box"], image_size), - "score": bbox["score"], + "score": round(bbox["score"], 2), } for bbox in bboxes ] @@ -409,7 +409,7 @@ def _owlv2_object_detection( { "label": bbox["label"], "bbox": bbox["bounding_box"], - "score": bbox["score"], + "score": round(bbox["score"], 2), } for bbox in bboxes ] @@ -593,7 +593,7 @@ def owlv2_sam2_video_tracking( ) _display_tool_trace( owlv2_sam2_video_tracking.__name__, - {}, + {"prompt": prompt, "chunk_length": chunk_length}, ret["display_data"], ret["files"], ) @@ -2161,7 +2161,7 @@ def siglip_classification(image: np.ndarray, labels: List[str]) -> Dict[str, Any return response -# agentic od tools +# Agentic OD Tools def _agentic_object_detection( From 3b1ccb6f579e793217b939810350b90d09258b70 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 17 Jan 2025 17:51:52 -0800 Subject: [PATCH 7/9] filter double tool calls in interaction --- vision_agent/agent/vision_agent_planner_v2.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/vision_agent/agent/vision_agent_planner_v2.py b/vision_agent/agent/vision_agent_planner_v2.py index dfb4dbba..a6356ad6 100644 --- a/vision_agent/agent/vision_agent_planner_v2.py +++ b/vision_agent/agent/vision_agent_planner_v2.py @@ -270,9 +270,25 @@ def create_hil_response( except Exception: continue + # There's a chance that the same tool is called multiple times with the same inputs + # in the interaction. We want to remove duplicates to avoid redundancy by picking + # the last occurrence of the tool. + cleaned_content = [] + seen_content = set() + for c in reversed(content): + if "request" in c and "function_name" in c["request"] and "files" in c: + key = (c["request"]["function_name"], hash(c["files"])) + if key in seen_content: + continue + + seen_content.add(key) + cleaned_content.append(c) + else: + cleaned_content.append(c) + return AgentMessage( role="interaction", - content="" + json.dumps(content) + "", + content="" + json.dumps(cleaned_content) + "", media=None, ) From 6a759fd6055a019652c6357b9afd776dcc195389 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 17 Jan 2025 17:55:07 -0800 Subject: [PATCH 8/9] remove comments --- vision_agent/tools/planner_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/tools/planner_tools.py b/vision_agent/tools/planner_tools.py index 75cd7f9f..c64bac23 100644 --- a/vision_agent/tools/planner_tools.py +++ b/vision_agent/tools/planner_tools.py @@ -177,7 +177,7 @@ def run_tool_testing( cleaned_tool_docs.append(tool_doc) tool_docs = cleaned_tool_docs tool_docs_str = "\n".join([e["doc"] for e in tool_docs]) - tool_docs_str += "\n" + LOAD_TOOLS_DOCSTRING # type: ignore + tool_docs_str += "\n" + LOAD_TOOLS_DOCSTRING prompt = TEST_TOOLS.format( tool_docs=tool_docs_str, From 4039f06bb8d3d8ddd6c7faaab0a65adce2818519 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 17 Jan 2025 17:57:15 -0800 Subject: [PATCH 9/9] revert count --- vision_agent/agent/vision_agent_planner_v2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vision_agent/agent/vision_agent_planner_v2.py b/vision_agent/agent/vision_agent_planner_v2.py index a6356ad6..7c5b458f 100644 --- a/vision_agent/agent/vision_agent_planner_v2.py +++ b/vision_agent/agent/vision_agent_planner_v2.py @@ -218,8 +218,8 @@ def execute_code_action( f"[bold cyan]Code Execution Output ({end - start:.2f}s):[/bold cyan] [yellow]{escape(obs)}[/yellow]" ) - count = 0 - while not execution.success and count < 3: + count = 1 + while not execution.success and count <= 3: prompt = FIX_BUG.format(chat_history=get_planning(chat), code=code, error=obs) response = cast(str, model.chat([{"role": "user", "content": prompt}])) new_code = extract_tag(response, "code")