Update Conversation v2 #342

Open · wants to merge 23 commits into main
507 changes: 49 additions & 458 deletions docs/index.md

Large diffs are not rendered by default.

469 changes: 469 additions & 0 deletions docs/index_old.md

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion mkdocs.yml
@@ -72,9 +72,10 @@ markdown_extensions:
- pymdownx.details

nav:
- Quick start: index.md
- VisionAgent: index.md
- APIs:
- vision_agent.agent: api/agent.md
- vision_agent.tools: api/tools.md
- vision_agent.lmm: api/lmm.md
- vision_agent.utils: api/utils.md
- VisionAgent (Old): index_old.md
9 changes: 5 additions & 4 deletions vision_agent/agent/agent_utils.py
@@ -157,10 +157,11 @@ def format_conversation(chat: List[AgentMessage]) -> str:
chat = copy.deepcopy(chat)
prompt = ""
for chat_i in chat:
if chat_i.role == "user":
prompt += f"USER: {chat_i.content}\n\n"
elif chat_i.role == "observation" or chat_i.role == "coder":
prompt += f"OBSERVATION: {chat_i.content}\n\n"
if chat_i.role == "user" or chat_i.role == "coder":
if "<final_code>" in chat_i.role:
prompt += f"OBSERVATION: {chat_i.content}\n\n"
elif chat_i.role == "user":
prompt += f"USER: {chat_i.content}\n\n"
elif chat_i.role == "conversation":
prompt += f"AGENT: {chat_i.content}\n\n"
return prompt
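For readers skimming the hunk above, here is a minimal runnable sketch of the revised role handling. The `AgentMessage` stand-in below is hypothetical; only the `role` and `content` attributes of the real dataclass are assumed.

```python
# Minimal sketch of the new branching; AgentMessage here is a stand-in.
from dataclasses import dataclass
from typing import List


@dataclass
class AgentMessage:  # hypothetical substitute for vision_agent's AgentMessage
    role: str
    content: str


def format_conversation(chat: List[AgentMessage]) -> str:
    prompt = ""
    for chat_i in chat:
        if chat_i.role in ("user", "coder"):
            if "<final_code>" in chat_i.content:
                # coder messages carrying final code are surfaced as observations
                prompt += f"OBSERVATION: {chat_i.content}\n\n"
            elif chat_i.role == "user":
                prompt += f"USER: {chat_i.content}\n\n"
        elif chat_i.role == "conversation":
            prompt += f"AGENT: {chat_i.content}\n\n"
    return prompt


print(format_conversation([
    AgentMessage("user", "Count the dogs in dogs.jpg"),
    AgentMessage("conversation", "Sure, I will generate the code."),
]))
```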
7 changes: 3 additions & 4 deletions vision_agent/agent/vision_agent.py
@@ -291,10 +291,9 @@ def __init__(
verbosity (int): The verbosity level of the agent.
callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback
function to send intermediate update messages.
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values
it can be one of: None, "local" or "e2b". If None, it will read from
the environment variable "CODE_SANDBOX_RUNTIME". If a CodeInterpreter
object is provided it will use that.
code_sandbox_runtime (Optional[str]): Can be one of None, "local" or "e2b".
If None, it will read from the environment variable
"CODE_SANDBOX_RUNTIME".
"""

self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent
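To make the renamed parameter concrete, here is a brief usage sketch. The fallback to the environment variable comes from the docstring above; the import path and everything else shown are assumptions rather than part of this diff.

```python
# Sketch only; assumes VisionAgent is importable from vision_agent.agent and
# accepts code_sandbox_runtime as documented above.
import os

from vision_agent.agent import VisionAgent

# With None, the runtime is read from the CODE_SANDBOX_RUNTIME env variable.
os.environ.setdefault("CODE_SANDBOX_RUNTIME", "local")
agent_default = VisionAgent(code_sandbox_runtime=None)

# Or pick the e2b sandbox explicitly.
agent_e2b = VisionAgent(code_sandbox_runtime="e2b")
```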
12 changes: 6 additions & 6 deletions vision_agent/agent/vision_agent_coder_prompts.py
@@ -44,35 +44,35 @@

## Subtasks

This plan uses the owl_v2_image tool to detect both people and helmets in a single pass, which should be efficient and accurate. We can then compare the detections to determine if each person is wearing a helmet.
-Use owl_v2_image with prompt 'person, helmet' to detect both people and helmets in the image
This plan uses the owlv2_object_detection tool to detect both people and helmets in a single pass, which should be efficient and accurate. We can then compare the detections to determine if each person is wearing a helmet.
-Use owlv2_object_detection with prompt 'person, helmet' to detect both people and helmets in the image
-Process the detections to match helmets with people based on bounding box proximity
-Count people with and without helmets based on the matching results
-Return a dictionary with the counts


**Tool Tests and Outputs**:
After examining the image, I can see 4 workers in total, with 3 wearing yellow safety helmets and 1 not wearing a helmet. Plan 1 using owl_v2_image seems to be the most accurate in detecting both people and helmets. However, it needs some modifications to improve accuracy. We should increase the confidence threshold to 0.15 to filter out the lowest confidence box, and implement logic to associate helmets with people based on their bounding box positions. Plan 2 and Plan 3 seem less reliable given the tool outputs, as they either failed to distinguish between people with and without helmets or misclassified all workers as not wearing helmets.
After examining the image, I can see 4 workers in total, with 3 wearing yellow safety helmets and 1 not wearing a helmet. Plan 1 using owlv2_object_detection seems to be the most accurate in detecting both people and helmets. However, it needs some modifications to improve accuracy. We should increase the confidence threshold to 0.15 to filter out the lowest confidence box, and implement logic to associate helmets with people based on their bounding box positions. Plan 2 and Plan 3 seem less reliable given the tool outputs, as they either failed to distinguish between people with and without helmets or misclassified all workers as not wearing helmets.

**Tool Output Thoughts**:
```python
...
```
----- stdout -----
Plan 1 - owl_v2_image:
Plan 1 - owlv2_object_detection:

[{{'label': 'helmet', 'score': 0.15, 'bbox': [0.85, 0.41, 0.87, 0.45]}}, {{'label': 'helmet', 'score': 0.3, 'bbox': [0.8, 0.43, 0.81, 0.46]}}, {{'label': 'helmet', 'score': 0.31, 'bbox': [0.85, 0.45, 0.86, 0.46]}}, {{'label': 'person', 'score': 0.31, 'bbox': [0.84, 0.45, 0.88, 0.58]}}, {{'label': 'person', 'score': 0.31, 'bbox': [0.78, 0.43, 0.82, 0.57]}}, {{'label': 'helmet', 'score': 0.33, 'bbox': [0.3, 0.65, 0.32, 0.67]}}, {{'label': 'person', 'score': 0.29, 'bbox': [0.28, 0.65, 0.36, 0.84]}}, {{'label': 'helmet', 'score': 0.29, 'bbox': [0.13, 0.82, 0.15, 0.85]}}, {{'label': 'person', 'score': 0.3, 'bbox': [0.1, 0.82, 0.24, 1.0]}}]

...

**Input Code Snippet**:
```python
from vision_agent.tools import load_image, owl_v2_image
from vision_agent.tools import load_image, owlv2_object_detection

def check_helmets(image_path):
image = load_image(image_path)
# Detect people and helmets, filter out the lowest confidence helmet score of 0.15
detections = owl_v2_image("person, helmet", image, box_threshold=0.15)
detections = owlv2_object_detection("person, helmet", image, box_threshold=0.15)
height, width = image.shape[:2]

# Separate people and helmets
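# Hypothetical continuation (not part of this diff): the hunk above is truncated
# after separating people and helmets. The matching step the plan describes could
# associate a helmet with a person whenever the helmet's box centre falls inside
# the person's bounding box, roughly like this:
def box_center(bbox):
    x1, y1, x2, y2 = bbox
    return (x1 + x2) / 2, (y1 + y2) / 2

def count_helmet_wearers(detections):
    people = [d for d in detections if d["label"] == "person"]
    helmets = [d for d in detections if d["label"] == "helmet"]
    with_helmet = sum(
        1
        for p in people
        if any(
            p["bbox"][0] <= cx <= p["bbox"][2] and p["bbox"][1] <= cy <= p["bbox"][3]
            for cx, cy in (box_center(h["bbox"]) for h in helmets)
        )
    )
    return {"with_helmet": with_helmet, "without_helmet": len(people) - with_helmet}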
12 changes: 6 additions & 6 deletions vision_agent/agent/vision_agent_planner_prompts.py
@@ -55,27 +55,27 @@
--- EXAMPLE1 ---
plan1:
- Load the image from the provided file path 'image.jpg'.
- Use the 'owl_v2_image' tool with the prompt 'person' to detect and count the number of people in the image.
- Use the 'owlv2_object_detection' tool with the prompt 'person' to detect and count the number of people in the image.
plan2:
- Load the image from the provided file path 'image.jpg'.
- Use the 'florence2_sam2_image' tool with the prompt 'person' to detect and count the number of people in the image.
- Use the 'florence2_sam2_instance_segmentation' tool with the prompt 'person' to detect and count the number of people in the image.
- Count the number of detected objects labeled as 'person'.
plan3:
- Load the image from the provided file path 'image.jpg'.
- Use the 'countgd_object_detection' tool to count the dominant foreground object, which in this case is people.

```python
from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_object_detection
from vision_agent.tools import load_image, owlv2_object_detection, florence2_sam2_instance_segmentation, countgd_object_detection
image = load_image("image.jpg")
owl_v2_out = owl_v2_image("person", image)
owl_v2_out = owlv2_object_detection("person", image)

f2s2_out = florence2_sam2_image("person", image)
f2s2_out = florence2_sam2_instance_segmentation("person", image)
# strip out the masks from the output because they don't provide useful information when printed
f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]

cgd_out = countgd_object_detection("person", image)

final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2, "countgd_object_detection": cgd_out}}
final_out = {{"owlv2_object_detection": owl_v2_out, "florence2_sam2_instance_segmentation": f2s2, "countgd_object_detection": cgd_out}}
print(final_out)
--- END EXAMPLE1 ---

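# Note on the doubled braces in the example above: these prompt templates are
# rendered with str.format, so "{{" and "}}" are escapes for literal braces.
# After rendering, the mask-stripping line the model sees is ordinary Python
# (illustrative values below, not real tool output):
f2s2_out = [{"label": "person", "score": 0.9, "bbox": [0.1, 0.2, 0.3, 0.4], "mask": None}]
f2s2_out = [{k: v for k, v in o.items() if k != "mask"} for o in f2s2_out]
print(f2s2_out)  # -> [{'label': 'person', 'score': 0.9, 'bbox': [0.1, 0.2, 0.3, 0.4]}]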
22 changes: 11 additions & 11 deletions vision_agent/agent/vision_agent_prompts.py
@@ -55,10 +55,10 @@

OBSERVATION:
[Artifact dog_detector.py (5 lines total)]
0|from vision_agent.tools import load_image, owl_v2_image
0|from vision_agent.tools import load_image, owlv2_object_detection
1|def detect_dogs(image_path: str):
2| image = load_image(image_path)
3| dogs = owl_v2_image("dog", image)
3| dogs = owlv2_object_detection("dog", image)
4| return dogs
[End of artifact]

@@ -96,10 +96,10 @@

OBSERVATION:
[Artifact dog_detector.py (5 lines total)]
0|from vision_agent.tools import load_image, owl_v2_image
0|from vision_agent.tools import load_image, owlv2_object_detection
1|def detect_dogs(image_path: str):
2| image = load_image(image_path)
3| dogs = owl_v2_image("dog", image, threshold=0.24)
3| dogs = owlv2_object_detection("dog", image, threshold=0.24)
4| return dogs
[End of artifact]

@@ -168,10 +168,10 @@

OBSERVATION:
[Artifact code.py (17 lines total)]
0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
0|from vision_agent.tools import load_image, florence2_object_detection, closest_box_distance, overlay_bounding_boxes, save_image
1|def count_workers_with_helmets(image_path: str):
2| image = load_image(image_path)
3| detections = florence2_phrase_grounding("worker, helmet", image)
3| detections = florence2_object_detection("worker, helmet", image)
4| workers = [d for d in detections if d['label'] == 'worker']
5| helmets = [d for d in detections if d['label'] == 'helmet']
6| count = 0
@@ -217,7 +217,7 @@
OBSERVATION:
[Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]

AGENT: <thinking>The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.</thinking>
AGENT: <thinking>The model has finished fine tuning, I will now replace the original florence2_object_detection call with the fine tuning id.</thinking>
<execute_python>
use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
</execute_python>
@@ -228,11 +228,11 @@
---
+++
@@ -1,7 +1,7 @@
from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
from vision_agent.tools import load_image, florence2_object_detection, closest_box_distance, overlay_bounding_boxes, save_image
def count_workers_with_helmets(image_path: str, output_path: str):
image = load_image(image_path)
- detections = florence2_phrase_grounding("worker, helmet", image)
+ detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
- detections = florence2_object_detection("worker, helmet", image)
+ detections = florence2_object_detection("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
workers = [d for d in detections if d['label'] == 'worker']
helmets = [d for d in detections if d['label'] == 'helmet']
count = 0
@@ -250,7 +250,7 @@ def count_workers_with_helmets(image_path: str, output_path: str):
3

AGENT: <thinking>Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
<response>I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
<response>I have updated the code using the fine tuned florence2_object_detection model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
<let_user_respond>true</let_user_respond>
"""

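For reference, the pattern the rewritten example teaches looks like the following sketch when used directly. The image path is hypothetical, the fine-tune id is the placeholder from the example above, and passing the id as the third argument mirrors the diff rather than a verified API signature.

```python
# Sketch based on the example above; argument order follows the diff shown in
# the prompt, not a confirmed function signature.
from vision_agent.tools import load_image, florence2_object_detection

image = load_image("workers.jpg")  # hypothetical image path
detections = florence2_object_detection(
    "worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf"
)
print([d["label"] for d in detections])
```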
21 changes: 18 additions & 3 deletions vision_agent/agent/vision_agent_prompts_v2.py
@@ -1,5 +1,5 @@
CONVERSATION = """
**Role**: You are a help agent that assists users with their vision-related questions.
**Role**: You are a helpful agent called VisionAgent, built by LandingAI, that assists users in writing code to solve vision tasks.

**Actions**:
`generate_or_edit_vision_code` - This action will generate code for the user to solve a vision task. It will also edit vision code for the user; this is useful if the user wants to modify vision-related aspects of the code such as changing the vision model or the image pre-processing steps.
@@ -8,7 +8,7 @@

**Examples**:
Here is an example of how you can interact with a user and Actions:
--- START EXAMPLES ---
--- EXAMPLE1 ---
USER: Can you help me write code to detect dogs in this image? Media name images/dogs.jpg

AGENT: <response>Yes, I can help you with that. I will write the code to detect dogs in the image.</response><action>generate_or_edit_vision_code</action>
@@ -28,8 +28,23 @@ def test_detect_dogs():
</final_test>

AGENT: <response>Here is the code to detect dogs in the image.</response>
--- END EXAMPLE1 ---

--- END EXAMPLES ---
--- EXAMPLE2 ---
USER: Who are you and what can you do?

AGENT: <response>I am VisionAgent, an agent built by LandingAI to help users write code to solve vision tasks. I can write code to do the following tasks:

- Detecting objects in an image with bounding boxes
- Detecting objects in an image with masks
- Tracking objects in a video
- Classifying images
- Understanding documents
- Pose estimation
- Visual question answering for both images and videos

How can I help you?</response>
--- END EXAMPLE2 ---

**Conversation**:
Here is the current conversation so far:
47 changes: 25 additions & 22 deletions vision_agent/agent/vision_agent_v2.py
@@ -23,25 +23,7 @@
from vision_agent.utils.execute import CodeInterpreter, CodeInterpreterFactory


def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
# only keep last 10 messages
conv = format_conversation(chat[-10:])
prompt = CONVERSATION.format(
conversation=conv,
)
response = agent([{"role": "user", "content": prompt}], stream=False)
return cast(str, response)


def check_for_interaction(chat: List[AgentMessage]) -> bool:
return (
len(chat) > 2
and chat[-2].role == "interaction"
and chat[-1].role == "interaction_response"
)


def extract_conversation_for_generate_code(
def extract_conversation(
chat: List[AgentMessage],
) -> Tuple[List[AgentMessage], Optional[str]]:
chat = copy.deepcopy(chat)
@@ -75,7 +57,27 @@ def extract_conversation_for_generate_code(

extracted_chat_strip_code = [chat_i] + extracted_chat_strip_code

return extracted_chat_strip_code[-5:], final_code
return extracted_chat_strip_code, final_code


def run_conversation(agent: LMM, chat: List[AgentMessage]) -> str:
extracted_chat, _ = extract_conversation(chat)
extracted_chat = extracted_chat[-10:]

conv = format_conversation(extracted_chat)
prompt = CONVERSATION.format(
conversation=conv,
)
response = agent([{"role": "user", "content": prompt}], stream=False)
return cast(str, response)


def check_for_interaction(chat: List[AgentMessage]) -> bool:
return (
len(chat) > 2
and chat[-2].role == "interaction"
and chat[-1].role == "interaction_response"
)


def maybe_run_action(
@@ -84,8 +86,10 @@ def maybe_run_action(
chat: List[AgentMessage],
code_interpreter: Optional[CodeInterpreter] = None,
) -> Optional[List[AgentMessage]]:
extracted_chat, final_code = extract_conversation(chat)
# only keep the last 5 messages to keep context recent and not overwhelm the LLM
extracted_chat = extracted_chat[-5:]
if action == "generate_or_edit_vision_code":
extracted_chat, _ = extract_conversation_for_generate_code(chat)
# there's an issue here because coder.generate_code will send its code_context
# to the outside user via its update_callback, but we don't necessarily have
# access to that update_callback here, so we re-create the message using
Expand All @@ -105,7 +109,6 @@ def maybe_run_action(
)
]
elif action == "edit_code":
extracted_chat, final_code = extract_conversation_for_generate_code(chat)
plan_context = PlanContext(
plan="Edit the latest code observed in the fewest steps possible according to the user's feedback.",
instructions=[
13 changes: 10 additions & 3 deletions vision_agent/lmm/lmm.py
@@ -50,6 +50,7 @@ def __init__(
api_key: Optional[str] = None,
max_tokens: int = 4096,
json_mode: bool = False,
image_detail: str = "low",
**kwargs: Any,
):
if not api_key:
@@ -59,7 +60,9 @@

self.client = OpenAI(api_key=api_key)
self.model_name = model_name
if "max_tokens" not in kwargs:
self.image_detail = image_detail
# o1 does not use max_tokens
if "max_tokens" not in kwargs and not model_name.startswith("o1"):
kwargs["max_tokens"] = max_tokens
if json_mode:
kwargs["response_format"] = {"type": "json_object"}
@@ -106,7 +109,7 @@ def chat(
or encoded_media.startswith("data:image/")
else f"data:image/png;base64,{encoded_media}"
),
"detail": "low",
"detail": self.image_detail,
},
},
)
@@ -186,6 +189,7 @@ def __init__(
azure_endpoint: Optional[str] = None,
max_tokens: int = 4096,
json_mode: bool = False,
image_detail: str = "low",
**kwargs: Any,
):
if not api_key:
@@ -208,6 +212,7 @@
azure_endpoint=azure_endpoint,
)
self.model_name = model_name
self.image_detail = image_detail

if "max_tokens" not in kwargs:
kwargs["max_tokens"] = max_tokens
@@ -370,9 +375,11 @@ def __init__(
api_key: Optional[str] = None,
model_name: str = "claude-3-5-sonnet-20240620",
max_tokens: int = 4096,
image_size: int = 768,
**kwargs: Any,
):
self.client = anthropic.Anthropic(api_key=api_key)
self.image_size = image_size
self.model_name = model_name
if "max_tokens" not in kwargs:
kwargs["max_tokens"] = max_tokens
@@ -399,7 +406,7 @@ def chat(
]
if "media" in msg:
for media_path in msg["media"]:
encoded_media = encode_media(media_path, resize=768)
encoded_media = encode_media(media_path, resize=self.image_size)
if encoded_media.startswith("data:image/png;base64,"):
encoded_media = encoded_media[len("data:image/png;base64,") :]
content.append(
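Finally, a short usage sketch of the new constructor options introduced in this file. The `OpenAILMM` and `AnthropicLMM` names follow the diff; the exact import path, model names, and other defaults should be treated as assumptions.

```python
# Sketch of the new image-handling knobs; details beyond the parameter names
# shown in the diff above are assumptions.
from vision_agent.lmm import AnthropicLMM, OpenAILMM

# Ask the OpenAI-backed LMM to send images at high detail instead of "low".
gpt = OpenAILMM(model_name="gpt-4o", image_detail="high")

# Resize images to 1024 px (instead of the previous hard-coded 768) for Claude.
claude = AnthropicLMM(model_name="claude-3-5-sonnet-20240620", image_size=1024)
```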