diff --git a/sam2/modeling/position_encoding.py b/sam2/modeling/position_encoding.py index cf53e30..52ac226 100644 --- a/sam2/modeling/position_encoding.py +++ b/sam2/modeling/position_encoding.py @@ -16,7 +16,7 @@ class PositionEmbeddingSine(nn.Module): """ This is a more standard version of the position embedding, very similar to the one - used by the Attention is all you need paper, generalized to work on images. + used by the Attention Is All You Need paper, generalized to work on images. """ def __init__( diff --git a/sam2/modeling/sam2_base.py b/sam2/modeling/sam2_base.py index 50d1655..224a8c1 100644 --- a/sam2/modeling/sam2_base.py +++ b/sam2/modeling/sam2_base.py @@ -642,7 +642,7 @@ def _prepare_memory_conditioned_features( pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W) return pix_feat_with_mem - # Use a dummy token on the first frame (to avoid emtpy memory input to tranformer encoder) + # Use a dummy token on the first frame (to avoid empty memory input to transformer encoder) to_cat_memory = [self.no_mem_embed.expand(1, B, self.mem_dim)] to_cat_memory_pos_embed = [self.no_mem_pos_enc.expand(1, B, self.mem_dim)] diff --git a/sam2/sam2_image_predictor.py b/sam2/sam2_image_predictor.py index 56d9325..41ce53a 100644 --- a/sam2/sam2_image_predictor.py +++ b/sam2/sam2_image_predictor.py @@ -183,7 +183,7 @@ def predict_batch( normalize_coords=True, ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]: """This function is very similar to predict(...), however it is used for batched mode, when the model is expected to generate predictions on multiple images. - It returns a tupele of lists of masks, ious, and low_res_masks_logits. + It returns a tuple of lists of masks, ious, and low_res_masks_logits. 
""" assert self._is_batch, "This function should only be used when in batched mode" if not self._is_image_set: diff --git a/sam2/sam2_video_predictor.py b/sam2/sam2_video_predictor.py index e0a9c99..8b2fd6c 100644 --- a/sam2/sam2_video_predictor.py +++ b/sam2/sam2_video_predictor.py @@ -44,7 +44,7 @@ def init_state( offload_state_to_cpu=False, async_loading_frames=False, ): - """Initialize a inference state.""" + """Initialize an inference state.""" compute_device = self.device # device of the model images, video_height, video_width = load_video_frames( video_path=video_path, @@ -589,7 +589,7 @@ def propagate_in_video_preflight(self, inference_state): # to `propagate_in_video_preflight`). consolidated_frame_inds = inference_state["consolidated_frame_inds"] for is_cond in [False, True]: - # Separately consolidate conditioning and non-conditioning temp outptus + # Separately consolidate conditioning and non-conditioning temp outputs storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs" # Find all the frames that contain temporary outputs for any objects # (these should be the frames that have just received clicks for mask inputs @@ -598,7 +598,7 @@ def propagate_in_video_preflight(self, inference_state): for obj_temp_output_dict in temp_output_dict_per_obj.values(): temp_frame_inds.update(obj_temp_output_dict[storage_key].keys()) consolidated_frame_inds[storage_key].update(temp_frame_inds) - # consolidate the temprary output across all objects on this frame + # consolidate the temporary output across all objects on this frame for frame_idx in temp_frame_inds: consolidated_out = self._consolidate_temp_output_across_obj( inference_state, frame_idx, is_cond=is_cond, run_mem_encoder=True diff --git a/sam2/utils/misc.py b/sam2/utils/misc.py index e2d39a0..525e8cb 100644 --- a/sam2/utils/misc.py +++ b/sam2/utils/misc.py @@ -68,7 +68,7 @@ def mask_to_box(masks: torch.Tensor): compute bounding box given an input mask Inputs: - - masks: [B, 1, H, W] 
boxes, dtype=torch.Tensor + - masks: [B, 1, H, W] masks, dtype=torch.Tensor Returns: - box_coords: [B, 1, 4], contains (x, y) coordinates of top left and bottom right box corners, dtype=torch.Tensor @@ -120,7 +120,7 @@ def __init__( self.offload_video_to_cpu = offload_video_to_cpu self.img_mean = img_mean self.img_std = img_std - # items in `self._images` will be loaded asynchronously + # items in `self.images` will be loaded asynchronously self.images = [None] * len(img_paths) # catch and raise any exceptions in the async loading thread self.exception = None diff --git a/sav_dataset/sav_evaluator.py b/sav_dataset/sav_evaluator.py index 1c319e1..d4b0ef0 100644 --- a/sav_dataset/sav_evaluator.py +++ b/sav_dataset/sav_evaluator.py @@ -72,7 +72,7 @@ parser.add_argument( "--do_not_skip_first_and_last_frame", help="In SA-V val and test, we skip the first and the last annotated frames in evaluation. " - "Set this to true for evaluation on settings that doen't skip first and last frames", + "Set this to true for evaluation on settings that don't skip first and last frames", action="store_true", ) diff --git a/sav_dataset/utils/sav_benchmark.py b/sav_dataset/utils/sav_benchmark.py index babb330..c4b2444 100644 --- a/sav_dataset/utils/sav_benchmark.py +++ b/sav_dataset/utils/sav_benchmark.py @@ -183,7 +183,7 @@ def _seg2bmap(seg, width=None, height=None): assert not ( width > w | height > h | abs(ar1 - ar2) > 0.01 - ), "Can" "t convert %dx%d seg to %dx%d bmap." % (w, h, width, height) + ), "Cannot convert %dx%d seg to %dx%d bmap." % (w, h, width, height) e = np.zeros_like(seg) s = np.zeros_like(seg) diff --git a/setup.py b/setup.py index 92ee0f3..ebef97c 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ LICENSE = "Apache 2.0" # Read the contents of README file -with open("README.md", "r") as f: +with open("README.md", "r", encoding="utf-8") as f: LONG_DESCRIPTION = f.read() # Required dependencies