More reasonable logic for replace_unicode_tokens, continuing #25

* Perform the replacement only when all of a token's characters belong to the special Unicode categories (previously, a single such character was enough to trigger it)
allenai · Jul 8, 2022 · 1815c49 · 1815c49
1 parent 3a43b1e
commit 1815c49
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 1 deletion.
diff --git a/src/vila/utils.py b/src/vila/utils.py
@@ -117,7 +117,7 @@ def replace_unicode_tokens(
     """
     tokens = tokens.copy()
     for idx in range(len(tokens)):
-        if any(unicodedata.category(ch) in unicode_categories for ch in tokens[idx]):
+        if all(unicodedata.category(ch) in unicode_categories for ch in tokens[idx]):
             logging.debug(f"Replacing special unicode tokens {tokens[idx]} with {replace_token}")
             tokens[idx] = replace_token
 

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -1,4 +1,6 @@
 from vila.predictors import normalize_bbox, unnormalize_bbox
+from vila.utils import replace_unicode_tokens
+from vila.constants import UNICODE_CATEGORIES_TO_REPLACE
 
 
 def test_normalize_bbox():
@@ -19,3 +21,16 @@ def test_normalize_bbox():
 
     assert unnormalize_bbox((125.0, 250.0, 250.0, 500.0), 1024, 1024) == (128, 256, 256, 512)
     # fmt: on
+
+
+def test_replace_unicode_tokens():
+
+    words = ["\uf02a", "\uf02a\u00ad", "Modalities\uf02a"]
+
+    out = replace_unicode_tokens(
+        words,
+        UNICODE_CATEGORIES_TO_REPLACE,
+        "[UNK]",
+    )
+
+    assert out == ["[UNK]", "[UNK]", "Modalities\uf02a"]