Skip to content

Commit

Permalink
More reasonable logic for replace_unicode_tokens, continuing #25
Browse files Browse the repository at this point in the history
* It now performs the replacement only when all the characters in a token belong to the special Unicode categories
  • Loading branch information
lolipopshock committed Jul 8, 2022
1 parent 3a43b1e commit 1815c49
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/vila/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def replace_unicode_tokens(
"""
tokens = tokens.copy()
for idx in range(len(tokens)):
if any(unicodedata.category(ch) in unicode_categories for ch in tokens[idx]):
if all(unicodedata.category(ch) in unicode_categories for ch in tokens[idx]):
logging.debug(f"Replacing special unicode tokens {tokens[idx]} with {replace_token}")
tokens[idx] = replace_token

Expand Down
15 changes: 15 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from vila.predictors import normalize_bbox, unnormalize_bbox
from vila.utils import replace_unicode_tokens
from vila.constants import UNICODE_CATEGORIES_TO_REPLACE


def test_normalize_bbox():
Expand All @@ -19,3 +21,16 @@ def test_normalize_bbox():

assert unnormalize_bbox((125.0, 250.0, 250.0, 500.0), 1024, 1024) == (128, 256, 256, 512)
# fmt: on


def test_replace_unicode_tokens():
    """Check which tokens replace_unicode_tokens swaps for the placeholder.

    Per the expected output, the first two tokens are replaced with
    "[UNK]", while the token that also contains ordinary letters is
    returned unchanged.
    """
    tokens = ["\uf02a", "\uf02a\u00ad", "Modalities\uf02a"]
    expected = ["[UNK]", "[UNK]", "Modalities\uf02a"]

    replaced = replace_unicode_tokens(tokens, UNICODE_CATEGORIES_TO_REPLACE, "[UNK]")

    assert replaced == expected

0 comments on commit 1815c49

Please sign in to comment.