-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Replace Certain Unicode Characters for the input (#23)
* allow setting return_type in predict_page * replace certain unicode characters * Update the unicode-char-searching examples * rename char file * Add tests * Add arguments for `replace_empty_unicode`
- Loading branch information
1 parent
5509e3b
commit a593f1a
Showing
7 changed files
with
36,052 additions
and
10 deletions.
There are no files selected for viewing
34,627 changes: 34,627 additions & 0 deletions
34,627
examples/find-empty-unicode-chars/unicode-char-categories.csv
Large diffs are not rendered by default.
Oops, something went wrong.
216 changes: 216 additions & 0 deletions
216
examples/find-empty-unicode-chars/unicode-char-search.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,216 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd \n", | ||
"df = pd.read_csv(\"unicode-char-categories.csv\")\n", | ||
"# From https://www.fileformat.info/info/unicode/category/index.htm" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from transformers import AutoTokenizer" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"tokenizer = AutoTokenizer.from_pretrained(\"allenai/ivila-row-layoutlm-finetuned-s2vl-v2\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"UNICODE_CATEGORIES_TO_REPLACE = [\"Cc\", \"Cf\", \"Co\", \"Cs\", \"Mn\", \"Zl\", \"Zp\", \"Zs\"]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def calculate_tokenization_len(row): \n", | ||
" uni_code = r'\\u' + row['Character'][2:]\n", | ||
" s = uni_code.encode().decode('unicode_escape')\n", | ||
" return len(tokenizer(s, add_special_tokens=False)['input_ids'])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df['Tokenization Length'] = df.apply(calculate_tokenization_len, axis=1)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>count</th>\n", | ||
" <th>mean</th>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>Code</th>\n", | ||
" <th></th>\n", | ||
" <th></th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>Cc</th>\n", | ||
" <td>65</td>\n", | ||
" <td>0.000000</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>Cf</th>\n", | ||
" <td>163</td>\n", | ||
" <td>0.766871</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>Co</th>\n", | ||
" <td>6</td>\n", | ||
" <td>0.666667</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>Cs</th>\n", | ||
" <td>6</td>\n", | ||
" <td>0.000000</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>Mn</th>\n", | ||
" <td>1950</td>\n", | ||
" <td>0.549744</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>Zl</th>\n", | ||
" <td>1</td>\n", | ||
" <td>0.000000</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>Zp</th>\n", | ||
" <td>1</td>\n", | ||
" <td>0.000000</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>Zs</th>\n", | ||
" <td>17</td>\n", | ||
" <td>0.000000</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" count mean\n", | ||
"Code \n", | ||
"Cc 65 0.000000\n", | ||
"Cf 163 0.766871\n", | ||
"Co 6 0.666667\n", | ||
"Cs 6 0.000000\n", | ||
"Mn 1950 0.549744\n", | ||
"Zl 1 0.000000\n", | ||
"Zp 1 0.000000\n", | ||
"Zs 17 0.000000" | ||
] | ||
}, | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"df.groupby('Code')['Tokenization Length'].agg(['count', 'mean']).loc[UNICODE_CATEGORIES_TO_REPLACE]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"array(['Cc', 'Cf', 'Co', 'Cs', 'Mc', 'Mn', 'So', 'Zl', 'Zp', 'Zs'],\n", | ||
" dtype=object)" | ||
] | ||
}, | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"df[df['Tokenization Length']==0].Code.unique()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df[df['Tokenization Length']==0]['Character'].to_csv(\"zero-length-unicode-chars.txt\", index=None, header=None)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |
Oops, something went wrong.