-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathpreprocessing.py
40 lines (37 loc) · 1.52 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import re
import pandas as pd
def make_trans():
    """Build the character table used to transliterate Latin text to Cyrillic.

    The two letter lists below are paired positionally: each character in
    the first list is replaced by the character at the same position in the
    second. Several source letters share a target (e.g. "k" and "q", or
    "b", "v" and "w"), and "ё" is folded into "е".

    Returns:
        A dict usable with ``str.translate``: keys are the code points of
        the source characters, values are the replacement strings.
    """
    source_chars = 'a b c d e f g h i j k l m n o p q r s t u v w x y z ё'.split()
    target_chars = 'а в с д е ф г н и ж к л м н о р к р с т у в в х у з е'.split()
    # The one-argument form of maketrans converts the character keys to ordinals.
    return str.maketrans(dict(zip(source_chars, target_chars)))
def normalize(ser: pd.Series) -> pd.Series:
    """Normalize a Series of free-text strings into a canonical token form.

    The steps are applied in this order (the order matters, later patterns
    rely on the output of earlier ones):

    1. split camelCase boundaries with a space;
    2. lowercase everything;
    3. drop a word-final "ъ";
    4. transliterate characters with the table from ``make_trans()``;
    5. replace "N x N x N" style dimensions with the ``DxDxD`` token;
    6. replace "№" with ``NUM`` and "%" with ``PERC``;
    7. rewrite decimal numbers so the separator becomes "p";
    8. glue a number to the unit that follows it;
    9. join short slash-separated abbreviations;
    10. turn every remaining non-word character and underscore into a space;
    11. glue runs of single-character tokens together.

    Every pattern-based ``str.replace`` passes ``regex=True`` explicitly.
    pandas 2.0 changed the default to ``regex=False``, under which string
    patterns are treated as literal text and compiled patterns are rejected.

    Args:
        ser: Series of strings to normalize.

    Returns:
        A new Series with the normalized strings.
    """
    # "СокДобрый" -> "Сок Добрый"
    camel_case_pat = re.compile(r'([а-яa-z])([А-ЯA-Z])')
    # word-final "ъ" is dropped
    hard_sign_pat = re.compile(r'ъ\b')
    # "14х15х30" -> "DxDxD"
    dxdxd_pat = re.compile(r'((?:\d+\s*[х\*]\s*){2}\d+)')
    # "1.2 15,5" -> "1p2 15p5"
    digit_pat = re.compile(r'(\d+)[\.,](\d+)')
    # "15 мл" -> "15мл"
    unit = 'мг|г|гр|кг|мл|л|шт'
    unit_pat = re.compile(fr'((?:\d+p)?\d+)\s*({unit})\b')
    # "ж/б ст/б" -> "жб стб"
    w_w_pat = re.compile(r'\b([а-я]{1,2})/([а-я]{1,2})\b')
    # any non-word character or underscore becomes a space
    non_word_pat = re.compile(r'[\W_]')
    # "a b c d" -> "abcd"
    glue_pat = re.compile(r'(?<=(?<!\w)\w) (?=\w(?!\w))', re.UNICODE)

    return (
        ser
        .str.replace(camel_case_pat, r'\1 \2', regex=True)
        .str.lower()
        .str.replace(hard_sign_pat, '', regex=True)
        # "lmno" -> "лмно"
        .str.translate(make_trans())
        .str.replace(dxdxd_pat, ' DxDxD ', regex=True)
        # Plain single-character substitutions: keep them literal.
        .str.replace('№', ' NUM ', regex=False)
        .str.replace('%', ' PERC ', regex=False)
        .str.replace(digit_pat, r' \1p\2 ', regex=True)
        .str.replace(unit_pat, r' \1\2 ', regex=True)
        .str.replace(w_w_pat, r' \1\2 ', regex=True)
        .str.replace(non_word_pat, ' ', regex=True)
        .str.replace(glue_pat, '', regex=True)
    )