-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcompile_training_data.py
61 lines (44 loc) · 1.58 KB
/
compile_training_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from pathlib import Path
from tqdm import tqdm
import zipfile
import csv
# Location of the zipped data package this script reads from.
package_path = Path("data/package.zip")

# Substrings that disqualify a message: filter_messages() drops any message
# that contains at least one of these entries.
removed = [
    # put words you want excluded here
]

# Extraction directory: the archive path with its final extension removed,
# i.e. a folder sitting next to the archive and named after it.
FOLDER_NAME = package_path.with_suffix("")
def extract_package():
    """Unpack the zip archive at ``package_path`` into ``FOLDER_NAME``.

    Prints a status line first, then extracts every member of the archive.
    """
    print("Extracting package...")
    with zipfile.ZipFile(package_path, mode="r") as archive:
        archive.extractall(path=FOLDER_NAME)
def get_messages():
    """Collect the non-empty "Contents" value of every row in each messages.csv.

    Looks under ``FOLDER_NAME`` for files matching
    ``messages/**/messages.csv`` and reads each one as UTF-8 CSV with a
    header row.

    Returns:
        A list of strings, one per row whose "Contents" field is non-empty,
        with every newline inside the text replaced by a single space.
    """
    collected = []
    csv_paths = FOLDER_NAME.glob("messages/**/messages.csv")
    for csv_path in tqdm(csv_paths, desc="Getting messages", unit="msg"):
        with open(csv_path, "r", encoding="utf-8") as handle:
            for row in csv.DictReader(handle):
                text = row["Contents"]
                # Rows with a missing or empty "Contents" field are skipped.
                if text:
                    # Flatten multi-line messages so each stays on one line.
                    collected.append(text.replace("\n", " "))
    return collected
def filter_messages(messages):
    """Return the messages that are not None and contain no entry of ``removed``.

    Matching is by substring (``in``), and the input order is preserved.

    Args:
        messages: A sized collection of message strings (``len()`` is taken
            to give the progress bar its total).

    Returns:
        A new list holding only the messages that passed the filter.
    """
    progress = tqdm(messages, desc="Filtering messages", total=len(messages), unit="msg")
    return [
        text
        for text in progress
        if text is not None and not any(banned in text for banned in removed)
    ]
def write_messages(messages):
    """Write each message, followed by a newline, to ``data/messages.txt``.

    The file is opened in write mode as UTF-8, so any existing content is
    replaced.

    Args:
        messages: A sized collection of message strings (``len()`` is taken
            to give the progress bar its total).
    """
    with open("data/messages.txt", "w", encoding="utf-8") as out:
        progress = tqdm(messages, desc="Writing messages", total=len(messages), unit="msg")
        out.writelines(text + "\n" for text in progress)
def main():
    """Run the pipeline: extract if needed, then collect, filter and write.

    The archive is only extracted when ``FOLDER_NAME`` does not exist yet.
    """
    folder_missing = not FOLDER_NAME.exists()
    if folder_missing:
        extract_package()
    write_messages(filter_messages(get_messages()))


if __name__ == "__main__":
    main()