sentence_splitter.py
#!/usr/bin/env python
from __future__ import annotations

import argparse
import contextlib
import multiprocessing
import sys
from typing import Any, IO

import spacy
from tqdm.auto import tqdm


# From https://github.com/allenai/allennlp/blob/b2eb036/allennlp/commands/__init__.py#L27
class ArgumentParserWithDefaults(argparse.ArgumentParser):
    """Custom argument parser that will display the default value for an argument in the help message."""

    _action_defaults_to_ignore = {"help", "store_true", "store_false", "store_const"}

    @staticmethod
    def _is_empty_default(default: Any) -> bool:
        if default is None:
            return True
        if isinstance(default, (str, list, tuple, set)):
            return not bool(default)
        return False

    def add_argument(self, *args, **kwargs) -> None:
        # Add the default value to the help message when the default is meaningful.
        default = kwargs.get("default")
        if kwargs.get(
            "action"
        ) not in self._action_defaults_to_ignore and not self._is_empty_default(default):
            description = kwargs.get("help", "")
            kwargs["help"] = f"{description} (default = {default})"
        super().add_argument(*args, **kwargs)
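
# With the parser above, `--help` shows each meaningful default, e.g. (illustrative, not captured output):
#   --spacy-model-name SPACY_MODEL_NAME
#                         (default = en_core_web_sm)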


def parse_args() -> argparse.Namespace:
    parser = ArgumentParserWithDefaults(description="Program to split documents into sentences. The input should be "
                                                    "a text file with one document per line. The output will be one "
                                                    "sentence per line, and documents will be separated by an empty "
                                                    "line.")
    parser.add_argument("path", metavar="FILE", nargs="?", default="-", help="input file or '-' to read from stdin")
    parser.add_argument("--encoding", default="utf-8")
    parser.add_argument("--spacy-model-name", default="en_core_web_sm")
    parser.add_argument("--buffer-size", type=int, help="to control how much of the file is loaded into memory at once")
    parser.add_argument("--n-process", type=int, default=-1, help="to control how many processes are used to run the "
                                                                  "spaCy pipeline. Especially useful when running on a "
                                                                  "CPU. Set to -1 to use one process when running on a "
                                                                  "GPU or to use all available cores when running on a "
                                                                  "CPU.")
    parser.add_argument("--use-gpu", choices=["no", "prefer", "require"], default="no")
    return parser.parse_args()
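
# Example invocations (a sketch for illustration; "docs.txt" and "sentences.txt" are placeholder file names):
#   python sentence_splitter.py docs.txt > sentences.txt
#   cat docs.txt | python sentence_splitter.py - --use-gpu prefer --n-process 1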


def get_file_line_count(path: str, **kwargs) -> int | None:
    # The count is only used to give tqdm a total for its progress bar; stdin cannot be counted in advance.
    if path == "-":
        return None
    else:
        with open(path, **kwargs) as file:
            return sum(1 for _ in file)


# From https://stackoverflow.com/a/29824059/1165181
@contextlib.contextmanager
def smart_open(path: str, mode: str | None = "r", **kwargs) -> IO[Any]:
    if path == "-":
        file = sys.stdin if mode is None or mode == "" or "r" in mode else sys.stdout
    else:
        file = open(path, mode, **kwargs)

    try:
        yield file
    finally:
        if path != "-":
            file.close()
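
# Note: `smart_open` only closes files it opened itself; when the path is "-", stdin/stdout are left open for the caller.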


def main() -> None:
    args = parse_args()

    line_count = get_file_line_count(args.path, encoding=args.encoding)

    if args.use_gpu == "no":
        use_gpu = False
    elif args.use_gpu == "prefer":
        use_gpu = spacy.prefer_gpu()
    elif args.use_gpu == "require":
        use_gpu = spacy.require_gpu()
    else:
        raise ValueError(f"Unknown value for --use-gpu: {args.use_gpu}")

    # -1 means one process on GPU, or all available cores on CPU (see the --n-process help text).
    n_process = (1 if use_gpu else multiprocessing.cpu_count()) if args.n_process == -1 else args.n_process

    nlp = spacy.load(args.spacy_model_name)

    with smart_open(args.path, encoding=args.encoding) as file:
        for doc in tqdm(nlp.pipe(file, batch_size=args.buffer_size, n_process=n_process), total=line_count):
            for sent in doc.sents:
                if sentence_text := sent.text.strip():
                    print(sentence_text)
            print()  # Blank line to separate documents.


if __name__ == "__main__":
    main()
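
# A minimal sketch of the same sentence-splitting technique for quick experimentation in a REPL
# (assumes the en_core_web_sm model is installed; the sample texts are made up for illustration):
#
#   import spacy
#   nlp = spacy.load("en_core_web_sm")
#   for doc in nlp.pipe(["First sentence. Second one.", "Another document."]):
#       print([sent.text for sent in doc.sents])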