This repository has been archived by the owner on May 22, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathrecognize_commands.py
executable file
·188 lines (161 loc) · 7.39 KB
/
recognize_commands.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Stream accuracy recognize commands."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
class RecognizeResult(object):
"""Save recognition result temporarily.
Attributes:
founded_command: A string indicating the word just founded. Default value
is '_silence_'
score: An float representing the confidence of founded word. Default
value is zero.
is_new_command: A boolean indicating if the founded command is a new one
against the last one. Default value is False.
"""
def __init__(self):
self._founded_command = "_silence_"
self._score = 0
self._is_new_command = False
@property
def founded_command(self):
return self._founded_command
@founded_command.setter
def founded_command(self, value):
self._founded_command = value
@property
def score(self):
return self._score
@score.setter
def score(self, value):
self._score = value
@property
def is_new_command(self):
return self._is_new_command
@is_new_command.setter
def is_new_command(self, value):
self._is_new_command = value
class RecognizeCommands(object):
"""Smooth the inference results by using average window.
Maintain a slide window over the audio stream, which adds new result(a pair of
the 1.confidences of all classes and 2.the start timestamp of input audio
clip) directly the inference produces one and removes the most previous one
and other abnormal values. Then it smooth the results in the window to get
the most reliable command in this period.
Attributes:
_label: A list containing commands at corresponding lines.
_average_window_duration: The length of average window.
_detection_threshold: A confidence threshold for filtering out unreliable
command.
_suppression_ms: Milliseconds every two reliable founded commands should
apart.
_minimum_count: An integer count indicating the minimum results the average
window should cover.
_previous_results: A deque to store previous results.
_label_count: The length of label list.
_previous_top_label: Last founded command. Initial value is '_silence_'.
_previous_top_time: The timestamp of _previous results. Default is -np.inf.
"""
def __init__(self, labels, average_window_duration_ms, detection_threshold,
suppression_ms, minimum_count):
"""Init the RecognizeCommands with parameters used for smoothing."""
# Configuration
self._labels = labels
self._average_window_duration_ms = average_window_duration_ms
self._detection_threshold = detection_threshold
self._suppression_ms = suppression_ms
self._minimum_count = minimum_count
# Working Variable
self._previous_results = collections.deque()
self._label_count = len(labels)
self._previous_top_label = "_silence_"
self._previous_top_time = -np.inf
def process_latest_result(self, latest_results, current_time_ms,
recognize_element):
"""Smoothing the results in average window when a new result is added in.
Receive a new result from inference and put the founded command into
a RecognizeResult instance after the smoothing procedure.
Args:
latest_results: A list containing the confidences of all labels.
current_time_ms: The start timestamp of the input audio clip.
recognize_element: An instance of RecognizeResult to store founded
command, its scores and if it is a new command.
Raises:
ValueError: The length of this result from inference doesn't match
label count.
ValueError: The timestamp of this result is earlier than the most
previous one in the average window
"""
if latest_results.shape[0] != self._label_count:
raise ValueError("The results for recognition should contain {} "
"elements, but there are {} produced".format(
self._label_count, latest_results.shape[0]))
if (self._previous_results.__len__() != 0 and
current_time_ms < self._previous_results[0][0]):
raise ValueError("Results must be fed in increasing time order, "
"but receive a timestamp of {}, which was earlier "
"than the previous one of {}".format(
current_time_ms, self._previous_results[0][0]))
# Add the latest result to the head of the deque.
self._previous_results.append([current_time_ms, latest_results])
# Prune any earlier results that are too old for the averaging window.
time_limit = current_time_ms - self._average_window_duration_ms
while time_limit > self._previous_results[0][0]:
self._previous_results.popleft()
# If there are too few results, the result will be unreliable and bail.
how_many_results = self._previous_results.__len__()
earliest_time = self._previous_results[0][0]
sample_duration = current_time_ms - earliest_time
if (how_many_results < self._minimum_count or
sample_duration < self._average_window_duration_ms / 4):
recognize_element.founded_command = self._previous_top_label
recognize_element.score = 0.0
recognize_element.is_new_command = False
return
# Calculate the average score across all the results in the window.
average_scores = np.zeros(self._label_count)
for item in self._previous_results:
score = item[1]
for i in range(score.size):
average_scores[i] += score[i] / how_many_results
# Sort the averaged results in descending score order.
sorted_averaged_index_score = []
for i in range(self._label_count):
sorted_averaged_index_score.append([i, average_scores[i]])
sorted_averaged_index_score = sorted(
sorted_averaged_index_score, key=lambda p: p[1], reverse=True)
# Use the information of previous result to get current result
current_top_index = sorted_averaged_index_score[0][0]
current_top_label = self._labels[current_top_index]
current_top_score = sorted_averaged_index_score[0][1]
time_since_last_top = 0
if (self._previous_top_label == "_silence_" or
self._previous_top_time == -np.inf):
time_since_last_top = np.inf
else:
time_since_last_top = current_time_ms - self._previous_top_time
if (current_top_score > self._detection_threshold and
current_top_label != self._previous_top_label and
time_since_last_top > self._suppression_ms):
self._previous_top_label = current_top_label
self._previous_top_time = current_time_ms
recognize_element.is_new_command = True
else:
recognize_element.is_new_command = False
recognize_element.founded_command = current_top_label
recognize_element.score = current_top_score