-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathsmall_table.go
187 lines (172 loc) · 6.79 KB
/
small_table.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
package quamina
// byteCeiling - the automaton runs on UTF-8 bytes, which map nicely to Go's byte, which is uint8. The values
// 0xF5-0xFF can't appear in UTF-8 strings. We use 0xF5 as a value terminator, so characters F6 and higher
// can't appear.
const byteCeiling int = 0xf6
// valueTerminator - whenever we're trying to match a value with a pattern that extends to the end of that
// value, we virtually add one of these as the last character, both to the automaton and the value at run-time.
// This simplifies things because you don't have to treat absolute-string-match (only works at last char in
// value) and prefix match differently.
const valueTerminator byte = 0xf5
// nolint:gofmt,goimports
// smallTable serves as a lookup table that encodes mappings between ranges of byte values and the
// transition on any byte in the range.
//
// The way it works is exposed in the step() function just below. Logically, it's a slice of {byte, S}
// but I imagine organizing it this way is a bit more memory-efficient. Suppose we want to model a table where
// byte values 3 and 4 map to ss1 and byte 0x34 maps to ss2. Then the smallTable would look like:
//
// ceilings:---|3|----|5|-|0x34|--|x35|-|byteCeiling|
// states:---|nil|-|&ss1|--|nil|-|&ss2|---------|nil|
// invariant: The last element of ceilings is always byteCeiling
//
// The motivation is that we want to build a state machine on byte values to implement things like prefixes and
// ranges of bytes. This could be done simply with an array of size byteCeiling for each state in the machine,
// or a map[byte]S, but both would be size-inefficient, particularly in the case where you're implementing
// ranges. Now, the step function is O(N) in the number of entries, but empirically, the number of entries is
// small even in large automata, so skipping throgh the ceilings list is measurably about the same speed as a map
// or array construct. One could imagine making step() smarter and do a binary search in the case where there are
// more than some number of entries. But I'm dubious, the ceilings field is []byte and running through a single-digit
// number of those has a good chance of minimizing memory fetches.
// Since this is used to support nondeterministic finite automata (NFAs), it is possible for a state
// to have epsilon transitions, i.e. a transition that is always taken whatever the next input symbol is.
type smallTable struct {
ceilings []byte
steps []*faNext
epsilon []*faState
}
// newSmallTable mostly exists to enforce the constraint that every smallTable has a byteCeiling entry at
// the end, which smallTable.step totally depends on.
func newSmallTable() *smallTable {
return &smallTable{
ceilings: []byte{byte(byteCeiling)},
steps: []*faNext{nil},
}
}
type stepOut struct {
steps []*faState
epsilon []*faState
}
var forbiddenBytes = map[byte]bool{
0xC0: true, 0xC1: true,
0xF5: true, 0xF6: true, 0xF7: true, 0xF8: true, 0xF9: true, 0xFA: true,
0xFB: true, 0xFC: true, 0xFD: true, 0xFE: true, 0xFF: true,
}
// step finds the list of states that result from a transition on the utf8Byte argument. The states can come
// as a result of looking in the table structure, and also the "epsilon" transitions that occur on every
// input byte. Since this is the white-hot center of Quamina's runtime CPU, we don't want to be merging
// the two lists. So to avoid any memory allocation, the caller passes in a structure with the two lists
// and step fills them in.
func (t *smallTable) step(utf8Byte byte, out *stepOut) {
out.epsilon = t.epsilon
for index, ceiling := range t.ceilings {
if utf8Byte < ceiling {
if t.steps[index] == nil {
out.steps = nil
} else {
out.steps = t.steps[index].states
}
return
}
}
_, forbidden := forbiddenBytes[utf8Byte]
if forbidden {
return
}
panic("Malformed smallTable")
}
// dStep takes a step through an NFA in the case where it is known that the NFA in question
// is deterministic, i.e. each combination of an faState and a byte value transitions to at
// most one other byte value.
func (t *smallTable) dStep(utf8Byte byte) *faState {
for index, ceiling := range t.ceilings {
if utf8Byte < ceiling {
if t.steps[index] == nil {
return nil
} else {
return t.steps[index].states[0]
}
}
}
_, forbidden := forbiddenBytes[utf8Byte]
if forbidden {
return nil
}
panic("Malformed smallTable")
}
// makeSmallTable creates a pre-loaded small table, with all bytes not otherwise specified having the defaultStep
// value, and then a few other values with their indexes and values specified in the other two arguments. The
// goal is to reduce memory churn
// constraint: positions must be provided in order
func makeSmallTable(defaultStep *faNext, indices []byte, steps []*faNext) *smallTable {
t := smallTable{
ceilings: make([]byte, 0, len(indices)+2),
steps: make([]*faNext, 0, len(indices)+2),
}
var lastIndex byte = 0
for i, index := range indices {
if index > lastIndex {
t.ceilings = append(t.ceilings, index)
t.steps = append(t.steps, defaultStep)
}
t.ceilings = append(t.ceilings, index+1)
t.steps = append(t.steps, steps[i])
lastIndex = index + 1
}
if indices[len(indices)-1] < byte(byteCeiling) {
t.ceilings = append(t.ceilings, byte(byteCeiling))
t.steps = append(t.steps, defaultStep)
}
return &t
}
func (t *smallTable) gatherMetadata(meta *nfaMetadata) {
eps := len(t.epsilon)
for _, step := range t.steps {
if step != nil {
if (eps + len(step.states)) > meta.maxOutDegree {
meta.maxOutDegree = eps + len(step.states)
}
for _, state := range step.states {
state.table.gatherMetadata(meta)
}
}
}
}
// unpackedTable replicates the data in the smallTable ceilings and states arrays. It's quite hard to
// update the list structure in a smallTable, but trivial in an unpackedTable. The idea is that to update
// a smallTable you unpack it, update, then re-pack it. Not gonna be the most efficient thing so at some future point…
// TODO: Figure out how to update a smallTable in place
type unpackedTable [byteCeiling]*faNext
func unpackTable(t *smallTable) *unpackedTable {
var u unpackedTable
unpackedIndex := 0
for packedIndex, c := range t.ceilings {
ceiling := int(c)
for unpackedIndex < ceiling {
u[unpackedIndex] = t.steps[packedIndex]
unpackedIndex++
}
}
return &u
}
func (t *smallTable) pack(u *unpackedTable) {
ceilings := make([]byte, 0, 16)
steps := make([]*faNext, 0, 16)
lastStep := u[0]
for unpackedIndex, ss := range u {
if ss != lastStep {
ceilings = append(ceilings, byte(unpackedIndex))
steps = append(steps, lastStep)
}
lastStep = ss
}
ceilings = append(ceilings, byte(byteCeiling))
steps = append(steps, lastStep)
t.ceilings = ceilings
t.steps = steps
}
func (t *smallTable) addByteStep(utf8Byte byte, step *faNext) {
unpacked := unpackTable(t)
unpacked[utf8Byte] = step
t.pack(unpacked)
}