diff --git a/builder.go b/builder.go index f793329..993246f 100644 --- a/builder.go +++ b/builder.go @@ -20,9 +20,21 @@ import ( ) var defaultBuilderOpts = &BuilderOpts{ - Encoder: 1, - RegistryTableSize: 10000, - RegistryMRUSize: 2, + Encoder: 1, + RegistryTableSize: 10000, + RegistryMRUSize: 2, + UnfinishedNodesStackSize: 64, + BuilderNodePoolingConfig: BuilderNodePoolingConfig{ + // This value should always be significantly larger than + // RegistryTableSize * RegistryMRUSize because that defines + // how many items can be stored in the registry, and if the + // pools MaxSize is not much larger than that, then all the + // BuilderNodes will end up stuck in the registry and not + // returned to the pool which will cause the building process + // to begin allocating a lot. + MaxSize: 100000, + MaxTransitionSize: 32, + }, } // A Builder is used to build a new FST. When possible data is @@ -50,9 +62,10 @@ func newBuilder(w io.Writer, opts *BuilderOpts) (*Builder, error) { if opts == nil { opts = defaultBuilderOpts } - builderNodePool := &builderNodePool{} + + builderNodePool := newBuilderNodePool(opts.BuilderNodePoolingConfig) rv := &Builder{ - unfinished: newUnfinishedNodes(builderNodePool), + unfinished: newUnfinishedNodes(builderNodePool, opts), registry: newRegistry(builderNodePool, opts.RegistryTableSize, opts.RegistryMRUSize), builderNodePool: builderNodePool, opts: opts, @@ -83,6 +96,7 @@ func (b *Builder) Reset(w io.Writer) error { if err != nil { return err } + return nil } @@ -156,12 +170,22 @@ func (b *Builder) compileFrom(iState int) error { func (b *Builder) compile(node *builderNode) (int, error) { if node.final && len(node.trans) == 0 && node.finalOutput == 0 { + // We're done with this node so its safe to put it back in the pool. + b.builderNodePool.Put(node) return 0, nil } found, addr, entry := b.registry.entry(node) if found { + // This node already existed in the registry (and thus the registry + // did not assume ownership of it) so its safe to put it back in + // the pool. + b.builderNodePool.Put(node) return addr, nil } + // If the node was not found in the registry, then the registry will + // have assumed ownership of it and is responsible for returning it + // to the pool. + addr, err := b.encoder.encodeState(node, b.lastAddr) if err != nil { return 0, err @@ -194,10 +218,15 @@ func (u *unfinishedNodes) Reset() { u.pushEmpty(false) } -func newUnfinishedNodes(p *builderNodePool) *unfinishedNodes { +func newUnfinishedNodes(p *builderNodePool, opts *BuilderOpts) *unfinishedNodes { + initialSize := opts.UnfinishedNodesStackSize + if initialSize <= 0 { + initialSize = defaultBuilderOpts.UnfinishedNodesStackSize + } + rv := &unfinishedNodes{ - stack: make([]*builderNodeUnfinished, 0, 64), - cache: make([]builderNodeUnfinished, 64), + stack: make([]*builderNodeUnfinished, 0, initialSize), + cache: make([]builderNodeUnfinished, initialSize), builderNodePool: p, } rv.pushEmpty(false) @@ -416,21 +445,40 @@ func outputCat(l, r uint64) uint64 { return l + r } +// BuilderNodePoolingConfig is the configuration struct for the BuilderNodePool. +// Note that unsafe.SizeOf(transition{}) is 24 bytes and unsafe.SizeOf(BuilderNode{}) +// is 48 bytes so the maximum amount of memory used by the pool should be approximately +// MaxSize * (48 + 24 * MaxTransitionSize) not including the extra space required +// by the G.C. Note if an F.S.T construction never requires this many BuilderNodes then +// the maximum size of the pool will never be reached as it is allocated lazily. +type BuilderNodePoolingConfig struct { + // Maximum number of builder nodes can be retained in the pool. + MaxSize int + // Maximum size of the transitions array for an individual builder node. + MaxTransitionSize int +} + // builderNodePool pools builderNodes using a singly linked list. // -// NB: builderNode lifecylce is described by the following interactions - -// +------------------------+ +----------------------+ -// | Unfinished Nodes | Transfer once | Registry | -// |(not frozen builderNode)|-----builderNode is ------->| (frozen builderNode) | -// +------------------------+ marked frozen +----------------------+ -// ^ | -// | | -// | Put() -// | Get() on +-------------------+ when -// +-new char--------| builderNode Pool |<-----------evicted -// +-------------------+ +// The lifecycle is as follows: +// +// 1. Builder retrieves a node from the pool using Get() whenever it needs one. +// 2. After a node is compiled it is either: +// a. Discarded and immediately returned to the pool. +// b. Transferred to the registry (which assumes ownership of it) and will +// return it to the pool when it evicts the node to make room for another, +// or when the entire registry is Reset(). type builderNodePool struct { - head *builderNode + config BuilderNodePoolingConfig + size int + head *builderNode +} + +func newBuilderNodePool(config BuilderNodePoolingConfig) *builderNodePool { + // Pool will lazy alloc. + return &builderNodePool{ + config: config, + } } func (p *builderNodePool) Get() *builderNode { @@ -439,14 +487,20 @@ func (p *builderNodePool) Get() *builderNode { } head := p.head p.head = p.head.next + p.size-- return head } func (p *builderNodePool) Put(v *builderNode) { - if v == nil { + if v == nil || + p.size >= p.config.MaxSize || + cap(v.trans) > p.config.MaxTransitionSize { + // Don't store nil or allow the pool to violate its config. return } + v.reset() v.next = p.head p.head = v + p.size++ } diff --git a/registry.go b/registry.go index f5b9b4d..896dca6 100644 --- a/registry.go +++ b/registry.go @@ -19,6 +19,13 @@ type registryCell struct { node *builderNode } +// Registry is used as a form of LRU so that the number of nodes that need to be kept +// in memory is reduced. When the builder is compiling the FST and is presented with +// compiling a given node, it can check the registry to see if an equivalent node has +// already been compiled. If so, the registry will return the address of the already +// compiled node and the builder can use that. If an equivalent node has not already +// been compiled (or was, but has since been evicted from the LRU), the builder will +// recompile it into the encoder and then add it to the registry for future use. type registry struct { builderNodePool *builderNodePool table []registryCell @@ -40,7 +47,12 @@ func newRegistry(p *builderNodePool, tableSize, mruSize int) *registry { func (r *registry) Reset() { var empty registryCell for i := range r.table { - r.builderNodePool.Put(r.table[i].node) + if r.table[i].node != nil { + // Only try and return to the pool if the node actually exists to + // avoid excessive function call overhead in the scenario where many + // of the cells are empty. + r.builderNodePool.Put(r.table[i].node) + } r.table[i] = empty } } @@ -77,6 +89,10 @@ func (r *registry) hash(b *builderNode) int { type registryCache []registryCell +// The registry is responsible for returning BuilderNodes that it controls to the BuilderNodePool once +// they are evicted. As a result, all the codepaths in the entry method that return false (entry was not +// found and the registry is assuming ownership of this node) will return the corresponding evicted node to +// the builderNodePool. func (r registryCache) entry(node *builderNode, pool *builderNodePool) (bool, int, *registryCell) { if len(r) == 1 { if r[0].node != nil && r[0].node.equiv(node) { @@ -93,6 +109,7 @@ func (r registryCache) entry(node *builderNode, pool *builderNodePool) (bool, in return true, addr, nil } } + // no match last := len(r) - 1 pool.Put(r[last].node) diff --git a/vellum.go b/vellum.go index b2537b3..320f607 100644 --- a/vellum.go +++ b/vellum.go @@ -54,9 +54,11 @@ var ErrIteratorDone = errors.New("iterator-done") // BuilderOpts is a structure to let advanced users customize the behavior // of the builder and some aspects of the generated FST. type BuilderOpts struct { - Encoder int - RegistryTableSize int - RegistryMRUSize int + Encoder int + RegistryTableSize int + RegistryMRUSize int + UnfinishedNodesStackSize int + BuilderNodePoolingConfig BuilderNodePoolingConfig } // New returns a new Builder which will stream out the diff --git a/vellum_test.go b/vellum_test.go index 652d9b6..20e00f9 100644 --- a/vellum_test.go +++ b/vellum_test.go @@ -149,9 +149,26 @@ func TestRoundTripSimple(t *testing.T) { } func TestRoundTripThousand(t *testing.T) { - dataset := thousandTestWords - randomThousandVals := randomValues(dataset) + b, err := New(nil, nil) + if err != nil { + t.Fatalf("error creating builder: %v", err) + } + testRoundTripThousand(t, b) +} + +func TestRoundTripThousandBuilderIsReusable(t *testing.T) { + b, err := New(nil, nil) + if err != nil { + t.Fatalf("error creating builder: %v", err) + } + + for i := 0; i < 1000; i++ { + testRoundTripThousand(t, b) + } +} + +func testRoundTripThousand(t *testing.T, b *Builder) { f, err := ioutil.TempFile("", "vellum") if err != nil { t.Fatal(err) @@ -167,11 +184,14 @@ func TestRoundTripThousand(t *testing.T) { } }() - b, err := New(f, nil) + err = b.Reset(f) if err != nil { - t.Fatalf("error creating builder: %v", err) + t.Fatalf("error resetting builder: %v", err) } + dataset := thousandTestWords + randomThousandVals := randomValues(dataset) + err = insertStrings(b, dataset, randomThousandVals) if err != nil { t.Fatalf("error inserting thousand words: %v", err)