Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

configure stack size #3

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
96 changes: 75 additions & 21 deletions builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,21 @@ import (
)

var defaultBuilderOpts = &BuilderOpts{
Encoder: 1,
RegistryTableSize: 10000,
RegistryMRUSize: 2,
Encoder: 1,
RegistryTableSize: 10000,
RegistryMRUSize: 2,
UnfinishedNodesStackSize: 64,
BuilderNodePoolingConfig: BuilderNodePoolingConfig{
// This value should always be significantly larger than
// RegistryTableSize * RegistryMRUSize because that defines
// how many items can be stored in the registry, and if the
// pool's MaxSize is not much larger than that, then all the
// BuilderNodes will end up stuck in the registry and not
// returned to the pool which will cause the building process
// to begin allocating a lot.
MaxSize: 100000,
MaxTransitionSize: 32,
},
}

// A Builder is used to build a new FST. When possible data is
Expand Down Expand Up @@ -50,9 +62,10 @@ func newBuilder(w io.Writer, opts *BuilderOpts) (*Builder, error) {
if opts == nil {
opts = defaultBuilderOpts
}
builderNodePool := &builderNodePool{}

builderNodePool := newBuilderNodePool(opts.BuilderNodePoolingConfig)
rv := &Builder{
unfinished: newUnfinishedNodes(builderNodePool),
unfinished: newUnfinishedNodes(builderNodePool, opts),
registry: newRegistry(builderNodePool, opts.RegistryTableSize, opts.RegistryMRUSize),
builderNodePool: builderNodePool,
opts: opts,
Expand Down Expand Up @@ -83,6 +96,7 @@ func (b *Builder) Reset(w io.Writer) error {
if err != nil {
return err
}

return nil
}

Expand Down Expand Up @@ -156,12 +170,22 @@ func (b *Builder) compileFrom(iState int) error {
func (b *Builder) compile(node *builderNode) (int, error) {
if node.final && len(node.trans) == 0 &&
node.finalOutput == 0 {
// We're done with this node so it's safe to put it back in the pool.
b.builderNodePool.Put(node)
return 0, nil
}
found, addr, entry := b.registry.entry(node)
if found {
// This node already existed in the registry (and thus the registry
// did not assume ownership of it) so it's safe to put it back in
// the pool.
b.builderNodePool.Put(node)
return addr, nil
}
// If the node was not found in the registry, then the registry will
// have assumed ownership of it and is responsible for returning it
// to the pool.

addr, err := b.encoder.encodeState(node, b.lastAddr)
if err != nil {
return 0, err
Expand Down Expand Up @@ -194,10 +218,15 @@ func (u *unfinishedNodes) Reset() {
u.pushEmpty(false)
}

func newUnfinishedNodes(p *builderNodePool) *unfinishedNodes {
func newUnfinishedNodes(p *builderNodePool, opts *BuilderOpts) *unfinishedNodes {
initialSize := opts.UnfinishedNodesStackSize
if initialSize <= 0 {
initialSize = defaultBuilderOpts.UnfinishedNodesStackSize
}

rv := &unfinishedNodes{
stack: make([]*builderNodeUnfinished, 0, 64),
cache: make([]builderNodeUnfinished, 64),
stack: make([]*builderNodeUnfinished, 0, initialSize),
cache: make([]builderNodeUnfinished, initialSize),
builderNodePool: p,
}
rv.pushEmpty(false)
Expand Down Expand Up @@ -416,21 +445,40 @@ func outputCat(l, r uint64) uint64 {
return l + r
}

// BuilderNodePoolingConfig is the configuration struct for the builderNodePool.
//
// Memory sizing (figures as stated by the original author; TODO confirm on the
// target platform): unsafe.Sizeof(transition{}) is 24 bytes and
// unsafe.Sizeof(builderNode{}) is 48 bytes, so the maximum amount of memory
// used by the pool should be approximately
//
//	MaxSize * (48 + 24 * MaxTransitionSize)
//
// not including the extra space required by the GC. If an FST construction
// never requires this many builderNodes then the maximum size of the pool will
// never be reached, as it is allocated lazily.
//
// NOTE(review): with the zero value of this struct an empty pool already
// satisfies size >= MaxSize, so Put retains nothing; callers that build their
// own BuilderOpts should set both fields explicitly.
type BuilderNodePoolingConfig struct {
// Maximum number of builder nodes that can be retained in the pool. Put
// drops a node instead of storing it once the pool already holds this many.
MaxSize int
// Maximum size of the transitions array for an individual builder node.
// Put drops any node whose cap(trans) exceeds this value.
MaxTransitionSize int
}

// builderNodePool pools builderNodes using a singly linked list.
//
// NB: builderNode lifecycle is described by the following interactions -
// +------------------------+ +----------------------+
// | Unfinished Nodes | Transfer once | Registry |
// |(not frozen builderNode)|-----builderNode is ------->| (frozen builderNode) |
// +------------------------+ marked frozen +----------------------+
// ^ |
// | |
// | Put()
// | Get() on +-------------------+ when
// +-new char--------| builderNode Pool |<-----------evicted
// +-------------------+
// The lifecycle is as follows:
//
// 1. Builder retrieves a node from the pool using Get() whenever it needs one.
// 2. After a node is compiled it is either:
// a. Discarded and immediately returned to the pool.
// b. Transferred to the registry (which assumes ownership of it) and will
// return it to the pool when it evicts the node to make room for another,
// or when the entire registry is Reset().
type builderNodePool struct {
head *builderNode
config BuilderNodePoolingConfig
size int
head *builderNode
}

// newBuilderNodePool returns an empty builderNodePool that carries the
// supplied pooling configuration.
func newBuilderNodePool(config BuilderNodePoolingConfig) *builderNodePool {
	// No nodes are preallocated here; the pool allocates lazily.
	pool := new(builderNodePool)
	pool.config = config
	return pool
}

func (p *builderNodePool) Get() *builderNode {
Expand All @@ -439,14 +487,20 @@ func (p *builderNodePool) Get() *builderNode {
}
head := p.head
p.head = p.head.next
p.size--
return head
}

func (p *builderNodePool) Put(v *builderNode) {
if v == nil {
if v == nil ||
p.size >= p.config.MaxSize ||
cap(v.trans) > p.config.MaxTransitionSize {
// Don't store nil or allow the pool to violate its config.
return
}

v.reset()
v.next = p.head
p.head = v
p.size++
}
19 changes: 18 additions & 1 deletion registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@ type registryCell struct {
node *builderNode
}

// Registry is used as a form of LRU so that the number of nodes that need to be kept
// in memory is reduced. When the builder is compiling the FST and is presented with
// compiling a given node, it can check the registry to see if an equivalent node has
// already been compiled. If so, the registry will return the address of the already
// compiled node and the builder can use that. If an equivalent node has not already
// been compiled (or was, but has since been evicted from the LRU), the builder will
// recompile it into the encoder and then add it to the registry for future use.
type registry struct {
builderNodePool *builderNodePool
table []registryCell
Expand All @@ -40,7 +47,12 @@ func newRegistry(p *builderNodePool, tableSize, mruSize int) *registry {
func (r *registry) Reset() {
var empty registryCell
for i := range r.table {
r.builderNodePool.Put(r.table[i].node)
if r.table[i].node != nil {
// Only try to return to the pool if the node actually exists to
// avoid excessive function call overhead in the scenario where many
// of the cells are empty.
r.builderNodePool.Put(r.table[i].node)
}
r.table[i] = empty
}
}
Expand Down Expand Up @@ -77,6 +89,10 @@ func (r *registry) hash(b *builderNode) int {

type registryCache []registryCell

// The registry is responsible for returning BuilderNodes that it controls to the BuilderNodePool once
// they are evicted. As a result, all the codepaths in the entry method that return false (entry was not
// found and the registry is assuming ownership of this node) will return the corresponding evicted node to
// the builderNodePool.
func (r registryCache) entry(node *builderNode, pool *builderNodePool) (bool, int, *registryCell) {
if len(r) == 1 {
if r[0].node != nil && r[0].node.equiv(node) {
Expand All @@ -93,6 +109,7 @@ func (r registryCache) entry(node *builderNode, pool *builderNodePool) (bool, in
return true, addr, nil
}
}

// no match
last := len(r) - 1
pool.Put(r[last].node)
Expand Down
8 changes: 5 additions & 3 deletions vellum.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,11 @@ var ErrIteratorDone = errors.New("iterator-done")
// BuilderOpts is a structure to let advanced users customize the behavior
// of the builder and some aspects of the generated FST.
type BuilderOpts struct {
Encoder int
RegistryTableSize int
RegistryMRUSize int
Encoder int
RegistryTableSize int
RegistryMRUSize int
UnfinishedNodesStackSize int
BuilderNodePoolingConfig BuilderNodePoolingConfig
}

// New returns a new Builder which will stream out the
Expand Down
28 changes: 24 additions & 4 deletions vellum_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,26 @@ func TestRoundTripSimple(t *testing.T) {
}

func TestRoundTripThousand(t *testing.T) {
dataset := thousandTestWords
randomThousandVals := randomValues(dataset)
b, err := New(nil, nil)
if err != nil {
t.Fatalf("error creating builder: %v", err)
}

testRoundTripThousand(t, b)
}

// TestRoundTripThousandBuilderIsReusable creates a single Builder and drives
// the thousand-word round trip through it 1000 times in a row, checking that
// one Builder instance can be used for many consecutive builds.
func TestRoundTripThousandBuilderIsReusable(t *testing.T) {
	const rounds = 1000

	builder, err := New(nil, nil)
	if err != nil {
		t.Fatalf("error creating builder: %v", err)
	}

	// Every round hands the same Builder to the shared round-trip helper.
	for round := 0; round < rounds; round++ {
		testRoundTripThousand(t, builder)
	}
}

func testRoundTripThousand(t *testing.T, b *Builder) {
f, err := ioutil.TempFile("", "vellum")
if err != nil {
t.Fatal(err)
Expand All @@ -167,11 +184,14 @@ func TestRoundTripThousand(t *testing.T) {
}
}()

b, err := New(f, nil)
err = b.Reset(f)
if err != nil {
t.Fatalf("error creating builder: %v", err)
t.Fatalf("error resetting builder: %v", err)
}

dataset := thousandTestWords
randomThousandVals := randomValues(dataset)

err = insertStrings(b, dataset, randomThousandVals)
if err != nil {
t.Fatalf("error inserting thousand words: %v", err)
Expand Down