Fast import scanning (#195)

Adds an experimental new package for fast scanning of imports. It skips all of the parsing and processing work and does only enough lexical analysis to identify import statements and collect all referenced files. Co-authored-by: Josh Humphries <[email protected]>
bufbuild · Nov 8, 2023 · 146b831 · 146b831
1 parent 0897124
commit 146b831
Show file tree

Hide file tree

Showing 3 changed files with 608 additions and 0 deletions.
diff --git a/internal/benchmarks/benchmark_test.go b/internal/benchmarks/benchmark_test.go
@@ -32,12 +32,14 @@ import (
 	"sort"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"testing"
 	"time"
 
 	"github.com/jhump/protoreflect/desc"
 	"github.com/jhump/protoreflect/desc/protoparse"
 	"github.com/stretchr/testify/require"
+	"golang.org/x/sync/errgroup"
 	"google.golang.org/protobuf/proto"
 	"google.golang.org/protobuf/types/descriptorpb"
 
@@ -46,6 +48,7 @@ import (
 	"github.com/bufbuild/protocompile/internal/protoc"
 	"github.com/bufbuild/protocompile/linker"
 	"github.com/bufbuild/protocompile/parser"
+	"github.com/bufbuild/protocompile/parser/imports"
 	"github.com/bufbuild/protocompile/protoutil"
 	"github.com/bufbuild/protocompile/reporter"
 )
@@ -353,6 +356,93 @@ func benchmarkGoogleapisProtoparse(b *testing.B, factory func() *protoparse.Pars
 	}
 }
 
+// BenchmarkGoogleapisScanImports measures imports.ScanForImports over every file
+// named in googleapisSources. Each benchmark iteration runs a three-stage
+// pipeline inside one errgroup: a producer that emits file paths, a pool of
+// workers that open and scan the files, and an accumulator that gathers the
+// per-file import lists into a map.
+func BenchmarkGoogleapisScanImports(b *testing.B) {
+	// A negative argument queries GOMAXPROCS without changing it. The worker
+	// count is that value, capped at the number of CPUs.
+	par := runtime.GOMAXPROCS(-1)
+	cpus := runtime.NumCPU()
+	if par > cpus {
+		par = cpus
+	}
+	// entry is one worker result: a file path and the imports found in it.
+	type entry struct {
+		filename string
+		imports  []string
+	}
+	for i := 0; i < b.N; i++ {
+		workCh := make(chan string, par)
+		resultsCh := make(chan entry, par)
+		// Every stage also selects on ctx.Done(), so the first error returned
+		// by any goroutine cancels ctx and unblocks all the others.
+		grp, ctx := errgroup.WithContext(context.Background())
+		// Producer: sends the path of every source file, then closes workCh so
+		// the workers know there is no more work.
+		grp.Go(func() error {
+			defer close(workCh)
+			for _, name := range googleapisSources {
+				select {
+				case workCh <- filepath.Join(googleapisDir, name):
+				case <-ctx.Done():
+					return ctx.Err()
+				}
+			}
+			return nil
+		})
+		// numProcs counts the workers that have not yet returned. It lets the
+		// last worker to exit close resultsCh exactly once.
+		var numProcs atomic.Int32
+		numProcs.Store(int32(par))
+		for i := 0; i < par; i++ {
+			// Workers: receive a path, scan the file for imports, and send the
+			// result to the accumulator.
+			grp.Go(func() error {
+				defer func() {
+					if numProcs.Add(-1) == 0 {
+						// Last worker to leave closes the channel; this runs on
+						// both the success and the error/cancellation paths.
+						close(resultsCh)
+					}
+				}()
+				for {
+					var filename string
+					select {
+					case name, ok := <-workCh:
+						if !ok {
+							// workCh closed and drained: no more files.
+							return nil
+						}
+						filename = name
+					case <-ctx.Done():
+						return ctx.Err()
+					}
+					r, err := os.Open(filename)
+					var imps []string
+					if err != nil {
+						return err
+					}
+					imps, err = imports.ScanForImports(r)
+					// The file is closed before the scan error is checked, so
+					// it is released on both paths. The close error is ignored.
+					_ = r.Close()
+					if err != nil {
+						return err
+					}
+					select {
+					case resultsCh <- entry{filename: filename, imports: imps}:
+					case <-ctx.Done():
+						return ctx.Err()
+					}
+				}
+			})
+		}
+		// results is written only by the accumulator goroutine below.
+		results := make(map[string][]string, len(googleapisSources))
+		grp.Go(func() error {
+			// Accumulator: collects results until resultsCh is closed by the
+			// last worker, or until ctx is cancelled.
+			for {
+				select {
+				case entry, ok := <-resultsCh:
+					if !ok {
+						return nil
+					}
+					results[entry.filename] = entry.imports
+				case <-ctx.Done():
+					return ctx.Err()
+				}
+			}
+		})
+
+		// Wait blocks until the producer, all workers, and the accumulator
+		// have returned, and yields the first non-nil error among them.
+		err := grp.Wait()
+		require.NoError(b, err)
+	}
+}
+
 func BenchmarkGoogleapisProtoc(b *testing.B) {
 	benchmarkGoogleapisProtoc(b, "--include_source_info")
 }

diff --git a/parser/imports/fast_imports.go b/parser/imports/fast_imports.go
@@ -0,0 +1,73 @@
+// Copyright 2020-2023 Buf Technologies, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package imports
+
+import (
+	"io"
+	"strings"
+)
+
+// closeSymbol maps each opening delimiter token to the token that closes it.
+// ScanForImports pushes the mapped closing token onto its context stack when
+// it sees an opener, so it can tell when the matching closer arrives.
+var closeSymbol = map[tokenType]tokenType{
+	openParenToken:   closeParenToken,
+	openBraceToken:   closeBraceToken,
+	openBracketToken: closeBracketToken,
+	openAngleToken:   closeAngleToken,
+}
+
+// ScanForImports scans the given reader, which should contain Protobuf source, and
+// returns the imports declared in the file, in the order they appear. Only enough
+// lexical analysis is done to recognize import statements; the source is not
+// otherwise parsed or validated.
+//
+// An import statement is recognized as the identifier "import" outside of any
+// (), {}, [], or <> pair, optionally followed by a single "public" or "weak"
+// modifier, followed by one or more string literals. Adjacent string literals are
+// concatenated into one import path. The statement ends at the first token that
+// does not belong to it (normally ";") or at the end of the input. An "import"
+// that is not followed by any string literal contributes nothing to the result.
+//
+// It returns an error if there is an I/O error reading from r. In the event of
+// such an error, it will still return a slice of imports that contains as many
+// imports as were found before the I/O error occurred.
+func ScanForImports(r io.Reader) ([]string, error) {
+	var (
+		imports []string
+		// contextStack holds the closing tokens still expected, innermost last.
+		// It is empty only at the top level of the file.
+		contextStack []tokenType
+		// inImport is true from an "import" keyword until its statement ends.
+		inImport bool
+		// sawModifier is true once "public" or "weak" was consumed for the
+		// current import, so that at most one modifier is accepted.
+		sawModifier bool
+		// parts collects the string literals of the current import.
+		parts []string
+	)
+	// endImport records the pending import, if any string literal was collected
+	// for it, and resets the import state.
+	endImport := func() {
+		if len(parts) > 0 {
+			imports = append(imports, strings.Join(parts, ""))
+		}
+		inImport, sawModifier, parts = false, false, nil
+	}
+
+	lexer := newLexer(r)
+	for {
+		token, text, err := lexer.Lex()
+		if err != nil {
+			return imports, err
+		}
+		if token == eofToken {
+			// Any other token after the string literals terminates an import,
+			// so end of input does too; otherwise a trailing import that lacks
+			// its ";" would be silently lost.
+			endImport()
+			return imports, nil
+		}
+
+		if inImport {
+			switch {
+			case token == stringToken:
+				parts = append(parts, text.(string))
+			case token == identifierToken && len(parts) == 0 && !sawModifier &&
+				(text == "public" || text == "weak"):
+				// "import public ..." and "import weak ..." are valid syntax;
+				// the modifier must not terminate the statement.
+				sawModifier = true
+			default:
+				endImport()
+			}
+		}
+
+		// The token that ended an import is still examined below: it may open
+		// or close a context, or be the "import" keyword of the next statement.
+		switch token {
+		case openParenToken, openBraceToken, openBracketToken, openAngleToken:
+			contextStack = append(contextStack, closeSymbol[token])
+		case closeParenToken, closeBraceToken, closeBracketToken, closeAngleToken:
+			// Closers that do not match the innermost opener are ignored.
+			if n := len(contextStack); n > 0 && contextStack[n-1] == token {
+				contextStack = contextStack[:n-1]
+			}
+		case identifierToken:
+			if text == "import" && len(contextStack) == 0 {
+				inImport = true
+			}
+		}
+	}
+}