From 8e80b678bc0f79b3099a49a63970f9dfb851a333 Mon Sep 17 00:00:00 2001
From: Mariusz Glebocki
Date: Sun, 22 Oct 2023 02:49:51 +0200
Subject: [PATCH] Parallel builds with concatenated .cpp

---
 bin/verilator_includer2 | 102 ++++++++++++++++++++++++++++++++++++++++
 include/verilated.mk.in |  48 +++++++++++++++----
 2 files changed, 140 insertions(+), 10 deletions(-)
 create mode 100755 bin/verilator_includer2

diff --git a/bin/verilator_includer2 b/bin/verilator_includer2
new file mode 100755
--- /dev/null
+++ b/bin/verilator_includer2
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""Distribute source files into size-balanced include lists.
+
+Usage: verilator_includer2 LIST_FILE BUCKETS_COUNT OUTPUT_TEMPLATE
+
+LIST_FILE holds whitespace-separated paths. For each bucket N in
+1..BUCKETS_COUNT, a file named OUTPUT_TEMPLATE with every '%' replaced by N is
+written, containing one '#include <path>' line per file assigned to it.
+"""
+import sys
+from collections import deque
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+@dataclass(slots=True)
+class BucketData:
+    """One output bucket: its 1-based id, total byte size and member files."""
+
+    bucket_id: int
+    size: int = 0
+    filenames: list[str] = field(default_factory=list)
+
+    def __iter__(self):
+        # Allows unpacking: bucket_id, size, filenames = bucket
+        return iter((self.bucket_id, self.size, self.filenames))
+
+
+def get_list(fn: Path) -> tuple[int, list[tuple[int, Path]]]:
+    """Read whitespace-separated paths from fn and stat each of them.
+
+    Returns (total size in bytes, [(size, path), ...] in listing order).
+    Raises OSError if fn or any listed file cannot be accessed.
+    """
+    total_size = 0
+    result: list[tuple[int, Path]] = []
+    # str.split() without arguments never yields empty or padded tokens.
+    for name in fn.read_text().split():
+        path = Path(name)
+        size = path.stat().st_size
+        total_size += size
+        result.append((size, path))
+
+    return (total_size, result)
+
+
+def main():
+    """Split the files listed in argv[1] into argv[2] buckets and write them.
+
+    Returns 0 on success. Raises ValueError on invalid arguments.
+    """
+    if len(sys.argv) < 4:
+        raise ValueError(
+            f"Usage: {sys.argv[0]} LIST_FILE BUCKETS_COUNT OUTPUT_TEMPLATE")
+    input_list_file = Path(sys.argv[1])
+    buckets_count = int(sys.argv[2])
+    if buckets_count <= 0:
+        raise ValueError(f"Arg 2: Expected buckets count, got: {sys.argv[2]}")
+    output_name_template = sys.argv[3]
+    if "%" not in output_name_template:
+        raise ValueError(f"Arg 3: template must contain '%': {sys.argv[3]}")
+
+    total_size, input_list = get_list(input_list_file)
+
+    # Files bigger than the naive average are left out of the target size:
+    # it is recomputed from the remaining bytes and remaining bucket count.
+    ideal_bucket_size = total_size // buckets_count
+    huge_sizes = [size for size, _ in input_list if size > ideal_bucket_size]
+    ideal_bucket_size = (max(1, total_size - sum(huge_sizes))
+                         // max(1, buckets_count - len(huge_sizes)))
+
+    # deque gives O(1) removal from the front (list.pop(0) is O(n)).
+    pending = deque(input_list)
+    buckets = [BucketData(i + 1) for i in range(buckets_count)]
+    for bucket in buckets:
+        # Keep taking files while that moves the bucket closer to the target
+        # size; an empty bucket always takes the next file.
+        while pending:
+            next_size, next_fn = pending[0]
+            diff_now = abs(ideal_bucket_size - bucket.size)
+            diff_next = abs(ideal_bucket_size - bucket.size - next_size)
+            if bucket.size != 0 and diff_now <= diff_next:
+                break
+            bucket.size += next_size
+            bucket.filenames.append(str(next_fn))
+            pending.popleft()
+
+    # Files left over after the last bucket stopped growing go into it anyway.
+    for next_size, next_fn in pending:
+        buckets[-1].size += next_size
+        buckets[-1].filenames.append(str(next_fn))
+
+    for bucket_id, _, filenames in buckets:
+        output_list_file = Path(output_name_template.replace("%", str(bucket_id)))
+        with output_list_file.open("w") as f:
+            f.write("\n".join([f"#include <{fn}>" for fn in filenames]) + "\n")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/include/verilated.mk.in b/include/verilated.mk.in
index 59a94486f2f..f52b6ab297e 100644
--- a/include/verilated.mk.in
+++ b/include/verilated.mk.in
@@ -51,6 +51,7 @@ CFG_LDLIBS_THREADS = @CFG_LDLIBS_THREADS@
 
 VERILATOR_COVERAGE = $(PERL) $(VERILATOR_ROOT)/bin/verilator_coverage
 VERILATOR_INCLUDER = $(PYTHON3) $(VERILATOR_ROOT)/bin/verilator_includer
+VERILATOR_INCLUDER2 = $(PYTHON3) $(VERILATOR_ROOT)/bin/verilator_includer2
 VERILATOR_CCACHE_REPORT = $(PYTHON3) $(VERILATOR_ROOT)/bin/verilator_ccache_report
 
 ######################################################################
@@ -202,20 +203,46 @@ VK_GLOBAL_OBJS = $(addsuffix .o, $(VM_GLOBAL_FAST) $(VM_GLOBAL_SLOW))
 $(VK_GLOBAL_OBJS): $(VM_PREFIX).mk
 
 ifneq ($(VM_PARALLEL_BUILDS),1)
-  # Fast build for small designs: All .cpp files in one fell swoop. This
-  # saves total compute, but can be slower if only a little changes. It is
-  # also a lot slower for medium to large designs when the speed of the C
-  # compiler dominates, which in this mode is not parallelizable.
+# Fast build for small designs: All .cpp files in one fell swoop. This
+# saves total compute, but can be slower if only a little changes. It is
+# also a lot slower for medium to large designs when the speed of the C
+# compiler dominates, which in this mode is not parallelizable.
 
-  VK_OBJS += $(VM_PREFIX)__ALL.o
-  $(VM_PREFIX)__ALL.cpp: $(addsuffix .cpp, $(VM_FAST) $(VM_SLOW))
+VK_OBJS += $(VM_PREFIX)__ALL.o
+
+$(VM_PREFIX)__ALL.cpp: $(addsuffix .cpp, $(VM_FAST) $(VM_SLOW))
 	$(VERILATOR_INCLUDER) -DVL_INCLUDE_OPT=include $^ > $@
-  all_cpp: $(VM_PREFIX)__ALL.cpp
+
+.PHONY: all_cpp
+all_cpp: $(VM_PREFIX)__ALL.cpp
 else
-  # Parallel build: Each .cpp file by itself. This can be somewhat slower for
-  # very small designs and examples, but is a lot faster for large designs.
 
-  VK_OBJS += $(VK_OBJS_FAST) $(VK_OBJS_SLOW)
+# TODO(mglb): rename to something related to .cpp files concatenation
+# VM_PARALLEL_JOBS ?= 20
+
+ifneq ($(filter-out 0 1,$(VM_PARALLEL_JOBS)),)
+
+_VK_JOB_IDS := $(shell seq 1 ${VM_PARALLEL_JOBS})
+
+.INTERMEDIATE: fast.list slow.list
+fast.list: $(VM_FAST:%=%.cpp)
+slow.list: $(VM_SLOW:%=%.cpp)
+
+fast.list slow.list:
+	$(file >$@,$(strip $^))
+
+$(foreach id,$(_VK_JOB_IDS),%_$(id)_$(VM_PARALLEL_JOBS).cpp): %.list
+	$(VERILATOR_INCLUDER2) $< ${VM_PARALLEL_JOBS} $(<:%.list=%)_%_${VM_PARALLEL_JOBS}.cpp
+
+VK_OBJS_FAST = $(foreach job_id,${_VK_JOB_IDS},fast_${job_id}_${VM_PARALLEL_JOBS}.o)
+VK_OBJS_SLOW = $(foreach job_id,${_VK_JOB_IDS},slow_${job_id}_${VM_PARALLEL_JOBS}.o)
+
+endif
+
+# Parallel build: Each .cpp file by itself. This can be somewhat slower for
+# very small designs and examples, but is a lot faster for large designs.
+
+VK_OBJS += $(VK_OBJS_FAST) $(VK_OBJS_SLOW)
 endif
 
 # When archiving just objects (.o), use single $(AR) run
@@ -344,6 +371,7 @@ debug-make::
 	@echo VM_GLOBAL_FAST: $(VM_GLOBAL_FAST)
 	@echo VM_GLOBAL_SLOW: $(VM_GLOBAL_SLOW)
 	@echo VM_PARALLEL_BUILDS: $(VM_PARALLEL_BUILDS)
+	@echo VM_PARALLEL_JOBS: $(VM_PARALLEL_JOBS)
 	@echo VM_PREFIX: $(VM_PREFIX)
 	@echo VM_SUPPORT_FAST: $(VM_SUPPORT_FAST)
 	@echo VM_SUPPORT_SLOW: $(VM_SUPPORT_SLOW)