diff --git a/bin/verilator_includer2 b/bin/verilator_includer2
new file mode 100755
index 00000000000..740b68655ec
--- /dev/null
+++ b/bin/verilator_includer2
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+"""Split a list of source files into size-balanced '#include' wrapper files.
+
+Usage: verilator_includer2 LIST_FILE BUCKETS_COUNT OUTPUT_TEMPLATE
+
+LIST_FILE contains whitespace-separated file paths. The files are assigned,
+in the listed order, to BUCKETS_COUNT buckets of similar total size. For each
+bucket one file is written: its name is OUTPUT_TEMPLATE with every '%'
+replaced by the 1-based bucket number, and it holds one '#include <path>'
+line per file assigned to the bucket.
+"""
+# Postponed evaluation of annotations: the 'list[str]' style hints below are
+# then never evaluated at runtime, so they do not require Python 3.9.
+from __future__ import annotations
+
+import sys
+from dataclasses import dataclass, field
+from pathlib import Path
+
+
+# No 'slots=True': that dataclass() argument requires Python 3.10.
+@dataclass
+class BucketData:
+    """One bucket: 1-based id, total size in bytes and assigned file names."""
+    bucket_id: int
+    size: int = 0
+    filenames: list[str] = field(default_factory=list)
+
+    def __iter__(self):
+        # Allows unpacking: bucket_id, size, filenames = bucket
+        return iter((self.bucket_id, self.size, self.filenames))
+
+
+def get_list(fn: Path) -> tuple[int, list[tuple[int, Path]]]:
+    """Read the paths listed in `fn` and look up the size of each file.
+
+    Returns (total_size, entries); entries holds one (size, path) tuple per
+    listed file, in the listed order. Raises OSError when `fn` or a listed
+    file cannot be accessed.
+    """
+    total_size = 0
+    result: list[tuple[int, Path]] = []
+    # split() without arguments never yields empty or padded items
+    for name in fn.read_text().split():
+        path = Path(name)
+        size = path.stat().st_size
+        total_size += size
+        result.append((size, path))
+    return (total_size, result)
+
+
+def split_into_buckets(total_size: int, input_list: list[tuple[int, Path]],
+                       buckets_count: int) -> list[BucketData]:
+    """Distribute `input_list` over `buckets_count` buckets, keeping its order.
+
+    Each bucket takes consecutive entries for as long as one more entry brings
+    its size closer to the ideal size; an empty bucket always takes one.
+    Entries left over after the last bucket are appended to the last bucket.
+    """
+    # Files bigger than an even share are left out when computing the ideal
+    # size of the remaining buckets.
+    even_share = total_size // buckets_count
+    huge_sizes = [size for size, _ in input_list if size > even_share]
+    ideal_bucket_size = (max(1, total_size - sum(huge_sizes))
+                         // max(1, buckets_count - len(huge_sizes)))
+
+    buckets = [BucketData(i + 1) for i in range(buckets_count)]
+    pos = 0  # Index of the next entry to assign (instead of O(n) pop(0))
+    for bucket in buckets:
+        while pos < len(input_list):
+            next_size, next_fn = input_list[pos]
+            diff_now = abs(ideal_bucket_size - bucket.size)
+            diff_next = abs(ideal_bucket_size - bucket.size - next_size)
+            if bucket.size != 0 and diff_now <= diff_next:
+                break
+            bucket.size += next_size
+            bucket.filenames.append(str(next_fn))
+            pos += 1
+
+    for next_size, next_fn in input_list[pos:]:
+        buckets[-1].size += next_size
+        buckets[-1].filenames.append(str(next_fn))
+    return buckets
+
+
+def main():
+    """Command line entry point; returns the process exit status."""
+    if len(sys.argv) < 4:
+        print(f"Usage: {sys.argv[0]} LIST_FILE BUCKETS_COUNT OUTPUT_TEMPLATE",
+              file=sys.stderr)
+        return 2
+    input_list_file = Path(sys.argv[1])
+    buckets_count = int(sys.argv[2])
+    if buckets_count <= 0:
+        raise ValueError(f"Arg 2: Expected buckets count, got: {sys.argv[2]}")
+    output_name_template = sys.argv[3]
+    if "%" not in output_name_template:
+        raise ValueError(f"Arg 3: template must contain '%': {sys.argv[3]}")
+
+    total_size, input_list = get_list(input_list_file)
+    buckets = split_into_buckets(total_size, input_list, buckets_count)
+
+    for bucket_id, _, filenames in buckets:
+        output_list_file = Path(output_name_template.replace("%", str(bucket_id)))
+        with output_list_file.open("w") as f:
+            f.write("\n".join([f"#include <{fn}>" for fn in filenames]) + "\n")
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/include/verilated.mk.in b/include/verilated.mk.in
index 59a94486f2f..f52b6ab297e 100644
--- a/include/verilated.mk.in
+++ b/include/verilated.mk.in
@@ -51,6 +51,7 @@ CFG_LDLIBS_THREADS = @CFG_LDLIBS_THREADS@
 
 VERILATOR_COVERAGE = $(PERL) $(VERILATOR_ROOT)/bin/verilator_coverage
 VERILATOR_INCLUDER = $(PYTHON3) $(VERILATOR_ROOT)/bin/verilator_includer
+VERILATOR_INCLUDER2 = $(PYTHON3) $(VERILATOR_ROOT)/bin/verilator_includer2
 VERILATOR_CCACHE_REPORT = $(PYTHON3) $(VERILATOR_ROOT)/bin/verilator_ccache_report
 
 ######################################################################
@@ -202,20 +203,62 @@ VK_GLOBAL_OBJS = $(addsuffix .o, $(VM_GLOBAL_FAST) $(VM_GLOBAL_SLOW))
 $(VK_GLOBAL_OBJS): $(VM_PREFIX).mk
 
 ifneq ($(VM_PARALLEL_BUILDS),1)
-  # Fast build for small designs: All .cpp files in one fell swoop. This
-  # saves total compute, but can be slower if only a little changes. It is
-  # also a lot slower for medium to large designs when the speed of the C
-  # compiler dominates, which in this mode is not parallelizable.
+# Fast build for small designs: All .cpp files in one fell swoop. This
+# saves total compute, but can be slower if only a little changes. It is
+# also a lot slower for medium to large designs when the speed of the C
+# compiler dominates, which in this mode is not parallelizable.
 
-  VK_OBJS += $(VM_PREFIX)__ALL.o
-  $(VM_PREFIX)__ALL.cpp: $(addsuffix .cpp, $(VM_FAST) $(VM_SLOW))
+VK_OBJS += $(VM_PREFIX)__ALL.o
+
+$(VM_PREFIX)__ALL.cpp: $(addsuffix .cpp, $(VM_FAST) $(VM_SLOW))
 	$(VERILATOR_INCLUDER) -DVL_INCLUDE_OPT=include $^ > $@
-  all_cpp: $(VM_PREFIX)__ALL.cpp
+
+.PHONY: all_cpp
+all_cpp: $(VM_PREFIX)__ALL.cpp
 else
-  # Parallel build: Each .cpp file by itself. This can be somewhat slower for
-  # very small designs and examples, but is a lot faster for large designs.
 
-  VK_OBJS += $(VK_OBJS_FAST) $(VK_OBJS_SLOW)
+# Parallel build: Each object file is compiled by itself. This can be
+# somewhat slower for very small designs and examples, but is a lot faster
+# for large designs.
+#
+# When VM_PARALLEL_JOBS is set to a value other than 0 or 1, the fast and
+# slow .cpp files are not compiled one by one. Instead each group is split
+# into VM_PARALLEL_JOBS buckets of similar total size, and per bucket one
+# .cpp file that #includes the bucket's members is generated and compiled.
+# TODO(mglb): rename VM_PARALLEL_JOBS to something related to .cpp files
+# concatenation
+
+ifneq ($(filter-out 0 1,$(VM_PARALLEL_JOBS)),)
+
+# Bucket numbers 1..VM_PARALLEL_JOBS
+_VK_JOB_IDS := $(shell seq 1 $(VM_PARALLEL_JOBS))
+
+ifeq ($(strip $(_VK_JOB_IDS)),)
+$(error VM_PARALLEL_JOBS must be a positive integer: '$(VM_PARALLEL_JOBS)')
+endif
+
+# Lists of the .cpp files of each group, read by $(VERILATOR_INCLUDER2).
+# Note: $(file ...) requires GNU Make 4.0 or newer.
+.INTERMEDIATE: fast.list slow.list
+fast.list: $(VM_FAST:%=%.cpp)
+slow.list: $(VM_SLOW:%=%.cpp)
+
+fast.list slow.list:
+	$(file >$@,$(strip $^))
+
+# One run generates all bucket files of a group (pattern rule with multiple
+# targets). The '%' in the last argument is not a pattern: it is passed
+# literally and the script replaces it with the bucket number.
+$(foreach id,$(_VK_JOB_IDS),%_$(id)_$(VM_PARALLEL_JOBS).cpp): %.list
+	$(VERILATOR_INCLUDER2) $< $(VM_PARALLEL_JOBS) $*_%_$(VM_PARALLEL_JOBS).cpp
+
+# Compile the buckets instead of the individual .cpp files
+VK_OBJS_FAST = $(patsubst %,fast_%_$(VM_PARALLEL_JOBS).o,$(_VK_JOB_IDS))
+VK_OBJS_SLOW = $(patsubst %,slow_%_$(VM_PARALLEL_JOBS).o,$(_VK_JOB_IDS))
+
+endif
+
+VK_OBJS += $(VK_OBJS_FAST) $(VK_OBJS_SLOW)
 endif
 
 # When archiving just objects (.o), use single $(AR) run
@@ -344,6 +371,7 @@ debug-make::
 	@echo VM_GLOBAL_FAST: $(VM_GLOBAL_FAST)
 	@echo VM_GLOBAL_SLOW: $(VM_GLOBAL_SLOW)
 	@echo VM_PARALLEL_BUILDS:  $(VM_PARALLEL_BUILDS)
+	@echo VM_PARALLEL_JOBS:  $(VM_PARALLEL_JOBS)
 	@echo VM_PREFIX:  $(VM_PREFIX)
 	@echo VM_SUPPORT_FAST: $(VM_SUPPORT_FAST)
 	@echo VM_SUPPORT_SLOW: $(VM_SUPPORT_SLOW)