From eb8e53919bdfe91d129f0353fef8f88a32afc583 Mon Sep 17 00:00:00 2001
From: Matt Keeter
Date: Mon, 5 Feb 2024 16:25:52 -0500
Subject: [PATCH] Multi region packing (#1599)

This PR modifies the build system to let flash use all available MPU regions,
resulting in tighter packing and smaller total image size. This shrinks the
`sidecar-rev-c` image by 256 KiB, and the `gimlet-f` image by 188 KiB.

In other words, tasks go from having exactly 1 flash region to _at least_ 1
flash region; this is mostly plumbing that change from
`suggest_memory_region_size` all the way through the `kconfig`.

The change does make task packing trickier, because tasks can be placed in one
of two orientations: largest-chunk-first or largest-chunk-last. I went for a
very dumb O(N^2) algorithm that checks every unplaced task and picks the best;
we're far from having > 100 tasks, so I'm not worried about bad scaling
(famous last words!). A small worked example of the chunk splitting appears
after the diff.

In addition, it updates `xtask/src/sizes.rs` to print the individual chunks
when the `-v` flag is specified.
---
 app/sidecar/base.toml     |   2 +-
 build/kconfig/src/lib.rs  |  20 +-
 build/xtask/src/clippy.rs |   9 +-
 build/xtask/src/config.rs |  85 +++++-
 build/xtask/src/dist.rs   | 529 +++++++++++++++++++++++++++++---------
 build/xtask/src/main.rs   |  12 +-
 build/xtask/src/sizes.rs  | 189 ++++++++++----
 sys/kern/build.rs         |  60 ++++-
 8 files changed, 697 insertions(+), 209 deletions(-)

diff --git a/app/sidecar/base.toml b/app/sidecar/base.toml
index e9ce78840..029799ca8 100644
--- a/app/sidecar/base.toml
+++ b/app/sidecar/base.toml
@@ -6,7 +6,7 @@ fwid = true
 
 [kernel]
 name = "sidecar"
-requires = {flash = 24600, ram = 6256}
+requires = {flash = 25184, ram = 6256}
 features = ["dump"]
 
 [caboose]
diff --git a/build/kconfig/src/lib.rs b/build/kconfig/src/lib.rs
index 2b388ade0..5aaed33e9 100644
--- a/build/kconfig/src/lib.rs
+++ b/build/kconfig/src/lib.rs
@@ -51,7 +51,7 @@ pub struct TaskConfig {
     ///
     /// The name is the "output" assignment that generated this region,
     /// typically (but not necessarily!) either `"ram"` or `"flash"`.
-    pub owned_regions: BTreeMap<String, RegionConfig>,
+    pub owned_regions: BTreeMap<String, MultiRegionConfig>,
 
     /// Names of regions (in the app-level `shared_regions`) that this task
     /// needs access to.
@@ -117,6 +117,24 @@ pub struct RegionConfig {
     pub attributes: RegionAttributes,
 }
 
+/// Description of one memory span containing multiple adjacent regions
+///
+/// This is equivalent to [`RegionConfig`], but represents a single memory span
+/// that should be configured as multiple regions in the MPU.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct MultiRegionConfig {
+    /// Address of start of region. The platform likely has alignment
+    /// requirements for this; it must meet them. (For example, on ARMv7-M, it
+    /// must be naturally aligned for the size.)
+    pub base: u32,
+    /// Size of each chunk in the region, in bytes. The platform likely has
+    /// alignment requirements for this; it must meet them. (For example, on
+    /// ARMv7-M, it must be a power of two greater than 16.)
+    pub sizes: Vec<u32>,
+    /// Flags describing what can be done with this region.
+    pub attributes: RegionAttributes,
+}
+
 #[derive(Copy, Clone, Debug, Serialize, Deserialize)]
 pub struct RegionAttributes {
     /// Region can be read by tasks that include it.
diff --git a/build/xtask/src/clippy.rs b/build/xtask/src/clippy.rs index cea586776..6593a184f 100644 --- a/build/xtask/src/clippy.rs +++ b/build/xtask/src/clippy.rs @@ -5,7 +5,6 @@ use std::path::PathBuf; use anyhow::{bail, Result}; -use indexmap::IndexMap; use crate::config::Config; @@ -49,8 +48,10 @@ pub fn run( let build_config = if name == "kernel" { // Build dummy allocations for each task - let fake_sizes: IndexMap<_, _> = - [("flash", 64), ("ram", 64)].into_iter().collect(); + let fake_sizes = crate::dist::TaskRequest { + memory: [("flash", 64), ("ram", 64)].into_iter().collect(), + spare_regions: 0, + }; let task_sizes = toml .tasks .keys() @@ -71,7 +72,7 @@ pub fn run( let mut entry_points: std::collections::HashMap<_, _> = allocs .tasks .iter() - .map(|(k, v)| (k.clone(), v["flash"].start)) + .map(|(k, v)| (k.clone(), v["flash"].start())) .collect(); // add a dummy caboose point diff --git a/build/xtask/src/config.rs b/build/xtask/src/config.rs index ef2d2f2cd..db593073a 100644 --- a/build/xtask/src/config.rs +++ b/build/xtask/src/config.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use std::collections::{hash_map::DefaultHasher, BTreeMap, BTreeSet}; +use std::collections::{hash_map::DefaultHasher, BTreeMap, BTreeSet, VecDeque}; use std::hash::Hasher; use std::ops::Range; use std::path::{Path, PathBuf}; @@ -461,15 +461,24 @@ impl Config { } /// Suggests an appropriate size for the given task (or "kernel"), given - /// its true size. The size depends on MMU implementation, dispatched - /// based on the `target` in the config file. - pub fn suggest_memory_region_size(&self, name: &str, size: u64) -> u64 { + /// its true size and a number of available regions. The size depends on + /// MMU implementation, dispatched based on the `target` in the config file. + /// + /// The returned `Vec` always has the largest value first. + pub fn suggest_memory_region_size( + &self, + name: &str, + size: u64, + regions: usize, + ) -> VecDeque { match name { "kernel" => { // Nearest chunk of 16 - ((size + 15) / 16) * 16 + [((size + 15) / 16) * 16].into_iter().collect() } - _ => self.mpu_alignment().suggest_memory_region_size(size), + _ => self + .mpu_alignment() + .suggest_memory_region_size(size, regions), } } @@ -525,10 +534,68 @@ enum MpuAlignment { impl MpuAlignment { /// Suggests a minimal memory region size fitting the given number of bytes - fn suggest_memory_region_size(&self, size: u64) -> u64 { + /// + /// If multiple regions are available, then we may use them for efficiency. + /// The resulting `Vec` is guaranteed to have the largest value first. 
+ fn suggest_memory_region_size( + &self, + mut size: u64, + regions: usize, + ) -> VecDeque { match self { - MpuAlignment::PowerOfTwo => size.next_power_of_two(), - MpuAlignment::Chunk(c) => ((size + c - 1) / c) * c, + MpuAlignment::PowerOfTwo => { + const MIN_MPU_REGION_SIZE: u64 = 32; + let mut out = VecDeque::new(); + for _ in 0..regions { + let s = + (size.next_power_of_two() / 2).max(MIN_MPU_REGION_SIZE); + out.push_back(s); + size = size.saturating_sub(s); + if size == 0 { + break; + } + } + if size > 0 { + if let Some(s) = out.back_mut() { + *s *= 2; + } else { + out.push_back(size.next_power_of_two()); + } + } + // Merge duplicate regions at the end + while out.len() >= 2 { + let n = out.len(); + if out[n - 1] == out[n - 2] { + out.pop_back(); + *out.back_mut().unwrap() *= 2; + } else { + break; + } + } + // Split the initial (largest) region into as many smaller + // regions as we can fit. This doesn't change total size, but + // can make alignment more flexible, since smaller regions have + // less stringent alignment requirements. + while out[0] > MIN_MPU_REGION_SIZE { + let largest = out[0]; + let n = out.iter().filter(|c| **c == largest).count(); + if out.len() + n > regions { + break; + } + // Replace `n` instances of `largest` at the start of `out` + // with `n * 2` instances of `largest / 2` + for _ in 0..n { + out.pop_front(); + } + for _ in 0..n * 2 { + out.push_front(largest / 2); + } + } + out + } + MpuAlignment::Chunk(c) => { + [((size + c - 1) / c) * c].into_iter().collect() + } } } /// Returns the desired alignment for a region of a particular size diff --git a/build/xtask/src/dist.rs b/build/xtask/src/dist.rs index fc83779fd..033ceb119 100644 --- a/build/xtask/src/dist.rs +++ b/build/xtask/src/dist.rs @@ -210,6 +210,99 @@ pub fn list_tasks(app_toml: &Path) -> Result<()> { /// Represents allocations and free spaces for a particular image type AllocationMap = (Allocations, IndexMap>); +/// Module to prevent people from messing with invariants of checked types +mod checked_types { + use super::*; + + /// Simple data structure to store a set of guaranteed-contiguous ranges + /// + /// This will panic if you violate that constraint! 
+ #[derive(Debug, Clone, Default, Hash)] + pub struct ContiguousRanges(Vec>); + impl ContiguousRanges { + pub fn new(r: Range) -> Self { + Self(vec![r]) + } + pub fn iter(&self) -> impl Iterator> { + self.0.iter() + } + pub fn start(&self) -> u32 { + self.0.first().unwrap().start + } + pub fn end(&self) -> u32 { + self.0.last().unwrap().end + } + pub fn contains(&self, v: &u32) -> bool { + (self.start()..self.end()).contains(v) + } + pub fn push(&mut self, r: Range) { + if let Some(t) = &self.0.last() { + assert_eq!(t.end, r.start, "ranges must be contiguous"); + } + self.0.push(r) + } + } + + impl<'a> IntoIterator for &'a ContiguousRanges { + type Item = &'a Range; + type IntoIter = std::slice::Iter<'a, Range>; + fn into_iter(self) -> Self::IntoIter { + self.0.iter() + } + } + + /// Simple wrapper data structure that enforces that values are decreasing + /// + /// Each value must be the same or smaller than the previous value + #[derive(Debug)] + pub struct OrderedVecDeque { + data: VecDeque, + } + impl OrderedVecDeque { + pub fn new() -> Self { + Self { + data: VecDeque::new(), + } + } + pub fn iter(&self) -> impl Iterator { + self.data.iter() + } + pub fn into_iter( + self, + ) -> impl Iterator + DoubleEndedIterator { + self.data.into_iter() + } + pub fn front(&self) -> Option<&u32> { + self.data.front() + } + pub fn pop_front(&mut self) -> Option { + self.data.pop_front() + } + pub fn push_front(&mut self, v: u32) { + if let Some(f) = self.front() { + assert!(v >= *f); + } + self.data.push_front(v) + } + pub fn back(&self) -> Option<&u32> { + self.data.back() + } + pub fn push_back(&mut self, v: u32) { + if let Some(f) = self.back() { + assert!(v <= *f); + } + self.data.push_back(v) + } + } + impl From for VecDeque { + fn from(v: OrderedVecDeque) -> Self { + v.data + } + } +} +// Republish these types for widespread availability +pub use checked_types::{ContiguousRanges, OrderedVecDeque}; + pub fn package( verbose: bool, edges: bool, @@ -279,9 +372,35 @@ pub fn package( }) .collect::>()?; + // Build a set of requests for the memory allocator + let mut task_reqs = HashMap::new(); + for (t, sz) in task_sizes { + let n = sz.len() + + cfg + .toml + .extern_regions_for(t, &cfg.toml.image_names[0]) + .unwrap() + .len() + + cfg.toml.tasks.get(t).unwrap().uses.len() + + cfg + .toml + .caboose + .as_ref() + .map(|c| c.tasks.contains(&t.to_string())) + .unwrap_or(false) as usize; + + task_reqs.insert( + t, + TaskRequest { + memory: sz, + spare_regions: 7 - n, + }, + ); + } + // Allocate memories. let allocated = - allocate_all(&cfg.toml, &task_sizes, cfg.toml.caboose.as_ref())?; + allocate_all(&cfg.toml, &task_reqs, cfg.toml.caboose.as_ref())?; for image_name in &cfg.toml.image_names { // Build each task. @@ -330,7 +449,7 @@ pub fn package( task_entry_point(&cfg, name, image_name) } else { // Dummy entry point - Ok(allocs.tasks[name]["flash"].start) + Ok(allocs.tasks[name]["flash"].start()) }; ep.map(|ep| (name.clone(), ep)) }) @@ -976,6 +1095,7 @@ fn link_dummy_task( .toml .memories(&cfg.toml.image_names[0])? 
.into_iter() + .map(|(name, r)| (name, ContiguousRanges::new(r))) .collect(); let extern_regions = cfg.toml.extern_regions_for(name, image_name)?; @@ -1290,7 +1410,7 @@ fn check_task_priorities(toml: &Config) -> Result<()> { fn generate_task_linker_script( name: &str, - map: &BTreeMap>, + map: &BTreeMap, sections: Option<&IndexMap>, stacksize: u32, images: &IndexMap>, @@ -1310,9 +1430,9 @@ fn generate_task_linker_script( } writeln!(linkscr, "MEMORY\n{{")?; - for (name, range) in map { - let mut start = range.start; - let end = range.end; + for (name, ranges) in map { + let mut start = ranges.start(); + let end = ranges.end(); let name = name.to_ascii_uppercase(); // Our stack comes out of RAM @@ -1735,7 +1855,11 @@ pub struct Allocations { /// Map from memory-name to address-range pub kernel: BTreeMap>, /// Map from task-name to memory-name to address-range - pub tasks: BTreeMap>>, + /// + /// A task may have multiple address ranges in the same memory space for + /// efficient packing; if this is the case, the addresses will be contiguous + /// and each individual range will respect MPU requirements. + pub tasks: BTreeMap>, /// Optional trailing caboose, located in the given region pub caboose: Option<(String, Range)>, } @@ -1761,6 +1885,18 @@ impl Allocations { } } +/// A set of memory requests from a single task +#[derive(Debug, Clone)] +pub struct TaskRequest<'a> { + /// Memory requests, as a map from memory name -> size + pub memory: IndexMap<&'a str, u64>, + + /// Number of extra regions available for more efficient packing + /// + /// If this is zero, then each request in `memory` can only use 1 region + pub spare_regions: usize, +} + /// Allocates address space from all regions for the kernel and all tasks. /// /// The allocation strategy is slightly involved, because of the limitations of @@ -1797,7 +1933,7 @@ impl Allocations { /// requests per alignment size. pub fn allocate_all( toml: &Config, - task_sizes: &HashMap<&str, IndexMap<&str, u64>>, + task_sizes: &HashMap<&str, TaskRequest>, caboose: Option<&CabooseConfig>, ) -> Result> { // Collect all allocation requests into queues, one per memory type, indexed @@ -1807,7 +1943,7 @@ pub fn allocate_all( // We keep kernel and task requests separate so we can always service the // kernel first. // - // The task map is: memory name -> allocation size -> queue of task name. 
+ // The task map is: memory name -> task name -> requested regions // The kernel map is: memory name -> allocation size let kernel = &toml.kernel; let tasks = &toml.tasks; @@ -1821,105 +1957,53 @@ pub fn allocate_all( let mut free = toml.memories(image_name)?; let kernel_requests = &kernel.requires; - let mut task_requests: BTreeMap<&str, BTreeMap>> = + let mut task_requests: BTreeMap<&str, IndexMap<&str, OrderedVecDeque>> = BTreeMap::new(); for name in tasks.keys() { - for (mem, amt) in task_sizes[name.as_str()].iter() { - let bytes = toml.suggest_memory_region_size(name, *amt); + let req = &task_sizes[name.as_str()]; + for (&mem, &amt) in req.memory.iter() { + // Right now, flash is most limited, so it gets to use all of + // our spare regions (if present) + let n = if mem == "flash" { + req.spare_regions + 1 + } else { + 1 + }; + let bytes = toml.suggest_memory_region_size(name, amt, n); if let Some(r) = tasks[name].max_sizes.get(&mem.to_string()) { - if bytes > *r as u64 { + let total_bytes = bytes.iter().sum::(); + if total_bytes > u64::from(*r) { bail!( "task {}: needs {} bytes of {} but max-sizes limits it to {}", - name, bytes, mem, r); + name, total_bytes, mem, r); } } + // Convert from u64 -> u32 + let mut bs = OrderedVecDeque::new(); + for b in bytes { + bs.push_back(b.try_into().unwrap()); + } task_requests .entry(mem) .or_default() - .entry(bytes.try_into().unwrap()) - .or_default() - .push_back(name.as_str()); + .insert(name.as_str(), bs); } } // Okay! Do memory types one by one, fitting kernel first. for (region, avail) in &mut free { let mut k_req = kernel_requests.get(region.as_str()); - let mut t_reqs = task_requests.get_mut(region.as_str()); - - fn reqs_map_not_empty( - om: &Option<&mut BTreeMap>>, - ) -> bool { - om.iter() - .flat_map(|map| map.values()) - .any(|q| !q.is_empty()) - } - - 'fitloop: while k_req.is_some() || reqs_map_not_empty(&t_reqs) { - let align = if avail.start == 0 { - // Lie to keep the masks in range. This could be avoided by - // tracking log2 of masks rather than masks. - 1 << 31 - } else { - 1 << avail.start.trailing_zeros() - }; - - // Search order is: - // - Kernel. - // - Task requests equal to or smaller than this alignment, in - // descending order of size. - // - Task requests larger than this alignment, in ascending - // order of size. - - if let Some(&sz) = k_req.take() { - // The kernel wants in on this. - allocs.kernel.insert( - region.to_string(), - allocate_k(region, sz, avail)?, - ); - continue 'fitloop; - } - - if let Some(t_reqs) = t_reqs.as_mut() { - for (&sz, q) in t_reqs.range_mut(..=align).rev() { - if let Some(task) = q.pop_front() { - // We can pack an equal or smaller one in. - let align = toml.task_memory_alignment(sz); - allocs - .tasks - .entry(task.to_string()) - .or_default() - .insert( - region.to_string(), - allocate_one(region, sz, align, avail)?, - ); - continue 'fitloop; - } - } - - for (&sz, q) in t_reqs.range_mut(align + 1..) { - if let Some(task) = q.pop_front() { - // We've gotta use a larger one. - let align = toml.task_memory_alignment(sz); - allocs - .tasks - .entry(task.to_string()) - .or_default() - .insert( - region.to_string(), - allocate_one(region, sz, align, avail)?, - ); - continue 'fitloop; - } - } - } - - // If we reach this point, it means our loop condition is wrong, - // because one of the above things should really have happened. - // Panic here because otherwise it's a hang. 
- panic!("loop iteration without progess made!"); - } + let t_reqs = task_requests.get_mut(region.as_str()); + let mut t_reqs_empty = IndexMap::new(); + allocate_region( + region, + toml, + &mut k_req, + t_reqs.unwrap_or(&mut t_reqs_empty), + avail, + &mut allocs, + )?; } if let Some(caboose) = caboose { @@ -1944,6 +2028,195 @@ pub fn allocate_all( Ok(result) } +fn allocate_region( + region: &str, + toml: &Config, + k_req: &mut Option<&u32>, + t_reqs: &mut IndexMap<&str, OrderedVecDeque>, + avail: &mut Range, + allocs: &mut Allocations, +) -> Result<()> { + // The kernel gets to go first! + if let Some(&sz) = k_req.take() { + allocs + .kernel + .insert(region.to_string(), allocate_k(region, sz, avail)?); + } + + while !t_reqs.is_empty() { + // At this point, we need to find a task that fits based on our existing + // alignment. This is tricky, because -- for efficient packing -- we + // allow tasks to span multiple regions. For example, a task could look + // like this: + // + // 4444221 + // + // representing three regions of size 4, 2, 1. + // + // Such a task could be placed in two ways: + // + // |4444221 ("forward") + // 122|4444 ("reverse") + // | where this line is the alignment for the largest chunk + + #[derive(Debug)] + enum Direction { + Forward, + Reverse, + } + #[derive(Debug)] + struct Match<'a> { + gap: u32, + align: u32, + size: u32, + name: &'a str, + dir: Direction, + } + impl<'a> Match<'a> { + /// Updates our "current best" with new values, if they're better + /// + /// Our policy is to rank by + /// 1) smallest gap required, and then + /// 2) largest resulting alignment + /// 3) smallest task size (for backwards compatibility) + fn update( + &mut self, + gap: u32, + align: u32, + size: u32, + name: &'a str, + dir: Direction, + ) { + // Ignore any gap that's < 1/8 of final alignment, since that's + // "close enough" + let gap_m = gap.saturating_sub(align / 8); + let our_gap_m = self.gap.saturating_sub(self.align / 8); + if gap_m < our_gap_m + || (gap_m == our_gap_m && align > self.align) + || gap < self.gap + || (gap == self.gap && align > self.align) + || (gap == self.gap + && align == self.align + && size < self.size) + { + self.gap = gap; + self.align = align; + self.size = size; + self.name = name; + self.dir = dir; + } + } + } + + let mut best = Match { + gap: u32::MAX, + align: 0, + size: u32::MAX, + name: "", + dir: Direction::Forward, + }; + + for (&task_name, mem) in t_reqs.iter() { + let align = toml.task_memory_alignment(*mem.front().unwrap()); + let size_mask = align - 1; + + // Place the chunk using reverse orientation, with padding if it's + // not aligned. The alignment (for scoring purposes) is the + // alignment of the largest region, since that's last in memory. + let total_size: u32 = mem.iter().sum(); + let base = (avail.start + total_size + size_mask) & !size_mask; + let gap_reverse = base - avail.start - total_size; + best.update( + gap_reverse, + align, + total_size, + task_name, + Direction::Reverse, + ); + + // Place the chunk using forward orientation, with padding if it's + // not aligned. The alignment (for scoring purposes) is the + // alignment of the last region, since that may be worse than the + // starting alignment. 
+ let base = (avail.start + size_mask) & !size_mask; + let gap_forward = base - avail.start; + best.update( + gap_forward, + toml.task_memory_alignment(*mem.back().unwrap()), + total_size, + task_name, + Direction::Forward, + ); + } + let Some(sizes) = t_reqs.remove(best.name) else { + panic!("could not find a task"); + }; + + // Prepare to pack values either forward or reverse + // + // At this point, we drop the "values must be ordered" constraint, + // because we may combine adjacent regions to reduce the total region + // count. This could violate the ordering constraint, but is still + // valid from the MPU's perspective. + let mut sizes: VecDeque = match best.dir { + Direction::Forward => sizes.into(), + Direction::Reverse => sizes.into_iter().rev().collect(), + }; + avail.start += best.gap; + + while let Some(mut size) = sizes.pop_front() { + // When building the size list, we split the largest size to reduce + // alignment requirements. Now, we try to merge them again, to + // reduce the number of regions stored in the kernel's flash. + // + // For example, [256, 256, 64] => [512, 64] if the initial position + // is aligned for a 512-byte region. + let mut n = sizes.iter().filter(|s| **s == size).count() + 1; + if n > 1 { + n &= !1; // only consider an even number of regions + let possible_align = toml.task_memory_alignment(size * 2); + if avail.start & (possible_align - 1) == 0 { + size *= 2; + for _ in 0..n - 1 { + sizes.pop_front(); + } + for _ in 0..n / 2 - 1 { + sizes.push_front(size); + } + } + } + + // We do our own alignment management, so assert that we haven't + // messed it up: + let align = toml.task_memory_alignment(size); + assert!(avail.start & (align - 1) == 0); + + allocs + .tasks + .entry(best.name.to_string()) + .or_default() + .entry(region.to_string()) + .or_default() + .push(allocate_one(region, size, align, avail)?); + } + + // Check that our allocations are all aligned and contiguous + let mut prev: Option> = None; + for r in &allocs.tasks[best.name][region] { + if let Some(prev) = prev { + assert_eq!(prev.end, r.start); + } + let size = r.end - r.start; + assert!(size >= 32); // minimum MPU size + let align = toml.task_memory_alignment(size); + assert!(r.start.trailing_zeros() >= align.trailing_zeros()); + prev = Some(r.clone()); + } + } + + Ok(()) +} + fn allocate_k( region: &str, size: u32, @@ -2003,7 +2276,7 @@ fn allocate_one( /// system. 
pub fn make_kconfig( toml: &Config, - task_allocations: &BTreeMap>>, + task_allocations: &BTreeMap>, entry_points: &HashMap, image_name: &str, ) -> Result { @@ -2084,7 +2357,7 @@ pub fn make_kconfig( let flash = &task_allocations[name]["flash"]; let entry_offset = if flash.contains(&entry_points[name]) { - entry_points[name] - flash.start + entry_points[name] - flash.start() } else { bail!( "entry point {:#x} is not in flash range {:#x?}", @@ -2111,45 +2384,47 @@ pub fn make_kconfig( } let extern_regions = toml.extern_regions_for(name, image_name)?; - let owned_regions = task_allocations[name] + let mut owned_regions = BTreeMap::new(); + for (out_name, range) in task_allocations[name] .iter() + .flat_map(|(name, chunks)| chunks.iter().map(move |c| (name, c))) .chain(extern_regions.iter()) - .map(|(out_name, range)| { - // Look up region for this image - let mut regions = toml.outputs[out_name] - .iter() - .filter(|o| &o.name == image_name); - let out = regions.next().expect("no region for name"); - if regions.next().is_some() { - bail!("multiple {out_name} regions for name {image_name}"); - } - let size = range.end - range.start; - if p2_required && !size.is_power_of_two() { - bail!( - "memory region for task '{name}' output '{out_name}' \ - is required to be a power of two, but has size {size}" - ); - } + { + // Look up region for this image + let mut regions = toml.outputs[out_name] + .iter() + .filter(|o| &o.name == image_name); + let out = regions.next().expect("no region for name"); + if regions.next().is_some() { + bail!("multiple {out_name} regions for name {image_name}"); + } + let size = range.end - range.start; + if p2_required && !size.is_power_of_two() { + bail!( + "memory region for task '{name}' output '{out_name}' \ + is required to be a power of two, but has size {size}" + ); + } - Ok(( - out_name.to_string(), - build_kconfig::RegionConfig { - base: range.start, - size, - attributes: build_kconfig::RegionAttributes { - read: out.read, - write: out.write, - execute: out.execute, - special_role: if out.dma { - Some(build_kconfig::SpecialRole::Dma) - } else { - None - }, + owned_regions + .entry(out_name.to_string()) + .or_insert(build_kconfig::MultiRegionConfig { + base: range.start, + sizes: vec![], + attributes: build_kconfig::RegionAttributes { + read: out.read, + write: out.write, + execute: out.execute, + special_role: if out.dma { + Some(build_kconfig::SpecialRole::Dma) + } else { + None }, }, - )) - }) - .collect::, _>>()?; + }) + .sizes + .push(size); + } tasks.push(build_kconfig::TaskConfig { owned_regions, diff --git a/build/xtask/src/main.rs b/build/xtask/src/main.rs index 4ebb116fb..294c00121 100644 --- a/build/xtask/src/main.rs +++ b/build/xtask/src/main.rs @@ -92,9 +92,9 @@ enum Xtask { /// Runs `xtask dist` and reports the sizes of resulting tasks Sizes { - /// Request verbosity from tools we shell out to. - #[clap(short)] - verbose: bool, + /// `-v` shows chunk sizes; `-vv` makes build verbose + #[clap(short, action = clap::ArgAction::Count)] + verbose: u8, /// Path to the image configuration file, in TOML. 
cfg: PathBuf, @@ -248,7 +248,7 @@ fn run(xtask: Xtask) -> Result<()> { } => { let allocs = dist::package(verbose, edges, &cfg, None, dirty)?; for (_, (a, _)) in allocs { - sizes::run(&cfg, &a, true, false, false)?; + sizes::run(&cfg, &a, true, false, false, false)?; } } Xtask::Build { @@ -289,9 +289,9 @@ fn run(xtask: Xtask) -> Result<()> { save, dirty, } => { - let allocs = dist::package(verbose, false, &cfg, None, dirty)?; + let allocs = dist::package(verbose >= 2, false, &cfg, None, dirty)?; for (_, (a, _)) in allocs { - sizes::run(&cfg, &a, false, compare, save)?; + sizes::run(&cfg, &a, false, compare, save, verbose >= 1)?; } } Xtask::Humility { args } => { diff --git a/build/xtask/src/sizes.rs b/build/xtask/src/sizes.rs index 48f3c962f..6970c74db 100644 --- a/build/xtask/src/sizes.rs +++ b/build/xtask/src/sizes.rs @@ -8,14 +8,14 @@ use std::io::Write; use std::path::Path; use std::process; -use anyhow::{bail, Result}; +use anyhow::{bail, Context, Result}; use colored::*; use goblin::Object; use indexmap::map::Entry; use indexmap::IndexMap; use crate::{ - dist::{Allocations, DEFAULT_KERNEL_STACK}, + dist::{Allocations, ContiguousRanges, DEFAULT_KERNEL_STACK}, Config, }; @@ -34,6 +34,7 @@ pub fn run( only_suggest: bool, compare: bool, save: bool, + verbose: bool, ) -> Result<()> { let toml = Config::from_file(cfg)?; let sizes = create_sizes(&toml)?; @@ -63,7 +64,7 @@ pub fn run( // Print detailed sizes relative to usage if !only_suggest { let map = build_memory_map(&toml, &sizes, allocs)?; - print_memory_map(&toml, &map)?; + print_memory_map(&toml, &map, verbose)?; print!("\n\n"); print_task_table(&toml, &map)?; } @@ -78,7 +79,10 @@ pub fn run( } let size = toml.kernel.requires[&mem.to_string()]; - let suggestion = toml.suggest_memory_region_size("kernel", used); + let suggestion = toml.suggest_memory_region_size("kernel", used, 1); + assert_eq!(suggestion.len(), 1, "kernel should not be > 1 region"); + let suggestion = suggestion[0]; + if suggestion >= size as u64 { continue; } @@ -117,10 +121,10 @@ enum Recommended { FixedSize(u32), MaxSize(u32), } -#[derive(Copy, Clone, Debug)] +#[derive(Clone, Debug)] struct MemoryChunk<'a> { used_size: u64, - total_size: u32, + total_size: Vec, owner: &'a str, recommended: Option, } @@ -145,11 +149,17 @@ fn build_memory_map<'a>( .chain(std::iter::once(( "kernel", toml.kernel.requires.clone(), - allocs.kernel.clone(), + allocs + .kernel + .iter() + .map(|(name, v)| { + (name.to_owned(), ContiguousRanges::new(v.clone())) + }) + .collect(), ))) .chain(allocs.caboose.iter().map(|(region, size)| { let mut alloc = BTreeMap::new(); - alloc.insert(region.clone(), size.clone()); + alloc.insert(region.clone(), ContiguousRanges::new(size.clone())); let mut requires = IndexMap::new(); requires .insert(region.clone(), toml.caboose.as_ref().unwrap().size); @@ -164,10 +174,10 @@ fn build_memory_map<'a>( } let alloc = &alloc[&mem_name.to_string()]; map.entry(mem_name).or_default().insert( - alloc.start, + alloc.start(), MemoryChunk { used_size: used, - total_size: alloc.end - alloc.start, + total_size: alloc.iter().map(|v| v.end - v.start).collect(), owner: name, recommended: requires .get(mem_name.to_owned()) @@ -198,7 +208,7 @@ fn print_task_table( let mem_pad = map .values() .flat_map(|m| m.values()) - .map(|c| format!("{}", c.total_size).len()) + .map(|c| format!("{}", c.total_size.iter().sum::()).len()) .chain(std::iter::once(4)) .max() .unwrap_or(0) as usize; @@ -215,7 +225,9 @@ fn print_task_table( .map(|(region, map)| { ( *region, - map.iter().map(|(_, 
chunk)| (chunk.owner, *chunk)).collect(), + map.iter() + .map(|(_, chunk)| (chunk.owner, chunk.clone())) + .collect(), ) }) .collect(); @@ -247,7 +259,7 @@ fn print_task_table( print!( "{:(), mem = mem_pad, ); match chunk.recommended { @@ -265,6 +277,7 @@ fn print_task_table( fn print_memory_map( toml: &Config, map: &BTreeMap<&str, BTreeMap>, + verbose: bool, ) -> Result<()> { let task_pad = toml .tasks @@ -277,61 +290,133 @@ fn print_memory_map( let mem_pad = map .values() .flat_map(|m| m.values()) - .map(|c| format!("{}", c.total_size).len()) + .map(|c| format!("{}", c.total_size.iter().sum::()).len()) .max() .unwrap_or(0) as usize; for (mem_name, map) in map { println!("\n{}:", mem_name); - println!( - " ADDRESS | {:^task$} | {:>mem$} | {:>mem$} | LIMIT", + if verbose { + println!( + " ADDRESS | {:^task$} | {:>mem$} | {:>mem$} | {:>mem$} | LIMIT", "PROGRAM", "USED", "SIZE", + "CHUNKS", task = task_pad, mem = mem_pad, ); + } else { + println!( + " ADDRESS | {:^task$} | {:>mem$} | {:>mem$} | LIMIT", + "PROGRAM", + "USED", + "SIZE", + task = task_pad, + mem = mem_pad, + ); + } let next = map.keys().skip(1).map(Some).chain(std::iter::once(None)); for ((start, chunk), next) in map.iter().zip(next) { - print!(" {:#010x} | ", start); - print!( - "{:mem$} | {:>mem$} | ", - chunk.owner, - chunk.used_size, - chunk.total_size, - size = task_pad, - mem = mem_pad, - ); - match chunk.recommended { - None => print!("(auto)"), - Some(Recommended::MaxSize(m)) => print!("{}", m), - Some(Recommended::FixedSize(_)) => print!("(fixed)"), + for (i, mem) in chunk.total_size.iter().enumerate() { + print!( + " {:#010x} | ", + start + chunk.total_size[0..i].iter().sum::() + ); + if verbose { + if i == 0 { + print!( + "{:mem$} | {:>mem$} | {:>mem$} | ", + chunk.owner, + chunk.used_size, + chunk.total_size.iter().sum::(), + mem, + size = task_pad, + mem = mem_pad, + ); + } else { + print!( + "{:mem$} | {:>mem$} | {:>mem$} | ", + "", + "", + "", + mem, + size = task_pad, + mem = mem_pad, + ); + } + } else { + print!( + "{:mem$} | {:>mem$} | ", + chunk.owner, + chunk.used_size, + chunk.total_size.iter().sum::(), + size = task_pad, + mem = mem_pad, + ); + } + if i == 0 { + match chunk.recommended { + None => print!("(auto)"), + Some(Recommended::MaxSize(m)) => print!("{}", m), + Some(Recommended::FixedSize(_)) => print!("(fixed)"), + } + } + println!(); + // Only print the header if we're not being verbose + if !verbose { + break; + } } - println!(); // Print padding, if relevant + let chunk_size = chunk.total_size.iter().sum::(); if let Some(&next) = next { - if next != start + chunk.total_size { - print!(" {:#010x} | ", start + chunk.total_size); + if next != start + chunk_size { + print!(" {:#010x} | ", start + chunk_size); + if verbose { + println!( + "{:mem$} | {:>mem$} | {:>mem$} |", + "-padding-", + "--", + next - (start + chunk_size), + "--", + size = task_pad, + mem = mem_pad, + ); + } else { + println!( + "{:mem$} | {:>mem$} | ", + "-padding-", + "--", + next - (start + chunk_size), + size = task_pad, + mem = mem_pad, + ); + } + } + } else { + print!(" {:#010x} | ", start + chunk_size); + if verbose { + println!( + "{:mem$} | {:>mem$} | {:>mem$} | ", + "--end--", + "", + "", + "", + size = task_pad, + mem = mem_pad, + ); + } else { println!( "{:mem$} | {:>mem$} | ", - "-padding-", - "--", - next - (start + chunk.total_size), + "--end--", + "", + "", size = task_pad, mem = mem_pad, ); } - } else { - print!(" {:#010x} | ", start + chunk.total_size); - println!( - "{:mem$} | {:>mem$} | ", - "--end--", - "", - 
"", - size = task_pad, - mem = mem_pad, - ); } } } @@ -371,25 +456,25 @@ pub fn load_task_size<'a>( let r = memory_sizes.entry(region).or_insert_with(|| start..end); r.start = r.start.min(start); r.end = r.end.max(end); - true + Ok(()) } else { - false + bail!("could not find region at {start}"); } }; for phdr in &elf.program_headers { if phdr.p_type != goblin::elf::program_header::PT_LOAD { continue; } - record_size(phdr.p_vaddr, phdr.p_memsz); + record_size(phdr.p_vaddr, phdr.p_memsz)?; // If the VirtAddr disagrees with the PhysAddr, then this is a // section which is relocated into RAM, so we also accumulate // its FileSiz in the physical address (which is presumably // flash). - if phdr.p_vaddr != phdr.p_paddr - && !record_size(phdr.p_paddr, phdr.p_filesz) - { - bail!("Failed to remap relocated section at {}", phdr.p_paddr); + if phdr.p_vaddr != phdr.p_paddr { + record_size(phdr.p_paddr, phdr.p_filesz).with_context(|| { + format!("Failed to remap relocated section at {}", phdr.p_paddr) + })?; } } diff --git a/sys/kern/build.rs b/sys/kern/build.rs index ad78672cd..fd7cbbf51 100644 --- a/sys/kern/build.rs +++ b/sys/kern/build.rs @@ -33,7 +33,20 @@ struct Generated { enum RegionKey { Null, Shared(String), - Owned(usize, String), + Owned { + /// Index of the task, based on ordering in the kconfig + task_index: usize, + + /// Index of this particular region within the task + /// + /// Each task's memory span can be built from multiple contiguous MPU + /// regions; if that's the case, then `chunk_index` varies. The region + /// with `chunk_index == 0` is at the base address. + chunk_index: usize, + + /// Name of memory which we're using for this region + memory_name: String, + }, } fn process_config() -> Result { @@ -77,7 +90,23 @@ fn process_config() -> Result { // Finally, the task-specific regions. for (i, task) in kconfig.tasks.iter().enumerate() { for (name, region) in &task.owned_regions { - region_table.insert(RegionKey::Owned(i, name.clone()), *region); + let mut base = region.base; + for (j, &size) in region.sizes.iter().enumerate() { + let r = RegionConfig { + base, + size, + attributes: region.attributes, + }; + base += size; + region_table.insert( + RegionKey::Owned { + task_index: i, + chunk_index: j, + memory_name: name.clone(), + }, + r, + ); + } } } @@ -95,12 +124,18 @@ fn process_config() -> Result { region_table.get_index_of(&RegionKey::Null).unwrap(), ]; - for name in task.owned_regions.keys() { - regions.push( - region_table - .get_index_of(&RegionKey::Owned(i, name.clone())) - .unwrap(), - ); + for (name, region) in &task.owned_regions { + for j in 0..region.sizes.len() { + regions.push( + region_table + .get_index_of(&RegionKey::Owned { + task_index: i, + chunk_index: j, + memory_name: name.clone(), + }) + .unwrap(), + ); + } } for name in &task.shared_regions { @@ -292,7 +327,14 @@ fn translate_address( task_index: usize, address: OwnedAddress, ) -> u32 { - let key = RegionKey::Owned(task_index, address.region_name); + // Addresses within a particular task's memory span can be calculated from + // the base address of the task's first memory region, which has a + // chunk_index of 0 (since all chunks within the span are contiguous). + let key = RegionKey::Owned { + task_index, + chunk_index: 0, + memory_name: address.region_name, + }; region_table[&key].base + address.offset }