diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index e38a824e9..0eb0ffb71 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -23,4 +23,4 @@ jobs: touch Project.toml julia -O3 --project -e 'import Pkg; Pkg.develop(path="moment_kinetics/"); Pkg.add("NCDatasets"); Pkg.precompile()' # Reduce nstep for each example to 10 to avoid the CI job taking too long - julia -O3 --project -e 'using moment_kinetics; for (root, dirs, files) in walkdir("examples") for file in files if endswith(file, ".toml") filename = joinpath(root, file); println(filename); input = moment_kinetics.moment_kinetics_input.read_input_file(filename); t_input = get(input, "timestepping", Dict{String,Any}()); t_input["nstep"] = 10; t_input["dt"] = 1.0e-12; input["timestepping"] = t_input; pop!(get(input, "z", Dict{String,Any}()), "nelement_local", ""); pop!(get(input, "r", Dict{String,Any}()), "nelement_local", ""); electron_t_input = get(input, "electron_timestepping", Dict{String,Any}()); electron_t_input["initialization_residual_value"] = 1.0e8; electron_t_input["converged_residual_value"] = 1.0e8; input["electron_timestepping"] = electron_t_input; nl_solver_input = get(input, "nonlinear_solver", Dict{String,Any}()); nl_solver_input["rtol"] = 1.0e6; nl_solver_input["atol"] = 1.0e6; input["nonlinear_solver"] = nl_solver_input; run_moment_kinetics(input) end end end' + julia -O3 --project -e 'using moment_kinetics; using moment_kinetics.type_definitions: OptionsDict; for (root, dirs, files) in walkdir("examples") for file in files if endswith(file, ".toml") filename = joinpath(root, file); println(filename); input = moment_kinetics.moment_kinetics_input.read_input_file(filename); t_input = get(input, "timestepping", OptionsDict()); t_input["nstep"] = 10; t_input["dt"] = 1.0e-12; input["timestepping"] = t_input; pop!(get(input, "z", OptionsDict()), "nelement_local", ""); pop!(get(input, "r", OptionsDict()), "nelement_local", ""); electron_t_input = get(input, "electron_timestepping", OptionsDict()); electron_t_input["initialization_residual_value"] = 1.0e8; electron_t_input["converged_residual_value"] = 1.0e8; input["electron_timestepping"] = electron_t_input; nl_solver_input = get(input, "nonlinear_solver", OptionsDict()); nl_solver_input["rtol"] = 1.0e6; nl_solver_input["atol"] = 1.0e6; input["nonlinear_solver"] = nl_solver_input; run_moment_kinetics(input) end end end' diff --git a/.github/workflows/parallel_test.yml b/.github/workflows/parallel_test.yml index 5441dcc82..129063d91 100644 --- a/.github/workflows/parallel_test.yml +++ b/.github/workflows/parallel_test.yml @@ -21,14 +21,15 @@ jobs: - uses: julia-actions/cache@v2 - run: | touch Project.toml - julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.add(["MPI", "MPIPreferences"]); using MPIPreferences; MPIPreferences.use_jll_binary("OpenMPI_jll")' + julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.add(["MPI", "MPIPreferences", "PackageCompiler"]); using MPIPreferences; MPIPreferences.use_jll_binary("OpenMPI_jll")' julia --project -O3 --check-bounds=no -e 'using MPI; MPI.install_mpiexecjl(; destdir=".")' julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.add(["NCDatasets", "Random", "SpecialFunctions", "StatsBase", "Test"]); Pkg.develop(path="moment_kinetics/")' julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.precompile()' + julia --project -O3 --check-bounds=no precompile.jl # Need to use openmpi so that we can use `--oversubscribe` to allow using more MPI ranks than physical 
cores - ./mpiexecjl -np 3 --oversubscribe julia --project -O3 --check-bounds=no moment_kinetics/test/runtests.jl --ci --debug 1 - ./mpiexecjl -np 4 --oversubscribe julia --project -O3 --check-bounds=no moment_kinetics/test/runtests.jl --ci --debug 1 - ./mpiexecjl -np 2 --oversubscribe julia --project -O3 --check-bounds=no moment_kinetics/test/runtests.jl --ci --debug 1 --long + ./mpiexecjl -np 3 --oversubscribe julia -J moment_kinetics.so --project -O3 --check-bounds=no moment_kinetics/test/runtests.jl --ci --debug 1 + ./mpiexecjl -np 4 --oversubscribe julia -J moment_kinetics.so --project -O3 --check-bounds=no moment_kinetics/test/runtests.jl --ci --debug 1 + ./mpiexecjl -np 2 --oversubscribe julia -J moment_kinetics.so --project -O3 --check-bounds=no moment_kinetics/test/runtests.jl --ci --debug 1 --long # Note: MPI.jl's default implementation is mpich, which has a similar option # `--with-device=ch3:sock`, but that needs to be set when compiling mpich. shell: bash @@ -46,12 +47,13 @@ jobs: - uses: julia-actions/cache@v2 - run: | touch Project.toml - julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.add(["MPI", "MPIPreferences"]); using MPIPreferences; MPIPreferences.use_jll_binary("OpenMPI_jll")' + julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.add(["MPI", "MPIPreferences", "PackageCompiler"]); using MPIPreferences; MPIPreferences.use_jll_binary("OpenMPI_jll")' julia --project -O3 --check-bounds=no -e 'using MPI; MPI.install_mpiexecjl(; destdir=".")' julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.add(["NCDatasets", "Random", "SpecialFunctions", "StatsBase", "Test"]); Pkg.develop(path="moment_kinetics/")' julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.precompile()' + julia --project -O3 --check-bounds=no precompile.jl # Need to use openmpi so that we can use `--oversubscribe` to allow using more MPI ranks than physical cores - ./mpiexecjl -np 4 --oversubscribe julia --project -O3 --check-bounds=no moment_kinetics/test/runtests.jl --ci --debug 1 + ./mpiexecjl -np 4 --oversubscribe julia -J moment_kinetics.so --project -O3 --check-bounds=no moment_kinetics/test/runtests.jl --ci --debug 1 # Note: MPI.jl's default implementation is mpich, which has a similar option # `--with-device=ch3:sock`, but that needs to be set when compiling mpich. shell: bash diff --git a/docs/src/developing.md b/docs/src/developing.md index 4b57f1e15..241f4aceb 100644 --- a/docs/src/developing.md +++ b/docs/src/developing.md @@ -41,6 +41,18 @@ String and as a tree of HDF5 variables. use `nothing` as a default for some setting, that is fine, but must be done after the input is read, and not stored in the `input_dict`. +!!! warning "Parallel I/O consistency" + To ensure consistency between all MPI ranks in the order of reads and/or + writes when using Parallel I/O, all dictionary types used to store options + must be either `OrderedDict` or `SortedDict`, so that their order of + entries is deterministic (which is not the case for `Dict`, which instead + optimises for look-up speed). This should mostly be taken care of by using + `moment_kinetics`'s `OptionsDict` type (which is an alias for + `OrderedDict{String,Any}`). We also need to sort the input after it is read + by `TOML`, which is taken care of by + [`moment_kinetics.input_structs.convert_to_sorted_nested_OptionsDict`](@ref). + See also [Parallel I/O](@ref parallel_io_section). + ## Array types @@ -219,6 +231,48 @@ communicator is `comm_block[]`). 
See also notes on debugging the 'anyv' parallelisation: [Collision operator and 'anyv' region](@ref). +## [Parallel I/O](@id parallel_io_section) + +The code provides an option to use parallel I/O, which allows all output to be +written to a single output file even when using distributed-MPI parallelism - +this is the default option when the linked HDF5 library is compiled with +parallel-I/O support. + +There are a few things to be aware of to ensure parallel I/O works correctly: +* Some operations have to be called simultaneously on all the MPI ranks that + have the output file open. Roughly, these are any operations that change the + 'metadata' of the file, for example opening/closing files, creating + variables, extending dimensions of variables, changing attributes of + variables. Reading or writing the data from a variable does not have to be + done collectively - actually when we write data we ensure that every rank + that is writing writes a non-overlapping slice of the array to avoid + contention that could slow down the I/O (because one rank has to wait for + another) and to avoid slight inconsistencies because it is uncertain which + rank writes the data last. For more details see the [HDF5.jl + documentation](https://juliaio.github.io/HDF5.jl/stable/mpi/#Reading-and-writing-data-in-parallel) + and the [HDF5 + documentation](https://support.hdfgroup.org/archive/support/HDF5/doc/RM/CollectiveCalls.html). +* One important subtlety is that the `Dict` type does not guarantee a + deterministic order of entries. When you iterate over a `Dict`, you can get + the results in a different order at different times or on different MPI + ranks. If we iterated over a `Dict` to create variables to write to an output + file, or to read from a file, then different MPI ranks might (sometimes) get + the variables in a different order, causing errors. We therefore use either + `OrderedDict` or `SortedDict` types for anything that might be written to or + read from an HDF5 file. + +If the collective operations are not done perfectly consistently, the errors +can be extremely non-obvious. The inconsistent operations may appear to execute +correctly, for example because the same number of variables are created, and +the metadata may only actually be written from the rank-0 process, but the +inconsistency may cause errors later. [JTO, 3/11/2024: my best guess as to the +reason for this is that it puts HDF5's 'metadata cache' in inconsistent states +on different ranks, and this means that at some later time the ranks will cycle +some metadata out of the cache in different orders, and then some ranks will be +able to get the metadata from the cache, while others have to read it from the +file. The reading from the file requires some collective MPI call, which is +only called from some ranks and not others, causing the code to hang.] 
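+
+As a minimal illustration of the ordering issue described in the second bullet
+above (this sketch is not code from `moment_kinetics` itself; it only assumes
+the `OptionsDict` alias for `OrderedDict{String,Any}` mentioned earlier):
+
+```julia
+using OrderedCollections: OrderedDict
+
+# Illustrative alias matching the `OptionsDict` described above.
+const OptionsDict = OrderedDict{String,Any}
+
+# An OrderedDict iterates in insertion order, so every MPI rank that builds the
+# same options in the same order also iterates over them in the same order.
+options = OptionsDict("timestepping" => OptionsDict("nstep" => 10),
+                      "z" => OptionsDict("nelement" => 4))
+println(collect(keys(options)))   # always ["timestepping", "z"]
+
+# A plain Dict makes no ordering guarantee, so two ranks holding equal Dicts
+# could still iterate over their entries in different orders.
+plain = Dict("timestepping" => Dict("nstep" => 10), "z" => Dict("nelement" => 4))
+println(collect(keys(plain)))     # order is an implementation detail
+```
+
+If variables were created in the `options` order on one rank and in the `plain`
+order on another, the collective HDF5 calls would no longer match up, which is
+exactly the failure mode described above.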
+ ## Package structure The structure of the packages in the `moment_kinetics` repo is set up so that diff --git a/makie_post_processing/makie_post_processing/src/makie_post_processing.jl b/makie_post_processing/makie_post_processing/src/makie_post_processing.jl index b881ddfa7..0652d06c6 100644 --- a/makie_post_processing/makie_post_processing/src/makie_post_processing.jl +++ b/makie_post_processing/makie_post_processing/src/makie_post_processing.jl @@ -8429,7 +8429,7 @@ end """ timing_data(run_info; plot_prefix=nothing, threshold=nothing, include_patterns=nothing, exclude_patterns=nothing, ranks=nothing, - figsize=nothing, interactive_figs=nothing, include_legend=true) + figsize=nothing, include_legend=true) Plot timings from different parts of the `moment_kinetics` code. Only timings from function calls during the time evolution loop are included, not from the setup, because we @@ -8463,11 +8463,11 @@ set in `this_input_dict`. When this function is called as part of the settings are read from the post processing input file (by default `post_processing_input.toml`). The function arguments take precedence, if they are given. -If you load GLMakie (by doing `using GLMakie`) before running this function, -`interactive_figs` can be passed one of, or a Vector of, `:times`, `:ncalls`, or `:allocs` -to display an interactive window showing that figure. Multiple values can be passed, but -the DataInspector (which displays information when you hover over the figure) will only -work for the first one in the list (this may be a GLMakie bug?). +If you load GLMakie by doing `using GLMakie` before running this function, but after +calling `using makie_post_processing` (because `CairoMakie` is loaded when the module is +loaded and would take over if you load `GLMakie` before `makie_post_processing`), the +figures will be displayed in interactive windows. When you hover over a line some useful +information will be displayed. Pass `include_legend=false` to remove legends from the figures. This is mostly useful for interactive figures where hovering over the lines can show what they are, so that the @@ -8475,8 +8475,7 @@ legend is not needed. 
""" function timing_data(run_info::Tuple; plot_prefix=nothing, threshold=nothing, include_patterns=nothing, exclude_patterns=nothing, ranks=nothing, - this_input_dict=nothing, figsize=nothing, interactive_figs=nothing, - include_legend=true) + this_input_dict=nothing, figsize=nothing, include_legend=true) if this_input_dict !== nothing input = Dict_to_NamedTuple(this_input_dict["timing_data"]) @@ -8515,40 +8514,31 @@ function timing_data(run_info::Tuple; plot_prefix=nothing, threshold=nothing, ncalls_ax=ncalls_ax, allocs_ax=allocs_ax, irun=irun, figsize=figsize) end - if interactive_figs === nothing && string(Makie.current_backend()) == "GLMakie" + if string(Makie.current_backend()) == "GLMakie" # Can make interactive plots - if isa(interactive_figs, Symbol) - interactive_figs = [interactive_figs] - end backend = Makie.current_backend() - for figtype ∈ interactive_figs - if figtype == :times - if include_legend - Legend(times_fig[2,1], times_ax; tellheight=true, tellwidth=false, - merge=true) - end - DataInspector(times_fig) - display(backend.Screen(), times_fig) - elseif figtype == :ncalls - if include_legend - Legend(ncalls_fig[2,1], times_ax; tellheight=true, tellwidth=false, - merge=true) - end - DataInspector(ncalls_fig) - display(backend.Screen(), ncalls_fig) - elseif figtype == :allocs - if include_legend - Legend(allocs_fig[2,1], times_ax; tellheight=true, tellwidth=false, - merge=true) - end - DataInspector(allocs_fig) - display(backend.Screen(), allocs_fig) - else - error("Got unrecognized entry $figtype in `interactive_figs`") - end + if include_legend + Legend(times_fig[2,1], times_ax; tellheight=true, tellwidth=false, + merge=true) + end + DataInspector(times_fig) + display(backend.Screen(), times_fig) + + if include_legend + Legend(ncalls_fig[2,1], times_ax; tellheight=true, tellwidth=false, + merge=true) end + DataInspector(ncalls_fig) + display(backend.Screen(), ncalls_fig) + + if include_legend + Legend(allocs_fig[2,1], times_ax; tellheight=true, tellwidth=false, + merge=true) + end + DataInspector(allocs_fig) + display(backend.Screen(), allocs_fig) elseif plot_prefix !== nothing if include_legend Legend(times_fig[2,1], times_ax; tellheight=true, tellwidth=true, merge=true) @@ -8587,7 +8577,7 @@ end function timing_data(run_info; plot_prefix=nothing, threshold=nothing, include_patterns=nothing, exclude_patterns=nothing, ranks=nothing, this_input_dict=nothing, times_ax=nothing, ncalls_ax=nothing, - allocs_ax=nothing, irun=1, figsize=nothing, interactive_figs=nothing, + allocs_ax=nothing, irun=1, figsize=nothing, include_legend=true) if this_input_dict !== nothing @@ -8658,143 +8648,155 @@ function timing_data(run_info; plot_prefix=nothing, threshold=nothing, end linestyles = linestyle=[:solid, :dash, :dot, :dashdot, :dashdotdot] - for irank ∈ 0:run_info.nrank-1 - all_global_timer_variables = run_info.timing_variable_names[irank] - time_advance_timer_variables = [v for v ∈ all_global_timer_variables if occursin("time_advance! 
step", v)] - time_variables = [v for v ∈ time_advance_timer_variables if startswith(v, "time:")] - ncalls_variables = [v for v ∈ time_advance_timer_variables if startswith(v, "ncalls:")] - allocs_variables = [v for v ∈ time_advance_timer_variables if startswith(v, "allocs:")] - - this_rank_group = "timing_data/rank$irank" - - inspector_label_func = (self,i,p) -> "$(self.label[]) $irank\nx: $(p[1])\ny: $(p[2])" - - function label_irank(ax, variable, color, unit_conversion=1) - if run_info.nrank > 1 - # Label curves with irank so we can tell which is which - index = ((irank + 1) % (length(variable) - 1)) + 1 - with_theme - text!(ax, run_info.time[index], - variable[index] * unit_conversion; - text="$irank", color=color) - end - end - - function check_include_exclude(variable_name) - explicitly_included = (include_patterns !== nothing && - any(occursin(p, variable_name) for p ∈ include_patterns)) - if exclude_patterns === nothing && include_patterns !== nothing - excluded = !explicitly_included - elseif exclude_patterns !== nothing - if !explicitly_included && - any(occursin(p, variable_name) for p ∈ exclude_patterns) - excluded = true - else - excluded = false - end + time_advance_timer_variables = [v for v ∈ run_info.timing_variable_names if occursin("time_advance! step", v)] + time_variables = [v for v ∈ time_advance_timer_variables if startswith(v, "time:")] + ncalls_variables = [v for v ∈ time_advance_timer_variables if startswith(v, "ncalls:")] + allocs_variables = [v for v ∈ time_advance_timer_variables if startswith(v, "allocs:")] + + timing_group = "timing_data" + + function label_irank(ax, variable, irank, color, unit_conversion=1) + if run_info.nrank > 1 + # Label curves with irank so we can tell which is which + index = ((irank + 1) % (length(variable) - 1)) + 1 + with_theme + text!(ax, run_info.time[index], + variable[index] * unit_conversion; + text="$irank", color=color) + end + end + + function check_include_exclude(variable_name) + explicitly_included = (include_patterns !== nothing && + any(occursin(p, variable_name) for p ∈ include_patterns)) + if exclude_patterns === nothing && include_patterns !== nothing + excluded = !explicitly_included + elseif exclude_patterns !== nothing + if !explicitly_included && + any(occursin(p, variable_name) for p ∈ exclude_patterns) + excluded = true else excluded = false end - return excluded, explicitly_included + else + excluded = false end + return excluded, explicitly_included + end - # Plot the total time - time_unit_conversion = 1.0e-9 # ns to s - total_time_variable_name = "time:moment_kinetics;time_advance! step" - total_time = get_variable(run_info, total_time_variable_name * "_per_step", - group=this_rank_group) + # Plot the total time + time_unit_conversion = 1.0e-9 # ns to s + total_time_variable_name = "time:moment_kinetics;time_advance! step" + total_time = get_variable(run_info, total_time_variable_name * "_per_step", + group=timing_group) + for irank ∈ 0:run_info.nrank-1 label = "time_advance! 
step" - lines!(times_ax, run_info.time, total_time .* time_unit_conversion; color=:black, - linestyle=linestyles[irun], label=label, - inspector_label=inspector_label_func) - label_irank(times_ax, total_time, :black, time_unit_conversion) - mean_total_time = mean(total_time) - for (variable_counter, variable_name) ∈ enumerate(time_variables) - if variable_name == total_time_variable_name - # Plotted this already - continue - end - excluded, explicitly_included = check_include_exclude(variable_name) - if excluded - continue - end - variable = get_variable(run_info, variable_name * "_per_step", - group=this_rank_group) - if !explicitly_included && mean(variable) < threshold * mean_total_time - # This variable takes a very small amount of time, so skip. - continue - end + irank_slice = total_time[irank+1,:] + lines!(times_ax, run_info.time, irank_slice .* time_unit_conversion; + color=:black, linestyle=linestyles[irun], label=label, + inspector_label=(self,i,p) -> "$(self.label[]) $irank\nx: $(p[1])\ny: $(p[2])") + label_irank(times_ax, irank_slice, irank, :black, time_unit_conversion) + end + mean_total_time = mean(total_time) + for (variable_counter, variable_name) ∈ enumerate(time_variables) + if variable_name == total_time_variable_name + # Plotted this already + continue + end + excluded, explicitly_included = check_include_exclude(variable_name) + if excluded + continue + end + variable = get_variable(run_info, variable_name * "_per_step", + group=timing_group) + if !explicitly_included && mean(variable) < threshold * mean_total_time + # This variable takes a very small amount of time, so skip. + continue + end + for irank ∈ 0:run_info.nrank-1 label = split(variable_name, "time_advance! step;")[2] - l = lines!(times_ax, run_info.time, variable .* time_unit_conversion; + irank_slice = variable[irank+1,:] + l = lines!(times_ax, run_info.time, irank_slice .* time_unit_conversion; color=Cycled(variable_counter), linestyle=linestyles[irun], - label=label, inspector_label=inspector_label_func) - label_irank(times_ax, variable, l.color, time_unit_conversion) + label=label, inspector_label=(self,i,p) -> "$(self.label[]) $irank\nx: $(p[1])\ny: $(p[2])") + label_irank(times_ax, irank_slice, irank, l.color, time_unit_conversion) end + end - # Plot the number of calls - total_ncalls_variable_name = "ncalls:moment_kinetics;time_advance! step" - total_ncalls = get_variable(run_info, total_ncalls_variable_name * "_per_step", - group=this_rank_group) + # Plot the number of calls + total_ncalls_variable_name = "ncalls:moment_kinetics;time_advance! step" + total_ncalls = get_variable(run_info, total_ncalls_variable_name * "_per_step", + group=timing_group) + for irank ∈ 0:run_info.nrank-1 label = "time_advance! 
step" - lines!(ncalls_ax, run_info.time, total_ncalls; color=:black, + irank_slice = total_ncalls[irank+1,:] + lines!(ncalls_ax, run_info.time, irank_slice; color=:black, linestyle=linestyles[irun], label=label, - inspector_label=inspector_label_func) - label_irank(ncalls_ax, total_ncalls, :black) - mean_total_ncalls = mean(total_ncalls) - for (variable_counter, variable_name) ∈ enumerate(ncalls_variables) - if variable_name == total_ncalls_variable_name - # Plotted this already - continue - end - excluded, explicitly_included = check_include_exclude(variable_name) - if excluded - continue - end - variable = get_variable(run_info, variable_name * "_per_step", - group=this_rank_group) - if !explicitly_included && mean(variable) < threshold * mean_total_ncalls - # This variable takes a very small number of calls, so skip. - continue - end + inspector_label=(self,i,p) -> "$(self.label[]) $irank\nx: $(p[1])\ny: $(p[2])") + label_irank(ncalls_ax, irank_slice, irank, :black) + end + mean_total_ncalls = mean(total_ncalls) + for (variable_counter, variable_name) ∈ enumerate(ncalls_variables) + if variable_name == total_ncalls_variable_name + # Plotted this already + continue + end + excluded, explicitly_included = check_include_exclude(variable_name) + if excluded + continue + end + variable = get_variable(run_info, variable_name * "_per_step", + group=timing_group) + if !explicitly_included && mean(variable) < threshold * mean_total_ncalls + # This variable takes a very small number of calls, so skip. + continue + end + for irank ∈ 0:run_info.nrank-1 label = split(variable_name, "time_advance! step;")[2] - l = lines!(ncalls_ax, run_info.time, variable; color=Cycled(variable_counter), - linestyle=linestyles[irun], label=label, - inspector_label=inspector_label_func) - label_irank(ncalls_ax, variable, l.color) + irank_slice = variable[irank+1,:] + l = lines!(ncalls_ax, run_info.time, irank_slice; + color=Cycled(variable_counter), linestyle=linestyles[irun], + label=label, inspector_label=(self,i,p) -> "$(self.label[]) $irank\nx: $(p[1])\ny: $(p[2])") + label_irank(ncalls_ax, irank_slice, irank, l.color) end + end - # Plot the total allocs - allocs_unit_conversion = 2^(-20) # bytes to MB - total_allocs_variable_name = "allocs:moment_kinetics;time_advance! step" - total_allocs = get_variable(run_info, total_allocs_variable_name * "_per_step", - group=this_rank_group) + # Plot the total allocs + allocs_unit_conversion = 2^(-20) # bytes to MB + total_allocs_variable_name = "allocs:moment_kinetics;time_advance! step" + total_allocs = get_variable(run_info, total_allocs_variable_name * "_per_step", + group=timing_group) + for irank ∈ 0:run_info.nrank-1 label = "time_advance! 
step" - lines!(allocs_ax, run_info.time, total_allocs .* allocs_unit_conversion; + irank_slice = total_allocs[irank+1,:] + lines!(allocs_ax, run_info.time, irank_slice .* allocs_unit_conversion; color=:black, linestyle=linestyles[irun], label=label, - inspector_label=inspector_label_func) - label_irank(allocs_ax, total_allocs, :black, allocs_unit_conversion) - mean_total_allocs = mean(total_allocs) - for (variable_counter, variable_name) ∈ enumerate(allocs_variables) - if variable_name == total_allocs_variable_name - # Plotted this already - continue - end - excluded, explicitly_included = check_include_exclude(variable_name) - if excluded - continue - end - variable = get_variable(run_info, variable_name * "_per_step", - group=this_rank_group) - if !explicitly_included && mean(variable) < threshold * mean_total_allocs - # This variable represents a very small amount of allocs, so skip. - continue - end + inspector_label=(self,i,p) -> "$(self.label[]) $irank\nx: $(p[1])\ny: $(p[2])") + label_irank(allocs_ax, irank_slice, irank, :black, allocs_unit_conversion) + end + mean_total_allocs = mean(total_allocs) + for (variable_counter, variable_name) ∈ enumerate(allocs_variables) + if variable_name == total_allocs_variable_name + # Plotted this already + continue + end + excluded, explicitly_included = check_include_exclude(variable_name) + if excluded + continue + end + variable = get_variable(run_info, variable_name * "_per_step", + group=timing_group) + if !explicitly_included && mean(variable) < threshold * mean_total_allocs + # This variable represents a very small amount of allocs, so skip. + continue + end + for irank ∈ 0:run_info.nrank-1 label = split(variable_name, "time_advance! step;")[2] - l = lines!(allocs_ax, run_info.time, variable .* allocs_unit_conversion; + irank_slice = variable[irank+1,:] + l = lines!(allocs_ax, run_info.time, irank_slice .* allocs_unit_conversion; color=Cycled(variable_counter), linestyle=linestyles[irun], - label=label, inspector_label=inspector_label_func) - label_irank(allocs_ax, variable, l.color, - allocs_unit_conversion) + label=label, inspector_label=(self,i,p) -> "$(self.label[]) $irank\nx: $(p[1])\ny: $(p[2])") + label_irank(allocs_ax, irank_slice, irank, l.color, allocs_unit_conversion) end end @@ -8803,37 +8805,28 @@ function timing_data(run_info; plot_prefix=nothing, threshold=nothing, # Can make interactive plots - if isa(interactive_figs, Symbol) - interactive_figs = [interactive_figs] - end backend = Makie.current_backend() - for figtype ∈ interactive_figs - if figtype == :times - if include_legend - Legend(times_fig[2,1], times_ax; tellheight=true, tellwidth=false, - merge=true) - end - DataInspector(times_fig) - display(backend.Screen(), times_fig) - elseif figtype == :ncalls - if include_legend - Legend(ncalls_fig[2,1], times_ax; tellheight=true, tellwidth=false, - merge=true) - end - DataInspector(ncalls_fig) - display(backend.Screen(), ncalls_fig) - elseif figtype == :allocs - if include_legend - Legend(allocs_fig[2,1], times_ax; tellheight=true, tellwidth=false, - merge=true) - end - DataInspector(allocs_fig) - display(backend.Screen(), allocs_fig) - else - error("Got unrecognized entry $figtype in `interactive_figs`") - end + if include_legend + Legend(times_fig[2,1], times_ax; tellheight=true, tellwidth=false, + merge=true) + end + DataInspector(times_fig) + display(backend.Screen(), times_fig) + + if include_legend + Legend(ncalls_fig[2,1], times_ax; tellheight=true, tellwidth=false, + merge=true) + end + DataInspector(ncalls_fig) + 
display(backend.Screen(), ncalls_fig) + + if include_legend + Legend(allocs_fig[2,1], times_ax; tellheight=true, tellwidth=false, + merge=true) end + DataInspector(allocs_fig) + display(backend.Screen(), allocs_fig) else if times_fig !== nothing if include_legend diff --git a/moment_kinetics/src/coordinates.jl b/moment_kinetics/src/coordinates.jl index 0f80b4992..bf1c45416 100644 --- a/moment_kinetics/src/coordinates.jl +++ b/moment_kinetics/src/coordinates.jl @@ -20,6 +20,7 @@ using ..input_structs using ..moment_kinetics_structs: null_spatial_dimension_info, null_velocity_dimension_info using MPI +using OrderedCollections: OrderedDict """ structure containing basic information related to coordinates @@ -204,7 +205,7 @@ function get_coordinate_input(input_dict, name; ignore_MPI=false) if input_dict === nothing boundary_parameters = nothing else - boundary_parameters_defaults = Dict{Symbol,Any}() + boundary_parameters_defaults = OrderedDict{Symbol,Any}() if name == "z" # parameter controlling the cutoff of the ion distribution function in the vpa # domain at the wall in z diff --git a/moment_kinetics/src/external_sources.jl b/moment_kinetics/src/external_sources.jl index 6fbd35a96..2eeeb3ac1 100644 --- a/moment_kinetics/src/external_sources.jl +++ b/moment_kinetics/src/external_sources.jl @@ -31,6 +31,7 @@ using ..timer_utils using ..velocity_moments: get_density using MPI +using OrderedCollections: OrderedDict """ setup_external_sources!(input_dict, r, z) @@ -156,10 +157,14 @@ function setup_external_sources!(input_dict, r, z, electron_physics) * "density_midpoint_control, energy, alphas, alphas-with-losses, " * "beam, beam-with-losses") end - return ion_source_data(; Dict(Symbol(k)=>v for (k,v) ∈ input)..., r_amplitude, - z_amplitude=z_amplitude, PI_density_target=PI_density_target, - PI_controller_amplitude, controller_source_profile, - PI_density_target_ir, PI_density_target_iz, PI_density_target_rank) + return ion_source_data(; OrderedDict(Symbol(k)=>v for (k,v) ∈ input)..., + r_amplitude=r_amplitude, z_amplitude=z_amplitude, + PI_density_target=PI_density_target, + PI_controller_amplitude=PI_controller_amplitude, + controller_source_profile=controller_source_profile, + PI_density_target_ir=PI_density_target_ir, + PI_density_target_iz=PI_density_target_iz, + PI_density_target_rank=PI_density_target_rank) end function get_settings_neutrals(source_index, active_flag) @@ -313,10 +318,14 @@ function setup_external_sources!(input_dict, r, z, electron_physics) * "beam, beam-with-losses, recycling (for neutrals only)") end - return neutral_source_data(; Dict(Symbol(k)=>v for (k,v) ∈ input)..., r_amplitude, - z_amplitude=z_amplitude, PI_density_target=PI_density_target, - PI_controller_amplitude, controller_source_profile, - PI_density_target_ir, PI_density_target_iz, PI_density_target_rank) + return neutral_source_data(; OrderedDict(Symbol(k)=>v for (k,v) ∈ input)..., + r_amplitude=r_amplitude, z_amplitude=z_amplitude, + PI_density_target=PI_density_target, + PI_controller_amplitude=PI_controller_amplitude, + controller_source_profile=controller_source_profile, + PI_density_target_ir=PI_density_target_ir, + PI_density_target_iz=PI_density_target_iz, + PI_density_target_rank=PI_density_target_rank) end function get_settings_electrons(i, ion_settings) # Note most settings for the electron source are copied from the ion source, @@ -339,9 +348,12 @@ function setup_external_sources!(input_dict, r, z, electron_physics) end input["source_strength"] = ion_settings.source_strength end - return 
electron_source_data(input["source_strength"], input["source_T"], - ion_settings.active, ion_settings.r_amplitude, - ion_settings.z_amplitude, ion_settings.source_type) + return electron_source_data(source_strength=input["source_strength"], + source_T=input["source_T"], + active=ion_settings.active, + r_amplitude=ion_settings.r_amplitude, + z_amplitude=ion_settings.z_amplitude, + source_type=ion_settings.source_type) end # put all ion sources into ion_source_data struct vector diff --git a/moment_kinetics/src/file_io.jl b/moment_kinetics/src/file_io.jl index 227d6eb66..5630aaf6d 100644 --- a/moment_kinetics/src/file_io.jl +++ b/moment_kinetics/src/file_io.jl @@ -15,7 +15,7 @@ using ..debugging using ..input_structs using ..looping using ..timer_utils -using ..timer_utils: timer_names_per_rank_moments, timer_names_per_rank_dfns, +using ..timer_utils: timer_names_all_ranks_moments, timer_names_all_ranks_dfns, TimerNamesDict, SortedDict using ..moment_kinetics_structs: scratch_pdf, em_fields_struct using ..type_definitions: mk_float, mk_int @@ -434,29 +434,6 @@ function setup_file_io(io_input, boundary_distributions, vz, vr, vzeta, vpa, vpe external_source_settings, input_dict, restart_time_index, previous_runs_info, time_for_setup, t_params, nl_solver_params) - # A Dict to store the names of timers that have been created on each MPI rank - if block_rank[] == 0 - if io_input.parallel_io - # Need timer name for every rank, as variable creation and extension are - # collective operations. - for irank ∈ 0:global_size[]-1 - timer_names_per_rank_moments[irank] = (TimerNamesDict(), Ref(0)) - timer_names_per_rank_dfns[irank] = (TimerNamesDict(), Ref(0)) - end - else - # Timers only written from their own block, so only names from this block are - # needed. - for irank_in_block ∈ 0:block_size[]-1 - this_global_rank = global_rank[] + irank_in_block - timer_names_per_rank_moments[this_global_rank] = (TimerNamesDict(), Ref(0)) - timer_names_per_rank_dfns[this_global_rank] = (TimerNamesDict(), Ref(0)) - end - end - else - timer_names_per_rank_moments[global_rank[]] = (TimerNamesDict(), Ref(0)) - timer_names_per_rank_dfns[global_rank[]] = (TimerNamesDict(), Ref(0)) - end - begin_serial_region() @serial_region begin # Only read/write from first process in each 'block' @@ -494,8 +471,8 @@ function setup_file_io(io_input, boundary_distributions, vz, vr, vzeta, vpa, vpe return ascii, io_moments, io_dfns end - # For other processes in the block, return (nothing, nothing, nothing) - return nothing, nothing, nothing + # For other processes in the block, return objects with just the input. + return nothing, (io_input=io_input,), (io_input=io_input,) end """ @@ -605,8 +582,8 @@ function setup_electron_io(io_input, vpa, vperp, z, r, composition, collisions, return file_info end - # For other processes in the block, return nothing - return nothing + # For other processes in the block, return an object with just the input. + return (io_input=io_input,) end """ @@ -711,6 +688,8 @@ Write a single variable to a file or group. If a description or units are passed attributes of the variable. If `overwrite=true` is passed, overwrite the variable if it already exists in the file. +Note that when overwriting a `String` variable, the new `String` must have exactly the +same length as the original `String`. """ function write_single_value! 
end @@ -1114,9 +1093,6 @@ function define_dynamic_moment_variables!(fid, n_ion_species, n_neutral_species, dynamic = create_io_group(fid, "dynamic_data", description="time evolving variables") timing = create_io_group(fid, "timing_data", description="timing data to check run-time performance") - for i ∈ 0:global_size[]-1 - create_io_group(timing, "rank$i", description="timing data for MPI rank $i") - end io_time = create_dynamic_variable!(dynamic, "time", mk_float; parallel_io=parallel_io, description="simulation time") @@ -2150,8 +2126,9 @@ function setup_moments_io(prefix, io_input, vz, vr, vzeta, vpa, vperp, r, z, return file_info end - # For processes other than the root process of each shared-memory group... - return nothing + # Should not be called by processes other than the root process of each shared-memory + # group... + error("setup_moments_io() called by non-block-root block_rank[]=$(block_rank[])") end """ @@ -2253,8 +2230,9 @@ function reopen_moments_io(file_info) getvar("nl_solver_diagnostics"), io_input) end - # For processes other than the root process of each shared-memory group... - return nothing + # Should not be called by processes other than the root process of each shared-memory + # group... + error("reopen_moments_io() called by non-block-root block_rank[]=$(block_rank[])") end """ @@ -2309,8 +2287,9 @@ function setup_dfns_io(prefix, io_input, boundary_distributions, r, z, vperp, vp return file_info end - # For processes other than the root process of each shared-memory group... - return nothing + # Should not be called by processes other than the root process of each shared-memory + # group... + error("setup_dfns_io() called by non-block-root block_rank[]=$(block_rank[])") end """ @@ -2425,8 +2404,9 @@ function reopen_dfns_io(file_info) getvar("f_neutral_start_last_timestep"), io_input, io_moments) end - # For processes other than the root process of each shared-memory group... - return nothing + # Should not be called by processes other than the root process of each shared-memory + # group... + error("reopen_dfns_io() called by non-block-root block_rank[]=$(block_rank[])") end """ @@ -2464,7 -2444,7 @@ file io_or_file_info_moments, t_idx, time_for_run, t_params, nl_solver_params, r, z, dfns=false) = begin - io_moments = nothing + io_moments = io_or_file_info_moments @serial_region begin # Only read/write from first process in each 'block' @@ -2535,56 +2515,60 @@ function write_timing_data(io_moments, t_idx, dfns=false) # hard-coded list of every timer (which would make it inconvenient to add new timers # or debug timers). # - # As well, when using parallel I/O all processes (in the communicator that opens the - # output file) must create and extend variables, although it's OK for only one process - # to write a variable if the variable is a scalar. - # - # These two facts mean that every process in `comm_inter_block[]` needs to know the - # complete list of timers on every rank, so we have to gather this list. The list is - # stored in `timer_names_per_rank_moments` and `timer_names_per_rank_dfns`. We store - # two separate lists because moments and dfns might not be written at the same times, - # so some timers may be newly added in one but not the other, and so need to be - # tracked separately. + # It is most convenient for every process in `comm_world` to know the complete list + # of timers that exist on any rank, so we have to gather this list. The list is + # stored in `timer_names_all_ranks_moments` and `timer_names_all_ranks_dfns`.
We + # store two separate lists because moments and dfns might not be written at the same + # times, so some timers may be newly added in one but not the other, and so need to + # be tracked separately. # - # In order to make sure that every process in `comm_inter_block[]` deals with the same - # variable at the same time, we store the variable names for each rank in a nested - # SortedDict (with the same nesting structure as the `global_timer` on each rank). The - # sort applied to the keys of a SortedDict means that we can be sure to iterate - # through the names in the same order on every process. The names for every rank are - # collected onto each process in `comm_inter_block[]`, but on the rest of the - # processes, only the names from that individual process are collected (as 'the rest' - # do not write data, so do not need information from other processes). + # In order to make sure that every process deals with the same variable at the same + # time, we store the variable names for each rank in a nested SortedDict (with the + # same nesting structure as the `global_timer` on each rank). The sort applied to + # the keys of a SortedDict means that we can be sure to iterate through the names in + # the same order on every process. # - # Only the process in the same shared-memory block as `irank` collects the timing data - # from `irank` and writes it to file. + # The timing data for every process in a shared-memory block is collected to the + # root process of the block to be written to file. # # In order to avoid communicating many strings at every output step, we first check - # how many timers there are on each rank, and compare that to the number in the - # collected list. The number of new variables is gathered, and if there are no new - # variables, no communication of variable names needs to be done. If communication is - # needed, only the names of the new variables need to be gathered. + # how new timers there are on each rank, that are not in the global list of timers. + # The number of new variables is gathered, and if there are no new variables, no + # communication of variable names needs to be done. If communication is needed, only + # the names of the new variables need to be gathered. # # Once the lists of variable names have been updated, the timing data is gathered onto # the root process of each shared-memory block. The data is communicated as vectors of # integers, with the order of the entries being determined by the order of the nested # SortedDict. # - # Each entry in `timer_names_per_rank_moments` and `timer_names_per_rank_dfns` is a - # SortedDict, even if it does not yet contain any other entries, because sub-timers - # might be added at any point. + # Each entry in `timer_names_all_ranks_moments` and `timer_names_all_ranks_dfns` is + # a SortedDict, even if it does not yet contain any other entries, because + # sub-timers might be added at any point. if dfns - timer_names_per_rank = timer_names_per_rank_dfns + timer_names_all_ranks = timer_names_all_ranks_dfns else - timer_names_per_rank = timer_names_per_rank_moments + timer_names_all_ranks = timer_names_all_ranks_moments + end + + # Collect the names that have not been used on any rank before this call. + unique_new_names = String[] + + if block_rank[] == 0 + io_group = get_group(io_moments.fid, "timing_data") end # Find any new timer names on this process. 
- function get_new_timer_names(timer_names_dict) + function get_new_timer_names() new_timer_names = String[] function get_names_inner(this_timer, timer_names_subdict, prefix) names_subdict_keys = keys(timer_names_subdict) - for k ∈ keys(this_timer.inner_timers) + inner_timers_keys = collect(keys(this_timer.inner_timers)) + # Sort keys to ensure that the list created by this function has a + # deterministic order. + sort!(inner_timers_keys) + for k ∈ inner_timers_keys this_name = prefix == "" ? k : prefix * ";" * k if k ∉ names_subdict_keys push!(new_timer_names, this_name) @@ -2596,49 +2580,29 @@ function write_timing_data(io_moments, t_idx, dfns=false) end return nothing end - get_names_inner(global_timer, timer_names_dict, "") + get_names_inner(global_timer, timer_names_all_ranks, "") return new_timer_names end - # Calculate the total number of entries in a nested Dict. This is used to store the - # total size of each entry of `timer_names_per_rank_moments` and - # `timer_names_per_rank_dfns` so that we know the size of the arrays of data that need - # to be communicated. - function get_size_of_timer_dict(this_dict) - # One count for the timer represented by this_dict. - counter = 1 - - # Recursively iterate through all contained Dicts, and add them to the counter. - for sub_dict ∈ values(this_dict) - counter += get_size_of_timer_dict(sub_dict) - end - return counter - end - - # Add the new timer names in to `timer_names_per_rank_moments` or - # `timer_names_per_rank_dfns`. - function add_new_timer_names!(timer_names, new_timer_names, irank=nothing) + # Add the new timer names in to `timer_names_all_ranks_moments` or + # `timer_names_all_ranks_dfns`. Note that there may be duplicate names in + # new_timer_names, and these will be ignored. + function add_new_timer_names!(new_timer_names) for n ∈ new_timer_names - if irank === nothing - this_rank, this_name = split(n, ":") - this_rank = parse(mk_int, this_rank) - this_dict, _ = timer_names_per_rank[this_rank] - else - this_name = n - this_dict, _ = timer_names_per_rank[irank] - end - split_name = split(this_name, ";") - for level ∈ split_name - if level ∉ keys(this_dict) - this_dict[level] = TimerNamesDict() + this_dict_all_ranks = timer_names_all_ranks + split_name = split(n, ";") + n_levels = length(split_name) + for (i, level) ∈ enumerate(split_name) + if level ∉ keys(this_dict_all_ranks) + this_dict_all_ranks[level] = TimerNamesDict() + if i == n_levels + # New variable that has not been used on any rank before. + push!(unique_new_names, n) + end end - this_dict = this_dict[level] + this_dict_all_ranks = this_dict_all_ranks[level] end end - for (timer_names_dict, timer_names_dict_size) ∈ values(timer_names_per_rank) - # Subtract 1 because we do not want to count the 'top level' as an entry. - timer_names_dict_size[] = get_size_of_timer_dict(timer_names_dict) - 1 - end return nothing end @@ -2646,13 +2610,14 @@ function write_timing_data(io_moments, t_idx, dfns=false) # simultaneously by all processes in `comm_inter_block[]`. 
function create_new_timer_io_variables!(new_timer_names, timer_group, parallel_io) for n ∈ new_timer_names - this_rank, this_name = split(n, ":") - rank_group = get_group(timer_group, "rank$this_rank") - create_dynamic_variable!(rank_group, "time:" * this_name, mk_int; + create_dynamic_variable!(io_group, "time:" * n, mk_int, + (name="rank", n=global_size[]); parallel_io=parallel_io) - create_dynamic_variable!(rank_group, "ncalls:" * this_name, mk_int; + create_dynamic_variable!(io_group, "ncalls:" * n, mk_int, + (name="rank", n=global_size[]); parallel_io=parallel_io) - create_dynamic_variable!(rank_group, "allocs:" * this_name, mk_int; + create_dynamic_variable!(io_group, "allocs:" * n, mk_int, + (name="rank", n=global_size[]); parallel_io=parallel_io) end return nothing @@ -2661,305 +2626,167 @@ function write_timing_data(io_moments, t_idx, dfns=false) # Once all the names are known, use this function to collect all the data from timers # on this process into arrays to be communicated. function get_data_from_timers() - sorted_timer_names = timer_names_per_rank[global_rank[]][1] times = mk_int[] ncalls = mk_int[] allocs = mk_int[] function walk_through_timers(names_dict, timer) - push!(times, timer.accumulated_data.time) - push!(ncalls, timer.accumulated_data.ncalls) - push!(allocs, timer.accumulated_data.allocs) + if timer === nothing + # Timer not found on this rank, so set to 0 + push!(times, 0.0) + push!(ncalls, 0.0) + push!(allocs, 0.0) + else + push!(times, timer.accumulated_data.time) + push!(ncalls, timer.accumulated_data.ncalls) + push!(allocs, timer.accumulated_data.allocs) + end # Note that here we have to get the order of entries from names_dict (which is # a SortedDict) to ensure that the order of each list is consistent between # all different processes. for (sub_name, sub_dict) ∈ pairs(names_dict) - walk_through_timers(sub_dict, timer[sub_name]) + if timer === nothing || sub_name ∉ keys(timer.inner_timers) + walk_through_timers(sub_dict, nothing) + else + walk_through_timers(sub_dict, timer[sub_name]) + end end end - for name ∈ keys(sorted_timer_names) - walk_through_timers(sorted_timer_names[name], global_timer[name]) + for name ∈ keys(timer_names_all_ranks) + walk_through_timers(timer_names_all_ranks[name], global_timer[name]) end return times, ncalls, allocs end - if block_rank[] == 0 - parallel_io = io_moments.io_input.parallel_io - end - - # First count how many new timer names there are on all the processes in this block - if block_rank[] == 0 - n_new_names = Vector{mk_int}(undef, block_size[]) - new_names_iblock = get_new_timer_names(timer_names_per_rank[global_rank[]][1]) - n_new_names[1] = length(new_names_iblock) - reqs = [MPI.Irecv!(@view(n_new_names[irank+1:irank+1]), comm_block[]; source=irank) - for irank ∈ 1:block_size[]-1] - MPI.Waitall(reqs) - block_total_new_names = sum(n_new_names) + parallel_io = io_moments.io_input.parallel_io + if parallel_io + comm = comm_world + comm_size = global_size[] + comm_rank = global_rank[] else - new_names_iblock = get_new_timer_names(timer_names_per_rank[global_rank[]][1]) - add_new_timer_names!(timer_names_per_rank, new_names_iblock, global_rank[][1]) - n_new_names_iblock = Ref(length(new_names_iblock)) - MPI.Send(n_new_names_iblock, comm_block[]; dest=0) + comm = comm_block[] + comm_size = block_size[] + comm_rank = block_rank[] end - # Gather new names, from any processes in the block that have new names. 
- if block_rank[] == 0 - if block_total_new_names > 0 - new_names = SortedDict{mk_int,Vector{String}}() - if n_new_names[1] > 0 - new_names[0] = new_names_iblock - else - new_names[0] = String[] - end - for iblock ∈ 1:block_size[]-1 - if n_new_names[iblock+1] > 0 - names_length = Ref(0) - MPI.Recv!(names_length, comm_block[]; source=iblock) - names_char_vector = Vector{Char}(undef, names_length[]) - MPI.Recv!(names_char_vector, comm_block[]; source=iblock) - new_names[iblock] = split(string(names_char_vector...), "&") - else - new_names[iblock] = String[] - end - end - end - else - if n_new_names_iblock[] > 0 - names_string = join(new_names_iblock, "&") - names_char_vector = [names_string...] - names_length = Ref(length(names_char_vector)) - MPI.Send(names_length, comm_block[]; dest=0) - MPI.Send(names_char_vector, comm_block[]; dest=0) - end - end - - # Allgather new names onto all ranks in comm_inter_block[]. - # If parallel_io=false, no need to communicate between different blocks, as each block - # writes output independently. - if block_rank[] == 0 && parallel_io - # Get total number of new names - global_total_new_names = MPI.Allreduce(block_total_new_names, +, - comm_inter_block[]) - - if global_total_new_names > 0 - # Pack all new names for a block into a single string for communication. - this_block_string = "" - for irank ∈ 0:block_size[]-1 - this_global_rank = global_rank[] + irank - this_rank_string = string(("$this_global_rank:" * s * "&" for s ∈ new_names[irank])...) - this_block_string = join([this_block_string, this_rank_string]) - end - - # Get the sizes of the per-block strings that need to be gathered - string_sizes = Vector{mk_int}(undef, n_blocks[]) - string_sizes[iblock_index[]+1] = length(this_block_string) - string_buffer = MPI.UBuffer(string_sizes, 1) - MPI.Allgather!(string_buffer, comm_inter_block[]) - - # Gather the strings - gathered_char_vector = Vector{Char}(undef, sum(string_sizes)) - local_start_index = sum(string_sizes[1:iblock_index[]]) + 1 - local_end_index = local_start_index - 1 + string_sizes[iblock_index[]+1] - gathered_char_vector[local_start_index:local_end_index] .= [this_block_string...] - gathered_buffer = MPI.VBuffer(gathered_char_vector, string_sizes) - MPI.Allgatherv!(gathered_buffer, comm_inter_block[]) + # First count how many new timer names all processes + new_names_this_rank = get_new_timer_names() + n_new_names = Ref(length(new_names_this_rank)) + MPI.Allreduce!(n_new_names, +, comm) - # The string will end with a "&", so we need to slice off the final element, which - # will be an empty string. - all_new_names = split(string(gathered_char_vector...), "&")[1:end-1] + # Allgather new names onto all ranks - if parallel_io=true this is all ranks in + # comm_world, if parallel_io=false it is all ranks in comm_block[] (as each block + # writes output independently). + if n_new_names[] > 0 + # Pack all new names into a single string for communication. + new_names_string = string((s * "&" for s ∈ new_names_this_rank)...) 
- # Add the new names to timer_names_per_rank - add_new_timer_names!(timer_names_per_rank, all_new_names) + # Get the sizes of the per-rank strings that need to be gathered + string_sizes = Vector{mk_int}(undef, comm_size) + string_sizes[comm_rank+1] = length(new_names_string) + string_buffer = MPI.UBuffer(string_sizes, 1) + MPI.Allgather!(string_buffer, comm) - create_new_timer_io_variables!(all_new_names, - get_group(io_moments.fid, "timing_data"), - parallel_io) - end - elseif block_rank[] == 0 - # This is the `parallel_io=false` case - if block_total_new_names > 0 - # Put irank prefixes onto timer names - for irank ∈ 0:block_size[] - 1 - this_global_rank = global_rank[] + irank - names_without_prefix = new_names[irank] - new_names[irank] = ["$this_global_rank:" * s for s ∈ names_without_prefix] - end + # Gather the strings + gathered_char_vector = Vector{Char}(undef, sum(string_sizes)) + local_start_index = sum(string_sizes[1:comm_rank]) + 1 + local_end_index = local_start_index - 1 + string_sizes[comm_rank+1] + gathered_char_vector[local_start_index:local_end_index] .= [new_names_string...] + gathered_buffer = MPI.VBuffer(gathered_char_vector, string_sizes) + MPI.Allgatherv!(gathered_buffer, comm) - all_new_names = vcat((new_names[irank] for irank ∈ 0:block_size[]-1)...) + # The string will end with a "&", so we need to slice off the final element, which + # will be an empty string. + all_new_names = split(string(gathered_char_vector...), "&")[1:end-1] - # Add the new names to timer_names_per_rank - add_new_timer_names!(timer_names_per_rank, all_new_names) + # Add the new names to timer_names_per_rank + add_new_timer_names!(all_new_names) - create_new_timer_io_variables!(all_new_names, - get_group(io_moments.fid, "timing_data"), - parallel_io) + if block_rank[] == 0 + create_new_timer_io_variables!(unique_new_names, io_group, parallel_io) end end # Collect the timing data onto the root process of each block - if block_rank[] == 0 - times_data = Dict{mk_int,Vector{mk_int}}() - ncalls_data = Dict{mk_int,Vector{mk_int}}() - allocs_data = Dict{mk_int,Vector{mk_int}}() - times_data[0], ncalls_data[0], allocs_data[0] = get_data_from_timers() - for irank ∈ 1:block_size[]-1 - this_global_rank = global_rank[] + irank - times_data[irank] = zeros(mk_int, timer_names_per_rank[this_global_rank][2][]) - ncalls_data[irank] = zeros(mk_int, timer_names_per_rank[this_global_rank][2][]) - allocs_data[irank] = zeros(mk_int, timer_names_per_rank[this_global_rank][2][]) - end - times_reqs = MPI.Request[MPI.Irecv!(times_data[irank], comm_block[]; source=irank) - for irank ∈ 1:block_size[]-1] - ncalls_reqs = MPI.Request[MPI.Irecv!(ncalls_data[irank], comm_block[]; source=irank) - for irank ∈ 1:block_size[]-1] - allocs_reqs = MPI.Request[MPI.Irecv!(allocs_data[irank], comm_block[]; source=irank) - for irank ∈ 1:block_size[]-1] - MPI.Waitall(MPI.Request[times_reqs..., ncalls_reqs..., allocs_reqs...]) - else - times_data, ncalls_data, allocs_data = get_data_from_timers() - times_req = MPI.Isend(times_data, comm_block[]; dest=0) - ncalls_req = MPI.Isend(ncalls_data, comm_block[]; dest=0) - allocs_req = MPI.Isend(allocs_data, comm_block[]; dest=0) - MPI.Waitall([times_req, ncalls_req, allocs_req]) - end + times_data, ncalls_data, allocs_data = get_data_from_timers() + n_timers = length(times_data) + gathered_times_data = MPI.Gather(times_data, comm_block[]; root=0) + gathered_ncalls_data = MPI.Gather(ncalls_data, comm_block[]; root=0) + gathered_allocs_data = MPI.Gather(allocs_data, comm_block[]; root=0) if 
block_rank[] == 0 + gathered_times_data = reshape(gathered_times_data, n_timers, block_size[]) + gathered_ncalls_data = reshape(gathered_ncalls_data, n_timers, block_size[]) + gathered_allocs_data = reshape(gathered_allocs_data, n_timers, block_size[]) + # Write the timer variables - for irank ∈ keys(timer_names_per_rank) - this_rank_block = irank ÷ block_size[] - this_rank_rank = irank % block_size[] - this_rank_names, this_rank_nvars = timer_names_per_rank[irank] - - # Only the process in the same shared-memory block as the data came from - # actually has the data, so this process is the one that writes it. - write_from_this_rank = (this_rank_block == iblock_index[]) - - # We iterate through the variables in the same order as they were packed into - # the array in `get_data_from_timers()`, so `counter` gets the corresponding - # data from the flattened array that was communicated. - counter = 1 - io_group = get_group(io_moments.fid, "timing_data/rank$irank") - - # For timers on the root process of each shared memory block, we do not need - # to communicate so we can write the data directly from the timers. This - # should help ensure that at least for these processes the timer data has the - # right name in the output file since no packing/unpacking of flattened arrays - # is needed. This verification (since times can then be compared to other - # processes with the expectation that most of them are similar) justifies the - # partial code duplication in having this function be separate from - # `write_for_irank()`. - function write_for_block_root(names_dict, this_name, this_timer, - this_top_level_name=this_name) - io_time = io_group["time:" * this_name] - io_ncalls = io_group["ncalls:" * this_name] - io_allocs = io_group["allocs:" * this_name] - if t_idx < 0 - # The top-level timer (usually "moment_kinetics" was probably the - # first one created. It definitely exists, and should have been - # written at each timestep. - # If we got the length of `time:$this_name`, the variable might have - # the wrong length (e.g. if it has only just been created and has - # length 1). - this_t_idx = length(io_group["time:" * this_top_level_name]) - else - this_t_idx = t_idx - end - if write_from_this_rank - append_to_dynamic_var(io_time, this_timer.accumulated_data.time, - this_t_idx, parallel_io; - write_from_this_rank=true) - append_to_dynamic_var(io_ncalls, this_timer.accumulated_data.ncalls, - this_t_idx, parallel_io; - write_from_this_rank=true) - append_to_dynamic_var(io_allocs, this_timer.accumulated_data.allocs, - this_t_idx, parallel_io; - write_from_this_rank=true) - else - append_to_dynamic_var(io_time, nothing, this_t_idx, - parallel_io; write_from_this_rank=false) - append_to_dynamic_var(io_ncalls, nothing, this_t_idx, - parallel_io; write_from_this_rank=false) - append_to_dynamic_var(io_allocs, nothing, this_t_idx, - parallel_io; write_from_this_rank=false) - end - counter += 1 - for (sub_name, sub_dict) ∈ pairs(names_dict) - write_for_block_root(sub_dict, this_name * ";" * sub_name, - this_timer === nothing ? nothing : - this_timer[sub_name], this_top_level_name) - end - end - # Write data for non-root processes in the shared memory block, which must be - # unpacked from the arrays that were communicated. 
- function write_for_irank(names_dict, this_name, this_top_level_name=this_name) - io_time = io_group["time:" * this_name] - io_ncalls = io_group["ncalls:" * this_name] - io_allocs = io_group["allocs:" * this_name] - if t_idx < 0 - # The top-level timer (usually "moment_kinetics" was probably the - # first one created. It definitely exists, and should have been - # written at each timestep. - # If we got the length of `time:$this_name`, the variable might have - # the wrong length (e.g. if it has only just been created and has - # length 1). - this_t_idx = length(io_group["time:" * this_top_level_name]) - else - this_t_idx = t_idx - end - if write_from_this_rank - irank_in_block = irank % block_size[] - append_to_dynamic_var(io_time, times_data[irank_in_block][counter], - this_t_idx, parallel_io, - write_from_this_rank=true) - append_to_dynamic_var(io_ncalls, ncalls_data[irank_in_block][counter], - this_t_idx, parallel_io, - write_from_this_rank=true) - append_to_dynamic_var(io_allocs, allocs_data[irank_in_block][counter], - this_t_idx, parallel_io, - write_from_this_rank=true) - else - append_to_dynamic_var(io_time, nothing, this_t_idx, - parallel_io, write_from_this_rank=false) - append_to_dynamic_var(io_ncalls, nothing, this_t_idx, - parallel_io, write_from_this_rank=false) - append_to_dynamic_var(io_allocs, nothing, this_t_idx, - parallel_io, write_from_this_rank=false) - end - counter += 1 - for (sub_name, sub_dict) ∈ pairs(names_dict) - write_for_irank(sub_dict, this_name * ";" * sub_name, this_top_level_name) - end - end + # We iterate through the variables in the same order as they were packed into + # the array in `get_data_from_timers()`, so `counter` gets the corresponding + # data from the flattened array that was communicated. + counter = 1 - if this_rank_rank == 0 - # This was the data from the root process of a shared-memory block. - for top_level_name ∈ keys(this_rank_names) - write_for_block_root(this_rank_names[top_level_name], top_level_name, - write_from_this_rank ? global_timer[top_level_name] : nothing) - end - else - # This was data from a process that is not the root of a shared-memory - # block, so had to be packed, communicated, and unpacked. - for top_level_name ∈ keys(this_rank_names) - write_for_irank(this_rank_names[top_level_name], top_level_name) - end - end - if counter != this_rank_nvars[] + 1 - error("Got wrong number of timers. Wrote $(counter-1) but should have " - * "been $this_rank_nvars") + # Write data for all processes in the shared memory block, which must be + # unpacked from the arrays that were communicated. + if t_idx < 0 + # The top-level timer (usually "moment_kinetics" was probably the + # first one created. It definitely exists, and should have been + # written at each timestep. + # If we got the length of `time:$this_name`, the variable might have + # the wrong length (e.g. if it has only just been created and has + # length 1). 
+ t_idx = length(io_group["time:" * first(keys(timer_names_all_ranks))]) + end + if parallel_io + timer_coord = (local_io_range=1:block_size[], + global_io_range=global_rank[]+1:global_rank[]+block_size[]) + else + timer_coord = (local_io_range=1:block_size[], + global_io_range=1:block_size[]) + end + function write_level(names_dict, this_name) + io_time = io_group["time:" * this_name] + io_ncalls = io_group["ncalls:" * this_name] + io_allocs = io_group["allocs:" * this_name] + @views append_to_dynamic_var(io_time, gathered_times_data[counter,:], t_idx, + parallel_io, timer_coord) + @views append_to_dynamic_var(io_ncalls, gathered_ncalls_data[counter,:], + t_idx, parallel_io, timer_coord) + @views append_to_dynamic_var(io_allocs, gathered_allocs_data[counter,:], + t_idx, parallel_io, timer_coord) + counter += 1 + for (sub_name, sub_dict) ∈ pairs(names_dict) + write_level(sub_dict, this_name * ";" * sub_name) end end + + for top_level_name ∈ keys(timer_names_all_ranks) + write_level(timer_names_all_ranks[top_level_name], top_level_name) + end end + # Pick a fixed size for "global_timer_string" so that we can overwrite the variable + # without needing to resize it. + global_timer_string_size = 10000 # 100 characters x 100 lines seems like a reasonable maximum size. + global_timer_description = "Formatted representation of global_timer" if global_rank[] == 0 || (block_rank[] == 0 && !parallel_io) if t_idx > 1 || t_idx == -1 if t_idx == -1 top_level = nothing else - if "moment_kinetics" ∈ keys(timer_names_per_rank[global_rank[]]) + if "moment_kinetics" ∈ keys(timer_names_all_ranks) top_level = ("moment_kinetics", "time_advance! step", "ssp_rk!") + this_dict = timer_names_all_ranks + + # Check all the expected levels are present, otherwise just set + # top_level=nothing. + for n ∈ top_level + if n ∉ keys(this_dict) + top_level = nothing + break + end + this_dict = this_dict[n] + end else # If `time_advance!()` was called in a non-standard way (i.e. not by # `run_moment_kinetics()`), the actual timers might be different. In @@ -2971,33 +2798,35 @@ function write_timing_data(io_moments, t_idx, dfns=false) # was printed to the terminal, for a quick look. string_to_write = format_global_timer(; show_output=false, top_level=top_level) - string_size = Ref(length(string_to_write)) - if parallel_io - MPI.Bcast!(string_size, comm_inter_block[]; root=0) - end + + # Ensure `string_to_write` is no longer than `global_timer_string_size`. + string_to_write = string_to_write[1:min(length(string_to_write), global_timer_string_size)] + # Ensure `string_to_write` is at least as long as `global_timer_string_size`. + # Do this way instead of using `rpad()` because `rpad()` measures the length + # using `textwidth()` rather than a raw character count, whereas we want a + # fixed number of ASCII characters to write to the output file. + string_to_write = string_to_write * ' '^(global_timer_string_size - length(string_to_write)) + write_single_value!(get_group(io_moments.fid, "timing_data"), "global_timer_string", string_to_write; parallel_io=parallel_io, - description="Formatted representation of global_timer", + description=global_timer_description, overwrite=true) end elseif block_rank[] == 0 if t_idx > 1 || t_idx == -1 # Although only global_rank[]==0 needs to write "global_timer_string" when we # are using parallel I/O, other ranks in `comm_inter_block[]` must also call - # `write_single_value!() so that the variable in the HDF5 file can be - # (re-)created. 
- # ???These other ranks do not actually write data though, so it does not - # matter what is passed to the `data` argument of `write_single_value!()` (as - # long as it is a scalar). - # ? Need to pass a string with the right length? - string_size = Ref(0) - MPI.Bcast!(string_size, comm_inter_block[]; root=0) - string_to_write = " " ^ string_size[] + # `write_single_value!() so that the variable in the HDF5 file can be created. + # These other ranks do not actually write data though, so it does not matter + # what is passed to the `data` argument of `write_single_value!()` (as + # long as it is a string with the right length). + string_to_write = " " ^ global_timer_string_size write_single_value!(get_group(io_moments.fid, "timing_data"), "global_timer_string", string_to_write; - parallel_io=parallel_io, description="Formatted representation - of global_timer", overwrite=true) + parallel_io=parallel_io, + description=global_timer_description, + overwrite=true) end end @@ -3012,8 +2841,8 @@ file. Needs to be called after exiting from the `@timeit` block so that all time finalised properly. """ function write_final_timing_data_to_binary(io_or_file_info_moments, io_or_file_info_dfns) - io_moments = nothing - io_dfns_moments = nothing + io_moments = io_or_file_info_moments + io_dfns_moments = io_or_file_info_dfns @serial_region begin # Only read/write from first process in each 'block' @@ -3411,7 +3240,7 @@ binary output file t_params, nl_solver_params, r, z, vperp, vpa, vzeta, vr, vz) = begin io_dfns = nothing - io_dfns_moments = nothing + io_dfns_moments = io_or_file_info_dfns closefile = true @serial_region begin # Only read/write from first process in each 'block' @@ -3589,8 +3418,8 @@ end close all opened output files """ function finish_file_io(ascii_io::Union{ascii_ios,Nothing}, - binary_moments::Union{io_moments_info,Tuple,Nothing}, - binary_dfns::Union{io_dfns_info,Tuple,Nothing}) + binary_moments::Union{io_moments_info,Tuple,NamedTuple,Nothing}, + binary_dfns::Union{io_dfns_info,Tuple,NamedTuple,Nothing}) @serial_region begin # Only read/write from first process in each 'block' @@ -3618,7 +3447,7 @@ end close output files for electron initialization """ function finish_electron_io( - binary_initial_electron::Union{io_initial_electron_info,Tuple,Nothing,Bool}) + binary_initial_electron::Union{io_initial_electron_info,Tuple,Nothing,Bool,NamedTuple}) @serial_region begin # Only read/write from first process in each 'block' diff --git a/moment_kinetics/src/file_io_hdf5.jl b/moment_kinetics/src/file_io_hdf5.jl index 8b9ff40e5..d647ab554 100644 --- a/moment_kinetics/src/file_io_hdf5.jl +++ b/moment_kinetics/src/file_io_hdf5.jl @@ -92,9 +92,6 @@ function write_single_value!(file_or_group::HDF5.H5DataStore, name, description=nothing, units=nothing, overwrite=false) where {T,N} if isa(data, Union{Number, AbstractString}) - if overwrite && name ∈ keys(file_or_group) - delete_object(file_or_group, name) - end # When we write a scalar, and parallel_io=true, we need to create the variable on # every process in `comm_inter_block[]` but we only want to actually write the # data from one process (we choose `global_rank[]==0`) to avoid corruption due to @@ -106,15 +103,17 @@ function write_single_value!(file_or_group::HDF5.H5DataStore, name, # length of the string, so cannot be easily created by hand. Note that a String of # the correct length must be passed from every process in `comm_inter_block[]`, # but only the contents of the string on `global_rank[]==0` are actually written. 
- io_var, var_hdf5_type = create_dataset(file_or_group, name, data) - if !parallel_io || global_rank[] == 0 - write_dataset(io_var, var_hdf5_type, data) + if !(overwrite && name ∈ keys(file_or_group)) + create_dataset(file_or_group, name, data) + if description !== nothing + add_attribute!(file_or_group[name], "description", description) + end + if units !== nothing + add_attribute!(file_or_group[name], "units", units) + end end - if description !== nothing - add_attribute!(file_or_group[name], "description", description) - end - if units !== nothing - add_attribute!(file_or_group[name], "units", units) + if !parallel_io || global_rank[] == 0 + write(file_or_group[name], data) end return nothing end @@ -262,14 +261,14 @@ end function append_to_dynamic_var(io_var::HDF5.Dataset, data::Union{Nothing,Number,AbstractArray{T,N}}, t_idx, parallel_io::Bool, - coords::Union{coordinate,Integer}...; + coords::Union{coordinate,NamedTuple,Integer}...; only_root=false, write_from_this_rank=nothing) where {T,N} # Extend time dimension for this variable dims = size(io_var) dims_mod = (dims[1:end-1]..., t_idx) HDF5.set_extent_dims(io_var, dims_mod) - local_ranges = Tuple(isa(c, coordinate) ? c.local_io_range : 1:c for c ∈ coords) - global_ranges = Tuple(isa(c, coordinate) ? c.global_io_range : 1:c for c ∈ coords) + local_ranges = Tuple(isa(c, Integer) ? (1:c) : c.local_io_range for c ∈ coords) + global_ranges = Tuple(isa(c, Integer) ? (1:c) : c.global_io_range for c ∈ coords) if only_root && parallel_io && global_rank[] != 0 # Variable should only be written from root, and this process is not root for the diff --git a/moment_kinetics/src/fokker_planck.jl b/moment_kinetics/src/fokker_planck.jl index fb5ac876b..001d90824 100644 --- a/moment_kinetics/src/fokker_planck.jl +++ b/moment_kinetics/src/fokker_planck.jl @@ -45,6 +45,7 @@ using FastGaussQuadrature using Dates using LinearAlgebra: lu, ldiv! 
using MPI +using OrderedCollections: OrderedDict using ..type_definitions: mk_float, mk_int using ..array_allocation: allocate_float, allocate_shared_float using ..communication @@ -82,7 +83,7 @@ use_fokker_planck = true nuii = 1.0 frequency_option = "manual" """ -function setup_fkpl_collisions_input(toml_input::Dict) +function setup_fkpl_collisions_input(toml_input::AbstractDict) reference_params = setup_reference_parameters(toml_input) # get reference collision frequency (note factor of 1/2 due to definition choices) nuii_fkpl_default = 0.5*get_reference_collision_frequency_ii(reference_params) @@ -117,7 +118,7 @@ function setup_fkpl_collisions_input(toml_input::Dict) if !input_section["use_fokker_planck"] input_section["nuii"] = -1.0 end - input = Dict(Symbol(k)=>v for (k,v) in input_section) + input = OrderedDict(Symbol(k)=>v for (k,v) in input_section) #println(input) if input_section["slowing_down_test"] # calculate nu_alphae and vc3 (critical speed of slowing down) diff --git a/moment_kinetics/src/geo.jl b/moment_kinetics/src/geo.jl index 41be4d0db..933598a65 100644 --- a/moment_kinetics/src/geo.jl +++ b/moment_kinetics/src/geo.jl @@ -14,6 +14,8 @@ using ..array_allocation: allocate_float using ..type_definitions: mk_float, mk_int using ..reference_parameters: setup_reference_parameters +using OrderedCollections: OrderedDict + """ struct containing the geometric data necessary for non-trivial axisymmetric geometries, to be passed @@ -81,7 +83,7 @@ DeltaB = 0.0 option = "" """ -function setup_geometry_input(toml_input::Dict) +function setup_geometry_input(toml_input::AbstractDict) reference_params = setup_reference_parameters(toml_input) reference_rhostar = get_default_rhostar(reference_params) @@ -105,7 +107,7 @@ function setup_geometry_input(toml_input::Dict) # constant for testing nonzero dBdr when nr = 1 dBdr_constant = 0.0) - input = Dict(Symbol(k)=>v for (k,v) in input_section) + input = OrderedDict(Symbol(k)=>v for (k,v) in input_section) #println(input) return geometry_input(; input...) end diff --git a/moment_kinetics/src/input_structs.jl b/moment_kinetics/src/input_structs.jl index f427c3a83..edcdb9a8a 100644 --- a/moment_kinetics/src/input_structs.jl +++ b/moment_kinetics/src/input_structs.jl @@ -17,12 +17,15 @@ export io_input export pp_input export geometry_input export set_defaults_and_check_top_level!, set_defaults_and_check_section!, - check_sections!, options_to_TOML, Dict_to_NamedTuple + check_sections!, options_to_TOML, Dict_to_NamedTuple, + convert_to_sorted_nested_OptionsDict using ..communication -using ..type_definitions: mk_float, mk_int +using ..type_definitions: mk_float, mk_int, OptionsDict +using DataStructures: SortedDict using MPI +using OrderedCollections: OrderedDict using TOML """ @@ -730,7 +733,7 @@ import Base: get Utility method for converting a string to an Enum when getting from a Dict, based on the type of the default value """ -function get(d::Dict, key, default::Enum) +function get(d::OptionsDict, key, default::Enum) val_maybe_string = get(d, key, nothing) if val_maybe_string == nothing return default @@ -750,14 +753,12 @@ end Set the defaults for options in the top level of the input, and check that there are not any unexpected options (i.e. options that have no default). -Modifies the options[section_name]::Dict by adding defaults for any values that are not -already present. +Modifies the options[section_name]::OptionsDict by adding defaults for any values that +are not already present. 
Ignores any sections, as these will be checked separately. """ -function set_defaults_and_check_top_level!(options::AbstractDict; kwargs...) - DictType = typeof(options) - +function set_defaults_and_check_top_level!(options::OptionsDict; kwargs...) # Check for any unexpected values in the options - all options that are set should be # present in the kwargs of this function call options_keys_symbols = keys(kwargs) @@ -781,13 +782,12 @@ function set_defaults_and_check_top_level!(options::AbstractDict; kwargs...) return options end -function _get_section_and_check_option_names(options, section_name, section_keys) - - DictType = typeof(options) +function _get_section_and_check_option_names(options::OptionsDict, section_name, + section_keys) if !(section_name ∈ keys(options)) # If section is not present, create it - options[section_name] = DictType() + options[section_name] = OptionsDict() end if !isa(options[section_name], AbstractDict) @@ -839,11 +839,10 @@ const _section_check_store_name = "_section_check_store" Set the defaults for options in a section, and check that there are not any unexpected options (i.e. options that have no default). -Modifies the options[section_name]::Dict by adding defaults for any values that are not -already present. +Modifies the options[section_name]::OptionsDict by adding defaults for any values that +are not already present. """ -function set_defaults_and_check_section!(options::AbstractDict, section_name; - kwargs...) +function set_defaults_and_check_section!(options::OptionsDict, section_name; kwargs...) section_keys_symbols = keys(kwargs) section_keys = (String(k) for k ∈ section_keys_symbols) @@ -865,7 +864,7 @@ function set_defaults_and_check_section!(options::AbstractDict, section_name; end """ - set_defaults_and_check_section!(options::AbstractDict, struct_type::Type, + set_defaults_and_check_section!(options::OptionsDict, struct_type::Type, name::Union{String,Nothing}=nothing) Alternative form to be used when the options should be stored in a struct of type @@ -874,15 +873,15 @@ Alternative form to be used when the options should be stored in a struct of typ The returned instance of `struct_type` is immutable, so if you need to modify the settings - e.g. to apply some logic to set defaults depending on other settings/parameters - then you should use the 'standard' version of [`set_defaults_and_check_section!`](@ref) that -returns a `Dict` that can be modified, and then use that `Dict` to initialise the -`struct_type`. +returns a `OptionsDict` that can be modified, and then use that `OptionsDict` to +initialise the `struct_type`. The name of the section in the options that will be read defaults to the name of `struct_type`, but can be set using the `section_name` argument. Returns an instance of `struct_type`. """ -function set_defaults_and_check_section!(options::AbstractDict, struct_type::Type, +function set_defaults_and_check_section!(options::OptionsDict, struct_type::Type, section_name::Union{String,Nothing}=nothing) if section_name === nothing @@ -910,12 +909,12 @@ function set_defaults_and_check_section!(options::AbstractDict, struct_type::Typ end """ - check_sections!(options::AbstractDict) + check_sections!(options::OptionsDict) Check that there are no unexpected sections in `options`. The 'expected sections' are the ones that were defined with [`set_defaults_and_check_section!`](@ref). 
""" -function check_sections!(options::AbstractDict; check_no_top_level_options=true) +function check_sections!(options::OptionsDict; check_no_top_level_options=true) expected_section_names = pop!(options, _section_check_store_name) @@ -945,18 +944,38 @@ function check_sections!(options::AbstractDict; check_no_top_level_options=true) end """ -Convert a Dict whose keys are String or Symbol to a NamedTuple + convert_to_sorted_nested_OptionsDict(d::AbstractDict) + +To ensure consistency when writing options to an output file, the entries in the +dictionary containing the options must be in a deterministic order. As TOML reads +options into a nested `Dict`, the only way to guarantee this is to sort the options +before storing them in an `OptionsDict`. `OptionsDict` is an alias for +`OrderedDict{String,Any}` so it will preserve the order of entries as long as they were +in a consistent order when it was created. +""" +function convert_to_sorted_nested_OptionsDict(d::AbstractDict) + sorted_d = SortedDict{String,Any}(d) + for (k,v) ∈ pairs(sorted_d) + if isa(v, AbstractDict) + sorted_d[k] = convert_to_sorted_nested_OptionsDict(v) + end + end + return OptionsDict(sorted_d) +end + +""" +Convert an OrderedDict whose keys are String or Symbol to a NamedTuple Useful as NamedTuple is immutable, so option values cannot be accidentally changed. """ -function Dict_to_NamedTuple(d) +function Dict_to_NamedTuple(d::OrderedDict) return NamedTuple(Symbol(k)=>v for (k,v) ∈ d) end """ options_to_toml(io::IO [=stdout], data::AbstractDict; sorted=false, by=identity) -Convert `moment_kinetics` 'options' (in the form of a `Dict`) to TOML format. +Convert `moment_kinetics` 'options' (in the form of an `AbstractDict`) to TOML format. This function is defined so that we can handle some extra types, for example `Enum`. diff --git a/moment_kinetics/src/krook_collisions.jl b/moment_kinetics/src/krook_collisions.jl index 6e37d6dfc..f0a99404d 100644 --- a/moment_kinetics/src/krook_collisions.jl +++ b/moment_kinetics/src/krook_collisions.jl @@ -15,6 +15,8 @@ using ..reference_parameters: get_reference_collision_frequency_ii, get_reference_collision_frequency_ei using ..reference_parameters: setup_reference_parameters +using OrderedCollections: OrderedDict + """ Function for reading Krook collision operator input parameters. Structure the namelist as follows. @@ -24,7 +26,7 @@ use_krook = true nuii0 = 1.0 frequency_option = "manual" """ -function setup_krook_collisions_input(toml_input::Dict) +function setup_krook_collisions_input(toml_input::AbstractDict) reference_params = setup_reference_parameters(toml_input) # get reference collision frequency nuii_krook_default = get_reference_collision_frequency_ii(reference_params) @@ -63,7 +65,7 @@ function setup_krook_collisions_input(toml_input::Dict) input_section["nuee0"] = -1.0 input_section["nuei0"] = -1.0 end - input = Dict(Symbol(k)=>v for (k,v) in input_section) + input = OrderedDict(Symbol(k)=>v for (k,v) in input_section) #println(input) return krook_collisions_input(; input...) end diff --git a/moment_kinetics/src/load_data.jl b/moment_kinetics/src/load_data.jl index 53ceda028..35e427319 100644 --- a/moment_kinetics/src/load_data.jl +++ b/moment_kinetics/src/load_data.jl @@ -39,6 +39,7 @@ using ..z_advection: update_speed_z! 
using Glob using HDF5 using MPI +using OrderedCollections: OrderedDict const timestep_diagnostic_variables = ("time_for_run", "step_counter", "dt", "failure_counter", "failure_caused_by", @@ -664,7 +665,7 @@ function reload_evolving_fields!(pdf, moments, fields, boundary_distributions, vzeta, vr, vz, parallel_io) # Test whether any interpolation is needed - interpolation_needed = Dict( + interpolation_needed = OrderedDict( x.name => (restart_x !== nothing && (x.n != restart_x.n || !all(isapprox.(x.grid, restart_x.grid)))) @@ -1105,7 +1106,7 @@ function reload_electron_data!(pdf, moments, t_params, restart_prefix_iblock, ti vzeta, vr, vz, parallel_io) # Test whether any interpolation is needed - interpolation_needed = Dict( + interpolation_needed = OrderedDict( x.name => x.n != restart_x.n || !all(isapprox.(x.grid, restart_x.grid)) for (x, restart_x) ∈ ((z, restart_z), (r, restart_r), (vperp, restart_vperp), (vpa, restart_vpa))) @@ -3595,16 +3596,13 @@ function get_run_info_no_setup(run_dir::Union{AbstractString,Tuple{AbstractStrin evolving_variables = Tuple(evolving_variables) end - timing_variable_names = Dict{mk_int, Vector{String}}() - for fid ∈ fids0 - timing_group = get_group(fid, "timing_data") - timing_rank_names = collect(k for k in keys(timing_group) if startswith(k, "rank")) - for group_name ∈ timing_rank_names - rank_group = get_group(timing_group, group_name) - irank = parse(mk_int, split(group_name, "rank")[2]) - timing_variable_names[irank] = collect(keys(rank_group)) - end - end + # Assume the timing variables are the same in every restart - this may not always be + # true, and might cause errors if some variables are missing for restarts after the + # first. + timing_group = get_group(fids0[1], "timing_data") + timing_variable_names = collect(k for k in keys(timing_group) + if startswith(k, "time:") || startswith(k, "ncalls:") || + startswith(k, "allocs:")) if parallel_io files = fids0 diff --git a/moment_kinetics/src/maxwell_diffusion.jl b/moment_kinetics/src/maxwell_diffusion.jl index 73a36c44a..85d2d4639 100644 --- a/moment_kinetics/src/maxwell_diffusion.jl +++ b/moment_kinetics/src/maxwell_diffusion.jl @@ -21,6 +21,8 @@ using ..calculus: second_derivative! using ..timer_utils using ..reference_parameters: get_reference_collision_frequency_ii, setup_reference_parameters +using OrderedCollections: OrderedDict + """ Function for reading Maxwell diffusion operator input parameters. Structure the namelist as follows. @@ -30,7 +32,7 @@ use_maxwell_diffusion = true D_ii = 1.0 diffusion_coefficient_option = "manual" """ -function setup_mxwl_diff_collisions_input(toml_input::Dict) +function setup_mxwl_diff_collisions_input(toml_input::AbstractDict) reference_params = setup_reference_parameters(toml_input) # get reference diffusion coefficient, made up of collision frequency and # thermal speed for now. NOTE THAT THIS CONSTANT PRODUCES ERRORS. DO NOT USE @@ -63,7 +65,7 @@ function setup_mxwl_diff_collisions_input(toml_input::Dict) input_section["D_ii"] = -1.0 input_section["D_nn"] = -1.0 end - input = Dict(Symbol(k)=>v for (k,v) in input_section) + input = OrderedDict(Symbol(k)=>v for (k,v) in input_section) return mxwl_diff_collisions_input(; input...) 
end diff --git a/moment_kinetics/src/moment_kinetics.jl b/moment_kinetics/src/moment_kinetics.jl index 1f2c86593..3309ca8ed 100644 --- a/moment_kinetics/src/moment_kinetics.jl +++ b/moment_kinetics/src/moment_kinetics.jl @@ -125,9 +125,6 @@ function run_moment_kinetics(input_dict::OptionsDict; restart=false, restart_tim mk_state = nothing try - # Reset timers in case a previous run was timed - reset_mk_timers!() - @timeit global_timer "moment_kinetics" begin # set up all the structs, etc. needed for a run mk_state = setup_moment_kinetics(input_dict; restart=restart, @@ -187,7 +184,7 @@ function run_moment_kinetics() end restart_time_index = options["restart-time-index"] if inputfile === nothing - this_input = Dict() + this_input = OptionsDict() else this_input = inputfile end @@ -404,6 +401,9 @@ function cleanup_moment_kinetics!(ascii_io, io_moments, io_dfns) end end + # Reset timers + reset_mk_timers!() + # clean up MPI objects finalize_comms!() diff --git a/moment_kinetics/src/moment_kinetics_input.jl b/moment_kinetics/src/moment_kinetics_input.jl index 9f49aef52..18e78aeb6 100644 --- a/moment_kinetics/src/moment_kinetics_input.jl +++ b/moment_kinetics/src/moment_kinetics_input.jl @@ -31,7 +31,7 @@ using TOML Read input from a TOML file """ function read_input_file(input_filename::String) - input = TOML.parsefile(input_filename) + input = convert_to_sorted_nested_OptionsDict(TOML.parsefile(input_filename)) # Use input_filename (without the extension) as default for "run_name" if !("output" ∈ keys(input) && "run_name" in keys(input["output"])) diff --git a/moment_kinetics/src/species_input.jl b/moment_kinetics/src/species_input.jl index 59d7043fe..dc9067891 100644 --- a/moment_kinetics/src/species_input.jl +++ b/moment_kinetics/src/species_input.jl @@ -13,6 +13,8 @@ using ..input_structs: spatial_initial_condition_input, velocity_initial_conditi using ..input_structs: boltzmann_electron_response, boltzmann_electron_response_with_simple_sheath using ..reference_parameters: setup_reference_parameters +using OrderedCollections: OrderedDict + function get_species_input(toml_input) reference_params = setup_reference_parameters(toml_input) @@ -79,7 +81,7 @@ function get_species_input(toml_input) velocity_initial_condition_input, "vpa_IC_ion_species_$is") - spec_input = Dict(Symbol(k)=>v for (k,v) in spec_section) + spec_input = OrderedDict(Symbol(k)=>v for (k,v) in spec_section) ion_spec_params_list[is] = ion_species_parameters(; type="ion", z_IC=z_IC, r_IC=r_IC, vpa_IC=vpa_IC, spec_input...) @@ -106,14 +108,14 @@ function get_species_input(toml_input) velocity_initial_condition_input, "vz_IC_neutral_species_$isn") - spec_input = Dict(Symbol(k)=>v for (k,v) in spec_section) + spec_input = OrderedDict(Symbol(k)=>v for (k,v) in spec_section) neutral_spec_params_list[isn] = neutral_species_parameters(; type="neutral", z_IC=z_IC, r_IC=r_IC, vz_IC=vz_IC, spec_input...) end # construct composition struct - composition_input = Dict(Symbol(k)=>v for (k,v) in composition_section) + composition_input = OrderedDict(Symbol(k)=>v for (k,v) in composition_section) composition = species_composition(; n_species=nspec_tot, ion=ion_spec_params_list, neutral=neutral_spec_params_list, composition_input...) 
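# Editor's note: hedged, standalone sketch of the ordering pattern relied on by the
# read_input_file() change above. TOML parsing produces plain (unordered) Dicts, so
# convert_to_sorted_nested_OptionsDict() (defined in input_structs.jl earlier in this
# diff) sorts the keys at every level and stores them in OptionsDict, an alias for
# OrderedDict{String,Any}, giving a deterministic key order for output files. The
# names below (sorted_nested_options, ExampleOptionsDict) are illustrative stand-ins,
# not the moment_kinetics API.
using OrderedCollections: OrderedDict
using DataStructures: SortedDict
import TOML

const ExampleOptionsDict = OrderedDict{String,Any}

function sorted_nested_options(d::AbstractDict)
    # Sort the keys at this level, then recurse into any nested dictionaries.
    sorted_d = SortedDict{String,Any}(d)
    for (k, v) in pairs(sorted_d)
        if isa(v, AbstractDict)
            sorted_d[k] = sorted_nested_options(v)
        end
    end
    return ExampleOptionsDict(sorted_d)
end

toml_text = """
[z]
ngrid = 9

[timestepping]
nstep = 10
"""
options = sorted_nested_options(TOML.parse(toml_text))
# Key order no longer depends on Dict hashing: sections come out lexically sorted.
@assert collect(keys(options)) == ["timestepping", "z"]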
diff --git a/moment_kinetics/src/timer_utils.jl b/moment_kinetics/src/timer_utils.jl index 1fadff14e..3ff8f02ff 100644 --- a/moment_kinetics/src/timer_utils.jl +++ b/moment_kinetics/src/timer_utils.jl @@ -37,16 +37,16 @@ const TimerNamesDict = SortedDict{String,SortedDict,Base.Order.ForwardOrdering} TimerNamesDict() = TimerNamesDict(Base.Order.ForwardOrdering()) """ -Nested Dict containting the names of all timers that have been created on each MPI rank +Nested SortedDict with the names of all timers that have been created on any MPI rank and added to the moments output file. """ -const timer_names_per_rank_moments = SortedDict{mk_int,Tuple{TimerNamesDict,Ref{mk_int}}}() +const timer_names_all_ranks_moments = TimerNamesDict() """ -Nested Dict containting the names of all timers that have been created on each MPI rank +Nested SortedDict with the names of all timers that have been created on any MPI rank and added to the dfns output file. """ -const timer_names_per_rank_dfns = SortedDict{mk_int,Tuple{TimerNamesDict,Ref{mk_int}}}() +const timer_names_all_ranks_dfns = TimerNamesDict() """ format_global_timer(; show=true, truncate_output=true) @@ -165,6 +165,7 @@ function format_global_timer(; show_output=false, threshold=1.0e-3, truncate_out # when we save the string to an HDF5 or NetCDF file. μ often appears because times # may be printed in microseconds. result = replace(result, "μ" => "u") + result = ascii(replace(result, !isascii=>' ')) end end @@ -175,8 +176,8 @@ Reset all global state of timers. """ function reset_mk_timers!() reset_timer!(global_timer) - empty!(timer_names_per_rank_moments) - empty!(timer_names_per_rank_dfns) + empty!(timer_names_all_ranks_moments) + empty!(timer_names_all_ranks_dfns) end end #timer_utils diff --git a/moment_kinetics/src/type_definitions.jl b/moment_kinetics/src/type_definitions.jl index e53fea73d..b5b14fafc 100644 --- a/moment_kinetics/src/type_definitions.jl +++ b/moment_kinetics/src/type_definitions.jl @@ -6,6 +6,8 @@ export mk_float export mk_int export OptionsDict +using OrderedCollections: OrderedDict + """ """ const mk_float = Float64 @@ -16,6 +18,6 @@ const mk_int = Int64 """ """ -const OptionsDict = Dict{String,Any} +const OptionsDict = OrderedDict{String,Any} end diff --git a/moment_kinetics/src/utils.jl b/moment_kinetics/src/utils.jl index ce92c5d98..74b8a3bbb 100644 --- a/moment_kinetics/src/utils.jl +++ b/moment_kinetics/src/utils.jl @@ -30,14 +30,14 @@ function __init__() end """ - get_unnormalized_parameters(input::Dict) + get_unnormalized_parameters(input::AbstractDict) get_unnormalized_parameters(input_filename::String) Get many parameters for the simulation setup given by `input` or in the file `input_filename`, in SI units and eV, returned as an OrderedDict. """ function get_unnormalized_parameters end -function get_unnormalized_parameters(input::Dict) +function get_unnormalized_parameters(input::AbstractDict) io_input, evolve_moments, t_params, z, z_spectral, r, r_spectral, vpa, vpa_spectral, vperp, vperp_spectral, gyrophase, gyrophase_spectral, vz, vz_spectral, vr, vr_spectral, vzeta, vzeta_spectral, composition, species, collisions, geometry, @@ -90,8 +90,8 @@ end """ print_unnormalized_parameters(input) -Print many parameters for the simulation setup given by `input` (a Dict of parameters or -a String giving a filename), in SI units and eV. +Print many parameters for the simulation setup given by `input` (an AbstractDict of +parameters or a String giving a filename), in SI units and eV. 
""" function print_unnormalized_parameters(args...; kwargs...) @@ -312,23 +312,26 @@ Merge two AbstractDicts `a` and `b`. Any elements that are AbstractDicts are als """ function recursive_merge end function recursive_merge(a::AbstractDict, b::AbstractDict) - return mergewith(recursive_merge, a, b) -end -function recursive_merge(a::AbstractDict, b) - error("Cannot merge a Dict with a non-Dict, got $a and $b") -end -function recursive_merge(a, b::AbstractDict) - error("Cannot merge a Dict with a non-Dict, got $a and $b") -end -function recursive_merge(a, b) - return b + result = deepcopy(a) + a_keys = collect(keys(a)) + for (k,v) ∈ pairs(b) + if k ∉ a_keys + result[k] = v + elseif isa(result[k], AbstractDict) && isa(v, AbstractDict) + result[k] = recursive_merge(result[k], v) + elseif isa(result[k], AbstractDict) || isa(v, AbstractDict) + error("Cannot merge a Dict with a non-Dict, got $(result[k]) and $v") + else + result[k] = v + end + end + return result end """ -Dict merge function for named keyword arguments -for case when input Dict is a mixed Dict of Dicts -and non-Dict float/int/string entries, and the -keyword arguments are also a mix of Dicts and non-Dicts +Dict merge function for named keyword arguments for case when input AbstractDict is a +mixed AbstractDict of AbstractDicts and non-AbstractDict float/int/string entries, and +the keyword arguments are also a mix of AbstractDicts and non-AbstractDicts """ function merge_dict_with_kwargs!(dict_base; args...) for (k,v) in args diff --git a/moment_kinetics/test/harrisonthompson.jl b/moment_kinetics/test/harrisonthompson.jl index d0ec7f31b..64cc24e66 100644 --- a/moment_kinetics/test/harrisonthompson.jl +++ b/moment_kinetics/test/harrisonthompson.jl @@ -62,63 +62,63 @@ function findphi(z, R_ion) end # default inputs for tests -test_input_finite_difference = Dict("composition" => OptionsDict("n_ion_species" => 1, - "n_neutral_species" => 0, - "electron_physics" => "boltzmann_electron_response", - "T_e" => 1.0, - "T_wall" => 1.0), - "ion_species_1" => OptionsDict("initial_density" => 1.0, - "initial_temperature" => 1.0), - "z_IC_ion_species_1" => OptionsDict("initialization_option" => "gaussian", - "density_amplitude" => 0.0, - "density_phase" => 0.0, - "upar_amplitude" => 0.0, - "upar_phase" => 0.0, - "temperature_amplitude" => 0.0, - "temperature_phase" => 0.0), - "vpa_IC_ion_species_1" => OptionsDict("initialization_option" => "gaussian", - "density_amplitude" => 1.0, - "density_phase" => 0.0, - "upar_amplitude" => 0.0, - "upar_phase" => 0.0, - "temperature_amplitude" => 0.0, - "temperature_phase" => 0.0), - "output" => OptionsDict("run_name" => "finite_difference", - "parallel_io" => false), - "evolve_moments" => OptionsDict("density" => false, - "parallel_flow" => false, - "parallel_pressure" => false, - "moments_conservation" => false), - "reactions" => OptionsDict("charge_exchange_frequency" => 0.0, - "ionization_frequency" => 0.0), - "timestepping" => OptionsDict("nstep" => 9000, - "dt" => 0.0005, - "nwrite" => 9000, - "split_operators" => false), - "r" => OptionsDict("ngrid" => 1, - "nelement" => 1, - "bc" => "periodic", - "discretization" => "finite_difference"), - "z" => OptionsDict("ngrid" => 100, - "nelement" => 1, - "bc" => "wall", - "discretization" => "finite_difference"), - "vpa" => OptionsDict("ngrid" => 200, - "nelement" => 1, - "L" => 8.0, - "bc" => "zero", - "discretization" => "finite_difference"), - "vz" => OptionsDict("ngrid" => 200, - "nelement" => 1, - "L" => 8.0, - "bc" => "zero", - "discretization" => 
"finite_difference"), - "ion_source_1" => OptionsDict("active" => true, - "source_strength" => ionization_frequency, - "source_T" => 0.25, - "z_profile" => "constant", - "r_profile" => "constant"), - ) +test_input_finite_difference = OptionsDict("composition" => OptionsDict("n_ion_species" => 1, + "n_neutral_species" => 0, + "electron_physics" => "boltzmann_electron_response", + "T_e" => 1.0, + "T_wall" => 1.0), + "ion_species_1" => OptionsDict("initial_density" => 1.0, + "initial_temperature" => 1.0), + "z_IC_ion_species_1" => OptionsDict("initialization_option" => "gaussian", + "density_amplitude" => 0.0, + "density_phase" => 0.0, + "upar_amplitude" => 0.0, + "upar_phase" => 0.0, + "temperature_amplitude" => 0.0, + "temperature_phase" => 0.0), + "vpa_IC_ion_species_1" => OptionsDict("initialization_option" => "gaussian", + "density_amplitude" => 1.0, + "density_phase" => 0.0, + "upar_amplitude" => 0.0, + "upar_phase" => 0.0, + "temperature_amplitude" => 0.0, + "temperature_phase" => 0.0), + "output" => OptionsDict("run_name" => "finite_difference", + "parallel_io" => false), + "evolve_moments" => OptionsDict("density" => false, + "parallel_flow" => false, + "parallel_pressure" => false, + "moments_conservation" => false), + "reactions" => OptionsDict("charge_exchange_frequency" => 0.0, + "ionization_frequency" => 0.0), + "timestepping" => OptionsDict("nstep" => 9000, + "dt" => 0.0005, + "nwrite" => 9000, + "split_operators" => false), + "r" => OptionsDict("ngrid" => 1, + "nelement" => 1, + "bc" => "periodic", + "discretization" => "finite_difference"), + "z" => OptionsDict("ngrid" => 100, + "nelement" => 1, + "bc" => "wall", + "discretization" => "finite_difference"), + "vpa" => OptionsDict("ngrid" => 200, + "nelement" => 1, + "L" => 8.0, + "bc" => "zero", + "discretization" => "finite_difference"), + "vz" => OptionsDict("ngrid" => 200, + "nelement" => 1, + "L" => 8.0, + "bc" => "zero", + "discretization" => "finite_difference"), + "ion_source_1" => OptionsDict("active" => true, + "source_strength" => ionization_frequency, + "source_T" => 0.25, + "z_profile" => "constant", + "r_profile" => "constant"), + ) test_input_chebyshev = recursive_merge(test_input_finite_difference, OptionsDict("output" => OptionsDict("run_name" => "chebyshev_pseudospectral"), diff --git a/moment_kinetics/test/jacobian_matrix_tests.jl b/moment_kinetics/test/jacobian_matrix_tests.jl index 7e48cc80f..e61ae7d33 100644 --- a/moment_kinetics/test/jacobian_matrix_tests.jl +++ b/moment_kinetics/test/jacobian_matrix_tests.jl @@ -131,57 +131,57 @@ test_input = OptionsDict("output" => OptionsDict("run_name" => "jacobian_matrix" "discretization" => "gausslegendre_pseudospectral", "element_spacing_option" => "coarse_tails", ), - "timestepping" => Dict{String,Any}("type" => "KennedyCarpenterARK324", - "implicit_electron_advance" => false, - "implicit_electron_ppar" => true, - "implicit_ion_advance" => false, - "implicit_vpa_advection" => false, - "nstep" => 1, - "dt" => ion_dt, - "minimum_dt" => 1.0e-7, - "rtol" => 1.0e-4, - "max_increase_factor_near_last_fail" => 1.001, - "last_fail_proximity_factor" => 1.1, - "max_increase_factor" => 1.05, - "nwrite" => 10000, - "nwrite_dfns" => 10000, - "steady_state_residual" => true, - "converged_residual_value" => 1.0e-3, - ), - "electron_timestepping" => Dict{String,Any}("nstep" => 1, - "dt" => dt, - "maximum_dt" => 1.0, - "nwrite" => 10000, - "nwrite_dfns" => 100000, - "type" => "Fekete4(3)", - "rtol" => 1.0e-6, - "atol" => 1.0e-14, - "minimum_dt" => 1.0e-10, - 
"initialization_residual_value" => 2.5, - "converged_residual_value" => 1.0e-2, - "constraint_forcing_rate" => 2.321, - ), - "nonlinear_solver" => Dict{String,Any}("nonlinear_max_iterations" => 100, - "rtol" => 1.0e-5, - "atol" => 1.0e-15, - "preconditioner_update_interval" => 1, + "timestepping" => OptionsDict("type" => "KennedyCarpenterARK324", + "implicit_electron_advance" => false, + "implicit_electron_ppar" => true, + "implicit_ion_advance" => false, + "implicit_vpa_advection" => false, + "nstep" => 1, + "dt" => ion_dt, + "minimum_dt" => 1.0e-7, + "rtol" => 1.0e-4, + "max_increase_factor_near_last_fail" => 1.001, + "last_fail_proximity_factor" => 1.1, + "max_increase_factor" => 1.05, + "nwrite" => 10000, + "nwrite_dfns" => 10000, + "steady_state_residual" => true, + "converged_residual_value" => 1.0e-3, + ), + "electron_timestepping" => OptionsDict("nstep" => 1, + "dt" => dt, + "maximum_dt" => 1.0, + "nwrite" => 10000, + "nwrite_dfns" => 100000, + "type" => "Fekete4(3)", + "rtol" => 1.0e-6, + "atol" => 1.0e-14, + "minimum_dt" => 1.0e-10, + "initialization_residual_value" => 2.5, + "converged_residual_value" => 1.0e-2, + "constraint_forcing_rate" => 2.321, ), - "ion_numerical_dissipation" => Dict{String,Any}("vpa_dissipation_coefficient" => 1.0e0, + "nonlinear_solver" => OptionsDict("nonlinear_max_iterations" => 100, + "rtol" => 1.0e-5, + "atol" => 1.0e-15, + "preconditioner_update_interval" => 1, + ), + "ion_numerical_dissipation" => OptionsDict("vpa_dissipation_coefficient" => 1.0e0, + "force_minimum_pdf_value" => 0.0, + ), + "electron_numerical_dissipation" => OptionsDict("vpa_dissipation_coefficient" => 2.0, "force_minimum_pdf_value" => 0.0, ), - "electron_numerical_dissipation" => Dict{String,Any}("vpa_dissipation_coefficient" => 2.0, - "force_minimum_pdf_value" => 0.0, - ), - "neutral_numerical_dissipation" => Dict{String,Any}("vz_dissipation_coefficient" => 1.0e-1, - "force_minimum_pdf_value" => 0.0, - ), - "ion_source_1" => Dict{String,Any}("active" => true, - "z_profile" => "gaussian", - "z_width" => 0.125, - "source_strength" => 0.1, - "source_T" => 2.0, - ), - "krook_collisions" => Dict{String,Any}("use_krook" => true), + "neutral_numerical_dissipation" => OptionsDict("vz_dissipation_coefficient" => 1.0e-1, + "force_minimum_pdf_value" => 0.0, + ), + "ion_source_1" => OptionsDict("active" => true, + "z_profile" => "gaussian", + "z_width" => 0.125, + "source_strength" => 0.1, + "source_T" => 2.0, + ), + "krook_collisions" => OptionsDict("use_krook" => true), ) function get_mk_state(test_input)