diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 2c81d69..acb382e 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -31,6 +31,10 @@ jobs:
         uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
+      - name: Install non-python (homebrew) dependencies
+        if: ${{ matrix.os == 'macOS-latest' }}
+        run: |
+          brew install opencascade cgal gmp mpfr boost
       - name: Get dependencies and install reboost
         run: |
           python -m pip install --upgrade pip wheel setuptools
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 498eb1c..17754b4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -57,6 +57,17 @@ repos:
         additional_dependencies:
           - pytest

+  - repo: https://github.com/kynan/nbstripout
+    rev: "0.7.1"
+    hooks:
+      - id: nbstripout
+        args:
+          [
+            "--drop-empty-cells",
+            "--extra-keys",
+            "metadata.kernelspec metadata.language_info",
+          ]
+
   - repo: https://github.com/codespell-project/codespell
     rev: "v2.3.0"
     hooks:
diff --git a/docs/source/conf.py b/docs/source/conf.py
index e8e0ac8..3cf5f89 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -22,12 +22,13 @@
     "myst_parser",
 ]

+
 source_suffix = {
     ".rst": "restructuredtext",
     ".md": "markdown",
 }
 master_doc = "index"
-language = "python"
+# language = "python"

 # Furo theme
 html_theme = "furo"
@@ -53,8 +54,17 @@
     "scipy": ("http://docs.scipy.org/doc/scipy/reference", None),
     "pandas": ("https://pandas.pydata.org/docs", None),
     "matplotlib": ("http://matplotlib.org/stable", None),
+    "pygama": ("https://pygama.readthedocs.io/en/stable/", None),
+    "legendhpges": ("https://legend-pygeom-hpges.readthedocs.io/en/latest/", None),
 }  # add new intersphinx mappings here

+suppress_warnings = [
+    # "histogram" is defined both in pygama.dsp.processors.histogram.histogram
+    # and in pygama.math.histogram, leading to a Sphinx cross-referencing
+    # warning. I don't know how to fix this properly
+    "ref.python",
+]
+
 # sphinx-autodoc
 # Include __init__() docstring in class docstring
 autoclass_content = "both"
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 7885f27..9965e25 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,10 +1,54 @@
 Welcome to reboost's documentation!
 ==========================================

-Table of Contents
------------------
+*reboost* is a Python package for the post-processing of `remage `_ Monte Carlo simulations.
+
+Getting started
+---------------
+
+*reboost* can be installed with *pip*:
+
+.. code-block:: console
+
+   $ git clone git@github.com:legend-exp/reboost.git
+   $ cd reboost
+   $ pip install .
+
+*reboost* is currently divided into two programs:
+ - *reboost-optical* for processing optical simulations,
+ - *reboost-hpge* for processing HPGe detector simulations.
+
+Both can be run on the command line with:
+
+.. code-block:: console
+
+   $ reboost-optical -h
+   $ reboost-hpge -h
+
+Next steps
+----------
+
+.. toctree::
+   :maxdepth: 2
+
+   User Manual
+
+.. toctree::
+   :maxdepth: 1
+
+   tutorial

 .. toctree::
    :maxdepth: 1

    Package API reference
+
+
+See also
+--------
+ - `remage `_: Modern *Geant4* application for HPGe and LAr experiments,
+ - `legend-pygeom-hpges `_: Package for handling HPGe detector geometry in Python,
+ - `pyg4ometry `_: Package to create simulation geometry in Python,
+ - `legend-pygeom-optics `_: Package to handle optical properties in Python,
+ - `legend-pygeom-l200 `_: Implementation of the LEGEND-200 experiment (**private**),
+ - `pyvertexgen `_: Generation of vertices for simulations.
diff --git a/docs/source/manual/hpge.rst b/docs/source/manual/hpge.rst
new file mode 100644
index 0000000..d0012b8
--- /dev/null
+++ b/docs/source/manual/hpge.rst
@@ -0,0 +1,404 @@
+HPGe detector simulations
+=========================
+
+*reboost-hpge* is a sub-package for post-processing the high-purity germanium (HPGe) detector part of *remage* simulations.
+It provides a flexible framework for implementing user-customised post-processing.
+
+Command line interface
+----------------------
+
+A command-line tool, *reboost-hpge*, is provided to run the processing:
+
+.. code-block:: console
+
+    $ reboost-hpge -h
+
+Different modes are implemented to run the various tiers. For example, to run the *hit* tier processing (more details in the next section):
+
+.. code-block:: console
+
+    $ reboost-hpge hit -h
+
+
+*remage* lh5 output format
+--------------------------
+
+The output simulations from *remage* are described in `remage-docs `_.
+By default, a set of ``lgdo.Table`` objects (`docs `_) is stored with the
+following format:
+
+.. code-block:: console
+
+    /
+    └── hit · HDF5 group
+        ├── det000 · table{evtid,particle,edep,time,xloc,yloc,zloc}
+        │   ├── edep · array<1>{real}
+        │   ├── evtid · array<1>{real}
+        │   ├── particle · array<1>{real}
+        │   ├── time · array<1>{real}
+        │   ├── xloc · array<1>{real}
+        │   ├── yloc · array<1>{real}
+        │   └── zloc · array<1>{real}
+        ├── det001 · table{evtid,particle,edep,time,xloc,yloc,zloc}
+        |   ....
+        |   ....
+        └── vertices · table{evtid,time,xloc,yloc,zloc,n_part}
+            ├── evtid · array<1>{real}
+            ├── n_part · array<1>{real}
+            ├── time · array<1>{real}
+            ├── xloc · array<1>{real}
+            ├── yloc · array<1>{real}
+            └── zloc · array<1>{real}
+
+
+One table is stored per sensitive germanium detector, and a table of the simulated vertices is also stored.
+All the data is stored as (flat) 1D arrays:
+
+- *edep*: energy deposited in germanium (in keV),
+- *evtid*: index of the simulated event,
+- *particle*: Geant4 code for the particle type,
+- *time*: time of the step relative to the start of the event,
+- *xloc/yloc/zloc*: position of the interaction / vertex,
+- *n_part*: number of particles emitted.
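+
+To get a feel for the format, the steps of one detector can be read into *awkward* (a minimal sketch; the group and file names are examples):
+
+.. code-block:: python
+
+    from lgdo import lh5
+
+    # read the flat step table of one detector as an awkward array
+    steps = lh5.read_as("hit/det000", "output.lh5", "ak")
+
+    # e.g. the energy deposited by each step
+    print(steps.edep)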
+
+However, this format is not directly comparable to experimental data.
+
+
+Data tiers
+----------
+
+The processing is defined in terms of several *tiers*, mirroring the logic of the `pygama `_ data processing software used for LEGEND:
+
+- **stp** (or "step"): the raw *remage* output, corresponding to Geant4 steps,
+- **hit**: the data of each channel independently, after grouping the steps into discrete physical interactions in the detector,
+- **evt** (or "event"): the data combining the information from the various detectors (this includes generating the **tcm**, or time-coincidence map).
+
+The processing is divided into two steps: :func:`build_hit` and ``build_evt`` [WIP].
+
+Hit tier processing
+-------------------
+
+The hit tier converts the raw *remage* file, based on Geant4 steps, into a file corresponding to the physical interactions in the detectors.
+Only the processing of each detector independently is performed in this step.
+The processing is based on a YAML or JSON configuration file. For example:
+
+.. code-block:: yaml
+
+    channels:
+      - det000
+      - det001
+      - det002
+      - det003
+
+    outputs:
+      - t0
+      - truth_energy_sum
+      - smeared_energy_sum
+      - evtid
+
+    step_group:
+      description: group steps by time and evtid.
+      expression: 'reboost.hpge.processors.group_by_time(stp,window=10)'
+
+    locals:
+      hpge: 'reboost.hpge.utils.get_hpge(meta_path=meta,pars=pars,detector=detector)'
+
+    operations:
+      t0:
+        description: first time in the hit.
+        mode: eval
+        expression: 'ak.fill_none(ak.firsts(hit.time,axis=-1),np.nan)'
+      truth_energy_sum:
+        description: truth summed energy in the hit.
+        mode: eval
+        expression: 'ak.sum(hit.edep,axis=-1)'
+      smeared_energy_sum:
+        description: summed energy after convolution with energy response.
+        mode: function
+        expression: |
+          reboost.hpge.processors.smear_energies(hit.truth_energy_sum,reso=pars.reso)
+
+It is necessary to provide several sub-dictionaries:
+
+- **channels**: list of HPGe channels to process,
+- **outputs**: list of fields for the output file,
+- **locals**: objects used by the processors (passed as ``locals`` to ``LGDO.Table.eval``), more details below,
+- **step_group**: describes the function that groups the Geant4 steps into physical *hits*,
+- **operations**: further computations / manipulations to apply.
+
+The **step_group** block sets the structure of the output file: the chosen function reformats the flat input table into a table
+with a jagged structure, where each row corresponds to a physical hit in the detector. For example:
+
+.. code-block:: console
+
+    evtid: [0 , 0, 1, ... ]
+    edep: [101.2, 201.2, 303.7, ... ]
+    time: [0 , 0.1 , 0, ... ]
+    ....
+
+becomes a :class:`Table` of :class:`VectorOfVectors` with a jagged structure:
+
+.. code-block:: console
+
+    evtid: [[0 , 0], [ 1],[...],... ]
+    edep: [[101.2, 201.2], [303.7],[...],... ]
+    time: [[0 , 0.1], [ 0],[...],... ]
+    ....
+
+The recommended tool for manipulating jagged arrays is *awkward* (`docs `_), and much of *reboost* is based on it.
+
+It is necessary to choose a function to perform this step grouping. This function must take the *remage* output table and return
+a table where all the input arrays are converted to :class:`LGDO.VectorOfVectors` with a jagged structure; in the expression, *stp* is an alias
+for the input *remage* table. The function must return the original LH5 table with the same fields as above, restructured so that each field is a :class:`VectorOfVectors`.
+In addition, a ``global_evtid`` field is added, representing the index of the event over all input files; a sketch of such a grouping function is shown below.
+
+Next, a set of operations can be specified; these can perform any manipulation that does not change the length of the data. They can either be basic numerical operations
+(including *awkward* or *numpy* expressions) or be specified by a function. The expressions can reference several variables:
+
+- **hit**: the output table of the step grouping (note that the table is updated after each operation, so the order of the operations is important),
+- **pars**: a named tuple of parameters for this detector (more details later),
+- **hpge**: the ``legendhpges.HPGe`` object for this detector,
+- **phy_vol**: the ``pyg4ometry`` physical volume for the detector.
+
+Finally, the **outputs** field specifies the columns of the table to include in the output file.
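+
+For illustration, the core of such a grouping function can be sketched with plain *numpy*/*awkward*. This is a simplified sketch only: the real ``reboost.hpge.processors.group_by_time`` operates on LGDO tables, handles units, and also adds the ``global_evtid`` field. Times are assumed to be in nanoseconds and *window* in microseconds.
+
+.. code-block:: python
+
+    import awkward as ak
+    import numpy as np
+
+    def group_by_time_sketch(stp: ak.Array, window: float = 10) -> ak.Array:
+        # assume the steps are already sorted by (evtid, time)
+        evtid = np.asarray(stp.evtid)
+        time = np.asarray(stp.time)
+
+        # a new hit starts when the event index changes or the time gap
+        # to the previous step exceeds the window
+        new_hit = np.ones(len(evtid), dtype=bool)
+        new_hit[1:] = (np.diff(evtid) != 0) | (np.diff(time) > window * 1000)
+
+        # run lengths between hit boundaries define the jagged structure
+        counts = np.diff(np.append(np.where(new_hit)[0], len(evtid)))
+        return ak.Array({f: ak.unflatten(stp[f], counts) for f in stp.fields})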
+
+lh5 i/o operations
+^^^^^^^^^^^^^^^^^^
+
+:func:`build_hit` contains several options to handle the i/o of lh5 files.
+
+Typically, raw Geant4 output files can be very large (many GB), so it is not desirable or even feasible to read the full file into memory.
+Instead, the :class:`lgdo.lh5.LH5Iterator` is used to iterate over chunks of the files, keeping the memory usage reasonable. The *buffer* keyword argument
+of :func:`build_hit` controls the size of the buffer.
+
+It is possible to specify a list of input files or to use wildcards; the *merge_input_files* argument controls whether the outputs are merged or kept as separate files.
+
+Finally, it is sometimes desirable to process only a subset of the simulated events, for example to split the simulation by run or period. The *start_evtid* and *n_evtid*
+keyword arguments control the first simulation index to process and the number of events, respectively. Note that the indices refer to the *global* evtid when multiple files are used.
+
+parameters and other *local* variables
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Often it is necessary to include processors that depend on parameters, which may vary by detector. To enable this, the user can specify a dictionary of
+parameters with the *pars* keyword; this should contain a sub-dictionary per detector, for example:
+
+.. code-block:: yaml
+
+    det000:
+      reso: 1
+      fccd: 0.1
+      phy_vol_name: det_phy
+      meta_name: icpc.json
+
+
+This dictionary is internally converted into a Python ``NamedTuple`` for cleaner syntax. The named tuple for each detector is then passed as a
+``local`` dictionary to the evaluation of the operations under the name "pars".
+
+In addition, for many post-processing applications it is necessary for the processor functions to know the geometry. This is made possible
+by passing the path to the GDML file and the path to the metadata (the "diodes" folder) with the *gdml* and *meta_path* arguments of :func:`build_hit`.
+From the GDML file the ``pyg4ometry.geant4.Registry`` is extracted.
+
+To allow the flexibility to write processors depending on arbitrary (more complicated) Python objects, it is possible to add a *locals* dictionary
+to the config file. The code will then evaluate the supplied expression for each sub-dictionary. These expressions can depend on:
+
+- **detector**: the *remage* detector name,
+- **meta**: the path to the metadata,
+- **reg**: the Geant4 registry,
+- **pars**: the parameters for this detector.
+
+These expressions are evaluated (once per detector) and added to the *locals* dictionary of :func:`Table.eval`, so they can be referenced in the expressions.
+
+For example, one useful object for post-processing is the :class:`legendhpges.base.HPGe` object for the detector.
+This can be constructed from the metadata using:
+
+.. code-block:: yaml
+
+    hpge: 'reboost.hpge.utils.get_hpge(meta_path=meta,pars=pars,detector=detector)'
+
+This creates the HPGe object for each detector and adds it to the "locals" mapping of "eval" so it can be used in the operations.
+
+Possible intended use cases of this functionality are:
+
+ - extracting detector mappings (e.g. drift-time maps),
+ - extracting the kernel of a machine-learning model,
+ - any more complicated (non-JSON-serialisable) objects.
+
+Adding new processors
+^^^^^^^^^^^^^^^^^^^^^
+
+Any Python function can be a ``reboost.hit`` processor. The only requirement is that it returns a:
+
+- :class:`VectorOfVectors`,
+- :class:`Array` or
+- :class:`ArrayOfEqualSizedArrays`
+
+with the same length as the hit table. This means processors can act on subarrays (``axis=-1`` in awkward syntax) but should not combine multiple rows of the hit table.
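+
+For example, a minimal energy-smearing processor in the spirit of the ``reboost.hpge.processors.smear_energies`` used in the configuration above could look as follows (a sketch only; the actual implementation may differ, and the input is assumed to be convertible to a flat *numpy* array):
+
+.. code-block:: python
+
+    import numpy as np
+    from lgdo import Array
+
+    def smear_energies_sketch(truth_energy, reso: float = 1.0) -> Array:
+        # one value in, one value out: the hit-table length is preserved
+        energies = np.asarray(truth_energy)
+        rng = np.random.default_rng()
+        # convolve with a Gaussian energy response of width `reso` (keV)
+        return Array(rng.normal(loc=energies, scale=reso))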
+
+It is simple to accommodate most of the current and future envisaged post-processing in this framework. For example:
+
+- clustering hits would result in a new :class:`VectorOfVectors` with the same number of rows but fewer entries per vector,
+- pulse-shape simulations producing waveforms (or ML emulation of this) would give an :class:`ArrayOfEqualSizedArrays`,
+- processing many parameters in parallel (e.g. for systematics studies) would give a nested :class:`VectorOfVectors`.
+
+Time coincidence map (TCM)
+--------------------------
+
+The next step in the processing chain is the **event** tier, which combines the information from the various sub-systems to produce detector-wide events.
+However, before we can generate the *evt* tier we need to generate the "time-coincidence map" (TCM). This determines which of the hits in the various detectors
+occur *simultaneously* (in practice, within some coincidence time window) and should be part of the same event.
+Some information on the TCM in data is given in `[pygama-evt-docs] `_. The *reboost* TCM is fairly similar.
+
+The generation of the TCM is performed by :func:`reboost.hpge.tcm.build_tcm`, which generates the TCM and stores it on disk; a simplified sketch of the clustering logic is shown at the end of this section.
+
+.. warning::
+    The generation of the TCM from the times of the hits is slightly different from the "hardware TCM" used for LEGEND physics data. In the experimental data, a signal on one channel triggers
+    the readout of the full array. Care should be taken for decays or interactions with time differences of :math:`\sim 10-100\ \mu s` between hits.
+    However, in practice the time differences are usually very small, and the two TCMs should be equivalent after removing hits below threshold.
+
+Before explaining how the TCM is constructed, we make a detour to explain the different indices present in the *reboost* and *remage* files.
+
+- **stp.evtid**: in the *remage* output files we have a variable called *evtid*. This is the index of the decay; as explained earlier, a single *evtid* can result in multiple hits in the detector.
+- **hit.global_evtid**: when multiple files are read, the *evtid* are no longer necessarily sorted or unique, so we define a new index in the hit tier. This is extracted from the vertices table as
+  the sum of the number of *evtid* in the previous files plus the row in the vertex table of the *evtid*. A vector of vectors called "hit._global_evtid" is added to the hit table. We can also extract
+  a flat array of indices (for easier use in the evt tier) with a simple processor:
+
+  .. code:: yaml
+
+    global_evtid:
+      description: global evtid of the hit.
+      mode: eval
+      expression: 'ak.fill_none(ak.firsts(hit._global_evtid,axis=-1),np.nan)'
+
+  This field is mandatory for generating the TCM, and the name of the field is an argument of "build_tcm".
+- **hit idx**: multiple rows of the hit table may contain the same "global_evtid", while many "global_evtid" do not result in a hit at all. The hit idx is simply the row of the hit table a hit corresponds to.
+- **channel_id**: when we combine multiple channels we assign each an index; this is set in the original *remage* macro file.
+
+:func:`build_tcm` saves two :class:`VectorOfVectors` (with the same shape) to the output file, corresponding to the **channel_id** and the **hit_idx** of each event.
+
+.. note::
+    - This storage is slightly different from the TCM in data, but is chosen to allow easy iteration through the TCM.
+    - We do not currently support merging multiple **hit** tier files, since the TCM would then need to know which file each hit corresponds to.
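+
+The clustering logic can be illustrated as follows. This is a deliberately simplified sketch that clusters purely by time: the real :func:`reboost.hpge.tcm.build_tcm` reads and writes LH5 files, uses the ``global_evtid`` field and handles chunking. The input ``t0s`` is assumed to be a mapping from channel id to the array of hit times of that channel.
+
+.. code-block:: python
+
+    import awkward as ak
+    import numpy as np
+
+    def build_tcm_sketch(t0s: dict, window: float) -> ak.Array:
+        # pool the channel ids, hit-table rows and times of all channels
+        channel_id = np.concatenate([np.full(len(t), ch) for ch, t in t0s.items()])
+        hit_idx = np.concatenate([np.arange(len(t)) for t in t0s.values()])
+        times = np.concatenate(list(t0s.values()))
+
+        # time-order all hits
+        order = np.argsort(times)
+        times, channel_id, hit_idx = times[order], channel_id[order], hit_idx[order]
+
+        # a new event starts when the gap to the previous hit exceeds the window
+        new_event = np.ones(len(times), dtype=bool)
+        new_event[1:] = np.diff(times) > window
+        counts = np.diff(np.append(np.where(new_event)[0], len(times)))
+
+        return ak.Array(
+            {
+                "array_id": ak.unflatten(channel_id, counts),
+                "array_idx": ak.unflatten(hit_idx, counts),
+            }
+        )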
+
+Event tier processing
+---------------------
+
+The event tier combines the information from the various detector systems, in future including also the optical detector channels. This step is thus only necessary for experiments with
+many output channels.
+
+The processing is again based on a YAML or JSON configuration file. Most of the work of evaluating each expression is done by :func:`pygama.evt.build_evt.evaluate_expression`, and our conventions for processors
+follow those of *pygama*.
+The input configuration file is identical to a *pygama* evt tier configuration file (see an example in :func:`pygama.evt.build_evt.build_evt`).
+
+For example:
+
+.. code-block:: yaml
+
+    channels:
+      geds_on:
+        - det000
+        - det001
+        - det002
+      geds_ac:
+        - det003
+
+    outputs:
+      - energy
+      - multiplicity
+
+    operations:
+      energy_id:
+        channels: geds_on
+        aggregation_mode: gather
+        query: hit.energy > 25
+        expression: tcm.array_id
+      energy:
+        aggregation_mode: 'keep_at_ch:evt.energy_id'
+        channels: geds_on
+        expression: hit.energy
+      multiplicity:
+        channels:
+          - geds_on
+          - geds_ac
+        aggregation_mode: sum
+        expression: hit.energy > 25
+        initial: 0
+
+- **channels**: defines named groups of channels to which the operations are applied,
+- **outputs**: defines the fields to include in the output file,
+- **operations**: a list of operations to perform.
+
+The type of each operation is based on the evaluation modes of :func:`pygama.evt.build_evt.build_evt`.
+Each operation is defined by a configuration block which can have the following keys:
+
+- **channels**: list of channels to perform the operation on,
+- **exclude_channels**: channels to set to the default value,
+- **initial**: initial value of the aggregator,
+- **aggregation_mode**: how to combine the channels (more information below),
+- **expression**: expression to evaluate,
+- **query**: logical statement to select only some channels,
+- **sort**: expression used for sorting the output, in the format "ascend_by:field" or "descend_by:field".
+
+
+Aggregation modes
+^^^^^^^^^^^^^^^^^
+
+There are several different ways to aggregate the data from the different detectors / channels (an illustration of the *gather* mode is shown at the end of this section).
+
+- *no aggregator supplied*: the code performs a simple evaluation of quantities in the ``evt`` tier data, for example:
+
+  .. code-block:: yaml
+
+    energy_sum:
+      expression: ak.sum(evt.energies,axis=-1)
+
+- *"first_at:sorter"*: picks the value corresponding to the channel (TCM ID) with the lowest value of the "sorter" field. For example:
+
+  .. code-block:: yaml
+
+    first_time:
+      channels: geds_on
+      aggregation_mode: first_at:hit.timestamp
+      expression: hit.timestamp
+
+- *"last_at:sorter"*: the same, but for the highest value,
+- *"gather"*: combines the fields into a :class:`VectorOfVectors`, sorted by the "sort" keys. For example, the following processor is used to extract the channel id (``tcm.array_id``) of every hit above a 25 keV energy threshold:
+
+  .. code-block:: yaml
+
+    channel_id:
+      channels: geds_on
+      aggregation_mode: gather
+      query: hit.energy > 25
+      expression: tcm.array_id
+      sort: descend_by:hit.energy
+
+- *"keep_at_ch:channel_id_field"*: similarly combines into a :class:`VectorOfVectors`, but uses only the ids from the "channel_id_field" and preserves the order of the subvectors.
+  For example, we can make a processor to extract the energy of each hit from the previous part:
+
+  .. code-block:: yaml
+
+    energy:
+      channels: geds_on
+      aggregation_mode: keep_at_ch:evt.channel_id
+      expression: hit.energy
+
+- *"keep_at_idx:tcm_index_field"*: similar, but instead preserves the shape of the TCM. First we need to generate a TCM index field:
+
+  .. code-block:: yaml
+
+    tcm_idx:
+      channels: geds_on
+      aggregation_mode: gather
+      query: hit.energy > 25
+      expression: tcm.index
+      sort: descend_by:hit.energy
+
+  This says, for every element of the :class:`VectorOfVectors`, which index in the flattened data of the TCM to extract the value from. To find the value to fill a row of the output, the code searches for the
+  TCM ``idx`` and ``id`` corresponding to the supplied index.
+
+- *"all"*, *"any"*, *"sum"*: aggregates with the corresponding operation.
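+
+Conceptually, the *gather* mode with a query and a sort key corresponds to the following jagged-array manipulation (illustrative values only, not the actual implementation):
+
+.. code-block:: python
+
+    import awkward as ak
+
+    # three events with jagged per-hit energies and channel ids
+    energy = ak.Array([[30.0, 10.0, 80.0], [5.0], [40.0, 60.0]])
+    chan = ak.Array([[0, 1, 2], [1], [0, 2]])
+
+    above = energy > 25  # the "query"
+    order = ak.argsort(energy[above], ascending=False)  # "descend_by:hit.energy"
+    channel_id = chan[above][order]
+    # channel_id -> [[2, 0], [], [2, 0]]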
+
+There is also a function mode, but this is not currently used in *reboost* and is not expected to be needed.
diff --git a/docs/source/manual/index.rst b/docs/source/manual/index.rst
new file mode 100644
index 0000000..99ecefb
--- /dev/null
+++ b/docs/source/manual/index.rst
@@ -0,0 +1,8 @@
+User Manual
+===========
+
+.. toctree::
+   :maxdepth: 2
+
+   optical
+   hpge
diff --git a/docs/source/manual/optical.rst b/docs/source/manual/optical.rst
new file mode 100644
index 0000000..caf38b7
--- /dev/null
+++ b/docs/source/manual/optical.rst
@@ -0,0 +1,4 @@
+Optical (SiPM) simulations processing
+=====================================
+
+Come back later for more complete documentation.
diff --git a/docs/source/notebooks/images/det.png b/docs/source/notebooks/images/det.png
new file mode 100644
index 0000000..4ffbf69
Binary files /dev/null and b/docs/source/notebooks/images/det.png differ
diff --git a/docs/source/notebooks/images/output_17_1.png b/docs/source/notebooks/images/output_17_1.png
new file mode 100644
index 0000000..374c960
Binary files /dev/null and b/docs/source/notebooks/images/output_17_1.png differ
diff --git a/docs/source/notebooks/images/output_20_1.png b/docs/source/notebooks/images/output_20_1.png
new file mode 100644
index 0000000..0c30246
Binary files /dev/null and b/docs/source/notebooks/images/output_20_1.png differ
diff --git a/docs/source/notebooks/images/output_24_0.png b/docs/source/notebooks/images/output_24_0.png
new file mode 100644
index 0000000..8b4207a
Binary files /dev/null and b/docs/source/notebooks/images/output_24_0.png differ
diff --git a/docs/source/notebooks/images/output_27_1.png b/docs/source/notebooks/images/output_27_1.png
new file mode 100644
index 0000000..32a072f
Binary files /dev/null and b/docs/source/notebooks/images/output_27_1.png differ
diff --git a/docs/source/notebooks/images/output_28_1.png b/docs/source/notebooks/images/output_28_1.png
new file mode 100644
index 0000000..48ba2d4
Binary files /dev/null and b/docs/source/notebooks/images/output_28_1.png differ
diff --git a/docs/source/notebooks/images/output_30_1.png b/docs/source/notebooks/images/output_30_1.png
new file mode 100644
index 0000000..240b029
Binary files /dev/null and b/docs/source/notebooks/images/output_30_1.png differ
diff --git a/docs/source/notebooks/images/output_31_0.png b/docs/source/notebooks/images/output_31_0.png
new file mode 100644
index 0000000..2cee4f2
Binary files /dev/null and b/docs/source/notebooks/images/output_31_0.png differ
diff --git a/docs/source/notebooks/images/output_34_0.png b/docs/source/notebooks/images/output_34_0.png
new file mode 100644
index 0000000..6802adb
Binary files /dev/null and b/docs/source/notebooks/images/output_34_0.png differ
diff --git a/docs/source/notebooks/images/output_35_0.png
b/docs/source/notebooks/images/output_35_0.png new file mode 100644 index 0000000..6e15606 Binary files /dev/null and b/docs/source/notebooks/images/output_35_0.png differ diff --git a/docs/source/notebooks/images/output_37_0.png b/docs/source/notebooks/images/output_37_0.png new file mode 100644 index 0000000..ea77b1f Binary files /dev/null and b/docs/source/notebooks/images/output_37_0.png differ diff --git a/docs/source/notebooks/images/output_54_1.png b/docs/source/notebooks/images/output_54_1.png new file mode 100644 index 0000000..2f3f13d Binary files /dev/null and b/docs/source/notebooks/images/output_54_1.png differ diff --git a/docs/source/notebooks/images/output_58_1.png b/docs/source/notebooks/images/output_58_1.png new file mode 100644 index 0000000..fd21818 Binary files /dev/null and b/docs/source/notebooks/images/output_58_1.png differ diff --git a/docs/source/notebooks/reboost_hpge_tutorial_evt.rst b/docs/source/notebooks/reboost_hpge_tutorial_evt.rst new file mode 100644 index 0000000..055f279 --- /dev/null +++ b/docs/source/notebooks/reboost_hpge_tutorial_evt.rst @@ -0,0 +1,889 @@ +Event tier simulation processing +================================ + +The next tutorial describes how to combine the information from multiple +detectors, i.e. to create detector-wide “events”, stored in the **evt** +tier. + +To demonstrate the power of multi-detector event building we need a +detector system with a large number of detectors. We chose an array of +20 detectors emersed in liquid argon + +0) Setting up the python environment +------------------------------------ + +.. code:: python + + from lgdo import lh5 + from reboost.hpge import hit, tcm + import matplotlib.pyplot as plt + import pyg4ometry as pg4 + import legendhpges + from legendhpges import draw, make_hpge + import awkward as ak + import logging + import colorlog + import hist + import numpy as np + + + plt.rcParams["figure.figsize"] = [12, 4] + plt.rcParams["axes.titlesize"] = 12 + plt.rcParams["axes.labelsize"] = 12 + plt.rcParams["legend.fontsize"] = 12 + + + handler = colorlog.StreamHandler() + handler.setFormatter( + colorlog.ColoredFormatter("%(log_color)s%(name)s [%(levelname)s] %(message)s") + ) + logger = logging.getLogger() + logger.handlers.clear() + logger.addHandler(handler) + logger.setLevel(logging.INFO) + logger.info("test") + + + + +.. parsed-literal:: + + The memory_profiler extension is already loaded. To reload it, use: + %reload_ext memory_profiler + + +1) Running the simulation +------------------------- + +First we generate a geometry and run the simulation. We use similar BeGe +detectors as in the part before + +.. 
code:: python + + + reg = pg4.geant4.Registry() + + bege_meta = { + "name": "B00000B", + "type": "bege", + "production": { + "enrichment": {"val": 0.874, "unc": 0.003}, + "mass_in_g": 697.0, + }, + "geometry": { + "height_in_mm": 29.46, + "radius_in_mm": 36.98, + "groove": {"depth_in_mm": 2.0, "radius_in_mm": {"outer": 10.5, "inner": 7.5}}, + "pp_contact": {"radius_in_mm": 7.5, "depth_in_mm": 0}, + "taper": { + "top": {"angle_in_deg": 0.0, "height_in_mm": 0.0}, + "bottom": {"angle_in_deg": 0.0, "height_in_mm": 0.0}, + }, + }, + } + + # make a detector + bege_l = make_hpge(bege_meta, name="BEGe_L", registry=reg) + + + # create a world volume + world_s = pg4.geant4.solid.Tubs( + "World_s", 0, 60, 100, 0, 2 * np.pi, registry=reg, lunit="cm" + ) + world_l = pg4.geant4.LogicalVolume(world_s, "G4_Galactic", "World", registry=reg) + reg.setWorld(world_l) + + # let's make a liquid argon balloon + lar_s = pg4.geant4.solid.Tubs( + "LAr_s", 0, 20, 40, 0, 2 * np.pi, registry=reg, lunit="cm" + ) + lar_l = pg4.geant4.LogicalVolume(lar_s, "G4_lAr", "LAr_l", registry=reg) + pg4.geant4.PhysicalVolume([0, 0, 0], [0, 0, 0], lar_l, "LAr", world_l, registry=reg) + + # lets make 4 strings of 5 detectors + + det_count = 0 + for x in [-50, 50]: + for y in [-50, 50]: + for z in [-100, -50, 0, 50, 100]: + pg4.geant4.PhysicalVolume( + [0, 0, 0], + [x, y, z, "mm"], + bege_l, + f"BEGe_{det_count}", + lar_l, + registry=reg, + ) + det_count += 1 + + w = pg4.gdml.Writer() + w.addDetector(reg) + w.write("cfg/geom.gdml") + + +Uncomment the next block to visualise +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code:: python + + + viewer = pg4.visualisation.VtkViewerColoured( + materialVisOptions={"G4_lAr": [0, 0, 1, 0.1]} + ) + viewer.addLogicalVolume(reg.getWorldVolume()) + viewer.view() + +.. figure:: images/det.png + :alt: image.png + + image.png + +We create our remage macro, this time for K42 in the LAr, a critical +background source in LEGEND! + +.. code:: text + + /RMG/Manager/Logging/LogLevel detail + + /RMG/Geometry/RegisterDetector Germanium BEGe_0 000 + /RMG/Geometry/RegisterDetector Germanium BEGe_1 001 + /RMG/Geometry/RegisterDetector Germanium BEGe_2 002 + /RMG/Geometry/RegisterDetector Germanium BEGe_3 003 + /RMG/Geometry/RegisterDetector Germanium BEGe_4 004 + /RMG/Geometry/RegisterDetector Germanium BEGe_5 005 + /RMG/Geometry/RegisterDetector Germanium BEGe_6 006 + /RMG/Geometry/RegisterDetector Germanium BEGe_7 007 + /RMG/Geometry/RegisterDetector Germanium BEGe_8 008 + /RMG/Geometry/RegisterDetector Germanium BEGe_9 009 + /RMG/Geometry/RegisterDetector Germanium BEGe_10 010 + /RMG/Geometry/RegisterDetector Germanium BEGe_11 011 + /RMG/Geometry/RegisterDetector Germanium BEGe_12 012 + /RMG/Geometry/RegisterDetector Germanium BEGe_13 013 + /RMG/Geometry/RegisterDetector Germanium BEGe_14 014 + /RMG/Geometry/RegisterDetector Germanium BEGe_15 015 + /RMG/Geometry/RegisterDetector Germanium BEGe_16 016 + /RMG/Geometry/RegisterDetector Germanium BEGe_17 017 + /RMG/Geometry/RegisterDetector Germanium BEGe_18 018 + /RMG/Geometry/RegisterDetector Germanium BEGe_19 019 + + + /run/initialize + + /RMG/Generator/Confine Volume + /RMG/Generator/Confinement/Physical/AddVolume LAr + + /RMG/Generator/Select GPS + /gps/particle ion + /gps/energy 0 eV + /gps/ion 19 42 # 42-K + + + /run/beamOn 10000000 + +2) Remage + hit tier files +-------------------------- + +For this tutorial we use the same remage simulation as for the hit tier +simulation. See the previous tutorial for how to generate them. + +We can check these files. 
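+
+For example (assuming the hit-tier output path from the previous tutorial):
+
+.. code:: python
+
+    lh5.show("output/hit/output.lh5")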
+ +We run a simplified reboost hit tier processing, unlike that in the +previous tutorial which deliberately included many outputs to show the +effect of processors. + +First we define the config file and parameters. + +.. code:: python + + chans = [f"det{num:03}" for num in range(20)] + + +.. code:: python + + chain = { + "channels": chans, + "outputs": [ + "t0", # first timestamp + "evtid", # id of the hit + "global_evtid", # global id of the hit + "energy_sum_no_deadlyer", + "energy_sum", # true summed energy before dead layer or smearing + ], + "step_group": { + "description": "group steps by time and evtid with 10us window", + "expression": "reboost.hpge.processors.group_by_time(stp,window=10)", + }, + "locals": { + "hpge": "reboost.hpge.utils.get_hpge(meta_path=meta,pars=pars,detector=detector)", + "phy_vol": "reboost.hpge.utils.get_phy_vol(reg=reg,pars=pars,detector=detector)", + }, + "operations": { + "t0": { + "description": "first time in the hit.", + "mode": "eval", + "expression": "ak.fill_none(ak.firsts(hit.time,axis=-1),np.nan)", + }, + "evtid": { + "description": "global evtid of the hit.", + "mode": "eval", + "expression": "ak.fill_none(ak.firsts(hit._evtid,axis=-1),np.nan)", + }, + "global_evtid": { + "description": "global evtid of the hit.", + "mode": "eval", + "expression": "ak.fill_none(ak.firsts(hit._global_evtid,axis=-1),np.nan)", + }, + "distance_to_nplus_surface_mm": { + "description": "distance to the nplus surface in mm", + "mode": "function", + "expression": "reboost.hpge.processors.distance_to_surface(hit.xloc, hit.yloc, hit.zloc, hpge, phy_vol.position.eval(), surface_type='nplus',unit='m')", + }, + "activeness": { + "description": "activness based on FCCD (no TL)", + "mode": "eval", + "expression": "ak.where(hit.distance_to_nplus_surface_mm + + + + +.. image:: images/output_17_1.png + + +We see that the local index varies between 0 and 1e7 per file while the +global index increases constantly. We can even check this (as it must +from our hit tier processing). + +.. code:: python + + evtid_change = np.diff(data_det001.global_evtid) + +.. code:: python + + print(f"evtid change = {evtid_change}, increasing? {np.all(evtid_change>=0)}") + + +.. parsed-literal:: + + evtid change = [843. 153. 313. ... 27. 500. 345.], increasing? True + + +Now we can build the time-coincidence map and save to a new file. + +.. code:: python + + tcm.build_tcm( + "output/hit/output.lh5", + "output/tcm/test_tcm.lh5", + chans, + time_name="t0", + idx_name="global_evtid", + ) + + + +.. parsed-literal:: + + peak memory: 690.61 MiB, increment: 0.27 MiB + + +We see that building the TCM was fast only taking around 1s. However +since building the TCM requires reading all the files simultaneously we +should be careful about the memory usage. Here using around 1 GB is +still quite significant. + + **Technical note**: - The code reads all the hit files in + simultaneously, this could cause a memory issue if the file is very + large - In this case the user must break the hit tier file into more + chunks (see build_hit) options + +Now we can look at our TCM. + +.. code:: python + + tcm_ak = lh5.read("tcm", "output/tcm/test_tcm.lh5").view_as("ak") + +.. code:: python + + lh5.show("output/tcm/test_tcm.lh5") + + +.. code:: python + + tcm_ak + + + + +.. raw:: html + + + + + +We see that almost all of the events have only trigger in a single +channel. We can also extract easily the multiplicity of the events or +the number of triggers in the TCM. 
+ +More sophisticated calcations can be performed by also grabbing +information from the hit tier files. This is done by ``build_evt``. + +.. code:: python + + plt.hist( + ak.num(tcm_ak.array_id, axis=-1), + range=(0.5, 20.5), + bins=20, + alpha=0.3, + density=True, + ) + plt.yscale("log") + plt.xlim(0.5, 10) + plt.xlabel("Multiplicity") + plt.ylabel("Probability ") + + + + +.. parsed-literal:: + + Text(0, 0.5, 'Probability ') + + + + +.. image:: images/output_30_1.png + + +3.1) TCM ``id``,\ ``idx`` and ``index`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Internally we have three different TCM attributes. \* ``tcm.idx`` : the +row of the hit table \* ``tcm.id`` : the channel number \* +``tcm.index``: the row of the flattened tcm the hit corresponds to + +This names are surely very confusing but are inherited from pygama and +will be updated in future. + +These are computed with the following block of code: + +.. code:: python + + idx_ch = tcm.idx[tcm.id == table_id] + outsize = len(idx_ch) + + if expr == "tcm.array_id": + res = np.full(outsize, table_id, dtype=int) + elif expr == "tcm.array_idx": + res = idx_ch + elif expr == "tcm.index": + res = np.where(tcm.id == table_id)[0] + +4) Generating the event tier +---------------------------- + +Next we can use our time-coincidence-map to generate files containing +information on each event. Similar to the ``hit`` tier the processing is +based on a YAML or JSON configuration file. + +This config file must contain a key with lists of different groups of +channels. It must define the output fields and then finally a block of +operations to perform. Similar to our hit tier processing the idea is +that each operation / processor computes an LGDO object +(``VectorOfVectors``, ``Array`` or ``ArrayOfEqualSizedArrays``) with the +same length as the TCM. + +We can define some groups of channels for our processing chain. Lets set +some channels off and some to ac (this means the channel is used for +anticoincidence but is not fully usable). + +.. code:: python + + chans_off = ["det003", "det007"] + chans_ac = ["det013", "det016"] + chans_on = [ + chan for chan in chans if (chan not in chans_off) and (chan not in chans_ac) + ] + chans_on + + + + +.. parsed-literal:: + + ['det000', + 'det001', + 'det002', + 'det004', + 'det005', + 'det006', + 'det008', + 'det009', + 'det010', + 'det011', + 'det012', + 'det014', + 'det015', + 'det017', + 'det018', + 'det019'] + + + +We create a simple event tier processing config demonstrating the +different possible aggregation modes. + + **Note**: this processing chain deliberately includes some additional + operations to show the effect of each aggregation mode. It: 1. finds + the channel ids with energy above threshold 2. finds the TCM + “index’s” or the index of the TCM VoV 3. computes a vector of vector + of the energies (ordered by channel) 4. computes a boolean flag of + whether each channel is ON (and not AC) 5. computes another flag + checking if each hit is above threshold 6. checks if the event + contains any AC hits 7. computes the summed energy, the first + timestamp and the multiplicity + +.. 
code:: python + + evt_config = { + "channels": {"geds_on": chans_on, "geds_ac": chans_ac}, + "outputs": [ + "channel_id", + "all_channel_id", + "tcm_index", + "is_good_channels", + "energy_sum", + "energy_vector", + "energy_no_threshold", + "is_all_above_threshold", + "t0", + "is_good_event", + "multiplicity", + ], + "operations": { + "channel_id": { + "channels": ["geds_on", "geds_ac"], + "aggregation_mode": "gather", + "query": "hit.energy_sum > 25", + "expression": "tcm.array_id", + "sort": "descend_by:hit.energy_sum", + }, + "all_channel_id": { + "channels": ["geds_on", "geds_ac"], + "aggregation_mode": "gather", + "expression": "tcm.array_id", + "sort": "descend_by:hit.energy_sum", + }, + "tcm_index": { + "channels": ["geds_on", "geds_ac"], + "aggregation_mode": "gather", + "query": "hit.energy_sum > 25", + "expression": "tcm.index", + }, + "energy_no_threshold": { + "channels": ["geds_on"], + "aggregation_mode": "keep_at_ch:evt.all_channel_id", + "expression": "hit.energy_sum", + }, + "energy_vector": { + "channels": ["geds_on"], + "aggregation_mode": "keep_at_ch:evt.channel_id", + "expression": "hit.energy_sum", + }, + "is_good_channels": { + "channels": ["geds_on", "geds_ac"], + "aggregation_mode": "keep_at_idx:evt.tcm_index", + "expression": "True", + "initial": False, + "exclude_channels": "geds_ac", + }, + "is_all_above_threshold": { + "channels": ["geds_on", "geds_ac"], + "aggregation_mode": "all", + "expression": "hit.energy_sum>25", + "initial": True, + }, + "is_good_event": {"expression": "ak.all(evt.is_good_channels,axis=-1)"}, + "energy_sum": { + "channels": ["geds_on"], + "aggregation_mode": "sum", + "query": "hit.energy_sum > 25", + "expression": "hit.energy_sum", + "initial": 0, + }, + "t0": { + "channels": ["geds_on"], + "aggregation_mode": "first_at:hit.t0", + "expression": "hit.t0", + }, + "multiplicity": { + "channels": ["geds_on"], + "aggregation_mode": "sum", + "expression": "hit.energy_sum > 25", + "initial": 0, + }, + }, + } + +.. code:: python + + from reboost.hpge import evt + + logger.setLevel(logging.INFO) + +.. code:: python + + evt_ak = evt.build_evt( + hit_file="output/hit/output.lh5", + tcm_file="output/tcm/test_tcm.lh5", + evt_file=None, + config=evt_config, + ) + + + +.. parsed-literal:: + + CPU times: user 5.21 s, sys: 530 ms, total: 5.74 s + Wall time: 5.74 s + + +4.1) “Gather” aggregation mode +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The gather mode is used by the ``channel_id`` and ``tcm_index`` +processors. It returns a VectorOfVector of the expression evaluated for +each hit in the event. \* the “query” removes some hits below the energy +threshold. \* “channels” defines the channels to include \* “sort”, +controls which order the values are in. + +For example we see that the “channel_id” is just a subset of the +original ``tcm_id`` removing some hits (below 25 keV) and changing the +order in some cases. These fields are then useful to extract values from +other fields keeping the correspondence with channel or tcm index. + +.. code:: python + + print("tcm.array_id ", tcm_ak.array_id) + print("evt.all_channel_id ", evt_ak.all_channel_id) + print("evt.channel_id ", evt_ak.channel_id) + print("evt.tcm_index ", evt_ak.tcm_index) + + +.. 
parsed-literal:: + + tcm.array_id [[11], [2], [11], [8], [3], [5, 16], ..., [...], [15], [12], [13], [17], [19]] + evt.all_channel_id [[11], [2], [11], [8], [], [16, 5], [0], ..., [4], [15], [12], [13], [17], [19]] + evt.channel_id [[11], [], [11], [], [], [16, 5], [0], ..., [4], [15], [], [13], [17], [19]] + evt.tcm_index [[0], [], [2], [], [], [...], ..., [540269], [], [540271], [540272], [540273]] + + +4.2) “keep_at_ch” and “keep_at_idx” +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + +These modes are similar but order the results according to either the +TCM index or the channel id fields. In our processing chain we used this +to extract the energies preserving the order with respect to the +channel_id. We see in both cases there are some “nan” values +(corresponding to the “geds_ac” channels) and some elements are removed +by the energy threshold. + +.. code:: python + + print("evt.energy_vector ", evt_ak.energy_vector) + print("evt.energy_no_threshold ", evt_ak.energy_no_threshold) + + +.. parsed-literal:: + + evt.energy_vector [[894], [], [125], [], [], [nan, ...], ..., [784], [], [nan], [1.36e+03], [258]] + evt.energy_no_threshold [[894], [0.761], [125], [-1.65], [], ..., [-0.79], [nan], [1.36e+03], [258]] + + +Or we can check if each channel is in AC mode. + +.. code:: python + + print("evt.is_good_channels ", evt_ak.is_good_channels) + + +.. parsed-literal:: + + evt.is_good_channels [[True], [], [True], [], [], [...], ..., [True], [], [False], [True], [True]] + + +4.3) “all” or “sum” mode +^^^^^^^^^^^^^^^^^^^^^^^^ + + +These modes aggregate the data from each channel by either summation or +all. Thus we get out a 1D vector. + +For example we checked if every hit is above threshold (compare to the +energy vectors), compute the total energy and check the number of +channels above threshold (multiplicity). + +.. code:: python + + print("evt_ak.is_all_above_threshold ", evt_ak.is_all_above_threshold) + print("evt_ak.energy_sum ", evt_ak.energy_sum) + print("evt_ak.energy_sum ", evt_ak.energy_sum) + print("evt_ak.multiplicity ", evt_ak.multiplicity) + + +.. parsed-literal:: + + evt_ak.is_all_above_threshold [True, False, True, False, True, True, ..., True, True, False, True, True, True] + evt_ak.energy_sum [894, 0, 125, 0, 0, 437, 147, 0, ..., 1.22e+03, 218, 784, 0, 0, 1.36e+03, 258] + evt_ak.energy_sum [894, 0, 125, 0, 0, 437, 147, 0, ..., 1.22e+03, 218, 784, 0, 0, 1.36e+03, 258] + evt_ak.multiplicity [1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, ..., 1, 1, 0, 2, 0, 1, 1, 1, 0, 0, 1, 1] + + +4.4) No aggregation mode +^^^^^^^^^^^^^^^^^^^^^^^^ + +Finally, we can perform some basic operations on evt tier variables, eg. +first checking if any above threshold channel is in AC mode. + +.. code:: python + + print("evt_ak.is_good_event ", evt_ak.is_good_event) + + +.. parsed-literal:: + + evt_ak.is_good_event [True, True, True, True, True, False, ..., True, True, True, False, True, True] + + +5) Analysis on the event tier data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + +Finally with our new evt files we can make some plots of experiment wide +quantities. + +.. code:: python + + def plot_energy(axes, energy, bins=400, xrange=None, label=" ", log_y=True, **kwargs): + h = hist.new.Reg(bins, *xrange, name="energy [keV]").Double() + h.fill(energy) + h.plot(**kwargs, label=label) + axes.legend() + if log_y: + axes.set_yscale("log") + if xrange is not None: + axes.set_xlim(*xrange) + +.. 
code:: python + + fig, ax = plt.subplots() + plot_energy( + ax, + evt_ak.energy_sum[evt_ak.multiplicity > 0], + yerr=False, + label="Summed energies", + xrange=(0, 4000), + ) + plot_energy( + ax, + ak.flatten(evt_ak[evt_ak.multiplicity == 1].energy_vector), + yerr=False, + label="M1 energies", + xrange=(0, 4000), + ) + + ax.set_title("summed event energy") + + + + +.. parsed-literal:: + + Text(0.5, 1.0, 'summed event energy') + + + + +.. image:: images/output_54_1.png + + +Or we can select multiplicity two (M2) events and plot the 2D energy +spectra. + +.. code:: python + + import matplotlib as mpl + +.. code:: python + + def plot_energy_2D( + axes, energy_1, energy_2, bins=400, xrange=None, yrange=None, label=" ", **kwargs + ): + x_axis = hist.axis.Regular(bins, *xrange, name="Energy 2 [keV]") + y_axis = hist.axis.Regular(bins, *yrange, name="Energy 1 [keV]") + h = hist.Hist(x_axis, y_axis) + h.fill(energy_2, energy_1) + cmap = mpl.colormaps["BuPu"].copy() + cmap.set_under(color="white") + + w, x, y = h.to_numpy() + mesh = ax.pcolormesh(x, y, w.T, cmap=cmap, vmin=0.5) + ax.set_xlabel("Energy 2 [keV]") + ax.set_ylabel("Energy 1 [keV]") + fig.colorbar(mesh) + + if xrange is not None: + axes.set_xlim(*xrange) + +.. code:: python + + fig, ax = plt.subplots() + + energy_1 = np.array( + evt_ak[(evt_ak.multiplicity == 2) & (evt_ak.is_good_event)].energy_vector[:, 1] + ) + energy_2 = np.array( + evt_ak[(evt_ak.multiplicity == 2) & (evt_ak.is_good_event)].energy_vector[:, 0] + ) + plot_energy_2D( + ax, + energy_2, + energy_1, + label=None, + xrange=(0, 1000), + yrange=(0, 1600), + bins=200, + vmin=0.5, + ) + ax.set_title("summed event energy") + + + + +.. parsed-literal:: + + Text(0.5, 1.0, 'summed event energy') + + + + +.. image:: images/output_58_1.png + + +Many more things are possible via manipulation of our event tier files, +the recommended tool is awkward (see +`[docs] `__). diff --git a/docs/source/notebooks/reboost_hpge_tutorial_hit.rst b/docs/source/notebooks/reboost_hpge_tutorial_hit.rst new file mode 100644 index 0000000..2b7d096 --- /dev/null +++ b/docs/source/notebooks/reboost_hpge_tutorial_hit.rst @@ -0,0 +1,785 @@ +Hit tier hpge simulation processing +=================================== + +This tutorial describes how to process the HPGe detector simulations +from **remage** with **reboost**. It buils on the official **remage** +tutorial +`[link] `__ + + .. rubric:: *Note* + :name: note + + To run this tutorial it is recommended to create the following + directory structure to organise the outputs and config inputs. + +.. code:: text + + ├── cfg + │   └── metadata + ├── output + │   ├── stp + │   └── hit + └── reboost_hpge_tutorial.ipynb + +Part 1) Running the remage simulation +------------------------------------- + +Before we can run any post-processing we need to run the Geant4 +simulation. For this we follow the remage tutorial to generate the GDML +geometry. We save this into the GDML file *cfg/geom.gdml* for use by +remage. We also need to save the metadata dictionaries into json files +(in the *cfg/metadata* folder as *BEGe.json* and *Coax.json* + +We use a slightly modified Geant4 macro to demonstrate some features of +reboost (this should be saved as *cfg/th228.mac* to run remage on the +command line). + +.. 
code:: text + + /RMG/Manager/Logging/LogLevel detail + + /RMG/Geometry/RegisterDetector Germanium BEGe 001 + /RMG/Geometry/RegisterDetector Germanium Coax 002 + /RMG/Geometry/RegisterDetector Scintillator LAr 003 + + /run/initialize + + /RMG/Generator/Confine Volume + /RMG/Generator/Confinement/Physical/AddVolume Source + + /RMG/Generator/Select GPS + /gps/particle ion + /gps/energy 0 eV + /gps/ion 88 224 # 224-Ra + /process/had/rdm/nucleusLimits 208 224 81 88 #Ra-224 to 208-Pb + + + /run/beamOn 1000000 + +We then use the remage executable (see +`[remage-docs] `__ for +installation instructions) to run the simulation: > #### *Note* > Both +of *cfg/th228.mac* and *cfg/geometry.gdml* are needed to run remage + +.. code:: console + + $ remage --threads 8 --gdml-files cfg/geom.gdml --output-file output/stp/output.lh5 -- cfg/th228.mac + +You can lower the number of simulated events to speed up the simulation. + +We can use ``lh5.show()`` to check the output files. + +.. code:: python + + from lgdo import lh5 + +.. code:: python + + lh5.show("output/stp/output_t0.lh5") + + +.. parsed-literal:: + + / + └── stp · struct{det001,det002,det003,vertices} + ├── det001 · table{evtid,particle,edep,time,xloc,yloc,zloc} + │ ├── edep · array<1>{real} + │ ├── evtid · array<1>{real} + │ ├── particle · array<1>{real} + │ ├── time · array<1>{real} + │ ├── xloc · array<1>{real} + │ ├── yloc · array<1>{real} + │ └── zloc · array<1>{real} + ├── det002 · table{evtid,particle,edep,time,xloc,yloc,zloc} + │ ├── edep · array<1>{real} + │ ├── evtid · array<1>{real} + │ ├── particle · array<1>{real} + │ ├── time · array<1>{real} + │ ├── xloc · array<1>{real} + │ ├── yloc · array<1>{real} + │ └── zloc · array<1>{real} + ├── det003 · table{evtid,particle,edep,time,xloc_pre,yloc_pre,zloc_pre,xloc_post,yloc_post,zloc_post,v_pre,v_post} + │ ├── edep · array<1>{real} + │ ├── evtid · array<1>{real} + │ ├── particle · array<1>{real} + │ ├── time · array<1>{real} + │ ├── v_post · array<1>{real} + │ ├── v_pre · array<1>{real} + │ ├── xloc_post · array<1>{real} + │ ├── xloc_pre · array<1>{real} + │ ├── yloc_post · array<1>{real} + │ ├── yloc_pre · array<1>{real} + │ ├── zloc_post · array<1>{real} + │ └── zloc_pre · array<1>{real} + └── vertices · table{evtid,time,xloc,yloc,zloc,n_part} + ├── evtid · array<1>{real} + ├── n_part · array<1>{real} + ├── time · array<1>{real} + ├── xloc · array<1>{real} + ├── yloc · array<1>{real} + └── zloc · array<1>{real} + + +Part 2) reboost config files +---------------------------- + +For this tutorial we perform a basic post-processing of the *hit* tier +for the two Germanium channels. + +2.1) Setup the environment +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +First we set up the python environment. + +.. code:: python + + from reboost.hpge import hit + import matplotlib.pyplot as plt + import pyg4ometry as pg4 + import legendhpges + from legendhpges import draw + import awkward as ak + import logging + import colorlog + import hist + import numpy as np + + + plt.rcParams["figure.figsize"] = [12, 4] + plt.rcParams["axes.titlesize"] = 12 + plt.rcParams["axes.labelsize"] = 12 + plt.rcParams["legend.fontsize"] = 12 + + + handler = colorlog.StreamHandler() + handler.setFormatter( + colorlog.ColoredFormatter("%(log_color)s%(name)s [%(levelname)s] %(message)s") + ) + logger = logging.getLogger() + logger.handlers.clear() + logger.addHandler(handler) + logger.setLevel(logging.INFO) + logger.info("test") + + + + +.. 
parsed-literal:: + + root [INFO] test + + +2.2) Processing chain and parameters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Next we need to make the processing chain config file. + +The processing chain below gives a standard set of steps for a HPGe +simulation. 1. first the steps are windowed into hits, 2. the first +timestamp and index of each hit is computed (for use in event building), +3. the distance to the detector n+ surface is computed and from this the +activeness is calculated (based on the FCCD) 4. the energy in each step +is summed to extract the deposited energy (both with and without +deadlayer correction), 5. the energy is convolved with the detector +response model (gaussian energy resolution). + +We also include some step based quantities in the output to show the +effect of the processors. + +.. code:: python + + chain = { + "channels": ["det001", "det002"], + "outputs": [ + "t0", # first timestamp + "time", # time of each step + "edep", # energy deposited in each step + "evtid", # id of the hit + "global_evtid", # global id of the hit + "distance_to_nplus_surface_mm", # distance to detector nplus surface + "activeness", # activeness for the step + "rpos_loc", # radius of step + "zpos_loc", # z position + "energy_sum", # true summed energy before dead layer or smearing + "energy_sum_deadlayer", # energy sum after dead layers + "energy_sum_smeared", # energy sum after smearing with resolution + ], + "step_group": { + "description": "group steps by time and evtid with 10us window", + "expression": "reboost.hpge.processors.group_by_time(stp,window=10)", + }, + "locals": { + "hpge": "reboost.hpge.utils.get_hpge(meta_path=meta,pars=pars,detector=detector)", + "phy_vol": "reboost.hpge.utils.get_phy_vol(reg=reg,pars=pars,detector=detector)", + }, + "operations": { + "t0": { + "description": "first time in the hit.", + "mode": "eval", + "expression": "ak.fill_none(ak.firsts(hit.time,axis=-1),np.nan)", + }, + "evtid": { + "description": "global evtid of the hit.", + "mode": "eval", + "expression": "ak.fill_none(ak.firsts(hit._evtid,axis=-1),np.nan)", + }, + "global_evtid": { + "description": "global evtid of the hit.", + "mode": "eval", + "expression": "ak.fill_none(ak.firsts(hit._global_evtid,axis=-1),np.nan)", + }, + "distance_to_nplus_surface_mm": { + "description": "distance to the nplus surface in mm", + "mode": "function", + "expression": "reboost.hpge.processors.distance_to_surface(hit.xloc, hit.yloc, hit.zloc, hpge, phy_vol.position.eval(), surface_type='nplus',unit='m')", + }, + "activeness": { + "description": "activness based on FCCD (no TL)", + "mode": "eval", + "expression": "ak.where(hit.distance_to_nplus_surface_mm{array<1>{real}} + │ │ ├── cumulative_length · array<1>{real} + │ │ └── flattened_data · array<1>{real} + │ ├── distance_to_nplus_surface_mm · array<1>{array<1>{real}} + │ │ ├── cumulative_length · array<1>{real} + │ │ └── flattened_data · array<1>{real} + │ ├── edep · array<1>{array<1>{real}} + │ │ ├── cumulative_length · array<1>{real} + │ │ └── flattened_data · array<1>{real} + │ ├── energy_sum · array<1>{real} + │ ├── energy_sum_deadlayer · array<1>{real} + │ ├── energy_sum_smeared · array<1>{real} + │ ├── evtid · array<1>{real} + │ ├── global_evtid · array<1>{real} + │ ├── rpos_loc · array<1>{array<1>{real}} + │ │ ├── cumulative_length · array<1>{real} + │ │ └── flattened_data · array<1>{real} + │ ├── t0 · array<1>{real} + │ ├── time · array<1>{array<1>{real}} + │ │ ├── cumulative_length · array<1>{real} + │ │ └── flattened_data · array<1>{real} + │ └── 
zpos_loc · array<1>{array<1>{real}} + │ ├── cumulative_length · array<1>{real} + │ └── flattened_data · array<1>{real} + └── det002 · HDF5 group + └── hit · table{edep,time,t0,evtid,global_evtid,distance_to_nplus_surface_mm,activeness,rpos_loc,zpos_loc,energy_sum,energy_sum_deadlayer,energy_sum_smeared} + ├── activeness · array<1>{array<1>{real}} + │ ├── cumulative_length · array<1>{real} + │ └── flattened_data · array<1>{real} + ├── distance_to_nplus_surface_mm · array<1>{array<1>{real}} + │ ├── cumulative_length · array<1>{real} + │ └── flattened_data · array<1>{real} + ├── edep · array<1>{array<1>{real}} + │ ├── cumulative_length · array<1>{real} + │ └── flattened_data · array<1>{real} + ├── energy_sum · array<1>{real} + ├── energy_sum_deadlayer · array<1>{real} + ├── energy_sum_smeared · array<1>{real} + ├── evtid · array<1>{real} + ├── global_evtid · array<1>{real} + ├── rpos_loc · array<1>{array<1>{real}} + │ ├── cumulative_length · array<1>{real} + │ └── flattened_data · array<1>{real} + ├── t0 · array<1>{real} + ├── time · array<1>{array<1>{real}} + │ ├── cumulative_length · array<1>{real} + │ └── flattened_data · array<1>{real} + └── zpos_loc · array<1>{array<1>{real}} + ├── cumulative_length · array<1>{real} + └── flattened_data · array<1>{real} + + +The new format is a factor of x17 times smaller than the input file due +to the removal of many *step* based fields which use a lot of memory and +due to the removal of the *vertices* table and the LAr hits. So we can +easily read the whole file into memory. We use *awkward* to analyse the +output files. + +.. code:: python + + data_det001 = lh5.read_as("det001/hit", "output/hit/output.lh5", "ak") + data_det002 = lh5.read_as("det002/hit", "output/hit/output.lh5", "ak") + +.. code:: python + + data_det001 + + + + +.. raw:: html + +
[{edep: [0.0826, 0.00863, ..., 32.3], time: [1.32e+15, ...], t0: 1.32e+15, ...},
+     {edep: [0.103, 0.0256, ..., 37.6, 6.44], time: [...], t0: 1.24e+15, ...},
+     {edep: [0.0824, 0.00863, ..., 16.8], time: [2.21e+14, ...], t0: 2.21e+14, ...},
+     {edep: [0.101, 0.0802, ..., 20.9], time: [9.09e+14, ...], t0: 9.09e+14, ...},
+     {edep: [0.00332, 0.0171, ..., 45, 27.7], time: [...], t0: 4.26e+13, ...},
+     {edep: [0.0845, 0.00863, ..., 27.9], time: [6.86e+14, ...], t0: 6.86e+14, ...},
+     {edep: [0.0065, 0.255, ..., 41.5, 2.69], time: [...], t0: 1.24e+14, ...},
+     {edep: [0.0388, 0.188, ..., 1.1, 41.5], time: [...], t0: 6.48e+14, ...},
+     {edep: [0.00332, 0.116, ..., 16.2], time: [4.39e+14, ...], t0: 4.39e+14, ...},
+     {edep: [0.00615, 0.0204, ..., 22.1], time: [7.11e+14, ...], t0: 7.11e+14, ...},
+     ...,
+     {edep: [0.19, 0.0171, ..., 42.2, 10.9], time: [...], t0: 1.1e+15, ...},
+     {edep: [0.0118, 0.0303, ..., 34.5, 21.4], time: [...], t0: 1.51e+15, ...},
+     {edep: [0.0204, 0.152, ..., 2.79, 51.9], time: [...], t0: 9.73e+14, ...},
+     {edep: [0.118, 0.0254, ..., 41.2, 38.6], time: [...], t0: 9.67e+14, ...},
+     {edep: [0.0824, 0.0254, ..., 34.6, 18.9], time: [...], t0: 6.64e+14, ...},
+     {edep: [0.148, 0.0802, ..., 40.9, 24], time: [...], t0: 5.56e+14, ...},
+     {edep: [0.022, 0.0148, ..., 34.8, 11.9], time: [...], t0: 6.52e+14, ...},
+     {edep: [0.0155, 0.118, ..., 0.458, 9.65], time: [...], t0: 3.97e+14, ...},
+     {edep: [0.0065, 0.00615, ..., 13.7], time: [3.98e+14, ...], t0: 3.98e+14, ...}]
+    --------------------------------------------------------------------------------
+    type: 835793 * {
+        edep: var * float64,
+        time: var * float64,
+        t0: float64,
+        evtid: float64,
+        global_evtid: float64,
+        distance_to_nplus_surface_mm: var * float64,
+        activeness: var * int64,
+        rpos_loc: var * float64,
+        zpos_loc: var * float64,
+        energy_sum: float64,
+        energy_sum_deadlayer: float64,
+        energy_sum_smeared: float64
+    }
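+
+
+As a quick check we can, for example, count the number of steps grouped
+into each hit (a minimal sketch, assuming the imports from earlier in
+the tutorial):
+
+.. code:: python
+
+    # number of steps contributing to each hit
+    n_steps = ak.num(data_det001.edep, axis=-1)
+    print(n_steps[:5])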
+
+
+Part 4) Steps in a standard processing chain
+--------------------------------------------
+
+The next part of the tutorial gives more details on each step of the
+processing chain.
+
+4.1) Windowing
+~~~~~~~~~~~~~~
+
+We can compare the decay index (“evtid” in the “stp” file) to the hit
+index, i.e. the row of the hit table. We see that only some decays
+produce “hits” in the detector, as expected, and that a single decay
+rarely produces multiple hits. This is also expected since the
+probability of detection is fairly low.
+
+.. code:: python
+
+    plt.scatter(
+        np.sort(data_det001.global_evtid), np.arange(len(data_det001)), marker=".", alpha=1
+    )
+    plt.xlabel("Decay index (evtid)")
+    plt.ylabel("Hit Index")
+    plt.grid()
+    plt.xlim(0, 1000)
+    plt.ylim(0, 100)
+
+
+
+
+.. parsed-literal::
+
+    (0.0, 100.0)
+
+
+
+
+.. image:: images/output_20_1.png
+
+
+However, with some array manipulation we can extract the decay indices
+with multiple hits; plotting the times then shows the effect of the
+windowing.
+
+.. code:: python
+
+    def plot_times(times: ak.Array, xrange=None, sub_zero=False, **kwargs):
+        fig, ax = plt.subplots()
+        for idx, _time in enumerate(times):
+            if sub_zero:
+                _time = _time - ak.min(_time)
+            # one histogram per hit, with a common binning over all hits
+            h = hist.new.Reg(
+                100,
+                (ak.min(times) / 1e9),
+                (ak.max(times) / 1e9) + 1,
+                name="Time since event start [s]",
+            ).Double()
+            h.fill(_time / 1e9)
+            h.plot(**kwargs, label=f"Hit {idx}")
+        ax.legend()
+        ax.set_yscale("log")
+        if xrange is not None:
+            ax.set_xlim(*xrange)
+
+
+.. code:: python
+
+    unique, counts = np.unique(data_det001.global_evtid, return_counts=True)
+
+.. code:: python
+
+    plot_times(
+        data_det001[data_det001.global_evtid == unique[counts > 1][1]].time,
+        histtype="step",
+        yerr=False,
+    )
+
+
+
+
+.. image:: images/output_24_0.png
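+
+
+We can also make this check numerically; a quick sketch using the
+``counts`` computed above:
+
+.. code:: python
+
+    # fraction of detected decays that produce more than one hit
+    print(np.mean(counts > 1))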
points", + cmap=scale, + ) + # axs[idx].axis("equal") + + if idx == 0: + axs[idx].set_ylabel("Height [mm]") + c = plt.colorbar(s) + c.set_label(clab) + + axs[idx].set_xlabel("Radius [mm]") + + +.. code:: python + + plot_map("distance_to_nplus_surface_mm") + + +.. parsed-literal:: + + root [INFO] genericpolycone.antlr> + root [INFO] genericpolyhedra.antlr> + root [INFO] visualisation.Mesh.getBoundingBox> [-36.98, -36.98, 0.0] [36.98, 36.98, 29.46] + root [INFO] box.pycsgmesh> getBoundingBoxMesh + root [INFO] genericpolycone.antlr> + root [INFO] genericpolyhedra.antlr> + root [INFO] visualisation.Mesh.getBoundingBox> [-38.25, -38.25, 0.0] [38.25, 38.25, 84.0] + root [INFO] box.pycsgmesh> getBoundingBoxMesh + + + +.. image:: images/output_27_1.png + + +.. code:: python + + plot_map("activeness", clab="Activeness", scale="viridis") + + +.. parsed-literal:: + + root [INFO] genericpolycone.antlr> + root [INFO] genericpolyhedra.antlr> + root [INFO] visualisation.Mesh.getBoundingBox> [-36.98, -36.98, 0.0] [36.98, 36.98, 29.46] + root [INFO] box.pycsgmesh> getBoundingBoxMesh + root [INFO] genericpolycone.antlr> + root [INFO] genericpolyhedra.antlr> + root [INFO] visualisation.Mesh.getBoundingBox> [-38.25, -38.25, 0.0] [38.25, 38.25, 84.0] + root [INFO] box.pycsgmesh> getBoundingBoxMesh + + + +.. image:: images/output_28_1.png + + +We can also plot a histogram of the distance to the surface. + +.. code:: python + + def plot_distances(axes, distances, xrange=None, label=" ", **kwargs): + h = hist.new.Reg(100, *xrange, name="Distance to n+ surface [mm]").Double() + h.fill(distances) + h.plot(**kwargs, label=label) + axes.legend() + axes.set_yscale("log") + if xrange is not None: + ax.set_xlim(*xrange) + + +.. code:: python + + fig, ax = plt.subplots() + plot_distances( + ax, + ak.flatten(data_det001.distance_to_nplus_surface_mm), + xrange=(0, 35), + label="BEGe", + histtype="step", + yerr=False, + ) + plot_distances( + ax, + ak.flatten(data_det002.distance_to_nplus_surface_mm), + xrange=(0, 35), + label="Coax", + histtype="step", + yerr=False, + ) + + + + +.. image:: images/output_31_0.png + + +4.3) Summed energies +~~~~~~~~~~~~~~~~~~~~ + +Our processing chain also sums the energies of the hits, both before and +after weighting by the activeness. + +.. code:: python + + def plot_energy(axes, energy, bins=400, xrange=None, label=" ", log_y=True, **kwargs): + h = hist.new.Reg(bins, *xrange, name="energy [keV]").Double() + h.fill(energy) + h.plot(**kwargs, label=label) + axes.legend() + if log_y: + axes.set_yscale("log") + if xrange is not None: + axes.set_xlim(*xrange) + +.. code:: python + + fig, ax = plt.subplots() + ax.set_title("BEGe energy spectrum") + plot_energy( + ax, data_det001.energy_sum, yerr=False, label="True energy", xrange=(0, 4000) + ) + plot_energy( + ax, + data_det001.energy_sum_deadlayer, + yerr=False, + label="Energy after dead layer", + xrange=(0, 4000), + ) + + + +.. image:: images/output_34_0.png + + +.. code:: python + + fig, ax = plt.subplots() + ax.set_title("COAX energy spectrum") + plot_energy( + ax, data_det002.energy_sum, yerr=False, label="True energy", xrange=(0, 4000) + ) + plot_energy( + ax, + data_det002.energy_sum_deadlayer, + yerr=False, + label="Energy after dead layer", + xrange=(0, 4000), + ) + + + +.. image:: images/output_35_0.png + + +4.4) Smearing +~~~~~~~~~~~~~ + +The final step in the processing chain smeared the energies by the +energy resolution. This represents a general class of processors based +on ‘’heuristic’’ models. 
+
+To see the effect we have to zoom into the 2615 keV peak.
+
+.. code:: python
+
+    fig, axs = plt.subplots()
+    plot_energy(
+        axs,
+        data_det001.energy_sum_smeared,
+        yerr=False,
+        label="BEGe",
+        xrange=(2600, 2630),
+        log_y=False,
+        bins=150,
+        density=True,
+    )
+    plot_energy(
+        axs,
+        data_det002.energy_sum_smeared,
+        yerr=False,
+        label="COAX",
+        xrange=(2600, 2630),
+        log_y=False,
+        bins=150,
+        density=True,
+    )
+
+
+
+.. image:: images/output_37_0.png
+
+
+We clearly see the worse energy resolution of the COAX detector.
+
+**To do**: add a Gaussian fit of this peak.
+
+Part 5) Adding a new processor
+------------------------------
+
+The next part of the tutorial describes how to add a new processor to
+the chain. We use spatial *clustering* of steps as an example. This will
+be added to the package later; a rough sketch of the idea is shown
+below.
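+
+A minimal sketch, clustering the steps of each hit sequentially
+(single-link): the distance threshold is an arbitrary placeholder, in
+the same units as the step positions, and a real implementation would
+be vectorised:
+
+.. code:: python
+
+    import awkward as ak
+    import numpy as np
+
+
+    def cluster_steps(xloc, yloc, zloc, threshold=0.005):
+        """Label each step with a cluster index: a new cluster starts when
+        a step is further than `threshold` from the previous step."""
+        labels = []
+        for xs, ys, zs in zip(xloc.to_list(), yloc.to_list(), zloc.to_list()):
+            hit_labels, current = [], 0
+            for i in range(len(xs)):
+                if i > 0:
+                    gap = np.sqrt(
+                        (xs[i] - xs[i - 1]) ** 2
+                        + (ys[i] - ys[i - 1]) ** 2
+                        + (zs[i] - zs[i - 1]) ** 2
+                    )
+                    if gap > threshold:
+                        current += 1
+                hit_labels.append(current)
+            labels.append(hit_labels)
+        return ak.Array(labels)
+
+The resulting labels could then be used, for example together with
+``ak.run_lengths``, to sum the energies per cluster.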
diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
new file mode 100644
index 0000000..45c8065
--- /dev/null
+++ b/docs/source/tutorial.rst
@@ -0,0 +1,10 @@
+Basic Tutorial
+==============
+
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   notebooks/reboost_hpge_tutorial_hit
+   notebooks/reboost_hpge_tutorial_evt
diff --git a/pyproject.toml b/pyproject.toml
index 75976e7..25f75e9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,12 +30,17 @@ classifiers = [
 ]
 requires-python = ">=3.9"
 dependencies = [
+    "awkward",
     "colorlog",
+    "tqdm",
     "numpy",
     "scipy",
     "numba",
     "legend-pydataobj>=1.9.0",
+    "pyg4ometry",
+    "pylegendtestdata",
     "legend-pygeom-optics>=0.6.5",
+    "legend-pygeom-hpges",
     "hist",
     "particle",
     "pandas",
@@ -72,6 +77,7 @@ test = [
 
 [project.scripts]
 reboost-optical = "reboost.optical.cli:optical_cli"
+reboost-hpge = "reboost.hpge.cli:hpge_cli"
 
 [tool.setuptools]
 include-package-data = true
diff --git a/src/reboost/__init__.py b/src/reboost/__init__.py
index 8e49755..c0d8185 100644
--- a/src/reboost/__init__.py
+++ b/src/reboost/__init__.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from reboost import optical
+from reboost import hpge, optical
 from reboost._version import version as __version__
 
-__all__ = ["__version__", "optical"]
+__all__ = ["__version__", "optical", "hpge"]
diff --git a/src/reboost/hpge/__init__.py b/src/reboost/hpge/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/reboost/hpge/cli.py b/src/reboost/hpge/cli.py
new file mode 100644
index 0000000..03ba0c0
--- /dev/null
+++ b/src/reboost/hpge/cli.py
@@ -0,0 +1,148 @@
+from __future__ import annotations
+
+import argparse
+import logging
+
+import colorlog
+
+from . import utils
+
+
+def hpge_cli() -> None:
+    parser = argparse.ArgumentParser(
+        prog="reboost-hpge",
+        description="%(prog)s command line interface",
+    )
+
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        help="""Increase the program verbosity""",
+    )
+
+    parser.add_argument(
+        "--bufsize",
+        action="store",
+        type=int,
+        default=int(5e6),
+        help="""Row count for input table buffering (only used if applicable).
+        default: %(default)e""",
+    )
+
+    # step 1: hit tier
+    subparsers = parser.add_subparsers(dest="command", required=True)
+    hit_parser = subparsers.add_parser("hit", help="build hit file from remage raw file")
+
+    hit_parser.add_argument(
+        "--num",
+        "-n",
+        help="Number of events to process, if not set process all",
+        default=None,
+        type=int,
+        required=False,
+    )
+
+    hit_parser.add_argument(
+        "--start",
+        "-s",
+        help="First event to process (default 0)",
+        default=0,
+        type=int,
+        required=False,
+    )
+
+    hit_parser.add_argument(
+        "--proc_config",
+        help="JSON or YAML file that contains the processing chain",
+        required=True,
+    )
+    hit_parser.add_argument(
+        "--pars",
+        help="JSON or YAML file that contains the pars",
+        required=True,
+    )
+    hit_parser.add_argument(
+        "--gdml",
+        help="GDML file used for Geant4",
+        default=None,
+        required=False,
+    )
+
+    hit_parser.add_argument(
+        "--meta_path",
+        help="Path to metadata (diodes folder)",
+        default=None,
+        required=False,
+    )
+    hit_parser.add_argument("--infield", help="input LH5 field", required=False, default="hit")
+    hit_parser.add_argument(
+        "--outfield", help="output LH5 field name", required=False, default="hit"
+    )
+    hit_parser.add_argument(
+        "--merge_input",
+        "-m",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="""Merge input lh5 files into a single output""",
+    )
+    hit_parser.add_argument(
+        "input", help="input hit LH5 files (can include wildcards)", nargs="+", metavar="INPUT_HIT"
+    )
+    hit_parser.add_argument("output", help="output hit LH5 file", metavar="OUTPUT_HIT")
+
+    args = parser.parse_args()
+
+    handler = colorlog.StreamHandler()
+    handler.setFormatter(
+        colorlog.ColoredFormatter("%(log_color)s%(name)s [%(levelname)s] %(message)s")
+    )
+    logger = logging.getLogger("reboost.hpge")
+    logger.addHandler(handler)
+
+    if args.verbose:
+        logger.setLevel(logging.DEBUG)
+    else:
+        logger.setLevel(logging.INFO)
+
+    if args.command == "hit":
+        # is the import here a good idea?
+        logger.info("...running raw->hit tier")
+        from reboost.hpge.hit import build_hit
+
+        pars = utils.load_dict(args.pars, None)
+        proc_config = utils.load_dict(args.proc_config, None)
+
+        # check the processing chain
+        for req_field in ["channels", "outputs", "step_group", "operations"]:
+            if req_field not in proc_config:
+                msg = f"error proc chain config must contain the field {req_field}"
+                raise ValueError(msg)
+
+        msg = f"""
+        Running build_hit with:
+        - output file    :{args.output}
+        - input_file(s)  :{args.input}
+        - output field   :{args.outfield}
+        - input field    :{args.infield}
+        - proc_config    :{args.proc_config}
+        - pars           :{args.pars}
+        - buffer         :{args.bufsize}
+        - gdml file      :{args.gdml}
+        - metadata_path  :{args.meta_path}
+        - merge input    :{args.merge_input}
+        """
+
+        logger.info(msg)
+
+        build_hit(
+            args.output,
+            args.input,
+            n_evtid=args.num,
+            start_evtid=args.start,
+            out_field=args.outfield,
+            in_field=args.infield,
+            proc_config=proc_config,
+            pars=pars,
+            buffer=args.bufsize,
+            gdml=args.gdml,
+            metadata_path=args.meta_path,
+            merge_input_files=args.merge_input,
+        )
diff --git a/src/reboost/hpge/evt.py b/src/reboost/hpge/evt.py
new file mode 100644
index 0000000..17ebbc4
--- /dev/null
+++ b/src/reboost/hpge/evt.py
@@ -0,0 +1,167 @@
+from __future__ import annotations
+
+import logging
+
+import awkward as ak
+import numpy as np
+from lgdo import Table
+from lgdo.lh5 import LH5Iterator, write
+from pygama.evt.build_evt import evaluate_expression
+from pygama.evt.utils import TCMData
+
+from . import utils
+
+log = logging.getLogger(__name__)
+
+
+def build_evt(
+    hit_file: str, tcm_file: str, evt_file: str | None, config: dict, buffer: int = int(5e6)
+) -> ak.Array | None:
+    """Generates the event tier from the hit and tcm tiers.
+
+    Parameters
+    ----------
+    hit_file
+        path to the hit tier file
+    tcm_file
+        path to the tcm tier file
+    evt_file
+        path to the evt tier (output) file; if `None` the data is returned in memory
+    config
+        dictionary of the configuration. For example:
+
+        .. code-block:: json
+
+            {
+                "channels": {
+                    "geds_on":[ "det001", "det002"],
+                    "geds_ac":["det003"]
+                },
+                "outputs": [
+                    "energy",
+                    "multiplicity"
+                ],
+                "operations": {
+                    "energy_id": {
+                        "channels": "geds_on",
+                        "aggregation_mode": "gather",
+                        "query": "hit.energy > 25",
+                        "expression": "tcm.channel_id"
+                    },
+                    "energy": {
+                        "aggregation_mode": "keep_at_ch:evt.energy_id",
+                        "expression": "hit.energy > 25",
+                        "channels": "geds_on"
+                    },
+                    "multiplicity": {
+                        "channels": "geds_on",
+                        "aggregation_mode": "sum",
+                        "expression": "hit.energy > 25",
+                        "initial": 0
+                    }
+                }
+            }
+
+        Must contain:
+        - "channels": dictionary of channel groupings
+        - "outputs": fields for the output file
+        - "operations": operations to perform; see :func:`pygama.evt.build_evt.evaluate_expression` for more details.
+
+    buffer
+        number of events to process simultaneously
+
+    Returns
+    -------
+    ak.Array of the evt tier data (if the data is not saved to disk)
+    """
+
+    msg = "... beginning the evt tier processing"
+    log.info(msg)
+
+    # create the objects needed for evaluate_expression
+
+    file_info = {
+        "hit": (hit_file, "hit", "det{:03}"),
+        "evt": (evt_file, "evt"),
+    }
+
+    # iterate through the TCM
+
+    out_ak = ak.Array([])
+    mode = "of"
+
+    # get channel groupings
+    channels = {}
+    for group, info in config["channels"].items():
+        if isinstance(info, str):
+            channels[group] = [info]
+
+        elif isinstance(info, list):
+            channels[group] = info
+
+    for tcm_lh5, _, n_rows_read in LH5Iterator(tcm_file, "tcm", buffer_len=buffer):
+        tcm_ak = tcm_lh5.view_as("ak")[:n_rows_read]
+
+        tcm = TCMData(
+            id=np.array(ak.flatten(tcm_ak.array_id)),
+            idx=np.array(ak.flatten(tcm_ak.array_idx)),
+            cumulative_length=np.array(np.cumsum(ak.num(tcm_ak.array_id, axis=-1))),
+        )
+
+        n_rows = len(tcm.cumulative_length)
+        out_tab = Table(size=n_rows)
+
+        for name, info in config["operations"].items():
+            msg = f"computing field {name}"
+            log.debug(msg)
+
+            defaultv = info.get("initial", np.nan)
+            if isinstance(defaultv, str) and (defaultv in ["np.nan", "np.inf", "-np.inf"]):
+                defaultv = eval(defaultv)
+
+            channels_use = utils.get_channels_from_groups(info.get("channels", []), channels)
+            channels_exclude = utils.get_channels_from_groups(
+                info.get("exclude_channels", []), channels
+            )
+
+            if "aggregation_mode" not in info:
+                field = out_tab.eval(
+                    info["expression"].replace("evt.", ""), info.get("parameters", {})
+                )
+            else:
+                field = evaluate_expression(
+                    file_info,
+                    tcm,
+                    channels_use,
+                    table=out_tab,
+                    mode=info["aggregation_mode"],
+                    expr=info["expression"],
+                    query=info.get("query", None),
+                    sorter=info.get("sort", None),
+                    channels_skip=channels_exclude,
+                    default_value=defaultv,
+                    n_rows=n_rows,
+                )
+
+            msg = f"field {field}"
+            log.debug(msg)
+            out_tab.add_field(name, field)
+
+        # remove fields if necessary
+        existing_cols = list(out_tab.keys())
+        for col in existing_cols:
+            if col not in config["outputs"]:
+                out_tab.remove_column(col, delete=True)
+
+        # write
+        if evt_file is not None:
+            write(out_tab, "evt", evt_file, wo_mode=mode)
"evt", evt_file, wo_mode=mode) + mode = "append" + else: + out_ak = ak.concatenate((out_ak, out_tab.view_as("ak"))) + + if evt_file is None: + return out_ak + return None diff --git a/src/reboost/hpge/hit.py b/src/reboost/hpge/hit.py new file mode 100644 index 0000000..b17104b --- /dev/null +++ b/src/reboost/hpge/hit.py @@ -0,0 +1,397 @@ +from __future__ import annotations + +import logging + +import awkward as ak +import pyg4ometry +from lgdo import Array, ArrayOfEqualSizedArrays, Table, VectorOfVectors, lh5 + +from . import utils + +log = logging.getLogger(__name__) + + +def step_group(data: Table, group_config: dict) -> Table: + """Performs a grouping of geant4 steps to build the `hit` table. + + Parameters + ---------- + data + `stp` table from remage. + group_config + dict with the configuration describing the step grouping. + For example + + .. code-block:: json + + { + "description": "group steps by time and evtid.", + "expression": "reboost.hpge.processors.group_by_time(stp,window=10)" + } + + this will then evaluate the function chosen by `expression` resulting in a new Table. + + """ + + # group to create hit table + + group_func, globs = utils.get_function_string( + group_config["expression"], + ) + locs = {"stp": data} + + msg = f"running step grouping with {group_func} and globals {globs.keys()} and locals {locs.keys()}" + log.debug(msg) + return eval(group_func, globs, locs) + + +def get_locals( + local_info: dict, + pars_dict: dict | None, + detector: str | None = None, + meta_path: str | None = None, + reg: pyg4ometry.geant4.Registry | None = None, +) -> dict: + """Compute any local variables needed for processing. + + Parameters + ---------- + local_info + config file block of the local objects to compute. For example: + + ..code-block:: + + {"hpge": "reboost.hpge.utils.get_hpge(meta_path=meta,pars=pars,detector=detector)"} + + pars_dict + dictionary of parameters + detector + remage name for the dete + meta_path + path to thhe diodes folder of the metadata + reg + geant4 registry of the experiment + + Returns + ------- + dictionary of local variables to pass to eval + """ + local_dict = {} + + # get parameters named tuple + if pars_dict is not None: + pars_tuple = utils.dict2tuple(pars_dict) + local_dict = {"pars": pars_tuple} + + objs = {"pars": pars_tuple, "detector": detector, "meta": meta_path, "reg": reg} + + for name, eval_str in local_info.items(): + proc_func, globs = utils.get_function_string(eval_str) + + msg = f"extracting local object with {eval_str} " + log.debug(msg) + obj = eval(proc_func, globs, objs) + local_dict = local_dict | {name: obj} + return local_dict + + +def eval_expression( + table: Table, info: dict, local_dict: dict | None +) -> Array | ArrayOfEqualSizedArrays | VectorOfVectors: + """Evaluate an expression returning an LGDO object. + + Parameters + ---------- + table + hit table, with columns possibly used in the operations. + info + `dict` containing the information on the expression. Must contain `mode` and `expressions` keys. For example: + + .. code-block:: json + + { + "mode": "eval", + "expression":"ak.sum(hit.edep,axis=-1)" + } + + Variables preceded by `hit` will be taken from the supplied table. Mode can be either `eval`, + in which case a simple expression is based (only involving numpy, awkward or inbuilt python functions), + or `function` in which case an arbitrary function is passed (for example defined in processors). 
+
+        There are several objects passed to the evaluation as 'locals', determined by the `local_dict`.
+
+    local_dict
+        mapping of local variables for the evaluation
+
+    Returns
+    -------
+    a new column for the hit table, either :class:`Array`, :class:`ArrayOfEqualSizedArrays` or :class:`VectorOfVectors`.
+
+    Note
+    ----
+    In future the passing of local variables (pars, hpge, reg) to the evaluation should be made more generic.
+    """
+    if local_dict is None:
+        local_dict = {}
+
+    if info["mode"] == "eval":
+        # replace hit.
+        expr = info["expression"].replace("hit.", "")
+
+        msg = f"evaluating table with command {expr} and local_dict {local_dict.keys()}"
+        log.debug(msg)
+
+        col = table.eval(expr, local_dict)
+
+    elif info["mode"] == "function":
+        proc_func, globs = utils.get_function_string(info["expression"])
+
+        # add hit table to locals
+        local_dict = {"hit": table} | local_dict
+
+        msg = f"evaluating table with command {info['expression']} and local_dict {local_dict.keys()} and global dict {globs.keys()}"
+        log.debug(msg)
+        col = eval(proc_func, globs, local_dict)
+
+    else:
+        msg = "mode is not recognised."
+        raise ValueError(msg)
+    return col
+
+
+def build_hit(
+    file_out: str,
+    list_file_in: list,
+    out_field: str,
+    in_field: str,
+    proc_config: dict,
+    pars: dict,
+    n_evtid: int | None = None,
+    start_evtid: int = 0,
+    buffer: int = 1000000,
+    gdml: str | None = None,
+    metadata_path: str | None = None,
+    merge_input_files: bool = True,
+) -> None:
+    """Incrementally read the input files, compute the hit tier and write the output.
+
+    Parameters
+    ----------
+    file_out
+        output file path
+    list_file_in
+        list of input files
+    out_field
+        lh5 group name for output
+    in_field
+        lh5 group name for input
+    proc_config
+        the configuration file for the processing. Must contain the fields `channels`, `outputs`, `step_group` and `operations`.
+        Optionally can also contain the `locals` field to extract other non-JSON serializable objects used by the processors.
+        For example:
+
+        .. code-block:: json
+
+            {
+                "channels": [
+                    "det000",
+                    "det001"
+                ],
+                "outputs": [
+                    "t0",
+                    "truth_energy_sum",
+                    "smeared_energy_sum",
+                    "evtid"
+                ],
+                "step_group": {
+                    "description": "group steps by time and evtid.",
+                    "expression": "reboost.hpge.processors.group_by_time(stp,window=10)"
+                },
+                "locals": {
+                    "hpge": "reboost.hpge.utils.get_hpge(meta_path=meta,pars=pars,detector=detector)"
+                },
+                "operations": {
+                    "t0": {
+                        "description": "first time in the hit.",
+                        "mode": "eval",
+                        "expression": "ak.fill_none(ak.firsts(hit.time,axis=-1),np.nan)"
+                    },
+                    "truth_energy_sum": {
+                        "description": "truth summed energy in the hit.",
+                        "mode": "eval",
+                        "expression": "ak.sum(hit.edep,axis=-1)"
+                    },
+                    "smeared_energy_sum": {
+                        "description": "summed energy after convolution with energy response.",
+                        "mode": "function",
+                        "expression": "reboost.hpge.processors.smear_energies(hit.truth_energy_sum,reso=pars.reso)"
+                    }
+
+                }
+            }
+
+    pars
+        a dictionary of parameters, must have a field per channel consisting of a `dict` of parameters. For example:
+
+        .. code-block:: json
+
+            {
+                "det000": {
+                    "reso": 1,
+                    "fccd": 0.1,
+                    "phy_vol_name":"det_phy",
+                    "meta_name": "icpc.json"
+                }
+            }
+
+        this should also contain the channel mappings needed by reboost. These are:
+        - `phy_vol_name`: the name of the physical volume,
+        - `meta_name`: the name of the JSON file with the metadata.
+
+        If these keys are not present both will be set to the remage output table name.
+
+    start_evtid
+        first `evtid` to read, defaults to 0.
+    n_evtid
+        number of `evtid` to read, if `None` all steps are read (the default).
+    buffer
+        length of the read buffer (number of rows).
+    gdml
+        path to the input gdml file.
+    metadata_path
+        path to the folder with the metadata (i.e. the `hardware.detectors.germanium.diodes` folder of `legend-metadata`)
+    merge_input_files
+        boolean flag to merge all input files into a single output.
+
+    Note
+    ----
+    - The operations can depend on the outputs of previous steps, so the order of the operations is important.
+    - It would be better to have a cleaner way to supply metadata and detector maps.
+    """
+
+    # get the gdml file
+    with utils.debug_logging(logging.CRITICAL):
+        reg = pyg4ometry.gdml.Reader(gdml).getRegistry() if gdml is not None else None
+
+    # get info on the files to read in a nice named tuple
+    finfo = utils.get_selected_files(
+        table=in_field,
+        file_list=list_file_in,
+        n_evtid=n_evtid,
+        start_evtid=start_evtid,
+    )
+
+    # loop over input files
+    for first_evtid, file_idx, file_in in zip(
+        finfo.file_start_global_evtids, finfo.file_indices, finfo.file_list
+    ):
+        # get the evtids in the file: may use a lot of memory
+        vertices = lh5.read_as(f"{in_field}/vertices/evtid", file_in, "np")
+
+        for ch_idx, d in enumerate(proc_config["channels"]):
+            msg = f"...running hit tier for {file_in} and {d}"
+            log.info(msg)
+
+            # get local variables
+
+            local_dict = get_locals(
+                proc_config.get("locals", {}),
+                pars_dict=pars.get(d, {}),
+                meta_path=metadata_path,
+                detector=d,
+                reg=reg,
+            )
+
+            is_first_chan = bool(ch_idx == 0)
+            is_first_file = bool(file_idx == 0)
+
+            # flag to overwrite the output file for the first chunk
+            delete_input = (is_first_chan & (merge_input_files is False)) | (
+                is_first_chan & is_first_file
+            )
+
+            msg = (
+                f"...begin processing with {file_in} to {file_out} and delete input {delete_input}"
+            )
+            log.debug(msg)
+
+            # number of iterations (used to handle last iteration)
+            it, entries, max_idx = utils.get_iterator(
+                file=file_in, lh5_table=f"{in_field}/{d}", buffer=buffer
+            )
+            buffer_rows = None
+
+            # iterate over the LH5 file
+            for idx, (lh5_obj, _, n_rows) in enumerate(it):
+                msg = f"... processed {idx} chunks out of {max_idx}"
+                log.debug(msg)
+
+                # convert to awkward
+                ak_obj = lh5_obj.view_as("ak")
+
+                # fix for a bug in lh5 iterator
+                if idx == max_idx:
+                    ak_obj = ak_obj[:n_rows]
+
+                # handle the buffers
+                obj, buffer_rows, mode = utils._merge_arrays(
+                    ak_obj, buffer_rows, idx=idx, max_idx=max_idx, delete_input=delete_input
+                )
+
+                # add global evtid
+                obj = utils.get_global_evtid(first_evtid, obj, vertices)
+
+                # check if the chunk can be skipped, does lack of sorting break this?
+
+                if not utils.get_include_chunk(
+                    obj._global_evtid,
+                    start_glob_evtid=finfo.first_global_evtid,
+                    end_glob_evtid=finfo.last_global_evtid,
+                ):
+                    continue
+
+                # select just the correct global evtid objects
+                obj = obj[
+                    (obj._global_evtid >= finfo.first_global_evtid)
+                    & (obj._global_evtid <= finfo.last_global_evtid)
+                ]
+
+                # rename the fields
+                obj = ak.with_field(
+                    obj,
+                    obj["evtid"],
+                    "_evtid",
+                )
+                obj = ak.without_field(obj, "evtid")
+
+                # convert back to a table, should work
+                data = Table(obj)
+
+                # group steps into hits
+                grouped = step_group(data, proc_config["step_group"])
+
+                # processors
+                for name, info in proc_config["operations"].items():
+                    msg = f"... adding column {name}"
+                    log.debug(msg)
+
+                    col = eval_expression(grouped, info, local_dict=local_dict)
+                    grouped.add_field(name, col)
+
+                # remove unwanted columns
+                log.debug("... removing unwanted columns")
+
+                existing_cols = list(grouped.keys())
+                for col in existing_cols:
+                    if col not in proc_config["outputs"]:
+                        grouped.remove_column(col, delete=True)
+
+                # write lh5 file
+
+                file_out_tmp = (
+                    f"{file_out.split('.')[0]}_{file_idx}.lh5"
+                    if (merge_input_files is False)
+                    else file_out
+                )
+                lh5.write(grouped, f"{d}/{out_field}", file_out_tmp, wo_mode=mode)
diff --git a/src/reboost/hpge/processors.py b/src/reboost/hpge/processors.py
new file mode 100644
index 0000000..d390364
--- /dev/null
+++ b/src/reboost/hpge/processors.py
@@ -0,0 +1,223 @@
+from __future__ import annotations
+
+import awkward as ak
+import legendhpges
+import lgdo
+import numpy as np
+from lgdo import Array, Table, VectorOfVectors
+from numpy.typing import ArrayLike
+
+
+def sort_data(obj: ak.Array, *, time_name: str = "time", evtid_name: str = "_evtid") -> ak.Array:
+    """Sort the data by evtid then time.
+
+    Parameters
+    ----------
+    obj
+        array of records containing the `time_name` and `evtid_name` fields
+
+    Returns
+    -------
+    sorted awkward array
+    """
+    indices = np.lexsort((obj[time_name], obj[evtid_name]))
+    return obj[indices]
+
+
+def group_by_evtid(data: Table) -> Table:
+    """Simple grouping by evtid.
+
+    Takes the input `stp` :class:`LGDO.Table` from remage and defines groupings of steps (i.e. the
+    `cumulative_length` for a vector of vectors). This then defines the output table (also :class:`LGDO.Table`),
+    on which processors can add fields.
+
+    Parameters
+    ----------
+    data
+        LGDO Table which must contain the `evtid` field.
+
+    Returns
+    -------
+    LGDO table of :class:`VectorOfVectors` for each field.
+
+    Note
+    ----
+    The input table must be sorted (by `evtid`).
+    """
+
+    # convert to awkward
+    obj_ak = data.view_as("ak")
+
+    # sort input
+    obj_ak = sort_data(obj_ak)
+
+    # extract cumulative lengths
+    counts = ak.run_lengths(obj_ak._evtid)
+    cumulative_length = np.cumsum(counts)
+
+    # build output table
+    out_tbl = Table(size=len(cumulative_length))
+
+    for f in obj_ak.fields:
+        out_tbl.add_field(
+            f, VectorOfVectors(cumulative_length=cumulative_length, flattened_data=obj_ak[f])
+        )
+    return out_tbl
+
+
+def group_by_time(
+    data: Table | ak.Array,
+    window: float = 10,
+    time_name: str = "time",
+    evtid_name: str = "_evtid",
+    fields: list | None = None,
+) -> lgdo.Table:
+    """Grouping of steps by `evtid` and `time`.
+
+    Takes the input `stp` :class:`LGDO.Table` from remage and defines groupings of steps (i.e. the
+    `cumulative_length` for a vector of vectors). This then defines the output table (also :class:`LGDO.Table`),
+    on which processors can add fields.
+
+    The windowing is based on defining a new group when the `evtid` changes or when the time increases by `> window`,
+    which is in units of us.
+
+
+    Parameters
+    ----------
+    data
+        LGDO Table or ak.Array which must contain the `time_name` and `evtid_name` fields
+    window
+        time window in us used to search for coincident hits
+    time_name
+        name of the timing field
+    evtid_name
+        name of the evtid field
+    fields
+        names of fields to include in the output table, if None includes all
+
+    Returns
+    -------
+    LGDO table of :class:`VectorOfVectors` for each field.
+
+    Note
+    ----
+    The input table must be sorted (first by `evtid` then `time`).
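+
+    Example
+    -------
+    A minimal sketch of the expected grouping (times in ns, window in us):
+
+    >>> import awkward as ak
+    >>> data = ak.Array({"_evtid": [1, 1, 1, 2], "time": [0, 100, 20000, 0]})
+    >>> group_by_time(data, window=10).view_as("ak").time.to_list()
+    [[0, 100], [20000], [0]]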
+ """ + + obj = data.view_as("ak") if isinstance(data, Table) else data + obj = sort_data(obj, time_name=time_name, evtid_name=evtid_name) + + # get difference + time_diffs = np.diff(obj[time_name]) + index_diffs = np.diff(obj[evtid_name]) + + # index of thhe last element in each run + time_change = (time_diffs > window * 1000) & (index_diffs == 0) + index_change = index_diffs > 0 + + # cumulative length is just the index of changes plus 1 + cumulative_length = np.array(np.where(time_change | index_change))[0] + 1 + + # add the las grouping + cumulative_length = np.append(cumulative_length, len(obj[time_name])) + + # build output table + out_tbl = Table(size=len(cumulative_length)) + + fields = obj.fields if fields is None else fields + for f in fields: + out_tbl.add_field( + f, VectorOfVectors(cumulative_length=cumulative_length, flattened_data=obj[f]) + ) + + return out_tbl + + +def smear_energies(truth_energy: Array, reso: float = 2) -> Array: + """Smearing of energies. + + Parameters + ---------- + truth_energy + Array of energies to be smeared. + reso + energy resolution (sigma). + + Returns + ------- + New array after sampling from a Gaussian with mean :math:`energy_i` and sigma `reso` for every element of `truth_array`. + + """ + + flat_energy = truth_energy + rng = np.random.default_rng() + + return Array(rng.normal(loc=flat_energy, scale=np.ones_like(flat_energy) * reso)) + + +def distance_to_surface( + positions_x: VectorOfVectors, + positions_y: VectorOfVectors, + positions_z: VectorOfVectors, + hpge: legendhpges.base.HPGe, + det_pos: ArrayLike, + surface_type: str | None = None, + unit: str = "mm", +) -> Array: + """Computes the distance from each step to the detector surface. + + Parameters + ---------- + positions_x + Global x positions for each step. + positions_y + Global y positions for each step. + positions_z + Global z positions for each step. + hpge + HPGe object. + det_pos + position of the detector origin, must be a 3 component array corresponding to `(x,y,z)`. + surface_type + string of which surface to use, can be `nplus`, `pplus` `passive` or None (in which case the distance to any surface is calculated). + unit + unit for the hit tier positions table. + + Returns + ------- + VectorOfVectors with the same shape as `positions_x/y/z` of the distance to the surface. + + Note + ---- + `positions_x/positions_y/positions_z` must all have the same shape. 
+ + """ + factor = np.array([1, 100, 1000])[unit == np.array(["mm", "cm", "m"])][0] + + # compute local positions + pos = [] + sizes = [] + for idx, pos_tmp in enumerate([positions_x, positions_y, positions_z]): + local_pos_tmp = ak.Array(pos_tmp) * factor - det_pos[idx] + local_pos_flat_tmp = ak.flatten(local_pos_tmp).to_numpy() + pos.append(local_pos_flat_tmp) + sizes.append(ak.num(local_pos_tmp, axis=1)) + + if not ak.all(sizes[0] == sizes[1]) or not ak.all(sizes[0] == sizes[2]): + msg = "all position vector of vector must have the same shape" + raise ValueError(msg) + + size = sizes[0] + # restructure the positions + local_positions = np.vstack(pos).T + + # get indices + + surface_indices = ( + np.where(np.array(hpge.surfaces) == surface_type) if surface_type is not None else None + ) + + # distance calc itself + distances = hpge.distance_to_surface(local_positions, surface_indices=surface_indices) + + return VectorOfVectors(ak.unflatten(distances, size)) diff --git a/src/reboost/hpge/tcm.py b/src/reboost/hpge/tcm.py new file mode 100644 index 0000000..99b951c --- /dev/null +++ b/src/reboost/hpge/tcm.py @@ -0,0 +1,113 @@ +from __future__ import annotations + +import logging +import re + +import awkward as ak +from lgdo import Table, lh5 + +from . import processors + +log = logging.getLogger(__name__) + + +def build_tcm( + hit_file: str, + out_file: str, + channels: list[str], + time_name: str = "t0", + idx_name: str = "global_evtid", + time_window_in_us: float = 10, +) -> None: + """Build the (Time Coincidence Map) TCM from the hit tier. + + Parameters + ---------- + hit_file + path to hit tier file. + out_file + output path for tcm. + channels + list of channel names to include. + time_name + name of the hit tier field used for time grouping. + idx_name + name of the hit tier field used for index grouping. + time_window_in_us + time window used to define the grouping. + + """ + + hash_func = r"\d+" + + msg = "start building time-coincidence map" + log.info(msg) + + chan_ids = [re.search(hash_func, channel).group() for channel in channels] + + hit_data = [] + for channel in channels: + hit_data.append( + lh5.read(f"{channel}/hit", hit_file, field_mask=[idx_name, time_name]).view_as("ak") + ) + tcm = get_tcm_from_ak( + hit_data, chan_ids, window=time_window_in_us, time_name=time_name, idx_name=idx_name + ) + + if tcm is not None: + lh5.write(tcm, "tcm", out_file, wo_mode="of") + + +def get_tcm_from_ak( + hit_data: list[ak.Array], + channels: list[int], + *, + window: float = 10, + time_name: str = "t0", + idx_name: str = "global_evtid", +) -> Table: + """Builds a time-coincidence map from a hit of hit data Tables. 
+
+    - builds an ak.Array of the data, merging the channels, with fields based on `time_name` and `idx_name`, adding an `array_id` field from the channel index and an `array_idx` field with the row,
+    - sorts this array by the `idx_name` then `time_name` fields,
+    - groups by `idx_name` and `time_name` based on the window parameter.
+
+    Parameters
+    ----------
+    hit_data
+        list of hit tier data for each channel
+    channels
+        list of channel indices
+    window
+        time window for selecting coincidences (in us)
+    time_name
+        name of the field for time information
+    idx_name
+        name of the decay index field
+
+    Returns
+    -------
+    an LGDO Table of :class:`VectorOfVectors` containing the time-coincidence map
+    """
+    # build ak_obj for sorting
+    sort_objs = []
+
+    for ch_idx, data_tmp in zip(channels, hit_data):
+        obj_tmp = ak.copy(data_tmp)
+        obj_tmp = obj_tmp[[time_name, idx_name]]
+        hit_idx = ak.local_index(obj_tmp)
+
+        obj_tmp = ak.with_field(obj_tmp, hit_idx, "array_idx")
+
+        obj_tmp["array_id"] = int(ch_idx)
+        sort_objs.append(obj_tmp)
+
+    obj_tot = ak.concatenate(sort_objs)
+
+    return processors.group_by_time(
+        obj_tot,
+        time_name=time_name,
+        evtid_name=idx_name,
+        window=window,
+        fields=["array_id", "array_idx"],
+    )
diff --git a/src/reboost/hpge/utils.py b/src/reboost/hpge/utils.py
new file mode 100644
index 0000000..4cb4265
--- /dev/null
+++ b/src/reboost/hpge/utils.py
@@ -0,0 +1,477 @@
+from __future__ import annotations
+
+import copy
+import importlib
+import itertools
+import json
+import logging
+import re
+from collections import namedtuple
+from contextlib import contextmanager
+from pathlib import Path
+from typing import NamedTuple
+
+import awkward as ak
+import legendhpges
+import numpy as np
+import pyg4ometry
+import yaml
+from lgdo import lh5
+from lgdo.lh5 import LH5Iterator
+from numpy.typing import ArrayLike, NDArray
+
+log = logging.getLogger(__name__)
+
+reg = pyg4ometry.geant4.Registry()
+
+
+class FileInfo(NamedTuple):
+    """NamedTuple storing the information on the input files."""
+
+    file_list: list[str]
+    """list of strings of the selected files."""
+
+    file_indices: list[int]
+    """list of integers of the indices of the files."""
+
+    file_start_global_evtids: list[int]
+    """list of integers of the first global evtid for each file."""
+
+    first_global_evtid: int
+    """first global evtid to process."""
+
+    last_global_evtid: int
+    """last global evtid to process."""
+
+
+def get_channels_from_groups(names: list | str | None, groupings: dict | None = None) -> list:
+    """Get a list of channels from a list of groups."""
+
+    if names is None:
+        channels_e = []
+    elif isinstance(names, str):
+        channels_e = groupings[names]
+    elif isinstance(names, list):
+        channels_e = list(itertools.chain.from_iterable([groupings[e] for e in names]))
+    else:
+        msg = f"names {names} must be list or str or `None`"
+        raise ValueError(msg)
+
+    return channels_e
+
+
+def get_selected_files(
+    file_list: list[str], table: str, n_evtid: int | None, start_evtid: int
+) -> FileInfo:
+    """Get the files to read based on removing those with global evtid out of the selected range.
+
+    - expands wildcards,
+    - extracts the number of evtid per file,
+    - removes files outside of the selected evtid range.
+
+    Parameters
+    ----------
+    file_list
+        list of files to process (can include wildcards)
+    table
+        lh5 input field
+    n_evtid
+        number of simulation events to process.
+    start_evtid
+        first (global) simulation event to process.
+
+    Returns
+    -------
+    `FileInfo` object with information on the files.
+ """ + # expand wildcards + expanded_list_file_in = get_file_list(path=file_list) + + n_sim = get_num_simulated(expanded_list_file_in, table=table) + + # first global index of each file + cum_nsim = np.concatenate([[0], np.cumsum(n_sim)]) + + low_global_evtid, high_global_evtid = get_global_evtid_range( + start_evtid=start_evtid, n_evtid=n_evtid, n_tot_sim=cum_nsim[-1] + ) + file_indices = np.array( + get_files_to_read( + cum_nsim, start_glob_evtid=low_global_evtid, end_glob_evtid=high_global_evtid + ) + ) + + # select just the necessary files + file_list_sel = np.array(expanded_list_file_in)[file_indices] + start_evtid_sel = cum_nsim[file_indices] + + return FileInfo( + file_list_sel, file_indices, start_evtid_sel, low_global_evtid, high_global_evtid + ) + + +def get_num_simulated(file_list: list, table: str = "hit") -> int: + """Loop over a list of files and extract the number of simulated events. + + Based on the size of the `vertices` tables. + + Parameters + ---------- + file_list + list of input files (each must contain the vertices table) + table + name of the lh5 input table. + """ + n = [] + for file in file_list: + it = lh5.LH5Iterator(file, f"{table}/vertices", buffer_len=int(5e6)) + n.append(it._get_file_cumlen(0)) + + msg = f"files contain {n} events" + log.info(msg) + return n + + +def get_num_evtid_hit_tier(hit_file: str, channels: list[str], idx_name: str) -> int: + """Get the number of evtid in the hit file. + + Parameters + ---------- + hit_file + path to the hit file + channels + list of channel names + idx_name + name of the index field + + Returns + ------- + largest evtid in the files. + + Note + ---- + To avoid reading the full file into memory we assume the hit tier files are sorted by "idx_name". This should always be the case. + """ + + n_max = 0 + for channel in channels: + # use the iterator to get the file size + it = lh5.LH5Iterator(hit_file, f"{channel}/hit/", buffer_len=int(5e6)) + entries = it._get_file_cumentries(0) + + ob_ak = lh5.read(f"{channel}/hit/", hit_file, idx=[entries - 1]).view_as("ak") + + max_evtid = ob_ak[idx_name][0] + + n_max = max_evtid if (max_evtid > n_max) else n_max + + return n_max + + +def get_file_list(path: str | list[str]) -> NDArray: + """Get list of files to read. + + Parameters + ---------- + path + either a string or a list of strings containing files paths to process. May contain wildcards which are expanded. + + Returns + ------- + sorted array of files, after expanding wildcards, removing duplicates and sorting. + + """ + + if isinstance(path, str): + path = [path] + + path_out_list = [] + + for ptmp in path: + ptmp_path = Path(ptmp) + dir_tmp = ptmp_path.parent + pattern_tmp = ptmp_path.name + + path_out_list.extend(dir_tmp.glob(pattern_tmp)) + path_out_list_str = [str(ptmp) for ptmp in path_out_list] + return np.array(np.sort(np.unique(path_out_list_str))) + + +def get_global_evtid_range( + start_evtid: int, n_evtid: int | None, n_tot_sim: int +) -> tuple[int, int]: + """Get the global evtid range""" + + # some checks + if (n_evtid is not None) and (start_evtid + n_evtid > n_tot_sim): + msg = "Index are out of the range of the simulation." + raise ValueError(msg) + + start_glob_index = start_evtid + end_glob_index = start_evtid + n_evtid - 1 if (n_evtid is not None) else n_tot_sim - 1 + return start_glob_index, end_glob_index + + +def get_files_to_read(cum_n_sim: ArrayLike, start_glob_evtid: int, end_glob_evtid: int) -> NDArray: + """Get the index of files to read based on the number of evtid to read and the start evtid. 
+
+    Parameters
+    ----------
+    cum_n_sim
+        cumulative list of the number of evtid per file.
+    start_glob_evtid
+        first global evtid to include.
+    end_glob_evtid
+        last global evtid to process.
+
+    Returns
+    -------
+    array of the indices of files to process.
+    """
+    # find which files to read
+
+    file_indices = []
+    cum_n_sim = np.array(cum_n_sim)
+
+    for i, (low, high) in enumerate(zip(cum_n_sim, cum_n_sim[1:] - 1)):
+        if (high >= start_glob_evtid) & (low <= end_glob_evtid):
+            file_indices.append(i)
+    return np.array(file_indices)
+
+
+def get_global_evtid(first_evtid: int, obj: ak.Array, vertices: ArrayLike) -> ak.Array:
+    """Adds a global evtid field to the array.
+
+    The global evtid is the index of the decay in the order the files are read in. This is obtained
+    from the first_evtid, defined as the number of decays in the previous files, plus the row of the
+    vertices array corresponding to this evtid. In this way we obtain an index both sorted and unique
+    over all the input files; this enables easy selection of some chunks.
+
+    Parameters
+    ----------
+    first_evtid
+        number of decays in the previous files
+    obj
+        awkward array of the input data
+    vertices
+        array of the vertex evtids
+
+    Returns
+    -------
+    the obj with the global_evtid field added
+    """
+    vertices = np.array(vertices)
+    indices = np.searchsorted(vertices, np.array(obj.evtid))
+
+    if np.any(vertices[indices] != np.array(obj.evtid)):
+        msg = "Some of the evtids in the obj do not correspond to rows in the input"
+        raise ValueError(msg)
+
+    return ak.with_field(obj, first_evtid + indices, "_global_evtid")
+
+
+def get_include_chunk(
+    global_evtid: ak.Array,
+    start_glob_evtid: int,
+    end_glob_evtid: int,
+) -> bool:
+    """Check if a chunk can be skipped based on the evtid range.
+
+    Parameters
+    ----------
+    global_evtid
+        awkward array of the (global) evtid in the chunk
+    start_glob_evtid
+        first global evtid to include.
+    end_glob_evtid
+        last global evtid to process.
+
+    Returns
+    -------
+    boolean flag of whether to include the chunk.
+
+    """
+    low = np.min(np.array(global_evtid))
+    high = np.max(np.array(global_evtid))
+    return (high >= start_glob_evtid) & (low <= end_glob_evtid)
+
+
+def get_hpge(meta_path: str | None, pars: NamedTuple, detector: str) -> legendhpges.HPGe:
+    """Extract the :class:`legendhpges.HPGe` object from metadata.
+
+    Parameters
+    ----------
+    meta_path
+        path to the folder with the `diodes` metadata.
+    pars
+        named tuple of parameters.
+    detector
+        remage output name for the detector.
+
+    Returns
+    -------
+    hpge
+        the `legendhpges` object for the detector.
+    """
+    with debug_logging(logging.CRITICAL):
+        reg = pyg4ometry.geant4.Registry()
+        if meta_path is not None:
+            meta_name = pars.meta_name if ("meta_name" in pars._fields) else f"{detector}.json"
+            meta_dict = Path(meta_path) / Path(meta_name)
+            return legendhpges.make_hpge(meta_dict, registry=reg)
+        return None
+
+
+def get_phy_vol(
+    reg: pyg4ometry.geant4.Registry | None, pars: NamedTuple, detector: str
+) -> pyg4ometry.geant4.PhysicalVolume:
+    """Extract the :class:`pyg4ometry.geant4.PhysicalVolume` object from GDML.
+
+    Parameters
+    ----------
+    reg
+        Geant4 registry from GDML
+    pars
+        named tuple of parameters.
+    detector
+        remage output name for the detector.
+
+    Returns
+    -------
+    phy_vol
+        the `pyg4ometry.geant4.PhysicalVolume` object for the detector
+    """
+
+    with debug_logging(logging.CRITICAL):
+        if reg is not None:
+            phy_name = pars.phy_vol_name if ("phy_vol_name" in pars._fields) else f"{detector}"
+            return reg.physicalVolumeDict[phy_name]
+
+        return None
+
+
+@contextmanager
+def debug_logging(level):
+    logger = logging.getLogger("root")
+    old_level = logger.getEffectiveLevel()
+    logger.setLevel(level)
+    try:
+        yield
+    finally:
+        logger.setLevel(old_level)
+
+
+def dict2tuple(dictionary: dict) -> namedtuple:
+    return namedtuple("parameters", dictionary.keys())(**dictionary)
+
+
+def get_function_string(expr: str) -> tuple[str, dict]:
+    """Get a function call to evaluate."""
+
+    args_str = re.search(r"\((.*)\)$", expr.strip()).group(1)
+
+    # get module and function names
+    func_call = expr.strip().split("(")[0]
+    subpackage, func = func_call.rsplit(".", 1)
+    package = subpackage.split(".")[0]
+    importlib.import_module(subpackage, package=__package__)
+
+    # declare imported package as globals (see eval() call later)
+    globs = {
+        package: importlib.import_module(package),
+    }
+
+    call_str = f"{func_call}({args_str})"
+
+    return call_str, globs
+
+
+def _merge_arrays(
+    ak_obj: ak.Array,
+    buffer_rows: ak.Array | None,
+    idx: int,
+    max_idx: int,
+    delete_input: bool = False,
+) -> tuple[ak.Array, ak.Array, str]:
+    """Merge awkward arrays with a buffer and simultaneously remove the last rows.
+
+    This function is needed since iterating through the rows of an Array in chunks
+    can split events across chunk boundaries. It concatenates the rows in the buffer
+    onto the start of the data, and removes the last rows of each chunk, saving them
+    into a buffer for the next chunk.
+
+    Parameters
+    ----------
+    ak_obj
+        array of data
+    buffer_rows
+        buffer to concatenate with.
+    idx
+        integer index used to control the behaviour for the first and last chunk.
+    max_idx
+        largest index.
+    delete_input
+        flag controlling whether the output file is overwritten for the first chunk.
+
+    Returns
+    -------
+    (obj, buffer, mode) tuple of the data, the buffer for the next chunk and the IO mode for file writing.
+ """ + counts = ak.run_lengths(ak_obj.evtid) + rows = ak.num(ak_obj, axis=-1) + end_rows = counts[-1] + + if max_idx == 0: + mode = "of" if (delete_input) else "append" + obj = ak_obj + elif idx == 0: + mode = "of" if (delete_input) else "append" + obj = ak_obj[0 : rows - end_rows] + buffer_rows = copy.deepcopy(ak_obj[rows - end_rows :]) + elif idx != max_idx: + mode = "append" + obj = ak.concatenate((buffer_rows, ak_obj[0 : rows - end_rows])) + buffer_rows = copy.deepcopy(ak_obj[rows - end_rows :]) + else: + mode = "append" + obj = ak.concatenate((buffer_rows, ak_obj)) + buffer_rows = None + + return obj, buffer_rows, mode + + +def get_iterator(file: str, buffer: int, lh5_table: str, **kwargs) -> tuple[LH5Iterator, int, int]: + """Get information on the iterator (number of index and entries).""" + + it = LH5Iterator(file, lh5_table, buffer_len=buffer, **kwargs) + entries = it._get_file_cumentries(0) + + # number of blocks is ceil of entries/buffer, + # shift by 1 since idx starts at 0 + max_idx = int(np.ceil(entries / buffer)) - 1 + + return it, entries, max_idx + + +__file_extensions__ = {"json": [".json"], "yaml": [".yaml", ".yml"]} + + +def load_dict(fname: str, ftype: str | None = None) -> dict: + """Load a text file as a Python dict.""" + fname = Path(fname) + + # determine file type from extension + if ftype is None: + for _ftype, exts in __file_extensions__.items(): + if fname.suffix in exts: + ftype = _ftype + + msg = f"loading {ftype} dict from: {fname}" + log.debug(msg) + + with fname.open() as f: + if ftype == "json": + return json.load(f) + if ftype == "yaml": + return yaml.safe_load(f) + + msg = f"unsupported file format {ftype}" + raise NotImplementedError(msg) diff --git a/tests/configs/geom.gdml b/tests/configs/geom.gdml new file mode 100644 index 0000000..62ebc8d --- /dev/null +++ b/tests/configs/geom.gdml @@ -0,0 +1,132 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_hpge_distance_to_surface.py b/tests/test_hpge_distance_to_surface.py new file mode 100644 index 0000000..1114903 --- /dev/null +++ b/tests/test_hpge_distance_to_surface.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +import awkward as ak +import pytest +from legendhpges import make_hpge +from legendtestdata import LegendTestData +from lgdo import types + +from reboost.hpge.processors import distance_to_surface + + +@pytest.fixture(scope="session") +def test_data_configs(): + ldata = LegendTestData() + ldata.checkout("5f9b368") + return ldata.get_path("legend/metadata/hardware/detectors/germanium/diodes") + + +def test_distance_to_surface(test_data_configs): + gedet = make_hpge(test_data_configs + "/V99000A.json") + dist = [100, 0, 0] + + pos = ak.Array( + { + "xloc": [[0, 100, 200], [100], [700, 500, 200]], + "yloc": [[100, 0, 0], [200], [100, 300, 200]], + "zloc": [[700, 10, 20], [100], [300, 100, 0]], + } + ) + + # check just the shape + assert ak.all( + ak.num(distance_to_surface(pos.xloc, pos.yloc, pos.zloc, gedet, dist, None), axis=1) + == [3, 1, 3] + ) + + # check it can be written + + assert isinstance( + distance_to_surface(pos.xloc, pos.yloc, pos.zloc, gedet, dist, None), types.LGDO + ) diff --git a/tests/test_hpge_hit.py b/tests/test_hpge_hit.py new file mode 100644 index 0000000..806c6da --- /dev/null +++ b/tests/test_hpge_hit.py @@ -0,0 +1,396 @@ 
+from __future__ import annotations + +import pathlib + +import awkward as ak +import numpy as np +import pyg4ometry +import pytest +from legendhpges.base import HPGe +from legendtestdata import LegendTestData +from lgdo import Table, lh5 + +from reboost.hpge import hit + +configs = pathlib.Path(__file__).parent.resolve() / pathlib.Path("configs") + + +@pytest.fixture(scope="session") +def test_data_configs(): + ldata = LegendTestData() + ldata.checkout("5f9b368") + return ldata.get_path("legend/metadata/hardware/detectors/germanium/diodes") + + +def test_get_locals(test_data_configs): + local_info = {"hpge": "reboost.hpge.utils.get_hpge(meta_path=meta,pars=pars,detector=detector)"} + + local_dict = hit.get_locals( + local_info, + pars_dict={"meta_name": "V99000A.json"}, + detector="det001", + meta_path=test_data_configs, + ) + + assert isinstance(local_dict["hpge"], HPGe) + + +def test_eval(): + in_arr = ak.Array( + { + "evtid": [[1, 1, 1], [2, 2], [10, 10], [11], [12, 12, 12]], + "time": [[0, 0, 0], [0, 0], [0, 0], [0], [0, 0, 0]], + "edep": [[100, 150, 300], [0, 2000], [10, 20], [19], [100, 200, 300]], + } + ) + tab = Table(in_arr) + basic_eval = {"mode": "eval", "expression": "ak.sum(hit.edep,axis=-1)"} + + assert ak.all( + hit.eval_expression(tab, basic_eval, {}).view_as("ak") == [550, 2000, 30, 19, 600] + ) + + tab.add_field("e_sum", hit.eval_expression(tab, basic_eval, {})) + bad_eval = {"mode": "sum", "expression": "ak.sum(hit.edep,axis=-1)"} + + with pytest.raises(ValueError): + hit.eval_expression(tab, bad_eval, {}) + + func_eval = { + "mode": "function", + "expression": "reboost.hpge.processors.smear_energies(hit.e_sum,reso=pars.reso)", + } + pars = {"reso": 2} + # test get locals + local_dict = hit.get_locals({}, pars_dict=pars) + assert np.size(hit.eval_expression(tab, func_eval, local_dict).view_as("np")) == 5 + + +def test_eval_with_hpge(test_data_configs): + local_info = {"hpge": "reboost.hpge.utils.get_hpge(meta_path=meta,pars=pars,detector=detector)"} + local_dict = hit.get_locals( + local_info, + pars_dict={"meta_name": "V99000A.json"}, + detector="det001", + meta_path=test_data_configs, + ) + + pos = ak.Array( + { + "xloc": [[0, 100, 200], [100], [700, 500, 200]], + "yloc": [[100, 0, 0], [200], [100, 300, 200]], + "zloc": [[700, 10, 20], [100], [300, 100, 0]], + } + ) + tab = Table(pos) + + func_eval = { + "mode": "function", + "expression": "reboost.hpge.processors.distance_to_surface(hit.xloc, hit.yloc, hit.zloc, hpge, [0,0,0], None)", + } + + assert ak.all(ak.num(hit.eval_expression(tab, func_eval, local_dict), axis=1) == [3, 1, 3]) + + +def test_eval_with_hpge_and_phy_vol(test_data_configs): + gdml_path = configs / pathlib.Path("geom.gdml") + + gdml = pyg4ometry.gdml.Reader(gdml_path).getRegistry() + + local_info = { + "hpge": "reboost.hpge.utils.get_hpge(meta_path=meta,pars=pars,detector=detector)", + "phy_vol": "reboost.hpge.utils.get_phy_vol(reg=reg,pars=pars,detector=detector)", + } + local_dict = hit.get_locals( + local_info, + pars_dict={"meta_name": "V99000A.json", "phy_vol_name": "det_phy_1"}, + detector="det001", + meta_path=test_data_configs, + reg=gdml, + ) + + pos = ak.Array( + { + "xloc": [[0, 100, 200], [100], [700, 500, 200]], + "yloc": [[100, 0, 0], [200], [100, 300, 200]], + "zloc": [[700, 10, 20], [100], [300, 100, 0]], + } + ) + tab = Table(pos) + + func_eval = { + "mode": "function", + "expression": "reboost.hpge.processors.distance_to_surface(hit.xloc, hit.yloc, hit.zloc, hpge, phy_vol.position.eval(), None)", + } + + assert 
ak.all(ak.num(hit.eval_expression(tab, func_eval, local_dict), axis=1) == [3, 1, 3]) + + +# test the full processing chain +@pytest.fixture +def test_reboost_input_file(tmp_path): + # make it large enough to be multiple groups + rng = np.random.default_rng() + evtid_1 = np.sort(rng.integers(int(1e4), size=(int(1e5)))) + time_1 = rng.uniform(low=0, high=1, size=(int(1e5))) + edep_1 = rng.uniform(low=0, high=1000, size=(int(1e5))) + pos_x_1 = rng.uniform(low=-50, high=50, size=(int(1e5))) + pos_y_1 = rng.uniform(low=-50, high=50, size=(int(1e5))) + pos_z_1 = rng.uniform(low=-50, high=50, size=(int(1e5))) + + # make it not divide by the buffer len + evtid_2 = np.sort(rng.integers(int(1e4), size=(3004))) + time_2 = rng.uniform(low=0, high=1, size=(3004)) + edep_2 = rng.uniform(low=0, high=1000, size=(3004)) + pos_x_2 = rng.uniform(low=-50, high=50, size=(3004)) + pos_y_2 = rng.uniform(low=-50, high=50, size=(3004)) + pos_z_2 = rng.uniform(low=-50, high=50, size=(3004)) + + vertices_1 = ak.Array({"evtid": np.arange(int(1e4))}) + vertices_2 = ak.Array({"evtid": np.arange(int(1e4))}) + + arr_1 = ak.Array( + { + "evtid": evtid_1, + "time": time_1, + "edep": edep_1, + "xloc": pos_x_1, + "yloc": pos_y_1, + "zloc": pos_z_1, + } + ) + arr_2 = ak.Array( + { + "evtid": evtid_2, + "time": time_2, + "edep": edep_2, + "xloc": pos_x_2, + "yloc": pos_y_2, + "zloc": pos_z_2, + } + ) + + lh5.write(Table(vertices_1), "hit/vertices", tmp_path / "file1.lh5", wo_mode="of") + lh5.write(Table(vertices_2), "hit/vertices", tmp_path / "file2.lh5", wo_mode="of") + + lh5.write(Table(arr_1), "hit/det001", tmp_path / "file1.lh5", wo_mode="append") + lh5.write(Table(arr_2), "hit/det001", tmp_path / "file2.lh5", wo_mode="append") + + return tmp_path + + +def test_build_hit(test_reboost_input_file): + # first just one file no pars + + proc_config = { + "channels": [ + "det001", + ], + "outputs": ["t0", "_evtid"], + "step_group": { + "description": "group steps by time and evtid.", + "expression": "reboost.hpge.processors.group_by_time(stp,window=10)", + }, + "operations": { + "t0": { + "description": "first time in the hit.", + "mode": "eval", + "expression": "ak.fill_none(ak.firsts(hit.time,axis=-1),np.nan)", + }, + "truth_energy_sum": { + "description": "truth summed energy in the hit.", + "mode": "eval", + "expression": "ak.sum(hit.edep,axis=-1)", + }, + }, + } + + # build hit for file 1 and 2 separately and then also merging them + for output_file, input_file in zip( + ["out_1.lh5", "out_2.lh5", "out_merge.lh5"], ["file1.lh5", "file2.lh5", "file*.lh5"] + ): + hit.build_hit( + str(test_reboost_input_file / output_file), + [str(test_reboost_input_file / input_file)], + in_field="hit", + out_field="hit", + proc_config=proc_config, + pars={}, + buffer=10000, + ) + + tab_1 = lh5.read("det001/hit", str(test_reboost_input_file / "out_1.lh5")).view_as("ak") + tab_2 = lh5.read("det001/hit", str(test_reboost_input_file / "out_2.lh5")).view_as("ak") + tab_merge = lh5.read("det001/hit", str(test_reboost_input_file / "out_merge.lh5")).view_as("ak") + + # check lengths + assert len(ak.flatten(tab_1._evtid, axis=-1)) == int(1e5) + assert len(ak.flatten(tab_2._evtid, axis=-1)) == 3004 + assert len(ak.flatten(tab_merge._evtid, axis=-1)) == 3004 + int(1e5) + + # one call but write both files + hit.build_hit( + str(test_reboost_input_file / "out.lh5"), + [str(test_reboost_input_file / "file*.lh5")], + in_field="hit", + out_field="hit", + proc_config=proc_config, + pars={}, + buffer=10000, + merge_input_files=False, + ) + + tab_1 = 
lh5.read("det001/hit", str(test_reboost_input_file / "out_0.lh5")).view_as("ak") + tab_2 = lh5.read("det001/hit", str(test_reboost_input_file / "out_1.lh5")).view_as("ak") + + # check lengths + assert len(ak.flatten(tab_1._evtid, axis=-1)) == int(1e5) + assert len(ak.flatten(tab_2._evtid, axis=-1)) == 3004 + + # test with a smaller buffer + hit.build_hit( + str(test_reboost_input_file / "out_small_buffer.lh5"), + [str(test_reboost_input_file / "file*.lh5")], + in_field="hit", + out_field="hit", + proc_config=proc_config, + pars={}, + buffer=1000, + ) + + tab_small = lh5.read( + "det001/hit", str(test_reboost_input_file / "out_small_buffer.lh5") + ).view_as("ak") + + # buffer does not affect results + assert ak.all(ak.flatten(tab_merge._evtid) == ak.flatten(tab_small._evtid)) + + +def test_build_hit_some_row(test_reboost_input_file): + proc_config = { + "channels": [ + "det001", + ], + "outputs": ["t0", "_evtid"], + "step_group": { + "description": "group steps by time and evtid.", + "expression": "reboost.hpge.processors.group_by_time(stp,window=10)", + }, + "operations": { + "t0": { + "description": "first time in the hit.", + "mode": "eval", + "expression": "ak.fill_none(ak.firsts(hit.time,axis=-1),np.nan)", + }, + "truth_energy_sum": { + "description": "truth summed energy in the hit.", + "mode": "eval", + "expression": "ak.sum(hit.edep,axis=-1)", + }, + }, + } + + # test asking to read too many rows + with pytest.raises(ValueError): + hit.build_hit( + str(test_reboost_input_file / "out.lh5"), + [str(test_reboost_input_file / "file1.lh5")], + n_evtid=int(1e7), + in_field="hit", + out_field="hit", + proc_config=proc_config, + pars={}, + buffer=10000, + ) + + # test reading the data in two goes gives the same result + for n_ev, s_ev, out in zip( + [int(1e3), int(1e4 - 1e3), int(1e4), int(1e4)], + [0, int(1e3), 0, 100], + ["out_some_rows.lh5", "out_rest_rows.lh5", "out_all_file_one.lh5", "out_mix.lh5"], + ): + # test read only some events + hit.build_hit( + str(test_reboost_input_file / out), + [ + str(test_reboost_input_file / "file1.lh5"), + str(test_reboost_input_file / "file2.lh5"), + ], + n_evtid=n_ev, + start_evtid=s_ev, + in_field="hit", + out_field="hit", + proc_config=proc_config, + pars={}, + buffer=100000, + ) + + tab_some = lh5.read("det001/hit", str(test_reboost_input_file / "out_some_rows.lh5")).view_as( + "ak" + ) + tab_rest = lh5.read("det001/hit", str(test_reboost_input_file / "out_rest_rows.lh5")).view_as( + "ak" + ) + + tab_1 = lh5.read("det001/hit", str(test_reboost_input_file / "out_all_file_one.lh5")).view_as( + "ak" + ) + + tab_merge = ak.concatenate((tab_some, tab_rest)) + assert ak.all(ak.all(tab_merge._evtid == tab_1._evtid, axis=-1)) + + +def test_build_hit_with_locals(test_reboost_input_file, test_data_configs): + proc_config = { + "channels": [ + "det001", + ], + "outputs": ["t0", "_evtid", "distance_to_surface"], + "step_group": { + "description": "group steps by time and evtid.", + "expression": "reboost.hpge.processors.group_by_time(stp,window=10)", + }, + "locals": { + "hpge": "reboost.hpge.utils.get_hpge(meta_path=meta,pars=pars,detector=detector)", + "phy_vol": "reboost.hpge.utils.get_phy_vol(reg=reg,pars=pars,detector=detector)", + }, + "operations": { + "t0": { + "description": "first time in the hit.", + "mode": "eval", + "expression": "ak.fill_none(ak.firsts(hit.time,axis=-1),np.nan)", + }, + "truth_energy_sum": { + "description": "truth summed energy in the hit.", + "mode": "eval", + "expression": "ak.sum(hit.edep,axis=-1)", + }, + 
"smear_energy_sum": { + "description": "summed energy after convolution with energy response.", + "mode": "function", + "expression": "reboost.hpge.processors.smear_energies(hit.truth_energy_sum,reso=pars.reso)", + }, + "distance_to_surface": { + "description": "distance to the nplus surface", + "mode": "function", + "expression": "reboost.hpge.processors.distance_to_surface(hit.xloc, hit.yloc, hit.zloc, hpge, phy_vol.position.eval(), None)", + }, + }, + } + gdml_path = configs / pathlib.Path("geom.gdml") + meta_path = test_data_configs + + # complete check on the processing chain including parameters / local variables + + hit.build_hit( + str(test_reboost_input_file / "out.lh5"), + [str(test_reboost_input_file / "file*.lh5")], + in_field="hit", + out_field="hit", + proc_config=proc_config, + pars={"det001": {"reso": 1, "meta_name": "V99000A.json", "phy_vol_name": "det_phy_1"}}, + buffer=100000, + merge_input_files=False, + metadata_path=meta_path, + gdml=gdml_path, + ) diff --git a/tests/test_hpge_processors.py b/tests/test_hpge_processors.py new file mode 100644 index 0000000..167912f --- /dev/null +++ b/tests/test_hpge_processors.py @@ -0,0 +1,12 @@ +from __future__ import annotations + +import numpy as np + +from reboost.hpge import processors + + +def test_smear(): + truth = np.array([1, 2, 3, 4, 5]) + smeared = processors.smear_energies(truth, reso=0.01) + + assert np.size(smeared) == 5 diff --git a/tests/test_hpge_step_group.py b/tests/test_hpge_step_group.py new file mode 100644 index 0000000..69141cd --- /dev/null +++ b/tests/test_hpge_step_group.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +import awkward as ak +import numpy as np +from lgdo import Table + +from reboost.hpge import hit, processors + + +def test_evtid_group(): + in_arr_evtid = ak.Array( + {"_evtid": [1, 1, 1, 2, 2, 10, 10, 11, 12, 12, 12], "time": np.zeros(11)} + ) + + in_tab = Table(in_arr_evtid) + + out = processors.group_by_evtid(in_tab) + out_ak = out.view_as("ak") + assert ak.all(out_ak._evtid == [[1, 1, 1], [2, 2], [10, 10], [11], [12, 12, 12]]) + assert ak.all(out_ak.time == [[0, 0, 0], [0, 0], [0, 0], [0], [0, 0, 0]]) + + # test the eval in build hit also + + out_eval = hit.step_group( + in_tab, + { + "description": "group steps by time and evtid.", + "expression": "reboost.hpge.processors.group_by_evtid(stp)", + }, + ) + + out_eval_ak = out_eval.view_as("ak") + + assert ak.all(out_ak._evtid == out_eval_ak._evtid) + assert ak.all(out_ak.time == out_eval_ak.time) + + +def test_time_group(): + # time units are ns + in_arr_evtid = ak.Array( + { + "_evtid": [1, 1, 1, 2, 2, 2, 2, 2, 11, 12, 12, 12, 15, 15, 15, 15, 15], + "time": [ + 0, + -2000, + 3000, + 0, + 100, + 1200, + 17000, + 17010, + 0, + 0, + 0, + -5000, + 150, + 151, + 152, + 3000, + 3100, + ], + } + ) + + in_tab = Table(in_arr_evtid) + + # 1us =1000ns + out = processors.group_by_time(in_tab, window=1) + out_ak = out.view_as("ak") + assert ak.all( + out_ak._evtid + == [[1], [1], [1], [2, 2], [2], [2, 2], [11], [12], [12, 12], [15, 15, 15], [15, 15]] + ) + assert ak.all( + out_ak.time + == [ + [-2000], + [0], + [3000], + [0, 100], + [1200], + [17000, 17010], + [0], + [-5000], + [0, 0], + [150, 151, 152], + [3000, 3100], + ] + ) diff --git a/tests/test_hpge_utils.py b/tests/test_hpge_utils.py new file mode 100644 index 0000000..cd1f2fd --- /dev/null +++ b/tests/test_hpge_utils.py @@ -0,0 +1,297 @@ +from __future__ import annotations + +import json +import pathlib + +import awkward as ak +import numpy as np +import pyg4ometry +import pytest 
+def test_merge():
+    ak_obj = ak.Array({"evtid": [1, 1, 1, 1, 2, 2, 3], "edep": [100, 50, 1000, 20, 100, 200, 10]})
+    buffer_rows = ak.Array({"evtid": [1, 1], "edep": [60, 50]})
+
+    # the first chunk should only have the last event removed (into the buffer)
+    merged_idx_0, buffer_0, mode = _merge_arrays(ak_obj, None, 0, 100, True)
+
+    assert ak.all(merged_idx_0.evtid == [1, 1, 1, 1, 2, 2])
+    assert ak.all(merged_idx_0.edep == [100, 50, 1000, 20, 100, 200])
+
+    assert ak.all(buffer_0.evtid == [3])
+    assert ak.all(buffer_0.edep == [10])
+
+    # when the input file is deleted the write mode is overwrite
+    assert mode == "of"
+
+    # if delete_input is false the chunk should be appended instead
+    _, _, mode = _merge_arrays(ak_obj, None, 0, 100, False)
+    assert mode == "append"
+
+    # if idx is neither 0 nor max_idx, the buffer is prepended and the last event removed
+    merged_idx, buffer, mode = _merge_arrays(ak_obj, buffer_rows, 2, 100, True)
+
+    assert ak.all(merged_idx.evtid == [1, 1, 1, 1, 1, 1, 2, 2])
+    assert ak.all(merged_idx.edep == [60, 50, 100, 50, 1000, 20, 100, 200])
+
+    assert ak.all(buffer.evtid == [3])
+    assert ak.all(buffer.edep == [10])
+
+    assert mode == "append"
+
+    # for the final index the buffer is prepended and nothing is removed
+    merged_idx_end, buffer_end, mode = _merge_arrays(ak_obj, buffer_rows, 100, 100, True)
+
+    assert ak.all(merged_idx_end.evtid == [1, 1, 1, 1, 1, 1, 2, 2, 3])
+    assert ak.all(merged_idx_end.edep == [60, 50, 100, 50, 1000, 20, 100, 200, 10])
+
+    assert buffer_end is None
+
+
+@pytest.fixture
+def file_fixture(tmp_path):
+    # create a simple YAML file
+    data = {"det": 1}
+    yaml_file = tmp_path / "data.yaml"
+    with pathlib.Path.open(yaml_file, "w") as yf:
+        yaml.dump(data, yf)
+
+    # and the same data as JSON
+    json_file = tmp_path / "data.json"
+    with pathlib.Path.open(json_file, "w") as jf:
+        json.dump(data, jf)
+
+    # create a simple TXT file
+    txt_file = tmp_path / "data.txt"
+    with pathlib.Path.open(txt_file, "w") as tf:
+        tf.write("Some text.\n")
+
+    # return paths for the test functions
+    return {"yaml_file": yaml_file, "json_file": json_file, "txt_file": txt_file}
+
+
+def test_read(file_fixture):
+    json_dict = load_dict(file_fixture["json_file"], None)
+    assert json_dict["det"] == 1
+
+    yaml_dict = load_dict(file_fixture["yaml_file"], None)
+    assert yaml_dict["det"] == 1
+
+    with pytest.raises(NotImplementedError):
+        load_dict(file_fixture["txt_file"], None)
+
+
+@pytest.fixture
+def file_list(tmp_path):
+    # make a list of files
+    for i in range(5):
+        data = {"det": i}
+
+        # make a json file
+        json_file = tmp_path / f"data_{i}.json"
+        with pathlib.Path.open(json_file, "w") as jf:
+            json.dump(data, jf)
+
+        # and a text file
+        txt_file = tmp_path / f"data_{i}.txt"
+        with pathlib.Path.open(txt_file, "w") as tf:
+            tf.write("Some text.\n")
+    return pathlib.Path(tmp_path)
+
+
+def test_get_file_list(file_list):
+    first_file_list = get_file_list(str(pathlib.Path(file_list) / "data_0.json"))
+    assert len(first_file_list) == 1
+
+    json_file_list = get_file_list(str(pathlib.Path(file_list) / "data*.json"))
+    assert len(json_file_list) == 5
+
+    # repeated patterns should not produce duplicate files
+    json_file_list_repeat = get_file_list(
+        [str(pathlib.Path(file_list) / "data*.json"), str(pathlib.Path(file_list) / "data*.json")]
+    )
+    assert len(json_file_list_repeat) == 5
+
+    all_file_list = get_file_list([str(pathlib.Path(file_list) / "data*")])
+    assert len(all_file_list) == 10
+
+
+@pytest.fixture(scope="session")
+def test_data_configs():
+    ldata = LegendTestData()
+    ldata.checkout("5f9b368")
+    return ldata.get_path("legend/metadata/hardware/detectors/germanium/diodes")
+
+
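+# get_hpge and get_phy_vol resolve which object to construct either from an
+# explicit parameter ("meta_name" / "phy_vol_name") or, when that is absent,
+# by falling back to the detector name itself, as the two cases in each of
+# the tests below show.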
+def test_get_hpge(test_data_configs):
+    # specify the name in pars
+    hpge = get_hpge(str(test_data_configs), dict2tuple({"meta_name": "C99000A.json"}), "det001")
+    assert isinstance(hpge, HPGe)
+
+    # now read without meta_name
+    hpge_ic = get_hpge(str(test_data_configs), dict2tuple({}), "V99000A")
+    assert isinstance(hpge_ic, HPGe)
+
+
+def test_get_phy_vol():
+    gdml_path = configs / pathlib.Path("geom.gdml")
+
+    gdml = pyg4ometry.gdml.Reader(gdml_path).getRegistry()
+
+    # read with the phy_vol_name
+    phy = get_phy_vol(gdml, dict2tuple({"phy_vol_name": "det_phy_1"}), "det001")
+
+    assert isinstance(phy, pyg4ometry.geant4.PhysicalVolume)
+
+    # read without
+    phy = get_phy_vol(gdml, dict2tuple({}), "det_phy_0")
+    assert isinstance(phy, pyg4ometry.geant4.PhysicalVolume)
+
+
+@pytest.fixture
+def test_lh5_files(tmp_path):
+    n1 = 14002
+    tab1 = Table(size=n1)
+    tab1.add_field("a", Array(np.ones(n1)))
+    lh5.write(tab1, "hit/vertices", tmp_path / "file1.lh5", wo_mode="of")
+
+    n2 = 25156
+    tab2 = Table(size=n2)
+    tab2.add_field("a", Array(np.ones(n2)))
+
+    lh5.write(tab2, "hit/vertices", tmp_path / "file2.lh5", wo_mode="of")
+
+    n3 = int(1e7)
+    tab3 = Table(size=n3)
+    tab3.add_field("a", Array(np.ones(n3)))
+
+    lh5.write(tab3, "hit/vertices", tmp_path / "file3.lh5", wo_mode="of")
+
+    return tmp_path
+
+
+def test_get_n_sim(test_lh5_files):
+    # single file
+    n1 = get_num_simulated([str(test_lh5_files / "file1.lh5")])
+    assert n1 == [14002]
+
+    # two files
+    n12 = get_num_simulated([str(test_lh5_files / "file1.lh5"), str(test_lh5_files / "file2.lh5")])
+    assert n12 == [14002, 25156]
+
+    # file length > buffer
+    n123 = get_num_simulated(
+        [
+            str(test_lh5_files / "file1.lh5"),
+            str(test_lh5_files / "file2.lh5"),
+            str(test_lh5_files / "file3.lh5"),
+        ]
+    )
+    assert n123 == [14002, 25156, int(1e7)]
+
+
+@pytest.fixture
+def test_hit_file(tmp_path):
+    n1 = 10
+    tab_ch1 = Table(size=n1)
+    tab_ch1.add_field("global_evtid", Array(np.array([1, 4, 5, 6, 7, 9, 11, 12, 15, 17])))
+    lh5.write(tab_ch1, "det001/hit", tmp_path / "file1.lh5", wo_mode="of")
+
+    n2 = 5
+    tab_ch2 = Table(size=n2)
+    tab_ch2.add_field("global_evtid", Array(np.array([7, 8, 12, 13, 14])))
+    lh5.write(tab_ch2, "det002/hit", tmp_path / "file1.lh5", wo_mode="append")
+    return tmp_path / "file1.lh5"
+
+
+def test_num_evtid_hit_tier(test_hit_file):
+    # both channels: the maximum global_evtid over all channels
+    assert get_num_evtid_hit_tier(str(test_hit_file), ["det001", "det002"], "global_evtid") == 17
+
+    # one channel at a time
+    assert get_num_evtid_hit_tier(str(test_hit_file), ["det001"], "global_evtid") == 17
+    assert get_num_evtid_hit_tier(str(test_hit_file), ["det002"], "global_evtid") == 14
+
+
+def test_read_some_index(test_hit_file):
+    pass
+
+
+def test_global_evtid_range():
+    # raise an exception if n_evtid is too large
+    with pytest.raises(ValueError):
+        get_global_evtid_range(100, 1000, 200)
+
+    # check that we get the right ranges
+    assert get_global_evtid_range(200, 5, 2000) == (200, 204)
+    assert get_global_evtid_range(200, None, 2000) == (200, 1999)
+
+
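+# get_global_evtid makes evtids unique across input files: judging from the
+# two cases below, the local evtid is mapped to its index in the file's
+# vertex list and offset by first_evtid (the number of events in all earlier
+# files), so with vertices [0, 2, 4, ...] a local evtid of 4 becomes 2.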
+def test_get_global_evtid():
+    # single file
+    first_evtid = 0
+    vertices = [0, 1, 2, 3, 4, 5]
+    input_evtid = [2, 3, 4, 5, 5, 5]
+    obj = ak.Array({"evtid": input_evtid})
+    assert np.all(get_global_evtid(first_evtid, obj, vertices)._global_evtid == input_evtid)
+
+    # now if we only have some of the vertices
+    vertices = [0, 2, 4, 6, 8, 10]
+    input_evtid = [4, 6, 8, 10, 10, 10]
+    obj = ak.Array({"evtid": input_evtid})
+    assert np.all(
+        get_global_evtid(first_evtid, obj, vertices)._global_evtid == np.array(input_evtid) / 2.0
+    )
+
+
+def test_get_files_to_read():
+    n_sim = [1000, 1200, 200, 5000]
+    n_sim = np.concatenate([[0], np.cumsum(n_sim)])
+
+    # read all evtids and thus all files
+    assert np.all(get_files_to_read(n_sim, start_glob_evtid=0, end_glob_evtid=7399) == [0, 1, 2, 3])
+
+    # all of file 0
+    assert np.all(get_files_to_read(n_sim, start_glob_evtid=0, end_glob_evtid=999) == [0])
+
+    # and some of file 1
+    assert np.all(get_files_to_read(n_sim, start_glob_evtid=0, end_glob_evtid=1000) == [0, 1])
+
+    # some of files 0, 1 and 2
+    assert np.all(get_files_to_read(n_sim, start_glob_evtid=0, end_glob_evtid=2200) == [0, 1, 2])
+
+    # only files 1 and 2
+    assert np.all(get_files_to_read(n_sim, start_glob_evtid=1000, end_glob_evtid=2300) == [1, 2])
+
+    # only file 3
+    assert np.all(get_files_to_read(n_sim, start_glob_evtid=2400, end_glob_evtid=5000) == [3])
+
+
+def test_skip_chunk():
+    evtid = ak.Array([4, 5, 10, 30])
+
+    # a chunk is included if any of its evtids fall in the (inclusive) range
+    assert get_include_chunk(evtid, start_glob_evtid=0, end_glob_evtid=35)
+    assert get_include_chunk(evtid, start_glob_evtid=0, end_glob_evtid=4)
+    assert get_include_chunk(evtid, start_glob_evtid=30, end_glob_evtid=33)
+    assert not get_include_chunk(evtid, start_glob_evtid=0, end_glob_evtid=3)
+    assert not get_include_chunk(evtid, start_glob_evtid=31, end_glob_evtid=100)