responding to @melanieclarke and @braingram comments

spacetelescope · Jun 11, 2024 · 25ab568 · 25ab568
1 parent c278c8e
commit 25ab568
Show file tree

Hide file tree

Showing 8 changed files with 165 additions and 110 deletions.
diff --git a/docs/jwst/badpix_selfcal/arguments.rst b/docs/jwst/badpix_selfcal/arguments.rst
@@ -8,7 +8,7 @@ The ``badpix_selfcal`` step has the following optional arguments.
 
 ``--kernel_size`` (integer, default=15)
   The size of the kernel to use for the median filter, which is applied 
-  in the spectral (Y-axis) direction to make the smoothed image.
+  in the spectral direction to make the smoothed image.
 
 ``--save_flagged_bkgd`` (boolean, default=False)
   Whether to save the flagged background exposures to fits files.
diff --git a/docs/jwst/badpix_selfcal/description.rst b/docs/jwst/badpix_selfcal/description.rst
@@ -8,7 +8,7 @@ Overview
 --------
 The ``badpix_selfcal`` step flags bad pixels in the input data using a self-calibration 
 technique based on median filtering along the spectral axis. 
-When background exposures are available, those are used in combination with the science
+When additional exposures are available, those are used in combination with the science
 exposure to identify bad pixels; when unavailable, the step will be skipped with a warning
 unless the ``force_single`` parameter is set True. In that case, the science data alone is
 used as its own "background".
@@ -18,22 +18,27 @@ in the :ref:`calwebb_spec2 <calwebb_spec2>` pipeline.
 
 Input details
 -------------
-The input data must be in the form of a `~jwst.datamodels.IFUImageModel` or 
+The input data must be in the form of a `~jwst.datamodels.IFUImageModel` or
 a `~jwst.datamodels.ModelContainer` containing exactly one
-science exposure and any number of background exposures. A fits or association file 
+science exposure and any number of additional exposures.
+A fits or association file 
 that can be read into one of these data models is also acceptable.
+Any exposure with the metadata attribute ``asn.exptype`` set to 
+``background`` or ``selfcal`` will be used in conjunction with the science
+exposure to construct the combined background image. 
 
 Algorithm
 ---------
 The algorithm relies on the assumption that bad pixels are outliers in the data along
 the spectral axis. The algorithm proceeds as follows:
 
-* A combined background image is created. If background exposures are available, 
-  the pixelwise minimum of all background and science exposures is taken. 
-  If no background exposures are available, the science data itself is passed in 
+* A combined background image is created. If additional (``selfcal`` or ``background``)
+  exposures are available, 
+  the pixelwise minimum of all background, selfcal, and science exposures is taken. 
+  If no additional exposures are available, the science data itself is passed in 
   without modification, serving as the "background image" for the rest of the procedure, 
   i.e., true self-calibration.
-* The combined background image is median-filtered, ignoring NaNs, along the spectral (Y-) axis 
+* The combined background image is median-filtered, ignoring NaNs, along the spectral axis 
   with a user-specified kernel size. The default kernel size is 15 pixels.
 * The difference between the original background image and the median-filtered background image
   is taken. The highest- and lowest-flux pixels in this difference image are
@@ -42,7 +47,8 @@ the spectral axis. The algorithm proceeds as follows:
   using the ``flagfrac`` parameter. The total fraction of flagged pixels is thus 2x ``flagfrac``.
 * The bad pixels are flagged in the input data by setting the DQ flag to
   "OTHER_BAD_PIXEL" and "DO_NOT_USE".
-* The bad pixels are also flagged in each background exposure, if available.
+* The bad pixels are also flagged in each exposure with ``asn.exptype`` equal to ``background``,
+  if available.
 
 Output product
 --------------

diff --git a/jwst/badpix_selfcal/badpix_selfcal.py b/jwst/badpix_selfcal/badpix_selfcal.py
@@ -6,7 +6,8 @@
 
 
 def badpix_selfcal(minimg: np.ndarray,
-                   flagfrac: float = 0.001,
+                   flagfrac_lower: float = 0.001,
+                   flagfrac_upper: float = 0.001,
                    kernel_size: int = 15,
                    dispaxis=None) -> np.ndarray:
     """
@@ -18,8 +19,10 @@ def badpix_selfcal(minimg: np.ndarray,
         Selfcal data of shape (x, y), i.e., after some operation has
         already been taken to combine multiple exposures,
         typically a MIN operation.
-    flagfrac: float
-        Fraction of pixels to flag on each of low and high end
+    flagfrac_lower: float
+        Fraction of pixels to flag on the low end
+    flagfrac_upper: float
+        Fraction of pixels to flag on the high end
     kernel_size: int
         Size of kernel for median filter
     dispaxis: int
@@ -32,9 +35,11 @@ def badpix_selfcal(minimg: np.ndarray,
         Indices of the flagged pixels,
         shaped like output from np.where
     """
-    if flagfrac < 0 or flagfrac >= 0.5:
-        raise ValueError("flagfrac must be between 0 and 0.5. \
-                        Note this fraction will be flagged on both high and low ends.")
+    if (flagfrac_lower < 0) or (flagfrac_upper < 0):
+        raise ValueError("Flagged fraction must be between 0 and 0.5")
+    if flagfrac_lower + flagfrac_upper >= 1:
+        raise ValueError("Total flagged fraction (lower plus upper) must be less than 1")
+
     if dispaxis not in [1, 2, None]:
         raise ValueError("dispaxis must be either 1 or 2, or None.")
 
@@ -51,7 +56,7 @@ def badpix_selfcal(minimg: np.ndarray,
     minimg_hpf = minimg - smoothed
 
     # Flag outliers using percentile cutoff
-    flag_low, flag_high = np.nanpercentile(minimg_hpf, [flagfrac * 100, (1 - flagfrac) * 100])
+    flag_low, flag_high = np.nanpercentile(minimg_hpf, [flagfrac_lower * 100, (1 - flagfrac_upper) * 100])
     bad = (minimg_hpf > flag_high) | (minimg_hpf < flag_low)
     flagged_indices = np.where(bad)
 
@@ -77,7 +82,7 @@ def apply_flags(input_model: dm.IFUImageModel, flagged_indices: np.ndarray) -> d
         Flagged data model
     """
 
-    input_model.dq[flagged_indices] = pixel["DO_NOT_USE"] + pixel["OTHER_BAD_PIXEL"]
+    input_model.dq[flagged_indices] += pixel["DO_NOT_USE"] + pixel["OTHER_BAD_PIXEL"]
 
     input_model.data[flagged_indices] = np.nan
     input_model.err[flagged_indices] = np.nan

diff --git a/jwst/badpix_selfcal/badpix_selfcal_step.py b/jwst/badpix_selfcal/badpix_selfcal_step.py
@@ -4,7 +4,6 @@
 from . import badpix_selfcal
 import numpy as np
 from jwst import datamodels as dm
-from jwst.master_background.master_background_step import split_container
 
 import logging
 log = logging.getLogger(__name__)
@@ -21,14 +20,16 @@ class BadpixSelfcalStep(Step):
     All input exposures in the association file (or manually-provided bkg_list) are combined
     into a single background model using a MIN operation. The bad pixels are then identified
     using a median filter and percentile cutoffs, and applied to the science data by setting
-    the flagged pixels to NaN and the DQ flag to DO_NOT_USE + OTHER_BAD_PIXEL.
+    the flagged pixels, errors, and variances to NaN
+    and the DQ flag to DO_NOT_USE + OTHER_BAD_PIXEL.
     """
 
     class_alias = "badpix_selfcal"
     bkg_suffix = "badpix_selfcal"
 
     spec = """
-    flagfrac = float(default=0.001)  #fraction of pixels to flag on each of low and high end
+    flagfrac_lower = float(default=0.001)  #fraction of pixels to flag on the low-flux end
+    flagfrac_upper = float(default=0.001)  #fraction of pixels to flag on the high-flux end
     kernel_size = integer(default=15)  #size of kernel for median filter
     force_single = boolean(default=False)  #force single input exposure
     skip = boolean(default=True)
@@ -41,9 +42,9 @@ def process(self, input, selfcal_list=[]):
         Parameters
         ----------
         input: JWST data model or association
-            input science data to be corrected
+            input science data to be corrected, or tuple of (sci, bkg, selfcal)
 
-        selfcal_list: list of ImageModels or filenames to use for selfcal
+        selfcal_list: list of user-defined ImageModels or filenames to use for selfcal
 
         Returns
         -------
@@ -62,13 +63,15 @@ def process(self, input, selfcal_list=[]):
         In that case, the input exposure will be used as the sole background exposure,
         i.e., true self-calibration.
         """
-        input_sci, selfcal_list, bkg_list_asn = _parse_inputs(input, selfcal_list)
 
+        input_sci, selfcal_list, bkg_list_asn = _parse_inputs(input, selfcal_list)
         # ensure that there are background exposures to use, otherwise skip the step
         # unless forced
         if (len(selfcal_list) == 0) and (not self.force_single):
-            log.warning("No background exposures provided for self-calibration. Skipping step.")
-            self.record_step_status(input_sci, "badpix_selfcal", success=False)
+            log.warning("No selfcal or background exposures provided for self-calibration. \
+                        Skipping step. If you want to force self-calibration with the science \
+                        exposure alone (generally not recommended), set force_single=True.")
+            input_sci.meta.cal_step.badpix_selfcal = 'SKIPPED'
             return input_sci, bkg_list_asn
 
         # get the dispersion axis
@@ -79,43 +82,39 @@ def process(self, input, selfcal_list=[]):
                         Kernel for self-calibration will be two-dimensional.")
             dispaxis = None
 
-        # open all selfcal exposures
+        # collapse all selfcal exposures into a single background model
         # note that selfcal_list includes the science exposure. This is expected.
         # all exposures are combined into a single background model using a MIN operation.
         selfcal_list = [input_sci] + [dm.open(k) for k in selfcal_list]
-
-        # collapse background dithers into a single background model
         selfcal_3d = []
         for i, selfcal_model in enumerate(selfcal_list):
             selfcal_3d.append(selfcal_model.data)
         minimg = np.nanmin(np.asarray(selfcal_3d), axis=0)
-        bad_indices = badpix_selfcal.badpix_selfcal(minimg, self.flagfrac, self.kernel_size, dispaxis)
+        bad_indices = badpix_selfcal.badpix_selfcal(minimg, self.flagfrac_lower, self.flagfrac_upper, self.kernel_size, dispaxis)
 
         # apply the flags to the science data
         input_sci = badpix_selfcal.apply_flags(input_sci, bad_indices)
 
         # apply the flags to the background data to be passed to background sub step
         if len(bkg_list_asn) > 0:
             for i, background_model in enumerate(bkg_list_asn):
-                bkg_list_asn[i] = badpix_selfcal.apply_flags(background_model, bad_indices)
+                bkg_list_asn[i] = badpix_selfcal.apply_flags(dm.open(background_model), bad_indices)
 
-        self.record_step_status(input_sci, "badpix_selfcal", success=True)
+        input_sci.meta.cal_step.badpix_selfcal = 'COMPLETE'
+        return input_sci, bkg_list_asn
 
-        return input_sci, *bkg_list_asn
 
-
-def _parse_inputs(input, selfcal_list=[]):
+def _parse_inputs(input, selfcal_list):
     """
     Parse the input to the step. This is a helper function that is used in the
     command line interface to the step.
 
     Parameters
     ----------
     input: str
-        input file or association
+        input file or association, or tuple of (sci, bkg, selfcal)
 
-    selfcal_list: list
-        ImageModels or filenames to use for selfcal
+    selfcal_list: list of user-supplied ImageModels or filenames to use for selfcal
 
     Returns
     -------
@@ -124,30 +123,66 @@ def _parse_inputs(input, selfcal_list=[]):
 
     selfcal_list: list of ImageModels or filenames to use for selfcal
     """
-    with dm.open(input) as input_data:
+    selfcal_list_user = selfcal_list
+
+    if isinstance(input, tuple):
+        input_sci = dm.open(input[0])
+        selfcal_list = list(input[1]) + list(input[2])
+        bkg_list_asn = input[1]
 
-        # find science and background exposures.
-        if isinstance(input_data, dm.ModelContainer):
+    else:
+        with dm.open(input) as input_data:
 
-            sci_models, bkg_list_asn, selfcal_list_asn = split_container(input_data,
-                                                                         fields=['science', 'background', 'selfcal'])
+            # find science and background exposures.
+            if isinstance(input_data, dm.ModelContainer):
+
+                sci_models, bkg_list_asn, selfcal_list_asn = split_container_by_asn_exptype(
+                    input_data, exptypes=['science', 'background', 'selfcal'])
 
-            if len(selfcal_list) == 0:
                 selfcal_list = list(bkg_list_asn) + list(selfcal_list_asn)
+
+                if len(sci_models) > 1:
+                    raise ValueError("Input data contains multiple science exposures. \
+                                     This is not supported in calwebb_spec2 steps.")
+                input_sci = sci_models[0]
+
+            elif isinstance(input_data, dm.IFUImageModel) or isinstance(input_data, dm.ImageModel):
+
+                input_sci = input_data
+                bkg_list_asn = []
+
             else:
-                log.warning("selfcal_list provided directly as input, ignoring background \
-                            and selfcal exposure types in the association file")
+                raise TypeError("Input data is not a ModelContainer, ImageModel, or IFUImageModel.\
+                                Cannot continue.")
+
+    if len(selfcal_list_user) > 0:
+        selfcal_list = selfcal_list_user
+        log.warning("selfcal_list provided directly as input, ignoring any other \
+                    input background and selfcal exposure types, from e.g. an input \
+                    association file")
+    return input_sci, selfcal_list, bkg_list_asn
 
-            # in calwebb_spec2 there should be only a single science exposure in an association
-            input_sci = sci_models[0]
 
-        elif isinstance(input_data, dm.IFUImageModel) or isinstance(input_data, dm.ImageModel):
+def split_container_by_asn_exptype(container: dm.ModelContainer, exptypes: list) -> list:
+    """
+    Split a ModelContainer into a list of ImageModels based on exposure type.
 
-            input_sci = input_data
-            bkg_list_asn = []
+    Parameters
+    ----------
+    container: ModelContainer
+        input data
 
-        else:
-            raise TypeError("Input data is not a ModelContainer, ImageModel, or IFUImageModel.\
-                            Cannot continue.")
+    exptype: str
+        exposure type to split on
 
-    return input_sci, selfcal_list, bkg_list_asn
+    Returns
+    -------
+    split_list: list of lists
+        lists of ImageModels
+    """
+    split_list = []
+    for exptype in exptypes:
+        good_models = [container[j] for j in range(len(container)) if container[j].meta.asn.exptype == exptype]
+        split_list.append(good_models)
+
+    return split_list
diff --git a/jwst/badpix_selfcal/tests/test_badpix_selfcal.py b/jwst/badpix_selfcal/tests/test_badpix_selfcal.py
@@ -202,7 +202,7 @@ def test_background_flagger_mrs(background):
 
     # pass into the MRSBackgroundFlagger and check it found the right pixels
     flagfrac = 0.001
-    result = badpix_selfcal(bg, flagfrac=flagfrac)
+    result = badpix_selfcal(bg, flagfrac_lower=flagfrac, flagfrac_upper=flagfrac)
     result_tuples = [(i, j) for i, j in zip(*result)]
 
     # check that the hot pixels were among those flagged
@@ -248,10 +248,14 @@ def test_badpix_selfcal_step(request, dset):
 
     assert result[0].meta.cal_step.badpix_selfcal == "COMPLETE"
     if dset == "sci":
-        assert len(result) == 1
+        assert len(result) == 2
+        assert len(result[0]) == 1
+        assert len(result[1]) == 0
     else:
-        # should return sci, bkg0, bkg1 but not selfcal0, selfcal1
-        assert len(result) == 3
+        # should return sci, (bkg0, bkg1) but not selfcal0, selfcal1
+        assert len(result) == 2
+        assert len(result[0]) == 1
+        assert len(result[1]) == 2
 
 
 def test_expected_fail_sci(sci):