Clean-up + pre-commit

NNPDF · Jan 8, 2025 · a4285c8 · a4285c8
1 parent 1cccebe
commit a4285c8
Show file tree

Hide file tree

Showing 2 changed files with 115 additions and 125 deletions.
diff --git a/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter.py b/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter.py
@@ -3,16 +3,17 @@
 '''
 
 import logging
+import os
 
 from filter_utils import Extractor
-import numpy as np
 
 logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
 
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
 
 if __name__ == "__main__":
-    CMS_WCHARM_TOT = Extractor("./metadata.yaml", "WPWM-TOT", mult_factor=1000)
+    CMS_WCHARM_TOT = Extractor(f"{CURRENT_DIR}/metadata.yaml", "WPWM-TOT", mult_factor=1000)
     CMS_WCHARM_TOT.generate_data()
 
-    CMS_WCHARM_RATIO = Extractor("./metadata.yaml", "WPWM-RATIO", mult_factor=1.0)
+    CMS_WCHARM_RATIO = Extractor(f"{CURRENT_DIR}/metadata.yaml", "WPWM-RATIO", mult_factor=1.0)
     CMS_WCHARM_RATIO.generate_data()
diff --git a/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter_utils.py b/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter_utils.py
@@ -1,4 +1,6 @@
+import functools
 import logging
+import os
 
 import numpy as np
 import yaml
@@ -8,28 +10,29 @@
 yaml.add_representer(float, prettify_float)
 
 MW2 = 80.385**2
-CMSLUMI13 = 2.5
 
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
 ART_LABEL = 'art_corr'
 STAT_LABEL = 'stat_uncorr'
 TABLE_TOKEN = 'Table'
 
 
 class Extractor:
-    """
-    Extracts kinematics, central data, and uncertainties for a given dataset
-
-    Parameters
-    ----------
-    metadata_file: str
-      Path to the metadata file
-    observable: str
-      The name of the observable for which the data is extracted. The name must
-      be listed in the metadata file.
-    """
 
     def __init__(self, metadata_file, observable, mult_factor=1):
-
+        """
+        Parameters
+        ----------
+        metadata_file: str
+            Path to the metadata file
+        observable: str
+            The name of the observable for which the data is extracted. The name
+            must be listed in the metadata file.
+        mult_factor: float
+            Multiplication factor to apply to the central data points. This is
+            useful to convert the data in the metadata file to the desired
+            units.
+        """
         # Open metadata and select process
         with open(metadata_file, 'r') as file:
             metadata = yaml.safe_load(file)
@@ -44,69 +47,51 @@ def __init__(self, metadata_file, observable, mult_factor=1):
             if self.metadata is None:
                 raise Exception(f"{observable} is not listed in the metadata file.")
 
-        # Initialise dict of tables
-        self.tables = {}
         self.observable = observable
         self.mult_factor = mult_factor
-        self.kin_labels = self.metadata['kinematic_coverage']
-        self.ndata = self.metadata['ndata']
 
-    def __retrieve_table(self, table_id):
+    @functools.cache
+    def _retrieve_table(self, table_id):
         """
-        Implementation of the lazy loading for the tables. If the table
-        is loaded for the first time, it is stored into an internal
-        container of the class, so that it will not be loaded each time.
-
-        When called, this functions checks if the table has already been stored
-        and, if that is the case, returns the stored table.
+        Load the table with the given index from the `rawdata` folder.
 
         Parameters
         ----------
         table_id: int
-          Index that specifies the table
+          Index that specifies the table.
 
         Return
         ------
-        The table specified by `table_id`. If not previously loaded, it is also
-        stored into the internal container for future use.
+        The table specified by `table_id`.
         """
-        try:
-            table = self.tables[str(table_id)]
-        except KeyError:
-            logging.debug(
-                f'Table {table_id} has not already been used or stored.' f' Storing the table...'
-            )
-            with open(f'./rawdata/{TABLE_TOKEN}{table_id}.yaml', 'r') as tab:
-                tab_dict = yaml.safe_load(tab)
-                self.tables[str(table_id)] = tab_dict
-                table = tab_dict
-        return table
+        with open(f'{CURRENT_DIR}/rawdata/{TABLE_TOKEN}{table_id}.yaml') as tab:
+            tab_dict = yaml.safe_load(tab)
+        return tab_dict
 
-    def __extract_kinematics(self, table: dict):
+    def _generate_kinematics(self):
         """
-        Extracts the kinematic variables of the single differential
-        distribution given a table.
-
-        For each bin, it computes the max, min, and mid value of the transverse
-        momentum of the boson.
-
-        Parameters
-        ----------
-        table: dict
-          Dictionary containing the bins in the transverse momentum
+        The function generates the kinematics by reading and processing it from
+        the referenced table. Kinematics is processed in the format of a list of
+        dictionaries. The keys in each dictionary specify the label (i.e. name)
+        for the kinematic variables. For this dataset, they are 'abs_eta' and 'm_W2'.
+        The labels are taken from the metadata file. The corresponding values are
+        'min', 'mid', and 'max'.
+
+        For this dataset, 'm_W2' is used in the computation of the (x,Q2)-map and
+        does not have any active role in the fit. For that reason, every bin has the
+        same value. Moreover, only the mid value is used.
+        """
+        logging.info(f"Generating kinematics for CMS_{self.observable}...")
 
-        Return
-        ------
-        List of bins containing min, max, and mid values for each of the kinematic
-        observables listed in the `kinematic_coverage` of the metadata file.
+        table_ID = self.metadata["tables"][0]
+        tab_dict = self._retrieve_table(table_ID)
 
-        """
-        data = table['independent_variables'][0]
-        label = self.kin_labels
+        data = tab_dict['independent_variables'][0]
+        label = self.metadata['kinematic_coverage']
         kinematics = []
-        for bin in data['values']:
-            abs_eta_min = bin['low']
-            abs_eta_max = bin['high']
+        for eta_bin in data['values']:
+            abs_eta_max = eta_bin['high']
+            abs_eta_min = eta_bin['low']
             kin_bin = {
                 label[0]: {
                     'min': abs_eta_min,
@@ -116,61 +101,47 @@ def __extract_kinematics(self, table: dict):
                 label[1]: {'min': None, 'mid': MW2, 'max': None},
             }
             kinematics.append(kin_bin)
-        return kinematics
-
-    def generate_kinematics(self):
-        """
-        Function that generates the kinematics by looping over all the
-        tables specified in the metadata file. The resulting kinematics
-        is then saved to a yaml file. It relies on the method
-        `__extract_kinematics`.
-        """
-
-        logging.info(f"Generating kinematics for ATLAS_{self.observable}...")
-
-        # Initialise kinematics list
-        kinematics = []
-        ndata = 0
-        table = self.metadata["tables"][0]
-        tab_dict = self.__retrieve_table(table)
-        kin = self.__extract_kinematics(tab_dict)
-        kinematics = np.concatenate([kinematics, kin])
-        ndata += len(kin)
 
         # Check number of data agrees with metadata
-        try:
-            assert self.metadata['ndata'] is not None
-            assert self.metadata['ndata'] == ndata
-        except AssertionError as e:
-            logging.warning(
-                f"The number of data in the metafile is either wrong or unspecified."
-                f" The correct number is {ndata}. Please, update the metafile."
+        ndata = len(kinematics)
+        if not self.metadata['ndata'] == ndata:
+            raise ValueError(
+                f"Mismatch in 'ndata': expected {self.metadata['ndata']}, but got {ndata}"
             )
-            return
-        return kinematics.tolist()
+        self.ndata = ndata
+        return kinematics
 
-    def generate_data_and_unc(self, mult_factor=1.0):
+    def _generate_data_and_unc(self):
         """
-        Same as `generate_kinematics`, but for central data points.
+        Return a list with central data points and two additional lists with the corresponding
+        statistical and systematic uncertainties. For this dataset, uncertainties are always
+        symmetric. Uncertainties are given as absolute values.
+
+        Note that, for the total x-sec, the correlation matrix is provided. The corresponding
+        covariance matrix is constructed in `_generate_covmat`.
         """
         logging.info(f"Generating central data for CMS_{self.observable}...")
         dat_central = []
         stat_unc = []
         asy_sys_unc = []
-        table = self.metadata['tables'][0]
-        tab_dict = self.__retrieve_table(table)
+        table_ID = self.metadata['tables'][0]
+        tab_dict = self._retrieve_table(table_ID)
 
         # Select data with pT > 25 GeV
         tab_dict = tab_dict['dependent_variables'][0]['values']
 
         # Loop over bins
         for rap_bin in tab_dict:
-            dat_central.append(rap_bin['value'] * mult_factor)
-            stat_unc.append(rap_bin['errors'][0]['symerror'] * mult_factor)
-            asy_sys_unc.append(rap_bin['errors'][1]['symerror'] * mult_factor)
+            dat_central.append(rap_bin['value'] * self.mult_factor)
+            stat_unc.append(rap_bin['errors'][0]['symerror'] * self.mult_factor)
+            asy_sys_unc.append(rap_bin['errors'][1]['symerror'] * self.mult_factor)
         return dat_central, stat_unc, asy_sys_unc
 
-    def __build_unc_definitions(self):
+    def _build_unc_definitions(self):
+        """
+        Build the dictionary containing the definitions of the uncertainties to be
+        used in the uncertainty data file.
+        """
         unc_definitions = {}
 
         # Statistical uncertainty
@@ -196,9 +167,22 @@ def __build_unc_definitions(self):
 
         return unc_definitions
 
-    def generate_covmat(self, diag_uncs=None):
-        table = self.metadata["tables"][1]
-        tab_dict = self.__retrieve_table(table)
+    def _generate_covmat(self, diag_uncs):
+        """
+        Generate the covariance matrix for the total x-sec. This function requires
+        the diagonal systematic uncertainties as argument. The diagonal uncertainties
+        are used to construct the covariance matrix from the correlation matrix stored
+        in the HepData table.
+
+        Note that such a correlation matrix exists for the total x-sec only, while the
+        ratio observable does not provide this information.
+        """
+        if not self.observable == 'WPWM-TOT':
+            raise ValueError(
+                "The construction of the covariance matrix is defined for the total x-sec only."
+            )
+        table_ID = self.metadata["tables"][1]
+        tab_dict = self._retrieve_table(table_ID)
         matlist = tab_dict['dependent_variables'][0]['values']
         matlist = [d['value'] for d in matlist]
         covmat = np.zeros((self.ndata, self.ndata))
@@ -208,64 +192,69 @@ def generate_covmat(self, diag_uncs=None):
         return covmat
 
     def generate_data(self):
-        '''
-        Collect central data, kinematics, and uncertainties and save them into
-        yaml files.
-        '''
+        """
+        The function collects central data, kinematics, and uncertainties and saves them
+        into yaml files.
+
+        The systematic uncertainties are given as percentages relative to the central data point.
+        The absolute value of the uncertainty is obtained from the central data point before
+        the shifts are applied.
+        """
         # Get central data and kinematics
-        central_data, stat_unc, sys_unc = self.generate_data_and_unc(self.mult_factor)
-        kinematics = self.generate_kinematics()
+        central_data, stat_unc, sys_unc = self._generate_data_and_unc()
+        kinematics = self._generate_kinematics()
 
         # Uncertainty definitions
-        unc_definitions = self.__build_unc_definitions()
+        unc_definitions = self._build_unc_definitions()
         sys_artificial = []  # Initialize vector of artificial uncertainties
 
         if self.observable == 'WPWM-TOT':
-            covmat = self.generate_covmat(sys_unc)
+            # Generate covmat and perform eigen decomposition
+            covmat = self._generate_covmat(sys_unc)
             eigvals, eigvecs = np.linalg.eig(covmat)
             art_unc = np.sqrt(eigvals) * eigvecs
 
             # Loop over bins
-            for data_idx, data in enumerate(central_data):
+            for data_idx in range(len(central_data)):
                 # Statistical uncertainty
                 unc_dict = {STAT_LABEL: stat_unc[data_idx]}
+
+                # Artificial systematic uncertainties
                 for sys_idx, art_sys in enumerate(art_unc[data_idx, :]):
                     unc_dict[f'{ART_LABEL}_{sys_idx+1}'] = float(art_sys)
+
+                # Append to list
                 sys_artificial.append(unc_dict)
 
         elif self.observable == 'WPWM-RATIO':
-            for data_idx, data in enumerate(central_data):
+            for data_idx in range(len(central_data)):
                 # Statistical uncertainty
                 unc_dict = {STAT_LABEL: stat_unc[data_idx]}
 
                 # Systematic uncertainty
                 unc_dict[f'{ART_LABEL}'] = sys_unc[data_idx]
                 sys_artificial.append(unc_dict)
-
-        # Local path for yaml files
-        path='./'
 
         # Save kinematics into file
         logging.info("Dumping kinematics to file...")
         kinematics_yaml = {'bins': kinematics}
-        with open(path + self.metadata['kinematics']['file'], 'w') as kin_out_file:
-            yaml.dump(kinematics_yaml, kin_out_file, sort_keys=False)
+        kins_file_name = self.metadata['kinematics']['file']
+        with open(CURRENT_DIR + '/' + kins_file_name, 'w') as file:
+            yaml.dump(kinematics_yaml, file, sort_keys=False)
         logging.info("Done!")
 
         # Save central data into file
         logging.info("Dumping kinematics to file...")
         dat_central_yaml = {'data_central': central_data}
-        file_name = self.metadata['data_central']
-        with open(path + file_name, 'w') as dat_out_file:
-            yaml.dump(dat_central_yaml, dat_out_file, sort_keys=False)
+        data_file_name = self.metadata['data_central']
+        with open(CURRENT_DIR + '/' + data_file_name, 'w') as file:
+            yaml.dump(dat_central_yaml, file, sort_keys=False)
         logging.info("Done!")
 
         # Save unertainties
         logging.info("Dumping kinematics to file...")
         uncertainties_yaml = {'definitions': unc_definitions, 'bins': sys_artificial}
-        file_name = (
-            self.metadata['data_uncertainties'][0]
-        )
-        with open(path + file_name, 'w') as dat_out_file:
-            yaml.dump(uncertainties_yaml, dat_out_file, sort_keys=False)
+        unc_file_name = self.metadata['data_uncertainties'][0]
+        with open(CURRENT_DIR + '/' + unc_file_name, 'w') as file:
+            yaml.dump(uncertainties_yaml, file, sort_keys=False)
         logging.info("Done!")