diff --git a/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter.py b/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter.py index 8ce27320c6..c0604b80d7 100644 --- a/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter.py +++ b/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter.py @@ -3,16 +3,17 @@ ''' import logging +import os from filter_utils import Extractor -import numpy as np logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s') +CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) if __name__ == "__main__": - CMS_WCHARM_TOT = Extractor("./metadata.yaml", "WPWM-TOT", mult_factor=1000) + CMS_WCHARM_TOT = Extractor(f"{CURRENT_DIR}/metadata.yaml", "WPWM-TOT", mult_factor=1000) CMS_WCHARM_TOT.generate_data() - CMS_WCHARM_RATIO = Extractor("./metadata.yaml", "WPWM-RATIO", mult_factor=1.0) + CMS_WCHARM_RATIO = Extractor(f"{CURRENT_DIR}/metadata.yaml", "WPWM-RATIO", mult_factor=1.0) CMS_WCHARM_RATIO.generate_data() diff --git a/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter_utils.py b/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter_utils.py index 1dbc3e44d5..691d251bdd 100644 --- a/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter_utils.py +++ b/nnpdf_data/nnpdf_data/commondata/CMS_WCHARM_7TEV/filter_utils.py @@ -1,4 +1,6 @@ +import functools import logging +import os import numpy as np import yaml @@ -8,28 +10,29 @@ yaml.add_representer(float, prettify_float) MW2 = 80.385**2 -CMSLUMI13 = 2.5 +CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) ART_LABEL = 'art_corr' STAT_LABEL = 'stat_uncorr' TABLE_TOKEN = 'Table' class Extractor: - """ - Extracts kinematics, central data, and uncertainties for a given dataset - - Parameters - ---------- - metadata_file: str - Path to the metadata file - observable: str - The name of the observable for which the data is extracted. The name must - be listed in the metadata file. 
- """ def __init__(self, metadata_file, observable, mult_factor=1): - + """ + Parameters + ---------- + metadata_file: str + Path to the metadata file + observable: str + The name of the observable for which the data is extracted. The name + must be listed in the metadata file. + mult_factor: float + Multiplication factor to apply to the central data points. This is + useful to convert the data in the metadata file to the desired + units. + """ # Open metadata and select process with open(metadata_file, 'r') as file: metadata = yaml.safe_load(file) @@ -44,69 +47,51 @@ def __init__(self, metadata_file, observable, mult_factor=1): if self.metadata is None: raise Exception(f"{observable} is not listed in the metadata file.") - # Initialise dict of tables - self.tables = {} self.observable = observable self.mult_factor = mult_factor - self.kin_labels = self.metadata['kinematic_coverage'] - self.ndata = self.metadata['ndata'] - def __retrieve_table(self, table_id): + @functools.cache + def _retrieve_table(self, table_id): """ - Implementation of the lazy loading for the tables. If the table - is loaded for the first time, it is stored into an internal - container of the class, so that it will not be loaded each time. - - When called, this functions checks if the table has already been stored - and, if that is the case, returns the stored table. + Implementation of the loading for the table. Parameters ---------- table_id: int - Index that specifies the table + Index that specifies the table. Return ------ - The table specified by `table_id`. If not previously loaded, it is also - stored into the internal container for future use. + The table specified by `table_id`. """ - try: - table = self.tables[str(table_id)] - except KeyError: - logging.debug( - f'Table {table_id} has not already been used or stored.' f' Storing the table...' 
- ) - with open(f'./rawdata/{TABLE_TOKEN}{table_id}.yaml', 'r') as tab: - tab_dict = yaml.safe_load(tab) - self.tables[str(table_id)] = tab_dict - table = tab_dict - return table + with open(f'{CURRENT_DIR}/rawdata/{TABLE_TOKEN}{table_id}.yaml') as tab: + tab_dict = yaml.safe_load(tab) + return tab_dict - def __extract_kinematics(self, table: dict): + def _generate_kinematics(self): """ - Extracts the kinematic variables of the single differential - distribution given a table. - - For each bin, it computes the max, min, and mid value of the transverse - momentum of the boson. - - Parameters - ---------- - table: dict - Dictionary containing the bins in the transverse momentum + The function generates the kinematics by reading and processing it from + the referenced table. Kinematics is processed in the format of a list of + dictionaries. The keys in each dictionary specify the label (i.e. name) + for the kinematic variables. For this dataset, they are 'abs_eta' and 'm_W2'. + The labels are taken from the metadata file. The corresponding values are + 'min', 'mid', and 'max'. + + For this dataset, 'm_W2' is used in the computation of the (x,Q2)-map and + does not have any active role in the fit. For that reason, every bin has the + same value. Moreover, only the mid value is used. + """ + logging.info(f"Generating kinematics for CMS_{self.observable}...") - Return - ------ - List of bins containing min, max, and mid values for each of the kinematic - observables listed in the `kinematic_coverage` of the metadata file. 
+ table_ID = self.metadata["tables"][0] + tab_dict = self._retrieve_table(table_ID) - """ - data = table['independent_variables'][0] - label = self.kin_labels + data = tab_dict['independent_variables'][0] + label = self.metadata['kinematic_coverage'] kinematics = [] - for bin in data['values']: - abs_eta_min = bin['low'] - abs_eta_max = bin['high'] + for eta_bin in data['values']: + abs_eta_max = eta_bin['high'] + abs_eta_min = eta_bin['low'] kin_bin = { label[0]: { 'min': abs_eta_min, @@ -116,61 +101,47 @@ def __extract_kinematics(self, table: dict): label[1]: {'min': None, 'mid': MW2, 'max': None}, } kinematics.append(kin_bin) - return kinematics - - def generate_kinematics(self): - """ - Function that generates the kinematics by looping over all the - tables specified in the metadata file. The resulting kinematics - is then saved to a yaml file. It relies on the method - `__extract_kinematics`. - """ - - logging.info(f"Generating kinematics for ATLAS_{self.observable}...") - - # Initialise kinematics list - kinematics = [] - ndata = 0 - table = self.metadata["tables"][0] - tab_dict = self.__retrieve_table(table) - kin = self.__extract_kinematics(tab_dict) - kinematics = np.concatenate([kinematics, kin]) - ndata += len(kin) # Check number of data agrees with metadata - try: - assert self.metadata['ndata'] is not None - assert self.metadata['ndata'] == ndata - except AssertionError as e: - logging.warning( - f"The number of data in the metafile is either wrong or unspecified." - f" The correct number is {ndata}. Please, update the metafile." + ndata = len(kinematics) + if not self.metadata['ndata'] == ndata: + raise ValueError( + f"Mismatch in 'ndata': expected {self.metadata['ndata']}, but got {ndata}" ) - return - return kinematics.tolist() + self.ndata = ndata + return kinematics - def generate_data_and_unc(self, mult_factor=1.0): + def _generate_data_and_unc(self): """ - Same as `generate_kinematics`, but for central data points. 
+ Return a list with central data points and two additional lists with the corresponding + statistical and systematic uncertainties. For this dataset, uncertainties are always + symmetric. Uncertainties are given as absolute values. + + Note that, for the total x-sec, the correlation matrix is provided. The corresponding + covariance matrix is constructed in `_generate_covmat`. """ logging.info(f"Generating central data for CMS_{self.observable}...") dat_central = [] stat_unc = [] asy_sys_unc = [] - table = self.metadata['tables'][0] - tab_dict = self.__retrieve_table(table) + table_ID = self.metadata['tables'][0] + tab_dict = self._retrieve_table(table_ID) # Select data with pT > 25 GeV tab_dict = tab_dict['dependent_variables'][0]['values'] # Loop over bins for rap_bin in tab_dict: - dat_central.append(rap_bin['value'] * mult_factor) - stat_unc.append(rap_bin['errors'][0]['symerror'] * mult_factor) - asy_sys_unc.append(rap_bin['errors'][1]['symerror'] * mult_factor) + dat_central.append(rap_bin['value'] * self.mult_factor) + stat_unc.append(rap_bin['errors'][0]['symerror'] * self.mult_factor) + asy_sys_unc.append(rap_bin['errors'][1]['symerror'] * self.mult_factor) return dat_central, stat_unc, asy_sys_unc - def __build_unc_definitions(self): + def _build_unc_definitions(self): + """ + Build the dictionary containing the definitions of the uncertainties to be + used in the uncertainty data file. + """ unc_definitions = {} # Statistical uncertainty @@ -196,9 +167,22 @@ def __build_unc_definitions(self): return unc_definitions - def generate_covmat(self, diag_uncs=None): - table = self.metadata["tables"][1] - tab_dict = self.__retrieve_table(table) + def _generate_covmat(self, diag_uncs): + """ + Generate the covariance matrix for the total x-sec. This function requires + the diagonal systematic uncertainties as argument. The diagonal uncertainties + are used to construct the covariance matrix from the correlation matrix stored + in the HepData table. 
+ + Note that such a correlation matrix exists for the total x-sec only, while the + ratio observable does not provide this information. + """ + if not self.observable == 'WPWM-TOT': + raise ValueError( + "The construction of the covariance matrix is defined for the total x-sec only." + ) + table_ID = self.metadata["tables"][1] + tab_dict = self._retrieve_table(table_ID) matlist = tab_dict['dependent_variables'][0]['values'] matlist = [d['value'] for d in matlist] covmat = np.zeros((self.ndata, self.ndata)) @@ -208,64 +192,69 @@ def generate_covmat(self, diag_uncs=None): return covmat def generate_data(self): - ''' - Collect central data, kinematics, and uncertainties and save them into - yaml files. - ''' + """ + The function collects central data, kinematics, and uncertainties and saves them + into yaml files. + + The systematic uncertainties are given as percentages relative to the central data point. + The absolute value of the uncertainty is obtained from the central data point before + the shifts are applied. 
+ """ # Get central data and kinematics - central_data, stat_unc, sys_unc = self.generate_data_and_unc(self.mult_factor) - kinematics = self.generate_kinematics() + central_data, stat_unc, sys_unc = self._generate_data_and_unc() + kinematics = self._generate_kinematics() # Uncertainty definitions - unc_definitions = self.__build_unc_definitions() + unc_definitions = self._build_unc_definitions() sys_artificial = [] # Initialize vector of artificial uncertainties if self.observable == 'WPWM-TOT': - covmat = self.generate_covmat(sys_unc) + # Generate covmat and perform eigen decomposition + covmat = self._generate_covmat(sys_unc) eigvals, eigvecs = np.linalg.eig(covmat) art_unc = np.sqrt(eigvals) * eigvecs # Loop over bins - for data_idx, data in enumerate(central_data): + for data_idx in range(len(central_data)): # Statistical uncertainty unc_dict = {STAT_LABEL: stat_unc[data_idx]} + + # Artificial systematic uncertainties for sys_idx, art_sys in enumerate(art_unc[data_idx, :]): unc_dict[f'{ART_LABEL}_{sys_idx+1}'] = float(art_sys) + + # Append to list sys_artificial.append(unc_dict) elif self.observable == 'WPWM-RATIO': - for data_idx, data in enumerate(central_data): + for data_idx in range(len(central_data)): # Statistical uncertainty unc_dict = {STAT_LABEL: stat_unc[data_idx]} # Systematic uncertainty unc_dict[f'{ART_LABEL}'] = sys_unc[data_idx] sys_artificial.append(unc_dict) - - # Local path for yaml files - path='./' # Save kinematics into file logging.info("Dumping kinematics to file...") kinematics_yaml = {'bins': kinematics} - with open(path + self.metadata['kinematics']['file'], 'w') as kin_out_file: - yaml.dump(kinematics_yaml, kin_out_file, sort_keys=False) + kins_file_name = self.metadata['kinematics']['file'] + with open(CURRENT_DIR + '/' + kins_file_name, 'w') as file: + yaml.dump(kinematics_yaml, file, sort_keys=False) logging.info("Done!") # Save central data into file logging.info("Dumping kinematics to file...") dat_central_yaml = 
{'data_central': central_data} - file_name = self.metadata['data_central'] - with open(path + file_name, 'w') as dat_out_file: - yaml.dump(dat_central_yaml, dat_out_file, sort_keys=False) + data_file_name = self.metadata['data_central'] + with open(CURRENT_DIR + '/' + data_file_name, 'w') as file: + yaml.dump(dat_central_yaml, file, sort_keys=False) logging.info("Done!") # Save unertainties logging.info("Dumping kinematics to file...") uncertainties_yaml = {'definitions': unc_definitions, 'bins': sys_artificial} - file_name = ( - self.metadata['data_uncertainties'][0] - ) - with open(path + file_name, 'w') as dat_out_file: - yaml.dump(uncertainties_yaml, dat_out_file, sort_keys=False) + unc_file_name = self.metadata['data_uncertainties'][0] + with open(CURRENT_DIR + '/' + unc_file_name, 'w') as file: + yaml.dump(uncertainties_yaml, file, sort_keys=False) logging.info("Done!")