diff --git a/.gitignore b/.gitignore index 8e0e410f..9a719ac5 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ .ipynb_checkpoints/ \#* +docs/_build/ runinfo/ plot/ plotting/plot/ diff --git a/README.md b/README.md index 2215dcd9..2e4095d4 100644 --- a/README.md +++ b/README.md @@ -10,750 +10,64 @@ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) Repository for Commissioning studies in the BTV POG based on (custom) nanoAOD samples -Detailed documentation in [btv-wiki](https://btv-wiki.docs.cern.ch/SoftwareAlgorithms/BTVNanoCommissioning/) -## Requirements -### Setup -:heavy_exclamation_mark: suggested to install under `bash` environment +This framework is based on [coffea](https://coffeateam.github.io/coffea/) and using [btvnano](https://btv-wiki.docs.cern.ch/SoftwareAlgorithms/PFNano/) as input. The framework is also used as frontend for the btv automation task [autobtv](https://gitlab.cern.ch/cms-analysis/btv/software-and-algorithms/autobtv) -``` -# only first time, including submodules -git clone --recursive git@github.com:cms-btv-pog/BTVNanoCommissioning.git - -# activate enviroment once you have coffea framework -conda activate btv_coffea -# Or if you are using lxplus, can use exist framework -conda activate /eos/user/m/milee/miniconda3/envs/btv_coffea - -# when modifying the framework -pip install -e . -``` -### Coffea installation with Micromamba -For installing Micromamba, see [[here](https://mamba.readthedocs.io/en/latest/installation/micromamba-installation.html)] -``` -wget -L micro.mamba.pm/install.sh -# Run and follow instructions on screen -bash install.sh -``` -NOTE: always make sure that conda, python, and pip point to local micromamba installation (`which conda` etc.). 
- -You can simply create the environment through the existing `test_env.yml` under your micromamba environment using micromamba, and activate it -``` -micromamba env create -f test_env.yml -micromamba activate btv_coffea -``` - -Once the environment is set up, compile the python package: -``` -pip install -e . -pip install -e .[dev] # for developer -``` - -### Other installation options for coffea -See https://coffeateam.github.io/coffea/installation.html - -## Quick launch of all tasks - -Now you can use various shell scripts to directly launch the runner scripts with predefined scaleouts. You can modify and customize the scripts inside the ```scripts/submit``` directory according to your needs. Each script takes arguments from ```arguments.txt``` directory, that has 4 inputs i.e. - ```Campaign name```, ```year```, ```executor``` and ```luminosity```. To launch any workflow, for example W+c -``` -./ctag_wc.sh arguments.txt -``` -Additional scripts are provided to make a directory structure that creates directories locally and copies them in the remote BTV eos area [https://btvweb.web.cern.ch/Commissioning/dataMC/](https://btvweb.web.cern.ch/Commissioning/dataMC/). -Finally plots can be directly monitored in the webpage. - -## Structure - -Each workflow can be a separate "processor" file, creating the mapping from NanoAOD to -the histograms we need. Workflow processors can be passed to the `runner.py` script -along with the fileset these should run over. Multiple executors can be chosen -(for now iterative - one by one, uproot/futures - multiprocessing and dask-slurm). - -To test a small set of files to see whether the workflows run smoothly, run: -``` -python runner.py --workflow ttsemilep_sf --json metadata/test_bta_run3.json --campaign Summer23 --year 2023 -``` - -More options for `runner.py` -
more options -

- -``` ---wf {validation,ttcom,ttdilep_sf,ttsemilep_sf,emctag_ttdilep_sf,ctag_ttdilep_sf,ectag_ttdilep_sf,ctag_ttsemilep_sf,ectag_ttsemilep_sf,ctag_Wc_sf,ectag_Wc_sf,ctag_DY_sf,ectag_DY_sf,BTA,BTA_addPFMuons,BTA_addAllTracks,BTA_ttbar}, --workflow {validation,ttcom,ttdilep_sf,ttsemilep_sf,emctag_ttdilep_sf,ctag_ttdilep_sf,ectag_ttdilep_sf,ctag_ttsemilep_sf,ectag_ttsemilep_sf,ctag_Wc_sf,ectag_Wc_sf,ctag_DY_sf,ectag_DY_sf,BTA,BTA_addPFMuons,BTA_addAllTracks,BTA_ttbar} - Which processor to run - -o OUTPUT, --output OUTPUT - Output histogram filename (default: hists.coffea) - --samples SAMPLEJSON, --json SAMPLEJSON - JSON file containing dataset and file locations - (default: dummy_samples.json) - --year YEAR Year - --campaign CAMPAIGN Dataset campaign, change the corresponding correction - files{ "Rereco17_94X","Winter22Run3","Summer23","Summer23BPix","Summer22","Summer22EE","2018_UL","2017_UL","2016preVFP_UL","2016postVFP_UL","prompt_dataMC"} - --isSyst Run with systematics, all, weight_only(no JERC uncertainties included),JERC_split, None(not extract) - --isArray Output root files - --noHist Not save histogram coffea files - --overwrite Overwrite existing files - --executor {iterative,futures,parsl/slurm,parsl/condor,parsl/condor/naf_lite,dask/condor,dask/condor/brux,dask/slurm,dask/lpc,dask/lxplus,dask/casa} - The type of executor to use (default: futures). Other options can be implemented. 
For - example see https://parsl.readthedocs.io/en/stable/userguide/configuring.html- - `parsl/slurm` - tested at DESY/Maxwell- `parsl/condor` - tested at DESY, RWTH- - `parsl/condor/naf_lite` - tested at DESY- `dask/condor/brux` - tested at BRUX (Brown U)- - `dask/slurm` - tested at DESY/Maxwell- `dask/condor` - tested at DESY, RWTH- `dask/lpc` - - custom lpc/condor setup (due to write access restrictions)- `dask/lxplus` - custom - lxplus/condor setup (due to port restrictions) - -j WORKERS, --workers WORKERS - Number of workers (cores/threads) to use for multi- worker executors (e.g. futures or condor) (default: - 3) - -s SCALEOUT, --scaleout SCALEOUT - Number of nodes to scale out to if using slurm/condor. - Total number of concurrent threads is ``workers x - scaleout`` (default: 6) - --memory MEMORY Memory used in jobs (in GB) ``(default: 4GB) - --disk DISK Disk used in jobs ``(default: 4GB) - --voms VOMS Path to voms proxy, made accessible to worker nodes. - By default a copy will be made to $HOME. - --chunk N Number of events per process chunk - --retries N Number of retries for coffea processor - --fsize FSIZE (Specific for dask/lxplus file splitting, default: 50) Numbers of files processed per - dask-worker - --index INDEX (Specific for dask/lxplus file splitting, default: 0,0) Format: - $dict_index_start,$file_index_start,$dict_index_stop,$file_index_stop. Stop indices are - optional. $dict_index refers to the index, splitted $dict_index and $file_index with ',' - $dict_index refers to the sample dictionary of the samples json file. $file_index refers to the N-th batch of files per dask-worker, with its size being defined by the option --index. The job will start (stop) submission from (with) the corresponding indices. - --validate Do not process, just check all files are accessible - --skipbadfiles Skip bad files. 
- --only ONLY Only process specific dataset or file - --limit N Limit to the first N files of each dataset in sample - JSON - --max N Max number of chunks to run in total -``` -

-
- -### Roadmap for running the tool (for commissioning tasks) - -1. Is the `.json` file ready? If not, create it following the instructions in the [Make the json files](#make-the-json-files) section. Please use the correct naming scheme - -2. Add the `lumiMask`, correction files (SFs, pileup weight), and JER, JEC files under the dict entry in `utils/AK4_parameters.py`. See details in [Correction files configurations](#correction-files-configurations) - -3. If the JERC file `jec_compiled.pkl.gz` is missing in the `data/JME/${campaign}` directory, create it through [Create compiled JERC file](#create-compiled-jerc-filepklgz) - -4. If selections and output histogram/arrays need to be changed, modify the dedicated `workflows` - -5. Run the workflow with dedicated input and campaign name. Example commands for Run 3 can be found [here](#commands-for-different-phase-space). For first usage, the JERC file needs to be recompiled first, see [Create compiled JERC file](#create-compiled-jerc-filepklgz). You can also specify `--isArray` to store the skimmed root files - -6. Fetch the failed files to obtain events that have been processed and events that have to be resubmitted using `scripts/dump_processed.py`. Check the luminosity of the processed dataset used for the plotting script and re-run failed jobs if needed (details in [get procssed info](#get-processed-information)) - -7. Once you obtain the `.coffea` file(s), you can make plots using the [plotting scripts](#plotting-code), if the xsection for your sample is missing, please add it to `src/BTVNanoCommissioning/helpers/xsection.py` - -Check out [notes for developer](https://btv-wiki.docs.cern.ch/SoftwareAlgorithms/BTVNanoCommissioning/#notes-for-developers) for more info! - -### Commands for different phase space - -After a small test, you can run the full campaign for a dedicated phase space, separately for data and for MC. 
- -``` - python runner.py --workflow $WF --json metadata/$JSON --campaign $CAMPAIGN --year $YEAR (--executor ${scaleout_site}) -``` - - -#### b-SFs - -
details -

- -- Dileptonic ttbar phase space : check performance for btag SFs, emu channel - -``` -python runner.py --workflow ttdiilep_sf --json metadata/data_Summer23_2023_em_BTV_Run3_2023_Comm_MINIAODv4_NanoV12.json --campaign Summer23 --year 2023 (--executor ${scaleout_site}) -``` - -- Semileptonic ttbar phase space : check performance for btag SFs, muon channel - -``` -python runner.py --workflow ttsemilep_sf --json metadata/data_Summer23_2023_mu_BTV_Run3_2023_Comm_MINIAODv4_NanoV12.json --campaign Summer23 --year 2023 (--executor ${scaleout_site}) -``` - -

-
- -#### c-SFs -
details -

- -- Dileptonic ttbar phase space : check performance for charm SFs, bjets enriched SFs, muon channel - -``` -python runner.py --workflow ctag_ttdilep_sf --json metadata/data_Summer23_2023_mu_BTV_Run3_2023_Comm_MINIAODv4_NanoV12.json --campaign Summer23 --year 2023(--executor ${scaleout_site}) -``` - - -- Semileptonic ttbar phase space : check performance for charm SFs, bjets enriched SFs, muon channel - -``` -python runner.py --workflow ctag_ttsemilep_sf --json metadata/data_Summer23_2023_mu_BTV_Run3_2023_Comm_MINIAODv4_NanoV12.json --campaign Summer23 --year 2023(--executor ${scaleout_site}) -``` - -- W+c phase space : check performance for charm SFs, cjets enriched SFs, muon channel - -``` -python runner.py --workflow ctag_Wc_sf --json metadata/data_Summer23_2023_mu_BTV_Run3_2023_Comm_MINIAODv4_NanoV12.json --campaign Summer23 --year 2023(--executor ${scaleout_site}) -``` - -- DY phase space : check performance for charm SFs, light jets enriched SFs, muon channel - -``` -python runner.py --workflow ctag_DY_sf --json metadata/data_Summer23_2023_mu_BTV_Run3_2023_Comm_MINIAODv4_NanoV12.json --campaign Summer23 --year 2023(--executor ${scaleout_site}) -``` - -

-
- -#### Validation - check different campaigns - -
details -

- -Only basic jet selections(PUID, ID, pT, $\eta$) applied. Put the json files with different campaigns, plot ROC & efficiency - -``` -python runner.py --workflow valid --json metadata/$json file -``` - -

-
- -#### BTA - BTagAnalyzer Ntuple producer - -Based on Congqiao's [development](notebooks/BTA_array_producer.ipynb) to produce BTA ntuples based on PFNano. - -:exclamation: Only the newest version [BTV_Run3_2023_Comm_MINIAODv4](https://github.com/cms-btv-pog/btvnano-prod) ntuples work. Example files are given in [this](metadata/test_bta_run3.json) json. Optimize the chunksize(`--chunk`) in terms of the memory usage. This depends on sample, if the sample has huge jet collection/b-c hardons. The more info you store, the more memory you need. I would suggest to test with `iterative` to estimate the size. - -
details -

- -Run with the nominal `BTA` workflow to include the basic event variables, jet observables, and GEN-level quarks, hadrons, leptons, and V0 variables. -``` -python runner.py --wf BTA --json metadata/test_bta_run3.json --campaign Summer22EE --isJERC -``` - -Run with the `BTA_addPFMuons` workflow to additionally include the `PFMuon` and `TrkInc` collection, used by the b-tag SF derivation with the QCD(μ) methods. -``` -python runner.py --wf BTA_addPFMuons --json metadata/test_bta_run3.json --campaign Summer22EE --isJERC -``` +This framework is based on [coffea processor](https://coffeateam.github.io/coffea/concepts.html#coffea-processor). Each workflow can be a separate **processor** file in the `workflows`, creating the mapping from `PFNano` to the histograms as `coffea` file or creating `.root` files by saving awkward arrays. Workflow processors can be passed to the `runner.py` script along with the fileset these should run over. Multiple executors can be chosen +(`iterative` - one by one, `futures` - multiprocessing). Scale-out to clusters depends on the facility. The histograms can be plotted (`.pdf`) or saved to template `.root` files with dedicated scripts -Run with the `BTA_addAllTracks` workflow to additionally include the `Tracks` collection, used by the JP variable calibration. -``` -python runner.py --wf BTA_addAllTracks --json metadata/test_bta_run3.json --campaign Summer22EE --isJERC +The minimal command is shown below, specifying the selection (workflow), dataset, campaign and year ``` - -

-
- -## Scale-out - -Scale out can be notoriously tricky between different sites. Coffea's integration of `slurm` and `dask` -makes this quite a bit easier and for some sites the ``native'' implementation is sufficient, e.g Condor@DESY. -However, some sites have certain restrictions for various reasons, in particular Condor @CERN and @FNAL. The scaleout scheme is named as follows: `$cluster_schedule_system/scheduler/site`. The existing sites are documented in [sites configuration](#sites-configuration-with-daskparsl-schedular) while [standalone condor submission](#standalone-condor-jobslxpluscmsconnect) is possible and strongly suggested when working on lxplus. - - -Memory usage is also useful to adapt to cluster. Check the memory by calling `memory_usage_psutil()` from `helpers.func.memory_usage_psutil` to optimize job size. Example with `ectag_Wc_sf` summarized below. - -
details -

- - Type |Array+Hist | Hist only| Array Only| -| :---: | :---: | :---: | :---: | -DoubleMuon (BTA,BTV_Comm_v2)| 1243MB | 848MB |1249MB| -DoubleMuon (PFCands, BTV_Comm_v1)|1650MB |1274MB |1632MB| -DoubleMuon (Nano_v11)|1183MB| 630MB |1180MB| -WJets_inc (BTA,BTV_Comm_v2)| 1243MB |848MB |1249MB| -WJets_inc (PFCands, BTV_Comm_v1)|1650MB |1274MB |1632MB -WJets_inc (Nano_v11)|1183MB |630MB |1180MB| - -

-
- -### Sites configuration with dask/parsl schedular - -
details -

- -#### Condor@FNAL (CMSLPC) -Follow setup instructions at https://github.com/CoffeaTeam/lpcjobqueue. After starting -the singularity container run with -```bash -python runner.py --wf ttcom --executor dask/lpc +python runner.py --workflow ttsemilep_sf --json metadata/test_bta_run3.json --campaign Summer22EERun3 --year 2022 ``` +- Detailed documentation [here](https://btvnanocommissioning.readthedocs.io/en/latest/) +- To run the commissioning tasks or produce the templates: go to [Preparation for commissioning/SFs tasks](https://btvnanocommissioning.readthedocs.io/en/latest/user.html) +- To develop a new workflow, see the [Instruction for developers](https://btvnanocommissioning.readthedocs.io/en/latest/user.html) +- Current work in progress: [issues](https://gitlab.cern.ch/cms-btv-coordination/tasks/-/issues/?label_name%5B%5D=Software%3A%3A%20BTVnano%20%26CommFW) -#### Condor@CERN (lxplus) -Only one port is available per node, so its possible one has to try different nodes until hitting -one with `8786` being open. Other than that, no additional configurations should be necessary. -```bash -python runner.py --wf ttcom --executor dask/lxplus -``` - -jobs automatically split to 50 files per jobs to avoid job failure due to crowded cluster on lxplus with the naming scheme `hist_$workflow_$json_$dictindex_$fileindex.coffea`. The `.coffea` files can be then combined at plotting level - - -:exclamation: The optimal scaleout options on lxplus are `-s 50 --chunk 50000` - -To deal with unstable condor cluster and dask worker on lxplus, you can resubmit failure jobs via `--index $dictindex,$fileindex` option. `$dictindex` refers to the index in the `.json dict`, `$fileindex` refers to the index of the file list split to 50 files per dask-worker. The total number of files of each dict can be computed by `math.ceil(len($filelist)/50)` The job will start from the corresponding indices. 
- -#### Coffea-casa (Nebraska AF) -Coffea-casa is a JupyterHub based analysis-facility hosted at Nebraska. For more information and setup instuctions see -https://coffea-casa.readthedocs.io/en/latest/cc_user.html - -After setting up and checking out this repository (either via the online terminal or git widget utility run with -```bash -python runner.py --wf ttcom --executor dask/casa -``` -Authentication is handled automatically via login auth token instead of a proxy. File paths need to replace xrootd redirector with "xcache", `runner.py` does this automatically. - - -#### Condor@DESY -```bash -python runner.py --wf ttcom --executor dask/condor(parsl/condor) -``` - -#### Maxwell@DESY -```bash -python runner.py --wf ttcom --executor parsl/slurm -``` -

-
+## Setup -### Standalone condor jobs@lxplus/cmsconnect +You can install your [standalone conda envrionment](#standalone-conda-environment) via `yaml` or on the lxplus you can directly jump to [setup](#setup-the-framework) +### Standalone conda environment +> [!Caution] +> suggested to install under `bash` environment -:heavy_exclamation_mark: :heavy_exclamation_mark: :heavy_exclamation_mark: Strongly suggest to use this in lxplus :heavy_exclamation_mark: :heavy_exclamation_mark: :heavy_exclamation_mark: -You have the option to run the framework through "standalone condor jobs", bypassing the native coffea-supported job submission system. Within each job you submit, a standalone script will execute the following on the worker node: - - Set up a minimal required Python environment. - - Retrieve the BTVNanoCommissioning repository, either from a git link or transferred locally. - - Launch the `python runner.py ...` command to execute the coffea framework in the iterative executor mode. - -This utility is currently adapted for the lxplus and cmsconnect condor systems. To generate jobs for launching, replace `python runner.py` with `python condor/submitter.py`, append the existing arguments, and add the following arguments in addition: - - - `--jobName`: Specify the desired condor job name. A dedicated folder will be generated, including all submission-related files. - - `--outputXrootdDir`: Indicate the XRootD directory's path (starting with `root://`) where the produced .coffea (and .root) files from each worker node will be transferred to. - - `--condorFileSize`: Define the number of files to process per condor job (default is 50). The input file list will be divided based on this count. - - `--remoteRepo` (optional, but recommended): Specify the path and branch of the remote repository to download the BTVNanoCommissioning repository. 
If not specified, the local directory will be packed and transferred as the condor input, potentially leading to higher loads for condor transfers. Use the format e.g. `--remoteRepo 'https://github.com/cms-btv-pog/BTVNanoCommissioning.git -b master'`. - -After executing the command, a new folder will be created, preparing the submission. Follow the on-screen instructions and utilize `condor_submit ...` to submit the jdl file. The output will be transferred to the designated XRootD destination. - -The script provided by Pablo to resubmit failure jobs in `script/missingFiles.py` from the original job folder. - -
Frequent issues for standalone condor jobs submission - -

- -1. CMS Connect provides a condor interface where one can submit jobs to all resources available in the CMS Global Pool. See [WorkBookCMSConnect Twiki](https://twiki.cern.ch/twiki/bin/view/CMSPublic/WorkBookCMSConnect#Requesting_different_Operative_S) for the instructions if you use it for the first time. -2. The submitted jobs are of the kind which requires a proper setup of the X509 proxy, to use the XRootD service to access and store data. In the generated `.jdl` file, you may see a line configured for this purpose `use_x509userproxy = true`. If you have not submitted jobs of this kind on lxplus condor, we recommend you to add a line - ```bash - export X509_USER_PROXY=$HOME/x509up_u`id -u` - ``` - to `.bashrc` and run it so the proxy file will be stored in your AFS folder instead of in your `/tmp/USERNAME` folder. For submission on cmsconnect, no specific action is required. - -

-
- - - -## Make the dataset json files - -Use `fetch.py` in folder `scripts/` to obtain your samples json files. You can create `$input_list` ,which can be a list of datasets taken from CMS DAS or names of dataset(need to specify campaigns explicity), and create the json contains `dataset_name:[filelist]`. One can specify the local path in that input list for samples not published in CMS DAS. -`$output_json_name$` is the name of your output samples json file. - -The `--whitelist_sites, --blacklist_sites` are considered for fetch dataset if multiple sites are available - - -``` -## File publish in DAS, input MC file name list, specified --from_dataset and add campaign info, if more than one campaign found, would ask for specify explicity -python scripts/fetch.py -i $MC_FILE_LIST -o ${output_json_name} --from_dataset --campaign Run3Summer23BPixNanoAODv12 -## File publish in DAS, input DAS path -python fetch.py --input ${input_DAS_list} --output ${output_json_name} (--xrd {prefix_forsite}) - -## Not publish case, specify site by --xrd prefix -python fetch.py --input ${input_list} --output ${output_json_name} --xrd {prefix_forsite} -# where the input list should contains -$DATASET_NAME $PATH_TO_FILE -``` -The `output_json_name` must contain the BTV name tag (e.g. `BTV_Run3_2022_Comm_v1`). - -You might need to rename the json key name with following name scheme: - -For the data sample please use the naming scheme, -``` -$dataset_$Run -#i.e. -SingleMuon_Run2022C-PromptReco-v1 -``` -For MC, please be consistent with the dataset name in CMS DAS, as it cannot be mapped to the cross section otherwise. -``` -$dataset -#i.e. -WW_TuneCP5_13p6TeV-pythia8 -``` - - -## Get processed information - -Get the run & luminosity information for the processed events from the coffea output files. When you use `--skipbadfiles`, the submission will ignore files not accesible(or time out) by `xrootd`. 
This script helps you to dump the processed luminosity into a json file which can be calculated by `brilcalc` tool and provide a list of failed lumi sections by comparing the original json input to the one from the `.coffea` files. - - -```bash -# all is default, dump lumi and failed files, if run -t lumi only case. no json file need to be specified -python scripts/dump_processed.py -c $COFFEA_FILES -n $OUTPUT_NAME (-j $ORIGINAL_JSON -t [all,lumi,failed]) -``` - - -## Correction files configurations -:heavy_exclamation_mark: If the correction files are not supported yet by jsonpog-integration, you can still try with custom input data. - -### Options with custom input data - -All the `lumiMask`, correction files (SFs, pileup weight), and JEC, JER files are under `BTVNanoCommissioning/src/data/` following the substructure `${type}/${campaign}/${files}`(except `lumiMasks` and `Prescales`) - -| Type | File type | Comments| -| :---: | :---: | :---: | -| `lumiMasks` |`.json` | Masked good lumi-section used for physics analysis| -| `Prescales` | `.txt` | HLT paths for prescaled triggers| -| `PU` | `.pkl.gz` or `.histo.root` | Pileup reweight files, matched MC to data| -| `LSF` | `.histo.root` | Lepton ID/Iso/Reco/Trigger SFs| -| `BTV` | `.csv` or `.root` | b-tagger, c-tagger SFs| -| `JME` | `.txt` | JER, JEC files| - -Create a `dict` entry under `correction_config` with dedicated campaigns in `BTVNanoCommissioning/src/utils/AK4_parameters.py`. - - -
Take `Rereco17_94X` as an example. -

- -```python -# specify campaign -"Rereco17_94X": -{ - ##Load files with dedicated type:filename - "lumiMask": "Cert_314472-325175_13TeV_17SeptEarlyReReco2018ABC_PromptEraD_Collisions18_JSON.txt", - "PU": "94XPUwei_corrections.pkl.gz", - "JME": "jec_compiled.pkl.gz", - ## Btag SFs- create dict specifying SFs for DeepCSV b-tag(DeepCSVB), DeepJet b-tag(DeepJetB),DeepCSV c-tag(DeepCSVC), DeepJet c-tag(DeepJetC), - "BTV": { - ### b-tag - "DeepCSVB": "DeepCSV_94XSF_V5_B_F.csv", - "DeepJetB": "DeepFlavour_94XSF_V4_B_F.csv", - ### c-tag - "DeepCSVC": "DeepCSV_ctagSF_MiniAOD94X_2017_pTincl_v3_2_interp.root", - "DeepJetC": "DeepJet_ctagSF_MiniAOD94X_2017_pTincl_v3_2_interp.root", - }, - ## lepton SF - create dict specifying SFs for electron/muon ID/ISO/RECO SFs - "LSF": { - ### Following the scheme "${SF_name} ${histo_name_in_root_file}": "${file}" - "ele_Trig TrigSF": "Ele32_L1DoubleEG_TrigSF_vhcc.histo.root", - "ele_ID EGamma_SF2D": "ElectronIDSF_94X_MVA80WP.histo.root", - "ele_Rereco EGamma_SF2D": "ElectronRecoSF_94X.histo.root", - "mu_ID NUM_TightID_DEN_genTracks_pt_abseta": "RunBCDEF_MuIDSF.histo.root", - "mu_ID_low NUM_TightID_DEN_genTracks_pt_abseta": "RunBCDEF_MuIDSF_lowpT.histo.root", - "mu_Iso NUM_TightRelIso_DEN_TightIDandIPCut_pt_abseta": "RunBCDEF_MuISOSF.histo.root", - }, - }, -``` - -

-
- -### Use central maintained jsonpog-integration -The official correction files collected in [jsonpog-integration](https://gitlab.cern.ch/cms-nanoAOD/jsonpog-integration) is updated by POG except `lumiMask` and `JME` still updated by maintainer. No longer to request input files in the `correction_config`. - -
See the example with `2017_UL`. -

- -```python - "2017_UL": { - # Same with custom config - "lumiMask": "Cert_294927-306462_13TeV_UL2017_Collisions17_MuonJSON.txt", - "JME": "jec_compiled.pkl.gz", - # no config need to be specify for PU weights - "PU": None, - # Btag SFs - specify $TAGGER : $TYPE-> find [$TAGGER_$TYPE] in json file - "BTV": {"deepCSV": "shape", "deepJet": "shape"}, - "roccor": None, - # JMAR, IDs from JME- Following the scheme: "${SF_name}": "${WP}" - "JMAR": {"PUJetID_eff": "L"}, - "LSF": { - # Electron SF - Following the scheme: "${SF_name} ${SF_map} ${year}": "${WP}" - # https://github.com/cms-egamma/cms-egamma-docs/blob/master/docs/EgammaSFJSON.md - "ele_ID 2017 UL-Electron-ID-SF": "wp90iso", - "ele_Reco 2017 UL-Electron-ID-SF": "RecoAbove20", - - # Muon SF - Following the scheme: "${SF_name} ${year}": "${WP}" - # WPs : ['NUM_GlobalMuons_DEN_genTracks', 'NUM_HighPtID_DEN_TrackerMuons', 'NUM_HighPtID_DEN_genTracks', 'NUM_IsoMu27_DEN_CutBasedIdTight_and_PFIsoTight', 'NUM_LooseID_DEN_TrackerMuons', 'NUM_LooseID_DEN_genTracks', 'NUM_LooseRelIso_DEN_LooseID', 'NUM_LooseRelIso_DEN_MediumID', 'NUM_LooseRelIso_DEN_MediumPromptID', 'NUM_LooseRelIso_DEN_TightIDandIPCut', 'NUM_LooseRelTkIso_DEN_HighPtIDandIPCut', 'NUM_LooseRelTkIso_DEN_TrkHighPtIDandIPCut', 'NUM_MediumID_DEN_TrackerMuons', 'NUM_MediumID_DEN_genTracks', 'NUM_MediumPromptID_DEN_TrackerMuons', 'NUM_MediumPromptID_DEN_genTracks', 'NUM_Mu50_or_OldMu100_or_TkMu100_DEN_CutBasedIdGlobalHighPt_and_TkIsoLoose', 'NUM_SoftID_DEN_TrackerMuons', 'NUM_SoftID_DEN_genTracks', 'NUM_TightID_DEN_TrackerMuons', 'NUM_TightID_DEN_genTracks', 'NUM_TightRelIso_DEN_MediumID', 'NUM_TightRelIso_DEN_MediumPromptID', 'NUM_TightRelIso_DEN_TightIDandIPCut', 'NUM_TightRelTkIso_DEN_HighPtIDandIPCut', 'NUM_TightRelTkIso_DEN_TrkHighPtIDandIPCut', 'NUM_TrackerMuons_DEN_genTracks', 'NUM_TrkHighPtID_DEN_TrackerMuons', 'NUM_TrkHighPtID_DEN_genTracks'] - - "mu_Reco 2017_UL": "NUM_TrackerMuons_DEN_genTracks", - "mu_HLT 2017_UL": 
"NUM_IsoMu27_DEN_CutBasedIdTight_and_PFIsoTight", - "mu_ID 2017_UL": "NUM_TightID_DEN_TrackerMuons", - "mu_Iso 2017_UL": "NUM_TightRelIso_DEN_TightIDandIPCut", - }, - }, -``` - -

-
- - -## Get Prescale weights - -! this only works in lxplus - -Generate prescale weights using `brilcalc` - -```python -python scripts/dump_prescale.py --HLT $HLT --lumi $LUMIMASK -# HLT : put prescaled triggers -# lumi: golden lumi json -``` - -## Create compiled JERC file(`pkl.gz`) - -:exclamation: In case existing correction file doesn't work for you due to the incompatibility of `cloudpickle` in different python versions. Please recompile the file to get new pickle file. - -Under `compile_jec.py` you need to create dedicated jet factory files with different campaigns. Following the name scheme with `mc` for MC and `data${run}` for data. - -Compile correction pickle files for a specific JEC campaign by changing the dict of jet_factory, and define the MC campaign and the output file name by passing it as arguments to the python script: - -``` -python -m BTVNanoCommissioning.utils.compile_jec ${campaign} jec_compiled -e.g. python -m BTVNanoCommissioning.utils.compile_jec Summer23 jec_compiled +For installing Micromamba, see [[here](https://mamba.readthedocs.io/en/latest/installation/micromamba-installation.html)] ``` +curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" +# Run and follow instructions on screen +bash Miniforge3-$(uname)-$(uname -m).sh -## Prompt data/MC checks and validation - -### Prompt data/MC checks (prompt_dataMC campaign, WIP) - -To quickly check the data/MC quickly, run part data/MC files, no SFs/JEC are applied, only the lumimasks. - -1. Get the file list from DAS, use the `scripts/fetch.py` scripts to obtain the jsons -2. Replace the lumimask name in prompt_dataMC in `AK4_parameters.py` , you can do `sed -i 's/$LUMIMASK_DATAMC/xxx.json/g` -3. Run through the dataset to obtained the `coffea` files -4. Dump the lumi information via `dump_processed.py`, then use `brilcalc` to get the dedicated luminosity info -5. 
Obtained data MC plots - -### Validation workflow - - -## Plotting code - -### data/MC comparisons -:exclamation_mark: If using wildcard for input, do not forget the quoatation marks! (see 2nd example below) - -You can specify `-v all` to plot all the variables in the `coffea` file, or use wildcard options (e.g. `-v "*DeepJet*"` for the input variables containing `DeepJet`) - -:new: non-uniform rebinning is possible, specify the bins with list of edges `--autorebin 50,80,81,82,83,100.5` - -```bash -python scripts/plotdataMC.py -i a.coffea,b.coffea --lumi 41500 -p ttdilep_sf -v z_mass,z_pt -python scripts/plotdataMC.py -i "test*.coffea" --lumi 41500 -p ttdilep_sf -v z_mass,z_pt # with wildcard option need "" +micromamba activate ``` +NOTE: always make sure that conda, python, and pip point to local micromamba installation (`which conda` etc.). -
more arguments -

+You can simply create the environment through the existing `test_env.yml` under your micromamba environment using micromamba, and activate it ``` +micromamba env create -f test_env.yml -options: - -h, --help show this help message and exit - --lumi LUMI luminosity in /pb - --com COM sqrt(s) in TeV - -p {ttdilep_sf,ttsemilep_sf,ctag_Wc_sf,ctag_DY_sf,ctag_ttsemilep_sf,ctag_ttdilep_sf}, --phase {dilep_sf,ttsemilep_sf,ctag_Wc_sf,ctag_DY_sf,ctag_ttsemilep_sf,ctag_ttdilep_sf} - which phase space - --log LOG log on y axis - --norm NORM Use for reshape SF, scale to same yield as no SFs case - -v VARIABLE, --variable VARIABLE - variables to plot, splitted by ,. Wildcard option * available as well. Specifying `all` will run through all variables. - --SF make w/, w/o SF comparisons - --ext EXT prefix name - -i INPUT, --input INPUT - input coffea files (str), splitted different files with ','. Wildcard option * available as well. - --autorebin AUTOREBIN - Rebin the plotting variables, input `int` or `list`. int: merge N bins. list of number: rebin edges(non-uniform bin is possible) - --xlabel XLABEL rename the label for x-axis - --ylabel YLABEL rename the label for y-axis - --splitOSSS SPLITOSSS - Only for W+c phase space, split opposite sign(1) and same sign events(-1), if not specified, the combined OS-SS phase space is used - --xrange XRANGE custom x-range, --xrange xmin,xmax - --flow FLOW - str, optional {None, 'show', 'sum'} Whether plot the under/overflow bin. If 'show', add additional under/overflow bin. If 'sum', add the under/overflow bin content to first/last bin. - --split {flavor,sample,sample_flav} - Decomposition of MC samples. Default is split to jet flavor(udsg, pu, c, b), possible to split by group of MC - samples. Combination of jetflavor+ sample split is also possible ``` - -

-

- -### data/data, MC/MC comparisons - -You can specify `-v all` to plot all the variables in the `coffea` file, or use wildcard options (e.g. `-v "*DeepJet*"` for the input variables containing `DeepJet`) -:exclamation_mark: If using wildcard for input, do not forget the quoatation marks! (see 2nd example below) +### Setup the framework ```bash -# with merge map, compare ttbar with data -python scripts/comparison.py -i "*.coffea" --mergemap '{"ttbar": ["TTto2L2Nu_TuneCP5_13p6TeV_powheg-pythia8","TTto4Q_TuneCP5_13p6TeV_powheg-pythia8","TTtoLNu2Q_TuneCP5_13p6TeV_powheg-pythia8],"data":["MuonRun2022C-27Jun2023-v1","MuonRun2022D-27Jun2023-v1"]}' -r ttbar -c data -v mu_pt -p ttdilep_sf -# if no mergemap, take the key name directly -python scripts/comparison.py -i datac.coffea,datad.coffea -r MuonRun2022C-27Jun2023-v1 -c MuonRun2022D-27Jun2023-v1 -v mu_pt -p ttdilep_sf - -``` - -
more arguments -

- - ``` -options: - -h, --help show this help message and exit - -p {dilep_sf,ttsemilep_sf,ctag_Wc_sf,ctag_DY_sf,ctag_ttsemilep_sf,ctag_ttdilep_sf}, --phase {dilep_sf,ttsemilep_sf,ctag_Wc_sf,ctag_DY_sf,ctag_ttsemilep_sf,ctag_ttdilep_sf} - which phase space - -i INPUT, --input INPUT - input coffea files (str), splitted different files with ','. Wildcard option * available as well. - -r REF, --ref REF referance dataset - -c COMPARED, --compared COMPARED - compared datasets, splitted by , - --sepflav SEPFLAV seperate flavour(b/c/light) - --log log on y axis - -v VARIABLE, --variable VARIABLE - variables to plot, splitted by ,. Wildcard option * available as well. Specifying `all` will run through all variables. - --ext EXT prefix name - --com COM sqrt(s) in TeV - --mergemap MERGEMAP - Group list of sample(keys in coffea) as reference/compare set as dictionary format. Keys would be the new lables of the group - --autorebin AUTOREBIN - Rebin the plotting variables, input `int` or `list`. int: merge N bins. list of number: rebin edges(non-uniform bin is possible) - --xlabel XLABEL rename the label for x-axis - --ylabel YLABEL rename the label for y-axis - --norm compare shape, normalized yield to reference - --xrange XRANGE custom x-range, --xrange xmin,xmax - --flow FLOW - str, optional {None, 'show', 'sum'} Whether plot the under/overflow bin. If 'show', add additional under/overflow bin. If 'sum', add the under/overflow bin content to first/last bin. -``` - -

-

- - -### ROCs & efficiency plots - -Extract the ROCs for different tagger and efficiencies from validation workflow - -```python -python scripts/validation_plot.py -i $INPUT_COFFEA -v $VERSION -``` - -### Correlation plots study - -You can perform a study of linear correlations of b-tagging input variables. Additionally, soft muon variables may be added into the study by requesting `--SMu` argument. If you wan to limit the outputs only to DeepFlavB, PNetB and RobustParTAK4B, you can use the `--limit_outputs` option. If you want to use only the set of variables used for tagger training, not just all the input variables, then use the option `--limit_inputs`. To limit number of files read, make use of option `--max_files`. In case your study requires splitting samples by flavour, use `--flavour_split`. `--split_region_b` performs a sample splitting based on the DeepFlavB >/< 0.5. For Data/MC comparison purpose pay attention - change ranking factors (xs/sumw) in L420! - -```python -python correlation_plots.py $input_folder [--max_files $nmax_files --SMu --limit_inputs --limit_outputs --specify_MC --flavour_split --split_region_b] -``` - -### 2D plots (Correlation study-related) - -To further investigate the correlations, one can create the 2D plots of the variables used in this study. Inputs and optional arguments are the same as for the correlation plots study. - -```python -python 2Dhistogramms.py $input_folder [--max_files $nmax_files --SMu --limit_inputs --limit_outputs --specify_MC --flavour_split --split_region_b] -``` - - -## Store histograms from coffea file - -Use `scripts/make_template.py` to dump 1D/2D histogram from `.coffea` to `TH1D/TH2D` with hist. MC histograms can be reweighted to according to luminosity value given via `--lumi`. You can also merge several files - -```python -python scripts/make_template.py -i "testfile/*.coffea" --lumi 7650 -o test.root -v mujet_pt -a '{"flav":0,"osss":"sum"}' -``` - -
more arguments -

- -``` - -i INPUT, --input INPUT - Input coffea file(s) - -v VARIABLE, --variable VARIABLE - Variables to store (histogram name) - -a AXIS, --axis AXIS dict, put the slicing of histogram, specify 'sum' option as string - --lumi LUMI Luminosity in /pb - -o OUTPUT, --output OUTPUT - output root file name - --mergemap MERGEMAP Specify mergemap as dict, '{merge1:[dataset1,dataset2]...}' Also works with the json file with dict -``` - -

-

- - +# activate enviroment once you have coffea framework +conda/micromamba activate btv_coffea -
mergemap example -

+conda/micromamba activate /eos/home-m/milee/miniforge3/envs/btv_coffea -```json -{ - "WJets": ["WJetsToLNu_TuneCP5_13p6TeV-madgraphMLM-pythia8"], - "VV": [ "WW_TuneCP5_13p6TeV-pythia8", "WZ_TuneCP5_13p6TeV-pythia8", "ZZ_TuneCP5_13p6TeV-pythia8"], - "TT": [ "TTTo2J1L1Nu_CP5_13p6TeV_powheg-pythia8", "TTTo2L2Nu_CP5_13p6TeV_powheg-pythia8"], - "ST":[ "TBbarQ_t-channel_4FS_CP5_13p6TeV_powheg-madspin-pythia8", "TbarWplus_DR_AtLeastOneLepton_CP5_13p6TeV_powheg-pythia8", "TbarBQ_t-channel_4FS_CP5_13p6TeV_powheg-madspin-pythia8", "TWminus_DR_AtLeastOneLepton_CP5_13p6TeV_powheg-pythia8"], -"data":[ "Muon_Run2022C-PromptReco-v1", "SingleMuon_Run2022C-PromptReco-v1", "Muon_Run2022D-PromptReco-v1", "Muon_Run2022D-PromptReco-v2"] -} +# only first time, including submodules +git clone git@github.com:cms-btv-pog/BTVNanoCommissioning.git +# Once the environment is set up, compile the python package: +pip install -e . +pip install -e .[dev, doc] # for developer ``` -

-
- - - -## Notes for developers -The BTV tutorial for coffea part is under `notebooks` and the template to construct new workflow is `src/BTVNanoCommissioning/workflows/example.py` -Here are some tips provided for developers working on their forked version of this repository. Also some useful git commands can be found [here](https://codimd.web.cern.ch/wY3IrOBBT3i3GXIQxLMWPA) -### Setup CI pipeline for fork branch -Since the CI pipelines involve reading files via `xrootd` and access gitlab.cern.ch, you need to save some secrets in your forked directory. - -Yout can find the secret configuration in the direcotry : `Settings>>Secrets>>Actions`, and create the following secrets: +You can still install additional packages itself by `pip install $PACKAGE` -- `GIT_CERN_SSH_PRIVATE`: - 1. Create a ssh key pair with `ssh-keygen -t rsa -b 4096` (do not overwrite with your local one), add the public key to your CERN gitlab account - 2. Copy the private key to the entry -- `GRID_PASSWORD`: Add your grid password to the entry. -- `GRID_USERCERT` & `GRID_USERKEY`: Encrypt your grid user certification `base64 -i ~/.globus/userkey.pem | awk NF=NF RS= OFS=` and `base64 -i ~/.globus/usercert.pem | awk NF=NF RS= OFS=` and copy the output to the entry. 
+`conda/micromamba activate btv_coffea` is required to setup -Special commit head messages could run different commands in actions (add the flag in front of your commit) -The default configureation is doing -``` -python runner.py --workflow emctag_ttdilep_sf --json metadata/test_bta_run3.json --limit 1 --executor iterative --campaign Summer23 --isArray --isSyst all -``` - -- `[skip ci]`: not running ci at all in the commit message -- `ci:skip array` : remove `--isArray` option -- `ci:skip syst` : remove `--isSyst all` option -- `ci:JERC_split` : change systematic option to split JERC uncertainty sources `--isSyst JERC_split` -- `ci:weight_only` : change systematic option to weight only variations `--isSyst weight_only` - -### Running jupyter remotely -1. On your local machine, edit `.ssh/config`: -``` -Host lxplus* - HostName lxplus7.cern.ch - User - ForwardX11 yes - ForwardAgent yes - ForwardX11Trusted yes -Host *_f - LocalForward localhost:8800 localhost:8800 - ExitOnForwardFailure yes -``` -2. Connect to remote with `ssh lxplus_f` -3. Start a jupyter notebook: -``` -jupyter notebook --ip=127.0.0.1 --port 8800 --no-browser -``` -4. URL for notebook will be printed, copy and open in local browser diff --git a/docs/api.rst b/docs/api.rst index ec94338a..d7defd01 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -3,5 +3,106 @@ API .. autosummary:: :toctree: generated + BTVNanoCommissioning.utils.correction + BTVNanoCommissioning.utils.selection + BTVNanoCommissioning.utils.array_writer + BTVNanoCommissioning.utils.histogrammer + BTVNanoCommissioning.utils.sample + BTVNanoCommissioning.utils.plot_utils + BTVNanoCommissioning.helpers.update_branch + BTVNanoCommissioning.helpers.func + BTVNanoCommissioning.helpers.BTA_helper + BTVNanoCommissioning.helpers.xs_scaler - lumache + + + +Details +======= + +BTVNanoCommissioning.utils.correction +------------------------------------- +.. 
automodule:: BTVNanoCommissioning.utils.correction + :members: + :undoc-members: + :show-inheritance: + +BTVNanoCommissioning.utils.selection +------------------------------------ +.. automodule:: BTVNanoCommissioning.utils.selection + :members: + :undoc-members: + :show-inheritance: + +BTVNanoCommissioning.utils.AK4_parameters +----------------------------------------- +.. automodule:: BTVNanoCommissioning.utils.AK4_parameters + :members: + :undoc-members: + :show-inheritance: + +BTVNanoCommissioning.utils.array_writer +--------------------------------------- +.. automodule:: BTVNanoCommissioning.utils.array_writer + :members: + :undoc-members: + :show-inheritance: + +BTVNanoCommissioning.utils.histogrammer +--------------------------------------- +.. automodule:: BTVNanoCommissioning.utils.histogrammer + :members: + :undoc-members: + :show-inheritance: + +BTVNanoCommissioning.utils.sample +--------------------------------- +.. automodule:: BTVNanoCommissioning.utils.sample + :members: + :undoc-members: + :show-inheritance: + + +BTVNanoCommissioning.utils.plot_utils +------------------------------------- +.. automodule:: BTVNanoCommissioning.utils.plot_utils + :members: + :undoc-members: + :show-inheritance: + +BTVNanoCommissioning.helpers.update_branch +------------------------------------------ +.. automodule:: BTVNanoCommissioning.helpers.update_branch + :members: + :undoc-members: + :show-inheritance: + +BTVNanoCommissioning.helpers.func +--------------------------------- +.. automodule:: BTVNanoCommissioning.helpers.func + :members: + :undoc-members: + :show-inheritance: + +BTVNanoCommissioning.helpers.BTA_helper +--------------------------------------- +.. automodule:: BTVNanoCommissioning.helpers.BTA_helper + :members: + :undoc-members: + :show-inheritance: + +BTVNanoCommissioning.helpers.xs_scaler +-------------------------------------- +.. 
automodule:: BTVNanoCommissioning.helpers.xs_scaler + :members: + :undoc-members: + :show-inheritance: + +BTVNanoCommissioning.helpers.xsection +------------------------------------- +.. automodule:: BTVNanoCommissioning.helpers.xsection + :members: + :undoc-members: + :show-inheritance: + +scripts. \ No newline at end of file diff --git a/docs/auto.md b/docs/auto.md new file mode 100644 index 00000000..81431ab9 --- /dev/null +++ b/docs/auto.md @@ -0,0 +1,5 @@ +## Automation + + +At the moment the automation is limited with the computing resources using gitlab ci [autobtv](https://gitlab.cern.ch/cms-analysis/btv/software-and-algorithms/autobtv). + diff --git a/docs/conf.py b/docs/conf.py index 750bff19..8e62f3f1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -10,29 +10,52 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) +import os, sys + +sys.path.insert(0, os.path.abspath("../src")) # -- Project information ----------------------------------------------------- -project = "Basic Sphinx Example Project" -copyright = "2022, Read the Docs core team" -author = "Read the Docs core team" +project = "BTVNanocommissioning" +copyright = "2024, BTV-POG-CMS" +author = "By BTV CMS" # -- General configuration --------------------------------------------------- # -- General configuration extensions = [ + "myst_parser", + "sphinx_copybutton", "sphinx.ext.duration", "sphinx.ext.doctest", "sphinx.ext.autodoc", "sphinx.ext.autosummary", "sphinx.ext.intersphinx", + "sphinx.ext.viewcode", + "sphinx.ext.imgmath", + "sphinx.ext.todo", +] + +myst_heading_anchors = 4 +myst_enable_extensions = [ + "amsmath", + "attrs_inline", + "colon_fence", + "deflist", + "dollarmath", + "fieldlist", + "html_admonition", + "html_image", + "replacements", + "smartquotes", + "strikethrough", + "substitution", + "tasklist", 
] + intersphinx_mapping = { "rtd": ("https://docs.readthedocs.io/en/stable/", None), "python": ("https://docs.python.org/3/", None), @@ -44,6 +67,8 @@ # -- Options for EPUB output epub_show_urls = "footnote" +source_suffix = [".rst", ".md"] +copybutton_exclude = ".linenos, .gp" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -61,3 +86,4 @@ # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] +html_theme_options = {"style_nav_header_background": "#fcb302"} diff --git a/docs/developer.md b/docs/developer.md new file mode 100644 index 00000000..8104acc2 --- /dev/null +++ b/docs/developer.md @@ -0,0 +1,419 @@ +## For developers: Add new workflow + + +The BTV tutorial for coffea part is under [`notebooks`](https://github.com/cms-btv-pog/BTVNanoCommissioning/tree/master/notebooks) and the template to construct new workflow is [`src/BTVNanoCommissioning/workflows/example.py`](https://github.com/cms-btv-pog/BTVNanoCommissioning/blob/master/src/BTVNanoCommissioning/workflows/example.py) + +The BTV tutorial for coffea part is `notebooks/BTV_commissiong_tutorial-coffea.ipynb` and the template to construct new workflow is `src/BTVNanoCommissioning/workflows/example.py`. + + +Use the `example.py` as template to develope new workflow. + +### 0. Add new workflow info to `workflows/__init__.py` + + +```python +# if the workflow is in a new files, you need to import your workflow.py +from BTVNanoCommissioning.workflows.new_workflow import ( + NanoProcessor as new_WORKFLOWProcessor +) +# And then include the processor into the modules with the name of workflow. 
The name is used when specifying --workflow when running the jobs +workflows["name_workflow"] = new_WORKFLOWProcessor +# IF the workflow is based on the modifier to existing workflow, put the selectionModifier used in the existing workflow +workflows["ctag_ttsemilep_sf"] = partial( + CTAGWcTTValidSFProcessor, selectionModifier="semittM" +) +``` +Note that if you are working on WP SFs, please put **WP** in the name. + +### 1. Add histogram collections to `utils/histogrammer.py` + +The histograms in this framework use the [`hist`](https://hist.readthedocs.io/en/latest/) library. They can easily be converted to ROOT histograms via `uproot`, or to numpy histograms. A quick start guide for hist can be found [here](https://hist.readthedocs.io/en/latest/user-guide/quickstart.html). + +A few axes are predefined and commonly used for all the workflows. Note that the `name` should be consistent with the info in the tree if it is stored. +```python +pt_axis = Hist.axis.Regular(60, 0, 300, name="pt", label=" $p_{T}$ [GeV]") +eta_axis = Hist.axis.Regular(25, -2.5, 2.5, name="eta", label=" $\eta$") +phi_axis = Hist.axis.Regular(30, -3, 3, name="phi", label="$\phi$") +``` +The histograms are wrapped in a `dict`. Each histogram should contain **syst_axis (as the first axis)**, **Hist.storage.Weight() (as the last argument)** and the axes for your variable. It is suggested that the key use the format `$OBJECT_$VAR` in case the variable `$VAR` is in the tree. + +```python +_hist_dict["mujet_pt"] = Hist.Hist( + syst_axis, flav_axis, dr_axis, Hist.storage.Weight() + ) # create customized histogram +``` + +The kinematic/workflow-specific variables are defined first; the common collections of input variables are then taken from the common definition. +In case you want to add common variables used by all the workflows, go to [`helper/definition.py`](#add-new-common-variables) + +### 2. Selections: Implemented selections on events (`workflow/`) + +Create `boolean` arrays along event axis.
Also check whether some common selctions already in `utils/selection.py` + +```python + ## HLT- put trigger paths + triggers = [ + "Mu23_TrkIsoVVL_Ele12_CaloIdL_TrackIdL_IsoVL_DZ", + "Mu12_TrkIsoVVL_Ele23_CaloIdL_TrackIdL_IsoVL_DZ", + "Mu8_TrkIsoVVL_Ele23_CaloIdL_TrackIdL_IsoVL_DZ", + ] + req_trig = HLT_helper(events, triggers) + + ##### Add some selections + ## Muon cuts + # muon twiki: https://twiki.cern.ch/twiki/bin/view/CMS/SWGuideMuonIdRun2 + + muon_sel = (events.Muon.pt > 15) & (mu_idiso(events, self._campaign)) # applied selection on pT, and `mu_idiso` is predefined selection in `selection.py` which refers to cut-based tight muon ID+Iso + event_mu = events.Muon[muon_sel] # Pruned the muon collections with the selection + req_muon = ak.num(event_mu.pt) == 1 # Check each event has exact one muon + .... # other selections + ## Apply all selections + event_level = ( + req_trig & req_lumi & req_jet & req_muon & req_ele & req_leadlep_pt + ) +``` + +In case you are modifying exist workflow, you need to add to `__init__` and in the selections + +```python +# in init +self.selMod = selectionModifier +# In selection +if self.selMod=="WcM": + event_level = req_trig & req_lumi & req_jet & req_muon & req_ele & req_leadlep_pt& req_Wc +``` + +### 3. Selected objects: Pruned objects with reduced event_level +Store the selected objects to event-based arrays. The selected object must contains **Sel**, for the muon-enriched jet and soft muon is **MuonJet** and **SoftMu**, the kinematics will store. The cross-object variables need to create entry specifically. 
+ +```python + # Keep the structure of events and prune the object size + pruned_ev = events[event_level] + pruned_ev["SelJet"] = event_jet[event_level][:, 0] + pruned_ev["SelMuon"] = event_mu[event_level][:, 0] + pruned_ev["mujet_ptratio"] = event_mu[event_level].pt / pruned_ev.SelJet.pt # note that cross-object variables need to be created explicitly + pruned_ev["mujet_dr"] = event_mu[event_level].delta_r(pruned_ev.SelJet) +``` + + + The pruned information is then stored in histograms and output arrays, and used to evaluate the weights. If you have customized objects for [corrections](#add-additional-weight-or-uncertainty-information), or need to add new common [variables](#add-new-common-variables), please go to the dedicated section. + +See the details below for the usage of `pruned_ev` + +
output section +

+ +```python +#################### +# Output # +#################### +# Configure SFs - read pruned objects from the pruned_ev and apply SFs and call the systematics +weights = weight_manager(pruned_ev, self.SF_map, self.isSyst) +# Configure systematics shifts +if shift_name is None: + systematics = ["nominal"] + list(weights.variations) # nominal + weight variation systematics +else: + systematics = [shift_name] # JES/JER systematics + +# Fill the weight to output arrys +if not isRealData: + pruned_ev["weight"] = weights.weight() + for ind_wei in weights.weightStatistics.keys(): + pruned_ev[f"{ind_wei}_weight"] = weights.partial_weight( + include=[ind_wei] + ) +# Configure histograms- fill the histograms with pruned objects +if not self.noHist: + output = histo_writter( + pruned_ev, output, weights, systematics, self.isSyst, self.SF_map + ) +# Output arrays - store the pruned objects in the output arrays +if self.isArray: + array_writer(self, pruned_ev, events, systematics[0], dataset, isRealData) + ``` + +

+
+ + +### 4. Setup CI pipeline `.github/workflow` + +The actions check whether the changes would break the framework. The actions are collected in `.github/workflow` +You can simply include a workflow by adding an entry with its name + +```yaml +- name: semileptonic + c ttbar workflows with correctionlib + run: | + string=$(git log -1 --pretty=format:'%s') + if [[ $string == *"ci:skip array"* ]]; then + opts=$(echo "$opts" | sed 's/--isArray //g') + fi + if [[ $string == *"ci:skip syst"* ]]; then + opts=$(echo "$opts" | sed 's/--isSyst all//g') + elif [[ $string == *"ci:JERC_split"* ]]; then + opts=$(echo "$opts" | sed 's/--isSyst all/--isSyst JERC_split/g') + elif [[ $string == *"ci:weight_only"* ]]; then + opts=$(echo "$opts" | sed 's/--isSyst all/--isSyst weight_only/g') + fi + python runner.py --workflow c_ttsemilep_sf --json metadata/test_bta_run3.json --limit 1 --executor iterative --campaign Summer23 --year 2023 $opts +``` + +Special flags in the commit message header make the actions run different commands (add the flag at the front of your commit message) +The default configuration runs +```bash +python runner.py --workflow emctag_ttdilep_sf --json metadata/test_bta_run3.json --limit 1 --executor iterative --campaign Summer23 --isArray --isSyst all +``` + +- `[skip ci]`: do not run CI at all for this commit +- `ci:skip array` : remove `--isArray` option +- `ci:skip syst` : remove `--isSyst all` option +- `ci:JERC_split` : change systematic option to split JERC uncertainty sources `--isSyst JERC_split` +- `ci:weight_only` : change systematic option to weight only variations `--isSyst weight_only` + +
Set CI in your github account +

+ +Since the CI pipelines involve reading files via `xrootd` and accessing gitlab.cern.ch, you need to save some secrets in your forked repository. + +You can find the secret configuration under `Settings>>Secrets>>Actions`; create the following secrets there: + +- `GIT_CERN_SSH_PRIVATE`: + 1. Create a ssh key pair with `ssh-keygen -t rsa -b 4096` (do not overwrite your existing local key), add the public key to your CERN gitlab account + 2. Copy the private key to the entry +- `GRID_PASSWORD`: Add your grid password to the entry. +- `GRID_USERCERT` & `GRID_USERKEY`: Base64-encode your grid user certificate and key with `base64 -i ~/.globus/usercert.pem | awk NF=NF RS= OFS=` and `base64 -i ~/.globus/userkey.pem | awk NF=NF RS= OFS=`, respectively, and copy each output to the corresponding entry. + +

+
+ + +### 5. Refine used MC as input `sample.py` +The `sample.py` collects the samples (dataset name) used in the workflow. This collections are use to create the dataset json file. +- `data` : data sample (MuonEG, Muon0....) +- `MC`: main MC used for the workflow +- `minor_MC` : minor MC samples use for background events +- `syst_MC`: systematic MC samples (TTbar sample mass, Hdamp ... variations) + +Here's the example for BTA_ttbar +```python +"BTA_ttbar": { + "data": ["MuonEG"], + "MC": ["TTto2L2Nu_TuneCP5_13p6TeV_powheg-pythia8"], + "minor_MC": [ + "TTtoLNu2Q_TuneCP5_13p6TeV_powheg-pythia8", + "TWminusto2L2Nu_TuneCP5_13p6TeV_powheg-pythia8", + "TbarWplusto2L2Nu_TuneCP5_13p6TeV_powheg-pythia8", + "TWminustoLNu2Q_TuneCP5_13p6TeV_powheg-pythia8", + "TbarWplustoLNu2Q_TuneCP5_13p6TeV_powheg-pythia8", + "TbarBQ_t-channel_4FS_TuneCP5_13p6TeV_powheg-madspin-pythia8", + "TBbarQ_t-channel_4FS_TuneCP5_13p6TeV_powheg-madspin-pythia8", + "WWto2L2Nu_TuneCP5_13p6TeV_powheg-pythia8", + "ZZto2L2Q_TuneCP5_13p6TeV_powheg-pythia8", + "WZto3LNu_TuneCP5_13p6TeV_powheg-pythia8", + "WZtoLNu2Q_TuneCP5_13p6TeV_powheg-pythia8", + ], + "syst_MC": [ + "TTto2L2Nu_MT-171p5_TuneCP5_13p6TeV_powheg-pythia8", + "TTto2L2Nu_MT-175p5_TuneCP5_13p6TeV_powheg-pythia8", + "TTto2L2Nu_Hdamp-158_TuneCP5_13p6TeV_powheg-pythia8", + "TTto2L2Nu_Hdamp-418_TuneCP5_13p6TeV_powheg-pythia8", + "TTto2L2Nu_TuneCP5Down_13p6TeV_powheg-pythia8", + "TTto2L2Nu_TuneCP5Up_13p6TeV_powheg-pythia8", + ], + }, +``` + +### Optional changes +#### Add workflow to `scripts/suball.py` +The `suball.py` summarize the steps to obtain the result. 
+In case your task requires running several workflows, you can wrap them in a `dict` of workflows +```python +scheme = { + # scale factor workflows + "SF": ["BTA_ttbar", "BTA_addPFMuons"], + # Use for prompt data MC checks for analysis + "Validation": ["ttdilep_sf", "ctag_Wc_sf"], + # commissioning workflows + "default_comissioning": [ + "ttdilep_sf", + "ttsemilep_sf", + "ctag_Wc_sf", + "ctag_DY_sf", + "QCD_sf", + "QCD_mu_sf" + ], + } +``` +#### Add new common variables in `helper/definition.py` + +In `definition.py` we collect the axis definitions, names and labels of the tagger scores/input variables +```python +disc_list=[....] # tagger score +definitions_dict = { + "DeepCSV_jetNSelectedTracks": # name used in tree + { + "displayname": "Jet N Selected Tracks", # axis name + "manual_ranges": [0.0, 25], + "ylabel_text": "Jets", + "format_unit": "2f", + "format_unit_digits": 2, + "bins": 25, + "inputVar_units": None, + }, + ... +} +``` +#### Additional corrections and uncertainty variations not in the framework +The corrections are collected in `utils/correction.py`. There are two types of variation: weight variations (i.e. SFs, ueps weights) and object energy scale/resolution variations (JES/JER). Here's an example of how to add new corrections + +1. Add the new info to `utils/AK4_parameters.py` +```python +"JPCalib": { + "Run2023D-22Sep2023_v1": "calibeHistoWrite_Data2023D-22Sep2023_v1.root", + "Run2023D-22Sep2023_v2": "calibeHistoWrite_Data2023D-22Sep2023_v2.root", + "MC": "calibeHistoWrite_MC2023_Summer23BPix.root", + }, +``` +2. Add new collections to `load_SF` in `utils/correction.py` +Depending on the correction file type, the correction is read differently. See details in: [correctionlib official](https://gitlab.cern.ch/cms-nanoAOD/jsonpog-integration/-/tree/master/examples), or other custom way used in [coffea](https://coffea-hep.readthedocs.io/en/latest/notebooks/applying_corrections.html).
This load all the correction information as `evaluator` can be use to extract weight information later +```python +for SF in config[campaign].keys(): + if SF == "lumiMask": + continue + ## pileup weight + if SF == "PU": + ## Check whether files in jsonpog-integration exist + if os.path.exists( + f"/cvmfs/cms.cern.ch/rsync/cms-nanoAOD/jsonpog-integration/POG/LUM/{year}_{campaign}" + ): + correct_map["PU"] = correctionlib.CorrectionSet.from_file( + f"/cvmfs/cms.cern.ch/rsync/cms-nanoAOD/jsonpog-integration/POG/LUM/{year}_{campaign}/puWeights.json.gz" + ) + ## Otherwise custom files + else: + _pu_path = f"BTVNanoCommissioning.data.PU.{campaign}" + with importlib.resources.path( + _pu_path, config[campaign]["PU"] + ) as filename: + if str(filename).endswith(".pkl.gz"): + with gzip.open(filename) as fin: + correct_map["PU"] = cloudpickle.load(fin)[ + "2017_pileupweight" + ] + elif str(filename).endswith(".json.gz"): + correct_map["PU"] = correctionlib.CorrectionSet.from_file( + str(filename) + ) + elif str(filename).endswith(".histo.root"): + ext = extractor() + ext.add_weight_sets([f"* * {filename}"]) + ext.finalize() + correct_map["PU"] = ext.make_evaluator() + +``` +3.1 Add weight based correction + +In the `utils/correction` create the reader to get the weight + +Create your function to readout the weight from the evaluator stored in the `correction_map`, the idea is to add the weight/systematic information to the event and return to the workflow + + + +```python +def btagSFs(jet, correct_map, weights, SFtype, syst=False): + ..... 
+ if i == 0 and syst == False: + weights.add(SFtype, sfs) + + if syst == True: + weights.add_multivariation( + SFtype, + sfs, + systlist, + np.array(list(sfs_up_all.values())), + np.array(list(sfs_down_all.values())), + ) + # in case you only have the up/down variation + weights.add( + SFtype,# name of the weight + sfs,# nominal SFs + sf_unc_up,#up varition + sf_unc_down, #down varition + ) + +``` + +In case it's a common correction, add to the `weight_manager` in `utils/correction` otherwise directly to the workflow (weight based) + +```python +def weight_manager(pruned_ev, SF_map, isSyst): + weights = Weights(len(pruned_ev), storeIndividual=True) + ... + btagSFs(pruned_ev.SelJet, SF_map, weights, "DeepJetC", syst_wei) + btagSFs(pruned_ev.SelJet, SF_map, weights, "DeepJetB", syst_wei) + btagSFs(pruned_ev.SelJet, SF_map, weights, "DeepCSVB", syst_wei) + btagSFs(pruned_ev.SelJet, SF_map, weights, "DeepCSVC", syst_wei) + return weights +``` + +3.2 Add object variations + +For the object scale / resolution variation we shift object energy/positions as a list of `shifts` to the original object. 
+The `shifts` is a list of shifted object after the corrctions are applied to the objects + +```python +# JES uncertainty +if "JES_Total" in jets.fields: + shifts += [ + ( + { + "Jet": jets.JES_Total.up, # change the objects to JES up variation + "MET": met.JES_Total.up, + }, + "JESUp", # the uncertainty variation name + ), + ( + { + "Jet": jets.JES_Total.down, + "MET": met.JES_Total.down, + }, + "JESDown", + ), + ] +``` + +In case the shifts are in common , put to `common_shifts`: +```python +if "JME" in self.SF_map.keys(): + syst_JERC = self.isSyst + if self.isSyst == "JERC_split": + syst_JERC = "split" + shifts = JME_shifts( + shifts, self.SF_map, events, self._campaign, isRealData, syst_JERC + ) + else: + if int(self._year) < 2020: + shifts = [ + ({"Jet": events.Jet, "MET": events.MET, "Muon": events.Muon}, None) + ] + else: + shifts = [ + ( + { + "Jet": events.Jet, + "MET": events.PuppiMET, + "Muon": events.Muon, + }, + None, + ) + ] +``` + + +otherwise in your workflow `process(self, events)` add new shifts +```python +def process(self, events): + events = missing_branch(events) + shifts = common_shifts(self, events) + shifts+=[({obj:variation},name)] +``` \ No newline at end of file diff --git a/docs/figs/btv_wf.png b/docs/figs/btv_wf.png new file mode 100644 index 00000000..a0a1ccd8 Binary files /dev/null and b/docs/figs/btv_wf.png differ diff --git a/docs/figs/comm_wf.png b/docs/figs/comm_wf.png new file mode 100644 index 00000000..9de8acdd Binary files /dev/null and b/docs/figs/comm_wf.png differ diff --git a/docs/figs/sf_new.png b/docs/figs/sf_new.png new file mode 100644 index 00000000..548f9677 Binary files /dev/null and b/docs/figs/sf_new.png differ diff --git a/docs/figs/sf_old.png b/docs/figs/sf_old.png new file mode 100644 index 00000000..42b88fe7 Binary files /dev/null and b/docs/figs/sf_old.png differ diff --git a/docs/generated/BTVNanoCommissioning.helpers.BTA_helper.rst b/docs/generated/BTVNanoCommissioning.helpers.BTA_helper.rst new file mode 
100644 index 00000000..89f2045a --- /dev/null +++ b/docs/generated/BTVNanoCommissioning.helpers.BTA_helper.rst @@ -0,0 +1,15 @@ +BTVNanoCommissioning.helpers.BTA\_helper +======================================== + +.. automodule:: BTVNanoCommissioning.helpers.BTA_helper + + + .. rubric:: Functions + + .. autosummary:: + + calc_ip_vector + cumsum + is_from_GSP + to_bitwise_trigger + \ No newline at end of file diff --git a/docs/generated/BTVNanoCommissioning.helpers.func.rst b/docs/generated/BTVNanoCommissioning.helpers.func.rst new file mode 100644 index 00000000..5af4e7cd --- /dev/null +++ b/docs/generated/BTVNanoCommissioning.helpers.func.rst @@ -0,0 +1,19 @@ +BTVNanoCommissioning.helpers.func +================================= + +.. automodule:: BTVNanoCommissioning.helpers.func + + + .. rubric:: Functions + + .. autosummary:: + + PFCand_link + dump_lumi + flatten + memory_usage_psutil + normalize + num + update + uproot_writeable + \ No newline at end of file diff --git a/docs/generated/BTVNanoCommissioning.helpers.update_branch.rst b/docs/generated/BTVNanoCommissioning.helpers.update_branch.rst new file mode 100644 index 00000000..c70cca46 --- /dev/null +++ b/docs/generated/BTVNanoCommissioning.helpers.update_branch.rst @@ -0,0 +1,12 @@ +BTVNanoCommissioning.helpers.update\_branch +=========================================== + +.. automodule:: BTVNanoCommissioning.helpers.update_branch + + + .. rubric:: Functions + + .. autosummary:: + + missing_branch + \ No newline at end of file diff --git a/docs/generated/BTVNanoCommissioning.helpers.xs_scaler.rst b/docs/generated/BTVNanoCommissioning.helpers.xs_scaler.rst new file mode 100644 index 00000000..532643e8 --- /dev/null +++ b/docs/generated/BTVNanoCommissioning.helpers.xs_scaler.rst @@ -0,0 +1,18 @@ +BTVNanoCommissioning.helpers.xs\_scaler +======================================= + +.. automodule:: BTVNanoCommissioning.helpers.xs_scaler + + + .. rubric:: Functions + + .. 
autosummary:: + + additional_scale + collate + dict_depth + getSumW + merge_output + scaleSumW + scale_xs + \ No newline at end of file diff --git a/docs/generated/BTVNanoCommissioning.helpers.xsection.rst b/docs/generated/BTVNanoCommissioning.helpers.xsection.rst new file mode 100644 index 00000000..ac8387b6 --- /dev/null +++ b/docs/generated/BTVNanoCommissioning.helpers.xsection.rst @@ -0,0 +1,6 @@ +BTVNanoCommissioning.helpers.xsection +===================================== + +.. automodule:: BTVNanoCommissioning.helpers.xsection + + \ No newline at end of file diff --git a/docs/generated/BTVNanoCommissioning.utils.AK4_parameters.rst b/docs/generated/BTVNanoCommissioning.utils.AK4_parameters.rst new file mode 100644 index 00000000..4390f423 --- /dev/null +++ b/docs/generated/BTVNanoCommissioning.utils.AK4_parameters.rst @@ -0,0 +1,6 @@ +BTVNanoCommissioning.utils.AK4\_parameters +========================================== + +.. automodule:: BTVNanoCommissioning.utils.AK4_parameters + + \ No newline at end of file diff --git a/docs/generated/BTVNanoCommissioning.utils.array_writer.rst b/docs/generated/BTVNanoCommissioning.utils.array_writer.rst new file mode 100644 index 00000000..11c84ceb --- /dev/null +++ b/docs/generated/BTVNanoCommissioning.utils.array_writer.rst @@ -0,0 +1,12 @@ +BTVNanoCommissioning.utils.array\_writer +======================================== + +.. automodule:: BTVNanoCommissioning.utils.array_writer + + + .. rubric:: Functions + + .. autosummary:: + + array_writer + \ No newline at end of file diff --git a/docs/generated/BTVNanoCommissioning.utils.correction.rst b/docs/generated/BTVNanoCommissioning.utils.correction.rst new file mode 100644 index 00000000..16b3f0ac --- /dev/null +++ b/docs/generated/BTVNanoCommissioning.utils.correction.rst @@ -0,0 +1,36 @@ +BTVNanoCommissioning.utils.correction +===================================== + +.. automodule:: BTVNanoCommissioning.utils.correction + + + .. rubric:: Functions + + .. 
autosummary:: + + JME_shifts + Roccor_shifts + add_jec_variables + add_pdf_weight + add_ps_weight + add_scalevar_3pt + add_scalevar_7pt + btagSFs + common_shifts + eleSFs + jetveto + jmar_sf + load_SF + load_lumi + muSFs + puwei + top_pT_reweighting + top_pT_sf_formula + weight_manager + + .. rubric:: Classes + + .. autosummary:: + + JPCalibHandler + \ No newline at end of file diff --git a/docs/generated/BTVNanoCommissioning.utils.histogrammer.rst b/docs/generated/BTVNanoCommissioning.utils.histogrammer.rst new file mode 100644 index 00000000..fdc812af --- /dev/null +++ b/docs/generated/BTVNanoCommissioning.utils.histogrammer.rst @@ -0,0 +1,13 @@ +BTVNanoCommissioning.utils.histogrammer +======================================= + +.. automodule:: BTVNanoCommissioning.utils.histogrammer + + + .. rubric:: Functions + + .. autosummary:: + + histo_writter + histogrammer + \ No newline at end of file diff --git a/docs/generated/BTVNanoCommissioning.utils.plot_utils.rst b/docs/generated/BTVNanoCommissioning.utils.plot_utils.rst new file mode 100644 index 00000000..1068e61d --- /dev/null +++ b/docs/generated/BTVNanoCommissioning.utils.plot_utils.rst @@ -0,0 +1,20 @@ +BTVNanoCommissioning.utils.plot\_utils +====================================== + +.. automodule:: BTVNanoCommissioning.utils.plot_utils + + + .. rubric:: Functions + + .. autosummary:: + + MCerrorband + SFerror + autoranger + clopper_pearson_interval + compatible + normal_interval + plotratio + poisson_interval + rebin_hist + \ No newline at end of file diff --git a/docs/generated/BTVNanoCommissioning.utils.sample.rst b/docs/generated/BTVNanoCommissioning.utils.sample.rst new file mode 100644 index 00000000..677a3e6b --- /dev/null +++ b/docs/generated/BTVNanoCommissioning.utils.sample.rst @@ -0,0 +1,6 @@ +BTVNanoCommissioning.utils.sample +================================= + +.. 
automodule:: BTVNanoCommissioning.utils.sample + + \ No newline at end of file diff --git a/docs/generated/BTVNanoCommissioning.utils.selection.rst b/docs/generated/BTVNanoCommissioning.utils.selection.rst new file mode 100644 index 00000000..577095a3 --- /dev/null +++ b/docs/generated/BTVNanoCommissioning.utils.selection.rst @@ -0,0 +1,22 @@ +BTVNanoCommissioning.utils.selection +==================================== + +.. automodule:: BTVNanoCommissioning.utils.selection + + + .. rubric:: Functions + + .. autosummary:: + + HLT_helper + MET_filters + btag_mu_idiso + btag_wp + ele_cuttightid + ele_mvatightid + jet_cut + jet_id + mu_idiso + softmu_mask + wp_dict + \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..c1c1bf87 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,48 @@ +BTVNanoCommissioning +==================== + +This page documents the setup of the [BTV Run 3 commissioning framework](https://github.com/cms-btv-pog/BTVNanoCommissioning/tree/master). This framework is based on [coffea](https://coffeateam.github.io/coffea/) and using [btvnano](https://btv-wiki.docs.cern.ch/SoftwareAlgorithms/PFNano/) as input. The framework is also used as frontend for the btv automation task [autobtv](https://gitlab.cern.ch/cms-analysis/btv/software-and-algorithms/autobtv) + +This framework is based on [coffea processor](https://coffeateam.github.io/coffea/concepts.html#coffea-processor). Each workflow can be a separate **processor** file in the `workflows`, creating the mapping from `PFNano` to the histograms as `coffea` file or creating `.root` files by saving awkward arrays. Workflow processors can be passed to the `runner.py` script along with the fileset these should run over. Multiple executors can be chosen +(`iterative` - one by one, `futures` - multiprocessing. Scale out to clusters depend on facilities, see the details in [scale-out sites](./scaleout.md)). 
Obtain the histograms as plots (`.pdf`) or save them to template `.root` files with dedicated scripts. + +![structure](figs/btv_wf.png) + +Current work-in-progress [issues](https://gitlab.cern.ch/cms-btv-coordination/tasks/-/issues/?label_name%5B%5D=Software%3A%3A%20BTVnano%20%26CommFW) + +The minimal command is shown below; it specifies the selection (workflow), dataset, campaign and year: +``` +python runner.py --workflow ttsemilep_sf --json metadata/test_bta_run3.json --campaign Summer22EERun3 --year 2022 +``` + +More options for the runner are listed by: + + +```bash markdown-code-runner +python runner.py -h +``` + +Currently the available workflows are summarized +:::{tip} +- Installation instructions are available [here](./installation.md) +- To run the commissioning/SF tasks [user](./user.md) +- To develop new workflows [developer](./developer.md) +- Framework structure [overview](./structure.md) +- Scale out sites and [options](./scaleout.md) +- Available selections [phase spaces](./wf.md) +- Useful scripts: plotting/root template conversion [scripts](./scripts.md) +- Automation [developments](./auto.md) +::: + +```{toctree} +:caption: Contents: +:maxdepth: 1 +installation.md +user.md +developer.md +wf.md +scripts.md +scaleout.md +auto.md +api.rst +``` diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index 0d9ef1d6..00000000 --- a/docs/index.rst +++ /dev/null @@ -1,25 +0,0 @@ -.. include:: ../README.rst - -Welcome to Lumache's documentation! -=================================== - -**Lumache** (/lu'make/) is a Python library for cooks and food lovers -that creates recipes mixing random ingredients. -It pulls data from the `Open Food Facts database `_ -and offers a *simple* and *intuitive* API. - -Check out the :doc:`usage` section for further information, including -how to :ref:`installation` the project. - -.. note:: - - This project is under active development. - -Contents --------- - -.. 
toctree:: - - Home - usage - api diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 00000000..52fdb6f7 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,47 @@ +## Setup the environment + +You can install your [standalone conda environment](#standalone-conda-environment) via `yaml`, or on lxplus you can jump directly to [setup](#setup-the-framework) + +### Standalone conda environment +:::{caution} +It is suggested to install under a `bash` environment +::: + +For installing Micromamba, see [[here](https://mamba.readthedocs.io/en/latest/installation/micromamba-installation.html)] +``` +curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" +# Run and follow instructions on screen +bash Miniforge3-$(uname)-$(uname -m).sh + +micromamba activate +``` +NOTE: always make sure that conda, python, and pip point to local micromamba installation (`which conda` etc.). + + +You can simply create the environment through the existing `test_env.yml` under your micromamba environment using micromamba, and activate it +``` +micromamba env create -f test_env.yml + +``` +### Setup the framework + +```bash +# activate environment once you have coffea framework +conda/micromamba activate btv_coffea + +conda/micromamba activate /eos/home-m/milee/miniforge3/envs/btv_coffea + +# only first time, including submodules +git clone git@github.com:cms-btv-pog/BTVNanoCommissioning.git +# Once the environment is set up, compile the python package: +pip install -e . 
+pip install -e .[dev, doc] # for developer +``` + +You can still install additional packages itself by `pip install $PACKAGE` + +`conda/micromamba activate btv_coffea` is required to setup + + +### Other installation options for coffea +See [https://coffeateam.github.io/coffea/installation.html](https://coffeateam.github.io/coffea/installation.html) diff --git a/docs/requirements.in b/docs/requirements.in index acbc25d5..6d2514e1 100644 --- a/docs/requirements.in +++ b/docs/requirements.in @@ -1,2 +1,4 @@ Sphinx>=5,<6 sphinx_rtd_theme +sphinx_copybutton +myst-parser diff --git a/docs/requirements.txt b/docs/requirements.txt index c95df4df..4f40aea5 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,55 +4,54 @@ # # pip-compile docs/requirements.in # -alabaster==0.7.12 +setuptools==70.1.1 +alabaster==1.0.0 # via sphinx -babel==2.10.3 +babel==2.16.0 # via sphinx -certifi==2022.6.15 +certifi==2024.7.4 # via requests -charset-normalizer==2.1.0 +charset-normalizer==3.3.2 # via requests -docutils==0.17.1 +docutils==0.21.2 # via # sphinx # sphinx-rtd-theme -idna==3.3 +idna==3.7 # via requests imagesize==1.4.1 # via sphinx -jinja2==3.1.2 +jinja2==3.1.4 # via sphinx -markupsafe==2.1.1 +markupsafe==2.1.5 # via jinja2 -packaging==21.3 +packaging==24.1 # via sphinx -pygments==2.12.0 +pygments==2.18.0 # via sphinx -pyparsing==3.0.9 +pyparsing==3.1.2 # via packaging -pytz==2022.1 +pytz==2024.1 # via babel -requests==2.28.1 +requests==2.32.3 # via sphinx snowballstemmer==2.2.0 # via sphinx -sphinx==5.0.2 +sphinx==8.1.3 # via # -r docs/requirements.in # sphinx-rtd-theme -sphinx-rtd-theme==1.0.0 - # via -r docs/requirements.in -sphinxcontrib-applehelp==1.0.2 - # via sphinx -sphinxcontrib-devhelp==1.0.2 - # via sphinx -sphinxcontrib-htmlhelp==2.0.0 - # via sphinx +sphinx-copybutton==0.5.2 +sphinx-rtd-theme==3.0.1 +sphinxcontrib-applehelp==2.0.0 +sphinxcontrib-devhelp==2.0.0 +sphinxcontrib-htmlhelp==2.1.0 +sphinxcontrib-jquery==4.1 sphinxcontrib-jsmath==1.0.1 - # 
via sphinx -sphinxcontrib-qthelp==1.0.3 - # via sphinx -sphinxcontrib-serializinghtml==1.1.5 - # via sphinx -urllib3==1.26.9 +sphinxcontrib-qthelp==2.0.0 +sphinxcontrib-serializinghtml==2.0.0 +urllib3==2.2.2 # via requests +myst-parser==4.0.0 +markdown-code-runner==2.1.0 +markdown-it-py==3.0.0 diff --git a/docs/run.md b/docs/run.md new file mode 100644 index 00000000..115e3560 --- /dev/null +++ b/docs/run.md @@ -0,0 +1,2 @@ +## Running the workflows + diff --git a/docs/scaleout.md b/docs/scaleout.md new file mode 100644 index 00000000..4d325b50 --- /dev/null +++ b/docs/scaleout.md @@ -0,0 +1,101 @@ +## Scale-out (Sites & file split scheme) + +Scale out can be notoriously tricky between different sites. Coffea's integration of `slurm` and `dask` +makes this quite a bit easier and for some sites the ``native'' implementation is sufficient, e.g Condor@DESY. +However, some sites have certain restrictions for various reasons, in particular Condor @CERN and @FNAL. The scaleout scheme is named as follows: `$cluster_schedule_system/scheduler/site`. The existing sites are documented in [sites configuration](#sites-configuration-with-daskparsl-schedular) while [standalone condor submission](#standalone-condor-jobslxpluscmsconnect) is possible and strongly suggested when working on lxplus. + +:::{tip} + Here we also provide some file split schemes provided in the framework, the later two schemes can be used together to boost the processing time. **All in one** case is the default any non-lxplus clusters which avoid complications for merging several files in plotting step. **Dataset split** scheme can be used to save intermediate steps while the input json file need to be created by users. **File split** scheme is the default scheme for lxplus cluster(see details in [condor-dask@lxplus](#condorcern-lxplus)) where the file list of each dataset in the json file are split by a certain numbers of file `--fsize` and process sequencially automatically. 
You can also combine both **dataset split** and **file split** to parallelize the job into different processes (note that on lxplus you need to log in to different machines due to the port restriction) with quick jobs. + + + +Knowing the memory usage is also useful when adapting to a cluster. Check the memory by calling `memory_usage_psutil()` from `helpers.func.memory_usage_psutil` to optimize job size. Example with `ectag_Wc_sf` summarized below. + +| Type |Array+Hist | Hist only| Array Only| +| :---: | :---: | :---: | :---: | +DoubleMuon (BTA,BTV_Comm_v2)| 1243MB | 848MB |1249MB| +DoubleMuon (PFCands, BTV_Comm_v1)|1650MB |1274MB |1632MB| +DoubleMuon (Nano_v11)|1183MB| 630MB |1180MB| +WJets_inc (BTA,BTV_Comm_v2)| 1243MB |848MB |1249MB| +WJets_inc (PFCands, BTV_Comm_v1)|1650MB |1274MB |1632MB| +WJets_inc (Nano_v11)|1183MB |630MB |1180MB| + +::: + +### Sites configuration with dask/parsl schedular + +#### Condor@FNAL (CMSLPC) +Follow setup instructions at https://github.com/CoffeaTeam/lpcjobqueue. After starting +the singularity container run with +```bash +python runner.py --wf ttcom --executor dask/lpc +``` + +#### Condor@CERN (lxplus) +Only one port is available per node, so it's possible one has to try different nodes until hitting +one with `8786` being open. Other than that, no additional configurations should be necessary. + +```bash +python runner.py --wf ttcom --executor dask/lxplus +``` + +Jobs are automatically split into 50 files per job to avoid job failures due to the crowded cluster on lxplus, with the naming scheme `hist_$workflow_$json_$dictindex_$fileindex.coffea`. The `.coffea` files can then be combined at plotting level. + + +:::{caution} +The optimal scaleout options on lxplus are `-s 50 --chunk 50000` +::: + +To deal with the unstable condor cluster and dask workers on lxplus, you can resubmit failed jobs via the `--index $dictindex,$fileindex` option. `$dictindex` refers to the index in the `.json dict`, `$fileindex` refers to the index of the file list split to 50 files per dask-worker. 
The total number of file chunks for each dict entry can be computed by `math.ceil(len($filelist)/50)`. The job will start from the corresponding indices. + +#### Coffea-casa (Nebraska AF) +Coffea-casa is a JupyterHub based analysis-facility hosted at Nebraska. For more information and setup instructions see +https://coffea-casa.readthedocs.io/en/latest/cc_user.html + +After setting up and checking out this repository (either via the online terminal or git widget utility), run with +```bash +python runner.py --wf ttcom --executor dask/casa +``` +Authentication is handled automatically via login auth token instead of a proxy. File paths need to replace xrootd redirector with "xcache", `runner.py` does this automatically. + + +#### Condor@DESY +```bash +python runner.py --wf ttcom --executor dask/condor(parsl/condor) +``` + + + + +### Standalone condor jobs@lxplus/cmsconnect + +:::{warning} +Strongly recommended when working on lxplus +::: + +You have the option to run the framework through "standalone condor jobs", bypassing the native coffea-supported job submission system. Within each job you submit, a standalone script will execute the following on the worker node: + + - Set up a minimal required Python environment. + - Retrieve the BTVNanoCommissioning repository, either from a git link or transferred locally. + - Launch the `python runner.py ...` command to execute the coffea framework in the iterative executor mode. + +This utility is currently adapted for the lxplus and cmsconnect condor systems. To generate jobs for launching, replace `python runner.py` with `python condor/submitter.py`, append the existing arguments, and add the following arguments in addition: + + - `--jobName`: Specify the desired condor job name. A dedicated folder will be generated, including all submission-related files. + - `--outputXrootdDir`: Indicate the XRootD directory's path (starting with `root://`) where the produced .coffea (and .root) files from each worker node will be transferred to. 
+ - `--condorFileSize`: Define the number of files to process per condor job (default is 50). The input file list will be divided based on this count. + - `--remoteRepo` (optional, but recommended): Specify the path and branch of the remote repository to download the BTVNanoCommissioning repository. If not specified, the local directory will be packed and transferred as the condor input, potentially leading to higher loads for condor transfers. Use the format e.g. `--remoteRepo 'https://github.com/cms-btv-pog/BTVNanoCommissioning.git -b master'`. + +After executing the command, a new folder will be created, preparing the submission. Follow the on-screen instructions and utilize `condor_submit ...` to submit the jdl file. The output will be transferred to the designated XRootD destination. + +::: {admonition} Frequent issues for standalone condor jobs submission + + + +1. CMS Connect provides a condor interface where one can submit jobs to all resources available in the CMS Global Pool. See [WorkBookCMSConnect Twiki](https://twiki.cern.ch/twiki/bin/view/CMSPublic/WorkBookCMSConnect#Requesting_different_Operative_S) for the instructions if you use it for the first time. +2. The submitted jobs are of the kind which requires a proper setup of the X509 proxy, to use the XRootD service to access and store data. In the generated `.jdl` file, you may see a line configured for this purpose `use_x509userproxy = true`. If you have not submitted jobs of this kind on lxplus condor, we recommend you to add a line + ```bash + export X509_USER_PROXY=$HOME/x509up_u`id -u` + ``` + to `.bashrc` and run it so the proxy file will be stored in your AFS folder instead of in your `/tmp/USERNAME` folder. For submission on cmsconnect, no specific action is required. 
+::: diff --git a/docs/scripts.md b/docs/scripts.md new file mode 100644 index 00000000..e61502ab --- /dev/null +++ b/docs/scripts.md @@ -0,0 +1,185 @@ +## Scripts: prepare input/process output + +This page lists the scripts that can be used for BTV tasks + + +### `fetch.py` : create input json + + +Use `fetch.py` in folder `scripts/` to obtain your samples json files. You can create `$input_list`, which can be a list of datasets taken from CMS DAS or names of datasets (the campaigns then need to be specified explicitly), and create the json that contains `dataset_name:[filelist]`. One can specify the local path in that input list for samples not published in CMS DAS. +`$output_json_name$` is the name of your output samples json file. + +The `--whitelist_sites, --blacklist_sites` options are considered when fetching a dataset if multiple sites are available + + + + + + +### Get Prescale weights + +:::{caution} +Only works if `/cvmfs` is mounted on the system +::: + +Generate prescale weights using `brilcalc` + +```python +python scripts/dump_prescale.py --HLT $HLT --lumi $LUMIMASK +# HLT : put prescaled triggers +# lumi: golden lumi json +``` + + +### Get processed information + +Get the run & luminosity information for the processed events from the coffea output files. When you use `--skipbadfiles`, the submission will ignore files that are not accessible (or time out) via `xrootd`. This script helps you to dump the processed luminosity into a json file which can be calculated by the `brilcalc` tool, and provides a list of failed lumi sections by comparing the original json input to the one from the `.coffea` files. + + +```bash +# all is default, dump lumi and failed files, if run -t lumi only case. no json file need to be specified +python scripts/dump_processed.py -c $COFFEA_FILES -n $OUTPUT_NAME (-j $ORIGINAL_JSON -t [all,lumi,failed]) +``` + + +### Plotting code +#### data/MC comparisons +:exclamation_mark: If using wildcard for input, do not forget the quotation marks! 
(see 2nd example below) + +You can specify `-v all` to plot all the variables in the `coffea` file, or use wildcard options (e.g. `-v "*DeepJet*"` for the input variables containing `DeepJet`) + +:new: non-uniform rebinning is possible, specify the bins with list of edges `--autorebin 50,80,81,82,83,100.5` + +```bash +python scripts/plotdataMC.py -i a.coffea,b.coffea --lumi 41500 -p ttdilep_sf -v z_mass,z_pt +python scripts/plotdataMC.py -i "test*.coffea" --lumi 41500 -p ttdilep_sf -v z_mass,z_pt # with wildcard option need "" +``` + + + +``` + +options: + -h, --help show this help message and exit + --lumi LUMI luminosity in /pb + --com COM sqrt(s) in TeV + -p {ttdilep_sf,ttsemilep_sf,ctag_Wc_sf,ctag_DY_sf,ctag_ttsemilep_sf,ctag_ttdilep_sf}, --phase {dilep_sf,ttsemilep_sf,ctag_Wc_sf,ctag_DY_sf,ctag_ttsemilep_sf,ctag_ttdilep_sf} + which phase space + --log LOG log on y axis + --norm NORM Use for reshape SF, scale to same yield as no SFs case + -v VARIABLE, --variable VARIABLE + variables to plot, splitted by ,. Wildcard option * available as well. Specifying `all` will run through all variables. + --SF make w/, w/o SF comparisons + --ext EXT prefix name + -i INPUT, --input INPUT + input coffea files (str), splitted different files with ','. Wildcard option * available as well. + --autorebin AUTOREBIN + Rebin the plotting variables, input `int` or `list`. int: merge N bins. list of number: rebin edges(non-uniform bin is possible) + --xlabel XLABEL rename the label for x-axis + --ylabel YLABEL rename the label for y-axis + --splitOSSS SPLITOSSS + Only for W+c phase space, split opposite sign(1) and same sign events(-1), if not specified, the combined OS-SS phase space is used + --xrange XRANGE custom x-range, --xrange xmin,xmax + --flow FLOW + str, optional {None, 'show', 'sum'} Whether plot the under/overflow bin. If 'show', add additional under/overflow bin. If 'sum', add the under/overflow bin content to first/last bin. 
+ --split {flavor,sample,sample_flav} + Decomposition of MC samples. Default is split to jet flavor(udsg, pu, c, b), possible to split by group of MC + samples. Combination of jetflavor+ sample split is also possible +``` + + + +### data/data, MC/MC comparisons + +You can specify `-v all` to plot all the variables in the `coffea` file, or use wildcard options (e.g. `-v "*DeepJet*"` for the input variables containing `DeepJet`) +:exclamation_mark: If using wildcard for input, do not forget the quoatation marks! (see 2nd example below) + +```bash +# with merge map, compare ttbar with data +python scripts/comparison.py -i "*.coffea" --mergemap '{"ttbar": ["TTto2L2Nu_TuneCP5_13p6TeV_powheg-pythia8","TTto4Q_TuneCP5_13p6TeV_powheg-pythia8","TTtoLNu2Q_TuneCP5_13p6TeV_powheg-pythia8],"data":["MuonRun2022C-27Jun2023-v1","MuonRun2022D-27Jun2023-v1"]}' -r ttbar -c data -v mu_pt -p ttdilep_sf +# if no mergemap, take the key name directly +python scripts/comparison.py -i datac.coffea,datad.coffea -r MuonRun2022C-27Jun2023-v1 -c MuonRun2022D-27Jun2023-v1 -v mu_pt -p ttdilep_sf + +``` + + + + ``` +options: + -h, --help show this help message and exit + -p {dilep_sf,ttsemilep_sf,ctag_Wc_sf,ctag_DY_sf,ctag_ttsemilep_sf,ctag_ttdilep_sf}, --phase {dilep_sf,ttsemilep_sf,ctag_Wc_sf,ctag_DY_sf,ctag_ttsemilep_sf,ctag_ttdilep_sf} + which phase space + -i INPUT, --input INPUT + input coffea files (str), splitted different files with ','. Wildcard option * available as well. + -r REF, --ref REF referance dataset + -c COMPARED, --compared COMPARED + compared datasets, splitted by , + --sepflav SEPFLAV seperate flavour(b/c/light) + --log log on y axis + -v VARIABLE, --variable VARIABLE + variables to plot, splitted by ,. Wildcard option * available as well. Specifying `all` will run through all variables. + --ext EXT prefix name + --com COM sqrt(s) in TeV + --mergemap MERGEMAP + Group list of sample(keys in coffea) as reference/compare set as dictionary format. 
Keys would be the new lables of the group + --autorebin AUTOREBIN + Rebin the plotting variables, input `int` or `list`. int: merge N bins. list of number: rebin edges(non-uniform bin is possible) + --xlabel XLABEL rename the label for x-axis + --ylabel YLABEL rename the label for y-axis + --norm compare shape, normalized yield to reference + --xrange XRANGE custom x-range, --xrange xmin,xmax + --flow FLOW + str, optional {None, 'show', 'sum'} Whether plot the under/overflow bin. If 'show', add additional under/overflow bin. If 'sum', add the under/overflow bin content to first/last bin. +``` + + + + +### ROCs & efficiency plots + +Extract the ROCs for different tagger and efficiencies from validation workflow + +```python +python scripts/validation_plot.py -i $INPUT_COFFEA -v $VERSION +``` + +### Store histograms from coffea file + +Use `scripts/make_template.py` to dump 1D/2D histogram from `.coffea` to `TH1D/TH2D` with hist. MC histograms can be reweighted to according to luminosity value given via `--lumi`. 
You can also merge several files + +```python +python scripts/make_template.py -i "testfile/*.coffea" --lumi 7650 -o test.root -v mujet_pt -a '{"flav":0,"osss":"sum"}' +``` + + + +``` + -i INPUT, --input INPUT + Input coffea file(s) + -v VARIABLE, --variable VARIABLE + Variables to store (histogram name) + -a AXIS, --axis AXIS dict, put the slicing of histogram, specify 'sum' option as string + --lumi LUMI Luminosity in /pb + -o OUTPUT, --output OUTPUT + output root file name + --mergemap MERGEMAP Specify mergemap as dict, '{merge1:[dataset1,dataset2]...}' Also works with the json file with dict +``` + + + + + + + +```json +{ + "WJets": ["WJetsToLNu_TuneCP5_13p6TeV-madgraphMLM-pythia8"], + "VV": [ "WW_TuneCP5_13p6TeV-pythia8", "WZ_TuneCP5_13p6TeV-pythia8", "ZZ_TuneCP5_13p6TeV-pythia8"], + "TT": [ "TTTo2J1L1Nu_CP5_13p6TeV_powheg-pythia8", "TTTo2L2Nu_CP5_13p6TeV_powheg-pythia8"], + "ST":[ "TBbarQ_t-channel_4FS_CP5_13p6TeV_powheg-madspin-pythia8", "TbarWplus_DR_AtLeastOneLepton_CP5_13p6TeV_powheg-pythia8", "TbarBQ_t-channel_4FS_CP5_13p6TeV_powheg-madspin-pythia8", "TWminus_DR_AtLeastOneLepton_CP5_13p6TeV_powheg-pythia8"], +"data":[ "Muon_Run2022C-PromptReco-v1", "SingleMuon_Run2022C-PromptReco-v1", "Muon_Run2022D-PromptReco-v1", "Muon_Run2022D-PromptReco-v2"] +} +``` + + + diff --git a/docs/structure.md b/docs/structure.md new file mode 100644 index 00000000..1da65eaa --- /dev/null +++ b/docs/structure.md @@ -0,0 +1,78 @@ +## Structure of the framework + + + +The main ingredients of the framework are wrapped in `src/` directories with supporting directories in the root path. + +### `src/data` : custom corrections + +Stores the customized corrections used in analysis, e.g. jet probability calibration, custom scale factors, etc. It has a structure similar to [`jsonpog-integration`](https://gitlab.cern.ch/cms-nanoAOD/jsonpog-integration/), split by POGs and corrections. 
+| Type | File type | Comments| +| :---: | :---: | :---: | +| `lumiMasks` |`.json` | Masked good lumi-section used for physics analysis| +| `Prescales` | `.json` | HLT paths for prescaled triggers| +| `PU` | `.pkl.gz` or `.histo.root` | Pileup reweight files, matched MC to data| +| `LSF` | `.histo.root` | Lepton ID/Iso/Reco/Trigger SFs| +| `BTV` | `.csv` or `.root` | b-tagger, c-tagger SFs| +| `JME` | `.txt` | JER, JEC files| +| `JPCalib` | `.root` | Jet probability calibration, used in LTSV methods| + +### `src/utils`: configurations of frameworks + +- `histogrammer.py`: collections of histograms & histogram writer +- `selection.py`: collections of common selections +- `correction.py`: `coffea` corrections used in analysis +- `sample.py`: refined MC sample list for each workflow +- `AK4_parameters.py`: correction, lumi configuration for each campaign +- `plot_utils.py`: plotting utilities +- `array_writer.py`: write out events into root file + +### `src/helpers`: functionalities of the framework + +- `xsection(_13TeV).py`: xsection dictionary +- `particle*.csv`: particle mass info +- `defintions.py`: definitions of histogram name +- `BTA_helper.py`: special tools for BTA workflow +- `func.py`: useful functionality +- `update_branch.py`: update missing branch (tagger likelihood ratio) +- `cTagSFReader.py`(deprecated): csv reader of cSF + +### `src/workflows`: collections of workflow + +Collections of different selections used in commissioning and scale factor. Find the detail in [workflow section](./wf.md). + + +### `runner.py`: running coffea jobs +### `condor/`: standalone submission + +standalone condor submission script with futures executor. See the [details](scaleout.md#standalone-condor-jobslxpluscmsconnect) + +### `scripts`: plotting, post-processing, all in one scripts + +- `fetch.py`: obtain dataset json file +- `suball.py`: all in one script to run commissioning/ quick data MC check.... 
+- Output post-processing + - `dump_prescale.py`: dump prescale info by `brilcalc` + - `dump_processed.py`: dump processed info from output coffea file: lumi & processed files + - `make_template.py`: convert coffea to ROOT hist + - `do_hadd.py`: hadd processed root file + - `missingFiles.py`: **for customized use** check for missing files that were not included and recreate new submission scripts + +- Plotting scripts: + - `comparison.py`: data/data, MC/MC comparison + - `plotdataMC.py`: data/MC comparison + - `validation_plot.py`: plot ROC curve & efficiency + - `2Dhistograms.py`: plot 2D histogram from root file + - `correction_plot.py`: plot correlation matrix from root file + +### `metadata`: collections of dataset json files + +Collections of json files for different campaigns. The directory is split by campaign name. + +### `testfile`: example coffea files + + + + + + diff --git a/docs/usage.rst b/docs/usage.rst deleted file mode 100644 index 924afcf6..00000000 --- a/docs/usage.rst +++ /dev/null @@ -1,34 +0,0 @@ -Usage -===== - -.. _installation: - -Installation ------------- - -To use Lumache, first install it using pip: - -.. code-block:: console - - (.venv) $ pip install lumache - -Creating recipes ----------------- - -To retrieve a list of random ingredients, -you can use the ``lumache.get_random_ingredients()`` function: - -.. autofunction:: lumache.get_random_ingredients - -The ``kind`` parameter should be either ``"meat"``, ``"fish"``, -or ``"veggies"``. Otherwise, :py:func:`lumache.get_random_ingredients` -will raise an exception. - -.. 
autoexception:: lumache.InvalidKindError - -For example: - ->>> import lumache ->>> lumache.get_random_ingredients() -['shells', 'gorgonzola', 'parsley'] - diff --git a/docs/user.md b/docs/user.md new file mode 100644 index 00000000..d9a9943d --- /dev/null +++ b/docs/user.md @@ -0,0 +1,387 @@ +# For user : Running the workflows + +![structure](figs/comm_wf.png) + + +All in one script : `scripts/suball.py` + +All the steps are summarized in the [`suball.py`](#all-in-one-script--scriptssuballpy) scripts for the existing workflows. You can simply just run + +```python +python scripts/suball.py --scheme ${SCHEME_FOR_STUDY} --campaign ${CAMPAIGN_FOR_SF} --year ${YEAR} --DAS_campaign "$DATA_CAMPAIGN_RGX,$MC_CAMPAIGN_RGX" +#Example with 2023 Summer23 campaign +python scripts/suball.py --scheme default_comissioning --campaign Summer23 --year 2023 --DAS_campaign "*Run2023D*Sep2023*,*Run3Summer23BPixNanoAODv12-130X*" +``` +This wrap up the steps mentioned above as a streamline to obtained the required info + +The only missing item need to do manually is to change the updated correction in `AK4_parameter.py` as written [here](#correction-files-configurations) +Each steps are also explained in detailed below, this can be obtain by + +## 0. Make the dataset json files + +Use `fetch.py` in folder `scripts/` to obtain your samples json files for the predefined workflow with the refined MC. For more flexible usage please find [details](scripts.md#fetchpy--create-input-json) + +The fetch script reads the predefine data & MC samples dataset name and output the json file to `metadata/$CAMPAIGN/`, but to find the exact dataset for BTV studies, we usually need to specify the `DAS_campaign` + +``` +python scripts/fetch.py -c {campaign} --year {args.year} --from_workflow {wf} --DAS_campaign {DAS_campaign} {overwrite} +# campaign : the campaign name like Summer23,Winter22 +# year : data taking years 2022/2023... 
+# wf: workflow name like ttdilep_sf, ctag_Wc_sf +# DAS_campaign: Input the campaign name for DAS to search appropriate campaigns, use in dataset construction , please do `data_campaign,mc_campaign` split by `,`, e.g. `*Run2023D*Sep2023*,*Run3Summer23BPixNanoAODv12-130X*` +# overwrite (bool): recreate the existing json file +``` + + +:::{caution} +Do not make the file list greater than 4k files to avoid scaleout issues at various sites (file open limit) +::: + +:::{tip} +If `gfal-ls` does not work on your machine, reset the gfal-python with `GFAL_PYTHONBIN=/usr/bin/python3` +::: + +## 1. Correction files configurations & add new correction files (Optional) + +If the correction files are not supported yet by jsonpog-integration, you can still try with custom input data. + +All the `lumiMask`, correction files (SFs, pileup weight), and JEC, JER files are under `BTVNanoCommissioning/src/data/` following the substructure `${type}/${year}_${campaign}/${files}`(except `lumiMasks` and `Prescales`) + +| Type | File type | Comments| +| :---: | :---: | :---: | +| `lumiMasks` |`.json` | Masked good lumi-section used for physics analysis| +| `Prescales` | `.json` | HLT paths for prescaled triggers| +| `PU` | `.pkl.gz` or `.histo.root` | Pileup reweight files, matched MC to data| +| `LSF` | `.histo.root` | Lepton ID/Iso/Reco/Trigger SFs| +| `BTV` | `.csv` or `.root` | b-tagger, c-tagger SFs| +| `JME` | `.txt` | JER, JEC files| +| `JPCalib` | `.root` | Jet probability calibration, used in LTSV methods| + +Create a `dict` entry under `correction_config` with dedicated campaigns in `BTVNanoCommissioning/src/utils/AK4_parameters.py`. + + + + +The official correction files collected in [jsonpog-integration](https://gitlab.cern.ch/cms-nanoAOD/jsonpog-integration) are updated by POG, except `lumiMask` and `JME`, which are still updated by the BTVNanoCommissioning framework user/developer. 
For centrally maintained correction files, no input files have to be defined anymore in the `correction_config`. The example to implemented new corrections from POG can be found in [git](https://gitlab.cern.ch/cms-nanoAOD/jsonpog-integration/-/blob/master/examples/), and the contents of the correction files are in the [summary](https://cms-nanoaod-integration.web.cern.ch/commonJSONSFs/) + + + ```python + "2017_UL": { + # Same with custom config + "lumiMask": "Cert_294927-306462_13TeV_UL2017_Collisions17_MuonJSON.json", + + "JME": { + "MC": "Summer19UL17_V5_MC", + "Run2017F": "Summer19UL17_RunF_V5_DATA", + }, + ### Alternatively, take the txt files in https://github.com/cms-jet/JECDatabase/tree/master/textFiles + "JME": { + # specified the name of JEC + "name": "V1_AK4PFPuppi", + # dictionary of jec text files + "MC": [ + "Summer23Prompt23_V1_MC_L1FastJet_AK4PFPuppi", + "Summer23Prompt23_V1_MC_L2Relative_AK4PFPuppi", + "Summer23Prompt23_V1_MC_L2Residual_AK4PFPuppi", + "Summer23Prompt23_V1_MC_L3Absolute_AK4PFPuppi", + "Summer23Prompt23_V1_MC_UncertaintySources_AK4PFPuppi", + "Summer23Prompt23_V1_MC_Uncertainty_AK4PFPuppi", + "Summer23Prompt23_JRV1_MC_SF_AK4PFPuppi", + "Summer23Prompt23_JRV1_MC_PtResolution_AK4PFPuppi", + ], + "dataCv123": [ + "Summer23Prompt23_RunCv123_V1_DATA_L1FastJet_AK4PFPuppi", + "Summer23Prompt23_RunCv123_V1_DATA_L2Relative_AK4PFPuppi", + "Summer23Prompt23_RunCv123_V1_DATA_L3Absolute_AK4PFPuppi", + "Summer23Prompt23_RunCv123_V1_DATA_L2L3Residual_AK4PFPuppi", + ], + "dataCv4": [ + "Summer23Prompt23_RunCv4_V1_DATA_L1FastJet_AK4PFPuppi", + "Summer23Prompt23_RunCv4_V1_DATA_L2Relative_AK4PFPuppi", + "Summer23Prompt23_RunCv4_V1_DATA_L3Absolute_AK4PFPuppi", + "Summer23Prompt23_RunCv4_V1_DATA_L2L3Residual_AK4PFPuppi", + ], + }, + ### + # no config need to be specify for PU weights + "PU": None, + # Alternatively, take root file as input + "PU": "puwei_Summer23.histo.root", + # Btag SFs - specify $TAGGER : $TYPE-> find [$TAGGER_$TYPE] in json file + 
"BTV": {"deepCSV": "shape", "deepJet": "shape"}, + "roccor": None, + # JMAR, IDs from JME- Following the scheme: "${SF_name}": "${WP}" + "JMAR": {"PUJetID_eff": "L"}, + "LSF": { + # Electron SF - Following the scheme: "${SF_name} ${year}": "${WP}" + # https://github.com/cms-egamma/cms-egamma-docs/blob/master/docs/EgammaSFJSON.md + "ele_ID 2017": "wp90iso", + "ele_Reco 2017": "RecoAbove20", + + # Muon SF - Following the scheme: "${SF_name} ${year}": "${WP}" + + "mu_Reco 2017_UL": "NUM_TrackerMuons_DEN_genTracks", + "mu_HLT 2017_UL": "NUM_IsoMu27_DEN_CutBasedIdTight_and_PFIsoTight", + "mu_ID 2017_UL": "NUM_TightID_DEN_TrackerMuons", + "mu_Iso 2017_UL": "NUM_TightRelIso_DEN_TightIDandIPCut", + }, + # use for BTA production, jet probablity + "JPCalib": { + "Run2022E": "calibeHistoWrite_Data2022F_NANO130X_v1.root", + "Run2022F": "calibeHistoWrite_Data2022F_NANO130X_v1.root", + "Run2022G": "calibeHistoWrite_Data2022G_NANO130X_v1.root", + "MC": "calibeHistoWrite_MC2022EE_NANO130X_v1.root", + }, + }, + ``` + + + + +## 2. Run the workflow to get coffea files + +The `runner.py` handles the options to select the workflow with dedicated configuration for each campaign. The miniumum required info is +``` +python runner.py --wf {wf} --json metadata/{args.campaign}/{types}_{args.campaign}_{args.year}_{wf}.json {overwrite} --campaign {args.campaign} --year {args.year} +``` + +:::{tip} +- In case just to test your program, you can limit only one file with one chunk using iterative executor to avoid overwriting error message by `--max 1 --limit 1 --executor iterative` +- In case you only want to run particular sample in your json `--only $dataset_name`, i.e. 
`--only TT*` or `--only MuonEG_Run2023A` +- Change the number of scale-out jobs by `-s $NJOB` +- Store the arrays by setting the flag `--isArray` +- Modify the chunk size in case the jobs are too big `--chunk $N_EVENTS_PER_CHUNK` +- Sometimes the global redirector is insufficient, you can increase the number of retries (only in parsl/dask) `--retries 30`, or skip the files `--skipbadfiles` and later reprocess the missing info by creating the json with skipped files. Methods to create the json files are discussed in the next part. +::: + +Details of the other options can be found here + +
runner options +

+ +```python +### ====> REQUIRED <======= +# --wf {validation,ttdilep_sf,ttsemilep_sf,c_ttsemilep_sf,emctag_ttdilep_sf,ctag_ttdilep_sf,ectag_ttdilep_sf,ctag_ttsemilep_sf,ectag_ttsemilep_sf,QCD_sf,QCD_smu_sf,ctag_Wc_sf,ectag_Wc_sf,ctag_DY_sf,ectag_DY_sf,BTA,BTA_addPFMuons,BTA_addAllTracks,BTA_ttbar}, --workflow {validation,ttdilep_sf,ttsemilep_sf,c_ttsemilep_sf,emctag_ttdilep_sf,ctag_ttdilep_sf,ectag_ttdilep_sf,ctag_ttsemilep_sf,ectag_ttsemilep_sf,QCD_sf,QCD_smu_sf,ctag_Wc_sf,ectag_Wc_sf,ctag_DY_sf,ectag_DY_sf,BTA,BTA_addPFMuons,BTA_addAllTracks,BTA_ttbar} +# Which processor to run +# -o OUTPUT, --output OUTPUT +# Output histogram filename (default: hists.coffea) +# --json SAMPLEJSON JSON file containing dataset and file locations (default: dummy_samples.json) +# --year YEAR Year +# --campaign CAMPAIGN Dataset campaign, change the corresponding correction files + +#=======Optional====== +# --isSyst {False,all,weight_only,JERC_split,JP_MC} +# Run with systematics, all, weights_only(no JERC uncertainties included),JERC_split, None +# --isArray Output root files +# --noHist Not output coffea histogram +# --overwrite Overwrite existing files + +# --executor {iterative,futures,parsl/slurm,parsl/condor,parsl/condor/naf_lite,dask/condor,dask/condor/brux,dask/slurm,dask/lpc,dask/lxplus,dask/casa,condor_standalone} +# The type of executor to use (default: futures). Other options can be implemented. 
For example see https://parsl.readthedocs.io/en/stable/userguide/configuring.html- +# `parsl/slurm` - tested at DESY/Maxwell- `parsl/condor` - tested at DESY, RWTH- `parsl/condor/naf_lite` - tested at DESY- `dask/condor/brux` - tested at BRUX (Brown U)- +# `dask/slurm` - tested at DESY/Maxwell- `dask/condor` - tested at DESY, RWTH- `dask/lpc` - custom lpc/condor setup (due to write access restrictions)- `dask/lxplus` - custom +# lxplus/condor setup (due to port restrictions) +# -j WORKERS, --workers WORKERS +# Number of workers (cores/threads) to use for multi-worker executors (e.g. futures or condor) (default: 3) +# -s SCALEOUT, --scaleout SCALEOUT +# Number of nodes to scale out to if using slurm/condor. Total number of concurrent threads is ``workers x scaleout`` (default: 6) +# --memory MEMORY Memory used in jobs default ``(default: 4.0) +# --disk DISK Disk used in jobs default ``(default: 4) +# --voms VOMS Path to voms proxy, made accessible to worker nodes. By default a copy will be made to $HOME. +# --chunk N Number of events per process chunk +# --retries N Number of retries for coffea processor +# --fsize FSIZE (Specific for dask/lxplus file splitting, default: 50) Numbers of files processed per dask-worker +# --index INDEX (Specific for dask/lxplus file splitting, default: 0,0) Format: $dict_index_start,$file_index_start,$dict_index_stop,$file_index_stop. Stop indices are optional. $dict_index +# refers to the index, splitted $dict_index and $file_index with ','$dict_index refers to the sample dictionary of the samples json file. $file_index refers to the N-th batch +# of files per dask-worker, with its size being defined by the option --index. The job will start (stop) submission from (with) the corresponding indices. +# --validate Do not process, just check all files are accessible +# --skipbadfiles Skip bad files. 
+# --only ONLY Only process specific dataset or file +# --limit N Limit to the first N files of each dataset in sample JSON +# --max N Max number of chunks to run in total +``` + +

+
+ + +## 3. Dump processed information to obtain luminoisty and processed files + +After obtaining the `coffea` file, we can check the processed files and obtain the luminosity. + +Get the run & luminosity information for the processed events from the coffea output files. When you use `--skipbadfiles`, the submission will ignore files not accessible (or timed out) by xrootd. This script helps you to dump the processed luminosity into a json file which can be calculated by brilcalc tool and provide a list of failed lumi sections by comparing the original json input to the one from the `.coffea` files. +We will see the luminosity info in `/pb` and the skipped files as new json for resubmission. + + +```bash +python scripts/dump_processed.py -t all -c INPUT_COFFEA --json ORIGINAL_JSON_INPUT -n {args.campaign}_{args.year}_{wf} +# -t {all,lumi,failed}, --type {all,lumi,failed} +# Choose the function for dump luminosity(`lumi`)/failed files(`failed`) into json +# -c COFFEA, --coffea COFFEA +# Processed coffea files, splitted by ,. Wildcard option * available as well. +# -n FNAME, --fname FNAME +# Output name of jsons(with _lumi/_dataset) +# -j JSONS, --jsons JSONS +# Original input json files, splitted by ,. Wildcard option * available as well. +``` + +## 4. Obtain data/MC plots + +We can obtain data/MC plots from coffea via the plotting scripts: + +You can specify `-v all` to plot all the variables in the `coffea` file, or use wildcard options (e.g. `-v "*DeepJet*"` for the input variables containing `DeepJet`) + +:new: non-uniform rebinning is possible, specify the bins with list of edges `--autorebin 50,80,81,82,83,100.5` + +```bash +python scripts/plotdataMC.py -i a.coffea,b.coffea --lumi 41500 -p ttdilep_sf -v z_mass,z_pt +python scripts/plotdataMC.py -i "test*.coffea" --lumi 41500 -p ttdilep_sf -v z_mass,z_pt # with wildcard option need "" + +``` + +
options +

+``` +options: + -h, --help show this help message and exit + --lumi LUMI luminosity in /pb + --com COM sqrt(s) in TeV + -p {ttdilep_sf,ttsemilep_sf,ctag_Wc_sf,ctag_DY_sf,ctag_ttsemilep_sf,ctag_ttdilep_sf}, --phase {dilep_sf,ttsemilep_sf,ctag_Wc_sf,ctag_DY_sf,ctag_ttsemilep_sf,ctag_ttdilep_sf} + which phase space + --log LOG log on y axis + --norm NORM Use for reshape SF, scale to same yield as no SFs case + -v VARIABLE, --variable VARIABLE + variables to plot, splitted by ,. Wildcard option * available as well. Specifying `all` will run through all variables. + --SF make w/, w/o SF comparisons + --ext EXT prefix name + -i INPUT, --input INPUT + input coffea files (str), splitted different files with ','. Wildcard option * available as well. + --autorebin AUTOREBIN + Rebin the plotting variables, input `int` or `list`. int: merge N bins. list of number: rebin edges(non-uniform bin is possible) + --xlabel XLABEL rename the label for x-axis + --ylabel YLABEL rename the label for y-axis + --splitOSSS SPLITOSSS + Only for W+c phase space, split opposite sign(1) and same sign events(-1), if not specified, the combined OS-SS phase space is used + --xrange XRANGE custom x-range, --xrange xmin,xmax + --flow FLOW + str, optional {None, 'show', 'sum'} Whether plot the under/overflow bin. If 'show', add additional under/overflow bin. If 'sum', add the under/overflow bin content to first/last bin. + --split {flavor,sample,sample_flav} + Decomposition of MC samples. Default is split to jet flavor(udsg, pu, c, b), possible to split by group of MC + samples. Combination of jetflavor+ sample split is also possible +``` + +

+
+ + + +## Reading coffea `hist` + + +Quick tutorial to go through coffea files + + +### Structure of the file + +The coffea contains histograms wrapped in a dictionary with `$dataset:{$histname:hist}`, the `hist` is the object using +[hist](https://hist.readthedocs.io/en/latest/) which allows multidimensional bins with different types of array +```python +{'WW_TuneCP5_13p6TeV-pythia8':{ +'btagDeepFlavB_b_0': Hist( + IntCategory([0, 1, 4, 5, 6], name='flav', label='Genflavour'), + IntCategory([1, -1], name='osss', label='OS(+)/SS(-)'), + StrCategory(['noSF'], growth=True, name='syst'), + Regular(30, -0.2, 1, name='discr', label='btagDeepFlavB_b'), + storage=Weight()) # Sum: WeightedSum(value=140, variance=140), 'btagDeepFlavB_bb_0': Hist( + IntCategory([0, 1, 4, 5, 6], name='flav', label='Genflavour'), + IntCategory([1, -1], name='osss', label='OS(+)/SS(-)'), + StrCategory(['noSF'], growth=True, name='syst'), + Regular(30, -0.2, 1, name='discr', label='btagDeepFlavB_bb'), + storage=Weight()) # Sum: WeightedSum(value=140, variance=140), 'btagDeepFlavB_lepb_0': Hist( + IntCategory([0, 1, 4, 5, 6], name='flav', label='Genflavour'), + IntCategory([1, -1], name='osss', label='OS(+)/SS(-)'), + StrCategory(['noSF'], growth=True, name='syst'), + Regular(30, -0.2, 1, name='discr', label='btagDeepFlavB_lepb'), + storage=Weight()) # Sum: WeightedSum(value=140, variance=140)}} +``` +There are also `column_array` stores the processed file and lumi/run info in each dataset for data. The information are used in [dump_processed info](user.md#3-dump-processed-information-to-obtain-luminoisty-and-processed-files) + + + +The histogram is a multidimentinal histogram, with all the axis listed +```python +Hist( + IntCategory([0, 1, 4, 5, 6], name='flav', label='Genflavour'),# different genflavor, 0 for light, 1 for PU, 2 for c, 3 for b. Always 0 for data. 
+ IntCategory([1, -1], name='osss', label='OS(+)/SS(-)'),# opposite sign or same sign, only appears in W+c workflow + StrCategory(['noSF','PUUp','PUDown'], growth=True, name='syst'),# systematics variations, + Regular(30, -0.2, 1, name='discr', label='btagDeepFlavB_lepb'),# discriminator distribution, the last axis is always the variable + storage=Weight()) # Sum: WeightedSum(value=140, variance=140)# Value is sum of the entries, Variances is sum of the variances. +``` + +### Read coffea files and explore the histogram + +```python +from coffea.util import load +# open coffea file +output=load("hists_ctag_Wc_sf_VV.coffea") +# get the histogram and read the info +hist=output['WW_TuneCP5_13p6TeV-pythia8']['btagDeepFlavB_lepb_0'] +# addition for two histogram is possible if the axis is the same +histvv=output['WW_TuneCP5_13p6TeV-pythia8']['btagDeepFlavB_lepb_0']+ + output['WZ_TuneCP5_13p6TeV-pythia8']['btagDeepFlavB_lepb_0']+ + output['ZZ_TuneCP5_13p6TeV-pythia8']['btagDeepFlavB_lepb_0'] +# To get 1D histogram, we need to reduce the dimention +# we can specify the axis we want to read, e.g. read charm jet, opposite sign events with noSF +axis={'flav':3,'os':0,'syst':'noSF'} +hist1d=hist[axis] #--> this is the 1D histogram Hist +# you can also sum over the axis, e.g. 
here shows no jet flavor split and sum os+ss +axis={'flav':sum,'os':sum,'syst':'noSF'} +# rebin the axis is also possible, rebin the discrimnator by merged two bins into one +axis={'flav':sum,'os':sum,'syst':'noSF','discr':hist.rebin(2)} +``` + +### Plot the histogram +You can simply plot the histogram using [mplhep](https://mplhep.readthedocs.io/en/latest/) +```python +import mplhep as hep +import matplotlib.pyplot as plt +# set the plot style like tdr style +plt.style.use(hep.style.ROOT) +# make 1D histogram plot +hep.histplot(hist1D) +``` +### convert coffea hist to ROOT TH1 + + `scripts/make_template.py` does the work to convert the coffea hist into 1D/2D ROOT histogram: +``` +python scripts/make_template.py -i "testfile/*.coffea" --lumi 7650 -o test.root -v mujet_pt -a '{"flav":0,"osss":"sum"}' --mergemap +`` + -h, --help show this help message and exit + -i INPUT, --input INPUT + Input coffea file(s), can be a regular expression contains '*' + -v VARIABLE, --variable VARIABLE + Variables to store(histogram name) + -a AXIS, --axis AXIS dict, put the slicing of histogram, specify 'sum' option as string + --lumi LUMI Luminosity in /pb , normalize the MC yields to corresponding luminosity + -o OUTPUT, --output OUTPUT + output root file name + --mergemap MERGEMAP Specify mergemap as dict, '{merge1:[dataset1,dataset2]...}' Also works with the json file with dict +``` + + +:::tip{merge map} +```python +{ + "WJets": ["WJetsToLNu_TuneCP5_13p6TeV-madgraphMLM-pythia8"], + "VV": [ "WW_TuneCP5_13p6TeV-pythia8", "WZ_TuneCP5_13p6TeV-pythia8", "ZZ_TuneCP5_13p6TeV-pythia8"], + "TT": [ "TTTo2J1L1Nu_CP5_13p6TeV_powheg-pythia8", "TTTo2L2Nu_CP5_13p6TeV_powheg-pythia8"], + "ST":[ "TBbarQ_t-channel_4FS_CP5_13p6TeV_powheg-madspin-pythia8", "TbarWplus_DR_AtLeastOneLepton_CP5_13p6TeV_powheg-pythia8", "TbarBQ_t-channel_4FS_CP5_13p6TeV_powheg-madspin-pythia8", "TWminus_DR_AtLeastOneLepton_CP5_13p6TeV_powheg-pythia8"], +"data":[ "Muon_Run2022C-PromptReco-v1", 
"SingleMuon_Run2022C-PromptReco-v1", "Muon_Run2022D-PromptReco-v1", "Muon_Run2022D-PromptReco-v2"] +} +``` + +::: + + + + diff --git a/docs/wf.md b/docs/wf.md new file mode 100644 index 00000000..04ed43b3 --- /dev/null +++ b/docs/wf.md @@ -0,0 +1,64 @@ + +## Selections for different phase spaces + +The available workflows are summarized here, with dedicated selections. + +The goal is to unified the selections among SFs team and commissioning workflows. +Selections use for SFs also used in commissioning, the strucutre is summarized in the figure here +![SFs](figs/sf_new.png) + + +### Dileptonic $t\bar{t}$ phase space: b-tag SFs + +- `ttdilep_sf`:check performance for btag SFs, e$\mu$ selections + +- `ctag_ttdilep_sf`, `ectag_ttdilep_sf`,`emctag_ttdilep_sf` : ttdilep selections with soft-muon + +- `BTA_ttbar`: selections used in kinematic methods +### Semileptonic $t\bar{t}$ phase space: b-tag SFs/c-tag SFs +- `ttsemilep_sf`: tt semileptonic selection, used in commissioning +- `c_ttsemilep_sf`: tag c jet on top of ttsemileptonic selections +- `ctag_ttsemilep_sf`, `ectag_ttsemilep_sf`: tt semileptonic selections with soft-muon, same as W+c, higher jet multiplicty + +### QCD muon enriched phase space: b-tag SFs +- `QCD_smu_sf` : QCD selections with soft muon included, enriched b-jet + +### W+c phase space : c-SFs +- `ctag_Wc_sf,ectag_Wc_sf`: check performance for charm SFs, c-jets enriched SFs, used in commissioning & iter-cSF +- `ctag_Wc_WP_sf,ectag_Wc_WP_sf`: WP base charm selections, used in commissioning & WP-cSF + +### Z+jets phase space: light mis-tag rate +- `ctag_DY_sf, ectag_DY_sf`: Z+jets selections. Use in commissioning & iter-cSF + +### QCD phase space: light mis-tag rate +- `QCD_sf`: select QCD events for light mis-tag rate. 
+ + + + +### BTA - BTagAnalyzer Ntuple producer (deprecated) + +Based on Congqiao's [development](https://github.com/cms-btv-pog/BTVNanoCommissioning/blob/master/notebooks/BTA_array_producer.ipynb) to produce BTA ntuples based on PFNano. + +:::{caution} +Only the newest version [BTV_Run3_2022_Comm_MINIAODv4](https://github.com/cms-btv-pog/btvnano-prod) ntuples work. Example files are given in [this](https://github.com/cms-btv-pog/BTVNanoCommissioning/blob/master/metadata/test_bta_run3.json) json. Optimize the chunksize(`--chunk`) in terms of the memory usage. This depends on the sample, e.g. if the sample has a huge jet collection or many b/c hadrons. The more info you store, the more memory you need. I would suggest testing with `iterative` to estimate the size. +::: + + +Run with the nominal `BTA` workflow to include the basic event variables, jet observables, and GEN-level quarks, hadrons, leptons, and V0 variables. +``` +python runner.py --wf BTA --json metadata/test_bta_run3.json --campaign Summer22EERun3 --isJERC +``` + +Run with the `BTA_addPFMuons` workflow to additionally include the `PFMuon` and `TrkInc` collection, used by the b-tag SF derivation with the QCD(μ) methods. +``` +python runner.py --wf BTA_addPFMuons --json metadata/test_bta_run3.json --campaign Summer22EERun3 --isJERC +``` + +Run with the `BTA_addAllTracks` workflow to additionally include the `Tracks` collection, used by the JP variable calibration. 
+``` +python runner.py --wf BTA_addAllTracks --json metadata/test_bta_run3.json --campaign Summer22EERun3 --isJERC +``` + diff --git a/scripts/fetch.py b/scripts/fetch.py index bb217123..699c65fe 100644 --- a/scripts/fetch.py +++ b/scripts/fetch.py @@ -106,6 +106,7 @@ "--campaign", help="campaign name (same as the campaign in runner.py)", default=None, + require=True, type=str, ) parser.add_argument("--year", help="year", default=None, type=str) @@ -560,10 +561,12 @@ def main(args): json.dump(reduced_fdict, fp, indent=4) else: - output_file = args.output - with open(output_file, "w") as fp: + os.system(f"mkdir -p metadata/{args.campaign}/") + with open(f"metadata/{args.campaign}/{args.output}", "w") as fp: json.dump(fdict, fp, indent=4) - print("The file is saved at: ", output_file) + print( + "The file is saved at: metadata/", {args.campaign}, "/", {args.output} + ) if __name__ == "__main__": diff --git a/scripts/suball.py b/scripts/suball.py index c6280f75..362591b1 100644 --- a/scripts/suball.py +++ b/scripts/suball.py @@ -72,7 +72,7 @@ def get_lumi_from_web(year): "-dc", "--DAS_campaign", required=True, - help="Input the campaign name for DAS to search appropriate campaigns, use in dataset construction , please do `data_camapgin,mc_campaign` split by `,`", + help="Input the campaign name for DAS to search appropriate campaigns, use in dataset construction , please do `data_camapgin,mc_campaign` split by `,`, e.g. 
`*Run2023D*Sep2023*,*Run3Summer23BPixNanoAODv12-130X*` ", ) parser.add_argument("-v", "--version", default="", help="version postfix") parser.add_argument( @@ -88,7 +88,7 @@ def get_lumi_from_web(year): args = parser.parse_args() # summarize diffeerent group for study - workflow_group = { + scheme = { # scale factor workflows "SF": ["BTA_ttbar", "BTA_addPFMuons"], # Use for prompt data MC checks for analysis @@ -100,11 +100,11 @@ def get_lumi_from_web(year): "ctag_Wc_sf", "ctag_DY_sf", "QCD_sf", - # "QCD_mu_sf" + "QCD_mu_sf", ], } if args.scheme in workflows.keys(): - workflow_group["test"] = [args.scheme] + scheme["test"] = [args.scheme] args.scheme = "test" # Check lumiMask exists and replace the Validation input_lumi_json = correction_config[args.campaign]["lumiMask"] @@ -123,7 +123,7 @@ def get_lumi_from_web(year): ) print(f"======>{input_lumi_json} is used for {args.year}") - for wf in workflow_group[args.scheme]: + for wf in scheme[args.scheme]: if args.debug: print(f"======{wf} in {args.scheme}=====") overwrite = "--overwrite" if args.overwrite else "" diff --git a/setup.cfg b/setup.cfg index 42393f51..9acc5a28 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,22 +33,20 @@ project_urls = [options] packages = find: -install_requires = +install_requires = + p_tqdm vector coffea==0.7.22 - python_requires = <3.11 include_package_data = True -scripts = - runner.py - scripts/suball.py - scripts/plotdataMC.py - scripts/comparison.py - scripts/fetch.py - scripts/make_template.py package_dir = =src +[options.data_files] +scripts = + condor/* + scripts/* + runner.py [options.packages.find] where = src @@ -57,11 +55,13 @@ where = src dev = pytest>=6 black==24.2.0 -# docs = -# Sphinx~=3.0 -# myst_parser>=0.13 -# sphinx-book-theme>=0.1.0 -# sphinx_copybutton + +docs = + Sphinx>=7.0 + myst_parser>=0.13 + sphinx-book-theme>=0.1.0 + sphinx_copybutton + markdown-code-runner test = pytest>=6 diff --git a/src/BTVNanoCommissioning/helpers/BTA_helper.py 
b/src/BTVNanoCommissioning/helpers/BTA_helper.py index 3a3e0ccd..1d980726 100644 --- a/src/BTVNanoCommissioning/helpers/BTA_helper.py +++ b/src/BTVNanoCommissioning/helpers/BTA_helper.py @@ -62,12 +62,12 @@ # Mass table # ################ # mass table from https://github.com/scikit-hep/particle/blob/master/src/particle/data/particle2022.csv and https://gitlab.cern.ch/lhcb-conddb/DDDB/-/blob/master/param/ParticleTable.txt -df_main = pd.read_csv( - "src/BTVNanoCommissioning/helpers/particle2022.csv", delimiter=",", skiprows=1 -) -df_back = pd.read_csv( - "src/BTVNanoCommissioning/helpers/ParticleTable.csv", delimiter=",", skiprows=4 +path = os.path.dirname(os.path.abspath(__file__)) +maincsv, backcsv = os.path.join(path, "particle2022.csv"), os.path.join( + path, "ParticleTable.csv" ) +df_main = pd.read_csv(maincsv, delimiter=",", skiprows=1) +df_back = pd.read_csv(backcsv, delimiter=",", skiprows=4) df_main, df_back = df_main.astype({"ID": int}), df_back.astype({"PDGID": int}) main = dict(zip(df_main.ID, df_main.Mass / 1000.0)) backup = dict(zip(df_back.PDGID, df_back["MASS(GeV)"])) diff --git a/src/BTVNanoCommissioning/helpers/cTagSFReader.py b/src/BTVNanoCommissioning/helpers/cTagSFReader.py index 2196169e..5fa2c2a2 100644 --- a/src/BTVNanoCommissioning/helpers/cTagSFReader.py +++ b/src/BTVNanoCommissioning/helpers/cTagSFReader.py @@ -6,7 +6,29 @@ def getSF(flav, CvL, CvB, file="DeepCSV_ctagSF_MiniAOD94X_2017_pTincl.root", syst=""): - # _btag_path = "BTVNanoCommissioning.data.BTV.Rereco17_94X" + """ + Retrieve the scale factor (SF) for c-tagging based on the given parameters. + + This function retrieves the scale factor (SF) for c-tagging using the specified flavor, CvL, CvB, and systematic variation. The SF is read from a ROOT file and returned as a dense lookup table. + + Parameters: + flav (numpy.ndarray): Array of jet flavors. Typically, 4 for charm, 5 for bottom, and other values for light jets. + CvL (float): The CvL parameter value. 
+ CvB (float): The CvB parameter value. + file (str, optional): The path to the ROOT file containing the scale factors. Defaults to "DeepCSV_ctagSF_MiniAOD94X_2017_pTincl.root". + syst (str, optional): The systematic variation to apply. Defaults to an empty string for the central value. + + Returns: + dense_lookup: A dense lookup table containing the scale factors. + + Example: + ```python + sf_lookup = getSF(flav_array, CvL_value, CvB_value) + ``` + + Deprecated: + This function is deprecated. Please use the BTVNanoCommissiong.correction.btagSFs module instead. + """ with importlib.resources.path( file[: file.rfind("/")].replace("/", "."), file[file.rfind("/") + 1 :] ) as filename: diff --git a/src/BTVNanoCommissioning/helpers/definitions.py b/src/BTVNanoCommissioning/helpers/definitions.py index ae84477d..6ab3435c 100644 --- a/src/BTVNanoCommissioning/helpers/definitions.py +++ b/src/BTVNanoCommissioning/helpers/definitions.py @@ -4,18 +4,11 @@ disc_list = [ "btagDeepFlavB", "btagDeepFlavC", - "btagTransDeepFlavB", - "btagTransDeepFlavC", "btagDeepFlavCvL", "btagDeepFlavCvB", "btagDeepFlavB_b", "btagDeepFlavB_bb", - "btagTransDeepFlavB_lepb", - "btagTransDeepFlavB_b", - "btagTransDeepFlavB_bb", - "btagTransDeepFlavB_lepb", "btagPNetB", - "btagTransPNetB", "btagPNetCvB", "btagPNetCvL", "btagPNetCvNotB", @@ -23,10 +16,6 @@ "btagPNetProbC", "btagPNetProbG", "btagPNetProbUDS", - "btagTransPNetProbB", - "btagTransPNetProbC", - "btagTransPNetProbG", - "btagTransPNetProbUDS", "btagPNetQvG", "btagPNetTauVJet", "btagRobustParTAK4B", @@ -36,13 +25,6 @@ "btagRobustParTAK4C", "btagRobustParTAK4G", "btagRobustParTAK4UDS", - "btagTransRobustParTAK4B", - "btagTransRobustParTAK4B_b", - "btagTransRobustParTAK4B_bb", - "btagTransRobustParTAK4B_lepb", - "btagTransRobustParTAK4C", - "btagTransRobustParTAK4G", - "btagTransRobustParTAK4UDS", "btagRobustParTAK4CvB", "btagRobustParTAK4CvL", "btagRobustParTAK4QG", @@ -6128,6 +6110,31 @@ def definitions(): + """ + Add new definitions to 
the definitions dictionary. + + Developed by Annika Stein, this function summarizes the information of tagger input variables with corresponding ranges, bins, and display names. + + Parameters: + definitions_dict (dict): The dictionary to which new definitions will be added. + + Example: + ```python + definitions_dict["DeepCSV_jetNSelectedTracks"] = { + "displayname": "Jet N Selected Tracks", + "manual_ranges": [0.0, 25], + "ylabel_text": "Jets", + "format_unit": "2f", + "format_unit_digits": 2, + "bins": 25, + "inputVar_units": None, + } + ``` + + Returns: + dict: with defitions of the tagger input variables added to the dictionary. + """ + # Function implementation here jetINDEX = [0, 1, 28, 41, 48, 49, 56, 57, 58, 59, 63, 64, 65] trackINDEX = [ 6, diff --git a/src/BTVNanoCommissioning/helpers/update_branch.py b/src/BTVNanoCommissioning/helpers/update_branch.py index 3539bb97..3fe17172 100644 --- a/src/BTVNanoCommissioning/helpers/update_branch.py +++ b/src/BTVNanoCommissioning/helpers/update_branch.py @@ -4,6 +4,34 @@ def missing_branch(events): + """ + Add missing branches or rename branches in the `events` object. + + This function adds missing branches or renames existing branches in the `events` object using the `missing_branch` parameter. + + Usage: + Use the `hasattr` function to check for missing branches. + + Deprecated: + The `add_jec` function is deprecated. Please use the `JME_shifts` function in the `correction` module instead. + + Example: + ```python + events.fixedGridRhoFastjetAll = ( + events.fixedGridRhoFastjetAll + if hasattr(events, "fixedGridRhoFastjetAll") + else events.Rho.fixedGridRhoFastjetAll + ) + ``` + + Parameters: + events (coffea.nanoaodevents): The events object to update. + missing_branch (str): The name of the missing branch to add or rename. + + Returns: + events (coffea.nanoaodevents): Events with updated branches. 
+ """ + # Function implementation here events["fixedGridRhoFastjetAll"] = ( events.fixedGridRhoFastjetAll if hasattr(events, "fixedGridRhoFastjetAll") @@ -118,8 +146,12 @@ def missing_branch(events): events.PuppiMET, "MetUnclustEnUpDeltaX" ): met = events.PuppiMET - met["MetUnclustEnUpDeltaX"] = met.ptUnclusteredUp * np.cos(met.phiUnclusteredUp) - met["MetUnclustEnUpDeltaY"] = met.ptUnclusteredUp * np.sin(met.phiUnclusteredUp) + met["MetUnclustEnUpDeltaX"] = (met.ptUnclusteredUp - met.pt) * np.cos( + met.phiUnclusteredUp + ) + met["MetUnclustEnUpDeltaY"] = (met.ptUnclusteredUp - met.pt) * np.sin( + met.phiUnclusteredUp + ) events.PuppiMET = update( events.PuppiMET, { diff --git a/src/BTVNanoCommissioning/helpers/xs_scaler.py b/src/BTVNanoCommissioning/helpers/xs_scaler.py index be51fba4..c3019a85 100644 --- a/src/BTVNanoCommissioning/helpers/xs_scaler.py +++ b/src/BTVNanoCommissioning/helpers/xs_scaler.py @@ -4,6 +4,9 @@ import os from BTVNanoCommissioning.helpers.xsection import xsection +""" +Scale histograms to corresponding cross-section. Merge mutiple `.coffea` and collate the MC samples into sub-class in this function. +""" # from BTVNanoCommissioning.helpers.xsection_13TeV import xsection_13TeV import numpy as np diff --git a/src/BTVNanoCommissioning/helpers/xsection.py b/src/BTVNanoCommissioning/helpers/xsection.py index b6aca97b..3af9b648 100644 --- a/src/BTVNanoCommissioning/helpers/xsection.py +++ b/src/BTVNanoCommissioning/helpers/xsection.py @@ -1,3 +1,25 @@ +""" +This module provides functions to handle cross section information used to normalize data and MC when making plots. + +Cross section information is crucial for normalizing data and Monte Carlo (MC) simulations to ensure accurate comparisons and analyses. + +To add a new dictionary to the cross section list, you can obtain the necessary information from the [XSDB](https://cms-gen-dev.cern.ch/xsdb/). 
+ +Example of a cross section dictionary: + +```python +{ + "id": "61641f28294617432cff1379", + "process_name": "QCD_Pt-30to50_EMEnriched_TuneCP5_13TeV_pythia8", + "cross_section": "6447000.0", + "total_uncertainty": "19870.0", + "equivalent_lumi": "0.0001551", + "matrix_generator": "none", + "energy": "13", +} +``` +""" + xsection = [ { "process_name": "DYJetsToLL_M-10to50_TuneCP5_13p6TeV-madgraphMLM-pythia8", diff --git a/src/BTVNanoCommissioning/utils/correction.py b/src/BTVNanoCommissioning/utils/correction.py index 594e66ca..8f5b7e4a 100644 --- a/src/BTVNanoCommissioning/utils/correction.py +++ b/src/BTVNanoCommissioning/utils/correction.py @@ -21,7 +21,29 @@ def load_SF(year, campaign, syst=False): + """ + Load scale factors (SF) for a given year and campaign. + + This function reads scale factors from the specified campaign configuration and returns them in a suitable format. + It handles different types of scale factors, such as pileup weights, and checks for the existence of files in + the jsonpog-integration directory or custom files. + + Parameters: + year (str): The year for which to load the scale factors. + campaign (str): The name of the campaign for which to load the scale factors. + syst (bool, optional): A flag to indicate whether to load systematic variations. Default is False. + + Returns: + dict: A dictionary containing the scale factors, where keys are the relevant identifiers and values are the scale factors. + + Raises: + FileNotFoundError: If the specified file does not exist. + ValueError: If the file content is not in the expected format. + KeyError: If the specified campaign or year is not found in the configuration. + """ + # read the configuration file to get the correct SFs correct_map = {"campaign": campaign} + for SF in config[campaign].keys(): if SF == "lumiMask": continue @@ -324,13 +346,106 @@ def load_SF(year, campaign, syst=False): def load_lumi(campaign): + """ + Load luminosity mask for a given campaign. 
+ + This function reads the luminosity mask file for the specified campaign and returns a `LumiMask` object. + + Parameters: + campaign (str): The name of the campaign for which to load the luminosity mask. + + Returns: + LumiMask: An object representing the luminosity mask for the specified campaign. + + Raises: + KeyError: If the specified campaign is not found in the configuration. + FileNotFoundError: If the luminosity mask file does not exist. + """ + _lumi_path = "BTVNanoCommissioning.data.lumiMasks" with importlib.resources.path(_lumi_path, config[campaign]["lumiMask"]) as filename: return LumiMask(filename) # wrapped up common shifts +""" +BTVNanoCommissioning.utils.correction + +This module provides functions to handle corrections and uncertainties for scaling factors and weight variables in the BTVNanoCommissioning framework. + +Features: +- Scaling factors and weight variables: + - Corrections are wrapped with `weights` from the `coffea.analysis_tools.Weights` class along the event axis. + - Standardized correction files are handled using `correctionlib`. + - Additional methods for applying corrections can be found in the `coffea` documentation. + - Uncertainties are added using up/down variations, except for btag SFs which use `add_multivariation`. + +Example for Scaling Factors (SFs): +```python +## Initialization, add EGM map from correctionlib +correction_map["EGM"] = correctionlib.CorrectionSet.from_file( + f"src/BTVNanoCommissioning/jsonpog-integration/POG/EGM/{campaign}/electron.json.gz" + ) +## Initialization, add EGM map from custom file by extractor + ext = extractor() + ext.add_weight_sets(["eleID EGamma2D {filename}.root"]) + ext.finalize() + correction_map["EGM"] = ext.make_evaluator() + + # evaluation depends on file types...ignore here! 
+ ## add SFs & uncertainties to weight function +weights.add(sf.split(" ")[0], sfs_alle, sfs_alle_up, sfs_alle_down) +``` + +""" + + def common_shifts(self, events): + """ + Apply common shifts to events (mostly affecting the energy resolution/scale of objects). + + This function applies common shifts to the input DataFrame based on the specified shift type. + It modifies the DataFrame in place to reflect the systematic variations/dedicated corrections. + This includes JERC corrections, Rochester corrections. + + + - Scale/Resolution Corrections: + Construct a shift list with tuples of (obj_dict, shift_name). + These corrections are applied independently on all objects by updating the contents of the branch. + Normally done before selection to apply updated objects. + Uncertainties are handled by updating object collections with up/down variations. + Example for Shift List: + ```python + # nominal correction + shifts = [({"Jet": jets, "MET": met, "Muon" : muon}, None)] + # add variations + shifts += [ + ( + { + "Jet": jets.JES_Total.up, + "MET": met.JES_Total.up, + }, + "JESUp", + )] + shifts += [ + ( + { + "Jet": jets.JES_Total.down, + "MET": met.JES_Total.down, + }, + "JESDown", + )] + ``` + Different treatment for weights and scale/resolution shifts is necessary to ensure accurate corrections and uncertainties are applied to the data. + + Parameters: + self (dict): The configuration dictionary from SF_map containing the scale factors and other settings. + events (events): The input events containing the data to be shifted. + + Returns: + pandas.DataFrame: The DataFrame with the applied systematic shifts. + """ + isRealData = not hasattr(events, "genWeight") dataset = events.metadata["dataset"] shifts = [] @@ -383,6 +498,23 @@ def add_jec_variables(jets, event_rho): ## Jet Veto def jetveto(jets, correct_map): + """ + Apply a veto to jets based on predefined transverse momentum (pt) and pseudorapidity (eta) thresholds.
+ + This function filters out jets that do not meet the predefined pt and eta criteria. It also utilizes a correction map + to apply additional corrections or selections to the jets. + + Parameters: + jets (iterable): A collection of jet objects or dictionaries containing jet properties. + correct_map (dict): A dictionary containing correction factors or additional selection criteria for the jets. + + Returns: + jets: A jets of jets that pass the predefined pt and eta criteria and any additional criteria from the correction map. + + Raises: + TypeError: If the jets parameter is not an iterable. + KeyError: If the jet objects do not contain the required 'pt' or 'eta' properties. + """ return ak.where( correct_map["jetveto"][list(correct_map["jetveto"].keys())[0]]( jets.phi, jets.eta @@ -403,7 +535,28 @@ def JME_shifts( systematic=False, exclude_jetveto=False, ): - + """ + Apply Jet Energy Corrections (JEC) and Jet Energy Resolutions (JER) shifts to events. + + This function applies JEC and JER shifts to the jets in the events based on the provided correction map and campaign. + It handles both real data and simulated data, and can optionally apply systematic variations and exclude jet vetoes. + + Parameters: + shifts (list): A list of shift types to apply (e.g., 'up', 'down'). + correct_map (dict): A dictionary containing correction factors and settings for JEC and JER. + events (awkward.Array): An array of events containing jet information. + campaign (str): The name of the campaign for which to apply the corrections. + isRealData (bool): A flag indicating whether the data is real or simulated. + systematic (bool, optional): A flag to indicate whether to apply systematic variations. Default is False. + exclude_jetveto (bool, optional): A flag to indicate whether to exclude jet vetoes. Default is False. + + Returns: + awkward.Array: The events array with applied JEC and JER shifts. + + Raises: + KeyError: If required keys are missing in the correct_map. 
+ ValueError: If the campaign is not recognized or supported. + """ dataset = events.metadata["dataset"] jecname = "" # https://cms-jerc.web.cern.ch/JECUncertaintySources/, currently no recommendation of reduced/ full split sources @@ -645,6 +798,27 @@ def JME_shifts( ## Muon Rochester correction def Roccor_shifts(shifts, correct_map, events, isRealData, systematic=False): + """ + Apply Rochester corrections (Roccor) shifts to muons in events. + + This function applies Rochester corrections to the muons in the events based on the provided correction map and campaign. + It handles both real data and simulated data, and can optionally apply systematic variations. + + Parameters: + shifts (list): A list of shift types to apply (e.g., 'up', 'down'). + correct_map (dict): A dictionary containing correction factors and settings for Rochester corrections. + events (awkward.Array): An array of events containing muon information. + campaign (str): The name of the campaign for which to apply the corrections. + isRealData (bool): A flag indicating whether the data is real or simulated. + systematic (bool, optional): A flag to indicate whether to apply systematic variations. Default is False. + + Returns: + awkward.Array: The events array with applied Rochester corrections. + + Raises: + KeyError: If required keys are missing in the correct_map. + ValueError: If the campaign is not recognized or supported. 
+ """ mu = events.Muon if isRealData: SF = correct_map["roccor"].kScaleDT( @@ -730,13 +904,33 @@ def Roccor_shifts(shifts, correct_map, events, isRealData, systematic=False): def puwei(nPU, correct_map, weights, syst=False): """ - Return pileup weight - Parameters - ---------- - nPU: ak.Array - correct_map : dict - weights : coffea.analysis_tool.weights - syst: "split", "weight_only" + <<<<<<< HEAD + Return pileup weight + Parameters + ---------- + nPU: ak.Array + correct_map : dict + weights : coffea.analysis_tool.weights + syst: "split", "weight_only" + ======= + Apply pileup weights to events based on the number of primary vertices (nPU). + + This function applies pileup weights to the events using the provided correction map and weights. + It can optionally apply systematic variations. + + Parameters: + nPU (awkward.Array(int)): The number of primary vertices in the event. + correct_map (dict): A dictionary containing correction factors and settings for pileup weights. + weights (): A dictionary to store the calculated weights. + syst (bool, optional): A flag to indicate whether to apply systematic variations. Default is False. + + Returns: + None: The function modifies the weights dictionary in place. + + Raises: + KeyError: If required keys are missing in the correct_map. + ValueError: If the nPU value is not recognized or supported. + >>>>>>> doc """ if "correctionlib" in str(type(correct_map["PU"])): if syst: @@ -772,6 +966,26 @@ def puwei(nPU, correct_map, weights, syst=False): def btagSFs(jet, correct_map, weights, SFtype, syst=False): + """ + Apply b-tagging scale factors (SFs) to a single jet. + + This function applies b-tagging scale factors to the given jet based on the provided correction map, weights, and scale factor type. + It can optionally apply systematic variations. + + Parameters: + jet (dict): A dictionary containing the properties of the jet. 
+ correct_map (dict): A dictionary containing correction factors and settings for b-tagging scale factors. + weights (coffea.analysis_tools.Weights): An instance of coffea's Weights class to store the calculated weights. + SFtype (str): The type of scale factor to apply; only shape-based C and B are supported. + syst (bool, optional): A flag to indicate whether to apply systematic variations. Default is False. + + Returns: + None: The function modifies the weights instance in place. + + Raises: + KeyError: If required keys are missing in the correct_map. + ValueError: If the SFtype is not recognized or supported. + """ if SFtype.endswith("C"): systlist = [ "Extrap", @@ -1569,7 +1783,7 @@ def __init__(self, year, campaign, isRealData, dataset, isSyst=False): self.edges = templates["histoCat0"].axes[0].edges() def flatten(self, array): - r""" + """ Get the fully flattened array and its layout for each layer """ layouts = [] @@ -1580,16 +1794,17 @@ def flatten(self, array): return array_fl, layouts def unflatten(self, array_fl, layouts): - r""" + """ Recover a flattened array using the original layouts """ + array = array_fl for layout in layouts[::-1]: array = ak.unflatten(array, layout) return array def calc_track_proba(self, ipsig: ak.Array, cat: ak.Array): - r""" + """ Calculate the track probability from the integral of the track IPsig templates, given the IPsig and category. Reference code: https://github.com/cms-sw/cmssw/blob/CMSSW_13_0_X/RecoBTag/TrackProbability/src/HistogramProbabilityEstimator.cc ipsig: IP significance array diff --git a/src/BTVNanoCommissioning/utils/histogrammer.py b/src/BTVNanoCommissioning/utils/histogrammer.py index 4600b4e9..1ef7cece 100644 --- a/src/BTVNanoCommissioning/utils/histogrammer.py +++ b/src/BTVNanoCommissioning/utils/histogrammer.py @@ -10,6 +10,26 @@ def histogrammer(events, workflow, year="2022", campaign="Summer22"): + """ + Most workflows require the same set of variables.
Collect axis, histograms definition in single file + To contribute: Add additional axis, histogram using [hist](https://hist.readthedocs.io/en/latest/) for dedicated workflow into the `_hist_dict`. For the new histogram, please have the `syst_axis` as first axis, and `Weight` as last axis. + + Parameters: + events (awkward.Array): The events data to be histogrammed. + workflow (str): The workflow identifier to determine specific histogramming logic. + + Example: + ```python + # axis example + mass_axis = Hist.axis.Regular(50, 0, 300, name="mass", label=" $p_{T}$ [GeV]") + # hist example + _hist_dict["dr_lmusmu"] = Hist.Hist(syst_axis, dr_axis, Hist.storage.Weight()) + ``` + + Returns: + dict: A dictionary containing the defined histograms. + """ + _hist_dict = {} ## Common variables flav_axis = Hist.axis.IntCategory([0, 1, 4, 5, 6], name="flav", label="Genflavour") @@ -503,6 +523,7 @@ def histogrammer(events, workflow, year="2022", campaign="Summer22"): syst_axis, flav_axis, jpt_axis, Hist.storage.Weight() ) ### Btag input variables & PFCands + bininfo = definitions() for d in bininfo.keys(): if d not in events.Jet.fields: @@ -514,6 +535,8 @@ def histogrammer(events, workflow, year="2022", campaign="Summer22"): if bininfo[d]["inputVar_units"] is not None else bininfo[d]["displayname"] ) + if "WP" not in workflow: + break if "Wc_sf" in workflow: _hist_dict[d] = Hist.Hist( syst_axis, @@ -651,6 +674,27 @@ def histogrammer(events, workflow, year="2022", campaign="Summer22"): def histo_writter(pruned_ev, output, weights, systematics, isSyst, SF_map): + """ + Write histograms to the output dictionary based on pruned events and other parameters. + + This function processes the pruned events and writes the histograms to the `output` dictionary. It takes into account the weights, systematics, and scale factors. + + Parameters: + pruned_ev (coffea.nanoaodevents): The pruned events data to be histogrammed. 
+ output (dict): The output dictionary where histograms will be stored. + weights (coffea.analysis_tools.Weights): The weights object for the events. + systematics (list): A list of systematic variations to be considered. + isSyst (str,bool): Indicating whether systematic variations are to be applied. + SF_map (dict): A dictionary containing scale factors for different variables. + + Example: + ```python + histo_writter(pruned_ev, output, weights, systematics, isSyst, SF_map) + ``` + + Returns: + None + """ exclude_btv = [ "DeepCSVC", "DeepCSVB", diff --git a/src/BTVNanoCommissioning/utils/plot_utils.py b/src/BTVNanoCommissioning/utils/plot_utils.py index e9ba53ad..5a6bd956 100644 --- a/src/BTVNanoCommissioning/utils/plot_utils.py +++ b/src/BTVNanoCommissioning/utils/plot_utils.py @@ -135,7 +135,10 @@ def compatible(self, other): - """Checks if this histogram is compatible with another, i.e. they have identical binning""" + """ + Checks if this histogram is compatible with another, i.e. they have identical binning + """ + if len(self.axes) != len(other.axes): return False if set(self.axes.name) != set(other.axes.name): @@ -146,7 +149,8 @@ def compatible(self, other): def poisson_interval(sumw, sumw2, coverage=_coverage1sd): - """Frequentist coverage interval for Poisson-distributed observations + """ + Frequentist coverage interval for Poisson-distributed observations Parameters ---------- sumw : numpy.ndarray @@ -164,6 +168,7 @@ def poisson_interval(sumw, sumw2, coverage=_coverage1sd): When a bin is zero, the scale of the nearest nonzero bin is substituted to scale the nominal upper bound. If all bins zero, a warning is generated and interval is set to ``sumw``. 
""" + scale = np.empty_like(sumw) scale[sumw != 0] = sumw2[sumw != 0] / sumw[sumw != 0] if np.sum(sumw == 0) > 0: @@ -189,7 +194,8 @@ def poisson_interval(sumw, sumw2, coverage=_coverage1sd): def normal_interval(pw, tw, pw2, tw2, coverage=_coverage1sd): - """Compute errors based on the expansion of pass/(pass + fail), possibly weighted + """ + Compute errors based on the expansion of pass/(pass + fail), possibly weighted Parameters ---------- pw : np.ndarray @@ -221,7 +227,8 @@ def normal_interval(pw, tw, pw2, tw2, coverage=_coverage1sd): def clopper_pearson_interval(num, denom, coverage=_coverage1sd): - """Compute Clopper-Pearson coverage interval for a binomial distribution + """ + Compute Clopper-Pearson coverage interval for a binomial distribution Parameters ---------- num : numpy.ndarray @@ -232,6 +239,7 @@ def clopper_pearson_interval(num, denom, coverage=_coverage1sd): Central coverage interval, defaults to 68% c.f. http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval """ + if np.any(num > denom): raise ValueError( "Found numerator larger than denominator while calculating binomial uncertainty" @@ -259,7 +267,8 @@ def plotratio( label=None, ext_denom_error=None, ): - """Create a ratio plot, dividing two compatible histograms + """ + Create a ratio plot, dividing two compatible histograms Parameters ---------- num : Hist diff --git a/src/BTVNanoCommissioning/workflows/example.py b/src/BTVNanoCommissioning/workflows/example.py index 755d86b2..4258b23f 100644 --- a/src/BTVNanoCommissioning/workflows/example.py +++ b/src/BTVNanoCommissioning/workflows/example.py @@ -169,25 +169,29 @@ def process_shift(self, events, shift_name): #################### # Output # #################### - # Configure SFs + # Configure SFs - read pruned objects from the pruned_ev and apply SFs and call the systematics weights = weight_manager(pruned_ev, self.SF_map, self.isSyst) - # Configure systematics + # Configure systematics shifts if shift_name is None: - 
systematics = ["nominal"] + list(weights.variations) + systematics = ["nominal"] + list( + weights.variations + ) # nominal + weight variation systematics else: - systematics = [shift_name] + systematics = [shift_name] # JES/JER systematics + + # Fill the weights into the output arrays if not isRealData: pruned_ev["weight"] = weights.weight() for ind_wei in weights.weightStatistics.keys(): pruned_ev[f"{ind_wei}_weight"] = weights.partial_weight( include=[ind_wei] ) - # Configure histograms + # Configure histograms - fill the histograms with pruned objects if not self.noHist: output = histo_writter( pruned_ev, output, weights, systematics, self.isSyst, self.SF_map ) - # Output arrays + # Output arrays - store the pruned objects in the output arrays if self.isArray: array_writer(self, pruned_ev, events, systematics[0], dataset, isRealData) diff --git a/test_env.yml b/test_env.yml index c963ee26..df386e87 100644 --- a/test_env.yml +++ b/test_env.yml @@ -18,5 +18,4 @@ dependencies: - parsl==2024.01.29 - arrow - dask-jobqueue - - xgboost