Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reimplement hera datasets #2175

Merged
merged 22 commits into from
Dec 23, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
3298bb0
initial commit for HERA dataset reimplementation
Oct 15, 2024
8430f56
initial commit for HERA dataset reimplementation
Oct 15, 2024
c32641b
add reimplementation of HERA beauty and charm QCD analysis and combin…
Oct 16, 2024
66d5101
small improvements in filter script.
Oct 16, 2024
386d497
Add check for covmat, remove total uncertainty before procedural unce…
Oct 30, 2024
127822f
Change process type, fix typo in metadata
Oct 30, 2024
3bad329
change names of kinematic variables from k1, k2, k3 to x, Q2, y.
Oct 30, 2024
96311ee
Add files containing the reimplemented variant of the uncertainties t…
Oct 30, 2024
79a0990
replace Q2bins6 by k2bins6.
Oct 30, 2024
720e8b2
add reimplemented uncertainties for HERA_NC300GEV.
Oct 30, 2024
ca03af7
fix labels in metadata, remove legacy variants.
peterkrack Dec 8, 2024
89adf40
Merge branch 'master' into reimplement-HERA-datasets
peterkrack Dec 8, 2024
f05a592
fix typo in metadata.
peterkrack Dec 8, 2024
a3a958b
fix missing labels
peterkrack Dec 8, 2024
87108f8
commondata test fail when importing validphys
peterkrack Dec 8, 2024
fcd4cda
remove import of covmat_is_close from filter scripts
peterkrack Dec 8, 2024
4e7e907
remove covmat_is_close import from filter script.
peterkrack Dec 8, 2024
63c3f5b
fix in metadata
peterkrack Dec 8, 2024
0e6736a
clean up some files
peterkrack Dec 9, 2024
895ce1c
set kinematics_override back to dis_sqrt_scale to fix plots.
peterkrack Dec 11, 2024
c778061
Merge branch 'master' into reimplement-HERA-datasets
peterkrack Dec 11, 2024
b33f457
fix metadata file for HERA_NC_318GEV.
peterkrack Dec 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 166 additions & 0 deletions nnpdf_data/nnpdf_data/commondata/HERA_CC_318GEV/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
from pathlib import Path
from dataclasses import dataclass
import typing
from typing import List
import numpy as np
import pandas as pd
from os import PathLike
from fortranformat import FortranRecordWriter
import yaml

# Container for a single dataset: central values, kinematics, statistical and
# systematic uncertainties plus their metadata.  Its methods write the content
# either as new-format YAML files or as an old-format fixed-width text file.
@dataclass
class commondata:
Copy link
Contributor

@giacomomagni giacomomagni Oct 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the filters share similar or even identical parts, consider placing the common parts in the filter_utils folder and importing them from there. For example:

from nnpdf_data.filter_utils.hera import commondata

# Central value of every data point (written out as "data_central").
central_values: np.ndarray
# One row per data point; the writers read three kinematic values per row.
kinematics: np.ndarray
# One statistical uncertainty per data point.
statistical_uncertainties: np.ndarray
# One row per data point, one column per systematic uncertainty.
systematic_uncertainties: np.ndarray
# Per systematic: (treatment, type), written as "treatment" and "type".
systypes: List[tuple[str, str]]
# Process label of the dataset.
process: str
# Dataset name; used in the header line of the old-format data file.
dataset_name: str
# Names of the kinematic quantities, used as keys in the kinematics file.
kinematic_quantities: List[str]


# Procedure to create data_*.yaml, kinematics_*.yaml and uncertainties_*.yaml
def write_new_commondata(self, data_filename: str | PathLike,
                         kinematics_filename: str | PathLike,
                         uncertainties_filename: str | PathLike):
    """Write the dataset as three new-format commondata YAML files.

    Parameters
    ----------
    data_filename
        Destination of the central values (key ``data_central``).
    kinematics_filename
        Destination of the kinematics (key ``bins``); each bin maps every
        name in ``self.kinematic_quantities`` to ``min``/``mid``/``max``,
        where only ``mid`` is filled.
    uncertainties_filename
        Destination of the uncertainty ``definitions`` followed by the
        per-point ``bins``.

    All three arguments may be plain strings or path-like objects.
    """
    # The signature accepts ``str``; coerce so that ``.open`` always exists.
    data_path = Path(data_filename)
    kinematics_path = Path(kinematics_filename)
    uncertainties_path = Path(uncertainties_filename)

    # Central data values.
    with data_path.open("w") as f:
        yaml.safe_dump({"data_central": self.central_values.tolist()}, f,
                       default_flow_style=False, sort_keys=False)

    # Kinematic quantities: only the mid value is known, min/max stay empty.
    # TODO add arrays for min and max values to derived type?
    names = self.kinematic_quantities
    kin_bins = [
        {name: {"min": None, "mid": mid, "max": None}
         for name, mid in zip(names, point)}
        for point in self.kinematics.tolist()
    ]
    with kinematics_path.open("w") as f:
        yaml.safe_dump({"bins": kin_bins}, f,
                       default_flow_style=False, sort_keys=False)

    # Uncertainty definitions.
    # There is only one statistical uncertainty per datapoint.
    definitions = {
        "stat": {
            "description": "Statistical uncertainty.",
            "treatment": "ADD",
            "type": "UNCORR",
        }
    }
    for isys, (treatment, unc_type) in enumerate(self.systypes):
        definitions[f"sys_corr_{isys}"] = {
            "description": f"Systematic uncertainty {isys}",
            "treatment": treatment,
            "type": unc_type,
        }

    # Convert once up front instead of re-building the lists for every point.
    stat_values = self.statistical_uncertainties.tolist()
    sys_rows = self.systematic_uncertainties.tolist()
    # The number of central values defines the number of points written.
    unc_bins = []
    for ipoint in range(len(self.central_values)):
        entry = {"stat": stat_values[ipoint]}
        entry.update(
            {f"sys_corr_{isys}": value
             for isys, value in enumerate(sys_rows[ipoint])}
        )
        unc_bins.append(entry)

    # TODO Notation of reals is inconsistent from yaml.safe_dump
    # sometimes it is in scientific notation sometimes not...
    with uncertainties_path.open("w") as f:
        yaml.safe_dump({"definitions": definitions}, f,
                       default_flow_style=False, sort_keys=False)
        yaml.safe_dump({"bins": unc_bins}, f,
                       default_flow_style=False, sort_keys=False)





# TODO: old commondata format stores the uncertainties as
# both additive and multiplicative.
def write_old_commondata(self, data_filename: str | PathLike,
                         systype_filename: str | PathLike):
    """Write the dataset as an old-format fixed-width data file.

    The file starts with a header line holding the dataset name, the number
    of systematics and the number of points.  Every following record is
    formatted as ``I4, A10, n*E23.15`` and contains the 1-based point index,
    the process label, three kinematic values, the central value, the
    statistical uncertainty and all systematic uncertainties.

    Parameters
    ----------
    data_filename
        Destination of the data file (string or path-like).
    systype_filename
        Currently unused: no systype file is written by this method.
    """
    n_sys = len(self.systypes)
    n_points = len(self.central_values)

    # 3 kinematics + central value + statistical uncertainty + systematics.
    record_format = "(I4,A10," + str(3 + 1 + 1 + n_sys) + "E23.15)"
    writer = FortranRecordWriter(record_format)

    # The signature accepts ``str``; coerce so that ``.open`` always exists.
    with Path(data_filename).open("w") as f:
        f.write(f"{self.dataset_name} {n_sys} {n_points} \n")
        for ipoint in range(n_points):
            record = [
                ipoint + 1,
                # Fills the A10 field; without it the first kinematic value
                # would be consumed by the character descriptor.
                self.process,
                *self.kinematics[ipoint].tolist(),
                float(self.central_values[ipoint]),
                float(self.statistical_uncertainties[ipoint]),
                *self.systematic_uncertainties[ipoint].tolist(),
            ]
            f.write(writer.write(record) + "\n")

@dataclass
class hera_commondata(commondata):
    """Commondata read from a whitespace-separated HERA data table.

    The table must provide the columns ``Sigma``, ``x``, ``Q2``, ``y`` and
    ``stat``; every column from the sixth one onwards is taken as a
    systematic uncertainty.  The uncertainty columns are treated as
    percentages of the central value and are converted to absolute values.
    """

    def __init__(self, filename: str | PathLike, dataset_name: str,
                 process: str):
        """Read ``filename`` and fill all commondata fields.

        Parameters
        ----------
        filename
            Path of the raw data table.
        dataset_name
            Name stored as ``self.dataset_name``.
        process
            Process label stored as ``self.process``.
        """
        # Read the data.
        table = pd.read_table(Path(filename), sep=r"\s+")

        # Central values and kinematic quantities.
        self.central_values = table["Sigma"].to_numpy()
        self.kinematic_quantities = ["x", "Q2", "y"]
        self.kinematics = table[["x", "Q2", "y"]].to_numpy()
        central = self.central_values

        # Statistical uncertainties, percent -> absolute.  Computed into a
        # new float array so that an integer-typed column is not truncated
        # and the DataFrame is never modified through a shared buffer.
        relative_stat = table["stat"].to_numpy(dtype=float)
        self.statistical_uncertainties = central * relative_stat / 100

        # Systematic uncertainties, percent -> absolute, one row per point.
        sys_uncert_col_names = list(table.columns)[5:]
        relative_sys = table[sys_uncert_col_names].to_numpy(dtype=float)
        self.systematic_uncertainties = (
            central[:, np.newaxis] * relative_sys / 100
        )

        # All uncertainties are treated as multiplicative; only the column
        # named "uncor" is uncorrelated, the others keep their own name.
        self.systypes = [
            ("MULT", "UNCORR") if name == "uncor" else ("MULT", "HC_" + name)
            for name in sys_uncert_col_names
        ]
        self.process = process
        self.dataset_name = dataset_name

def main():
    """Rebuild the new-format commondata files for the EM and EP datasets."""
    print(" Reimplementing the HERA commondata")

    # One entry per dataset: raw table, dataset name and the three outputs
    # (data, kinematics, uncertainties) in the order the writer expects.
    jobs = (
        (
            "./rawdata/HERA1+2_CCem.dat",
            "HERACOMBCCEM",
            (
                "data_reimplemented_EM-SIGMARED.yaml",
                "kinematics_reimplemented_EM-SIGMARED.yaml",
                "uncertainties_reimplemented_EM-SIGMARED.yaml",
            ),
        ),
        (
            "./rawdata/HERA1+2_CCep.dat",
            "HERACOMBCCEP",
            (
                "data_reimplemented_EP-SIGMARED.yaml",
                "kinematics_reimplemented_EP-SIGMARED.yaml",
                "uncertainties_reimplemented_EP-SIGMARED.yaml",
            ),
        ),
    )

    for raw_table, name, outputs in jobs:
        dataset = hera_commondata(raw_table, name, "DIS_CCE")
        data_out, kinematics_out, uncertainties_out = map(Path, outputs)
        dataset.write_new_commondata(data_out, kinematics_out,
                                     uncertainties_out)


if __name__ == "__main__":
    main()



Loading