-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
first version of the nnpdf data package, with versioning
add utility function to read metadata just from dataset name; deprecate a bunch of functions; fix include
- Loading branch information
1 parent
4da53ad
commit ee40d83
Showing
24 changed files
with
665 additions
and
526 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
""" | ||
Note: this module will be removed after the next tag, don't use anything from here | ||
""" | ||
|
||
import dataclasses | ||
import logging | ||
from operator import attrgetter | ||
|
||
import pandas as pd | ||
|
||
from nnpdf_data.coredata import CommonData | ||
|
||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Emitted at import time: everything defined in this module relies on the old
# commondata parser and is scheduled for removal.
log.warning(
    "You are loading deprecated functionality that use the old commondata parser. "
    "This is no longer supported and will be removed in the near future"
)
|
||
|
||
### Old commondata: | ||
### All code below this line is deprecated and will be removed | ||
def load_commondata_old(commondatafile, systypefile, setname):
    """Build a legacy ``CommonData`` from an old-format commondata/systype pair.

    Parameters
    ----------
    commondatafile : file or path to file
        Old-format commondata table. The first line is skipped when the
        table is read and is parsed separately by
        ``peek_commondata_metadata`` for the consistency check.
        NOTE(review): that helper calls ``open()`` on this argument, so a
        path is presumably what callers pass in practice -- confirm.
    systypefile : file or path to file
        Systype file, parsed with ``parse_systypes``.
    setname : str
        Name stored in the returned object and quoted in the error message.

    Returns
    -------
    commondata : CommonData
        An object containing the data and information from the commondata
        and systype files, created with ``nkin=3`` and ``legacy=True``.

    Raises
    ------
    ValueError
        If the ``(nsys, ndata)`` pair deduced from the table differs from
        the one found in the file's metadata.
    """
    fixed_columns = ["entry", "process", "kin1", "kin2", "kin3", "data", "stat"]

    # TODO: replace commondata files with bad formatting
    table = pd.read_csv(commondatafile, sep=r"\s+", skiprows=1, header=None)

    # Whatever follows the fixed columns comes in (ADD, MULT) pairs,
    # one pair per systematic.
    nsys = (table.shape[1] - len(fixed_columns)) // 2
    table.columns = fixed_columns + ["ADD", "MULT"] * nsys
    table.set_index("entry", inplace=True)

    ndata = len(table)
    # The index is now "entry", so this picks the process of entry number 1.
    commondataproc = table["process"][1]

    # The counts deduced from the table must agree with the file's metadata.
    metadata = peek_commondata_metadata(commondatafile)
    if (metadata.nsys, metadata.ndata) != (nsys, ndata):
        raise ValueError(f"Commondata table information does not match metadata for {setname}")

    systype_table = parse_systypes(systypefile)

    return CommonData(
        setname=setname,
        ndata=ndata,
        commondataproc=commondataproc,
        nkin=3,
        nsys=nsys,
        commondata_table=table,
        systype_table=systype_table,
        legacy=True,
    )
|
||
|
||
def parse_systypes(systypefile):
    """Read a systype file into a dataframe indexed by ``sys_index``.

    The first line of the file is skipped; the remaining whitespace-separated
    columns are read as ``sys_index``, ``treatment`` and ``name``. Columns
    containing missing values are dropped. A file with nothing to parse
    yields an empty dataframe with the same columns.
    """
    columns = ["sys_index", "treatment", "name"]
    try:
        table = pd.read_csv(systypefile, sep=r"\s+", names=columns, skiprows=1, header=None)
    except pd.errors.EmptyDataError:
        # Some datasets e.g. CMSWCHARMRAT have no systematics
        table = pd.DataFrame(columns=columns)
    else:
        table.dropna(axis="columns", inplace=True)

    return table.set_index("sys_index")
|
||
|
||
@dataclasses.dataclass(frozen=True)
class CommonDataMetadata:
    """Immutable record with the metadata of the commondata being read."""

    # Name of the dataset
    name: str
    # Number of systematics
    nsys: int
    # Number of data points
    ndata: int
    # Process type label
    process_type: str
|
||
|
||
def peek_commondata_metadata(commondatafilename):
    """Read the metadata of a commondata file without parsing the full table.

    The first line must hold exactly three whitespace-separated fields
    (name, number of systematics, number of data points). The process type
    is the second field of the second line, normalised through
    ``get_kinlabel_key``.

    Any error while reading those two lines is logged and re-raised.
    """
    with open(commondatafilename) as stream:
        try:
            header_line = stream.readline()
            name, nsys_str, ndata_str = header_line.split()
            first_data_line = stream.readline()
            process_type_str = first_data_line.split()[1]
        except Exception:
            log.error(f"Error processing {commondatafilename}")
            raise

    nsys = int(nsys_str)
    ndata = int(ndata_str)
    process_type = get_kinlabel_key(process_type_str)
    return CommonDataMetadata(name, nsys, ndata, process_type)
|
||
|
||
def get_plot_kinlabels(commondata):
    """Return the LaTeX kinematic labels for the given commondata.

    The lookup key is ``commondata.process_type``; when ``KINLABEL_LATEX``
    has no entry for it, the process type itself is returned.
    """
    # NOTE(review): KINLABEL_LATEX is neither defined nor imported in the
    # visible part of this module -- confirm where it is meant to come from.
    # TODO: the keys in KINLABEL_LATEX need to be updated for the new commondata
    process_type = commondata.process_type
    return KINLABEL_LATEX.get(process_type, process_type)
|
||
|
||
def get_kinlabel_key(process_label):
    """Return the ``KINLABEL_LATEX`` key that best matches ``process_label``.

    Since there is no 1:1 correspondence between latex keys and the old
    libNNPDF names, the longest key that is a prefix of the process label
    is selected. ``"EWK_RAP_ASY"`` is looked up as ``"EWK_RAP"``.

    Raises
    ------
    ValueError
        If no key is a prefix of the label.
    """
    # TODO this function is disappearing in this PR
    label = "EWK_RAP" if process_label == "EWK_RAP_ASY" else process_label

    # Longest keys first, so the first prefix hit is the most specific one.
    candidates = sorted(KINLABEL_LATEX, key=len, reverse=True)
    try:
        return next(key for key in candidates if label.startswith(key))
    except StopIteration as e:
        raise ValueError(
            "Could not find a set of kinematic "
            "variables matching the process %s Check the "
            "labels defined in commondata.cc. " % (label)
        ) from e
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
""" | ||
This file contains examples of use of ``nnpdf_data`` as a library. | ||
This library is currently in pre-alpha form and should not be considered stable. | ||
The functions and examples in this file will be eventually removed but might become | ||
part of the library as an external user-facing interface. | ||
There is currently no user-facing interface so no stability is expected. | ||
""" | ||
|
||
from nnpdf_data import path_commondata | ||
from nnpdf_data.commondataparser import parse_new_metadata | ||
|
||
|
||
def parse_dataset(dataset, variant=None):
    """Given a dataset name, read the metadata of the corresponding observable.

    The name is split at its last underscore: the left part selects the
    folder under ``path_commondata`` holding ``metadata.yaml`` and the right
    part is the observable name. A variant can be given and is forwarded to
    ``parse_new_metadata``.

    The output is a ``ObservableMetaData`` object, with references to all files
    that form the dataset but none of them is loaded.
    This can then be used to _load_ the dataset using load_commondata.

    Example
    -------
    >>> from nnpdf_data.commondataparser import load_commondata
    >>> cd_meta = parse_dataset("LHCB_Z0_7TEV_DIELECTRON_Y")
    >>> cd = load_commondata(cd_meta)
    >>> print(cd)
    CommonData(setname='LHCB_Z0_7TEV_DIELECTRON_Y', ndata=9, commondataproc='DY_Z_Y', nkin=3, nsys=11, legacy=False, legacy_names=['LHCBZ940PB'], kin_variables=['y', 'm_Z2', 'sqrts'])
    """
    setname, observable = dataset.rsplit("_", 1)
    return parse_new_metadata(
        path_commondata / setname / "metadata.yaml", observable, variant=variant
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,76 +1,22 @@ | ||
from functools import lru_cache | ||
import pathlib | ||
|
||
import yaml | ||
from ._version import __version__ | ||
from .commondataparser import parse_new_metadata | ||
from .validphys_compatibility import legacy_to_new_map, legacy_to_new_mapping, new_to_legacy_map | ||
|
||
# Root of the installed package and the commondata folder inside it.
path_vpdata = pathlib.Path(__file__).parent
path_commondata = path_vpdata / "commondata"

# VP should not have access to this file, only to the products
_path_legacy_mapping = path_commondata / "dataset_names.yml"
theory_cards = path_vpdata / "theory_cards"

# NOTE: yaml.Loader is the full (unsafe) loader; it is applied here to a file
# located inside the package folder.
with open(_path_legacy_mapping) as file:
    _legacy_to_new_mapping_raw = yaml.load(file, yaml.Loader)

# Entries given as a bare string are normalised to {"dataset": <string>};
# any other entry is kept as it is.
legacy_to_new_mapping = {
    old_name: (new_info if not isinstance(new_info, str) else {"dataset": new_info})
    for old_name, new_info in _legacy_to_new_mapping_raw.items()
}
|
||
|
||
@lru_cache
def legacy_to_new_map(dataset_name, sys=None):
    """Translate an old dataset name (and systematics choice) to the new naming.

    Returns a ``(name, variant)`` tuple. Names that are not in
    ``legacy_to_new_mapping`` are returned unchanged with a ``None`` variant.
    When ``sys`` is given it is appended to the variant as ``_{sys}``.

    Raises
    ------
    KeyError
        If ``sys`` is given but the mapped dataset has no variant to extend.
    """
    try:
        entry = legacy_to_new_mapping[dataset_name]
    except KeyError:
        # Not a legacy name: hand it back untouched.
        return dataset_name, None

    variant = entry.get("variant")
    new_name = entry["dataset"]

    if sys is None:
        return new_name, variant

    if variant is None:
        raise KeyError(
            f"I cannot translate the combination of {dataset_name} and sys: {sys}. Please report this."
        )

    variant += f"_{sys}"
    return new_name, variant
|
||
|
||
@lru_cache
def new_to_legacy_map(dataset_name, variant_used):
    """Loop over the dictionary and find the right dataset.

    Since it is possible to have more than 1 dataset mapped to the same new one,
    returns a list of everything that matches.

    This function will loop over the entire dictionary of mappings and selects
    1. All datasets that match exactly what's in the runcard (dataset & variant): exact_matches
    2. All datasets that match the dataset name: matches
    If there are any `exact_matches`, it will return only those; otherwise, return all `matches`
    if there are no `matches` at all, return None
    """
    matches = []
    exact_matches = []

    for old_name, new_info in legacy_to_new_mapping.items():
        new_name = new_info["dataset"]
        variant = new_info.get("variant")

        if new_name == dataset_name:
            matches.append(old_name)
            # if it's a nuclear DIS data promote legacy to be legacy_dw
            if "_DW_" in old_name and variant_used == "legacy":
                variant = "legacy_dw"

            if variant_used == variant:
                exact_matches.append(old_name)

    # If we found exact matches, return those and stop looking
    if exact_matches:
        return exact_matches
    elif matches:
        return matches
    return None


def load_dataset_metadata(dataset_name, variant=None):
    """Given a dataset name, return the metadata.

    When no variant is given, the name is first passed through
    ``legacy_to_new_map`` so that old names are translated. The resulting
    name is split at its last underscore into the set name (the folder
    under ``path_commondata`` holding ``metadata.yaml``) and the observable.
    """
    # Compatibility with old nnpdf names, these two lines
    # might disappear at any given point
    if variant is None:
        dataset_name, variant = legacy_to_new_map(dataset_name)

    setname, observable = dataset_name.rsplit("_", 1)
    metadata_file = path_commondata / setname / "metadata.yaml"
    return parse_new_metadata(metadata_file, observable, variant=variant)
Oops, something went wrong.