Skip to content

Commit

Permalink
Extract search for input files into separate function (HEP-FCC#418)
Browse files Browse the repository at this point in the history
* Extract search for input files into separate function

* Add few printouts

* Using join to merge path
  • Loading branch information
kjvbrt authored Nov 12, 2024
1 parent de8c9eb commit b9b8422
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 44 deletions.
9 changes: 5 additions & 4 deletions python/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def get_entries(inpath: str) -> int | None:
try:
nevents = infile.Get("events").GetEntries()
except AttributeError:
LOGGER.error('Input file is missing "events" TTree!\nAborting...')
LOGGER.error('Input file is missing "events" TTree!\n - %s'
'\nAborting...', inpath)
sys.exit(3)

return nevents
Expand Down Expand Up @@ -63,15 +64,15 @@ def get_entries_sow(infilepath: str, nevents_max: Optional[int] = None, get_loca
sumOfWeightsTTree = 0.

# check for empty chunk (can this be improved? exception from RDF cannot be caught it seems?)
tree =infile.Get("events")
tree = infile.Get("events")
if not tree:
print("Tree not found in file", infilepath, " possibly empty chunk - continuing with next one.")
infile.Close()
return processEvents, eventsTTree, processSumOfWeights, sumOfWeightsTTree

try:

#use a RDF here too so the nevents restriction option can be imposed easily for the local events
# use a RDF here too so the nevents restriction option can be imposed easily for the local events
rdf_tmp = ROOT.ROOT.RDataFrame("events", infilepath)

if nevents_max:
Expand All @@ -85,7 +86,7 @@ def get_entries_sow(infilepath: str, nevents_max: Optional[int] = None, get_loca
# infile.Get("events").Draw('EventHeader.weight[0]>>histo')
# histo=ROOT.gDirectory.Get('histo')
histo = rdf_tmp.Histo1D(weight_name)
sumOfWeightsTTree=float(eventsTTree)*histo.GetMean()
sumOfWeightsTTree = float(eventsTTree) * histo.GetMean()
except cppyy.gbl.std.runtime_error:
LOGGER.error('Error: Event weights requested with do_weighted,'
'but input file does not contain weight column. Aborting.')
Expand Down
3 changes: 2 additions & 1 deletion python/run_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -886,7 +886,8 @@ def run_histmaker(args, rdf_module, anapath):

LOGGER.info('Writing out process %s, nEvents processed %s',
process, f'{evtcount.GetValue():,}')
with ROOT.TFile(f'{output_dir}/{process}.root', 'RECREATE'):
with ROOT.TFile(os.path.join(output_dir, f'{process}.root'),
'RECREATE'):
for hist in hists_to_write.values():
if do_scale:
hist.Scale(scale * int_lumi)
Expand Down
102 changes: 63 additions & 39 deletions python/run_final_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ def get_entries(infilepath: str) -> tuple[int, int]:
try:
events_in_ttree = infile.Get("events").GetEntries()
except AttributeError:
LOGGER.error('Input file is missing "events" TTree!\nAborting...')
LOGGER.error('Input file is missing "events" TTree!\n - %s'
'\nAborting...')
sys.exit(3)

return events_processed, events_in_ttree
Expand All @@ -67,6 +68,42 @@ def get_processes(rdf_module: object) -> list[str]:
return process_list


# _____________________________________________________________________________
def find_sample_files(input_dir: str,
                      sample_name: str) -> list[str]:
    '''
    Find input files for the specified sample name.

    Two layouts are tried, in this order:
      1. `<input_dir>/<sample_name>/` is a directory: every regular file
         directly inside it whose name ends with `.root` is taken.
      2. Otherwise (or if the directory holds no such file):
         `<input_dir>/<sample_name>.root` is taken if it is a regular file.

    Returns a list of absolute paths. Files found in a sample directory are
    sorted by file name, so the order is reproducible between runs.

    If no input file is found, an error is logged and the program exits
    with status code 3.
    '''
    result: list[str] = []

    full_input_path = os.path.abspath(os.path.join(input_dir, sample_name))

    # Collect all regular files ending with `.root` from the sample directory
    if os.path.isdir(full_input_path):
        # os.listdir() returns entries in arbitrary order, sort them to get
        # the same file order on every run/filesystem
        for file_name in sorted(os.listdir(full_input_path)):
            if not file_name.endswith('.root'):
                continue
            file_path = os.path.join(full_input_path, file_name)
            # Skip directories which happen to be named `*.root`
            if os.path.isfile(file_path):
                result.append(file_path)

    # Handle case when there is just one input file
    if not result:
        single_file_path = full_input_path + '.root'
        if os.path.isfile(single_file_path):
            result.append(single_file_path)
        else:
            LOGGER.debug('Input file "%s" does not exist!',
                         single_file_path)

    if not result:
        LOGGER.error('Can not find input files for "%s" sample!\nAborting...',
                     sample_name)
        sys.exit(3)

    return result


# _____________________________________________________________________________
def save_results(results: dict[str, dict[str, any]],
rdf_module: object) -> None:
Expand Down Expand Up @@ -279,47 +316,34 @@ def run(rdf_module, args) -> None:

file_list[process_name] = ROOT.vector('string')()

infilepath = input_dir + process_name + '.root' # input file

if not os.path.isfile(infilepath):
LOGGER.debug('File %s does not exist!\nTrying if it is a '
'directory as it might have been processed in batch.',
infilepath)
else:
LOGGER.info('Open file:\n %s', infilepath)
flist = find_sample_files(input_dir, process_name)
for filepath in flist:
# TODO: check in `get_entries()` if file is valid and remove it
# from the input list if it is not
if do_weighted:
process_events[process_name], events_ttree[process_name], \
sow_process[process_name], sow_ttree[process_name] = \
get_entries_sow(infilepath, weight_name="weight")
else:
process_events[process_name], events_ttree[process_name] = \
get_entries(infilepath)
file_list[process_name].push_back(infilepath)

indirpath = input_dir + process_name
if os.path.isdir(indirpath):
#reset the nevts/sow counters to avoid wrong counting in case a single file of same name (e.g. local test output) also exists in the same directory
process_events[process_name] = 0
events_ttree[process_name] = 0
sow_process[process_name] = 0.
sow_ttree[process_name] = 0.

info_msg = f'Open directory {indirpath}'
flist = glob.glob(indirpath + '/chunk*.root')
for filepath in flist:
info_msg += '\n\t' + filepath
if do_weighted:
chunk_process_events, chunk_events_ttree, \
chunk_process_events, chunk_events_ttree, \
chunk_sow_process, chunk_sow_ttree = \
get_entries_sow(filepath, weight_name="weight")
sow_process[process_name] += chunk_sow_process
sow_ttree[process_name] += chunk_sow_ttree
else:
chunk_process_events, chunk_events_ttree = \
get_entries(filepath)
process_events[process_name] += chunk_process_events
events_ttree[process_name] += chunk_events_ttree
file_list[process_name].push_back(filepath)
sow_process[process_name] += chunk_sow_process
sow_ttree[process_name] += chunk_sow_ttree
else:
chunk_process_events, chunk_events_ttree = \
get_entries(filepath)
process_events[process_name] += chunk_process_events
events_ttree[process_name] += chunk_events_ttree
file_list[process_name].push_back(filepath)
if len(file_list[process_name]) < 1:
LOGGER.error('No valid input files for sample "%s" '
'found!\nAborting..', process_name)
sys.exit(3)
if len(file_list[process_name]) == 1:
LOGGER.info('Loading events for sample "%s" from file:\n - %s',
process_name, file_list[process_name][0])
else:
info_msg = f'Loading events for sample "{process_name}"'
info_msg += ' from files:'
for filepath in file_list[process_name]:
info_msg += f'\n - {filepath}'
LOGGER.info(info_msg)

info_msg = 'Processed events:'
Expand Down

0 comments on commit b9b8422

Please sign in to comment.