diff --git a/CLDConfig/CLDReconstruction.py b/CLDConfig/CLDReconstruction.py
index 4683c4c..f104802 100644
--- a/CLDConfig/CLDReconstruction.py
+++ b/CLDConfig/CLDReconstruction.py
@@ -19,20 +19,24 @@
 import os
 
 from Gaudi.Configuration import INFO, WARNING, DEBUG
-from Configurables import k4DataSvc, MarlinProcessorWrapper
-from k4MarlinWrapper.inputReader import create_reader, attach_edm4hep2lcio_conversion
+from Configurables import EventDataSvc, MarlinProcessorWrapper, GeoSvc, TrackingCellIDEncodingSvc
+from k4FWCore import ApplicationMgr, IOSvc
 from k4FWCore.parseArgs import parser
-from py_utils import SequenceLoader, attach_lcio2edm4hep_conversion, create_writer, parse_collection_patch_file
+from py_utils import SequenceLoader, parse_collection_patch_file
+from k4MarlinWrapper.io_helpers import IOHandlerHelper
 
 import ROOT
 ROOT.gROOT.SetBatch(True)
 
 
 parser_group = parser.add_argument_group("CLDReconstruction.py custom options")
-parser_group.add_argument("--inputFiles", action="extend", nargs="+", metavar=("file1", "file2"), help="One or multiple input files")
+# Need the dummy input such that the IOHandlerHelper.add_reader call below does not crash when called with --help
+parser_group.add_argument("--inputFiles", action="store", nargs="+", metavar=("file1", "file2"), help="One or multiple input files", default=["dummy_input.edm4hep.root"])
 parser_group.add_argument("--outputBasename", help="Basename of the output file(s)", default="output")
 parser_group.add_argument("--trackingOnly", action="store_true", help="Run only track reconstruction", default=False)
 parser_group.add_argument("--enableLCFIJet", action="store_true", help="Enable LCFIPlus jet clustering parts", default=False)
+parser_group.add_argument("--enableMLJetTagger", action="store_true", help="Enable ML-based jet flavor tagging", default=False)
+parser_group.add_argument("--MLJetTaggerModel", action="store", help="Type of ML model to use for inference", type=str, default="model_ParT_ecm240_cld_o2_v5")
 parser_group.add_argument("--cms", action="store", help="Choose a Centre-of-Mass energy", default=240, choices=(91, 160, 240, 365), type=int)
 parser_group.add_argument("--compactFile", help="Compact detector file to use", type=str, default=os.environ["K4GEO"] + "/FCCee/CLD/compact/CLD_o2_v07/CLD_o2_v07.xml")
 tracking_group = parser_group.add_mutually_exclusive_group()
@@ -40,11 +44,12 @@ tracking_group.add_argument("--truthTracking", action="store_true",
                             default=False, help="Cheat tracking pattern recognition")
 
 reco_args = parser.parse_known_args()[0]
 
-algList = []
-svcList = []
-evtsvc = k4DataSvc("EventDataSvc")
-svcList.append(evtsvc)
+evtsvc = EventDataSvc("EventDataSvc")
+iosvc = IOSvc()
+
+svcList = [evtsvc, iosvc]
+algList = []
 
 CONFIG = {
     "CalorimeterIntegrationTimeWindow": "10ns",
@@ -59,7 +64,6 @@
 REC_COLLECTION_CONTENTS_FILE = "collections_rec_level.txt"  # file with the collections to be patched in when writing from LCIO to EDM4hep
 
 
-from Configurables import GeoSvc, TrackingCellIDEncodingSvc, Lcio2EDM4hepTool
 geoservice = GeoSvc("GeoSvc")
 geoservice.detectors = [reco_args.compactFile]
 geoservice.OutputLevel = INFO
@@ -92,13 +96,8 @@
     },
 )
 
-if reco_args.inputFiles:
-    read = create_reader(reco_args.inputFiles, evtsvc)
-    read.OutputLevel = INFO
-    algList.append(read)
-else:
-    print('WARNING: No input files specified, the CLD Reconstruction will fail')
-    read = None
+io_handler = IOHandlerHelper(algList, iosvc)
+io_handler.add_reader(reco_args.inputFiles)
 
 MyAIDAProcessor = MarlinProcessorWrapper("MyAIDAProcessor")
 MyAIDAProcessor.OutputLevel = WARNING
@@ -144,8 +143,15 @@
     sequenceLoader.load("HighLevelReco/PFOSelector")
     sequenceLoader.load("HighLevelReco/JetClusteringOrRenaming")
     sequenceLoader.load("HighLevelReco/JetAndVertex")
+    sequenceLoader.load("HighLevelReco/MLJetTagger")
 
 # event number processor, down here to attach the conversion back to edm4hep to it
 algList.append(EventNumber)
+from Configurables import EDM4hep2LcioTool
+input_conv = EDM4hep2LcioTool("EventNumber_InputConverter")
+input_conv.convertAll = True
+input_conv.collNameMapping = {"MCParticles": "MCParticle"}
+input_conv.OutputLevel = DEBUG
+EventNumber.EDM4hep2LcioTool = input_conv
 
 DST_KEEPLIST = ["MCParticlesSkimmed", "MCPhysicsParticles", "RecoMCTruthLink", "SiTracks", "SiTracks_Refitted", "PandoraClusters", "PandoraPFOs", "SelectedPandoraPFOs", "LooseSelectedPandoraPFOs", "TightSelectedPandoraPFOs", "RefinedVertexJets", "RefinedVertexJets_rel", "RefinedVertexJets_vtx", "RefinedVertexJets_vtx_RP", "BuildUpVertices", "BuildUpVertices_res", "BuildUpVertices_RP", "BuildUpVertices_res_RP", "BuildUpVertices_V0", "BuildUpVertices_V0_res", "BuildUpVertices_V0_RP", "BuildUpVertices_V0_res_RP", "PrimaryVertices", "PrimaryVertices_res", "PrimaryVertices_RP", "PrimaryVertices_res_RP", "RefinedVertices", "RefinedVertices_RP"]
@@ -153,10 +159,22 @@
 
 # TODO: replace all the ugly strings by something sensible like Enum
 if CONFIG["OutputMode"] == "LCIO":
-    Output_REC = create_writer("lcio", "Output_REC", f"{reco_args.outputBasename}_REC")
-    algList.append(Output_REC)
+    Output_REC = io_handler.add_lcio_writer("Output_REC")
+    Output_REC.Parameters = {
+        "LCIOOutputFile": [f"{reco_args.outputBasename}_REC.slcio"],
+        "LCIOWriteMode": ["WRITE_NEW"],
+    }
 
-    Output_DST = create_writer("lcio", "Output_DST", f"{reco_args.outputBasename}_DST", DST_KEEPLIST, DST_SUBSETLIST)
+    Output_DST = io_handler.add_lcio_writer("Output_DST")
+    dropped_types = ["MCParticle", "LCRelation", "SimCalorimeterHit", "CalorimeterHit", "SimTrackerHit", "TrackerHit", "TrackerHitPlane", "Track", "ReconstructedParticle", "LCFloatVec"]
+    Output_DST.Parameters = {
+        "LCIOOutputFile": [f"{reco_args.outputBasename}_DST.slcio"],
+        "LCIOWriteMode": ["WRITE_NEW"],
+        "DropCollectionNames": [],
+        "DropCollectionTypes": dropped_types,
+        "FullSubsetCollections": DST_SUBSETLIST,
+        "KeepCollectionNames": DST_KEEPLIST,
+    }
     algList.append(Output_DST)
 
 if CONFIG["OutputMode"] == "EDM4Hep":
@@ -169,21 +187,14 @@
     }
     algList.append(collPatcherRec)
 
-    Output_REC = create_writer("edm4hep", "Output_REC", f"{reco_args.outputBasename}_REC")
-    algList.append(Output_REC)
-
+    io_handler.add_edm4hep_writer(f"{reco_args.outputBasename}_REC.edm4hep.root", ["keep *"])
     # FIXME: needs https://github.com/key4hep/k4FWCore/issues/226
-    # Output_DST = create_writer("edm4hep", "Output_DST", f"{reco_args.outputBasename}_DST", DST_KEEPLIST)
-    # algList.append(Output_DST)
-
+
 
-# We need to convert the inputs in case we have EDM4hep input
-attach_edm4hep2lcio_conversion(algList, read)
-# We need to convert the outputs in case we have EDM4hep output
-attach_lcio2edm4hep_conversion(algList)
+# We need to attach all the necessary converters
+io_handler.finalize_converters()
 
-from Configurables import ApplicationMgr
 ApplicationMgr(
     TopAlg = algList,
     EvtSel = 'NONE',
     EvtMax = 3, # Overridden by the --num-events switch to k4run
diff --git a/CLDConfig/HighLevelReco/JetAndVertex.py b/CLDConfig/HighLevelReco/JetAndVertex.py
index 7ad11f7..18195db 100644
--- a/CLDConfig/HighLevelReco/JetAndVertex.py
+++ b/CLDConfig/HighLevelReco/JetAndVertex.py
@@ -16,7 +16,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from Gaudi.Configuration import WARNING
+from Gaudi.Configuration import WARNING, INFO
 
 from Configurables import MarlinProcessorWrapper
 
@@ -135,6 +135,12 @@
         "UpdateVertexRPDaughters": ["0"],
         "UseMCP": ["0"]
     }
+    JetClusteringAndRefinerPatcher = MarlinProcessorWrapper(
+        "JetClusteringAndRefinerPatcher", OutputLevel=INFO, ProcessorType="PatchCollections"
+    )
+    JetClusteringAndRefinerPatcher.Parameters = {
+        "PatchCollections": ["yth", "VertexJets|y01,y12,y23,y34,y45,y56,y67,y78,y89,y910"]
+    }
 
 if CONFIG["VertexUnconstrained"] == "ON":
     VertexFinderUnconstrained = MarlinProcessorWrapper("VertexFinderUnconstrained")
@@ -192,13 +198,14 @@
     }
 
 JetAndVertexSequence = [
-    VertexFinder,
+    VertexFinder
 ]
 
 # FIXME: LCFIPlus causes occasional breakage: https://github.com/lcfiplus/LCFIPlus/issues/69
 # due to not adding the jet clustering parameters to every event as PID information
 if reco_args.enableLCFIJet:
     JetAndVertexSequence.append(JetClusteringAndRefiner)
+    JetAndVertexSequence.append(JetClusteringAndRefinerPatcher)
 
 if CONFIG["VertexUnconstrained"] == "ON":
     JetAndVertexSequence.append(VertexFinderUnconstrained)
diff --git a/CLDConfig/HighLevelReco/MLJetTagger.py b/CLDConfig/HighLevelReco/MLJetTagger.py
new file mode 100644
index 0000000..6adf22f
--- /dev/null
+++ b/CLDConfig/HighLevelReco/MLJetTagger.py
@@ -0,0 +1,73 @@
+#
+# Copyright (c) 2014-2024 Key4hep-Project.
+#
+# This file is part of Key4hep.
+# See https://key4hep.github.io/key4hep-doc/ for further info.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from Configurables import JetTagger
+import yaml
+import os
+
+if reco_args.enableMLJetTagger:
+    # check if jet clustering is also enabled (prerequisite for jet flavor tagging)
+    if not reco_args.enableLCFIJet:
+        raise ValueError("MLJetTagger requires LCFIPlus jet clustering to be enabled. Please add --enableLCFIJet to the command or disable --enableMLJetTagger.")
+
+    # Get the directory of the current script
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    # Construct the path to the YAML file
+    yaml_path = os.path.join(script_dir, "models_MLJetTagger.yaml")
+
+    # Load YAML config
+    with open(yaml_path, "r") as file:
+        model_config = yaml.safe_load(file)
+
+    # check if the model type is valid
+    if reco_args.MLJetTaggerModel not in model_config:
+        raise ValueError(f"Invalid model type '{reco_args.MLJetTaggerModel}'. Valid options are: {', '.join(model_config.keys())}.")
+
+    # load the model configuration
+    onnx_model = model_config[reco_args.MLJetTaggerModel]["onnx_model"]
+    json_onnx_config = model_config[reco_args.MLJetTaggerModel]["json_onnx_config"]
+    flavor_collection_names = model_config[reco_args.MLJetTaggerModel]["flavor_collection_names"]
+
+    # print out the model configuration
+    print("RUNNING JET TAGGING WITH MLJETTAGGER")
+
+    print(f"Using MLJetTagger model: \t\t {reco_args.MLJetTaggerModel}\n",
+          f"The model uses the architecture: \t {model_config[reco_args.MLJetTaggerModel]['model']}\n",
+          f"was trained on the kinematics: \t {model_config[reco_args.MLJetTaggerModel]['kinematics']}\n",
+          f"and the detector version: \t\t {model_config[reco_args.MLJetTaggerModel]['detector']}\n",
+          f"at a center-of-mass energy of: \t {model_config[reco_args.MLJetTaggerModel]['ecm']} GeV\n",
+          f"Comment: \t\t\t\t {model_config[reco_args.MLJetTaggerModel]['comment']}\n",
+          f"Appending collections to the event: \t {', '.join(flavor_collection_names)}\n",)
+
+    # create the MLJetTagger algorithm
+
+    k4MLJetTagger = JetTagger("JetTagger",
+                              model_path=onnx_model,
+                              json_path=json_onnx_config,
+                              flavor_collection_names=flavor_collection_names,  # to make sure the order and naming is correct
+                              InputJets=["RefinedVertexJets"],
+                              InputPrimaryVertices=["PrimaryVertices"],
+                              OutputIDCollections=flavor_collection_names,
+                              )
+
+    # define the sequence that the SequenceLoader appends to the algorithm list
+    MLJetTaggerSequence = [
+        k4MLJetTagger,
+    ]
+else:
+    MLJetTaggerSequence = []
diff --git a/CLDConfig/HighLevelReco/models_MLJetTagger.yaml b/CLDConfig/HighLevelReco/models_MLJetTagger.yaml
new file mode 100644
index 0000000..1e70feb
--- /dev/null
+++ b/CLDConfig/HighLevelReco/models_MLJetTagger.yaml
@@ -0,0 +1,18 @@
+# This YAML file stores, for each available jet-flavor tagger model, how it was trained and the information needed to run inference; new models should be added here.
+
+model_ParT_ecm240_cld_o2_v5:
+  model: "ParticleTransformer"
+  ecm: 240
+  detector: "CLD_o2_v5"
+  kinematics: "Z(vv)H(jj)"
+  onnx_model: "/eos/experiment/fcc/ee/jet_flavour_tagging/fullsim_test_spring2024/fullsimCLD240_2mio.onnx"
+  json_onnx_config: "/eos/experiment/fcc/ee/jet_flavour_tagging/fullsim_test_spring2024/preprocess_fullsimCLD240_2mio.json"
+  flavor_collection_names:
+    - "RefinedJetTag_G"
+    - "RefinedJetTag_U"
+    - "RefinedJetTag_S"
+    - "RefinedJetTag_C"
+    - "RefinedJetTag_B"
+    - "RefinedJetTag_D"
+    - "RefinedJetTag_TAU"
+  comment: "The model was trained on 1.9 million jets per flavor. First implementation of ML tagging for full sim."
diff --git a/CLDConfig/cdb.log b/CLDConfig/cdb.log
new file mode 100644
index 0000000..e69de29
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 149c62a..27eec6a 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -61,6 +61,12 @@ add_test(NAME LCFIJet
 )
 set_property(TEST LCFIJet APPEND PROPERTY DEPENDS ddsim_edm4hep)
 
+add_test(NAME MLJetTagger
+         WORKING_DIRECTORY ${CLDConfig_DIR}
+         COMMAND k4run --enableLCFIJet --enableMLJetTagger --inputFiles=test.edm4hep.root --outputBasename=MLJetTagger_test_edm4hep CLDReconstruction.py --GeoSvc.detectors=${DETECTOR}
+)
+set_property(TEST MLJetTagger APPEND PROPERTY DEPENDS ddsim_edm4hep)
+
 add_test(NAME tracking_truth
          WORKING_DIRECTORY ${CLDConfig_DIR}
          COMMAND k4run --trackingOnly --truthTracking --inputFiles=test.edm4hep.root --outputBasename=trkOnly_truth_test_edm4hep CLDReconstruction.py --GeoSvc.detectors=${DETECTOR}
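
Note on how the new HighLevelReco/MLJetTagger.py is picked up: like JetAndVertex.py, it is not imported but loaded through sequenceLoader.load("HighLevelReco/MLJetTagger") in CLDReconstruction.py, so it must define a list called MLJetTaggerSequence and can use the shared names reco_args and CONFIG without importing them. The sketch below illustrates that loading convention under the assumption that py_utils.SequenceLoader executes the sequence file with shared globals; the class name SequenceLoaderSketch and its constructor signature are illustrative only, not the actual py_utils implementation.

    # Illustrative sketch only; the real loader lives in py_utils.SequenceLoader.
    import os

    class SequenceLoaderSketch:
        def __init__(self, alg_list, global_vars):
            self.alg_list = alg_list        # e.g. algList from CLDReconstruction.py
            self.global_vars = global_vars  # e.g. {"reco_args": reco_args, "CONFIG": CONFIG}

        def load(self, sequence):
            """Execute e.g. HighLevelReco/MLJetTagger.py and collect MLJetTaggerSequence."""
            env = dict(self.global_vars)
            # let the loaded file locate siblings such as models_MLJetTagger.yaml via __file__
            env["__file__"] = os.path.abspath(f"{sequence}.py")
            with open(f"{sequence}.py") as f:
                exec(compile(f.read(), f"{sequence}.py", "exec"), env)
            # the file is expected to define <basename>Sequence (empty when the tagger is disabled)
            self.alg_list.extend(env[os.path.basename(sequence) + "Sequence"])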