#include <unordered_map>

using std::make_pair;

const Int_t TMVA::MethodBDT::fgDebugLevel = 0;

TMVA::MethodBDT::MethodBDT( const TString& jobName,
                            const TString& methodTitle,
                            DataSetInfo& theData,
                            const TString& theOption ) :
   TMVA::MethodBase( jobName, Types::kBDT, methodTitle, theData, theOption)
   , fSigToBkgFraction(0)
   , fBaggedGradBoost(kFALSE)
   , fMinNodeSizeS("5%")
   , fMinLinCorrForFisher(.8)
   , fUseExclusiveVars(0)
   , fUseYesNoLeaf(kFALSE)
   , fNodePurityLimit(0)
   , fFValidationEvents(0)
   , fRandomisedTrees(kFALSE)
   , fUsePoissonNvars(0)
   , fUseNTrainEvents(0)
   , fBaggedSampleFraction(0)
   , fNoNegWeightsInTraining(kFALSE)
   , fInverseBoostNegWeights(kFALSE)
   , fPairNegWeightsGlobal(kFALSE)
   , fTrainWithNegWeights(kFALSE)
   , fDoBoostMonitor(kFALSE)
   , fDoPreselection(kFALSE)
   , fSkipNormalization(kFALSE)
   , fHistoricBool(kFALSE)
{
   fMonitorNtuple = NULL;
   fRegressionLossFunctionBDTG = nullptr;
}
// constructor for calculating BDT-MVA using previously generated decision trees
TMVA::MethodBDT::MethodBDT( DataSetInfo& theData,
                            const TString& theWeightFile ) :
   TMVA::MethodBase( Types::kBDT, theData, theWeightFile)
   , fSigToBkgFraction(0)
   , fBaggedGradBoost(kFALSE)
   , fMinNodeSizeS("5%")
   , fMinLinCorrForFisher(.8)
   , fUseExclusiveVars(0)
   , fNodePurityLimit(0)
   , fFValidationEvents(0)
   , fRandomisedTrees(kFALSE)
   , fUsePoissonNvars(0)
   , fUseNTrainEvents(0)
   , fBaggedSampleFraction(0)
   , fNoNegWeightsInTraining(kFALSE)
   , fInverseBoostNegWeights(kFALSE)
   , fPairNegWeightsGlobal(kFALSE)
   , fTrainWithNegWeights(kFALSE)
   , fSkipNormalization(kFALSE)
   DeclareOptionRef(fNTrees, "NTrees", "Number of trees in the forest");
   if (DoRegression()) {
      DeclareOptionRef(fMaxDepth=50,"MaxDepth","Max depth of the decision tree allowed");
   }else{
      DeclareOptionRef(fMaxDepth=3,"MaxDepth","Max depth of the decision tree allowed");
   }

   TString tmp="5%"; if (DoRegression()) tmp="0.2%";
   DeclareOptionRef(fMinNodeSizeS=tmp, "MinNodeSize", "Minimum percentage of training events required in a leaf node (default: Classification: 5%, Regression: 0.2%)");

   DeclareOptionRef(fNCuts, "nCuts", "Number of grid points in variable range used in finding optimal cut in node splitting");

   DeclareOptionRef(fBoostType, "BoostType", "Boosting type for the trees in the forest (note: AdaCost is still experimental)");

   AddPreDefVal(TString("AdaBoost"));
   AddPreDefVal(TString("RealAdaBoost"));
   AddPreDefVal(TString("AdaCost"));
   AddPreDefVal(TString("Bagging"));
   AddPreDefVal(TString("AdaBoostR2"));
   if (DoRegression()) {
      fBoostType = "AdaBoostR2";
   }else{
      fBoostType = "AdaBoost";
   }
   DeclareOptionRef(fAdaBoostR2Loss="Quadratic", "AdaBoostR2Loss", "Type of Loss function in AdaBoostR2");
   AddPreDefVal(TString("Linear"));
   AddPreDefVal(TString("Quadratic"));
   AddPreDefVal(TString("Exponential"));
   DeclareOptionRef(fBaggedBoost=kFALSE, "UseBaggedBoost","Use only a random subsample of all events for growing the trees in each boost iteration.");
   DeclareOptionRef(fShrinkage=1.0, "Shrinkage", "Learning rate for GradBoost algorithm");
   DeclareOptionRef(fAdaBoostBeta=.5, "AdaBoostBeta", "Learning rate for AdaBoost algorithm");
   DeclareOptionRef(fRandomisedTrees,"UseRandomisedTrees","Determine at each node splitting the cut variable only as the best out of a random subset of variables (like in RandomForests)");
   DeclareOptionRef(fUseNvars,"UseNvars","Size of the subset of variables used with RandomisedTree option");
   DeclareOptionRef(fUsePoissonNvars,"UsePoissonNvars", "Interpret \"UseNvars\" not as fixed number but as mean of a Poisson distribution in each split with RandomisedTree option");
   DeclareOptionRef(fBaggedSampleFraction=.6,"BaggedSampleFraction","Relative size of bagged event sample to original size of the data sample (used whenever bagging is used, i.e. UseBaggedBoost or Bagging)" );

   DeclareOptionRef(fUseYesNoLeaf=kTRUE, "UseYesNoLeaf",
                    "Use Sig or Bkg categories, or the purity=S/(S+B) as classification of the leaf node -> Real-AdaBoost");
   if (DoRegression()) {
      fUseYesNoLeaf = kFALSE;
   }
   DeclareOptionRef(fNegWeightTreatment="InverseBoostNegWeights","NegWeightTreatment","How to treat events with negative weights in the BDT training (in particular the boosting): ignore them in the training; boost with the inverse boost weight; or pair events with negative and positive weights in the training sample and *annihilate* them (experimental!)");
   AddPreDefVal(TString("InverseBoostNegWeights"));
   AddPreDefVal(TString("IgnoreNegWeightsInTraining"));
   AddPreDefVal(TString("NoNegWeightsInTraining"));
   AddPreDefVal(TString("PairNegWeightsGlobal"));
   DeclareOptionRef(fCss=1.,   "Css",   "AdaCost: cost of true signal selected signal");
   DeclareOptionRef(fCts_sb=1.,"Cts_sb","AdaCost: cost of true signal selected bkg");
   DeclareOptionRef(fCtb_ss=1.,"Ctb_ss","AdaCost: cost of true bkg selected signal");
   DeclareOptionRef(fCbb=1.,   "Cbb",   "AdaCost: cost of true bkg selected bkg");

   DeclareOptionRef(fNodePurityLimit=0.5, "NodePurityLimit", "In boosting/pruning, nodes with purity > NodePurityLimit are signal; background otherwise.");
   DeclareOptionRef(fSepTypeS, "SeparationType", "Separation criterion for node splitting");
   AddPreDefVal(TString("CrossEntropy"));
   AddPreDefVal(TString("GiniIndex"));
   AddPreDefVal(TString("GiniIndexWithLaplace"));
   AddPreDefVal(TString("MisClassificationError"));
   AddPreDefVal(TString("SDivSqrtSPlusB"));
   AddPreDefVal(TString("RegressionVariance"));
   if (DoRegression()) {
      fSepTypeS = "RegressionVariance";
   }else{
      fSepTypeS = "GiniIndex";
   }

   DeclareOptionRef(fRegressionLossFunctionBDTGS = "Huber", "RegressionLossFunctionBDTG", "Loss function for BDTG regression.");
   AddPreDefVal(TString("Huber"));
   AddPreDefVal(TString("AbsoluteDeviation"));
   AddPreDefVal(TString("LeastSquares"));

   DeclareOptionRef(fHuberQuantile = 0.7, "HuberQuantile", "In the Huber loss function this is the quantile that separates the core from the tails in the residuals distribution.");

   DeclareOptionRef(fDoBoostMonitor=kFALSE,"DoBoostMonitor","Create control plot with ROC integral vs tree number");

   DeclareOptionRef(fUseFisherCuts=kFALSE, "UseFisherCuts", "Use multivariate splits using the Fisher criterion");
   DeclareOptionRef(fMinLinCorrForFisher=.8,"MinLinCorrForFisher", "The minimum linear correlation between two variables demanded for use in Fisher criterion in node splitting");
   DeclareOptionRef(fUseExclusiveVars=kFALSE,"UseExclusiveVars","Variables already used in Fisher criterion are not anymore analysed individually for node splitting");

   DeclareOptionRef(fDoPreselection=kFALSE,"DoPreselection","Apply automatic pre-selection for 100% efficient signal (bkg) cuts prior to training");

   DeclareOptionRef(fSigToBkgFraction=1,"SigToBkgFraction","Sig to Bkg ratio used in Training (similar to NodePurityLimit, which cannot be used in real adaboost)");
   DeclareOptionRef(fPruneMethodS, "PruneMethod", "Note: for BDTs use small trees (e.g. MaxDepth=3) and NoPruning. Pruning: method used for pruning (removal) of statistically insignificant branches");
   AddPreDefVal(TString("NoPruning"));
   AddPreDefVal(TString("ExpectedError"));
   AddPreDefVal(TString("CostComplexity"));

   DeclareOptionRef(fPruneStrength, "PruneStrength", "Pruning strength");

   DeclareOptionRef(fFValidationEvents=0.5, "PruningValFraction", "Fraction of events to use for optimizing automatic pruning.");

   DeclareOptionRef(fSkipNormalization=kFALSE, "SkipNormalization", "Skip normalization at initialization, to keep expectation value of BDT output according to the fraction of events");

   // deprecated options, still kept for backward compatibility:
   DeclareOptionRef(fMinNodeEvents=0, "nEventsMin", "deprecated: Use MinNodeSize (in % of training events) instead");

   DeclareOptionRef(fBaggedGradBoost=kFALSE, "UseBaggedGrad","deprecated: Use *UseBaggedBoost* instead: Use only a random subsample of all events for growing the trees in each iteration.");
   DeclareOptionRef(fBaggedSampleFraction, "GradBaggingFraction","deprecated: Use *BaggedSampleFraction* instead: Defines the fraction of events to be used in each iteration, e.g. when UseBaggedGrad=kTRUE.");
   DeclareOptionRef(fUseNTrainEvents,"UseNTrainEvents","deprecated: Use *BaggedSampleFraction* instead: Number of randomly picked training events used in randomised (and bagged) trees");
   DeclareOptionRef(fNNodesMax,"NNodesMax","deprecated: Use MaxDepth instead to limit the tree size" );

   // options kept only so that old option strings and weight files can still be read:
   DeclareOptionRef(fHistoricBool=kTRUE, "UseWeightedTrees",
                    "Use weighted trees or simple average in classification from the forest");
   DeclareOptionRef(fHistoricBool=kFALSE, "PruneBeforeBoost", "Flag to prune the tree before applying boosting algorithm");
   DeclareOptionRef(fHistoricBool=kFALSE, "RenormByClass", "Individually re-normalize each event class to the original size after boosting");

   AddPreDefVal(TString("NegWeightTreatment"),TString("IgnoreNegWeights"));
   else if (fSepTypeS == "giniindex")          fSepType = new GiniIndex();
   else if (fSepTypeS == "crossentropy")       fSepType = new CrossEntropy();
   else if (fSepTypeS == "sdivsqrtsplusb")     fSepType = new SdivSqrtSplusB();
   else if (fSepTypeS == "regressionvariance") fSepType = NULL;
   else {
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> unknown Separation Index option " << fSepTypeS << " called" << Endl;
   }

   if(!(fHuberQuantile >= 0.0 && fHuberQuantile <= 1.0)){
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> Huber Quantile must be in range [0,1]. Value given, " << fHuberQuantile << ", does not meet this criterion" << Endl;
   }

   fRegressionLossFunctionBDTGS.ToLower();
   if (fRegressionLossFunctionBDTGS == "huber") fRegressionLossFunctionBDTG = new HuberLossFunctionBDT(fHuberQuantile);
   else {
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> unknown Regression Loss Function BDT option " << fRegressionLossFunctionBDTGS << " called" << Endl;
   }

   fPruneMethodS.ToLower();
   if      (fPruneMethodS == "expectederror")  fPruneMethod = DecisionTree::kExpectedErrorPruning;
   else if (fPruneMethodS == "costcomplexity") fPruneMethod = DecisionTree::kCostComplexityPruning;
   else if (fPruneMethodS == "nopruning")      fPruneMethod = DecisionTree::kNoPruning;
   else {
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> unknown PruneMethod " << fPruneMethodS << " option called" << Endl;
   }

   if (fAutomatic && fPruneMethod == DecisionTree::kExpectedErrorPruning) {
      Log() << kFATAL
            << "Sorry, automatic pruning strength determination is not implemented yet for ExpectedErrorPruning" << Endl;
   }
   if (fMinNodeEvents > 0){
      fMinNodeSize = Double_t(fMinNodeEvents*100.) / Data()->GetNTrainingEvents();
      Log() << kWARNING << "You have explicitly set ** nEventsMin = " << fMinNodeEvents << " ** the min absolute number \n"
            << "of events in a leaf node. This is DEPRECATED, please use the option \n"
            << "*MinNodeSize* giving the relative number as percentage of training \n"
            << "events instead. \n"
            << "nEventsMin=" << fMinNodeEvents << " --> MinNodeSize=" << fMinNodeSize << "%"
            << Endl;
      Log() << kWARNING << "Note also that explicitly setting *nEventsMin* so far OVERWRITES the recommended \n"
            << " *MinNodeSize* = " << fMinNodeSizeS << " option !!" << Endl;
      fMinNodeSizeS = Form("%3.2f", fMinNodeSize);
   }else{
      SetMinNodeSize(fMinNodeSizeS);
   }
   fAdaBoostR2Loss.ToLower();

   if (fBoostType=="Grad") {
      fPruneMethod = DecisionTree::kNoPruning;
      if (fNegWeightTreatment=="InverseBoostNegWeights"){
         Log() << kINFO << "the option *InverseBoostNegWeights* does not exist for BoostType=Grad --> change" << Endl;
         Log() << kINFO << "to new default for GradBoost *Pray*" << Endl;
         Log() << kDEBUG << "i.e. simply keep them as is, which should work fine for Grad Boost" << Endl;
         fNegWeightTreatment="Pray";
         fNoNegWeightsInTraining=kFALSE;
      }
   } else if (fBoostType=="RealAdaBoost"){
      fBoostType    = "AdaBoost";
      fUseYesNoLeaf = kFALSE;
   } else if (fBoostType=="AdaCost"){
      fUseYesNoLeaf = kFALSE;
   }
   if (fFValidationEvents < 0.0) fFValidationEvents = 0.0;
   if (fAutomatic && fFValidationEvents > 0.5) {
      Log() << kWARNING << "You have chosen to use more than half of your training sample "
            << "to optimize the automatic pruning algorithm. This is probably wasteful "
            << "and your overall results will be degraded. Are you sure you want this?"
            << Endl;
   }

   if (this->Data()->HasNegativeEventWeights()){
      Log() << kINFO << " You are using a Monte Carlo that has also negative weights. "
            << "That should in principle be fine as long as on average you end up with "
            << "something positive. For this you have to make sure that the minimal number "
            << "of (un-weighted) events demanded for a tree node (currently you use: MinNodeSize="
            << fMinNodeSizeS << " (" << fMinNodeSize << "%)"
            << ", or the deprecated equivalent nEventsMin, which you can set via the "
            << "BDT option string when booking the "
            << "classifier) is large enough to allow for reasonable averaging!!! "
            << " If this does not help.. maybe you want to try the option: IgnoreNegWeightsInTraining "
            << "which ignores events with negative weight in the training. " << Endl
            << Endl << "Note: You'll get a WARNING message during the training if that should ever happen" << Endl;
   }
   if (DoRegression()) {
      if (fUseYesNoLeaf && !IsConstructedFromWeightFile()){
         Log() << kWARNING << "Regression Trees do not work with fUseYesNoLeaf=TRUE --> I will set it to FALSE" << Endl;
         fUseYesNoLeaf = kFALSE;
      }

      if (fSepType != NULL){
         Log() << kWARNING << "Regression Trees do not work with Separation type other than <RegressionVariance> --> I will use it instead" << Endl;
         fSepType = NULL;
      }
      if (fUseFisherCuts) {
         Log() << kWARNING << "Sorry, UseFisherCuts is not available for regression analysis, I will ignore it!" << Endl;
         fUseFisherCuts = kFALSE;
      }
      if (fNCuts < 0) {
         Log() << kWARNING << "Sorry, the option of nCuts<0 using a more elaborate node splitting algorithm " << Endl;
         Log() << kWARNING << "is not implemented for regression analysis ! " << Endl;
         Log() << kWARNING << "--> I switch to default nCuts = 20 and use standard node splitting" << Endl;
         fNCuts = 20;
      }
   }
   if (fRandomisedTrees){
      Log() << kINFO << " Randomised trees use no pruning" << Endl;
      fPruneMethod = DecisionTree::kNoPruning;
   }
   if (fUseFisherCuts) {
      Log() << kWARNING << "When using the option UseFisherCuts, the other option nCuts<0 (i.e. using" << Endl;
      Log() << " a more elaborate node splitting algorithm) is not implemented. " << Endl;
      fNCuts = 20;
   }

   if (fNTrees==0){
      Log() << kERROR << " Zero Decision Trees demanded... that does not work !! "
            << " I set it to 1 .. just so that the program does not crash"
            << Endl;
      fNTrees = 1;
   }
   fNegWeightTreatment.ToLower();
   if      (fNegWeightTreatment == "ignorenegweightsintraining") fNoNegWeightsInTraining = kTRUE;
   else if (fNegWeightTreatment == "nonegweightsintraining")     fNoNegWeightsInTraining = kTRUE;
   else if (fNegWeightTreatment == "inverseboostnegweights")     fInverseBoostNegWeights = kTRUE;
   else if (fNegWeightTreatment == "pairnegweightsglobal")       fPairNegWeightsGlobal   = kTRUE;
   else if (fNegWeightTreatment == "pray") Log() << kDEBUG << "Yes, good luck with praying " << Endl;
   else {
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<ProcessOptions> unknown option for treating negative event weights during training " << fNegWeightTreatment << " requested" << Endl;
   }

   if (fNegWeightTreatment == "pairnegweightsglobal")
      Log() << kWARNING << " you specified the option NegWeightTreatment=PairNegWeightsGlobal : This option is still considered EXPERIMENTAL !! " << Endl;
   while (tmp < fNNodesMax){
      tmp += 2*tmp;
      fMaxDepth++;
   }
   Log() << kWARNING << "You have specified a deprecated option *NNodesMax=" << fNNodesMax
         << "* \n this has been translated to MaxDepth=" << fMaxDepth << Endl;

   if (fUseNTrainEvents>0){
      fBaggedSampleFraction = (Double_t)fUseNTrainEvents/Data()->GetNTrainingEvents();
      Log() << kWARNING << "You have specified a deprecated option *UseNTrainEvents=" << fUseNTrainEvents
            << "* \n this has been translated to BaggedSampleFraction=" << fBaggedSampleFraction << "(%)" << Endl;
   }

   if (fBoostType=="Bagging") fBaggedBoost = kTRUE;
   if (fBaggedGradBoost){
      fBaggedBoost = kTRUE;
      Log() << kWARNING << "You have specified a deprecated option *UseBaggedGrad* --> please use *UseBaggedBoost* instead" << Endl;
   }
   if (sizeInPercent > 0 && sizeInPercent < 50){
      fMinNodeSize = sizeInPercent;
   } else {
      Log() << kFATAL << "you have demanded a minimal node size of "
            << sizeInPercent << "% of the training events.. \n"
            << " that somehow does not make sense " << Endl;
   }

   if (sizeInPercent.IsFloat()) SetMinNodeSize(sizeInPercent.Atof());
   else {
      Log() << kFATAL << "I had problems reading the option MinNodeEvents, which "
            << "after removing a possible % sign now reads " << sizeInPercent << Endl;
   }
   if (fAnalysisType == Types::kClassification || fAnalysisType == Types::kMulticlass ) {
      fBoostType = "AdaBoost";
      if(DataInfo().GetNClasses()!=0) // workaround for multiclass application
         fMinNodeSize = 5.;
   }else{
      fBoostType      = "AdaBoostR2";
      fAdaBoostR2Loss = "Quadratic";
      if(DataInfo().GetNClasses()!=0) // workaround for multiclass application
         fMinNodeSize = .2;
   }

   fPruneMethodS      = "NoPruning";
   fFValidationEvents = 0.5;
   fRandomisedTrees   = kFALSE;
   fUsePoissonNvars   = kTRUE;

   SetSignalReferenceCut( 0 );
   for (UInt_t i=0; i<fForest.size(); i++) delete fForest[i];
   fForest.clear();

   fBoostWeights.clear();
   if (fMonitorNtuple) { fMonitorNtuple->Delete(); fMonitorNtuple=NULL; }
   fVariableImportance.clear();
   fLossFunctionEventInfo.clear();

   Log() << kDEBUG << " successfully(?) reset the method " << Endl;
   for (UInt_t i=0; i<fForest.size(); i++) delete fForest[i];

   if (!HasTrainingTree()) Log() << kFATAL << "<Init> Data().TrainingTree() is zero pointer" << Endl;
   if (fEventSample.size() > 0) {
      // reset all previously stored/accumulated BOOST weights in the event sample
      for (UInt_t iev=0; iev<fEventSample.size(); iev++) fEventSample[iev]->SetBoostWeight(1.);
   } else {
      UInt_t nevents = Data()->GetNTrainingEvents();

      std::vector<const TMVA::Event*> tmpEventSample;
      for (Long64_t ievt=0; ievt<nevents; ievt++) {
         Event* event = new Event( *GetTrainingEvent(ievt) );
         tmpEventSample.push_back(event);
      }

      if (!DoRegression()) DeterminePreselectionCuts(tmpEventSample);
      else fDoPreselection = kFALSE; // just to make sure...

      for (UInt_t i=0; i<tmpEventSample.size(); i++) delete tmpEventSample[i];

      for (Long64_t ievt=0; ievt<nevents; ievt++) {
         Event* event = new Event( *GetTrainingEvent(ievt) );
         if (fDoPreselection){
            if (TMath::Abs(ApplyPreselectionCuts(event)) > 0.05) {
               delete event;
               continue;
            }
         }

         if (event->GetWeight() < 0 && (IgnoreEventsWithNegWeightsInTraining() || fNoNegWeightsInTraining)){
            if (firstNegWeight) {
               Log() << kWARNING << " Note, you have events with negative event weight in the sample, but you've chosen to ignore them" << Endl;
               firstNegWeight = kFALSE;
            }
            delete event;
         }
         else if (event->GetWeight()==0){
            if (firstZeroWeight) {
               firstZeroWeight = kFALSE;
               Log() << "Events with weight == 0 are going to be simply ignored " << Endl;
            }
            delete event;
         }
         else {
            if (event->GetWeight() < 0) {
               fTrainWithNegWeights = kTRUE;
               if (firstNegWeight){
                  firstNegWeight = kFALSE;
                  if (fPairNegWeightsGlobal){
                     Log() << kWARNING << "Events with negative event weights are found and "
                           << " will be removed prior to the actual BDT training by global "
                           << " pairing (and subsequent annihilation) with positive weight events"
                           << Endl;
                  }
                  else {
                     Log() << kWARNING << "Events with negative event weights are USED during "
                           << "the BDT training. This might cause problems with small node sizes "
                           << "or with the boosting. Please remove negative events from training "
                           << "using the option *IgnoreEventsWithNegWeightsInTraining* in case you "
                           << "observe problems with the boosting"
                           << Endl;
                  }
               }
            }
            // if fAutomatic is set you need a validation sample to optimize pruning
            if (fAutomatic) {
               Double_t modulo  = 1.0/(fFValidationEvents);
               Int_t    imodulo = static_cast<Int_t>( fmod(modulo,1.0) > 0.5 ? ceil(modulo) : floor(modulo) );
               if (ievt % imodulo == 0) fValidationSample.push_back( event );
               else                     fEventSample.push_back( event );
            }
            else {
               fEventSample.push_back(event);
            }
         }
      }
      if (fAutomatic) {
         Log() << kINFO << "<InitEventSample> Internally I use " << fEventSample.size()
               << " for Training  and " << fValidationSample.size()
               << " for Pruning Validation (" << ((Float_t)fValidationSample.size())/((Float_t)fEventSample.size()+fValidationSample.size())*100.0
               << "% of training used for validation)" << Endl;
      }

      // some pre-processing for events with negative weights
      if (fPairNegWeightsGlobal) PreProcessNegativeEventWeights();
   }

   if (!DoRegression() && !fSkipNormalization){
      Log() << kDEBUG << "\t<InitEventSample> For classification trees, " << Endl;
      Log() << kDEBUG << " \tthe effective number of backgrounds is scaled to match " << Endl;
      Log() << kDEBUG << " \tthe signal. Otherwise the first boosting step would do 'just that'!" << Endl;
      Double_t nevents = fEventSample.size();
      Double_t sumSigW=0, sumBkgW=0;
      Int_t    sumSig=0, sumBkg=0;
      for (UInt_t ievt=0; ievt<fEventSample.size(); ievt++) {
         if ((DataInfo().IsSignal(fEventSample[ievt])) ) {
            sumSigW += fEventSample[ievt]->GetWeight();
            sumSig++;
         } else {
            sumBkgW += fEventSample[ievt]->GetWeight();
            sumBkg++;
         }
      }
      if (sumSigW && sumBkgW){
         Double_t normSig = nevents/((1+fSigToBkgFraction)*sumSigW)*fSigToBkgFraction;
         Double_t normBkg = nevents/((1+fSigToBkgFraction)*sumBkgW);
         Log() << kDEBUG << "\tre-normalise events such that Sig and Bkg have respective sum of weights = "
               << fSigToBkgFraction << Endl;
         Log() << kDEBUG << "  \tsig->sig*" << normSig << "ev. bkg->bkg*" << normBkg << "ev." << Endl;
         Log() << kHEADER << "#events: (reweighted) sig: " << sumSigW*normSig << " bkg: " << sumBkgW*normBkg << Endl;
         Log() << kINFO << "#events: (unweighted) sig: " << sumSig << " bkg: " << sumBkg << Endl;
         for (Long64_t ievt=0; ievt<nevents; ievt++) {
            if ((DataInfo().IsSignal(fEventSample[ievt])) ) fEventSample[ievt]->SetBoostWeight(normSig);
            else                                            fEventSample[ievt]->SetBoostWeight(normBkg);
         }
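         // The two scale factors just applied satisfy
         //    normSig*sumSigW : normBkg*sumBkgW = fSigToBkgFraction : 1
         // and normSig*sumSigW + normBkg*sumBkgW = nevents,
         // i.e. the re-weighted sample keeps its total size while the
         // signal-to-background weight ratio becomes exactly fSigToBkgFraction.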
      } else {
         Log() << kINFO << "--> could not determine scaling factors as either there are " << Endl;
         Log() << kINFO << " no signal events (sumSigW=" << sumSigW << ") or no bkg ev. (sumBkgW=" << sumBkgW << ")" << Endl;
      }
   }

   fTrainSample = &fEventSample;
   if (fBaggedBoost){
      GetBaggedSubSample(fEventSample);
      fTrainSample = &fSubSample;
   }
   Double_t totalNegWeights = 0;
   Double_t totalPosWeights = 0;
   Double_t totalWeights    = 0;
   std::vector<const Event*> negEvents;
   for (UInt_t iev = 0; iev < fEventSample.size(); iev++){
      if (fEventSample[iev]->GetWeight() < 0) {
         totalNegWeights += fEventSample[iev]->GetWeight();
         negEvents.push_back(fEventSample[iev]);
      } else {
         totalPosWeights += fEventSample[iev]->GetWeight();
      }
      totalWeights += fEventSample[iev]->GetWeight();
   }
   if (totalNegWeights == 0 ) {
      Log() << kINFO << "no negative event weights found .. no preprocessing necessary" << Endl;
      return;
   } else {
      Log() << kINFO << "found a total of " << totalNegWeights << " of negative event weights which I am going to try to pair with positive events to annihilate them" << Endl;
      Log() << kINFO << "found a total of " << totalPosWeights << " of events with positive weights" << Endl;
      Log() << kINFO << "--> total sum of weights = " << totalWeights << " = " << totalNegWeights+totalPosWeights << Endl;
   }
   std::vector<TMatrixDSym*>* cov = gTools().CalcCovarianceMatrices( fEventSample, 2);
   TMatrixDSym *invCov;

   for (Int_t i=0; i<2; i++){
      invCov = ((*cov)[i]);
      if ( TMath::Abs(invCov->Determinant()) < 10E-24 ) {
         std::cout << "<MethodBDT::PreProcessNeg...> matrix is almost singular with determinant="
                   << TMath::Abs(invCov->Determinant())
                   << " did you use the variables that are linear combinations or highly correlated?"
                   << std::endl;
      }
      if ( TMath::Abs(invCov->Determinant()) < 10E-120 ) {
         std::cout << "<MethodBDT::PreProcessNeg...> matrix is singular with determinant="
                   << TMath::Abs(invCov->Determinant())
                   << " did you use the variables that are linear combinations?"
                   << std::endl;
      }
      invCov->Invert();
   }

   Log() << kINFO << "Found a total of " << totalNegWeights << " in negative weights out of " << fEventSample.size() << " training events " << Endl;
   Timer timer(negEvents.size(),"Negative Event paired");
   for (UInt_t nev = 0; nev < negEvents.size(); nev++){
      timer.DrawProgressBar( nev );
      Double_t weight   = negEvents[nev]->GetWeight();
      UInt_t   iClassID = negEvents[nev]->GetClass();
      invCov = ((*cov)[iClassID]);
      Double_t minDist = DBL_MAX;
      Int_t    iMin    = -1;

      // find the closest positive-weight event of the same class
      for (UInt_t iev = 0; iev < fEventSample.size(); iev++){
         if (iClassID==fEventSample[iev]->GetClass() && fEventSample[iev]->GetWeight() > 0){
            Double_t dist=0;
            for (UInt_t ivar=0; ivar < GetNvar(); ivar++){
               for (UInt_t jvar=0; jvar<GetNvar(); jvar++){
                  dist += (negEvents[nev]->GetValue(ivar)-fEventSample[iev]->GetValue(ivar))*
                     (*invCov)[ivar][jvar]*
                     (negEvents[nev]->GetValue(jvar)-fEventSample[iev]->GetValue(jvar));
               }
            }
            if (dist < minDist) { iMin=iev; minDist=dist;}
         }
      }
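      // dist is the squared Mahalanobis distance between the negative-weight event
      // and the positive-weight candidate, d^2 = (x1-x2)^T C^-1 (x1-x2), computed
      // with the inverted per-class covariance matrix; the nearest such candidate
      // (iMin) is the one paired with (and partly annihilated against) the event.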
      if (iMin > -1) {
         Double_t newWeight = (negEvents[nev]->GetWeight() + fEventSample[iMin]->GetWeight());
         if (newWeight > 0){
            negEvents[nev]->SetBoostWeight( 0 );
            fEventSample[iMin]->SetBoostWeight( newWeight/fEventSample[iMin]->GetOriginalWeight() ); // note: weight*boostweight should be "newWeight"
         } else {
            negEvents[nev]->SetBoostWeight( newWeight/negEvents[nev]->GetOriginalWeight() ); // note: weight*boostweight should be "newWeight"
            fEventSample[iMin]->SetBoostWeight( 0 );
         }
      }
      else Log() << kFATAL << "preprocessing didn't find event to pair with the negative weight ... probably a bug" << Endl;
      weight = negEvents[nev]->GetWeight();
   }
   Log() << kINFO << "<Negative Event Pairing> took: " << timer.GetElapsedTime() << Endl;
   // sanity check: now there should be no negative event weights left anymore
   totalNegWeights = 0;
   totalPosWeights = 0;
   totalWeights    = 0;
   Double_t sigWeight=0;
   Double_t bkgWeight=0;
   Int_t    nSig=0;
   Int_t    nBkg=0;

   std::vector<const Event*> newEventSample;

   for (UInt_t iev = 0; iev < fEventSample.size(); iev++){
      if (fEventSample[iev]->GetWeight() < 0) {
         totalNegWeights += fEventSample[iev]->GetWeight();
         totalWeights    += fEventSample[iev]->GetWeight();
      } else {
         totalPosWeights += fEventSample[iev]->GetWeight();
         totalWeights    += fEventSample[iev]->GetWeight();
      }
      if (fEventSample[iev]->GetWeight() > 0) {
         newEventSample.push_back(new Event(*fEventSample[iev]));
         if (fEventSample[iev]->GetClass() == fSignalClass){
            sigWeight += fEventSample[iev]->GetWeight();
            nSig++;
         } else {
            bkgWeight += fEventSample[iev]->GetWeight();
            nBkg++;
         }
      }
   }
   if (totalNegWeights < 0) Log() << kFATAL << " compensation of negative event weights with positive ones did not work " << totalNegWeights << Endl;

   for (UInt_t i=0; i<fEventSample.size(); i++) delete fEventSample[i];
   fEventSample = newEventSample;

   Log() << kINFO << " after PreProcessing, the Event sample is left with " << fEventSample.size() << " events (unweighted), all with positive weights, adding up to " << totalWeights << Endl;
   Log() << kINFO << " nSig=" << nSig << " sigWeight=" << sigWeight << " nBkg=" << nBkg << " bkgWeight=" << bkgWeight << Endl;
   std::map<TString,TMVA::Interval*> tuneParameters;
   std::map<TString,Double_t>        tunedParameters;

   tuneParameters.insert(std::pair<TString,Interval*>("NTrees",      new Interval(10,1000,5)));
   tuneParameters.insert(std::pair<TString,Interval*>("MaxDepth",    new Interval(2,4,3)));
   tuneParameters.insert(std::pair<TString,Interval*>("MinNodeSize", new LogInterval(1,30,30)));

   if        (fBoostType=="AdaBoost"){
      tuneParameters.insert(std::pair<TString,Interval*>("AdaBoostBeta", new Interval(.2,1.,5)));
   } else if (fBoostType=="Grad"){
      tuneParameters.insert(std::pair<TString,Interval*>("Shrinkage",    new Interval(0.05,0.50,5)));
   } else if (fBoostType=="Bagging" && fRandomisedTrees){
      Int_t min_var = TMath::FloorNint( GetNvar() * .25 );
      Int_t max_var = TMath::CeilNint(  GetNvar() * .75 );
      tuneParameters.insert(std::pair<TString,Interval*>("UseNvars",     new Interval(min_var,max_var,4)));
   }

   Log() << kINFO << " the following BDT parameters will be tuned on the respective *grid*\n" << Endl;
   std::map<TString,TMVA::Interval*>::iterator it;
   for(it=tuneParameters.begin(); it!= tuneParameters.end(); it++){
      Log() << kWARNING << it->first << Endl;
      std::ostringstream oss;
      (it->second)->Print(oss);
   }

   OptimizeConfigParameters optimize(this, tuneParameters, fomType, fitType);
   tunedParameters = optimize.optimize();

   return tunedParameters;
   std::map<TString,Double_t>::iterator it;
   for(it=tuneParameters.begin(); it!= tuneParameters.end(); it++){
      Log() << kWARNING << it->first << " = " << it->second << Endl;
      if      (it->first == "MaxDepth"       ) SetMaxDepth        ((Int_t)it->second);
      else if (it->first == "MinNodeSize"    ) SetMinNodeSize     (it->second);
      else if (it->first == "NTrees"         ) SetNTrees          ((Int_t)it->second);
      else if (it->first == "NodePurityLimit") SetNodePurityLimit (it->second);
      else if (it->first == "AdaBoostBeta"   ) SetAdaBoostBeta    (it->second);
      else if (it->first == "Shrinkage"      ) SetShrinkage       (it->second);
      else if (it->first == "UseNvars"       ) SetUseNvars        ((Int_t)it->second);
      else if (it->first == "BaggedSampleFraction" ) SetBaggedSampleFraction (it->second);
      else Log() << kFATAL << " SetParameter for " << it->first << " not yet implemented " << Endl;
   }
   if (fNTrees==0){
      Log() << kERROR << " Zero Decision Trees demanded... that does not work !! "
            << " I set it to 1 .. just so that the program does not crash"
            << Endl;
      fNTrees = 1;
   }

   if (fInteractive && fInteractive->NotInitialized()){
      std::vector<TString> titles = {"Boost weight", "Error Fraction"};
      fInteractive->Init(titles);
   }
   fIPyMaxIter = fNTrees;
   fExitFromTraining = false;

   if (IsNormalised()) Log() << kFATAL << "\"Normalise\" option cannot be used with BDT; "
                             << "please remove the option from the configuration string, or "
                             << "use \"!Normalise\""
                             << Endl;

   if (DoRegression()) Log() << kINFO << "Regression Loss Function: " << fRegressionLossFunctionBDTG->Name() << Endl;

   Log() << kINFO << "Training " << fNTrees << " Decision Trees ... patience please" << Endl;
   Log() << kDEBUG << "Training with maximal depth = " << fMaxDepth
         << ", MinNodeEvents=" << fMinNodeEvents
         << ", NTrees=" << fNTrees
         << ", NodePurityLimit=" << fNodePurityLimit
         << ", AdaBoostBeta=" << fAdaBoostBeta
         << Endl;

   TString hname = "AdaBoost weight distribution";

   if (DoRegression()) {
      hname = "Boost event weights distribution";
   }

   TH1* nodesBeforePruningVsTree = new TH1I(Form("%s_NodesBeforePruning",DataInfo().GetName()),"nodes before pruning",fNTrees,0,fNTrees);
   TH1* nodesAfterPruningVsTree  = new TH1I(Form("%s_NodesAfterPruning", DataInfo().GetName()),"nodes after pruning", fNTrees,0,fNTrees);
   if(!DoMulticlass()){
      Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, GetAnalysisType());

      results->Store(h, "BoostWeights");

      // monitor the performance (on the TEST sample) versus the number of trees
      if (fDoBoostMonitor){
         TH2* boostMonitor = new TH2F("BoostMonitor","ROC Integral Vs iTree",2,0,fNTrees,2,0,1.05);
         boostMonitor->SetXTitle("#tree");
         boostMonitor->SetYTitle("ROC Integral");
         results->Store(boostMonitor, "BoostMonitor");
         TGraph *boostMonitorGraph = new TGraph();
         boostMonitorGraph->SetName("BoostMonitorGraph");
         boostMonitorGraph->SetTitle("ROCIntegralVsNTrees");
         results->Store(boostMonitorGraph, "BoostMonitorGraph");
      }

      // weights applied in boosting vs tree number
      h = new TH1F("BoostWeightVsTree","Boost weights vs tree",fNTrees,0,fNTrees);
      results->Store(h, "BoostWeightsVsTree");

      // error fraction vs tree number
      h = new TH1F("ErrFractHist","error fraction vs tree number",fNTrees,0,fNTrees);
      results->Store(h, "ErrorFrac");

      // number of nodes before pruning, vs tree number
      nodesBeforePruningVsTree->SetXTitle("#tree");
      nodesBeforePruningVsTree->SetYTitle("#tree nodes");
      results->Store(nodesBeforePruningVsTree);

      // number of nodes after pruning, vs tree number
      nodesAfterPruningVsTree->SetXTitle("#tree");
      nodesAfterPruningVsTree->SetYTitle("#tree nodes");
      results->Store(nodesAfterPruningVsTree);
   }

   fMonitorNtuple = new TTree("MonitorNtuple","BDT variables");
   fMonitorNtuple->Branch("iTree",&fITree,"iTree/I");
   fMonitorNtuple->Branch("boostWeight",&fBoostWeight,"boostWeight/D");
   fMonitorNtuple->Branch("errorFraction",&fErrorFraction,"errorFraction/D");
   Int_t nNodesBeforePruningCount = 0;
   Int_t nNodesAfterPruningCount  = 0;

   Int_t nNodesBeforePruning = 0;
   Int_t nNodesAfterPruning  = 0;

   if(fBoostType=="Grad"){
      InitGradBoost(fEventSample);
   }

   Int_t  itree = 0;
   Bool_t continueBoost = kTRUE;
   while (itree < fNTrees && continueBoost){
      if (fExitFromTraining) break;
      fIPyCurrentIter = itree;

      if (DoMulticlass()) {
         if (fBoostType!="Grad"){
            Log() << kFATAL << "Multiclass is currently only supported by gradient boost. "
                  << "Please change boost option accordingly (GradBoost)."
                  << Endl;
         }

         UInt_t nClasses = DataInfo().GetNClasses();
         for (UInt_t i=0;i<nClasses;i++){
            fForest.push_back( new DecisionTree( fSepType, fMinNodeSize, fNCuts, &(DataInfo()), i,
                                                 fRandomisedTrees, fUseNvars, fUsePoissonNvars, fMaxDepth,
                                                 itree*nClasses+i, fNodePurityLimit, itree*nClasses+1));
            fForest.back()->SetNVars(GetNvar());
            if (fUseFisherCuts) {
               fForest.back()->SetUseFisherCuts();
               fForest.back()->SetMinLinCorrForFisher(fMinLinCorrForFisher);
               fForest.back()->SetUseExclusiveVars(fUseExclusiveVars);
            }

            nNodesBeforePruning = fForest.back()->BuildTree(*fTrainSample);
            Double_t bw = this->Boost(*fTrainSample, fForest.back(),i);
            if (bw > 0) {
               fBoostWeights.push_back(bw);
            } else {
               fBoostWeights.push_back(0);
               Log() << kWARNING << "stopped boosting at itree=" << itree << Endl;
               continueBoost = kFALSE;
            }
         }
      }
      else {
         fForest.push_back( new DecisionTree( fSepType, fMinNodeSize, fNCuts, &(DataInfo()), fSignalClass,
                                              fRandomisedTrees, fUseNvars, fUsePoissonNvars, fMaxDepth,
                                              itree, fNodePurityLimit, itree));
         fForest.back()->SetNVars(GetNvar());
         if (fUseFisherCuts) {
            fForest.back()->SetUseFisherCuts();
            fForest.back()->SetMinLinCorrForFisher(fMinLinCorrForFisher);
            fForest.back()->SetUseExclusiveVars(fUseExclusiveVars);
         }

         nNodesBeforePruning = fForest.back()->BuildTree(*fTrainSample);

         if (fUseYesNoLeaf && !DoRegression() && fBoostType!="Grad") { // remove leaf nodes where both daughter nodes are of same type
            nNodesBeforePruning = fForest.back()->CleanTree();
         }

         nNodesBeforePruningCount += nNodesBeforePruning;
         nodesBeforePruningVsTree->SetBinContent(itree+1,nNodesBeforePruning);

         fForest.back()->SetPruneMethod(fPruneMethod);     // set the pruning method for the tree
         fForest.back()->SetPruneStrength(fPruneStrength); // set the strength parameter

         std::vector<const Event*> * validationSample = NULL;
         if(fAutomatic) validationSample = &fValidationSample;

         Double_t bw = this->Boost(*fTrainSample, fForest.back());
         if (bw > 0) {
            fBoostWeights.push_back(bw);
         } else {
            fBoostWeights.push_back(0);
            Log() << kWARNING << "stopped boosting at itree=" << itree << Endl;
            continueBoost = kFALSE;
         }

         if (fUseYesNoLeaf && !DoRegression() && fBoostType!="Grad"){ // remove leaf nodes where both daughter nodes are of same type
            fForest.back()->CleanTree();
         }

         nNodesAfterPruning = fForest.back()->GetNNodes();
         nNodesAfterPruningCount += nNodesAfterPruning;
         nodesAfterPruningVsTree->SetBinContent(itree+1,nNodesAfterPruning);

         if (fInteractive){
            fInteractive->AddPoint(itree, fBoostWeight, fErrorFraction);
         }
         fITree = itree;
         fMonitorNtuple->Fill();
         if (fDoBoostMonitor){
            if (! DoRegression() ){
               if (  itree==fNTrees-1 ||  (!(itree%500)) ||
                     (!(itree%250) && itree <1000)||
                     (!(itree%100) && itree < 500)||
                     (!(itree%50)  && itree < 250)||
                     (!(itree%25)  && itree < 150)||
                     (!(itree%10)  && itree <  50)||
                     (!(itree%5)   && itree <  20)
                     ) BoostMonitor(itree);
            }
         }
      }
      itree++;
   }
   if (fPruneMethod == DecisionTree::kNoPruning) {
      Log() << kDEBUG << "\t<Train> average number of nodes (w/o pruning) : "
            << nNodesBeforePruningCount/GetNTrees() << Endl;
   }
   else {
      Log() << kDEBUG << "\t<Train> average number of nodes before/after pruning : "
            << nNodesBeforePruningCount/GetNTrees() << " / "
            << nNodesAfterPruningCount/GetNTrees()
            << Endl;
   }

   Log() << kDEBUG << "Now I delete the private data sample" << Endl;
   for (UInt_t i=0; i<fEventSample.size();      i++) delete fEventSample[i];
   for (UInt_t i=0; i<fValidationSample.size(); i++) delete fValidationSample[i];
   fEventSample.clear();
   fValidationSample.clear();

   if (!fExitFromTraining) fIPyMaxIter = fIPyCurrentIter;
   Double_t sum = 0;
   for (UInt_t itree=0; itree<nTrees; itree++) {
      // loop over all trees in the forest
      sum += fForest[itree]->CheckEvent(e,kFALSE);
   }
   return 2.0/(1.0+exp(-2.0*sum))-1; // MVA output in (-1,1)
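// Note: 2/(1+exp(-2*sum)) - 1 == tanh(sum), so the raw gradient-boost forest
// output F(x) (the plain sum of leaf responses) is squashed into (-1,1) before
// being returned as the MVA value.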
   if (DoMulticlass()) {
      UInt_t nClasses = DataInfo().GetNClasses();
      std::vector<Double_t> expCache;
      if (cls == nClasses - 1) {
         expCache.resize(nClasses);
      }
      for (auto e : eventSample) {
         fResiduals[e].at(cls) += fForest.back()->CheckEvent(e, kFALSE);
         if (cls == nClasses - 1) {
            auto &residualsThisEvent = fResiduals[e];
            std::transform(residualsThisEvent.begin(),
                           residualsThisEvent.begin() + nClasses,
                           expCache.begin(), [](Double_t d) { return exp(d); });
            for (UInt_t i = 0; i < nClasses; i++) {
               Double_t norm = 0.0;
               for (UInt_t j = 0; j < nClasses; j++) {
                  if (i != j) norm += expCache[j] / expCache[i];
               }
               Double_t p_cls = 1.0 / (1.0 + norm);
               Double_t res = (e->GetClass() == i) ? (1.0 - p_cls) : (-p_cls);
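               // Softmax: norm accumulates sum_{j != i} exp(F_j)/exp(F_i), hence
               // p_cls = 1/(1+norm) = exp(F_i)/sum_j exp(F_j). The new target
               // (residual) is the negative gradient of the multiclass
               // cross-entropy loss, I(y == i) - p_i.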
               const_cast<TMVA::Event *>(e)->SetTarget(i, res);
            }
         }
      }
   } else {
      for (auto e : eventSample) {
         auto &residualAt0 = fResiduals[e].at(0);
         residualAt0 += fForest.back()->CheckEvent(e, kFALSE);
         Double_t p_sig = 1.0 / (1.0 + exp(-2.0 * residualAt0));
         Double_t res = (DataInfo().IsSignal(e) ? 1 : 0) - p_sig;
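         // Binary case: F = residualAt0 is the running forest output; the signal
         // probability under the logistic model is p_sig = 1/(1+exp(-2F)), and the
         // residual y - p_sig (y in {0,1}) is the negative gradient of the binomial
         // log-likelihood, used as the regression target for the next tree.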
         const_cast<TMVA::Event *>(e)->SetTarget(0, res);
      }
   }

   for (std::vector<const TMVA::Event*>::const_iterator e=fEventSample.begin(); e!=fEventSample.end(); e++) {
      fLossFunctionEventInfo[*e].predictedValue += fForest.back()->CheckEvent(*e,kFALSE);
   }

   fRegressionLossFunctionBDTG->SetTargets(eventSample, fLossFunctionEventInfo);
   std::unordered_map<TMVA::DecisionTreeNode*, LeafInfo> leaves;
   for (auto e : eventSample) {
      Double_t weight = e->GetWeight();
      TMVA::DecisionTreeNode* node = dt->GetEventNode(*e);
      auto &v = leaves[node];
      auto target = e->GetTarget(cls);
      v.sumWeightTarget += target * weight;
      v.sum2 += fabs(target) * (1.0 - fabs(target)) * weight * weight;
   }
   for (auto &iLeave : leaves) {
      constexpr auto minValue = 1e-30;
      if (iLeave.second.sum2 < minValue) {
         iLeave.second.sum2 = minValue;
      }
      iLeave.first->SetResponse(fShrinkage/DataInfo().GetNClasses() * iLeave.second.sumWeightTarget/iLeave.second.sum2);
   }
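   // Leaf response: an approximate Newton step for the multinomial log-likelihood,
   // sum(w*target) / sum(|t|(1-|t|)w^2) per leaf, scaled by the learning rate
   // fShrinkage and by 1/nClasses; sum2 is floored at 1e-30 above so that empty or
   // saturated leaves cannot cause a division by zero.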
   DoMulticlass() ? UpdateTargets(fEventSample, cls) : UpdateTargets(fEventSample);
   // collect the regression events per leaf and adjust the leaf response with the loss function
   std::map<TMVA::DecisionTreeNode*,vector< TMVA::LossFunctionEventInfo > > leaves;
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      TMVA::DecisionTreeNode* node = dt->GetEventNode(*(*e));
      (leaves[node]).push_back(fLossFunctionEventInfo[*e]);
   }

   for (std::map<TMVA::DecisionTreeNode*,vector< TMVA::LossFunctionEventInfo > >::iterator iLeave=leaves.begin();
        iLeave!=leaves.end(); ++iLeave){
      Double_t fit = fRegressionLossFunctionBDTG->Fit(iLeave->second);
      (iLeave->first)->SetResponse(fShrinkage*fit);
   }

   UpdateTargetsRegression(*fTrainSample);
   if (DoRegression()) {
      for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
         fLossFunctionEventInfo[*e] = TMVA::LossFunctionEventInfo((*e)->GetTarget(0), 0, (*e)->GetWeight());
      }
      fRegressionLossFunctionBDTG->Init(fLossFunctionEventInfo, fBoostWeights);
      UpdateTargetsRegression(*fTrainSample, kTRUE);
      return;
   }
   else if(DoMulticlass()){
      UInt_t nClasses = DataInfo().GetNClasses();
      for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
         for (UInt_t i=0;i<nClasses;i++){
            // calculate initial residua, assuming equal probability for all classes
            Double_t r = (*e)->GetClass()==i ? (1-1.0/nClasses) : (-1.0/nClasses);
            const_cast<TMVA::Event*>(*e)->SetTarget(i,r);
            fResiduals[*e].push_back(0);
         }
      }
   }
   else {
      for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
         Double_t r = (DataInfo().IsSignal(*e)?1:0)-0.5; // calculate initial residua
         const_cast<TMVA::Event*>(*e)->SetTarget(0,r);
         fResiduals[*e].push_back(0);
      }
   }
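   // Before the first tree the model output F is zero for every event, so the initial
   // targets are the residuals of a constant prediction: y_i - 1/nClasses per class in
   // the multiclass case, and (y in {0,1}) - 0.5 for two classes; fResiduals keeps the
   // running per-event F values, starting at 0.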
   Double_t ncorrect=0, nfalse=0;
   for (UInt_t ievt=0; ievt<fValidationSample.size(); ievt++) {
      Bool_t isSignalType = (dt->CheckEvent(fValidationSample[ievt]) > fNodePurityLimit ) ? 1 : 0;

      if (isSignalType == (DataInfo().IsSignal(fValidationSample[ievt])) ) {
         ncorrect += fValidationSample[ievt]->GetWeight();
      }
      else {
         nfalse += fValidationSample[ievt]->GetWeight();
      }
   }

   return ncorrect / (ncorrect + nfalse);
   Double_t returnVal=-1;

   if      (fBoostType=="AdaBoost")   returnVal = this->AdaBoost  (eventSample, dt);
   else if (fBoostType=="AdaCost")    returnVal = this->AdaCost   (eventSample, dt);
   else if (fBoostType=="Bagging")    returnVal = this->Bagging   ( );
   else if (fBoostType=="RegBoost")   returnVal = this->RegBoost  (eventSample, dt);
   else if (fBoostType=="AdaBoostR2") returnVal = this->AdaBoostR2(eventSample, dt);
   else if (fBoostType=="Grad"){
      if (DoRegression())
         returnVal = this->GradBoostRegression(eventSample, dt);
      else if(DoMulticlass())
         returnVal = this->GradBoost (eventSample, dt, cls);
      else
         returnVal = this->GradBoost (eventSample, dt);
   }
   else {
      Log() << kINFO << GetOptions() << Endl;
      Log() << kFATAL << "<Boost> unknown boost option " << fBoostType << " called" << Endl;
   }

   if (fBaggedBoost){
      GetBaggedSubSample(fEventSample);
   }
   Results* results = Data()->GetResults(GetMethodName(),Types::kTraining, Types::kMaxAnalysisType);

   TH1F *tmpS = new TH1F( "tmpS", "", 100 , -1., 1.00001 );
   TH1F *tmpB = new TH1F( "tmpB", "", 100 , -1., 1.00001 );

   UInt_t signalClassNr = DataInfo().GetClassInfo("Signal")->GetNumber();

   UInt_t nevents = Data()->GetNTestEvents();
   for (UInt_t iev=0; iev < nevents; iev++){
      const Event* event = GetTestingEvent(iev);

      TH1F* tmp;
      if (event->GetClass() == signalClassNr) {tmp=tmpS;}
      else                                    {tmp=tmpB;}
      tmp->Fill(PrivateGetMvaValue(event),event->GetWeight());
   }

   std::vector<TH1F*> hS;
   std::vector<TH1F*> hB;
   for (UInt_t ivar=0; ivar<GetNvar(); ivar++){
      hS.push_back(new TH1F(Form("SigVar%dAtTree%d",ivar,iTree),Form("SigVar%dAtTree%d",ivar,iTree),100,DataInfo().GetVariableInfo(ivar).GetMin(),DataInfo().GetVariableInfo(ivar).GetMax()));
      hB.push_back(new TH1F(Form("BkgVar%dAtTree%d",ivar,iTree),Form("BkgVar%dAtTree%d",ivar,iTree),100,DataInfo().GetVariableInfo(ivar).GetMin(),DataInfo().GetVariableInfo(ivar).GetMax()));
      results->Store(hS.back(),hS.back()->GetTitle());
      results->Store(hB.back(),hB.back()->GetTitle());
   }

   Double_t max=1;
   for (UInt_t iev=0; iev < fEventSample.size(); iev++){
      if (fEventSample[iev]->GetBoostWeight() > max) max = 1.01*fEventSample[iev]->GetBoostWeight();
   }
   TH1F *tmpBoostWeightsS = new TH1F(Form("BoostWeightsInTreeS%d",iTree),Form("BoostWeightsInTreeS%d",iTree),100,0.,max);
   TH1F *tmpBoostWeightsB = new TH1F(Form("BoostWeightsInTreeB%d",iTree),Form("BoostWeightsInTreeB%d",iTree),100,0.,max);
   results->Store(tmpBoostWeightsS,tmpBoostWeightsS->GetTitle());
   results->Store(tmpBoostWeightsB,tmpBoostWeightsB->GetTitle());

   TH1F *tmpBoostWeights;
   std::vector<TH1F*> *h;

   for (UInt_t iev=0; iev < fEventSample.size(); iev++){
      if (fEventSample[iev]->GetClass() == signalClassNr) {
         tmpBoostWeights=tmpBoostWeightsS;
         h=&hS;
      }else{
         tmpBoostWeights=tmpBoostWeightsB;
         h=&hB;
      }
      tmpBoostWeights->Fill(fEventSample[iev]->GetBoostWeight());
      for (UInt_t ivar=0; ivar<GetNvar(); ivar++){
         (*h)[ivar]->Fill(fEventSample[iev]->GetValue(ivar),fEventSample[iev]->GetWeight());
      }
   }
   Double_t err=0, sumGlobalw=0, sumGlobalwfalse=0, sumGlobalwfalse2=0;

   std::vector<Double_t> sumw(DataInfo().GetNClasses(),0); // sum of weights per class

   Double_t maxDev=0;
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      Double_t w = (*e)->GetWeight();
      sumGlobalw += w;
      UInt_t iclass=(*e)->GetClass();
      sumw[iclass] += w;

      if ( DoRegression() ) {
         Double_t tmpDev = TMath::Abs(dt->CheckEvent(*e,kFALSE) - (*e)->GetTarget(0));
         sumGlobalwfalse  += w * tmpDev;
         sumGlobalwfalse2 += w * tmpDev*tmpDev;
         if (tmpDev > maxDev) maxDev = tmpDev;
      }else{
         if (fUseYesNoLeaf){
            Bool_t isSignalType = (dt->CheckEvent(*e,fUseYesNoLeaf) > fNodePurityLimit );
            if (!(isSignalType == DataInfo().IsSignal(*e))) {
               sumGlobalwfalse += w;
            }
         }else{
            Double_t dtoutput = (dt->CheckEvent(*e,fUseYesNoLeaf) - 0.5)*2.;
            Int_t    trueType;
            if (DataInfo().IsSignal(*e)) trueType = 1;
            else trueType = -1;
            sumGlobalwfalse += w*trueType*dtoutput;
         }
      }
   }

   err = sumGlobalwfalse/sumGlobalw;
   if ( DoRegression() ) {
      if (fAdaBoostR2Loss=="linear"){
         err = sumGlobalwfalse/maxDev/sumGlobalw;
      }
      else if (fAdaBoostR2Loss=="quadratic"){
         err = sumGlobalwfalse2/maxDev/maxDev/sumGlobalw;
      }
      else if (fAdaBoostR2Loss=="exponential"){
         err = 0;
         for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
            Double_t w = (*e)->GetWeight();
            Double_t tmpDev = TMath::Abs(dt->CheckEvent(*e,kFALSE) - (*e)->GetTarget(0));
            err += w * (1 - exp(-tmpDev/maxDev)) / sumGlobalw;
         }
      }
      else {
         Log() << kFATAL << " you've chosen a Loss type for AdaBoost other than linear, quadratic or exponential "
               << " namely " << fAdaBoostR2Loss << "\n"
               << "and this is not implemented... a typo in the options ??" << Endl;
      }
   }

   Log() << kDEBUG << "BDT AdaBoost wrong/all: " << sumGlobalwfalse << "/" << sumGlobalw << Endl;
   Double_t newSumGlobalw=0;
   std::vector<Double_t> newSumw(sumw.size(),0);

   Double_t boostWeight=1.;
   if (err >= 0.5 && fUseYesNoLeaf) { // sanity check: should never happen
      if (dt->GetNNodes() == 1){
         Log() << kERROR << " YOUR tree has only 1 Node... kind of a funny *tree*. I cannot "
               << "boost such a thing... if after 1 step the error rate is == 0.5"
               << Endl
               << "please check why this happens, maybe too many events per node requested ?"
               << Endl;
      }else{
         Log() << kERROR << " The error rate in the BDT boosting is > 0.5. (" << err
               << ") That should not happen, please check your code (i.e... the BDT code), I "
               << " stop boosting here" << Endl;
         return -1;
      }
      err = 0.5;
   } else if (err < 0) {
      Log() << kERROR << " The error rate in the BDT boosting is < 0. That can happen"
            << " due to improper treatment of negative weights in a Monte Carlo.. (if you have"
            << " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
            << " for the time being I set it to its absolute value.. just to continue.." << Endl;
      err = TMath::Abs(err);
   }
   if (fUseYesNoLeaf)
      boostWeight = TMath::Log((1.-err)/err)*fAdaBoostBeta;
   else
      boostWeight = TMath::Log((1.+err)/(1-err))*fAdaBoostBeta;

   Log() << kDEBUG << "BDT AdaBoost wrong/all: " << sumGlobalwfalse << "/" << sumGlobalw << " 1-err/err=" << boostWeight << " log.." << TMath::Log(boostWeight) << Endl;
   Results* results = Data()->GetResults(GetMethodName(),Types::kTraining, Types::kMaxAnalysisType);

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      if (fUseYesNoLeaf||DoRegression()){
         if ((!( (dt->CheckEvent(*e,fUseYesNoLeaf) > fNodePurityLimit ) == DataInfo().IsSignal(*e))) || DoRegression()) {
            Double_t boostfactor = TMath::Exp(boostWeight);
            if (DoRegression()) boostfactor = TMath::Power(1/boostWeight,(1.-TMath::Abs(dt->CheckEvent(*e,kFALSE) - (*e)->GetTarget(0) )/maxDev ) );

            if ( (*e)->GetWeight() > 0 ){
               (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
               if (DoRegression()) results->GetHist("BoostWeights")->Fill(boostfactor);
            } else {
               if ( fInverseBoostNegWeights ) (*e)->ScaleBoostWeight( 1. / boostfactor); // shrink the absolute value of the (negative) weight instead
               else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
            }
         }
      }else{
         Double_t dtoutput = (dt->CheckEvent(*e,fUseYesNoLeaf) - 0.5)*2.;
         Int_t    trueType;
         if (DataInfo().IsSignal(*e)) trueType = 1;
         else trueType = -1;
         Double_t boostfactor = TMath::Exp(-1*boostWeight*trueType*dtoutput);

         if ( (*e)->GetWeight() > 0 ){
            (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
            if (DoRegression()) results->GetHist("BoostWeights")->Fill(boostfactor);
         } else {
            if ( fInverseBoostNegWeights ) (*e)->ScaleBoostWeight( 1. / boostfactor);
            else (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
         }
      }
      newSumGlobalw += (*e)->GetWeight();
      newSumw[(*e)->GetClass()] += (*e)->GetWeight();
   }

   Double_t globalNormWeight = Double_t(eventSample.size())/newSumGlobalw;
   Log() << kDEBUG << "new Nsig=" << newSumw[0]*globalNormWeight << " new Nbkg=" << newSumw[1]*globalNormWeight << Endl;
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      if (DataInfo().IsSignal(*e)) (*e)->ScaleBoostWeight( globalNormWeight * fSigToBkgFraction );
      else                         (*e)->ScaleBoostWeight( globalNormWeight );
   }

   if (!(DoRegression())) results->GetHist("BoostWeights")->Fill(boostWeight);

   fBoostWeight   = boostWeight;
   fErrorFraction = err;

   return boostWeight;
   Double_t Css    = fCss;
   Double_t Cbb    = fCbb;
   Double_t Cts_sb = fCts_sb;
   Double_t Ctb_ss = fCtb_ss;

   Double_t err=0, sumGlobalWeights=0, sumGlobalCost=0;

   std::vector<Double_t> sumw(DataInfo().GetNClasses(),0); // sum of weights per class

   for (vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      Double_t w = (*e)->GetWeight();
      sumGlobalWeights += w;
      UInt_t iclass=(*e)->GetClass();
      sumw[iclass] += w;

      if ( DoRegression() ) {
         Log() << kFATAL << " AdaCost not implemented for regression" << Endl;
      }
      else {
         Double_t dtoutput = (dt->CheckEvent(*e,kFALSE) - 0.5)*2.;
         Int_t    trueType;
         Bool_t   isTrueSignal     = DataInfo().IsSignal(*e);
         Bool_t   isSelectedSignal = (dtoutput>0);
         if (isTrueSignal) trueType = 1;
         else trueType = -1;

         Double_t cost=0;
         if      (isTrueSignal  && isSelectedSignal)  cost=Css;
         else if (isTrueSignal  && !isSelectedSignal) cost=Cts_sb;
         else if (!isTrueSignal && isSelectedSignal)  cost=Ctb_ss;
         else if (!isTrueSignal && !isSelectedSignal) cost=Cbb;
         else Log() << kERROR << "something went wrong in AdaCost" << Endl;

         sumGlobalCost += w*trueType*dtoutput*cost;
      }
   }

   if ( DoRegression() ) {
      Log() << kFATAL << " AdaCost not implemented for regression" << Endl;
   }

   sumGlobalCost /= sumGlobalWeights;
   Double_t newSumGlobalWeights=0;
   vector<Double_t> newSumClassWeights(sumw.size(),0);

   Double_t boostWeight = TMath::Log((1+sumGlobalCost)/(1-sumGlobalCost)) * fAdaBoostBeta;
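   // AdaCost: sumGlobalCost is the cost-weighted, normalised "margin"
   // sum(w * y * h(x) * C[y][h]) / sum(w), with the cost matrix entries
   // (Css, Cts_sb, Ctb_ss, Cbb) picked per true-class/selected-class pair above;
   // the tree weight alpha = fAdaBoostBeta * ln((1+cost)/(1-cost)) is the
   // cost-sensitive analogue of the AdaBoost tree weight.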
   Results* results = Data()->GetResults(GetMethodName(),Types::kTraining, Types::kMaxAnalysisType);

   for (vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      Double_t dtoutput = (dt->CheckEvent(*e,kFALSE) - 0.5)*2.;
      Int_t    trueType;
      Bool_t   isTrueSignal     = DataInfo().IsSignal(*e);
      Bool_t   isSelectedSignal = (dtoutput>0);
      if (isTrueSignal) trueType = 1;
      else trueType = -1;

      Double_t cost=0;
      if      (isTrueSignal  && isSelectedSignal)  cost=Css;
      else if (isTrueSignal  && !isSelectedSignal) cost=Cts_sb;
      else if (!isTrueSignal && isSelectedSignal)  cost=Ctb_ss;
      else if (!isTrueSignal && !isSelectedSignal) cost=Cbb;
      else Log() << kERROR << "something went wrong in AdaCost" << Endl;

      Double_t boostfactor = TMath::Exp(-1*boostWeight*trueType*dtoutput*cost);
      if (DoRegression()) Log() << kFATAL << " AdaCost not implemented for regression" << Endl;
      if ( (*e)->GetWeight() > 0 ){
         (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
         if (DoRegression()) Log() << kFATAL << " AdaCost not implemented for regression" << Endl;
      } else {
         if ( fInverseBoostNegWeights ) (*e)->ScaleBoostWeight( 1. / boostfactor);
      }

      newSumGlobalWeights += (*e)->GetWeight();
      newSumClassWeights[(*e)->GetClass()] += (*e)->GetWeight();
   }

   Double_t globalNormWeight = Double_t(eventSample.size())/newSumGlobalWeights;
   Log() << kDEBUG << "new Nsig=" << newSumClassWeights[0]*globalNormWeight << " new Nbkg=" << newSumClassWeights[1]*globalNormWeight << Endl;

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      if (DataInfo().IsSignal(*e)) (*e)->ScaleBoostWeight( globalNormWeight * fSigToBkgFraction );
      else                         (*e)->ScaleBoostWeight( globalNormWeight );
   }

   if (!(DoRegression())) results->GetHist("BoostWeights")->Fill(boostWeight);

   fBoostWeight   = boostWeight;
   fErrorFraction = err;

   return boostWeight;
   if (!fSubSample.empty()) fSubSample.clear();

   Double_t n;
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      n = trandom->PoissonD(fBaggedSampleFraction);
      for (Int_t i=0;i<n;i++) fSubSample.push_back(*e);
   }
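   // Bagging via Poisson resampling: each event enters the subsample n times with
   // n ~ Poisson(fBaggedSampleFraction), so the subsample contains on average
   // fBaggedSampleFraction * N events, and the per-event multiplicities fluctuate
   // independently from one boosting iteration to the next.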
   if ( !DoRegression() ) Log() << kFATAL << "Somehow you chose a regression boost method for a classification job" << Endl;

   Double_t err=0, sumw=0, sumwfalse=0, sumwfalse2=0;
   Double_t maxDev=0;
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      Double_t w = (*e)->GetWeight();
      sumw += w;

      Double_t tmpDev = TMath::Abs(dt->CheckEvent(*e,kFALSE) - (*e)->GetTarget(0));
      sumwfalse  += w * tmpDev;
      sumwfalse2 += w * tmpDev*tmpDev;
      if (tmpDev > maxDev) maxDev = tmpDev;
   }

   if (fAdaBoostR2Loss=="linear"){
      err = sumwfalse/maxDev/sumw;
   }
   else if (fAdaBoostR2Loss=="quadratic"){
      err = sumwfalse2/maxDev/maxDev/sumw;
   }
   else if (fAdaBoostR2Loss=="exponential"){
      err = 0;
      for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
         Double_t w = (*e)->GetWeight();
         Double_t tmpDev = TMath::Abs(dt->CheckEvent(*e,kFALSE) - (*e)->GetTarget(0));
         err += w * (1 - exp(-tmpDev/maxDev)) / sumw;
      }
   }
   else {
      Log() << kFATAL << " you've chosen a Loss type for AdaBoost other than linear, quadratic or exponential "
            << " namely " << fAdaBoostR2Loss << "\n"
            << "and this is not implemented... a typo in the options ??" << Endl;
   }
   if (err >= 0.5) { // sanity check
      if (dt->GetNNodes() == 1){
         Log() << kERROR << " YOUR tree has only 1 Node... kind of a funny *tree*. I cannot "
               << "boost such a thing... if after 1 step the error rate is == 0.5"
               << Endl
               << "please check why this happens, maybe too many events per node requested ?"
               << Endl;
      }else{
         Log() << kERROR << " The error rate in the BDT boosting is > 0.5. (" << err
               << ") That should not happen, but is possible for regression trees, and"
               << " should trigger a stop for the boosting. please check your code (i.e... the BDT code), I "
               << " stop boosting " << Endl;
         return -1;
      }
      err = 0.5;
   } else if (err < 0) {
      Log() << kERROR << " The error rate in the BDT boosting is < 0. That can happen"
            << " due to improper treatment of negative weights in a Monte Carlo.. (if you have"
            << " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
            << " for the time being I set it to its absolute value.. just to continue.." << Endl;
      err = TMath::Abs(err);
   }

   Double_t boostWeight = err / (1.-err);
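   // AdaBoostR2 (Drucker): with weighted average loss err, the tree weight is
   // beta = err/(1-err) < 1; below, each event is scaled by beta^(1-dev/maxDev),
   // so events the tree already predicts well (small deviation) lose weight while
   // badly predicted events keep theirs.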
   Double_t newSumw=0;
   Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, Types::kMaxAnalysisType);

   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      Double_t boostfactor = TMath::Power(boostWeight,(1.-TMath::Abs(dt->CheckEvent(*e,kFALSE) - (*e)->GetTarget(0) )/maxDev ) );
      results->GetHist("BoostWeights")->Fill(boostfactor);
      if ( (*e)->GetWeight() > 0 ){
         Float_t newBoostWeight = (*e)->GetBoostWeight() * boostfactor;
         Float_t newWeight = (*e)->GetWeight() * (*e)->GetBoostWeight() * boostfactor;
         if (newWeight == 0) {
            Log() << kINFO << "Weight=         " << (*e)->GetWeight() << Endl;
            Log() << kINFO << "BoostWeight=    " << (*e)->GetBoostWeight() << Endl;
            Log() << kINFO << "boostweight=" << boostWeight << "  err= " << err << Endl;
            Log() << kINFO << "NewBoostWeight= " << newBoostWeight << Endl;
            Log() << kINFO << "boostfactor=    " << boostfactor << Endl;
            Log() << kINFO << "maxDev=         " << maxDev << Endl;
            Log() << kINFO << "target=         " << (*e)->GetTarget(0) << Endl;
         }
         (*e)->SetBoostWeight( newBoostWeight );
      } else {
         (*e)->SetBoostWeight( (*e)->GetBoostWeight() / boostfactor);
      }
      newSumw += (*e)->GetWeight();
   }

   // re-normalise the weights
   Double_t normWeight = sumw / newSumw;
   for (std::vector<const TMVA::Event*>::const_iterator e=eventSample.begin(); e!=eventSample.end(); e++) {
      (*e)->SetBoostWeight( (*e)->GetBoostWeight() * normWeight );
   }

   fBoostWeight   = boostWeight;
   fErrorFraction = err;

   return TMath::Log(1./boostWeight);
   if (fDoPreselection){
      for (UInt_t ivar=0; ivar<GetNvar(); ivar++){
         gTools().AddAttr( wght, Form("PreselectionLowBkgVar%d",ivar),       fIsLowBkgCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionLowBkgVar%dValue",ivar),  fLowBkgCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionLowSigVar%d",ivar),       fIsLowSigCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionLowSigVar%dValue",ivar),  fLowSigCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionHighBkgVar%d",ivar),      fIsHighBkgCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionHighBkgVar%dValue",ivar), fHighBkgCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionHighSigVar%d",ivar),      fIsHighSigCut[ivar]);
         gTools().AddAttr( wght, Form("PreselectionHighSigVar%dValue",ivar), fHighSigCut[ivar]);
      }
   }

   gTools().AddAttr( wght, "AnalysisType", fForest.back()->GetAnalysisType() );

   for (UInt_t i=0; i< fForest.size(); i++) {
      void* trxml = fForest[i]->AddXMLTo(wght);
      gTools().AddAttr( trxml, "boostWeight", fBoostWeights[i] );
   }

   // clear the forest before reading new trees
   for (i=0; i<fForest.size(); i++) delete fForest[i];
   fForest.clear();
   fBoostWeights.clear();
   if (gTools().HasAttr( parent, Form("PreselectionLowBkgVar%d",0))) {
      fIsLowBkgCut.resize(GetNvar());
      fLowBkgCut.resize(GetNvar());
      fIsLowSigCut.resize(GetNvar());
      fLowSigCut.resize(GetNvar());
      fIsHighBkgCut.resize(GetNvar());
      fHighBkgCut.resize(GetNvar());
      fIsHighSigCut.resize(GetNvar());
      fHighSigCut.resize(GetNvar());

      Bool_t   tmpBool;
      Double_t tmpDouble;
      for (UInt_t ivar=0; ivar<GetNvar(); ivar++){
         gTools().ReadAttr( parent, Form("PreselectionLowBkgVar%d",ivar),       tmpBool);
         fIsLowBkgCut[ivar] = tmpBool;
         gTools().ReadAttr( parent, Form("PreselectionLowBkgVar%dValue",ivar),  tmpDouble);
         fLowBkgCut[ivar] = tmpDouble;
         gTools().ReadAttr( parent, Form("PreselectionLowSigVar%d",ivar),       tmpBool);
         fIsLowSigCut[ivar] = tmpBool;
         gTools().ReadAttr( parent, Form("PreselectionLowSigVar%dValue",ivar),  tmpDouble);
         fLowSigCut[ivar] = tmpDouble;
         gTools().ReadAttr( parent, Form("PreselectionHighBkgVar%d",ivar),      tmpBool);
         fIsHighBkgCut[ivar] = tmpBool;
         gTools().ReadAttr( parent, Form("PreselectionHighBkgVar%dValue",ivar), tmpDouble);
         fHighBkgCut[ivar] = tmpDouble;
         gTools().ReadAttr( parent, Form("PreselectionHighSigVar%d",ivar),      tmpBool);
         fIsHighSigCut[ivar] = tmpBool;
         gTools().ReadAttr( parent, Form("PreselectionHighSigVar%dValue",ivar), tmpDouble);
         fHighSigCut[ivar] = tmpDouble;
      }
   }

   if(gTools().HasAttr(parent, "TreeType")) {
      fForest.back()->SetTreeID(i++);
      fBoostWeights.push_back(boostWeight);
   }
   Int_t analysisType(0);

   istr >> dummy >> fNTrees;
   Log() << kINFO << "Read " << fNTrees << " Decision trees" << Endl;

   for (UInt_t i=0;i<fForest.size();i++) delete fForest[i];
   fForest.clear();
   fBoostWeights.clear();

   Int_t    iTree;
   Double_t boostWeight;
   for (int i=0;i<fNTrees;i++) {
      istr >> dummy >> iTree >> dummy >> boostWeight;
      if (iTree != i) {
         fForest.back()->Print( std::cout );
         Log() << kFATAL << "Error while reading weight file; mismatch iTree="
               << iTree << " i=" << i
               << " dummy " << dummy
               << " boostweight " << boostWeight
               << Endl;
      }
      fForest.push_back( new DecisionTree() );
      fForest.back()->SetAnalysisType(Types::EAnalysisType(analysisType));
      fForest.back()->SetTreeID(i);
      fForest.back()->Read(istr, GetTrainingTMVAVersionCode());
      fBoostWeights.push_back(boostWeight);
   }
Double_t TMVA::MethodBDT::GetMvaValue( Double_t* err, Double_t* errUpper )
{
   return this->GetMvaValue( err, errUpper, 0 );
}

Double_t TMVA::MethodBDT::GetMvaValue( Double_t* err, Double_t* errUpper, UInt_t useNTrees )
{
   const Event* ev = GetEvent();
   if (fDoPreselection) {
      Double_t val = ApplyPreselectionCuts(ev);
      if (TMath::Abs(val) > 0.05) return val;
   }
   return PrivateGetMvaValue(ev, err, errUpper, useNTrees);
}

Double_t TMVA::MethodBDT::PrivateGetMvaValue(const TMVA::Event* ev, Double_t* err, Double_t* errUpper, UInt_t useNTrees )
{
   // cannot determine error
   NoErrorCalc(err, errUpper);

   // allow for using fewer trees in the MVA evaluation than were trained
   UInt_t nTrees = fForest.size();

   if (useNTrees > 0 ) nTrees = useNTrees;
   if (fBoostType=="Grad") return GetGradBoostMVA(ev,nTrees);

   Double_t myMVA = 0;
   Double_t norm  = 0;
   for (UInt_t itree=0; itree<nTrees; itree++) {
      myMVA += fBoostWeights[itree] * fForest[itree]->CheckEvent(ev,fUseYesNoLeaf);
      norm  += fBoostWeights[itree];
   }
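   // Classification output of the forest: the weighted average
   //    MVA(x) = sum_i alpha_i h_i(x) / sum_i alpha_i
   // over all trees, with alpha_i the boost weights accumulated during training
   // (GradBoost instead uses the plain sum with the tanh mapping, see
   // GetGradBoostMVA above).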
   const Event *e = GetEvent();
   if (fMulticlassReturnVal == NULL) fMulticlassReturnVal = new std::vector<Float_t>();
   fMulticlassReturnVal->clear();

   UInt_t nClasses = DataInfo().GetNClasses();
   std::vector<Double_t> temp(nClasses);
   auto forestSize = fForest.size();

   // trees 0, nClasses, 2*nClasses, ... belong to class 0;
   // trees 1, nClasses+1, ... belong to class 1, and so forth
   UInt_t classOfTree = 0;
   for (UInt_t itree = 0; itree < forestSize; ++itree) {
      temp[classOfTree] += fForest[itree]->CheckEvent(e, kFALSE);
      if (++classOfTree == nClasses) classOfTree = 0; // cheap modulo
   }

   // softmax-like transformation of the per-class scores
   std::transform(temp.begin(), temp.end(), temp.begin(), [](Double_t d){ return exp(d); });

   for(UInt_t iClass=0; iClass<nClasses; iClass++){
      Double_t norm = 0.0;
      for(UInt_t j=0;j<nClasses;j++){
         if (iClass!=j) norm += temp[j] / temp[iClass];
      }
      (*fMulticlassReturnVal).push_back(1.0/(1.0+norm));
   }

   return *fMulticlassReturnVal;
   if (fRegressionReturnVal == NULL) fRegressionReturnVal = new std::vector<Float_t>();
   fRegressionReturnVal->clear();

   const Event * ev = GetEvent();
   Event * evT = new Event(*ev);

   Double_t myMVA = 0;
   Double_t norm  = 0;
   if (fBoostType=="AdaBoostR2") {
      vector< Double_t > response(fForest.size());
      vector< Double_t > weight(fForest.size());
      Double_t           totalSumOfWeights = 0;

      for (UInt_t itree=0; itree<fForest.size(); itree++) {
         response[itree] = fForest[itree]->CheckEvent(ev,kFALSE);
         weight[itree]   = fBoostWeights[itree];
         totalSumOfWeights += fBoostWeights[itree];
      }

      std::vector< std::vector<Double_t> > vtemp;
      vtemp.push_back( response ); // this is the vector that will get sorted
      vtemp.push_back( weight );
      gTools().UsefulSortAscending( vtemp );

      Int_t t = fForest.size()-1;
      Double_t sumOfWeights = 0;
      while (sumOfWeights <= totalSumOfWeights/2.) {
         sumOfWeights += vtemp[1][t];
         t--;
      }
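      // AdaBoostR2 regression output: after sorting the (response, weight) pairs
      // by response, this loop walks down from the largest response until half of
      // the total boost weight is accumulated, i.e. the prediction is (based on)
      // the weighted median of the individual tree responses.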
   }
   else if(fBoostType=="Grad"){
      for (UInt_t itree=0; itree<fForest.size(); itree++) {
         myMVA += fForest[itree]->CheckEvent(ev,kFALSE);
      }

      evT->SetTarget(0, myMVA+fBoostWeights[0] );
   }
   else {
      for (UInt_t itree=0; itree<fForest.size(); itree++) {
         myMVA += fBoostWeights[itree] * fForest[itree]->CheckEvent(ev,kFALSE);
         norm  += fBoostWeights[itree];
      }

      evT->SetTarget(0, myMVA/norm );
   }

   const Event* evT2 = GetTransformationHandler().InverseTransform( evT );
   fRegressionReturnVal->push_back( evT2->GetTarget(0) );

   delete evT;

   return *fRegressionReturnVal;
   Log() << kDEBUG << "\tWrite monitoring histograms to file: " << BaseDir()->GetPath() << Endl;

   fMonitorNtuple->Write();
   fVariableImportance.resize(GetNvar());
   for (UInt_t ivar = 0; ivar < GetNvar(); ivar++) {
      fVariableImportance[ivar]=0;
   }
   Double_t sum=0;
   for (UInt_t itree = 0; itree < GetNTrees(); itree++) {
      std::vector<Double_t> relativeImportance(fForest[itree]->GetVariableImportance());
      for (UInt_t i=0; i< relativeImportance.size(); i++) {
         fVariableImportance[i] += fBoostWeights[itree] * relativeImportance[i];
      }
   }

   for (UInt_t ivar=0; ivar< fVariableImportance.size(); ivar++){
      fVariableImportance[ivar] = TMath::Sqrt(fVariableImportance[ivar]);
      sum += fVariableImportance[ivar];
   }
   for (UInt_t ivar=0; ivar< fVariableImportance.size(); ivar++) fVariableImportance[ivar] /= sum;
2532 return fVariableImportance;
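// The same aggregation without the TMVA scaffolding; a minimal sketch that
// assumes the per-tree importances and boost weights are already in hand:
#include <cmath>
#include <vector>

std::vector<double> CombineImportance(const std::vector<std::vector<double> >& perTree,
                                      const std::vector<double>& boostWeights) {
   std::vector<double> imp(perTree[0].size(), 0.0);
   for (size_t t = 0; t < perTree.size(); ++t)           // boost-weighted sum over trees
      for (size_t v = 0; v < imp.size(); ++v)
         imp[v] += boostWeights[t] * perTree[t][v];
   double sum = 0;
   for (double& v : imp) { v = std::sqrt(v); sum += v; } // square root, as above
   for (double& v : imp) v /= sum;                       // normalise to unit total
   return imp;
}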
// GetVariableImportance(ivar): importance of a single variable
std::vector<Double_t> relativeImportance = this->GetVariableImportance();
if (ivar < (UInt_t)relativeImportance.size()) return relativeImportance[ivar];
else Log() << kFATAL << "<GetVariableImportance> ivar = " << ivar << " is out of range " << Endl;
// CreateRanking: compute the ranking of the input variables
vector< Double_t > importance(this->GetVariableImportance());

for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
   fRanking->AddRank( Rank( GetInputLabel(ivar), importance[ivar] ) );
}
// GetHelpMessage: get help message text
Log() << "Boosted Decision Trees are a collection of individual decision" << Endl;
Log() << "trees which form a multivariate classifier by (weighted) majority " << Endl;
Log() << "vote of the individual trees. Consecutive decision trees are " << Endl;
Log() << "trained using the original training data set with re-weighted " << Endl;
Log() << "events. By default, the AdaBoost method is employed, which gives " << Endl;
Log() << "events that were misclassified in the previous tree a larger " << Endl;
Log() << "weight in the training of the following tree." << Endl;
Log() << Endl;
Log() << "Decision trees are a sequence of binary splits of the data sample" << Endl;
Log() << "using a single discriminant variable at a time. A test event " << Endl;
Log() << "ending up after the sequence of left-right splits in a final " << Endl;
Log() << "(\"leaf\") node is classified as either signal or background" << Endl;
Log() << "depending on the majority type of training events in that node." << Endl;
Log() << Endl;
Log() << "By the nature of the binary splits performed on the individual" << Endl;
Log() << "variables, decision trees do not deal well with linear correlations" << Endl;
Log() << "between variables (they need to approximate the linear split in" << Endl;
Log() << "the two dimensional space by a sequence of splits on the two " << Endl;
Log() << "variables individually). Hence decorrelation could be useful " << Endl;
Log() << "to optimise the BDT performance." << Endl;
Log() << Endl;
Log() << "The two most important parameters in the configuration are the " << Endl;
Log() << "minimal number of events requested by a leaf node as percentage of the " << Endl;
Log() << "   number of training events (option \"MinNodeSize\", replacing the actual number " << Endl;
Log() << "   of events \"nEventsMin\" as given in earlier versions)." << Endl;
Log() << "If this number is too large, detailed features " << Endl;
Log() << "in the parameter space cannot be modelled. If it is too small, " << Endl;
Log() << "the risk of overtraining rises and boosting becomes less effective." << Endl;
Log() << "   Typical values from our current experience for best performance " << Endl;
Log() << "   are between 0.5(%) and 10(%) of the number of training events." << Endl;
Log() << Endl;
Log() << "The default minimal number is currently set to " << Endl;
Log() << "   max(20, (N_training_events / N_variables^2 / 10)) " << Endl;
Log() << "and can be changed by the user." << Endl;
Log() << Endl;
Log() << "The other crucial parameter, the pruning strength (\"PruneStrength\")," << Endl;
Log() << "is also related to overtraining. It is a regularisation parameter " << Endl;
Log() << "that is used when determining after the training which splits " << Endl;
Log() << "are considered statistically insignificant and are removed. The" << Endl;
Log() << "user is advised to carefully watch the BDT screen output for" << Endl;
Log() << "the comparison between efficiencies obtained on the training and" << Endl;
Log() << "the independent test sample. They should be equal within statistical" << Endl;
Log() << "errors; sizeable differences indicate overtraining." << Endl;
// MakeClassSpecific: write a ROOT-independent C++ class for the classifier response
fout << "   std::vector<"<<nodeName<<"*> fForest;       // i.e. root nodes of decision trees" << std::endl;
fout << "   std::vector<double>  fBoostWeights; // the weights applied in the individual boosts" << std::endl;
fout << "};" << std::endl << std::endl;
fout << "double " << className << "::GetMvaValue__( const std::vector<double>& inputValues ) const" << std::endl;
fout << "{" << std::endl;
fout << "   double myMVA = 0;" << std::endl;
if (fDoPreselection){
   // emit the preselection cuts first: they decide the event outright
   for (UInt_t ivar = 0; ivar< fIsLowBkgCut.size(); ivar++){
      if (fIsLowBkgCut[ivar]){
         fout << "   if (inputValues["<<ivar<<"] < " << fLowBkgCut[ivar] << ") return -1; // is background preselection cut" << std::endl;
      }
      if (fIsLowSigCut[ivar]){
         fout << "   if (inputValues["<<ivar<<"] < "<< fLowSigCut[ivar] << ") return  1; // is signal preselection cut" << std::endl;
      }
      if (fIsHighBkgCut[ivar]){
         fout << "   if (inputValues["<<ivar<<"] > "<<fHighBkgCut[ivar] << ") return -1; // is background preselection cut" << std::endl;
      }
      if (fIsHighSigCut[ivar]){
         fout << "   if (inputValues["<<ivar<<"] > "<<fHighSigCut[ivar]<< ") return  1; // is signal preselection cut" << std::endl;
      }
   }
}

if (fBoostType!="Grad"){
   fout << "   double norm  = 0;" << std::endl;
}
fout << "   for (unsigned int itree=0; itree<fForest.size(); itree++){" << std::endl;
fout << "      "<<nodeName<<" *current = fForest[itree];" << std::endl;
fout << "      while (current->GetNodeType() == 0) { //intermediate node" << std::endl;
fout << "         if (current->GoesRight(inputValues)) current=("<<nodeName<<"*)current->GetRight();" << std::endl;
fout << "         else current=("<<nodeName<<"*)current->GetLeft();" << std::endl;
fout << "      }" << std::endl;
if (fBoostType=="Grad"){
   fout << "      myMVA += current->GetResponse();" << std::endl;
}
else {
   if (fUseYesNoLeaf) fout << "      myMVA += fBoostWeights[itree] * current->GetNodeType();" << std::endl;
   else               fout << "      myMVA += fBoostWeights[itree] * current->GetPurity();" << std::endl;
   fout << "      norm += fBoostWeights[itree];" << std::endl;
}
fout << "   }" << std::endl;
if (fBoostType=="Grad"){
   fout << "   return 2.0/(1.0+exp(-2.0*myMVA))-1.0;" << std::endl;
}
else fout << "   return myMVA /= norm;" << std::endl;
fout << "};" << std::endl << std::endl;

fout << "void " << className << "::Initialize()" << std::endl;
fout << "{" << std::endl;
// write the forest: one boost weight plus one node-instantiation tree each
for (UInt_t itree=0; itree<GetNTrees(); itree++) {
   fout << "  // itree = " << itree << std::endl;
   fout << "  fBoostWeights.push_back(" << fBoostWeights[itree] << ");" << std::endl;
   fout << "  fForest.push_back( " << std::endl;
   this->MakeClassInstantiateNode((DecisionTreeNode*)fForest[itree]->GetRoot(), fout, className);
   fout << "   );" << std::endl;
}
fout << "   return;" << std::endl;
fout << "};" << std::endl;
fout << " " << std::endl;
fout << "// Clean up" << std::endl;
fout << "inline void " << className << "::Clear() " << std::endl;
fout << "{" << std::endl;
fout << "   for (unsigned int itree=0; itree<fForest.size(); itree++) { " << std::endl;
fout << "      delete fForest[itree]; " << std::endl;
fout << "   }" << std::endl;
fout << "}" << std::endl;
// MakeClassSpecificHeader: write the node class used by the standalone response class
fout << "#define NN new "<<nodeName << std::endl; // short-hand to keep the generated file compact
fout << "   " << std::endl;
fout << "#ifndef "<<nodeName<<"__def" << std::endl;
fout << "#define "<<nodeName<<"__def" << std::endl;
fout << "   " << std::endl;
fout << "class "<<nodeName<<" {" << std::endl;
fout << "   " << std::endl;
fout << "public:" << std::endl;
fout << "   " << std::endl;
fout << "   // constructor of an essentially \"empty\" node floating in space" << std::endl;
fout << "   "<<nodeName<<" ( "<<nodeName<<"* left,"<<nodeName<<"* right," << std::endl;
if (fUseFisherCuts){
   fout << "                     int nFisherCoeff," << std::endl;
   for (UInt_t i=0;i<GetNVariables()+1;i++){
      fout << "                     double fisherCoeff"<<i<<"," << std::endl;
   }
}
fout << "                     int selector, double cutValue, bool cutType, " << std::endl;
fout << "                     int nodeType, double purity, double response ) :" << std::endl;
fout << "   fLeft         ( left         )," << std::endl;
fout << "   fRight        ( right        )," << std::endl;
if (fUseFisherCuts) fout << "   fNFisherCoeff ( nFisherCoeff )," << std::endl;
fout << "   fSelector     ( selector     )," << std::endl;
fout << "   fCutValue     ( cutValue     )," << std::endl;
fout << "   fCutType      ( cutType      )," << std::endl;
fout << "   fNodeType     ( nodeType     )," << std::endl;
fout << "   fPurity       ( purity       )," << std::endl;
fout << "   fResponse     ( response     ){" << std::endl;
if (fUseFisherCuts){
   for (UInt_t i=0;i<GetNVariables()+1;i++){
      fout << "     fFisherCoeff.push_back(fisherCoeff"<<i<<");" << std::endl;
   }
}
fout << "   }" << std::endl << std::endl;
fout << "   virtual ~"<<nodeName<<"();" << std::endl << std::endl;
fout << "   // test event if it descends the tree at this node to the right" << std::endl;
fout << "   virtual bool GoesRight( const std::vector<double>& inputValues ) const;" << std::endl;
fout << "   "<<nodeName<<"* GetRight( void ) {return fRight; };" << std::endl << std::endl;
fout << "   // test event if it descends the tree at this node to the left " << std::endl;
fout << "   virtual bool GoesLeft ( const std::vector<double>& inputValues ) const;" << std::endl;
fout << "   "<<nodeName<<"* GetLeft( void ) { return fLeft; }; " << std::endl << std::endl;
fout << "   // return S/(S+B) (purity) at this node (from training)" << std::endl << std::endl;
fout << "   double GetPurity( void ) const { return fPurity; } " << std::endl;
fout << "   // return the node type" << std::endl;
fout << "   int    GetNodeType( void ) const { return fNodeType; }" << std::endl;
fout << "   double GetResponse(void) const {return fResponse;}" << std::endl << std::endl;
fout << "private:" << std::endl << std::endl;
fout << "   "<<nodeName<<"*   fLeft;     // pointer to the left daughter node" << std::endl;
fout << "   "<<nodeName<<"*   fRight;    // pointer to the right daughter node" << std::endl;
if (fUseFisherCuts){
   fout << "   int                 fNFisherCoeff; // =0 if this node doesn't use fisher, else =nvar+1 " << std::endl;
   fout << "   std::vector<double> fFisherCoeff;  // the fisher coeff (offset at the last element)" << std::endl;
}
fout << "   int                 fSelector; // index of variable used in node selection (decision tree) " << std::endl;
fout << "   double              fCutValue; // cut value applied on this node to discriminate bkg against sig" << std::endl;
fout << "   bool                fCutType;  // true: if event variable > cutValue ==> signal , false otherwise" << std::endl;
fout << "   int                 fNodeType; // Type of node: -1 == Bkg-leaf, 1 == Signal-leaf, 0 = internal " << std::endl;
fout << "   double              fPurity;   // Purity of node from training"<< std::endl;
fout << "   double              fResponse; // Regression response value of node" << std::endl;
fout << "}; " << std::endl;
fout << "   " << std::endl;
fout << "//_______________________________________________________________________" << std::endl;
fout << "   "<<nodeName<<"::~"<<nodeName<<"()" << std::endl;
fout << "{" << std::endl;
fout << "   if (fLeft  != NULL) delete fLeft;" << std::endl;
fout << "   if (fRight != NULL) delete fRight;" << std::endl;
fout << "}; " << std::endl;
fout << "   " << std::endl;
fout << "//_______________________________________________________________________" << std::endl;
fout << "bool "<<nodeName<<"::GoesRight( const std::vector<double>& inputValues ) const" << std::endl;
fout << "{" << std::endl;
fout << "   // test event if it descends the tree at this node to the right" << std::endl;
fout << "   bool result;" << std::endl;
if (fUseFisherCuts){
   fout << "   if (fNFisherCoeff == 0){" << std::endl;
   fout << "     result = (inputValues[fSelector] > fCutValue );" << std::endl;
   fout << "   }else{" << std::endl;
   fout << "     double fisher = fFisherCoeff.at(fFisherCoeff.size()-1);" << std::endl;
   fout << "     for (unsigned int ivar=0; ivar<fFisherCoeff.size()-1; ivar++)" << std::endl;
   fout << "       fisher += fFisherCoeff.at(ivar)*inputValues.at(ivar);" << std::endl;
   fout << "     result = fisher > fCutValue;" << std::endl;
   fout << "   }" << std::endl;
}
else {
   fout << "   result = (inputValues[fSelector] > fCutValue );" << std::endl;
}
fout << "   if (fCutType == true) return result; //the cuts are selecting Signal ;" << std::endl;
fout << "   else return !result;" << std::endl;
fout << "}" << std::endl;
fout << "   " << std::endl;
fout << "//_______________________________________________________________________" << std::endl;
fout << "bool "<<nodeName<<"::GoesLeft( const std::vector<double>& inputValues ) const" << std::endl;
fout << "{" << std::endl;
fout << "   // test event if it descends the tree at this node to the left" << std::endl;
fout << "   if (!this->GoesRight(inputValues)) return true;" << std::endl;
fout << "   else return false;" << std::endl;
fout << "}" << std::endl;
fout << "   " << std::endl;
fout << "#endif" << std::endl;
fout << "   " << std::endl;
// MakeClassInstantiateNode: recursively descend the tree and write each node instantiation
if (n == NULL) {
   Log() << kFATAL << "MakeClassInstantiateNode: started with undefined node" << Endl;
   return;
}
fout << "NN("<<std::endl;
// recurse into the daughters; a missing child is written as a null pointer
if (n->GetLeft() != NULL) this->MakeClassInstantiateNode( (DecisionTreeNode*)n->GetLeft(), fout, className);
else                      fout << "0";
fout << ", " <<std::endl;
if (n->GetRight() != NULL) this->MakeClassInstantiateNode( (DecisionTreeNode*)n->GetRight(), fout, className);
else                       fout << "0";
fout << ", " << std::endl
     << std::setprecision(6);
if (fUseFisherCuts){
   // write the number of Fisher coefficients, then the coefficients themselves
   fout << n->GetNFisherCoeff() << ", ";
   for (UInt_t i=0; i< GetNVariables()+1; i++) {
      if (n->GetNFisherCoeff() == 0) fout << "0, ";
      else                           fout << n->GetFisherCoeff(i) << ", ";
   }
}
// DeterminePreselectionCuts: find useful preselection cuts that will be
// applied before the decision tree training
Double_t nTotS = 0.0, nTotB = 0.0;
Int_t nTotS_unWeighted = 0, nTotB_unWeighted = 0;

std::vector<TMVA::BDTEventWrapper> bdtEventSample;

// reset the preselection cut flags and values for all variables
fIsLowSigCut.assign(GetNvar(),kFALSE);
fIsLowBkgCut.assign(GetNvar(),kFALSE);
fIsHighSigCut.assign(GetNvar(),kFALSE);
fIsHighBkgCut.assign(GetNvar(),kFALSE);

fLowSigCut.assign(GetNvar(),0.);
fLowBkgCut.assign(GetNvar(),0.);
fHighSigCut.assign(GetNvar(),0.);
fHighBkgCut.assign(GetNvar(),0.);

// sum up the total signal and background weights
for( std::vector<const TMVA::Event*>::const_iterator it = eventSample.begin(); it != eventSample.end(); ++it ) {
   if (DataInfo().IsSignal(*it)){
      nTotS += (*it)->GetWeight();
      ++nTotS_unWeighted;
   }
   else {
      nTotB += (*it)->GetWeight();
      ++nTotB_unWeighted;
   }
   bdtEventSample.push_back(TMVA::BDTEventWrapper(*it));
}

for( UInt_t ivar = 0; ivar < GetNvar(); ivar++ ) { // loop over all discriminating variables
   TMVA::BDTEventWrapper::SetVarIndex(ivar); // select the variable to sort by
   std::sort( bdtEventSample.begin(),bdtEventSample.end() );

   Double_t bkgWeightCtr = 0.0, sigWeightCtr = 0.0;
   std::vector<TMVA::BDTEventWrapper>::iterator it = bdtEventSample.begin(), it_end = bdtEventSample.end();
   for( ; it != it_end; ++it ) {
      if (DataInfo().IsSignal(**it))
         sigWeightCtr += (**it)->GetWeight();
      else
         bkgWeightCtr += (**it)->GetWeight();
      // store the accumulated signal (background) weights
      it->SetCumulativeWeight(false,bkgWeightCtr);
      it->SetCumulativeWeight(true,sigWeightCtr);
   }

   Double_t dVal = (DataInfo().GetVariableInfo(ivar).GetMax() - DataInfo().GetVariableInfo(ivar).GetMin())/100. ; // safety margin for the cut
   Double_t nSelS, nSelB, effS=0.05, effB=0.05, rejS=0.05, rejB=0.05;
   Double_t tmpEffS, tmpEffB, tmpRejS, tmpRejB;

   // scan the sorted sample for regions where a cut keeps only signal (or
   // only background): these are candidate preselection cuts
   for(UInt_t iev = 1; iev < bdtEventSample.size(); iev++) {
      nSelS = bdtEventSample[iev].GetCumulativeWeight(true);
      nSelB = bdtEventSample[iev].GetCumulativeWeight(false);

      tmpEffS = nSelS/nTotS;
      tmpEffB = nSelB/nTotB;
      tmpRejS = 1.0-tmpEffS;
      tmpRejB = 1.0-tmpEffB;

      if      (nSelS==0     && tmpEffB>effB) {effB=tmpEffB; fLowBkgCut[ivar]  = bdtEventSample[iev].GetVal() - dVal; fIsLowBkgCut[ivar]=kTRUE;}
      else if (nSelB==0     && tmpEffS>effS) {effS=tmpEffS; fLowSigCut[ivar]  = bdtEventSample[iev].GetVal() - dVal; fIsLowSigCut[ivar]=kTRUE;}
      else if (nSelB==nTotB && tmpRejS>rejS) {rejS=tmpRejS; fHighSigCut[ivar] = bdtEventSample[iev].GetVal() + dVal; fIsHighSigCut[ivar]=kTRUE;}
      else if (nSelS==nTotS && tmpRejB>rejB) {rejB=tmpRejB; fHighBkgCut[ivar] = bdtEventSample[iev].GetVal() + dVal; fIsHighBkgCut[ivar]=kTRUE;}
   }
}
Log() << kDEBUG << " \tfound and suggest the following possible pre-selection cuts " << Endl;
if (fDoPreselection) Log() << kDEBUG << "\tthe training will be done after these cuts... and GetMVA value returns +1, (-1) for a signal (bkg) event that passes these cuts" << Endl;
else Log() << kDEBUG << "\tas option DoPreselection was not used, these cuts however will not be performed, but the training will see the full sample"<< Endl;
for (UInt_t ivar=0; ivar < GetNvar(); ivar++ ) { // loop over all discriminating variables
   if (fIsLowBkgCut[ivar]){
      Log() << kDEBUG << " \tfound cut: Bkg if var " << ivar << " < " << fLowBkgCut[ivar] << Endl;
   }
   if (fIsLowSigCut[ivar]){
      Log() << kDEBUG << " \tfound cut: Sig if var " << ivar << " < " << fLowSigCut[ivar] << Endl;
   }
   if (fIsHighBkgCut[ivar]){
      Log() << kDEBUG << " \tfound cut: Bkg if var " << ivar << " > " << fHighBkgCut[ivar] << Endl;
   }
   if (fIsHighSigCut[ivar]){
      Log() << kDEBUG << " \tfound cut: Sig if var " << ivar << " > " << fHighSigCut[ivar] << Endl;
   }
}
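// Reduced to its essence, the scan above walks the sample sorted in one
// variable and looks for a pure-background (or pure-signal) tail; a minimal
// standalone sketch for the low-side background cut (names local to the sketch):
#include <algorithm>
#include <vector>

struct Pt { double val; bool isSignal; };

bool FindLowBkgCut(std::vector<Pt> sample, double& cut) {
   std::sort(sample.begin(), sample.end(),
             [](const Pt& a, const Pt& b){ return a.val < b.val; });
   const double margin = 0.01 * (sample.back().val - sample.front().val);
   for (const Pt& p : sample) {
      if (p.isSignal) {                     // reached the first signal event
         cut = p.val - margin;              // place the cut just below it
         return cut > sample.front().val;   // useful only if some background lies below
      }
   }
   return false; // degenerate: no signal in the sample
}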
// ApplyPreselectionCuts: returns -1 (+1) if the event is decided to be
// background (signal) by a preselection cut, and 0 otherwise
Double_t result = 0;

for (UInt_t ivar=0; ivar < GetNvar(); ivar++ ) {
   if (fIsLowBkgCut[ivar]){
      if (ev->GetValue(ivar) < fLowBkgCut[ivar])  result = -1; // is background
   }
   if (fIsLowSigCut[ivar]){
      if (ev->GetValue(ivar) < fLowSigCut[ivar])  result =  1; // is signal
   }
   if (fIsHighBkgCut[ivar]){
      if (ev->GetValue(ivar) > fHighBkgCut[ivar]) result = -1; // is background
   }
   if (fIsHighSigCut[ivar]){
      if (ev->GetValue(ivar) > fHighSigCut[ivar]) result =  1; // is signal
   }
}

return result;