65 : fVisHistsUseImp(
kTRUE ),
69 std::srand( randSEED );
80 , fVisHistsUseImp(
kTRUE )
99 UInt_t neve = fTrainingEvents.size();
102 fNEveEffTrain = CalcWeightSum( &fTrainingEvents );
111 this->SetMethodBase(rfbase);
112 fRuleEnsemble.Initialize(
this );
113 fRuleFitParams.SetRuleFit(
this );
125 UInt_t nevents = fMethodRuleFit->Data()->GetNTrainingEvents();
126 std::vector<const TMVA::Event*> tmp;
127 for (
Long64_t ievt=0; ievt<nevents; ievt++) {
128 const Event *
event = fMethodRuleFit->GetEvent(ievt);
129 tmp.push_back(event);
131 SetTrainingEvents( tmp );
140 fRuleEnsemble.MakeModel();
143 fRuleFitParams.Init();
152 fMethodBase = rfbase;
153 fMethodRuleFit =
dynamic_cast<const MethodRuleFit *
>(rfbase);
177 if (events==0)
return 0.0;
178 if (neve==0) neve=events->size();
181 for (
UInt_t ie=0; ie<neve; ie++) {
182 sumw += ((*events)[ie])->GetWeight();
192 fLogger->SetMinType(t);
193 fRuleEnsemble.SetMsgType(t);
194 fRuleFitParams.SetMsgType(t);
203 if (fMethodRuleFit==0) {
204 Log() << kFATAL <<
"RuleFit::BuildTree() - Attempting to build a tree NOT from a MethodRuleFit" <<
Endl;
206 std::vector<const Event *> evevec;
207 for (
UInt_t ie=0; ie<fNTreeSample; ie++) {
208 evevec.push_back(fTrainingEventsRndm[ie]);
223 if (fMethodRuleFit==0) {
224 Log() << kFATAL <<
"RuleFit::BuildTree() - Attempting to build a tree NOT from a MethodRuleFit" <<
Endl;
226 Log() << kDEBUG <<
"Creating a forest with " << fMethodRuleFit->GetNTrees() <<
" decision trees" <<
Endl;
227 Log() << kDEBUG <<
"Each tree is built using a random subsample with " << fNTreeSample <<
" events" <<
Endl;
229 Timer timer( fMethodRuleFit->GetNTrees(),
"RuleFit" );
240 Bool_t useBoost = fMethodRuleFit->UseBoost();
242 if (useBoost) SaveEventWeights();
244 for (
Int_t i=0; i<fMethodRuleFit->GetNTrees(); i++) {
246 if (!useBoost) ReshuffleEvents();
249 for (
UInt_t ie = 0; ie<fNTreeSample; ie++) {
250 if (fMethodBase->DataInfo().IsSignal(fTrainingEventsRndm[ie])) nsig++;
259 const Int_t ntriesMax=10;
262 frnd = 100*rndGen.
Uniform( fMethodRuleFit->GetMinFracNEve(), 0.5*fMethodRuleFit->GetMaxFracNEve() );
264 Bool_t useRandomisedTree = !useBoost;
265 dt =
new DecisionTree( fMethodRuleFit->GetSeparationBase(), frnd, fMethodRuleFit->GetNCuts(), &(fMethodRuleFit->DataInfo()), iclass, useRandomisedTree);
266 dt->
SetNVars(fMethodBase->GetNvar());
274 tryAgain = ((dt==0) && (ntries<ntriesMax));
277 fForest.push_back(dt);
278 if (useBoost) Boost(dt);
282 Log() << kWARNING <<
"------------------------------------------------------------------" <<
Endl;
283 Log() << kWARNING <<
" Failed growing a tree even after " << ntriesMax <<
" trials" <<
Endl;
284 Log() << kWARNING <<
" Possible solutions: " <<
Endl;
285 Log() << kWARNING <<
" 1. increase the number of training events" <<
Endl;
286 Log() << kWARNING <<
" 2. set a lower min fraction cut (fEventsMin)" <<
Endl;
287 Log() << kWARNING <<
" 3. maybe also decrease the max fraction cut (fEventsMax)" <<
Endl;
288 Log() << kWARNING <<
" If the above warning occurs rarely only, it can be ignored" <<
Endl;
289 Log() << kWARNING <<
"------------------------------------------------------------------" <<
Endl;
292 Log() << kDEBUG <<
"Built tree with minimum cut at N = " << frnd <<
"% events"
293 <<
" => N(nodes) = " << fForest.back()->GetNNodes()
294 <<
" ; n(tries) = " << ntries
299 if (useBoost) RestoreEventWeights();
310 fEventWeights.clear();
311 for (std::vector<const Event*>::iterator
e=fTrainingEvents.begin();
e!=fTrainingEvents.end();
e++) {
312 Double_t w = (*e)->GetBoostWeight();
313 fEventWeights.push_back(w);
323 if (fEventWeights.size() != fTrainingEvents.size()) {
324 Log() << kERROR <<
"RuleFit::RestoreEventWeights() called without having called SaveEventWeights() before!" <<
Endl;
327 for (std::vector<const Event*>::iterator
e=fTrainingEvents.begin();
e!=fTrainingEvents.end();
e++) {
328 (*e)->SetBoostWeight(fEventWeights[ie]);
343 std::vector<Char_t> correctSelected;
345 for (std::vector<const Event*>::iterator
e=fTrainingEvents.begin();
e!=fTrainingEvents.end();
e++) {
350 if (isSignalType == fMethodBase->DataInfo().IsSignal(*
e)) {
351 correctSelected.push_back(
kTRUE);
355 correctSelected.push_back(
kFALSE);
363 Double_t boostWeight = (err>0 ? (1.0-err)/err : 1000.0);
367 for (std::vector<const Event*>::iterator
e=fTrainingEvents.begin();
e!=fTrainingEvents.end();
e++) {
368 if (!correctSelected[ie])
369 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostWeight);
370 newSumw+=(*e)->GetWeight();
375 for (std::vector<const Event*>::iterator
e=fTrainingEvents.begin();
e!=fTrainingEvents.end();
e++) {
376 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * scale);
378 Log() << kDEBUG <<
"boostWeight = " << boostWeight <<
" scale = " << scale <<
Endl;
387 UInt_t ntrees = fForest.size();
388 if (ntrees==0)
return;
393 for (
UInt_t i=0; i<ntrees; i++) {
400 Log() << kVERBOSE <<
"Nodes in trees: average & std dev = " << sumn/ntrees <<
" , " << sig <<
Endl;
410 Log() << kVERBOSE <<
"Fitting rule/linear terms" <<
Endl;
411 fRuleFitParams.MakeGDPath();
419 Log() << kVERBOSE <<
"Calculating importance" <<
Endl;
420 fRuleEnsemble.CalcImportance();
421 fRuleEnsemble.CleanupRules();
422 fRuleEnsemble.CleanupLinear();
423 fRuleEnsemble.CalcVarImportance();
424 Log() << kVERBOSE <<
"Filling rule statistics" <<
Endl;
425 fRuleEnsemble.RuleResponseStats();
433 return fRuleEnsemble.EvalEvent( e );
441 if (fMethodRuleFit==0)
Log() << kFATAL <<
"RuleFit::SetTrainingEvents - MethodRuleFit not initialized" <<
Endl;
443 if (neve==0)
Log() << kWARNING <<
"An empty sample of training events was given" <<
Endl;
446 fTrainingEvents.clear();
447 fTrainingEventsRndm.clear();
448 for (
UInt_t i=0; i<neve; i++) {
449 fTrainingEvents.push_back(static_cast< const Event *>(el[i]));
450 fTrainingEventsRndm.push_back(static_cast< const Event *>(el[i]));
454 std::random_shuffle( fTrainingEventsRndm.begin(), fTrainingEventsRndm.end() );
457 fNTreeSample =
static_cast<UInt_t>(neve*fMethodRuleFit->GetTreeEveFrac());
458 Log() << kDEBUG <<
"Number of events per tree : " << fNTreeSample
459 <<
" ( N(events) = " << neve <<
" )"
460 <<
" randomly drawn without replacement" <<
Endl;
469 if ((nevents<fTrainingEventsRndm.size()) && (nevents>0)) {
470 evevec.resize(nevents);
471 for (
UInt_t ie=0; ie<nevents; ie++) {
472 evevec[ie] = fTrainingEventsRndm[ie];
476 Log() << kWARNING <<
"GetRndmSampleEvents() : requested sub sample size larger than total size (BUG!).";
487 if (hlist.empty())
return;
494 for (
UInt_t i=0; i<hlist.size(); i++) {
504 if (wm<wmin) wmin=wm;
521 for (
UInt_t i=0; i<hlist.size(); i++) {
540 if (!ruleHasVar)
return;
543 if(firstbin<0) firstbin=0;
551 Double_t fbfrac = (dormin ? ((fbmin+xbinw-rmin)/xbinw):1.0);
552 Double_t lbfrac = (dormax ? ((rmax-lbmax+xbinw)/xbinw):1.0);
557 for (
Int_t bin = binmin; bin<binmax+1; bin++) {
558 fbin = bin-firstbin+1;
562 else if (bin==binmax) {
570 if (fVisHistsUseImp) {
576 h2->
Fill(xc,0.5,val*f);
586 if (!fRuleEnsemble.DoLinear())
return;
592 if (fVisHistsUseImp) {
593 val = fRuleEnsemble.GetLinImportance(vind);
596 val = fRuleEnsemble.GetLinCoefficients(vind);
598 for (
Int_t bin = firstbin; bin<lastbin+1; bin++) {
600 h2->
Fill(xc,0.5,val);
612 if (fVisHistsUseImp) {
619 Double_t rxmin, rxmax, rymin, rymax;
620 Bool_t dorxmin, dorxmax, dorymin, dorymax;
626 if (!(ruleHasVarX || ruleHasVarY))
return;
645 Double_t fxbinmin = (dorxmin ? ((xbinmin+xbinw-vxmin)/xbinw):1.0);
646 Double_t fxbinmax = (dorxmax ? ((vxmax-xbinmax+xbinw)/xbinw):1.0);
647 Double_t fybinmin = (dorymin ? ((ybinmin+ybinw-vymin)/ybinw):1.0);
648 Double_t fybinmax = (dorymax ? ((vymax-ybinmax+ybinw)/ybinw):1.0);
653 for (
Int_t binx = binxmin; binx<binxmax+1; binx++) {
657 else if (binx==binxmax) {
664 for (
Int_t biny = binymin; biny<binymax+1; biny++) {
668 else if (biny==binymax) {
675 h2->
Fill(xc,yc,val*fx*fy);
685 Int_t nhists = hlist.size();
686 Int_t nvar = fMethodBase->GetNvar();
687 if (nhists!=nvar)
Log() << kFATAL <<
"BUG TRAP: number of hists is not equal the number of variables!" <<
Endl;
689 std::vector<Int_t> vindex;
692 for (
Int_t ih=0; ih<nhists; ih++) {
693 hstr = hlist[ih]->GetTitle();
694 for (
Int_t iv=0; iv<nvar; iv++) {
695 if (fMethodBase->GetInputTitle(iv) == hstr)
696 vindex.push_back(iv);
700 for (
Int_t iv=0; iv<nvar; iv++) {
703 FillCut(hlist[iv],rule,vindex[iv]);
707 FillLin(hlist[iv],vindex[iv]);
718 if (!(ruleimp>0))
return;
719 if (ruleimp<fRuleEnsemble.GetImportanceCut())
return;
721 Int_t nhists = hlist.size();
722 Int_t nvar = fMethodBase->GetNvar();
723 Int_t ncorr = (nvar*(nvar+1)/2)-nvar;
724 if (nhists!=ncorr)
Log() << kERROR <<
"BUG TRAP: number of corr hists is not correct! ncorr = "
725 << ncorr <<
" nvar = " << nvar <<
" nhists = " << nhists <<
Endl;
727 std::vector< std::pair<Int_t,Int_t> > vindex;
731 for (
Int_t ih=0; ih<nhists; ih++) {
732 hstr = hlist[ih]->GetName();
733 if (GetCorrVars( hstr, var1, var2 )) {
734 iv1 = fMethodBase->DataInfo().FindVarIndex( var1 );
735 iv2 = fMethodBase->DataInfo().FindVarIndex( var2 );
736 vindex.push_back( std::pair<Int_t,Int_t>(iv2,iv1) );
739 Log() << kERROR <<
"BUG TRAP: should not be here - failed getting var1 and var2" <<
Endl;
743 for (
Int_t ih=0; ih<nhists; ih++) {
746 FillCorr(hlist[ih],rule,vindex[ih].first,vindex[ih].second);
764 var1 = titleCopy(0,splitPos);
765 var2 = titleCopy(splitPos+4, titleCopy.
Length());
778 const TString directories[5] = {
"InputVariables_Id",
779 "InputVariables_Deco",
780 "InputVariables_PCA",
781 "InputVariables_Gauss",
782 "InputVariables_Gauss_Deco" };
784 const TString corrDirName =
"CorrelationPlots";
790 TDirectory* methodDir = fMethodBase->BaseDir();
796 Log() << kWARNING <<
"No basedir - BUG??" <<
Endl;
802 done = ((varDir!=0) || (type>4));
805 Log() << kWARNING <<
"No input variable directory found - BUG?" <<
Endl;
810 Log() << kWARNING <<
"No correlation directory found" <<
Endl;
811 Log() << kWARNING <<
"Check for other warnings related to correlation histograms" <<
Endl;
815 Log() << kWARNING <<
"No rulefit method directory found - BUG?" <<
Endl;
819 varDirName = varDir->
GetName();
825 Log() << kWARNING <<
"No correlation directory found : " << corrDirName <<
Endl;
831 Log() << kDEBUG <<
"Got number of plots = " << noPlots <<
Endl;
834 std::vector<TH2F *> h1Vector;
835 std::vector<TH2F *> h2CorrVector;
838 while ((key = (
TKey*)next())) {
844 Log() << kDEBUG <<
"Got histogram : " << hname <<
Endl;
858 h1Vector.push_back( newhist );
865 while ((key = (
TKey*)nextCorr())) {
874 Log() << kDEBUG <<
"Got histogram (2D) : " << hname <<
Endl;
882 TH2F *newhist =
new TH2F(newname,htitle,
885 if (GetCorrVars( newname, var1, var2 )) {
886 Int_t iv1 = fMethodBase->DataInfo().FindVarIndex(var1);
887 Int_t iv2 = fMethodBase->DataInfo().FindVarIndex(var2);
902 h2CorrVector.push_back( newhist );
908 UInt_t nrules = fRuleEnsemble.GetNRules();
910 for (
UInt_t i=0; i<nrules; i++) {
911 rule = fRuleEnsemble.GetRulesConst(i);
912 FillVisHistCut(rule, h1Vector);
915 FillVisHistCut(0, h1Vector);
916 NormVisHists(h1Vector);
921 for (
UInt_t i=0; i<nrules; i++) {
922 rule = fRuleEnsemble.GetRulesConst(i);
923 FillVisHistCorr(rule, h2CorrVector);
925 NormVisHists(h2CorrVector);
929 for (
UInt_t i=0; i<h1Vector.size(); i++) h1Vector[i]->Write();
930 for (
UInt_t i=0; i<h2CorrVector.size(); i++) h2CorrVector[i]->Write();
938 TDirectory* methodDir = fMethodBase->BaseDir();
940 Log() << kWARNING <<
"<MakeDebugHists> No rulefit method directory found - bug?" <<
Endl;
945 std::vector<Double_t> distances;
946 std::vector<Double_t> fncuts;
947 std::vector<Double_t> fnvars;
952 UInt_t nrules = fRuleEnsemble.GetNRules();
953 for (
UInt_t i=0; i<nrules; i++) {
954 ruleA = fRuleEnsemble.GetRulesConst(i);
955 for (
UInt_t j=i+1; j<nrules; j++) {
956 ruleB = fRuleEnsemble.GetRulesConst(j);
961 distances.push_back(dAB);
962 fncuts.push_back(static_cast<Double_t>(nc));
963 fnvars.push_back(static_cast<Double_t>(nv));
964 if (dAB<dABmin) dABmin=dAB;
965 if (dAB>dABmax) dABmax=dAB;
970 TH1F *histDist =
new TH1F(
"RuleDist",
"Rule distances",100,dABmin,dABmax);
971 TTree *distNtuple =
new TTree(
"RuleDistNtuple",
"RuleDist ntuple");
975 distNtuple->
Branch(
"dist", &ntDist,
"dist/D");
976 distNtuple->
Branch(
"ncuts",&ntNcuts,
"ncuts/D");
977 distNtuple->
Branch(
"nvars",&ntNvars,
"nvars/D");
979 for (
UInt_t i=0; i<distances.size(); i++) {
980 histDist->
Fill(distances[i]);
981 ntDist = distances[i];
void ForestStatistics()
summary of statistics of all trees
virtual const char * GetTitle() const
Returns title of object.
void SetPruneMethod(EPruneMethod m=kCostComplexityPruning)
void MakeForest()
make a forest of decisiontrees
virtual Int_t FindBin(Double_t x, Double_t y=0, Double_t z=0)
Return Global bin number corresponding to x,y,z.
virtual void Scale(Double_t c1=1, Option_t *option="")
Multiply this histogram by a constant c1.
virtual Int_t Fill(Double_t x)
Increment bin with abscissa X by 1.
J Friedman's RuleFit method.
Random number generator class based on the Mersenne Twister of M. Matsumoto and T. Nishimura.
MsgLogger & Endl(MsgLogger &ml)
virtual void SetMaximum(Double_t maximum=-1111)
const std::vector< const TMVA::Event * > & GetTrainingEvents() const
void CalcImportance()
calculates the importance of each rule
virtual TList * GetListOfKeys() const
virtual TObject * Get(const char *namecycle)
Return pointer to object identified by namecycle.
A class implementing various fits of rule ensembles.
void FillVisHistCorr(const Rule *rule, std::vector< TH2F * > &hlist)
help routine to MakeVisHists() - fills for all correlation plots
TString & ReplaceAll(const TString &s1, const TString &s2)
virtual Int_t Fill()
Fill all branches.
THist< 1, float, THistStatContent, THistStatUncertainty > TH1F
void SetMsgType(EMsgType t)
set the current message type to that of mlog for this class and all other subtools ...
Virtual base Class for all MVA method.
Bool_t GetCorrVars(TString &title, TString &var1, TString &var2)
get first and second variables from title
virtual Double_t GetBinLowEdge(Int_t bin) const
Return low edge of bin.
void InitNEveEff()
init effective number of events (using event weights)
virtual void SetMinimum(Double_t minimum=-1111)
void FitCoefficients()
Fit the coefficients for the rule ensemble.
1-D histogram with a float per channel (see TH1 documentation)
virtual Double_t GetBinWidth(Int_t bin) const
Return bin width.
virtual Int_t GetNbinsX() const
Bool_t BeginsWith(const char *s, ECaseCompare cmp=kExact) const
Double_t RuleDist(const Rule &other, Bool_t useCutValue) const
Returns:
Implementation of a rule.
RuleFit(void)
default constructor
void BuildTree(TMVA::DecisionTree *dt)
build the decision tree using fNTreeSample events from fTrainingEventsRndm
const MethodBase * GetMethodBase() const
void SetTrainingEvents(const std::vector< const TMVA::Event * > &el)
set the training events randomly
void GetRndmSampleEvents(std::vector< const TMVA::Event * > &evevec, UInt_t nevents)
draw a random subsample of the training events without replacement
Book space in a file, create I/O buffers, to fill them, (un)compress them.
virtual ~RuleFit(void)
destructor
ROOT::Math::KDTree< _DataPoint > * BuildTree(const std::vector< const _DataPoint * > &vDataPoints, const unsigned int iBucketSize)
const RuleCut * GetRuleCut() const
void SetMethodBase(const MethodBase *rfbase)
set MethodBase
Double_t CheckEvent(const TMVA::Event *, Bool_t UseYesNoLeaf=kFALSE) const
the event e is put into the decision tree (starting at the root node) and the output is NodeType (sig...
virtual TFile * GetFile() const
const std::vector< const TMVA::DecisionTree * > & GetForest() const
virtual Int_t Write(const char *name=0, Int_t option=0, Int_t bufsize=0)
Write this object to the current directory.
void RestoreEventWeights()
restore the event weights saved by SaveEventWeights() - must be done after boosting/making the forest
void FillVisHistCut(const Rule *rule, std::vector< TH2F * > &hlist)
help routine to MakeVisHists() - fills for all variables
void Copy(const RuleFit &other)
copy method
void Initialize(Bool_t useTMVAStyle=kTRUE)
Bool_t ContainsVariable(UInt_t iv) const
check if variable in node
void FillCorr(TH2F *h2, const TMVA::Rule *rule, Int_t v1, Int_t v2)
fill rule correlation between vx and vy, weighted with either the importance or the coefficient ...
void MakeDebugHists()
this will create histograms intended rather for debugging or for the curious user ...
static const Int_t randSEED
2-D histogram with a float per channel (see TH1 documentation)
Double_t GetImportance() const
const RuleEnsemble & GetRuleEnsemble() const
void SetPruneStrength(Double_t p)
Implementation of a Decision Tree.
void FillLin(TH2F *h2, Int_t vind)
fill lin
virtual const char * GetName() const
Returns name of object.
The ROOT global object gROOT contains a list of all defined classes.
Bool_t GetCutRange(Int_t sel, Double_t &rmin, Double_t &rmax, Bool_t &dormin, Bool_t &dormax) const
get cut range for a given selector
const MethodRuleFit * GetMethodRuleFit() const
void Boost(TMVA::DecisionTree *dt)
Boost the events.
virtual Int_t GetBin(Int_t binx, Int_t biny, Int_t binz=0) const
Return Global bin number corresponding to binx,y,z.
virtual Int_t FindBin(Double_t x)
Find bin number corresponding to abscissa x.
TString & Remove(Ssiz_t pos)
void SaveEventWeights()
save event weights - must be done before making the forest
Describe directory structure in memory.
void MakeVisHists()
this will create histograms visualizing the rule ensemble
UInt_t GetNumVarsUsed() const
you should not use this method at all (invalid Fill overload for this histogram type)
virtual Double_t Uniform(Double_t x1=1)
Returns a uniform deviate on the interval (0, x1).
void InitPtrs(const TMVA::MethodBase *rfbase)
initialize pointers
Double_t GetCoefficient() const
Double_t CalcWeightSum(const std::vector< const TMVA::Event * > *events, UInt_t neve=0)
calculate the sum of weights
void NormVisHists(std::vector< TH2F * > &hlist)
normalize rule importance hists
ostringstream derivative to redirect and format output
virtual Double_t GetBinCenter(Int_t bin) const
Return center of bin.
virtual Int_t Branch(TCollection *list, Int_t bufsize=32000, Int_t splitlevel=99, const char *name="")
Create one branch for each element in the collection.
virtual Int_t GetNbinsY() const
Double_t PruneTree(const EventConstList *validationSample=NULL)
prune (get rid of internal nodes) the Decision tree to avoid overtraining several different pruning m...
void FillCut(TH2F *h2, const TMVA::Rule *rule, Int_t vind)
Fill cut.
virtual Bool_t cd(const char *path=0)
Change current directory to "this" directory.
Double_t GetSupport() const
Bool_t Contains(const char *pat, ECaseCompare cmp=kExact) const
UInt_t BuildTree(const EventConstList &eventSample, DecisionTreeNode *node=NULL)
building the decision tree by recursively calling the splitting of one (root-) node into two daughter...
Double_t EvalEvent(const Event &e)
evaluate single event
A TTree object has a header with a name and a title.
Bool_t InheritsFrom(const char *cl) const
Return kTRUE if this class inherits from a class with name "classname".
void Initialize(const TMVA::MethodBase *rfbase)
initialize the parameters of the RuleFit method and make rules
Double_t Sqrt(Double_t x)
Ssiz_t Index(const char *pat, Ssiz_t i=0, ECaseCompare cmp=kExact) const
virtual Double_t GetMaximum(Double_t maxval=FLT_MAX) const
Return maximum value smaller than maxval of bins in the range, unless the value has been overridden b...
Int_t Fill(Double_t)
Invalid Fill method.
virtual void SetTitle(const char *title="")
Set the title of the TNamed.
THist< 2, float, THistStatContent, THistStatUncertainty > TH2F
Timing information for training and evaluation of MVA methods.
virtual Double_t GetMinimum(Double_t minval=-FLT_MAX) const
Return minimum value larger than minval of bins in the range, unless the value has been overridden by...