-- contrib/torch/decisiontree/benchmark.lua

local dt = require "decisiontree._env"

-- Registry of benchmark functions keyed by benchmark name; dt.benchmark
-- looks each requested name up in this table and calls it with an options table.
local bm = {}
--- Benchmark dt.CartTrainer on a sparse dummy dataset.
-- Prints one line per phase: dataset creation, single-threaded training,
-- feature-parallel setup and feature-parallel training.
-- @param opt options table; opt.nExample, opt.minLeafSize, opt.maxLeafNodes
--   and opt.nThread are read here, and opt itself is forwarded to
--   dt.getSparseDummyData.
function bm.CartTrainer(opt)
   local timer = torch.Timer()

   -- Sample the timer exactly once per phase so that the throughput and the
   -- duration printed on the same line come from the same measurement.
   local function report(fmt)
      local elapsed = timer:time().real
      print(string.format(fmt, opt.nExample/elapsed, elapsed))
   end

   -- Only the training set is needed; the validation set is discarded.
   local trainSet = dt.getSparseDummyData(opt)
   report("CartTrainer: sparse dataset create: %f samples/sec; %f sec")

   local cartTrainer = dt.CartTrainer(trainSet, opt.minLeafSize, opt.maxLeafNodes)
   local treeState = dt.GiniState(trainSet:getExampleIds())
   timer:reset()
   cartTrainer:train(treeState, trainSet.featureIds)
   report("CartTrainer: train single-thread : %f samples/sec; %f sec")

   timer:reset()
   cartTrainer:featureParallel(opt.nThread)
   report("CartTrainer: setup feature-parallel : %f samples/sec; %f sec")

   -- NOTE(review): the same treeState is reused for the second run, as in the
   -- original code; confirm that train() does not leave it in a modified state.
   timer:reset()
   cartTrainer:train(treeState, trainSet.featureIds)
   report("CartTrainer: train feature-parallel : %f samples/sec; %f sec")
end

--- Benchmark two consecutive GradientBoostState:findBestSplit calls on a
-- sparse dummy dataset and print the elapsed time of each.
-- @param opt options table forwarded to dt.getSparseDummyData.
function bm.GradientBoostState(opt)
   local trainSet = dt.getSparseDummyData(opt)
   trainSet:initScore()

   local exampleIds = trainSet:getExampleIds()
   local criterion = nn.LogitBoostCriterion(false)
   local treeState = dt.GradientBoostState(exampleIds, criterion)

   -- Runs one split search and returns the seconds elapsed on `stopwatch`.
   local function timedSplit(stopwatch)
      treeState:findBestSplit(trainSet, trainSet.featureIds, 10, 1, 3)
      return stopwatch:time().real
   end

   -- first step also calls SparseTensor:buildIndex()
   local stopwatch = torch.Timer()
   print(string.format("GradientBoostState: findBestSplit (first) : %f sec", timedSplit(stopwatch)))

   stopwatch:reset()
   print(string.format("GradientBoostState: findBestSplit (second) : %f sec", timedSplit(stopwatch)))
end

-- Return true when `name` can be opened for reading, false otherwise.
-- The handle is closed again before returning.
local function file_exists(name)
   local handle = io.open(name, "r")
   if handle == nil then
      return false
   end
   io.close(handle)
   return true
end

--- Benchmark dt.GradientBoostTrainer, first single-threaded and then with a
-- feature-parallel CartTrainer.
-- The dummy dataset is cached on disk under /tmp with torch.save and reloaded
-- with torch.load on later runs. The cache file names encode the dataset
-- options, so a cache written for one configuration is never reused for a
-- different one (a single fixed file name would hand back a stale dataset and
-- make the reported samples/sec refer to the wrong example count).
-- Side effect: sets opt.lossFunction and opt.treeTrainer on the given table.
-- @param opt options table; opt.sparse, opt.minLeafSize, opt.maxLeafNodes,
--   opt.nThread, opt.nExample and opt.nTree are read here, and opt is
--   forwarded to the dummy-data generator and to dt.GradientBoostTrainer.
function bm.GradientBoostTrainer(opt)
   -- Build the cache path for one split ("train" or "valid").
   -- The keys below are the ones dt.benchmark's defaults attribute to the
   -- dummy-data generators; extend the list if the generators turn out to
   -- read further options (their source is not visible from this file).
   local function cachePath(split)
      local parts = {split, opt.sparse and "sparse" or "dense"}
      for _, key in ipairs({"nExample", "nValid", "nFeature", "nActive", "nCluster", "overlap"}) do
         parts[#parts + 1] = key .. tostring(opt[key])
      end
      return "/tmp/" .. table.concat(parts, "_") .. ".bin"
   end
   local trainPath, validPath = cachePath("train"), cachePath("valid")

   local trainSet, validSet
   if file_exists(trainPath) and file_exists(validPath) then
      trainSet = torch.load(trainPath)
      validSet = torch.load(validPath)
   else
      if opt.sparse then
         trainSet, validSet = dt.getSparseDummyData(opt)
      else
         trainSet, validSet = dt.getDenseDummyData(opt)
      end
      torch.save(trainPath, trainSet)
      torch.save(validPath, validSet)
   end

   local cartTrainer = dt.CartTrainer(trainSet, opt.minLeafSize, opt.maxLeafNodes)
   opt.lossFunction = nn.LogitBoostCriterion(false)
   opt.treeTrainer = cartTrainer
   local forestTrainer = dt.GradientBoostTrainer(opt)

   local timer = torch.Timer()
   forestTrainer:train(trainSet, trainSet.featureIds, validSet)
   local elapsed = timer:time().real
   print(string.format("GradientBoostTrainer: train single-thread : %f samples/sec; %f sec/tree, %f sec", opt.nExample/elapsed, elapsed/opt.nTree, elapsed))

   -- The feature-parallel setup is done before the timer is reset, so it is
   -- not part of the second measurement.
   cartTrainer:featureParallel(opt.nThread)
   timer:reset()
   forestTrainer:train(trainSet, trainSet.featureIds, validSet)
   elapsed = timer:time().real
   print(string.format("GradientBoostTrainer: train feature-parallel : %f samples/sec; %f sec/tree, %f sec", opt.nExample/elapsed, elapsed/opt.nTree, elapsed))
end

--- Benchmark dt.RandomForestTrainer on a sparse dummy dataset.
-- Prints three lines: single-threaded training, tree-parallel setup and
-- tree-parallel training.
-- @param opt options table; opt.nExample, opt.nTree and opt.nThread are read
--   here, and opt is forwarded to dt.getSparseDummyData and
--   dt.RandomForestTrainer.
function bm.RandomForestTrainer(opt)
   -- Only the training set is needed; the validation set is discarded.
   local trainSet = dt.getSparseDummyData(opt)
   local forestTrainer = dt.RandomForestTrainer(opt)

   -- Untimed run before any measurement (presumably a warm-up -- confirm).
   forestTrainer:train(trainSet, trainSet.featureIds)

   local timer = torch.Timer()
   forestTrainer:train(trainSet, trainSet.featureIds)
   local elapsed = timer:time().real
   print(string.format("RandomForestTrainer: train single-thread : %f samples/sec; %f sec/tree, %f sec", opt.nExample/elapsed, elapsed/opt.nTree, elapsed))

   timer:reset()
   forestTrainer:treeParallel(opt.nThread)
   -- Read the timer once so both printed figures share the same measurement.
   elapsed = timer:time().real
   print(string.format("RandomForestTrainer: setup tree-parallel : %f samples/sec; %f sec", opt.nExample/elapsed, elapsed))

   timer:reset()
   forestTrainer:train(trainSet, trainSet.featureIds)
   elapsed = timer:time().real
   print(string.format("RandomForestTrainer: train tree-parallel : %f samples/sec; %f sec/tree, %f sec", opt.nExample/elapsed, elapsed/opt.nTree, elapsed))
end

--- Benchmark nn.DFD:forward on a random forest trained on dense dummy data.
-- Prints the tree-parallel training time of the forest and then the
-- throughput of repeated forward passes over one batch.
-- @param opt options table; it is cloned before use, and the clone's
--   nExample is forced to 200. opt.nThread, opt.nTree, opt.batchsize and
--   opt.nloop are read here.
function bm.DFD(opt)
   local moses = require 'moses'
   -- Work on a copy so the nExample override does not leak to the caller.
   local cfg = moses.clone(opt)
   cfg.nExample = 200
   local trainSet = dt.getDenseDummyData(cfg)

   local forestTrainer = dt.RandomForestTrainer(cfg)
   forestTrainer:treeParallel(cfg.nThread)
   local timer = torch.Timer()
   local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
   local elapsed = timer:time().real
   print(string.format("DFD: train random forest in parallel : %f samples/sec; %f sec/tree, %f sec", cfg.nExample/elapsed, elapsed/cfg.nTree, elapsed))

   -- benchmark nn.DFD
   local input = trainSet.input:sub(1,cfg.batchsize)
   local dfd = nn.DFD(decisionForest)
   -- One untimed forward pass before the measured loop.
   dfd:forward(input)
   timer:reset()
   for _ = 1, cfg.nloop do
      dfd:forward(input)
   end
   -- Read the timer once so both printed figures share the same measurement.
   elapsed = timer:time().real
   print(string.format("DFD: updateOutput : %f samples/sec; %f sec", cfg.nloop*cfg.batchsize/elapsed, elapsed))
end

--- Benchmark nn.Sparse2Dense:forward on one batch of sparse dummy examples.
-- @param opt options table; it is cloned before use, and the clone's
--   nExample is set to opt.batchsize. opt.batchsize, opt.nFeature and
--   opt.nloop are read here.
function bm.Sparse2Dense(opt)
   local moses = require 'moses'
   -- Work on a copy so the nExample override does not leak to the caller.
   local cfg = moses.clone(opt)
   cfg.nExample = cfg.batchsize
   local trainSet = dt.getSparseDummyData(cfg)

   -- Module input is a pair of parallel lists: the keys and the values of
   -- each example in the batch.
   local input = {{},{}}
   for i = 1, cfg.batchsize do
      local example = trainSet.input[i]
      input[1][i] = example.keys
      input[2][i] = example.values
   end
   assert(#input[1] == cfg.batchsize)

   -- benchmark nn.Sparse2Dense
   local s2d = nn.Sparse2Dense(torch.LongTensor():range(1,cfg.nFeature))
   -- One untimed forward pass before the measured loop.
   s2d:forward(input)
   local timer = torch.Timer()
   for _ = 1, cfg.nloop do
      s2d:forward(input)
   end
   -- Read the timer once so both printed figures share the same measurement.
   local elapsed = timer:time().real
   print(string.format("Sparse2Dense: updateOutput : %f samples/sec; %f sec", cfg.nloop*cfg.batchsize/elapsed, elapsed))
end

--- Run one or more of the benchmarks registered in `bm`.
-- Each benchmark receives its own fresh copy of the options, built from the
-- defaults below overridden by `opt2`, so option changes made by one
-- benchmark do not reach the next.
-- @param benchmarks optional array of benchmark names (keys of `bm`). When
--   omitted, every registered benchmark is run in sorted name order so that
--   successive runs execute in the same sequence.
-- @param opt2 optional table whose entries override the default options.
function dt.benchmark(benchmarks, opt2)
   local opt = {
      nExample=10000, nCluster=2, nFeature=1000, overlap=0, nValid=100,        -- getSparseDummyData
      nTree=20, featureBaggingSize=-1, sparse=true,                           -- GradientBoostTrainer and RandomForestTrainer
      nThread=2, shrinkage=0.1, downsampleRatio=0.1, evalFreq=5, earlyStop=0, -- GradientBoostTrainer
      activeRatio=0.5,                                                      -- RandomForestTrainer
      batchsize=32, nloop=10
   }

   local moses = require 'moses'
   if not benchmarks then
      benchmarks = moses.keys(bm)
      -- Key order from a hash table is unspecified; sort for reproducible runs.
      table.sort(benchmarks)
   end
   assert(torch.type(benchmarks) == 'table',
      "benchmarks must be a table of benchmark names, got " .. torch.type(benchmarks))

   for _, benchmark in ipairs(benchmarks) do
      assert(torch.type(benchmark) == 'string',
         "benchmark name must be a string, got " .. torch.type(benchmark))
      assert(bm[benchmark], "unknown benchmark: " .. benchmark)

      local opt1 = moses.clone(opt)
      for key, value in pairs(opt2 or {}) do
         opt1[key] = value
      end
      -- Derived defaults, applied after the overrides so they follow the
      -- effective nFeature/nExample unless set explicitly.
      opt1.nActive = opt1.nActive or torch.round(opt1.nFeature/10)
      opt1.maxLeafNodes = opt1.maxLeafNodes or (opt1.nExample/10)
      opt1.minLeafSize = opt1.minLeafSize or (opt1.nExample/100)

      bm[benchmark](opt1)
   end
end