123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171 |
- local dt = require "decisiontree._env"
-
- local bm = {}
- function bm.CartTrainer(opt)
- local timer = torch.Timer()
- local trainSet, validSet = dt.getSparseDummyData(opt)
- print(string.format("CartTrainer: sparse dataset create: %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
-
- local cartTrainer = dt.CartTrainer(trainSet, opt.minLeafSize, opt.maxLeafNodes)
- local treeState = dt.GiniState(trainSet:getExampleIds())
- timer:reset()
- local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
- print(string.format("CartTrainer: train single-thread : %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
-
- timer:reset()
- cartTrainer:featureParallel(opt.nThread)
- print(string.format("CartTrainer: setup feature-parallel : %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
- timer:reset()
- local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
- print(string.format("CartTrainer: train feature-parallel : %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
- end
-
- function bm.GradientBoostState(opt)
- local trainSet, validSet = dt.getSparseDummyData(opt)
-
- trainSet:initScore()
-
- local treeState = dt.GradientBoostState(trainSet:getExampleIds(), nn.LogitBoostCriterion(false))
-
- local timer = torch.Timer() -- first step also calls SparseTensor:buildIndex()
- treeState:findBestSplit(trainSet, trainSet.featureIds, 10, 1, 3)
- print(string.format("GradientBoostState: findBestSplit (first) : %f sec", timer:time().real))
-
- timer:reset()
- treeState:findBestSplit(trainSet, trainSet.featureIds, 10, 1, 3)
- print(string.format("GradientBoostState: findBestSplit (second) : %f sec", timer:time().real))
-
- end
-
- local function file_exists(name)
- local f=io.open(name,"r")
- if f~=nil then io.close(f) return true else return false end
- end
-
- function bm.GradientBoostTrainer(opt)
- local trainSet, validSet
- if file_exists("/tmp/train.bin") and file_exists("/tmp/valid.bin") then
- trainSet = torch.load("/tmp/train.bin")
- validSet = torch.load("/tmp/valid.bin")
- else
- if opt.sparse then
- trainSet, validSet = dt.getSparseDummyData(opt)
- else
- trainSet, validSet = dt.getDenseDummyData(opt)
- end
- torch.save("/tmp/train.bin", trainSet)
- torch.save("/tmp/valid.bin", validSet)
- end
-
- local cartTrainer = dt.CartTrainer(trainSet, opt.minLeafSize, opt.maxLeafNodes)
- opt.lossFunction = nn.LogitBoostCriterion(false)
- opt.treeTrainer = cartTrainer
- local forestTrainer = dt.GradientBoostTrainer(opt)
-
- local timer = torch.Timer()
- local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds, validSet)
- local time = timer:time().real
- print(string.format("GradientBoostTrainer: train single-thread : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
-
- cartTrainer:featureParallel(opt.nThread)
- timer:reset()
- local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds, validSet)
- local time = timer:time().real
- print(string.format("GradientBoostTrainer: train feature-parallel : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
- end
-
- function bm.RandomForestTrainer(opt)
- local trainSet, validSet = dt.getSparseDummyData(opt)
-
- local forestTrainer = dt.RandomForestTrainer(opt)
- local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
-
- local timer = torch.Timer()
- local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
- local time = timer:time().real
- print(string.format("RandomForestTrainer: train single-thread : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
-
- timer:reset()
- forestTrainer:treeParallel(opt.nThread)
- print(string.format("RandomForestTrainer: setup tree-parallel : %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
-
- timer:reset()
- local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
- local time = timer:time().real
- print(string.format("RandomForestTrainer: train tree-parallel : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
- end
-
- function bm.DFD(opt)
- local _ = require 'moses'
- local opt = _.clone(opt)
- opt.nExample = 200
- local trainSet, validSet = dt.getDenseDummyData(opt)
-
- local forestTrainer = dt.RandomForestTrainer(opt)
- forestTrainer:treeParallel(opt.nThread)
- local timer = torch.Timer()
- local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
- local time = timer:time().real
- print(string.format("DFD: train random forest in parallel : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
-
-
- -- benchmark nn.DFD
- local input = trainSet.input:sub(1,opt.batchsize)
- local dfd = nn.DFD(decisionForest)
- dfd:forward(input)
- timer:reset()
- for i=1,opt.nloop do
- dfd:forward(input)
- end
- print(string.format("DFD: updateOutput : %f samples/sec; %f sec", opt.nloop*opt.batchsize/timer:time().real, timer:time().real))
- end
-
- function bm.Sparse2Dense(opt)
- local _ = require 'moses'
- local opt = _.clone(opt)
- opt.nExample = opt.batchsize
- local trainSet = dt.getSparseDummyData(opt)
-
- local input = {{},{}}
- for i=1,opt.batchsize do
- input[1][i] = trainSet.input[i].keys
- input[2][i] = trainSet.input[i].values
- end
- assert(#input[1] == opt.batchsize)
-
- -- benchmark nn.Sparse2Dense
- local s2d = nn.Sparse2Dense(torch.LongTensor():range(1,opt.nFeature))
- s2d:forward(input)
- local timer = torch.Timer()
- for i=1,opt.nloop do
- s2d:forward(input)
- end
- print(string.format("Sparse2Dense: updateOutput : %f samples/sec; %f sec", opt.nloop*opt.batchsize/timer:time().real, timer:time().real))
- end
-
- function dt.benchmark(benchmarks, opt2)
- local opt = {
- nExample=10000, nCluster=2, nFeature=1000, overlap=0, nValid=100, -- getSparseDummyData
- nTree=20, featureBaggingSize=-1, sparse=true, -- GradientBoostTrainer and RandomForestTrainer
- nThread=2, shrinkage=0.1, downsampleRatio=0.1, evalFreq=5, earlyStop=0, -- GradientBoostTrainer
- activeRatio=0.5, -- RandomForestTrainer
- batchsize=32, nloop=10
- }
-
- local _ = require 'moses'
- benchmarks = benchmarks or _.keys(bm)
- assert(torch.type(benchmarks) == 'table')
- for i,benchmark in ipairs(benchmarks) do
- local opt1 = _.clone(opt)
- for key, value in pairs(opt2 or {}) do
- opt1[key] = value
- end
- opt1.nActive = opt1.nActive or torch.round(opt1.nFeature/10)
- opt1.maxLeafNodes = opt1.maxLeafNodes or (opt1.nExample/10)
- opt1.minLeafSize = opt1.minLeafSize or (opt1.nExample/100)
-
- assert(torch.type(benchmark) == 'string', benchmark)
- assert(bm[benchmark], benchmark)
- bm[benchmark](opt1)
- end
- end
|