author    Vsevolod Stakhov <vsevolod@highsecure.ru>  2019-07-01 15:13:04 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru>  2019-07-01 15:13:04 +0100
commit    891b250b452f8e1963a99931f241ac75e34d0281 (patch)
tree      ab56b822aca3cc6d02a3c9afbe8ca2f6d1c0381f /contrib/lua-torch/decisiontree
parent    38691d998d019ac0fba95720c337e3f9badf55c4 (diff)
[Project] Remove torch
Diffstat (limited to 'contrib/lua-torch/decisiontree')
-rw-r--r--  contrib/lua-torch/decisiontree/CMakeLists.txt                    |  51
-rw-r--r--  contrib/lua-torch/decisiontree/CartNode.lua                      |  42
-rw-r--r--  contrib/lua-torch/decisiontree/CartTrainer.lua                   | 180
-rw-r--r--  contrib/lua-torch/decisiontree/CartTree.lua                      |  90
-rw-r--r--  contrib/lua-torch/decisiontree/DFD.lua                           | 182
-rw-r--r--  contrib/lua-torch/decisiontree/DataSet.lua                       | 142
-rw-r--r--  contrib/lua-torch/decisiontree/DecisionForest.lua                |  82
-rw-r--r--  contrib/lua-torch/decisiontree/DecisionForestTrainer.lua         |  22
-rw-r--r--  contrib/lua-torch/decisiontree/DecisionTree.lua                  |  12
-rw-r--r--  contrib/lua-torch/decisiontree/GBDT_common.h                     | 106
-rw-r--r--  contrib/lua-torch/decisiontree/GiniState.lua                     |  54
-rw-r--r--  contrib/lua-torch/decisiontree/GradientBoostState.lua            |  57
-rw-r--r--  contrib/lua-torch/decisiontree/GradientBoostTrainer.lua          | 244
-rw-r--r--  contrib/lua-torch/decisiontree/LICENSE                           | 201
-rw-r--r--  contrib/lua-torch/decisiontree/LogitBoostCriterion.lua           |  45
-rw-r--r--  contrib/lua-torch/decisiontree/MSECriterion.lua                  |  13
-rw-r--r--  contrib/lua-torch/decisiontree/README.md                         | 386
-rw-r--r--  contrib/lua-torch/decisiontree/RandomForestTrainer.lua           | 159
-rw-r--r--  contrib/lua-torch/decisiontree/Sparse2Dense.lua                  |  88
-rw-r--r--  contrib/lua-torch/decisiontree/SparseTensor.lua                  |  54
-rw-r--r--  contrib/lua-torch/decisiontree/TreeState.lua                     | 191
-rw-r--r--  contrib/lua-torch/decisiontree/WorkPool.lua                      | 156
-rw-r--r--  contrib/lua-torch/decisiontree/_env.lua                          |   5
-rw-r--r--  contrib/lua-torch/decisiontree/benchmark.lua                     | 171
-rw-r--r--  contrib/lua-torch/decisiontree/doc/benchmark.md                  | 291
-rw-r--r--  contrib/lua-torch/decisiontree/error.h                           |  24
-rw-r--r--  contrib/lua-torch/decisiontree/generic/CartTree.c                |  88
-rw-r--r--  contrib/lua-torch/decisiontree/generic/DFD.c                     | 157
-rw-r--r--  contrib/lua-torch/decisiontree/generic/GBDT.c                    | 392
-rw-r--r--  contrib/lua-torch/decisiontree/generic/GBDT_internal.c           | 312
-rw-r--r--  contrib/lua-torch/decisiontree/generic/GBDT_internal.h           |  34
-rw-r--r--  contrib/lua-torch/decisiontree/generic/LogitBoostCriterion.c     |  90
-rw-r--r--  contrib/lua-torch/decisiontree/generic/S2D.c                     |  90
-rw-r--r--  contrib/lua-torch/decisiontree/hash_map.c                        | 445
-rw-r--r--  contrib/lua-torch/decisiontree/hash_map.h                        |  36
-rw-r--r--  contrib/lua-torch/decisiontree/init.c                            |  77
-rw-r--r--  contrib/lua-torch/decisiontree/init.lua                          |  70
-rw-r--r--  contrib/lua-torch/decisiontree/internal_hash_map.h               |  13
-rw-r--r--  contrib/lua-torch/decisiontree/khash.h                           | 627
-rw-r--r--  contrib/lua-torch/decisiontree/rocks/decisiontree-scm-1.rockspec |  40
-rw-r--r--  contrib/lua-torch/decisiontree/test.lua                          | 817
-rw-r--r--  contrib/lua-torch/decisiontree/utils.h                           |  45
-rw-r--r--  contrib/lua-torch/decisiontree/utils.lua                         | 125
44 files changed, 0 insertions, 6590 deletions
diff --git a/contrib/lua-torch/decisiontree/CMakeLists.txt b/contrib/lua-torch/decisiontree/CMakeLists.txt
deleted file mode 100644
index b94b4deb2..000000000
--- a/contrib/lua-torch/decisiontree/CMakeLists.txt
+++ /dev/null
@@ -1,51 +0,0 @@
-LIST(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}")
-
-SET(src
- init.c
- hash_map.c
-)
-SET(luasrc
- _env.lua
- benchmark.lua
- CartNode.lua
- CartTrainer.lua
- CartTree.lua
- DataSet.lua
- DecisionForest.lua
- DecisionForestTrainer.lua
- DecisionTree.lua
- DFD.lua
- GiniState.lua
- GradientBoostState.lua
- GradientBoostTrainer.lua
- init.lua
- LogitBoostCriterion.lua
- math.lua
- MSECriterion.lua
- RandomForestTrainer.lua
- Sparse2Dense.lua
- SparseTensor.lua
- test.lua
- TreeState.lua
- utils.lua
- WorkPool.lua
-)
-
-IF (WITH_OPENMP)
- FIND_PACKAGE(OpenMP)
- IF(OPENMP_FOUND)
- MESSAGE(STATUS "Compiling with OpenMP support")
- SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
- SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
- SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
- ENDIF(OPENMP_FOUND)
-ENDIF (WITH_OPENMP)
-
-ADD_TORCH_PACKAGE(decisiontree "${src}" "${luasrc}" "A decision tree library, for Torch")
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
-### Torch packages assume the library prefix is "lib"
-SET_TARGET_PROPERTIES(decisiontree PROPERTIES
- PREFIX "lib"
- IMPORT_PREFIX "lib")
-TARGET_LINK_LIBRARIES(decisiontree ${TH_LIBRARIES})
-INSTALL(TARGETS decisiontree DESTINATION ${RSPAMD_LIBDIR})
diff --git a/contrib/lua-torch/decisiontree/CartNode.lua b/contrib/lua-torch/decisiontree/CartNode.lua
deleted file mode 100644
index e6a85006c..000000000
--- a/contrib/lua-torch/decisiontree/CartNode.lua
+++ /dev/null
@@ -1,42 +0,0 @@
-local dt = require 'decisiontree._env'
-local CartNode = torch.class("dt.CartNode", dt)
-
-function CartNode:__init(nodeId, leftChild, rightChild, splitFeatureId, splitFeatureValue, score, splitGain)
- self.nodeId = nodeId or 0
- self.leftChild = leftChild
- self.rightChild = rightChild
- self.splitFeatureId = splitFeatureId or -1
- self.splitFeatureValue = splitFeatureValue or 0
- self.score = score or 0
- self.splitGain = splitGain
-end
-
-function CartNode:__tostring__()
- return self:recursivetostring()
-end
-
-function CartNode:recursivetostring(indent)
- indent = indent or ' '
-
- -- Is this a leaf node?
- local res = ''
- if not (self.leftChild or self.rightChild) then
- res = res .. self.score .. '\n'
- else
- -- Print the criteria
-      res = res .. 'input[' .. self.splitFeatureId .. '] < ' .. self.splitFeatureValue .. '?\n'
-
- -- Print the branches
- if self.leftChild then
- res = res .. indent .. 'True->' .. self.leftChild:recursivetostring(indent .. ' ')
- end
- if self.rightChild then
- res = res .. indent .. 'False->' .. self.rightChild:recursivetostring(indent .. ' ')
- end
- end
- return res
-end
-
-function CartNode:clone()
- return CartNode(self.nodeId, self.leftChild, self.rightChild, self.splitFeatureId, self.splitFeatureValue, self.score, self.splitGain)
-end
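
For orientation, a minimal sketch (not from the original sources) of how the constructor and recursivetostring above compose into a printable one-split tree, using only the dt environment defined in this file:

   local dt = require 'decisiontree._env'

   -- two leaves (no children): only nodeId and score matter here
   local leftLeaf  = dt.CartNode(1, nil, nil, -1, 0, 0.9)
   local rightLeaf = dt.CartNode(2, nil, nil, -1, 0, 0.1)
   -- root splits on input[7] < 0.5; left is the true branch, right the false branch
   local root = dt.CartNode(0, leftLeaf, rightLeaf, 7, 0.5)
   print(root)  -- __tostring__ delegates to recursivetostring()
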
diff --git a/contrib/lua-torch/decisiontree/CartTrainer.lua b/contrib/lua-torch/decisiontree/CartTrainer.lua
deleted file mode 100644
index 63ae6c148..000000000
--- a/contrib/lua-torch/decisiontree/CartTrainer.lua
+++ /dev/null
@@ -1,180 +0,0 @@
-local dt = require "decisiontree._env"
-local _ = require "moses"
-
-local CartTrainer = torch.class("dt.CartTrainer", dt)
-
--- Generic CART trainer
-function CartTrainer:__init(dataset, minLeafSize, maxLeafNodes)
- assert(torch.isTypeOf(dataset, 'dt.DataSet'))
- self.dataset = dataset
- self.minLeafSize = assert(minLeafSize) -- min examples per leaf
- self.maxLeafNodes = assert(maxLeafNodes) -- max leaf nodes in tree
-
- -- by default, single thread
- self.parallelMode = 'singlethread'
-end
-
-function CartTrainer:train(rootTreeState, activeFeatures)
- assert(torch.isTypeOf(rootTreeState, 'dt.TreeState'))
- assert(torch.isTensor(activeFeatures))
- local root = dt.CartNode()
- root.id = 0
- root.score = rootTreeState:score(self.dataset)
-
- local nleaf = 1
-
- -- TODO : nodeparallel: parallelize here. The queue is a workqueue.
- local queue = {}
- table.insert(queue, 1, {cartNode=root, treeState=rootTreeState})
-
- while #queue > 0 and nleaf < self.maxLeafNodes do
- local treeGrowerArgs = table.remove(queue, #queue)
- local currentTreeState = treeGrowerArgs.treeState
-
- -- Note: if minLeafSize = 1 and maxLeafNode = inf, then each example will be its own leaf...
- if self:hasEnoughTrainingExamplesToSplit(currentTreeState.exampleIds:size(1)) then
- nleaf = self:processNode(nleaf, queue, treeGrowerArgs.cartNode, currentTreeState, activeFeatures)
- end
- end
-
- -- CartTree with random branching (when feature is missing)
- local branchleft = function() return math.random() < 0.5 end
- return dt.CartTree(root, branchleft), nleaf
-end
-
-function CartTrainer:processNode(nleaf, queue, node, treeState, activeFeatures)
- local bestSplit
- if self.parallelMode == 'singlethread' then
- bestSplit = self:findBestSplitForAllFeatures(treeState, activeFeatures)
- elseif self.parallelMode == 'featureparallel' then
- bestSplit = self:findBestSplitForAllFeaturesFP(treeState, activeFeatures)
- else
- error("Unrecognized parallel mode: " .. self.parallelMode)
- end
-
- if bestSplit then
- local leftTreeState, rightTreeState = treeState:branch(bestSplit, self.dataset)
- assert(bestSplit.leftChildSize + bestSplit.rightChildSize == leftTreeState.exampleIds:size(1) + rightTreeState.exampleIds:size(1), "The left and right subtrees don't match the split found!")
- self:setValuesAndCreateChildrenForNode(node, bestSplit, leftTreeState, rightTreeState, nleaf)
-
- table.insert(queue, 1, {cartNode=node.leftChild, treeState=leftTreeState})
- table.insert(queue, 1, {cartNode=node.rightChild, treeState=rightTreeState})
-
- return nleaf + 1
- end
-
- return nleaf
-end
-
-function CartTrainer:findBestSplitForAllFeatures(treeState, activeFeatures)
- local timer = torch.Timer()
- local bestSplit = treeState:findBestSplit(self.dataset, activeFeatures, self.minLeafSize, -1, -1)
-
- if bestSplit then
- assert(torch.type(bestSplit) == 'table')
- end
-
- if dt.PROFILE then
- print("findBestSplitForAllFeatures time="..timer:time().real)
- end
- return bestSplit
-end
-
--- Updates the parentNode with the bestSplit information by creating left/right child nodes.
-function CartTrainer:setValuesAndCreateChildrenForNode(parentNode, bestSplit, leftState, rightState, nleaf)
- assert(torch.isTypeOf(parentNode, 'dt.CartNode'))
- assert(torch.type(bestSplit) == 'table')
- assert(torch.isTypeOf(leftState, 'dt.TreeState'))
- assert(torch.isTypeOf(rightState, 'dt.TreeState'))
- assert(torch.type(nleaf) == 'number')
-
- local leftChild = dt.CartNode()
- leftChild.score = leftState:score(self.dataset)
- leftChild.nodeId = 2 * nleaf - 1
-
- local rightChild = dt.CartNode()
- rightChild.score = rightState:score(self.dataset)
- rightChild.nodeId = 2 * nleaf
-
- parentNode.splitFeatureId = bestSplit.splitId
- parentNode.splitFeatureValue = bestSplit.splitValue
- parentNode.leftChild = leftChild
- parentNode.rightChild = rightChild
- parentNode.splitGain = bestSplit.splitGain
-end
-
--- We minimally need 2 * N examples in the parent to satisfy >= N examples per child
-function CartTrainer:hasEnoughTrainingExamplesToSplit(count)
- return count >= 2 * self.minLeafSize
-end
-
--- call before training to enable feature-parallelization
-function CartTrainer:featureParallel(workPool)
- assert(self.parallelMode == 'singlethread', self.parallelMode)
- self.parallelMode = 'featureparallel'
- self.workPool = torch.type(workPool) == 'number' and dt.WorkPool(workPool) or workPool
- assert(torch.isTypeOf(self.workPool, 'dt.WorkPool'))
-
- -- this deletes all SparseTensor hash maps so that they aren't serialized
- self.dataset:deleteIndex()
-
- -- require the dt package
- self.workPool:update('require', {libname='decisiontree',varname='dt'})
- -- setup worker store (each worker will have its own copy)
- local store = {
- dataset=self.dataset,
- minLeafSize=self.minLeafSize
- }
- self.workPool:update('storeKeysValues', store)
-end
-
--- feature parallel
-function CartTrainer:findBestSplitForAllFeaturesFP(treeState, activeFeatures)
- local timer = torch.Timer()
- local bestSplit
- if treeState.findBestSplitFP then
- bestSplit = treeState:findBestSplitFP(self.dataset, activeFeatures, self.minLeafSize, self.workPool.nThread)
- end
-
- if not bestSplit then
- for i=1,self.workPool.nThread do
- -- upvalues
- local treeState = treeState
- local shardId = i
- local nShard = self.workPool.nThread
- local featureIds = activeFeatures
- -- closure
- local task = function(store)
- assert(store.dataset)
- assert(store.minLeafSize)
- if treeState.threadInitialize then
- treeState:threadInitialize()
- end
-
- local bestSplit = treeState:findBestSplit(store.dataset, featureIds, store.minLeafSize, shardId, nShard)
- return bestSplit
- end
-
- self.workPool:writeup('execute', task)
- end
-
- for i=1,self.workPool.nThread do
- local taskname, candidateSplit = self.workPool:read()
- assert(taskname == 'execute')
- if candidateSplit then
- if ((not bestSplit) or candidateSplit.splitGain < bestSplit.splitGain) then
- bestSplit = candidateSplit
- end
- end
- end
- end
-
- if bestSplit then
- assert(torch.type(bestSplit) == 'table')
- end
-
- if dt.PROFILE then
- print("findBestSplitForAllFeaturesFP time="..timer:time().real)
- end
- return bestSplit
-end
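
To see how the pieces fit, a single-threaded training sketch using classes defined elsewhere in this diff (dt.DataSet, dt.GiniState); the tensor shapes are illustrative only:

   local dt = require 'decisiontree'

   local input  = torch.randn(100, 10)            -- 100 examples, 10 dense features
   local target = torch.Tensor(100):random(0, 1)  -- binary labels
   local dataset = dt.DataSet(input, target)

   local trainer = dt.CartTrainer(dataset, 5, 8)  -- minLeafSize=5, maxLeafNodes=8
   local rootState = dt.GiniState(dataset:getExampleIds())
   local tree, nleaf = trainer:train(rootState, dataset.featureIds)
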
diff --git a/contrib/lua-torch/decisiontree/CartTree.lua b/contrib/lua-torch/decisiontree/CartTree.lua
deleted file mode 100644
index c74dfda9e..000000000
--- a/contrib/lua-torch/decisiontree/CartTree.lua
+++ /dev/null
@@ -1,90 +0,0 @@
-local _ = require "moses"
-local dt = require 'decisiontree._env'
-
--- CART (Classification And Regression Tree).
--- The example is always branched to the left when the splitting feature is missing.
-local CartTree = torch.class("dt.CartTree", "dt.DecisionTree", dt)
-
-function CartTree:__init(root, branchleft)
- assert(torch.isTypeOf(root, 'dt.CartNode'))
- self.root = root
- self.branchleft = branchleft or function() return true end
-end
-
--- TODO optimize this
-function CartTree:score(input, stack, optimized)
- if optimized == true and stack == nil and torch.isTensor(input) and input.isContiguous and input:isContiguous() and input:nDimension() == 2 then
- return input.nn.CartTreeFastScore(input, self.root, input.new())
- end
- return self:recursivescore(self.root, input, stack)
-end
-
--- Continuous: if input[node.splitFeatureId] < node.splitFeatureValue then leftNode else rightNode
--- Binary: if input[node.splitFeatureId] == 0 then leftNode else rightNode
--- when stack is provided, it is returned as the third argument containing the stack of nodes from root to leaf
-function CartTree:recursivescore(node, input, stack)
- assert(torch.isTypeOf(node, 'dt.CartNode'))
-
- if stack then
- stack = torch.type(stack) == 'table' and stack or {}
- table.insert(stack, node)
- end
-
- if not (node.leftChild or node.rightChild) then
- return node.score, node.nodeId, stack
- elseif not node.leftChild then
- return self:recursivescore(node.rightChild, input, stack)
- elseif not node.rightChild then
- return self:recursivescore(node.leftChild, input, stack)
- end
-
- local splitId = node.splitFeatureId
- local splitVal = node.splitFeatureValue
-
- if input[splitId] then -- if has key
- local featureVal = input[splitId]
- local nextNode = featureVal < splitVal and node.leftChild or node.rightChild
- return self:recursivescore(nextNode, input, stack)
- end
-
- -- if feature is missing, branch left
- local nextNode = self.branchleft() and node.leftChild or node.rightChild
- return self:recursivescore(nextNode, input, stack)
-end
-
-function CartTree:__tostring__()
- return self.root:recursivetostring()
-end
-
--- expects a stack returned by score
-function CartTree:stackToString(stack, input)
- assert(torch.type(stack) == 'table')
- assert(torch.isTypeOf(stack[1], 'dt.CartNode'))
-
- local res = 'Stack nodes from root to leaf\n'
- for i,node in ipairs(stack) do
- if not (node.leftChild or node.rightChild) then
- res = res .. "score="..node.score .. '\n'
- else
- local istr = ''
- if input then
- istr = '=' .. (input[node.splitFeatureId] or 'nil')
- end
- res = res .. 'input[' .. node.splitFeatureId .. ']' .. istr ..' < ' .. node.splitFeatureValue .. ' ? '
- res = res .. '(' .. ((node.leftChild and node.rightChild) and 'LR' or node.leftChild and 'L' or node.rightChild and 'R' or 'WAT?') .. ') '
- if node.leftChild == stack[i+1] then
- res = res .. 'Left\n'
- elseif node.rightChild == stack[i+1] then
- res = res .. 'Right\n'
- else
- error"stackToString error"
- end
- end
- end
- return res .. #stack .. " nodes"
-end
-
-function CartTree:clone()
- return CartTree(self.root:clone(), self.branchleft)
-end
-
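To make the branching rules above concrete, a scoring sketch; 'tree' stands in for any dt.CartTree, for instance the one returned by CartTrainer:train earlier in this diff. Passing a table as the stack argument collects the root-to-leaf path:

   local input = {[7] = 0.3}                  -- sparse-style input: feature 7 only
   local score, nodeId, stack = tree:score(input, {})
   print(tree:stackToString(stack, input))    -- prints each decision on the path
   -- an input missing feature 7 would take the branchleft() path instead
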
diff --git a/contrib/lua-torch/decisiontree/DFD.lua b/contrib/lua-torch/decisiontree/DFD.lua
deleted file mode 100644
index e4746212a..000000000
--- a/contrib/lua-torch/decisiontree/DFD.lua
+++ /dev/null
@@ -1,182 +0,0 @@
--- nn.DFD: Decision Forest Discretizer
--- Takes a dense input and outputs a sparse output.
--- Each node in the forest is its own feature.
--- When a node is traversed, its corresponding feature takes on a value of 1.
--- For all non-traversed nodes, the feature is 0.
-local DFD, parent = torch.class("nn.DFD", "nn.Module")
-
--- TODO: add :type, as the default will convert the long tensors
-function DFD:__init(df, onlyLastNode)
- parent.__init(self)
- if torch.type(df) == 'table' then
- self:reconstructFromInfo(df)
- else
- assert(torch.type(df) == 'dt.DecisionForest')
-
- self.rootIds = torch.LongTensor()
- -- nodeId of left and right child nodes
- self.leftChild = torch.LongTensor()
- self.rightChild = torch.LongTensor()
- -- index and value of the feature that splits this node
- self.splitFeatureId = torch.LongTensor()
- self.splitFeatureValue = torch.Tensor()
- -- initialize state given df
- self:convertForest2Tensors(df)
- self:clearState()
- end
- self.onlyLastNode = onlyLastNode
- self.nTrees = self.rootIds:size(1)
-end
-
--- converts a DecisionForest to efficient tensor representation
-function DFD:convertForest2Tensors(df)
- self.rootIds:resize(#df.trees)
-
- -- nodeId will map to featureId
- local nodeId = 0
- -- sets nodeIds of all subnodes
- -- and measures the maximum depth over all trees
- local function recursiveTree(node, depth)
- depth = (depth or 0) + 1
- local rdepth = depth
- nodeId = nodeId + 1
- node._nodeId = nodeId
-
- if node.leftChild then
- rdepth = math.max(rdepth, recursiveTree(node.leftChild, depth))
- end
- if node.rightChild then
- rdepth = math.max(rdepth, recursiveTree(node.rightChild, depth))
- end
- return rdepth
- end
-
- -- sum over trees of max depth
- self.depth = 0
- for i,tree in ipairs(df.trees) do
- assert(torch.isTypeOf(tree.root, 'dt.CartNode'))
- self.depth = self.depth + recursiveTree(tree.root)
- end
- -- remove roots from depth
- self.depth = self.depth - self.rootIds:size(1)
-
- -- total number of nodes in all trees
- self.nNode = nodeId
-
- -- nodeId of left and right child nodes
- self.leftChild:resize(self.nNode):fill(-1)
- self.rightChild:resize(self.nNode):fill(-1)
- -- index and value of the feature that splits this node
- self.splitFeatureId:resize(self.nNode):fill(-1)
- self.splitFeatureValue:resize(self.nNode):fill(-1)
-
- -- aggregates CartNode attributes to an efficient tensor representation
- local function recursiveTree2(node)
- local nodeId = assert(node._nodeId)
- assert(self.splitFeatureId[nodeId] == -1)
-
- if node.leftChild then
- self.leftChild[nodeId] = assert(node.leftChild._nodeId)
- recursiveTree2(node.leftChild)
- else
- self.leftChild[nodeId] = 0
- end
-
- if node.rightChild then
- self.rightChild[nodeId] = assert(node.rightChild._nodeId)
- recursiveTree2(node.rightChild)
- else
- self.rightChild[nodeId] = 0
- end
-
- -- each node splits the dataset on a feature id-value pair
- self.splitFeatureId[nodeId] = assert(node.splitFeatureId)
- self.splitFeatureValue[nodeId] = assert(node.splitFeatureValue)
- end
-
- for i,tree in ipairs(df.trees) do
- self.rootIds[i] = assert(tree.root._nodeId)
- recursiveTree2(tree.root)
- end
-
- assert(self.leftChild:min() >= 0)
- assert(self.rightChild:min() >= 0)
-end
-
--- input is a batchsize x inputsize tensor
-function DFD:updateOutput(input)
- assert(torch.isTensor(input))
- assert(input:dim() == 2)
- input = input:contiguous()
-
- local batchsize, inputsize = input:size(1), input:size(2)
-   local size = self.onlyLastNode and self.nTrees or self.depth
-
-   -- each sample's output keys tensor is resized to the maximum depth, the largest size it can take on
- self.outputkeys = self.outputkeys or torch.LongTensor()
- self.outputkeys:resize(batchsize, size)
- -- values are 1
- self.outputvalues = self.outputvalues or input.new()
- self.outputvalues:resize(batchsize, size):fill(1)
-
- self.output = input.nn.DFD_computeOutput(self.outputkeys, self.outputvalues, self.rootIds, self.leftChild, self.rightChild, self.splitFeatureId, self.splitFeatureValue, input, self.onlyLastNode)
- return self.output
-end
-
-function DFD:type(type, tensorCache)
- if type then
- local info = self:getReconstructionInfo()
- for k, v in pairs(info) do
- if torch.type(v) ~= 'torch.LongTensor' then
- info[k] = nil
- end
- end
- parent.type(self, type, tensorCache)
- self:reconstructFromInfo(info)
- return self
- else
- return parent.type(self)
- end
-end
-
-function DFD:updateGradInput()
- error"Not Implemented"
-end
-
-function DFD:clearState()
- self.output = {{},{}}
- self.taskbuffer = {}
- self.outputkeys = nil
- self.outputvalues = nil
- self._range = nil
- self._indices = nil
- self._mask = nil
-end
-
-function DFD:reconstructFromInfo(DFDinfo)
- for k,v in pairs(DFDinfo) do
- self[k] = v
- end
- assert(self.leftChild:nDimension() == 1)
- assert(self.rightChild:nDimension() == 1)
- assert(self.leftChild:size(1) == self.nNode)
- assert(self.rightChild:size(1) == self.nNode)
- assert(self.leftChild:min() >= 0)
- assert(self.rightChild:min() >= 0)
- assert(self.splitFeatureId:nDimension() == 1)
- assert(self.splitFeatureValue:nDimension() == 1)
- assert(self.splitFeatureId:size(1) == self.splitFeatureValue:size(1))
-end
-
-function DFD:getReconstructionInfo()
- local DFDinfo = {
- nNode = self.nNode,
- rootIds = self.rootIds,
- leftChild = self.leftChild,
- rightChild = self.rightChild,
- splitFeatureId = self.splitFeatureId,
- splitFeatureValue = self.splitFeatureValue,
- depth = self.depth
- }
- return DFDinfo
-end
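
In short, nn.DFD turns forest traversal into sparse features: node k fires feature k whenever an example passes through it. A usage sketch, where 'forest' is assumed to be a trained dt.DecisionForest:

   require 'decisiontree'                 -- registers nn.DFD
   local dfd = nn.DFD(forest)             -- one sparse feature per forest node
   local output = dfd:forward(torch.randn(32, 10))  -- sparse keys/values per example
   -- with onlyLastNode=true, only the leaf reached in each tree is emitted
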
diff --git a/contrib/lua-torch/decisiontree/DataSet.lua b/contrib/lua-torch/decisiontree/DataSet.lua
deleted file mode 100644
index 15058a7c6..000000000
--- a/contrib/lua-torch/decisiontree/DataSet.lua
+++ /dev/null
@@ -1,142 +0,0 @@
-local dt = require "decisiontree._env"
-
-local DataSet = torch.class("dt.DataSet", dt)
-
-function DataSet:__init(input, target, nThreads)
- if torch.type(input) == 'table' then
- assert(torch.isTypeOf(input[1], 'torch.SparseTensor'))
- else
- assert(torch.isTensor(input))
- end
- self.input = input
- assert(torch.isTensor(target))
- self.target = target
- self.nThreads = nThreads or 1
-
- self.sortedFeatureValues, self.featureIds = self:sortFeatureValues(input)
-end
-
--- group examples by featureId. For each featureId, sort examples by featureValue (ascending order)
--- returns a table mapping featureIds to sorted lists of exampleIds
--- e.g. {featureId={example1,example2,example3}}
-function DataSet:sortFeatureValues(inputs)
- local isSparse = torch.typename(inputs[1]):match('torch.*SparseTensor')
- assert(isSparse or torch.isTensor(inputs))
-
- local featureIds = torch.LongTensor()
- local dataset = {} -- TODO use tds.Hash (will require SparseTensor to be userdata)
- if isSparse then
- local proto = inputs[1].values
- -- get list of featureIds
- local featureMap = {}
- for i,input in ipairs(inputs) do
- input.keys:apply(function(key)
- featureMap[key] = (featureMap[key] or 0) + 1
- end)
- end
- local _ = require "moses"
- featureIds = featureIds.new(_.keys(featureMap))
- local featureCounts = torch.LongTensor(featureIds:size(1))
- for i=1,featureIds:size(1) do
- featureCounts[i] = featureMap[featureIds[i]]
- end
-
- for i=1,featureIds:size(1) do
- local featureId = featureIds[i]
- local featureCount = featureCounts[i]
- dataset[featureId] = {
- values=proto.new(featureCount),
- examples=torch.LongTensor(featureCount),
- i=0
- }
- end
-
- for exampleId,input in ipairs(inputs) do
- local sparseIdx = 0
- input.keys:apply(function(key)
- sparseIdx = sparseIdx + 1
- local f = dataset[key]
- f.i = f.i + 1
- f.values[f.i] = input.values[sparseIdx]
- f.examples[f.i] = exampleId
- end)
- end
-
- local sortVal, sortIdx = proto.new(), torch.LongTensor()
- for featureId,f in pairs(dataset) do
- assert(f.values:size(1) == f.i)
- sortVal:sort(sortIdx, f.values, 1, false)
-
- local sortedExampleIds = torch.LongTensor(f.i)
- sortedExampleIds:index(f.examples, 1, sortIdx)
-
- dataset[featureId] = sortedExampleIds
- end
- else
- assert(torch.isTensor(inputs))
- featureIds:range(1,inputs:size(2))
-
- for i=1,inputs:size(2) do
- local featureId = i
- local values = inputs:select(2, i)
- local _, sortedFeatureExampleIds = values:sort(1, false)
- dataset[featureId] = sortedFeatureExampleIds
- end
- end
-
- return dataset, featureIds
-end
-
-function DataSet:getSortedFeature(featureId)
- assert(self.sortedFeatureValues)
- return self.sortedFeatureValues[featureId]
-end
-
-function DataSet:size()
- return self.target:size(1)
-end
-
-function DataSet:getExampleIds()
- if not self.exampleIds then
- self.exampleIds = torch.LongTensor():range(1,self:size())
- end
- return self.exampleIds
-end
-
-function DataSet:countPositive(exampleIds)
- assert(torch.type(exampleIds) == 'torch.LongTensor')
- local dt = require 'decisiontree'
- local buffer = dt.getBufferTable('DataSet')
- buffer.tensor = buffer.tensor or self.target.new()
- buffer.tensor:index(self.target, 1, exampleIds)
- local nPositive = 0
- buffer.tensor:apply(function(x)
- if x > 0 then nPositive = nPositive + 1 end
- end)
- return nPositive
-end
-
-function DataSet:initScore()
- self.score = self.score or torch.Tensor()
- self.score:resize(self:size()):fill(0)
-end
-
-function DataSet:buildIndex()
- if torch.type(self.input) == 'table' then
- for exampleId,input in ipairs(self.input) do
- if torch.isTypeOf(input, 'torch.SparseTensor') then
- input:buildIndex()
- end
- end
- end
-end
-
-function DataSet:deleteIndex()
- if torch.type(self.input) == 'table' then
- for exampleId,input in ipairs(self.input) do
- if torch.isTypeOf(input, 'torch.SparseTensor') then
- input:deleteIndex()
- end
- end
- end
-end
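
The key invariant built by sortFeatureValues: for every featureId, a LongTensor of exampleIds ordered by ascending feature value. A small dense sketch (shapes illustrative):

   local dataset = dt.DataSet(torch.randn(6, 3), torch.Tensor(6):random(0, 1))
   local sortedIds = dataset:getSortedFeature(2)  -- exampleIds sorted by the value of feature 2
   print(dataset:countPositive(sortedIds))        -- positive targets among those examples
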
diff --git a/contrib/lua-torch/decisiontree/DecisionForest.lua b/contrib/lua-torch/decisiontree/DecisionForest.lua
deleted file mode 100644
index cac748e7e..000000000
--- a/contrib/lua-torch/decisiontree/DecisionForest.lua
+++ /dev/null
@@ -1,82 +0,0 @@
-local dt = require "decisiontree._env"
-
--- Decision forest that ensembles a bag of decision trees.
-local DecisionForest = torch.class("dt.DecisionForest", "dt.DecisionTree", dt)
-
-function DecisionForest:__init(trees, weight, bias)
- assert(torch.type(trees) == 'table')
- self.trees = trees
- if #trees == 0 then
- self.weight = weight or torch.Tensor()
- assert(torch.isTensor(self.weight))
- assert(self.weight:nElement() == 0)
- else
- assert(torch.isTypeOf(trees[1], 'dt.DecisionTree'))
- self.weight = weight or torch.Tensor(#trees):fill(1)
- assert(torch.isTensor(self.weight))
- assert(self.weight:dim() == 1)
- assert(self.weight:min() >= 0, "Expecting positive weights")
- assert(#trees == self.weight:size(1))
- end
-
- self.bias = bias or 0
- assert(torch.type(self.bias) == 'number')
-end
-
-function DecisionForest:score(input, incrementalId)
- assert(torch.isTensor(input))
-
- local buffer = {}
- if incrementalId then
- self.buffers = self.buffers or {}
- self.buffers[incrementalId] = self.buffers[incrementalId] or {}
- buffer = self.buffers[incrementalId]
- end
- buffer.initialCounter = buffer.initialCounter or 0
-
- -- TODO: score in parallel
- local output
- if torch.isTensor(input) and input.isContiguous and input:isContiguous() and input:nDimension() == 2 then
- buffer.output = buffer.output or input.new()
- output = buffer.output
- assert(output:nElement() == 0 or output:size(1) == input:size(1))
- if output:nElement() == 0 then
- output:resize(input:size(1)):fill(self.bias)
- end
- for i,tree in ipairs(self.trees) do
- if i > buffer.initialCounter then
- local score = tree:score(input, nil, true)
- output:add(self.weight[i], score)
- end
- end
- else
- output = buffer.output or self.bias
- for i,tree in ipairs(self.trees) do
- if i > buffer.initialCounter then
- output = output + tree:score(input) * self.weight[i]
- end
- end
- buffer.output = output
- end
-
- buffer.initialCounter = #self.trees
-
- return output
-end
-
-function DecisionForest:add(tree, weight)
- assert(torch.type(weight) == 'number')
- assert(weight > 0)
- table.insert(self.trees, tree)
- self.weight:resize(#self.trees)
- self.weight[#self.trees] = weight
- return self
-end
-
-function DecisionForest:clone()
- local trees = {}
- for i, tree in ipairs(self.trees) do
- trees[i] = tree:clone()
- end
- return DecisionForest(trees, self.weight:clone(), self.bias)
-end
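
The ensemble rule implemented by score() is simply output = bias + sum_i weight[i] * tree_i(input). A sketch of incremental assembly, where 'cartTree' stands in for a trained dt.CartTree:

   local forest = dt.DecisionForest({}, torch.Tensor(), 0.0)  -- empty forest, bias 0
   forest:add(cartTree, 0.1)                -- append one tree with weight 0.1
   local y = forest:score(torch.randn(10))  -- bias + 0.1 * cartTree:score(input)
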
diff --git a/contrib/lua-torch/decisiontree/DecisionForestTrainer.lua b/contrib/lua-torch/decisiontree/DecisionForestTrainer.lua
deleted file mode 100644
index fc903678b..000000000
--- a/contrib/lua-torch/decisiontree/DecisionForestTrainer.lua
+++ /dev/null
@@ -1,22 +0,0 @@
-local dt = require "decisiontree._env"
-
--- Interface for all decisionForestTrainers
-local DFT = torch.class("dt.DecisionForestTrainer", dt)
-
--- Train a DecisionForest with examples, a table of valid featureIds and a dataset (i.e. sortedExamplesByFeatureId)
-function DFT:train(examples, validFeatureIds, dataset)
- assert(torch.type(examples) == "table")
- assert(torch.isTypeOf(examples[1], "dt.LabeledExample"))
-
- assert(torch.type(validFeatureIds) == 'table')
-
- assert(torch.type(dataset) == 'table')
- for k,v in pairs(dataset) do
- assert(torch.type(v) == 'table')
- assert(torch.isTypeOf(v[1], 'dt.LabeledExample'))
- break
- end
- -- dataset is a table mapping featureIds to sorted lists of LabeledExamples
- -- e.g. {featureId={example1,example2,example3}}
- error"Not Implemented"
-end
diff --git a/contrib/lua-torch/decisiontree/DecisionTree.lua b/contrib/lua-torch/decisiontree/DecisionTree.lua
deleted file mode 100644
index c61bc3757..000000000
--- a/contrib/lua-torch/decisiontree/DecisionTree.lua
+++ /dev/null
@@ -1,12 +0,0 @@
-local dt = require "decisiontree._env"
-
--- An interface for decision trees.
-local DecisionTree = torch.class("dt.DecisionTree", dt)
-
--- Score an input example and return the prediction score.
--- input is a Tensor or SparseTensor
--- return prediction score and nodeId
-function DecisionTree:score(input)
- error"Not Implemented"
- return score, nodeId
-end
diff --git a/contrib/lua-torch/decisiontree/GBDT_common.h b/contrib/lua-torch/decisiontree/GBDT_common.h
deleted file mode 100644
index eb993702d..000000000
--- a/contrib/lua-torch/decisiontree/GBDT_common.h
+++ /dev/null
@@ -1,106 +0,0 @@
-#include "khash.h"
-#include <pthread.h>
-
-#define computeGradientBoostLoss(g, h) (-(g)*(g)/(h))
-
-// we use khash to make iteration faster than lua tables
-KHASH_SET_INIT_INT64(long)
-
-// defines the data we need for running an instance of the algorithm and its constructor/destructor
-typedef struct {
- khash_t(long)* exampleMap;
- THLongTensor *exampleIdsWithFeature_cache;
- long minLeafSize;
-} GBRunData;
-
-
-// allocates data that cannot be shared between threads
-static void gb_local_create_run_data(GBRunData *run_data) {
- run_data->exampleMap = kh_init(long);
- run_data->exampleIdsWithFeature_cache = THLongTensor_new();
-}
-
-static void gb_create_run_data(GBRunData *run_data, int minLeafSize) {
- gb_local_create_run_data(run_data);
- run_data->minLeafSize = minLeafSize;
-}
-
-static void gb_destroy_run_data(GBRunData *run_data) {
- THLongTensor_free(run_data->exampleIdsWithFeature_cache);
- kh_destroy(long, run_data->exampleMap);
-}
-
-// initializes the data required by the optimizer for the given feature.
-static THLongTensor *gb_internal_prepare(lua_State *L, THLongTensor *exampleIds,
- THLongTensor *exampleIdsWithFeature_cache, int input_index, long feature_id,
- khash_t(long)* exampleMap) {
- long *exampleIds_data = THLongTensor_data(exampleIds);
- long exampleIds_size = THLongTensor_size(exampleIds, 0);
-
- int ret = 0;
-
-  // if the input is a table, then we have a sparse dataset
- if (lua_istable(L, input_index)) {
- if (exampleIds_size == 0) {
- return NULL;
- }
- else {
- // loops over the examples' ids that this node has to evaluate and, if they have the feature
- // we're looking for, marks them as present and stores them in the order provided by the
- // dataset
- THLongTensor_resize1d(exampleIdsWithFeature_cache, exampleIds_size);
- kh_clear(long, exampleMap);
- kh_resize(long, exampleMap, exampleIds_size*8);
- long *exampleIdsWithFeature_data = THLongTensor_data(exampleIdsWithFeature_cache);
- long j = 0;
- // for each sample to be evaluated
- for (long i = 0; i < exampleIds_size; i++) {
- // gets the representation for the example
- lua_pushinteger(L, exampleIds_data[i]);
- lua_gettable(L, input_index);
-
- // builds the index, which happens only once per thread for efficiency
- lua_pushstring(L, "buildIndex");
- lua_gettable(L, -2);
- lua_pushvalue(L, -2);
- lua_call(L, 1, 0);
-
- // tries to get the feature for this sample
- lua_pushinteger(L, feature_id);
- lua_gettable(L, -2);
- // if present, then...
- if (!lua_isnil(L, -1)) {
- // saves the example
- exampleIdsWithFeature_data[j] = exampleIds_data[i];
- j++;
-
- // marks it as present in the hash table
- kh_put(long, exampleMap, exampleIds_data[i], &ret);
- }
-
- lua_pop(L, 2);
- }
-
- // resizes to fit only the samples that have the feature
- THLongTensor_resize1d(exampleIdsWithFeature_cache, j);
- kh_resize(long, exampleMap, j*8);
- return exampleIdsWithFeature_cache;
- }
- }
- else {
-    // if the input isn't a table, then it's dense and we cannot have exampleIds missing, so the
-    // example map doesn't depend on feature_id
- // since exampleIds is fixed between calls and this is going to store the same values to the
- // same position, we can cache it between calls
- if (kh_size(exampleMap) == 0) {
- kh_resize(long, exampleMap, exampleIds_size*8);
- for (long i = 0; i < exampleIds_size; i++) {
- kh_put(long, exampleMap, exampleIds_data[i], &ret);
- }
- }
- // notice that we just return the given tensor of ids instead of copying it. the rest of the
- // code handles this transparently
- return exampleIds;
- }
-}
-
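The macro above encodes the Newton-step objective used throughout the GBDT code: for a node with summed gradient g and summed hessian h, the optimal leaf score is -g/h and the loss at that optimum is -g*g/h. Restated as a Lua sketch, for reference only:

   local function newtonScore(g, h)        -- optimal leaf value
      return -g / h
   end
   local function gradientBoostLoss(g, h)  -- loss at that optimum, as in the C macro
      return -(g * g) / h
   end
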
diff --git a/contrib/lua-torch/decisiontree/GiniState.lua b/contrib/lua-torch/decisiontree/GiniState.lua
deleted file mode 100644
index 6dfed2845..000000000
--- a/contrib/lua-torch/decisiontree/GiniState.lua
+++ /dev/null
@@ -1,54 +0,0 @@
-local dt = require 'decisiontree._env'
-
--- used by RandomForestTrainer
-local GiniState, parent = torch.class("dt.GiniState", "dt.TreeState", dt)
-
-function GiniState:__init(exampleIds)
- parent.__init(self, exampleIds)
- self.nPositiveInLeftBranch = 0
- self.nPositiveInRightBranch = 0
-end
-
-function GiniState:score(dataset)
- local dt = require 'decisiontree'
- local nPositive = dataset:countPositive(self.exampleIds)
- return dt.calculateLogitScore(nPositive, self.exampleIds:size(1))
-end
-
-function GiniState:initialize(exampleIdsWithFeature, dataset)
- assert(torch.type(exampleIdsWithFeature) == 'torch.LongTensor')
- assert(torch.isTypeOf(dataset, 'dt.DataSet'))
- self.nPositiveInLeftBranch = dataset:countPositive(exampleIdsWithFeature)
- self.nPositiveInRightBranch = 0
-
- self.nExampleInLeftBranch = exampleIdsWithFeature:size(1)
- self.nExampleInRightBranch = 0
-end
-
-function GiniState:update(exampleId, dataset)
- assert(torch.type(exampleId) == 'number')
- assert(torch.isTypeOf(dataset, 'dt.DataSet'))
- if dataset.target[exampleId] > 0 then
- self.nPositiveInLeftBranch = self.nPositiveInLeftBranch - 1
- self.nPositiveInRightBranch = self.nPositiveInRightBranch + 1
- end
-
- self.nExampleInLeftBranch = self.nExampleInLeftBranch - 1
- self.nExampleInRightBranch = self.nExampleInRightBranch + 1
-end
-
-function GiniState:computeSplitInfo(splitFeatureId, splitFeatureValue)
- local dt = require 'decisiontree'
- local gini = dt.computeGini(self.nExampleInLeftBranch, self.nPositiveInLeftBranch, self.nExampleInRightBranch, self.nPositiveInRightBranch)
- local splitInfo = {
- splitId = assert(splitFeatureId),
- splitValue = assert(splitFeatureValue),
- leftChildSize = assert(self.nExampleInLeftBranch),
- leftPositiveCount = assert(self.nPositiveInLeftBranch),
- rightChildSize = assert(self.nExampleInRightBranch),
- rightPositiveCount = assert(self.nPositiveInRightBranch),
- gini = assert(gini),
- splitGain = gini
- }
- return splitInfo
-end
\ No newline at end of file
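
The intended scan, schematically: initialize() places every example in the left branch, and each update() moves one example to the right while the candidate split is re-scored. Placeholder names below; the actual scan order lives in dt.TreeState:findBestSplit and the C kernels, not shown in this hunk:

   state:initialize(sortedExampleIds, dataset)      -- every example starts on the left
   for i = sortedExampleIds:size(1), 2, -1 do       -- assumed order over sorted feature values
      state:update(sortedExampleIds[i], dataset)    -- shift one example to the right branch
      local splitInfo = state:computeSplitInfo(featureId, candidateValue)
      -- keep the candidate with the lowest splitGain
   end
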
diff --git a/contrib/lua-torch/decisiontree/GradientBoostState.lua b/contrib/lua-torch/decisiontree/GradientBoostState.lua
deleted file mode 100644
index f268f3da8..000000000
--- a/contrib/lua-torch/decisiontree/GradientBoostState.lua
+++ /dev/null
@@ -1,57 +0,0 @@
-local dt = require 'decisiontree._env'
-
-local GradientBoostState, parent = torch.class("dt.GradientBoostState", "dt.TreeState", dt)
-
-function GradientBoostState:__init(exampleIds, gradInput, hessInput)
- parent.__init(self, exampleIds)
- self.gradInput = gradInput
- self.hessInput = hessInput
-end
-
-function GradientBoostState:score(dataset)
- local dt = require 'decisiontree'
- local gradInput = self.gradInput:index(1, self.exampleIds)
- local hessInput = self.hessInput:index(1, self.exampleIds)
- return dt.computeNewtonScore(gradInput:sum(), hessInput:sum())
-end
-
--- calls _branch and encapsulates the left and right exampleIds into a pair of TreeStates
-function GradientBoostState:branch(splitInfo, dataset)
- local leftExampleIds, rightExampleIds = self:_branch(splitInfo, dataset)
- return self.new(leftExampleIds, self.gradInput, self.hessInput), self.new(rightExampleIds, self.gradInput, self.hessInput)
-end
-
--- Partitions self given a splitInfo table, producing a pair of exampleIds corresponding to the left and right subtrees.
-function GradientBoostState:_branch(splitInfo, dataset)
- local input = dataset.input
- -- if the input is dense, we can use the optimized version
- if torch.isTensor(input) and input.isContiguous and input:isContiguous() and input:nDimension() == 2 then
- return input.nn.GBDT_branch(splitInfo, input, self.exampleIds)
- end
- return parent._branch(self, splitInfo, dataset)
-end
-
--- The following methods are supersets of each other. You can comment them out to re-use the Lua
--- version with just the core optimized
-
--- THIS ONE CANNOT BE COMMENTED OUT
-function GradientBoostState:findBestFeatureSplit(dataset, featureId, minLeafSize)
- local ret = self.hessInput.nn.GBDT_findBestFeatureSplit(self.exampleIds, dataset, featureId, minLeafSize, self.gradInput, self.hessInput)
- return ret
-end
-
--- finds the best split of examples in treeState among featureIds
-function GradientBoostState:findBestSplit(dataset, featureIds, minLeafSize, shardId, nShard)
- local ret = self.hessInput.nn.GBDT_findBestSplit(self.exampleIds, dataset, featureIds, minLeafSize, shardId, nShard, self.gradInput, self.hessInput)
- return ret
-end
-
--- finds the best split like the previous one, but performs feature parallelism. Note that the
--- optimization is only applied if the input is dense
-function GradientBoostState:findBestSplitFP(dataset, featureIds, minLeafSize, nThread)
- local input = dataset.input
- if torch.isTensor(input) and input.isContiguous and input:isContiguous() and input:nDimension() == 2 then
- local ret = self.hessInput.nn.GBDT_findBestSplitFP(self.exampleIds, dataset, featureIds, minLeafSize, self.gradInput, self.hessInput, nThread)
- return ret
- end
-end
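
Tying this to the trainer below: gradients and hessians come from the loss function's backward2, and a node's score is the Newton score of their sums. A sketch with placeholder names ('trainSet' is a dt.DataSet after initScore()):

   local lossFunction = nn.LogitBoostCriterion(false)
   local gradInput, hessInput = lossFunction:backward2(trainSet.score, trainSet.target)
   local rootState = dt.GradientBoostState(trainSet:getExampleIds(), gradInput, hessInput)
   print(rootState:score(trainSet))  -- == dt.computeNewtonScore(sum of grads, sum of hessians)
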
diff --git a/contrib/lua-torch/decisiontree/GradientBoostTrainer.lua b/contrib/lua-torch/decisiontree/GradientBoostTrainer.lua
deleted file mode 100644
index 51299b109..000000000
--- a/contrib/lua-torch/decisiontree/GradientBoostTrainer.lua
+++ /dev/null
@@ -1,244 +0,0 @@
-local dt = require "decisiontree._env"
-
--- Gradient boosted decision tree trainer
-local GradientBoostTrainer = torch.class("dt.GradientBoostTrainer", "dt.DecisionForestTrainer", dt)
-
-function GradientBoostTrainer:__init(opt)
- assert(torch.type(opt) == 'table')
-
- assert(torch.isTypeOf(opt.treeTrainer, 'dt.CartTrainer'))
- self.treeTrainer = opt.treeTrainer
-
- assert(torch.isTypeOf(opt.lossFunction, 'nn.Criterion'))
- self.lossFunction = opt.lossFunction
-
- assert(torch.type(opt.shrinkage) == 'number')
- assert(opt.shrinkage > 0)
- self.shrinkage = opt.shrinkage
-
- assert(torch.type(opt.downsampleRatio) == 'number')
- assert(opt.downsampleRatio > 0)
- self.downsampleRatio = opt.downsampleRatio
-
- assert(torch.type(opt.nTree) == 'number')
- assert(opt.nTree > 0)
- self.nTree = opt.nTree
-
-   opt.evalFreq = opt.evalFreq or -1
- assert(torch.type(opt.evalFreq) == 'number')
- assert(torch.round(opt.evalFreq) == opt.evalFreq)
- self.evalFreq = opt.evalFreq
-
- -- when non-positive, no early-stopping
-   opt.earlyStop = opt.earlyStop or (opt.evalFreq-1)
- assert(torch.type(opt.earlyStop) == 'number')
- self.earlyStop = opt.earlyStop
-
- -- when non-positive, defaults to sqrt(#feature)
- assert(torch.type(opt.featureBaggingSize) == 'number')
- self.featureBaggingSize = opt.featureBaggingSize
-
- if opt.decisionForest then
- assert(torch.isTypeOf(opt.decisionForest, 'dt.DecisionForest'))
- end
- self.decisionForest = opt.decisionForest
-
- self.useInitBias = opt.useInitBias
-end
-
-function GradientBoostTrainer:computeBias(trainSet, verbose)
- assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
-
- if verbose then print("Use new bias generated from the training examples.") end
-
- return -0.5 * self.gradInput:sum() / self.hessInput:sum()
-end
-
-
-function GradientBoostTrainer:initialize(trainSet, verbose)
- assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
-
- trainSet:initScore()
- self.gradInput, self.hessInput = self.lossFunction:backward2(trainSet.score, trainSet.target)
-
- -- used for early-stopping (see validate())
- self.stopCount = 0
- self.prevTrainLoss = math.huge
- self.prevTestLoss = math.huge
-
- if verbose then print("Processing initial decision forest") end
-
- local decisionForest, bias
-
- if self.decisionForest then
- local bias = self.useInitBias and self.decisionForest.bias or self:computeBias(trainSet, verbose)
-
- decisionForest = dt.DecisionForest(self.decisionForest.trees, self.decisionForest.weight, bias)
-
-      local input = trainSet.input
-      local score = trainSet.score
-      if torch.isTensor(input) and input.isContiguous and input:isContiguous() then
-         score:copy(decisionForest:score(input))
-      else
-         score:resize(trainSet:size())
- for exampleId=1,trainSet:size() do
- score[exampleId] = decisionForest:score(input[exampleId])
- end
- end
- else
- local bias = self:computeBias(trainSet, verbose)
- decisionForest = dt.DecisionForest({}, torch.Tensor(), bias)
-
- trainSet.score:fill(bias)
- end
-
- if verbose then print("Finish loading initial decision forest") end
-
- return decisionForest
-end
-
--- Trains a decision forest of boosted decision trees.
--- examples are the training examples. validExamples are used for cross-validation.
-function GradientBoostTrainer:train(trainSet, featureIds, validSet, verbose)
- assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
- assert(torch.type(featureIds) == 'torch.LongTensor')
- assert(torch.isTypeOf(validSet, 'dt.DataSet'))
-
- local decisionForest = self:initialize(trainSet, verbose)
- local bestDecisionForest
-
- if verbose then print(string.format("Get %d featureIds.", featureIds:size(1))) end
-
- local baggingSize = self.featureBaggingSize > 0 and self.featureBaggingSize or torch.round(math.sqrt(featureIds:size(1)))
- local trainExampleIds = trainSet:getExampleIds()
- local baggingIndices, activeFeatures
- local treeExampleIds
-
- local timer = torch.Timer()
-
- for treeId = 1,self.nTree do
- timer:reset()
- if verbose then print(string.format("Begin processing tree number %d of %d", treeId, self.nTree)) end
-
- -- Get active features
- activeFeatures = activeFeatures or torch.LongTensor()
- if baggingSize < featureIds:size(1) then
- if verbose then print(string.format("Tree %d: Bagging %d from %d features", treeId, baggingSize, featureIds:size(1))) end
-
- baggingIndices = baggingIndices or torch.LongTensor()
- baggingIndices:randperm(featureIds:size(1))
- activeFeatures:index(featureIds, 1, baggingIndices:narrow(1,1,baggingSize))
- else
- activeFeatures = featureIds
- end
-
- -- Get data samples
- if self.downsampleRatio < 0.99 then
- local sampleSize = torch.round(trainSet:size() * self.downsampleRatio)
-
- if verbose then print(string.format("Tree %d: Downsampling %d of %d samples", treeId, sampleSize, trainSet:size())) end
-
- baggingIndices = baggingIndices or torch.LongTensor()
- baggingIndices:randperm(trainSet:size())
-
- treeExampleIds = treeExampleIds or torch.LongTensor()
- treeExampleIds:index(trainExampleIds, 1, baggingIndices:narrow(1,1,sampleSize))
- else
- treeExampleIds = trainExampleIds
- end
-
- if verbose then print(string.format("Tree %d: training CART tree", treeId)) end
-
- local rootTreeState = dt.GradientBoostState(treeExampleIds, self.gradInput, self.hessInput)
- local cartTree = self.treeTrainer:train(rootTreeState, activeFeatures)
-
- if verbose then print(string.format("Tree %d: finished training CART tree in %f seconds", treeId, timer:time().real)) end
-
- decisionForest:add(cartTree, self.shrinkage)
-
- -- update score
- local predictionScore
- local input = trainSet.input
- if torch.isTensor(input) and input:isContiguous() then
- predictionScore = cartTree:score(trainSet.input, nil, true)
- else
- local size = trainSet:size()
- predictionScore = torch.Tensor(size)
- for exampleId=1,size do
- predictionScore[exampleId] = cartTree:score(trainSet.input[exampleId])
- end
- end
- trainSet.score:add(self.shrinkage, predictionScore)
- self.gradInput, self.hessInput = self.lossFunction:backward2(trainSet.score, trainSet.target)
-
- if verbose then print(string.format("Tree %d: training complete in %f seconds", treeId, timer:time().real)) end
-
- -- cross-validation/early-stopping
- if self.evalFreq > 0 and treeId % self.evalFreq == 0 then
- timer:reset()
-         local stop, validLoss
-         stop, validLoss, bestDecisionForest = self:validate(trainSet, validSet, decisionForest, bestDecisionForest)
- if dt.PROFILE then print("validate tree time: "..timer:time().real) end
- if verbose then print(string.format("Loss: train=%7.4f, valid=%7.4f", 0, validLoss)) end
- if stop then
- if verbose then print(string.format("GBDT early stopped on tree %d", treeId)) end
- break
- end
-
- end
- end
-
- return bestDecisionForest or decisionForest
-end
-
-function dt.GradientBoostTrainer:validate(trainSet, validSet, decisionForest, bestDecisionForest)
- assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
- assert(torch.isTypeOf(validSet, 'dt.DataSet'))
- assert(torch.isTypeOf(decisionForest, 'dt.DecisionForest'))
-   assert(not bestDecisionForest or torch.isTypeOf(bestDecisionForest, 'dt.DecisionForest'))
-
- -- buffer
- local buffer = dt.getBufferTable('GradientBoost')
- buffer.tensor = buffer.tensor or trainSet.score.new()
- local score = buffer.tensor
-
- -- per thread loss function (tensors are shared)
- local lossname = torch.typename(self.lossFunction)
- buffer[lossname] = buffer[lossname] or self.lossFunction:clone()
- local lossFunction = buffer[lossname]
-
- -- TODO batch this for large datasets
- local input = validSet.input
- if torch.isTensor(input) and input.isContiguous and input:isContiguous() then
- score = decisionForest:score(input, 'val')
- else
- score:resize(validSet:size())
- for exampleId=1,validSet:size() do
- score[exampleId] = decisionForest:score(input[exampleId], 'val')
- end
- end
- local validLoss = lossFunction:forward(score, validSet.target)
-
- -- early stop is not enabled when earlyStop=0
- local stop = false
- if self.earlyStop > 0 then
- -- Track test loss and detect early stop
- if self.prevTestLoss - validLoss < 0 then
- self.stopCount = self.stopCount + 1
- else
- bestDecisionForest = decisionForest:clone()
- self.stopCount = 0
- end
-
- stop = self.stopCount >= self.earlyStop
- end
-
- self.prevTestLoss = validLoss
-
- return stop, validLoss, bestDecisionForest
-end
-
-function GradientBoostTrainer:getName()
- return string.format(
- "gbdt-dRatio-%s-maxLeaf-%s-minExample-%s-nTree-%s-shrinkage-%s",
-      self.downsampleRatio, self.treeTrainer.maxLeafNodes, self.treeTrainer.minLeafSize, self.nTree, self.shrinkage
- )
-end
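
For reference, a configuration sketch exercising only the options validated in __init above; 'trainSet' and 'validSet' are assumed dt.DataSets built as shown earlier in this diff:

   local trainer = dt.GradientBoostTrainer{
      treeTrainer = dt.CartTrainer(trainSet, 5, 8),  -- minLeafSize=5, maxLeafNodes=8
      lossFunction = nn.LogitBoostCriterion(false),
      shrinkage = 0.1,            -- weight given to each appended tree
      downsampleRatio = 0.8,      -- row bagging: 80% of examples per tree
      featureBaggingSize = -1,    -- non-positive: use sqrt(#features)
      nTree = 50,
      evalFreq = 5,               -- validate every 5 trees
      earlyStop = 4,              -- stop after 4 non-improving validations
   }
   local forest = trainer:train(trainSet, trainSet.featureIds, validSet)
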
diff --git a/contrib/lua-torch/decisiontree/LICENSE b/contrib/lua-torch/decisiontree/LICENSE
deleted file mode 100644
index 8dada3eda..000000000
--- a/contrib/lua-torch/decisiontree/LICENSE
+++ /dev/null
@@ -1,201 +0,0 @@
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "{}"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright {yyyy} {name of copyright owner}
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
diff --git a/contrib/lua-torch/decisiontree/LogitBoostCriterion.lua b/contrib/lua-torch/decisiontree/LogitBoostCriterion.lua
deleted file mode 100644
index 5b9eb6028..000000000
--- a/contrib/lua-torch/decisiontree/LogitBoostCriterion.lua
+++ /dev/null
@@ -1,45 +0,0 @@
-local dt = require "decisiontree._env"
-
--- Ref: slide 17 in https://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf
-
--- equivalent to nn.Sigmoid() + nn.BCECriterion()
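--- For p = sigmoid(x) and target t in {0,1}: loss = -t*log(p) - (1-t)*log(1-p),
--- so gradInput = p - t and hessInput = p*(1-p), per example.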
-local LogitBoostCriterion, parent = torch.class("nn.LogitBoostCriterion", "nn.Criterion")
-
-function LogitBoostCriterion:__init(sizeAverage)
- parent.__init(self)
- self.sizeAverage = sizeAverage
- self.hessInput = self.gradInput.new()
- self._output = torch.Tensor()
-end
-
-function LogitBoostCriterion:updateOutput(input, target)
- input.nn.LogitBoostCriterion_updateOutput(input, target, self._output, self.sizeAverage)
- self.output = self._output[1]
- return self.output
-end
-
-function LogitBoostCriterion:updateGradInput(input, target)
- input.nn.LogitBoostCriterion_updateGradInput(input, target, self.gradInput)
- return self.gradInput
-end
-
-function LogitBoostCriterion:updateHessInput(input, target)
- input.nn.LogitBoostCriterion_updateHessInput(input, target, self.hessInput)
- return self.hessInput
-end
-
--- returns gradInput and hessInput
-function LogitBoostCriterion:backward2(input, target)
- return self:updateGradInput(input, target), self:updateHessInput(input, target)
-end
-
-local gradWrapper = function(input, target, grad)
- input.nn.LogitBoostCriterion_updateGradInput(input, target, grad)
-end
-local hessianWrapper = function(input, target, hessian)
- input.nn.LogitBoostCriterion_updateHessInput(input, target, hessian)
-end
-
-function LogitBoostCriterion:getWrappers()
- return gradWrapper, hessianWrapper
-end
diff --git a/contrib/lua-torch/decisiontree/MSECriterion.lua b/contrib/lua-torch/decisiontree/MSECriterion.lua
deleted file mode 100644
index 948c1a17e..000000000
--- a/contrib/lua-torch/decisiontree/MSECriterion.lua
+++ /dev/null
@@ -1,13 +0,0 @@
-local dt = require "decisiontree._env"
-
-function nn.MSECriterion.updateHessInput(self, input, target)
- self.hessInput = self.hessInput or input.new()
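-   -- the second derivative of (input - target)^2 w.r.t. input is the constant 2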
- self.hessInput:resize(input:size()):fill(2)
- return self.hessInput
-end
-
--- returns gradInput and hessInput
-function nn.MSECriterion.backward2(self, input, target)
- return self:updateGradInput(input, target), self:updateHessInput(input, target)
-end
-
diff --git a/contrib/lua-torch/decisiontree/README.md b/contrib/lua-torch/decisiontree/README.md
deleted file mode 100644
index db4622add..000000000
--- a/contrib/lua-torch/decisiontree/README.md
+++ /dev/null
@@ -1,386 +0,0 @@
-# Torch decision tree library
-
-```lua
-local dt = require 'decisiontree'
-```
-
-This project implements random forests and gradient boosted decision trees (GBDT).
-Both use ensemble learning to produce ensembles of decision trees (that is, forests).
-
-## `nn.DFD`
-
-One practical application for decision forests is to *discretize* an input feature space into a richer output feature space.
-The `nn.DFD` Module can be used as a decision forest discretizer (DFD):
-
-```lua
-local dfd = nn.DFD(df, onlyLastNode)
-```
-
-where `df` is a `dt.DecisionForest` instance or the table returned by the method `getReconstructionInfo()` on another `nn.DFD` module, and `onlyLastNode` is a boolean indicating that the module should return only the id of the last node visited in each tree (by default, it outputs all traversed nodes except the roots).
-The `nn.DFD` module requires dense `input` tensors.
-Sparse `input` tensors (tables of tensors) are not supported.
-The `output` returned by a call to `updateOutput` is a batch of sparse tensors.
-The `output` is a table where `output[1]` and `output[2]` are respectively a list of key tensors and a list of value tensors:
-
-```lua
-{
- { [torch.LongTensor], ... , [torch.LongTensor] },
- { [torch.Tensor], ... , [torch.Tensor] }
-}
-```
-
-This module doesn't support CUDA.
-
-### Example
-As a concrete example, let us first train a Random Forest on a dummy dense dataset:
-
-```lua
-local nExample = 100
-local batchsize = 2
-local inputsize = 10
-
--- train Random Forest
-local trainSet = dt.getDenseDummyData(nExample, nil, inputsize)
-local opt = {
- activeRatio=0.5,
- featureBaggingSize=5,
- nTree=4,
- maxLeafNodes=nExample/2,
- minLeafSize=nExample/10,
-}
-local trainer = dt.RandomForestTrainer(opt)
-local df = trainer:train(trainSet, trainSet.featureIds)
-assert(#df.trees == opt.nTree)
-```
-
-Now that we have `df`, a `dt.DecisionForest` instance, we can use it to initialize `nn.DFD`:
-
-```lua
-local dfd = nn.DFD(df)
-```
-
-The `dfd` instance holds no reference to `df`; instead, it extracts the relevant attributes from `df`.
-These attributes are stored in tensors for batching and efficiency.
-
-We can discretize a hypothetical `input` by calling `forward`:
-```lua
-local input = trainSet.input:sub(1,batchsize)
-local output = dfd:forward(input)
-```
-
-The resulting output is a table consisting of two tables: keys and values.
-The keys and values tables each contain `batchsize` tensors:
-
-```lua
-print(output)
-{
- 1 :
- {
- 1 : LongTensor - size: 14
- 2 : LongTensor - size: 16
- 3 : LongTensor - size: 15
- 4 : LongTensor - size: 13
- }
- 2 :
- {
- 1 : DoubleTensor - size: 14
- 2 : DoubleTensor - size: 16
- 3 : DoubleTensor - size: 15
- 4 : DoubleTensor - size: 13
- }
-}
-```
-
-An example's feature keys (`LongTensor`) and corresponding values (`DoubleTensor`) have the same number of elements.
-Each example has a variable number of key-value pairs, representing the nodes traversed in the forest.
-The output feature space has one dimension (that is, one possible feature key) per node in the forest.
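-
-For instance, the key-value pairs of the first example in the batch can be traversed as follows:
-
-```lua
-local keys, values = output[1][1], output[2][1]
-for i=1,keys:size(1) do
-   print(keys[i], values[i]) -- id of a visited node and its associated value
-end
-```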
-
-## `torch.SparseTensor`
-
-Suppose you have a set of `keys` mapped to `values`:
-```lua
-local keys = torch.LongTensor{1,3,4,7,2}
-local values = torch.Tensor{0.1,0.3,0.4,0.7,0.2}
-```
-
-You can use a `SparseTensor` to encapsulate these into a read-only tensor:
-
-```lua
-local st = torch.SparseTensor(keys, values)
-```
-
-The _decisiontree_ library uses `SparseTensors` to simulate the `__index` method of the `torch.Tensor`.
-For example, one can obtain the value associated to key 3 of the above `st` instance:
-
-```lua
-local value = st[3]
-assert(value == 0.3)
-```
-
-When a key is absent from the tensor, `nil` is returned instead:
-
-```lua
-local value = st[5]
-assert(value == nil)
-```
-
-The default implementation of this kind of indexing is slow (it uses a sequential scan of the `keys`).
-To speed up indexing, one can call the `buildIndex()` method beforehand:
-
-```lua
-st:buildIndex()
-```
-
-The `buildIndex()` method creates a hash map (a Lua table) of keys to their corresponding indices in the `values` tensor.
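-
-For instance, continuing the example above:
-
-```lua
-st:buildIndex()
-assert(st[3] == 0.3) -- now a hash lookup rather than a sequential scan
-st:deleteIndex() -- drops the index (e.g. before serializing to worker threads)
-```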
-
-## `dt.DataSet`
-
-The `CartTrainer`, `RandomForestTrainer` and `GradientBoostTrainer` require that data sets be encapsulated into a `DataSet`.
-Suppose you have a dataset of dense inputs and targets:
-
-```lua
-local nExample = 10
-local nFeature = 5
-local input = torch.randn(nExample, nFeature)
-local target = torch.Tensor(nExample):random(0,1)
-```
-
-these can be encapsulated into a `DataSet` object:
-
-```lua
-local dataset = dt.DataSet(input, target)
-```
-
-Now suppose you have a dataset where the `input` is a table of `SparseTensor` instances:
-
-```lua
-local input = {}
-for i=1,nExample do
- local nKeyVal = math.random(2,nFeature)
- local keys = torch.LongTensor(nKeyVal):random(1,nFeature)
- local values = torch.randn(nKeyVal)
- input[i] = torch.SparseTensor(keys, values)
-end
-```
-
-You can still use a `DataSet` to encapsulate the sparse dataset:
-
-```lua
-local dataset = dt.DataSet(input, target)
-```
-
-The main purpose of the `DataSet` class is to sort each feature by value.
-This is captured by the `sortFeatureValues(input)` method, which is called in the constructor:
-
-```lua
-local sortedFeatureValues, featureIds = self:sortFeatureValues(input)
-```
-
-The `featureIds` is a `torch.LongTensor` of all available feature IDs.
-For a dense `input` tensor, this is just `torch.LongTensor():range(1,input:size(2))`.
-But for a sparse `input` tensor, the `featureIds` tensor only contains the feature IDs present in the dataset.
-
-The resulting `sortedFeatureValues` is a table mapping `featureIds` to `exampleIds` sorted by `featureValues`.
-For each `featureId`, examples are sorted by `featureValue` in ascending order.
-For example, the table might look like `{featureId=exampleIds}` where `exampleIds={1,3,2}`.
-
-The `CartTrainer` accesses the `sortedFeatureValues` table by calling `getSortedFeature(featureId)`:
-
-```lua
-local exampleIdsWithFeature = dataset:getSortedFeature(featureId)
-```
-
-The ability to access examples IDs sorted by feature value, given a feature ID, is the main purpose of the `DataSet`.
-The `CartTrainer` relies on these sorted lists to find the best way to split a set of examples between two tree nodes.
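-
-As a quick sanity check (assuming the dense `input` defined earlier), the returned example IDs do order the feature column:
-
-```lua
-local featureId = 2
-local exampleIds = dataset:getSortedFeature(featureId)
-local col = input:select(2, featureId)
-for i=2,exampleIds:size(1) do
-   assert(col[exampleIds[i-1]] <= col[exampleIds[i]])
-end
-```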
-
-## `dt.CartTrainer`
-
-```lua
-local trainer = dt.CartTrainer(dataset, minLeafSize, maxLeafNodes)
-```
-
-The `CartTrainer` is used by the `RandomForestTrainer` and `GradientBoostTrainer` to train individual trees.
-CART stands for classification and regression trees.
-However, only binary classifiers are unit tested.
-
-The constructor takes the following arguments:
-
- * `dataset` is a `dt.DataSet` instance representing the training set.
- * `minLeafSize` is the minimum number of examples per leaf node in a tree. The larger the value, the stronger the regularization.
- * `maxLeafNodes` is the maximum number of leaf nodes in the tree. The lower the value, the stronger the regularization.
-
-Training is initiated by calling the `train()` method:
-
-```lua
-local trainSet = dt.DataSet(input, target)
-local rootTreeState = dt.GiniState(trainSet:getExampleIds())
-local activeFeatures = trainSet.featureIds
-local tree = trainer:train(rootTreeState, activeFeatures)
-```
-
-The resulting `tree` is a `CartTree` instance.
-The `rootTreeState` is a `TreeState` instance like `GiniState` (used by `RandomForestTrainer`) or `GradientBoostState` (used by `GradientBoostTrainer`).
-The `activeFeatures` is a `LongTensor` of the feature IDs used to build the tree.
-Every other feature ID is ignored during training, which is useful for feature bagging.
-
-By default, the `CartTrainer` runs in a single thread.
-The `featureParallel(nThread)` method can be called before calling `train()` to parallelize training using `nThread` workers:
-
-```lua
-local nThread = 3
-trainer:featureParallel(nThread)
-trainer:train(rootTreeState, activeFeatures)
-```
-
-Feature parallelization assigns a subset of the feature IDs to each thread.
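-Internally, features are sharded by ID (see `TreeState:findBestSplit`): worker `shardId` of `nShard` considers a `featureId` only when the following holds.
-
-```lua
-(nShard <= 1) or ((featureId % nShard) + 1 == shardId)
-```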
-
-The `CartTrainer` can be used as a stand-alone tree trainer.
-But it is recommended to use it within the context of a `RandomForestTrainer` or `GradientBoostTrainer` instead.
-The resulting ensembles typically generalize better.
-
-## RandomForestTrainer
-
-The `RandomForestTrainer` is used to train a random forest:
-
-```lua
-local nExample = trainSet:size()
-local opt = {
- activeRatio=0.5,
- featureBaggingSize=5,
- nTree=14,
- maxLeafNodes=nExample/2,
- minLeafSize=nExample/10,
-}
-local trainer = dt.RandomForestTrainer(opt)
-local forest = trainer:train(trainSet, trainSet.featureIds)
-```
-
-The returned `forest` is a `DecisionForest` instance.
-A `DecisionForest` has a similar interface to the `CartTree`.
-Indeed, they both sub-class the `DecisionTree` abstract class.
-
-The constructor takes a single `opt` table argument, which contains the actual arguments:
-
- * `activeRatio` is the ratio of active examples per tree. This is used for bootstrap sampling.
- * `featureBaggingSize` is the number of features per tree. This is used for feature bagging.
- * `nTree` is the number of trees to be trained.
- * `maxLeafNodes` and `minLeafSize` are passed to the underlying `CartTrainer` constructor (controls regularization).
-
-Internally, the `RandomForestTrainer` passes a `GiniState` to the `CartTrainer:train()` method.
-
-Training can be parallelized by calling `treeParallel(nThread)`:
-
-```lua
-local nThread = 3
-trainer:treeParallel(nThread)
-local forest = trainer:train(trainSet, trainSet.featureIds)
-```
-
-Each tree is then trained in its own worker thread.
-
-## GradientBoostTrainer
-
-References:
- * A. [Boosted Tree presentation](https://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf)
-
-Gradient boosted decision trees (GBDT) can be trained as follows:
-```lua
-local nExample = trainSet:size()
-local maxLeafNode, minLeafSize = nExample/2, nExample/10
-local cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNode)
-
-local opt = {
- lossFunction=nn.LogitBoostCriterion(false),
- treeTrainer=cartTrainer,
- shrinkage=0.1,
- downsampleRatio=0.8,
- featureBaggingSize=-1,
- nTree=14,
- evalFreq=8,
- earlyStop=0
-}
-
-local trainer = dt.GradientBoostTrainer(opt)
-local forest = trainer:train(trainSet, trainSet.featureIds, validSet)
-```
-
-The above code snippet uses the `LogitBoostCriterion` outlined in reference A.
-It is used for training binary classification trees.
-
-The returned `forest` is a `DecisionForest` instance.
-A `DecisionForest` has a similar interface to the `CartTree`.
-Indeed, they both sub-class the `DecisionTree` abstract class.
-
-The constructor takes a single `opt` table argument, which contains the actual arguments:
-
- * `lossFunction` is an `nn.Criterion` instance extended with the `updateHessInput(input, target)` and `backward2(input, target)` methods, which return the hessian with respect to the `input`.
- * `treeTrainer` is a `CartTrainer` instance. Its `featureParallel()` method can be called to implement feature parallelization.
- * `shrinkage` is the weight of each additional tree.
- * `downsampleRatio` is the ratio of examples to be sampled for each tree. Used for bootstrap sampling.
- * `featureBaggingSize` is the number of features to sample per tree. Used for feature bagging. `-1` defaults to `torch.round(math.sqrt(featureIds:size(1)))`.
- * `nTree` is the maximum number of trees.
- * `evalFreq` is the number of epochs between calls to `validate()` for cross-validation and early-stopping.
- * `earlyStop` is the maximum number of epochs to wait for early-stopping.
-
-Internally, the `GradientBoostTrainer` passes a `GradientBoostState` to the `CartTrainer:train()` method.
-
-## TreeState
-
-An abstract class that holds the state of a subtree during decision tree training.
-It also manages the state of candidate splits.
-
-```lua
-local treeState = dt.TreeState(exampleIds)
-```
-
-The `exampleIds` argument is a `LongTensor` containing the example IDs that make up the sub-tree.
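-
-Concrete subclasses implement the abstract methods declared in `TreeState.lua`.
-A minimal skeleton for a hypothetical `dt.MyState` (the bodies are placeholders, not a working splitter):
-
-```lua
-local MyState, parent = torch.class("dt.MyState", "dt.TreeState", dt)
-
--- computes and returns the score of the node based on its examples
-function MyState:score(dataset) end
-
--- initializes the split state; initially all examples are in the left branch
-function MyState:initialize(exampleIdsWithFeature, dataset) end
-
--- shifts one example from the left to the right branch
-function MyState:update(exampleId, dataset) end
-
--- returns the SplitInfo determined by the current split state
-function MyState:computeSplitInfo(splitFeatureId, splitFeatureValue) end
-```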
-
-## GiniState
-
-A `TreeState` subclass used internally by the `RandomForestTrainer`.
-Uses Gini impurity to determine how to split trees.
-
-```lua
-local treeState = dt.GiniState(exampleIds)
-```
-
-The `exampleIds` argument is a `LongTensor` containing the example IDs that make up the sub-tree.
-
-## GradientBoostState
-
-A `TreeState` subclass used internally by the `GradientBoostTrainer`.
-It implements the GBDT splitting algorithm, which uses a loss function.
-
-```lua
-local treeState = dt.GradientBoostState(exampleIds, lossFunction)
-```
-
-The `exampleIds` argument is a `LongTensor` containing the example IDs that make up the sub-tree.
-The `lossFunction` is an `nn.Criterion` instance (see `GradientBoostTrainer`).
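-
-For reference, `benchmark.lua` exercises this state directly; a condensed sketch (`initScore` initializes the example scores that the loss operates on):
-
-```lua
-trainSet:initScore()
-local treeState = dt.GradientBoostState(trainSet:getExampleIds(), nn.LogitBoostCriterion(false))
--- args: dataset, featureIds, minLeafSize, shardId, nShard
-local splitInfo = treeState:findBestSplit(trainSet, trainSet.featureIds, 10, 1, 1)
-```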
-
-
-## WorkPool
-
-Utility class that simplifies construction of a pool of daemon threads with which to execute tasks in parallel.
-
-```lua
-local workpool = dt.WorkPool(nThread)
-```
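-
-A sketch of typical usage, based on the task names handled by the worker loop in `WorkPool.lua`:
-
-```lua
-local workpool = dt.WorkPool(2)
--- make a package available in every worker thread
-workpool:update('require', {libname='decisiontree', varname='dt'})
--- run a task closure in a worker; writeup() serializes its upvalues
-workpool:writeup('execute', function(store, workerId) return workerId * 10 end)
-local taskname, result = workpool:read()
-assert(taskname == 'execute')
-workpool:terminate()
-```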
-
-## CartTree
-
-Implements a trained CART decision tree:
-
-```lua
-local tree = nn.CartTree(rootNode)
-```
-
-The `rootNode` is a `CartNode` instance.
-Each `CartNode` contains pointers to left and right branches, which are themselves `CartNode` instances.
-
-For inference, use the `score(input)` method:
-
-```lua
-local score = tree:score(input)
-```
diff --git a/contrib/lua-torch/decisiontree/RandomForestTrainer.lua b/contrib/lua-torch/decisiontree/RandomForestTrainer.lua
deleted file mode 100644
index 41040b25b..000000000
--- a/contrib/lua-torch/decisiontree/RandomForestTrainer.lua
+++ /dev/null
@@ -1,159 +0,0 @@
-local dt = require "decisiontree._env"
-
-local RandomForestTrainer = torch.class("dt.RandomForestTrainer", dt)
-
-function RandomForestTrainer:__init(opt)
- assert(torch.type(opt.nTree) == 'number')
- assert(opt.nTree > 0)
- self.nTree = opt.nTree
- -- max number of leaf nodes per tree
- assert(torch.type(opt.maxLeafNodes) == 'number')
- assert(opt.maxLeafNodes > 0)
- self.maxLeafNodes = opt.maxLeafNodes
- -- min number of examples per leaf
- assert(torch.type(opt.minLeafSize) == 'number')
- assert(opt.minLeafSize > 0)
- self.minLeafSize = opt.minLeafSize
-
- -- when non-positive, defaults to sqrt(#feature)
- assert(torch.type(opt.featureBaggingSize) == 'number')
- self.featureBaggingSize = opt.featureBaggingSize
-
- assert(torch.type(opt.activeRatio) == 'number')
- assert(opt.activeRatio > 0)
- self.activeRatio = opt.activeRatio
-
- -- default parallelization is singlethread
- self.parallelMode = 'singlethread'
-end
-
--- Train a DecisionForest
-function RandomForestTrainer:train(trainSet, featureIds, verbose)
- assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
- assert(torch.type(featureIds) == 'torch.LongTensor')
-
- if verbose then print(string.format("Begin training Decision Forest with %d trees", self.nTree)) end
-
- local weight = torch.Tensor(self.nTree):fill(1 / self.nTree) -- RF uses uniform weights
-
- local trees
- if self.parallelMode == 'singlethread' then
- trees = self:trainTrees(trainSet, featureIds, verbose)
- elseif self.parallelMode == 'treeparallel' then
- trainSet:deleteIndex() -- prevents serialization bottleneck
- trees = self:trainTreesTP(trainSet, featureIds, verbose)
- else
- error("Unrecognized parallel mode: " .. self.parallelMode)
- end
-
- if verbose then print(string.format("Successfully trained %d trees", #trees)) end
-
- -- set bias
-   local bias = 0
- for i, tree in ipairs(trees) do
- bias = bias + tree.root.score * weight[i]
- end
-
- return dt.DecisionForest(trees, weight, bias)
-end
-
-function RandomForestTrainer:trainTrees(trainSet, featureIds, verbose)
-
- -- the same CartTrainer will be used for each tree
- local cartTrainer = dt.CartTrainer(trainSet, self.minLeafSize, self.maxLeafNodes)
-
- local trees = {}
- for treeId=1,self.nTree do
- -- Train a CartTree
- local tree = self.trainTree(cartTrainer, featureIds, self.featureBaggingSize, self.activeRatio, treeId, verbose)
- table.insert(trees, tree)
- end
- return trees
-end
-
--- static function that returns a cartTree
-function RandomForestTrainer.trainTree(cartTrainer, featureIds, baggingSize, activeRatio, treeId, verbose)
- assert(torch.isTypeOf(cartTrainer, 'dt.CartTrainer'))
- assert(torch.type(featureIds) == 'torch.LongTensor')
- local baggingSize = baggingSize > 0 and baggingSize or torch.round(math.sqrt(featureIds:size(1)))
-
- if verbose then
-      print(string.format("Tree %d: Creating feature bootstrap sample with baggingSize %d, nFeatures %d", treeId, baggingSize, featureIds:size(1)))
- end
-
- local trainSet = cartTrainer.dataset
-
-   -- sample bootstrap features
- local baggingIndices = torch.LongTensor(baggingSize):random(1,featureIds:size(1))
- local activeFeatures = featureIds:index(1, baggingIndices)
-
-   -- sample bootstrap examples
- local sampleSize = torch.round(trainSet:size() * activeRatio)
-   if verbose then print(string.format("Creating bootstrap sample of size %d", sampleSize)) end
-
- baggingIndices:resize(sampleSize):random(1,trainSet:size())
- local bootStrapExampleIds = torch.LongTensor()
- bootStrapExampleIds:index(trainSet:getExampleIds(), 1, baggingIndices)
-
- local cartTree = cartTrainer:train(dt.GiniState(bootStrapExampleIds), activeFeatures)
-
-   if verbose then print(string.format("Completed processing tree number %d", treeId)) end
-
- return cartTree
-end
-
--- call before training to enable tree-level parallelization
-function RandomForestTrainer:treeParallel(workPool)
- assert(self.parallelMode == 'singlethread', self.parallelMode)
- self.parallelMode = 'treeparallel'
- self.workPool = torch.type(workPool) == 'number' and dt.WorkPool(workPool) or workPool
- assert(torch.isTypeOf(self.workPool, 'dt.WorkPool'))
-
- -- require the dt package
- self.workPool:update('require', {libname='decisiontree',varname='dt'})
-end
-
--- TP is for tree parallel (not toilet paper)
-function RandomForestTrainer:trainTreesTP(trainSet, featureIds, verbose)
- assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
- assert(torch.type(featureIds) == 'torch.LongTensor')
- local minLeafSize = self.minLeafSize
- local maxLeafNodes = self.maxLeafNodes
-
- -- setup worker store (each worker will have its own cartTrainer)
- self.workPool:updateup('execute', function(store)
- local dt = require 'decisiontree'
-
- store.cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNodes)
- store.featureIds = featureIds
- end)
-
- for treeId=1,self.nTree do
- -- upvalues
- local baggingSize = self.featureBaggingSize
- local activeRatio = self.activeRatio
- -- task closure that will be executed in worker-thread
- local function trainTreeTask(store)
- local dt = require 'decisiontree'
- return dt.RandomForestTrainer.trainTree(store.cartTrainer, store.featureIds, baggingSize, activeRatio, treeId, verbose)
- end
- self.workPool:writeup('execute', trainTreeTask)
- end
-
- local trees = {}
- for treeId=1,self.nTree do
- local taskname, tree = self.workPool:read()
- assert(taskname=='execute')
- assert(torch.isTypeOf(tree, 'dt.CartTree'))
- table.insert(trees, tree)
- end
- return trees
-end
-
-function RandomForestTrainer:getName()
- return string.format(
- "randomforest-aRatio-%4.2f-maxLeaf-%d-minExample-%d-nTree-%d",
- self.activeRatio, self.maxLeafNodes, self.minLeafSize, self.nTree
- )
-end
-
diff --git a/contrib/lua-torch/decisiontree/Sparse2Dense.lua b/contrib/lua-torch/decisiontree/Sparse2Dense.lua
deleted file mode 100644
index 4e5b79d2f..000000000
--- a/contrib/lua-torch/decisiontree/Sparse2Dense.lua
+++ /dev/null
@@ -1,88 +0,0 @@
-local S2D, parent = torch.class("nn.Sparse2Dense", "nn.Module")
-local dt = require 'decisiontree._env'
-
-function S2D:__init(features)
- parent.__init(self)
- if torch.type(features) == 'table' then
- assert(#features > 0)
- features = torch.LongTensor(features)
- end
- assert(torch.isTensor(features))
- self.features = features
- self.featureMap = nil
- self.masks = {}
- self.mappedKeys = {}
-end
-
-function S2D:updateOutput(input)
- if not self.featureMap then
- self.featureMap = dt.HashMap()
- self.featureMap:fill(self.features)
- end
- local batched, keys, values
- if torch.isTensor(input[1]) then
- keys = {input[1]}
- values = {input[2]}
- batched = false
- else
- keys = input[1]
- values = input[2]
- batched = true
- end
- assert(#keys == #values)
-
- local masks = self.masks
- local mappedKeys = self.mappedKeys
- local nKeys = #keys
- local nMasks = #masks
- if nMasks < nKeys then
- for i=nMasks+1,nKeys do
- masks[i] = torch.ByteTensor()
- mappedKeys[i] = torch.LongTensor()
- end
- elseif nMasks > nKeys then
- for i=nKeys+1,nMasks do
- masks[i] = nil
- mappedKeys[i] = nil
- end
- end
-
- self.featureMap:get(keys, mappedKeys, masks)
- self.output = self.output or torch.Tensor():type(self._type)
- self.output.nn.S2D_computeOutput(self.output, mappedKeys, values, masks, self.features)
- if not batched then
- self.output = self.output:view(-1)
- end
- return self.output
-end
-
-function S2D:type(type, tensorCache)
- if type then
- local features = self.features
- self.features = nil
- parent.type(self, type, tensorCache)
- self.features = features
- return self
- else
- return parent.type(self)
- end
-end
-
-function S2D:updateGradInput(input, gradOutput)
- error"Not Implemented"
-end
-
-function S2D:reset()
- parent.reset(self)
- self.featureMap = nil
-end
-
-function S2D:write(file)
- self.featureMap = nil
- parent.write(self, file)
-end
-
-function S2D:read(file)
- self.featureMap = nil
- parent.read(self, file)
-end
diff --git a/contrib/lua-torch/decisiontree/SparseTensor.lua b/contrib/lua-torch/decisiontree/SparseTensor.lua
deleted file mode 100644
index 4c620e618..000000000
--- a/contrib/lua-torch/decisiontree/SparseTensor.lua
+++ /dev/null
@@ -1,54 +0,0 @@
-
-local SparseTensor = torch.class("torch.SparseTensor")
-
-function SparseTensor:__init(keys, values)
- if keys and values then
- assert(torch.typename(keys):find('torch%..*LongTensor'))
- assert(torch.isTensor(values))
- assert(keys:nElement() == values:nElement(), "Expecting key and value tensors of same size")
- self.keys = keys
- self.values = values
- elseif not (keys or values) then
- self.keys = torch.LongTensor()
- self.values = torch.Tensor()
- else
- error"Expecting zero or two args"
- end
-end
-
-function SparseTensor:buildIndex(overwrite)
- if self._map and not overwrite then return end
- assert(self.keys and self.keys:dim() == 1)
- assert(self.values and self.values:dim() == 1)
- -- hash table
- self._map = {}
- for i=1,self.keys:size(1) do
- self._map[self.keys[i]] = i
- end
-end
-
-function SparseTensor:deleteIndex()
- self._map = nil
-end
-
-local __index = SparseTensor.__index
-function SparseTensor:__index(key)
- if key == nil then
- error"Attempt to index using a nil key"
- elseif torch.type(key) ~= 'number' then
- return __index(self, key)
- end
-
- if self._map then
- assert(torch.type(self._map) == 'table')
- local idx = self._map[key]
- return idx and self.values[idx] or nil
- elseif self.keys:nElement() > 0 then
- for i=1,self.keys:size(1) do
- if self.keys[i] == key then
- return self.values[i]
- end
- end
- end
- return nil
-end \ No newline at end of file
diff --git a/contrib/lua-torch/decisiontree/TreeState.lua b/contrib/lua-torch/decisiontree/TreeState.lua
deleted file mode 100644
index 3928649fd..000000000
--- a/contrib/lua-torch/decisiontree/TreeState.lua
+++ /dev/null
@@ -1,191 +0,0 @@
-local dt = require "decisiontree._env"
-
-local TreeState = torch.class("dt.TreeState", dt)
-
--- Holds the state of a subtree during decision tree training.
--- Also, manages the state of candidate splits
-function TreeState:__init(exampleIds)
- assert(torch.type(exampleIds) == 'torch.LongTensor')
- self.exampleIds = exampleIds
-
- self.nExampleInLeftBranch = 0
- self.nExampleInRightBranch = 0
-end
-
--- computes and returns the score of the node based on its examples
-function TreeState:score(dataset)
- error"NotImplemented"
-end
-
-
--- Initializes the split-state-updater. Initially all examples are in the left branch.
--- exampleIdsWithFeature is the list of examples to split (those having a particular feature)
-function TreeState:initialize(exampleIdsWithFeature, dataset)
- error"NotImplemented"
-end
-
--- Update the split state. This call has the effect of shifting the example from the left to the right branch.
-function TreeState:update(exampleId, dataset)
- error"NotImplemented"
-end
-
--- Computes the SplitInfo determined by the current split state
--- @param splitFeatureId the feature id of the split feature
--- @param splitFeatureValue the feature value of the split feature
--- @return the SplitInfo determined by the current split state
-function TreeState:computeSplitInfo(splitFeatureId, splitFeatureValue)
- error"NotImplemented"
-end
-
--- bottleneck
-function TreeState:findBestFeatureSplit(dataset, featureId, minLeafSize)
- local dt = require "decisiontree"
- assert(torch.isTypeOf(dataset, 'dt.DataSet'))
- assert(torch.type(featureId) == 'number')
- assert(torch.type(minLeafSize) == 'number')
-
-   -- all dataset examples having this feature, sorted by value
- local featureExampleIds = dataset:getSortedFeature(featureId)
-
- local buffer = dt.getBufferTable('TreeState')
- buffer.longtensor = buffer.longtensor or torch.LongTensor()
- local exampleIdsWithFeature = buffer.longtensor
-
- -- map and tensor of examples containing feature:
- local exampleMap = {}
- local getExampleFeatureValue
-
- local j = 0
- if torch.type(dataset.input) == 'table' then
- exampleIdsWithFeature:resize(self.exampleIds:size())
- self.exampleIds:apply(function(exampleId)
- local input = dataset.input[exampleId]
-      input:buildIndex() -- only builds the index the first time
- if input[featureId] then
- j = j + 1
- exampleIdsWithFeature[j] = exampleId
- exampleMap[exampleId] = j
- end
- end)
- if j == 0 then
- return
- end
- exampleIdsWithFeature:resize(j)
- getExampleFeatureValue = function(exampleId) return dataset.input[exampleId][featureId] end
- else
- exampleIdsWithFeature = self.exampleIds
- self.exampleIds:apply(function(exampleId)
- j = j + 1
- exampleMap[exampleId] = j
- end)
- local featureValues = dataset.input:select(2,featureId)
- getExampleFeatureValue = function(exampleId) return featureValues[exampleId] end
- end
-
-
- self:initialize(exampleIdsWithFeature, dataset)
-
- -- bottleneck
- local bestSplit, previousSplitValue, _tictoc
- for i=featureExampleIds:size(1),1,-1 do -- loop over examples sorted (desc) by feature value
- local exampleId = featureExampleIds[i]
-
- local exampleIdx = exampleMap[exampleId]
- if exampleIdx then
- local splitValue = getExampleFeatureValue(exampleId)
-
- if previousSplitValue and math.abs(splitValue - previousSplitValue) > dt.EPSILON then
- local splitInfo = self:computeSplitInfo(featureId, previousSplitValue, _tictoc)
- if (splitInfo.leftChildSize >= minLeafSize) and (splitInfo.rightChildSize >= minLeafSize) then
-
- if (not bestSplit) or (splitInfo.splitGain < bestSplit.splitGain) then
- _tictoc = bestSplit or {} -- reuse table
- bestSplit = splitInfo
- end
-
- end
- end
-
- previousSplitValue = splitValue
-
- -- bottleneck
- self:update(exampleId, dataset, exampleIdx)
- end
- end
-
- return bestSplit
-end
-
--- finds the best split of examples in treeState among featureIds
-function TreeState:findBestSplit(dataset, featureIds, minLeafSize, shardId, nShard)
- assert(torch.isTypeOf(dataset, 'dt.DataSet'))
- assert(torch.type(featureIds) == 'torch.LongTensor')
- assert(torch.type(minLeafSize) == 'number')
- assert(torch.type(shardId) == 'number')
- assert(torch.type(nShard) == 'number')
-
- local bestSplit
- for i=1,featureIds:size(1) do
- local featureId = featureIds[i]
- if (nShard <= 1) or ( (featureId % nShard) + 1 == shardId ) then -- feature sharded
- local splitCandidate = self:findBestFeatureSplit(dataset, featureId, minLeafSize)
- if splitCandidate and ((not bestSplit) or (splitCandidate.splitGain < bestSplit.splitGain)) then
- bestSplit = splitCandidate
- end
- end
- end
-
- return bestSplit
-end
-
--- Partitions self given a splitInfo table, producing a pair of exampleIds corresponding to the left and right subtrees.
-function TreeState:_branch(splitInfo, dataset)
- local leftIdx, rightIdx = 0, 0
- local nExample = self.exampleIds:size(1)
- local splitExampleIds = torch.LongTensor(nExample)
-
-
- for i=1,self.exampleIds:size(1) do
- local exampleId = self.exampleIds[i]
- local input = dataset.input[exampleId]
- local val = input[splitInfo.splitId]
-      -- Note: when the feature is not present in the example, the example is dropped from all sub-trees.
-      -- This means that for most sparse data, a tree cannot reach 100% accuracy...
- if val then
- if val < splitInfo.splitValue then
- leftIdx = leftIdx + 1
- splitExampleIds[leftIdx] = exampleId
- else
- rightIdx = rightIdx + 1
- splitExampleIds[nExample-rightIdx+1] = exampleId
- end
- end
- end
-
- local leftExampleIds = splitExampleIds:narrow(1,1,leftIdx)
- local rightExampleIds = splitExampleIds:narrow(1,nExample-rightIdx+1,rightIdx)
-
- assert(leftExampleIds:size(1) + rightExampleIds:size(1) <= self.exampleIds:size(1), "Left and right branches contain more data than the parent!")
- return leftExampleIds, rightExampleIds
-end
-
--- calls _branch and encapsulates the left and right exampleIds into two TreeStates
-function TreeState:branch(splitInfo, dataset)
- local leftExampleIds, rightExampleIds = self:_branch(splitInfo, dataset)
- return self.new(leftExampleIds), self.new(rightExampleIds)
-end
-
-function TreeState:size()
- return self.exampleIds:size(1)
-end
-
-function TreeState:contains(exampleId)
- local found = false
- self.exampleIds:apply(function(x)
- if x == exampleId then
- found = true
- end
- end)
- return found
-end
-
diff --git a/contrib/lua-torch/decisiontree/WorkPool.lua b/contrib/lua-torch/decisiontree/WorkPool.lua
deleted file mode 100644
index 8f473727e..000000000
--- a/contrib/lua-torch/decisiontree/WorkPool.lua
+++ /dev/null
@@ -1,156 +0,0 @@
-local dt = require "decisiontree._env"
-
--- Utility to simplify construction of a pool of daemon threads with which to execute tasks in parallel.
-local WorkPool = torch.class("dt.WorkPool", dt)
-
-function WorkPool:__init(nThread)
- self.nThread = nThread or 16
- assert(torch.type(self.nThread) == 'number')
- assert(self.nThread > 0)
-
- self:initialize()
-end
-
-function WorkPool:initialize()
- local ipc = require 'libipc'
- self.queuename = os.tmpname()
- self.queue = ipc.workqueue(self.queuename)
- self.queues = {}
- for i=1,self.nThread do
- self.queues[i] = ipc.workqueue(self.queuename.."/"..i)
- end
-
- -- spawn thread workers
- ipc.map(self.nThread, function(queuename, nThread, myId)
- assert(nThread)
- assert(myId)
- local ipc = require 'libipc'
-
- -- Open the queue by name (the main thread already created it)
- local mainqueue = ipc.workqueue(queuename)
- local workqueue = ipc.workqueue(queuename.."/"..myId)
-
-      local taskname, task
-
- local store = {}
- local queue = mainqueue
-
- repeat
- local msg = queue:read()
- assert(torch.type(msg) == 'table')
- taskname, task = unpack(msg)
- if taskname == nil then
- break
- elseif torch.type(taskname) ~= 'string' then
- error("Expecting taskname string. Got "..torch.type(taskname))
- elseif taskname == 'storeKeyValue' then
- assert(torch.type(task) == 'table')
- assert(queue == workqueue)
- store[task.key] = task.value
- queue:write({taskname})
- elseif taskname == 'storeKeysValues' then
- assert(torch.type(task) == 'table')
- assert(queue == workqueue)
- for key,value in pairs(task) do
- store[key] = value
- end
- queue:write({taskname})
- elseif taskname == 'require' then
- assert(torch.type(task) == 'table')
- assert(torch.type(task.libname) == 'string')
- assert(torch.type(task.varname) == 'string')
- _G[task.varname] = require(task.libname)
- assert(queue == workqueue)
- queue:write({taskname})
- elseif taskname == 'storeReset' then
- store = {}
- mainqueue:write({taskname})
- elseif taskname == 'echo' then
- mainqueue:write({taskname, task})
- elseif taskname == 'readWorkerQueue' then
- queue = workqueue
- elseif taskname == 'readMainQueue' then
- queue = mainqueue
- elseif taskname == 'execute' then
- if torch.type(task) == 'table' then
- assert(task.func and task.args)
- queue:write({taskname, task.func(store, task.args, myId)})
- else
- assert(torch.type(task) == 'function')
- queue:write({taskname, task(store, myId)})
- end
- else
- error("Unknown taskname: "..taskname)
- end
- until taskname == nil
- end, self.queuename, self.nThread)
-
-end
-
--- Terminates all daemon threads.
-function WorkPool:terminate()
- for i=1,self.nThread do
- self.queue:write({})
- end
-end
-
--- this function is used to update the store of data in each worker thread
-function WorkPool:_update(taskname, task, upval)
- assert(torch.type(taskname) == 'string')
- local _ = require 'moses'
- assert(_.contains({'storeKeyValue','storeKeysValues','require','execute'}, taskname))
- assert(torch.type(task) == 'table' or torch.type(task) == 'function')
-
- -- tell the workers to read their individual queue
- for i=1,self.nThread do
- self.queue:write({'readWorkerQueue'})
- end
-
- -- write to individual worker queues
- for i=1,self.nThread do
- if upval then
- self.queues[i]:writeup({taskname, task})
- else
- self.queues[i]:write({taskname, task})
- end
- end
-
- -- TODO use ipc.mutex:barrier(nThread+1)
- -- barrier: make sure that every worker has completed task by reading their queue
- for i=1,self.nThread do
- assert(self.queues[i]:read()[1] == taskname)
- end
-
- -- finally, tell them to read the main queue
- for i=1,self.nThread do
- self.queues[i]:write({'readMainQueue'})
- end
-end
-
-function WorkPool:update(taskname, task)
- return self:_update(taskname, task, false)
-end
-
-function WorkPool:updateup(taskname, task)
- return self:_update(taskname, task, true)
-end
-
-function WorkPool:write(taskname, task)
- assert(torch.type(taskname) == 'string')
-   assert(taskname ~= 'storeKeyValue' and taskname ~= 'storeKeysValues')
- self.queue:write({taskname, task})
-end
-
-function WorkPool:writeup(taskname, task)
- assert(torch.type(taskname) == 'string')
-   assert(taskname ~= 'storeKeyValue' and taskname ~= 'storeKeysValues')
- self.queue:writeup({taskname, task})
-end
-
-function WorkPool:read()
- local res = self.queue:read()
- assert(torch.type(res) == 'table')
-   assert(torch.type(res[1]) == 'string')
- return unpack(res)
-end
-
diff --git a/contrib/lua-torch/decisiontree/_env.lua b/contrib/lua-torch/decisiontree/_env.lua
deleted file mode 100644
index a92770152..000000000
--- a/contrib/lua-torch/decisiontree/_env.lua
+++ /dev/null
@@ -1,5 +0,0 @@
-
--- https://github.com/torch/torch7/issues/525
-
-local dl = {}
-return dl \ No newline at end of file
diff --git a/contrib/lua-torch/decisiontree/benchmark.lua b/contrib/lua-torch/decisiontree/benchmark.lua
deleted file mode 100644
index 2b6a03dc6..000000000
--- a/contrib/lua-torch/decisiontree/benchmark.lua
+++ /dev/null
@@ -1,171 +0,0 @@
-local dt = require "decisiontree._env"
-
-local bm = {}
-function bm.CartTrainer(opt)
- local timer = torch.Timer()
- local trainSet, validSet = dt.getSparseDummyData(opt)
- print(string.format("CartTrainer: sparse dataset create: %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
-
- local cartTrainer = dt.CartTrainer(trainSet, opt.minLeafSize, opt.maxLeafNodes)
- local treeState = dt.GiniState(trainSet:getExampleIds())
- timer:reset()
- local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
- print(string.format("CartTrainer: train single-thread : %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
-
- timer:reset()
- cartTrainer:featureParallel(opt.nThread)
- print(string.format("CartTrainer: setup feature-parallel : %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
- timer:reset()
- local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
- print(string.format("CartTrainer: train feature-parallel : %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
-end
-
-function bm.GradientBoostState(opt)
- local trainSet, validSet = dt.getSparseDummyData(opt)
-
- trainSet:initScore()
-
- local treeState = dt.GradientBoostState(trainSet:getExampleIds(), nn.LogitBoostCriterion(false))
-
- local timer = torch.Timer() -- first step also calls SparseTensor:buildIndex()
- treeState:findBestSplit(trainSet, trainSet.featureIds, 10, 1, 3)
- print(string.format("GradientBoostState: findBestSplit (first) : %f sec", timer:time().real))
-
- timer:reset()
- treeState:findBestSplit(trainSet, trainSet.featureIds, 10, 1, 3)
- print(string.format("GradientBoostState: findBestSplit (second) : %f sec", timer:time().real))
-
-end
-
-local function file_exists(name)
- local f=io.open(name,"r")
- if f~=nil then io.close(f) return true else return false end
-end
-
-function bm.GradientBoostTrainer(opt)
- local trainSet, validSet
- if file_exists("/tmp/train.bin") and file_exists("/tmp/valid.bin") then
- trainSet = torch.load("/tmp/train.bin")
- validSet = torch.load("/tmp/valid.bin")
- else
- if opt.sparse then
- trainSet, validSet = dt.getSparseDummyData(opt)
- else
- trainSet, validSet = dt.getDenseDummyData(opt)
- end
- torch.save("/tmp/train.bin", trainSet)
- torch.save("/tmp/valid.bin", validSet)
- end
-
- local cartTrainer = dt.CartTrainer(trainSet, opt.minLeafSize, opt.maxLeafNodes)
- opt.lossFunction = nn.LogitBoostCriterion(false)
- opt.treeTrainer = cartTrainer
- local forestTrainer = dt.GradientBoostTrainer(opt)
-
- local timer = torch.Timer()
- local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds, validSet)
- local time = timer:time().real
- print(string.format("GradientBoostTrainer: train single-thread : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
-
- cartTrainer:featureParallel(opt.nThread)
- timer:reset()
- local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds, validSet)
- local time = timer:time().real
- print(string.format("GradientBoostTrainer: train feature-parallel : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
-end
-
-function bm.RandomForestTrainer(opt)
- local trainSet, validSet = dt.getSparseDummyData(opt)
-
- local forestTrainer = dt.RandomForestTrainer(opt)
- local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
-
- local timer = torch.Timer()
- local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
- local time = timer:time().real
- print(string.format("RandomForestTrainer: train single-thread : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
-
- timer:reset()
- forestTrainer:treeParallel(opt.nThread)
- print(string.format("RandomForestTrainer: setup tree-parallel : %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
-
- timer:reset()
- local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
- local time = timer:time().real
- print(string.format("RandomForestTrainer: train tree-parallel : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
-end
-
-function bm.DFD(opt)
- local _ = require 'moses'
- local opt = _.clone(opt)
- opt.nExample = 200
- local trainSet, validSet = dt.getDenseDummyData(opt)
-
- local forestTrainer = dt.RandomForestTrainer(opt)
- forestTrainer:treeParallel(opt.nThread)
- local timer = torch.Timer()
- local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
- local time = timer:time().real
- print(string.format("DFD: train random forest in parallel : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
-
-
- -- benchmark nn.DFD
- local input = trainSet.input:sub(1,opt.batchsize)
- local dfd = nn.DFD(decisionForest)
- dfd:forward(input)
- timer:reset()
- for i=1,opt.nloop do
- dfd:forward(input)
- end
- print(string.format("DFD: updateOutput : %f samples/sec; %f sec", opt.nloop*opt.batchsize/timer:time().real, timer:time().real))
-end
-
-function bm.Sparse2Dense(opt)
- local _ = require 'moses'
- local opt = _.clone(opt)
- opt.nExample = opt.batchsize
- local trainSet = dt.getSparseDummyData(opt)
-
- local input = {{},{}}
- for i=1,opt.batchsize do
- input[1][i] = trainSet.input[i].keys
- input[2][i] = trainSet.input[i].values
- end
- assert(#input[1] == opt.batchsize)
-
- -- benchmark nn.Sparse2Dense
- local s2d = nn.Sparse2Dense(torch.LongTensor():range(1,opt.nFeature))
- s2d:forward(input)
- local timer = torch.Timer()
- for i=1,opt.nloop do
- s2d:forward(input)
- end
- print(string.format("Sparse2Dense: updateOutput : %f samples/sec; %f sec", opt.nloop*opt.batchsize/timer:time().real, timer:time().real))
-end
-
-function dt.benchmark(benchmarks, opt2)
- local opt = {
- nExample=10000, nCluster=2, nFeature=1000, overlap=0, nValid=100, -- getSparseDummyData
- nTree=20, featureBaggingSize=-1, sparse=true, -- GradientBoostTrainer and RandomForestTrainer
- nThread=2, shrinkage=0.1, downsampleRatio=0.1, evalFreq=5, earlyStop=0, -- GradientBoostTrainer
- activeRatio=0.5, -- RandomForestTrainer
- batchsize=32, nloop=10
- }
-
- local _ = require 'moses'
- benchmarks = benchmarks or _.keys(bm)
- assert(torch.type(benchmarks) == 'table')
- for i,benchmark in ipairs(benchmarks) do
- local opt1 = _.clone(opt)
- for key, value in pairs(opt2 or {}) do
- opt1[key] = value
- end
- opt1.nActive = opt1.nActive or torch.round(opt1.nFeature/10)
- opt1.maxLeafNodes = opt1.maxLeafNodes or (opt1.nExample/10)
- opt1.minLeafSize = opt1.minLeafSize or (opt1.nExample/100)
-
- assert(torch.type(benchmark) == 'string', benchmark)
- assert(bm[benchmark], benchmark)
- bm[benchmark](opt1)
- end
-end
diff --git a/contrib/lua-torch/decisiontree/doc/benchmark.md b/contrib/lua-torch/decisiontree/doc/benchmark.md
deleted file mode 100644
index cb8f905d6..000000000
--- a/contrib/lua-torch/decisiontree/doc/benchmark.md
+++ /dev/null
@@ -1,291 +0,0 @@
-# Benchmarks
-
-This file outlines the roadmap (and the corresponding benchmarks) of optimizations and refactorings over time.
-
-## Baseline
-
-The baseline implementation is very slow.
-We converted the Twitter decision tree library (used internally) from Java to Lua.
-The objective was to replicate the GBDT and Random Forest implementations as is (more or less).
-The Java library is very good and reasonably fast. The same code in Lua is slow.
-The point of this Lua baseline was not to obtain the same computational performance as the Java library.
-Instead, we wanted the training and inference algorithms of the Lua lib to match those of the Java lib.
-As such, the training/validation error of the baseline Lua lib should match that of the Java lib.
-The unit tests seem to validate this claim, as performance on both the training and validation sets is unit tested.
-We also used the conversion exercise as a way to learn about decision tree implementation (our background is deep learning).
-That being said, the baseline performance is terrible:
-
-```
-th -e "dt = require 'decisiontree'; dt.benchmark()"
-CartTrainer: sparse dataset create: 2963.192386 samples/sec; 0.337479 sec
-CartTrainer: train single-thread : 14.165438 samples/sec; 70.594361 sec
-CartTrainer: setup feature-parallel : 5.129034 samples/sec; 194.968478 sec
-CartTrainer: train feature-parallel : 9.736592 samples/sec; 102.705344 sec
-```
-
-The original Java lib had approximately 43 classes.
-The baseline has about 24.
-This reduction is due to obvious merging of classes, but also to conversions of classes into functions.
-The next patches continue this process of reducing the number of classes.
-
-## Patch 1 (complete):
-
-This patch further reduces the number of classes, but adds the DataSet class.
-The code is much simpler to read. Examples are batched.
-
- * [x] examples are batched in dt.DataSet: {input, target, score}
- * [x] deprecate dt.LabeledExample
- * [x] list of examples are replaced with torch.LongTensors of exampleIds
- * [x] merge TreeBrancher into TreeState
- * [x] merge BestSplitFinder and SplitStateUpdater into TreeState
- * [x] TreeState subclasses: GradientBoostState and GiniState
-
-```
-th -e "dt = require 'decisiontree'; dt.benchmark()"
-CartTrainer: sparse dataset create: 3597.392294 samples/sec; 0.277984 sec
-CartTrainer: train single-thread : 35.763255 samples/sec; 27.961663 sec
-CartTrainer: setup feature-parallel : 36759.250495 samples/sec; 0.027220 sec
-CartTrainer: train feature-parallel : 72.523658 samples/sec; 13.788606 sec
-```
-
-The setup time for feature-parallelization improved the most.
-The run-time for feature-parallel training is also about half that of single-thread.
-Since it is using 2 threads, the parallelization is working quite well.
-
-We also added benchmarks for the `RandomForestTrainer` and `GradientBoostTrainer`:
-
-```
-GradientBoostTrainer: train single-thread : 599.895105 samples/sec; 0.083348 sec/tree, 1.666958 sec
-GradientBoostTrainer: train feature-parallel : 974.235273 samples/sec; 0.051322 sec/tree, 1.026446 sec
-RandomForestTrainer: train single-thread : 134.781044 samples/sec; 0.370972 sec/tree, 7.419441 sec
-RandomForestTrainer: setup tree-parallel : 73341.097064 samples/sec; 0.013649 sec
-RandomForestTrainer: train tree-parallel : 262.975891 samples/sec; 0.190131 sec/tree, 3.802630 sec
-```
-
-Looks good.
-
-## Patch 2 (complete):
-
- * [x] dt.LossFunction -> nn.Criterion (LogitBoost is done, missing MSE)
- * [x] use SparseTensor:buildIndex() to accelerate TreeState:findBestSplit()
- * [x] benchmarks use 10000 instead of 1000 examples
-
-The benchmarks indicate good improvements. Most improvements were made possible by the use of `buildIndex`:
-
-```
-th -e "dt = require 'decisiontree'; dt.benchmark()"
-GradientBoostState: findBestSplit (first) : 11.415645 sec
-GradientBoostState: findBestSplit (second) : 11.246336 sec
-CartTrainer: sparse dataset create: 3284.803629 samples/sec; 3.044327 sec
-CartTrainer: train single-thread : 239.544758 samples/sec; 41.745858 sec
-CartTrainer: setup feature-parallel : 10996.443063 samples/sec; 0.909390 sec
-CartTrainer: train feature-parallel : 473.888592 samples/sec; 21.102011 sec
-RandomForestTrainer: train single-thread : 892.985186 samples/sec; 0.559920 sec/tree, 11.198394 sec
-RandomForestTrainer: setup tree-parallel : 176806.252266 samples/sec; 0.056569 sec
-RandomForestTrainer: train tree-parallel : 1377.849291 samples/sec; 0.362884 sec/tree, 7.257688 sec
-GradientBoostTrainer: train single-thread : 2685.485128 samples/sec; 0.186186 sec/tree, 3.723722 sec
-GradientBoostTrainer: train feature-parallel : 3712.313215 samples/sec; 0.134687 sec/tree, 2.693738 sec
-```
-
-The main bottleneck is now in serializing the SparseTensor hash maps. We temporarily overcame this bottleneck by
-deleting indexes when calling `CartTrainer:featureParallel()` and `RandomForestTrainer:treeParallel()`.
-In this way, the indexes are recreated for each thread. Ideally, we would use a C hash map so that a pointer
-could be serialized instead. But `tds.Hash` does not serialize well, so for now we use Lua tables.
-
-This is the benchmark for `GradientBoostTrainer` on a large dataset of dense inputs:
-
-```
-th -e "dt = require 'decisiontree'; dt.benchmark({'GradientBoostTrainer'}, {nExample=100000, sparse=false, nFeature=836, nTree=5, downsampleRatio=1, minLeafSize=1000, maxLeafNodes=8})"
-GradientBoostTrainer: train single-thread : 152.463989 samples/sec; 131.178517 sec/tree, 655.892584 sec
-GradientBoostTrainer: train feature-parallel : 224.288488 samples/sec; 89.170872 sec/tree, 445.854358 sec
-[tw-mbp-nleonard decisiontree]$ th -e "dt = require 'decisiontree'; dt.benchmark({'GradientBoostTrainer'}, {nExample=100000, sparse=false, nFeature=836, nTree=5, downsampleRatio=1, minLeafSize=1000, maxLeafNodes=8,nThread=4})"
-GradientBoostTrainer: train single-thread : 163.836896 samples/sec; 122.072625 sec/tree, 610.363126 sec
-GradientBoostTrainer: train feature-parallel : 407.981442 samples/sec; 49.021838 sec/tree, 245.109188 sec
-```
-
-## Patch 3:
-
-This patch optimizes GBDT for large datasets consisting of dense inputs. The benchmarks:
-
-```
-th -e "dt = require 'decisiontree'; dt.benchmark({'GradientBoostTrainer'}, {nExample=100000, sparse=false, nFeature=836, nTree=5, downsampleRatio=1, minLeafSize=1000, maxLeafNodes=8})"
-GradientBoostTrainer: train single-thread : 547.553407 samples/sec; 36.526117 sec/tree, 182.630587 sec
-GradientBoostTrainer: train feature-parallel : 792.964678 samples/sec; 25.221804 sec/tree, 126.109022 sec
-[tw-mbp-nleonard decisiontree]$ th -e "dt = require 'decisiontree'; dt.benchmark({'GradientBoostTrainer'}, {nExample=100000, sparse=false, nFeature=836, nTree=5, downsampleRatio=1, minLeafSize=1000, maxLeafNodes=8,nThread=4})"
-GradientBoostTrainer: train single-thread : 555.793759 samples/sec; 35.984571 sec/tree, 179.922855 sec
-GradientBoostTrainer: train feature-parallel : 1289.977846 samples/sec; 15.504142 sec/tree, 77.520711 sec
-```
-
-For 1, 2 and 4 threads, the speedups of patch 3 over patch 2 are respectively: 3.39, 3.53, and 3.18.
-For this patch, the multi-threading speedups of 2 and 4 threads over a single thread are respectively 1.42 and 2.33.
-Improvements over the previous patch were obtained by optimizing two aspects:
-
- 1. Optimizing `TreeState.findBestFeatureSplit` for dense datasets (for example: `if dense, then ...`);
- 2. Removing `assert` clauses in `GradientBoostState.update`. The `update` method is called for every (example, feature), making it a major bottleneck.
-
-Converting the `update` to C could lead to further optimizations.
-
-This patch also improves the benchmark on sparse datasets:
-```
-$ th -e "dt = require 'decisiontree'; dt.benchmark()"
-RandomForestTrainer: train single-thread : 1121.311196 samples/sec; 0.445907 sec/tree, 8.918131 sec
-RandomForestTrainer: setup tree-parallel : 168773.323354 samples/sec; 0.059256 sec
-RandomForestTrainer: train tree-parallel : 1701.280938 samples/sec; 0.293896 sec/tree, 5.877924 sec
-GradientBoostState: findBestSplit (first) : 8.250646 sec
-GradientBoostState: findBestSplit (second) : 7.952077 sec
-GradientBoostTrainer: train single-thread : 3355.248596 samples/sec; 0.149020 sec/tree, 2.980405 sec
-GradientBoostTrainer: train feature-parallel : 4399.133369 samples/sec; 0.113659 sec/tree, 2.273175 sec
-CartTrainer: sparse dataset create: 3428.105601 samples/sec; 2.917069 sec
-CartTrainer: train single-thread : 282.172416 samples/sec; 35.439331 sec
-CartTrainer: setup feature-parallel : 9455.440801 samples/sec; 1.057598 sec
-CartTrainer: train feature-parallel : 594.054049 samples/sec; 16.833491 sec
-DFD: train random forest in parallel : 346.831378 samples/sec; 0.288325 sec/tree, 5.766491 sec
-DFD: updateOutput : 831.105546 samples/sec; 0.038509 sec
-```
-
-## Patch 4:
-
-This patch improves `nn.DFD` from
-
-```
-th -e "dt = require 'decisiontree'; dt.benchmark({'DFD'}, {nTree=500,maxLeafNodes=8,minLeafSize=1})"
-DFD: train random forest in parallel : 10.527251 samples/sec; 0.037997 sec/tree, 18.998313 sec
-DFD: updateOutput : 32.442950 samples/sec; 9.863472 sec
-```
-
-to
-
-```
-th -e "dt = require 'decisiontree'; dt.benchmark({'DFD'}, {nTree=500,maxLeafNodes=8,minLeafSize=1})"
-DFD: train random forest in parallel : 10.839547 samples/sec; 0.036902 sec/tree, 18.450956 sec
-DFD: updateOutput : 359.158353 samples/sec; 0.890975 sec
-Sparse2Dense: updateOutput : 15395.648952 samples/sec; 0.020791 sec
-```
-
-That is a 10x speedup for `nn.DFD`.
-
-The patch also adds a benchmark for `nn.Sparse2Dense`:
-
-```
-th -e "dt = require 'decisiontree'; dt.benchmark({'Sparse2Dense'}, {nTree=500,maxLeafNodes=8,minLeafSize=1})"
-Sparse2Dense: updateOutput : 17158.126406 samples/sec; 0.018653 sec
-```
-
-Indeed, `nn.Sparse2Dense` is not the bottleneck; `nn.DFD` is.
-
-## Patch 5:
-
-This patch improves `nn.DFD` inference from
-
-```
-for i in `seq 3`; do th -e "dt = require 'decisiontree'; dt.benchmark({'DFD'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,batchsize=16,nActive=1200,nFeature=1300,nloop=100})"; done
-DFD: train random forest in parallel : 8.452295 samples/sec; 0.047324 sec/tree, 23.662212 sec
-DFD: updateOutput : 176.617872 samples/sec; 9.059109 sec
-DFD: train random forest in parallel : 8.350019 samples/sec; 0.047904 sec/tree, 23.952042 sec
-DFD: updateOutput : 183.508204 samples/sec; 8.718962 sec
-DFD: train random forest in parallel : 8.525779 samples/sec; 0.046917 sec/tree, 23.458266 sec
-DFD: updateOutput : 178.877077 samples/sec; 8.944692 sec
-```
-
-to
-
-```
-for i in `seq 3`; do th -e "dt = require 'decisiontree'; dt.benchmark({'DFD'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,batchsize=16,nActive=1200,nFeature=1300,nloop=100})"; done
-DFD: train random forest in parallel : 8.434502 samples/sec; 0.047424 sec/tree, 23.712129 sec
-DFD: updateOutput : 6479.597179 samples/sec; 0.246933 sec
-DFD: train random forest in parallel : 8.334543 samples/sec; 0.047993 sec/tree, 23.996518 sec
-DFD: updateOutput : 6663.641184 samples/sec; 0.240114 sec
-DFD: train random forest in parallel : 8.353265 samples/sec; 0.047885 sec/tree, 23.942735 sec
-DFD: updateOutput : 6882.607456 samples/sec; 0.232475 sec
-```
-
-That is a 37x speedup for `nn.DFD`.
-
-## Patch 6:
-
-This patch improves `nn.DFD` from the previous result to
-
-```
-for i in `seq 5`; do th -e "dt = require 'decisiontree'; dt.benchmark({'DFD'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,batchsize=16,nActive=1200,nFeature=1300,nloop=10000})"; done
-DFD: train random forest in parallel : 8.353504 samples/sec; 0.047884 sec/tree, 23.942050 sec
-DFD: updateOutput : 91967.342339 samples/sec; 1.739753 sec
-DFD: train random forest in parallel : 8.528141 samples/sec; 0.046904 sec/tree, 23.451770 sec
-DFD: updateOutput : 91405.321702 samples/sec; 1.750451 sec
-DFD: train random forest in parallel : 8.184562 samples/sec; 0.048872 sec/tree, 24.436250 sec
-DFD: updateOutput : 91623.388867 samples/sec; 1.746284 sec
-DFD: train random forest in parallel : 8.779561 samples/sec; 0.045560 sec/tree, 22.780182 sec
-DFD: updateOutput : 93914.242852 samples/sec; 1.703686 sec
-DFD: train random forest in parallel : 8.636201 samples/sec; 0.046317 sec/tree, 23.158330 sec
-DFD: updateOutput : 94092.241963 samples/sec; 1.700465 sec
-```
-
-That is another 13.8x speedup.
-
-## Patch 7:
-
-This patch improves `nn.Sparse2Dense` computation from
-
-```
-for i in `seq 3`; do th -e "dt = require 'decisiontree'; torch.setdefaulttensortype('torch.FloatTensor'); dt.benchmark({'Sparse2Dense'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,nFeature=1500,nActive=1300,nloop=1000})"; done
-Sparse2Dense: updateOutput : 1103.570777 samples/sec; 28.996786 sec
-Sparse2Dense: updateOutput : 1092.064331 samples/sec; 29.302309 sec
-Sparse2Dense: updateOutput : 1036.963572 samples/sec; 30.859334 sec
-```
-
-to
-
-```
-for i in `seq 3`; do th -e "dt = require 'decisiontree'; torch.setdefaulttensortype('torch.FloatTensor'); dt.benchmark({'Sparse2Dense'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,nFeature=1500,nActive=1300,nloop=1000})"; done
-Sparse2Dense: updateOutput : 62995.834470 samples/sec; 0.507978 sec
-Sparse2Dense: updateOutput : 62471.568253 samples/sec; 0.512242 sec
-Sparse2Dense: updateOutput : 62965.099331 samples/sec; 0.508226 sec
-```
-
-This represents a speedup of about 57x.
-
-## Patch 8:
-
-This patch improves `nn.Sparse2Dense` from the previous result to
-
-```
-for i in `seq 3`; do th -e "dt = require 'decisiontree'; torch.setdefaulttensortype('torch.FloatTensor'); dt.benchmark({'Sparse2Dense'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,nFeature=1500,nActive=1300,nloop=1000})"; done
-Sparse2Dense: updateOutput : 124268.079914 samples/sec; 0.257515 sec
-Sparse2Dense: updateOutput : 114750.039542 samples/sec; 0.278873 sec
-Sparse2Dense: updateOutput : 122863.314766 samples/sec; 0.260458 sec
-```
-
-which corresponds to another 1.95x speedup.
-
-## Patch 9:
-
-This patch moves the core of GBDT training, which used to be a big bottleneck, to C. It also
-performs small optimizations across the board (faster scoring, faster branching, ...) that provide a
-little more performance.
-
-The original commit had this performance:
-
-```
-th -e "dt = require 'decisiontree'; torch.setdefaulttensortype('torch.FloatTensor'); dt.benchmark({'GradientBoostTrainer'}, {nExample=100000, sparse=false, nFeature=836, nTree=5, downsampleRatio=1, minLeafSize=1000, maxLeafNodes=8})"
-GradientBoostTrainer: train single-thread : 500.414666 samples/sec; 39.966854 sec/tree, 199.834271 sec
-GradientBoostTrainer: train feature-parallel : 1227.228044 samples/sec; 16.296890 sec/tree, 81.484448 sec (4 threads)
-GradientBoostTrainer: train feature-parallel : 1385.926280 samples/sec; 14.430782 sec/tree, 72.153910 sec (8 threads)
-```
-
-and the new version has
-
-```
-GradientBoostTrainer: train single-thread : 15285.644631 samples/sec; 1.308417 sec/tree, 6.542086 sec
-GradientBoostTrainer: train feature-parallel : 43170.435932 samples/sec; 0.463280 sec/tree, 2.316400 sec (4 threads)
-GradientBoostTrainer: train feature-parallel : 50062.681239 samples/sec; 0.399499 sec/tree, 1.997496 sec (8 threads)
-```
-
-That represents a speedup of about 30.5x over the baseline for 1 thread and 36.1x for 8 threads.
-Note that performance doesn't increase much as we add threads: we use feature parallelism, and
-bagging leaves few features to evaluate per split (29 in this case, plausibly √836 rounded), so
-there are at most 29 independent units of work to spread across threads.
-If we disable bagging, then we have the following result with 8 threads and the new code:
-
-```
-GradientBoostTrainer: train single-thread : 590.823965 samples/sec; 33.851030 sec/tree, 169.255152 sec
-GradientBoostTrainer: train feature-parallel : 3232.188576 samples/sec; 6.187758 sec/tree, 30.938789 sec
-```
-
-So processing 836 features now is much faster than processing 29 before.
diff --git a/contrib/lua-torch/decisiontree/error.h b/contrib/lua-torch/decisiontree/error.h
deleted file mode 100644
index 18df3c939..000000000
--- a/contrib/lua-torch/decisiontree/error.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef _ERROR_H_
-#define _ERROR_H_
-
-#include "luaT.h"
-#include <string.h>
-
-static inline int _lua_error(lua_State *L, int ret, const char* file, int line) {
- int pos_ret = ret >= 0 ? ret : -ret;
- return luaL_error(L, "ERROR: (%s, %d): (%d, %s)\n", file, line, pos_ret, strerror(pos_ret));
-}
-
-static inline int _lua_error_str(lua_State *L, const char *str, const char* file, int line) {
- return luaL_error(L, "ERROR: (%s, %d): (%s)\n", file, line, str);
-}
-
-static inline int _lua_error_str_str(lua_State *L, const char *str, const char* file, int line, const char *extra) {
- return luaL_error(L, "ERROR: (%s, %d): (%s: %s)\n", file, line, str, extra);
-}
-
-#define LUA_HANDLE_ERROR(L, ret) _lua_error(L, ret, __FILE__, __LINE__)
-#define LUA_HANDLE_ERROR_STR(L, str) _lua_error_str(L, str, __FILE__, __LINE__)
-#define LUA_HANDLE_ERROR_STR_STR(L, str, extra) _lua_error_str_str(L, str, __FILE__, __LINE__, extra)
-
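-// typical use in the C bindings (illustrative):
-//   if (!THTensor_(isContiguous)(grad))
-//     return LUA_HANDLE_ERROR_STR(L, "grad has to be contiguous");
-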
-#endif
diff --git a/contrib/lua-torch/decisiontree/generic/CartTree.c b/contrib/lua-torch/decisiontree/generic/CartTree.c
deleted file mode 100644
index eb29fcf02..000000000
--- a/contrib/lua-torch/decisiontree/generic/CartTree.c
+++ /dev/null
@@ -1,88 +0,0 @@
-#ifndef TH_GENERIC_FILE
-#define TH_GENERIC_FILE "generic/CartTree.c"
-#else
-
-static int nn_(tree_fast_score)(lua_State *L) {
- THTensor *input = luaT_checkudata(L, 1, torch_Tensor);
- THTensor *score = luaT_checkudata(L, 3, torch_Tensor);
- long n_samples = THTensor_(size)(input, 0);
- long n_features = THTensor_(size)(input, 1);
- THTensor_(resize1d)(score, n_samples);
- real *input_data = THTensor_(data)(input);
- real *score_data = THTensor_(data)(score);
-
- lua_pushstring(L, "leftChild");
- const int left_child_string = 4;
- lua_pushstring(L, "rightChild");
- const int right_child_string = 5;
- lua_pushstring(L, "score");
- const int score_string = 6;
- lua_pushstring(L, "splitFeatureId");
- const int id_string = 7;
- lua_pushstring(L, "splitFeatureValue");
- const int value_string = 8;
-
- const int original_top = lua_gettop(L);
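-  // the five field-name strings above sit at fixed stack slots (4-8, right after the three
-  // arguments), so the hot loop below reuses them without re-interning; `node` holds the stack
-  // index of the table for the current tree node, starting at the root (argument 2)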
- for (long i = 0; i < n_samples; i++) {
- int node = 2;
- while (1) {
- int current_top = lua_gettop(L);
- lua_pushvalue(L, left_child_string);
- lua_rawget(L, node);
- lua_pushvalue(L, right_child_string);
- lua_rawget(L, node);
- if (lua_isnil(L, -2) && lua_isnil(L, -1)) {
- lua_pushvalue(L, score_string);
- lua_rawget(L, node);
- score_data[i] = lua_tonumber(L, -1);
- break;
- }
- if (lua_isnil(L, -2)) {
- // go to right
- node = current_top + 2;
- continue;
- }
- if (lua_isnil(L, -1)) {
- // go to left
- node = current_top + 1;
- continue;
- }
- lua_pushvalue(L, id_string);
- lua_rawget(L, node);
- lua_pushvalue(L, value_string);
- lua_rawget(L, node);
- long feature_id = lua_tointeger(L, -2);
- real feature_value = lua_tonumber(L, -1);
-
- real current_value = input_data[i * n_features + (feature_id-1)];
- if (current_value < feature_value) {
- // go to left
- node = current_top + 1;
- }
- else {
- // go to right
- node = current_top + 2;
- }
- }
- lua_pop(L, lua_gettop(L) - original_top);
- }
-
- lua_pop(L, 5);
-
- lua_pushvalue(L, 3);
- return 1;
-}
-
-static const struct luaL_Reg nn_(CT__) [] = {
- {"CartTreeFastScore", nn_(tree_fast_score)},
- {NULL, NULL}
-};
-
-static void nn_(CT_init)(lua_State *L)
-{
- luaT_pushmetatable(L, torch_Tensor);
- luaT_registeratname(L, nn_(CT__), "nn");
- lua_pop(L,1);
-}
-
-#endif
diff --git a/contrib/lua-torch/decisiontree/generic/DFD.c b/contrib/lua-torch/decisiontree/generic/DFD.c
deleted file mode 100644
index 599c4d794..000000000
--- a/contrib/lua-torch/decisiontree/generic/DFD.c
+++ /dev/null
@@ -1,157 +0,0 @@
-#ifndef TH_GENERIC_FILE
-#define TH_GENERIC_FILE "generic/DFD.c"
-#else
-
-static int nn_(DFD_computeOutput)(lua_State *L) {
- THLongTensor *outputkeys = luaT_checkudata(L, 1, "torch.LongTensor");
- THTensor *outputvalues = luaT_checkudata(L, 2, torch_Tensor);
- THLongTensor *root_ids = luaT_checkudata(L, 3, "torch.LongTensor");
- THLongTensor *left_child = luaT_checkudata(L, 4, "torch.LongTensor");
- THLongTensor *right_child = luaT_checkudata(L, 5, "torch.LongTensor");
- THLongTensor *split_feature_id = luaT_checkudata(L, 6, "torch.LongTensor");
- THTensor *split_feature_value = luaT_checkudata(L, 7, torch_Tensor);
- THTensor *input = luaT_checkudata(L, 8, torch_Tensor);
- char only_last_node = lua_toboolean(L, 9);
-
- // gets some important sizes from the input
- long batch_size = THTensor_(size)(input, 0);
- long input_size = THTensor_(size)(input, 1);
- long roots_size = THLongTensor_size(root_ids, 0);
- long depth = THLongTensor_size(outputkeys, 1);
-
- // keeps track of the number of nodes traversed in the trees by each sample.
- // each traversed node maps to an output feature having a value of 1
- long outputsize[batch_size];
- for (long i = 0; i < batch_size; i++)
- outputsize[i] = 0;
-
- // gets direct pointers to the memory of each tensor for efficiency
- long *root_ids_data = THLongTensor_data(root_ids);
- long *left_child_data = THLongTensor_data(left_child);
- long *right_child_data = THLongTensor_data(right_child);
- real *split_feature_value_data = THTensor_(data)(split_feature_value);
- long *split_feature_id_data = THLongTensor_data(split_feature_id);
- long *outputkeys_data = THLongTensor_data(outputkeys);
- real *input_data = THTensor_(data)(input);
-
- // for each sample in the batch
- for (long sample_index = 0; sample_index < batch_size; sample_index++) {
- // gets pointers to the direct memory associated with each sample for efficiency
- const long outputkeys_offset = sample_index * depth;
- const long input_offset = sample_index * input_size;
- long *local_outputkeys_data = &outputkeys_data[outputkeys_offset];
- real *local_input_data = &input_data[input_offset];
-
- // for each tree in the forest
- for (long i = 0; i < roots_size; i++) {
- int root = 1;
- long node_id = root_ids_data[i];
-
- // traverses the whole tree keeping track of which nodes were seen
- while (1) {
- if (root) {
- // root nodes aren't added to output because they are always traversed
- root = 0;
- }
- else if (!only_last_node) {
- long output_index = outputsize[sample_index];
- // updates the outputsize for all samples traversing this node
- outputsize[sample_index]++;
- // sets the traversed node as a feature in output for exampleIds
- local_outputkeys_data[output_index] = node_id;
- }
-
-        // gets the left and right nodes. values <= 0 represent a missing node
- long left_id = left_child_data[node_id-1];
- long right_id = right_child_data[node_id-1];
-
- if (left_id <= 0 && right_id <= 0) {
- if (only_last_node) {
- long output_index = outputsize[sample_index];
- outputsize[sample_index]++;
- local_outputkeys_data[output_index] = node_id;
- }
- // if no children, stops
- break;
- }
- else if (left_id <= 0) {
- // if no left child, traverses right node
- node_id = right_id;
- }
- else if (right_id <= 0) {
- // if no right child, traverses left node
- node_id = left_id;
- }
- else {
- // if both left and right children, finds the direction for this sample
- // first get the reference from the node
- real split_value = split_feature_value_data[node_id-1];
- long split_id = split_feature_id_data[node_id-1]-1;
-
- // then gets the value of the sample
- real node_value = local_input_data[split_id];
-          // and branches
- if (node_value < split_value)
- node_id = left_id;
- else
- node_id = right_id;
- }
- }
- }
- }
-
-  // now that we know which nodes were traversed for each sample, we can create the sparse output
- // with 1 entry pair for each sample
- THTensor *input_feature = THTensor_(new)();
- THLongTensor *indices = THLongTensor_new();
-
- // pushes the return table with 2 children tables
- lua_newtable(L);
- lua_pushinteger(L, 1);
- lua_newtable(L);
- lua_pushinteger(L, 2);
- lua_newtable(L);
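-  // stack layout from here on: {output, 1, keysTable, 2, valuesTable}; the loop below fills
-  // one entry per sample in each subtable, and the two lua_settable calls at the end store
-  // keysTable at output[1] and valuesTable at output[2]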
-
- // for each sample...
- for (long i = 0; i < batch_size; i++) {
- long j = outputsize[i];
- // selects the tensor lines from the dense output
- THLongTensor_select(indices, outputkeys, 0, i);
- THTensor_(select)(input_feature, outputvalues, 0, i);
-
- // narrows the keys to actual number of nodes traversed and saves to the output
- lua_pushinteger(L, i+1);
- luaT_pushudata(L, THLongTensor_newNarrow(indices, 0, 0, j), "torch.LongTensor");
- lua_settable(L, -5);
-
- // and narrows the values
- lua_pushinteger(L, i+1);
- luaT_pushudata(L, THTensor_(newNarrow)(input_feature, 0, 0, j), torch_Tensor);
- lua_settable(L, -3);
- }
-
- // pushes the two parts of the output into the output table
- lua_settable(L, -5);
- lua_settable(L, -3);
-
- THLongTensor_free(indices);
- THTensor_(free)(input_feature);
-
- return 1;
-}
-
-static const struct luaL_Reg nn_(DFD__) [] = {
- {"DFD_computeOutput", nn_(DFD_computeOutput)},
- {NULL, NULL}
-};
-
-static void nn_(DFD_init)(lua_State *L)
-{
- luaT_pushmetatable(L, torch_Tensor);
- luaT_registeratname(L, nn_(DFD__), "nn");
- lua_pop(L,1);
-}
-
-#endif
diff --git a/contrib/lua-torch/decisiontree/generic/GBDT.c b/contrib/lua-torch/decisiontree/generic/GBDT.c
deleted file mode 100644
index 31f5b025d..000000000
--- a/contrib/lua-torch/decisiontree/generic/GBDT.c
+++ /dev/null
@@ -1,392 +0,0 @@
-#ifndef TH_GENERIC_FILE
-#define TH_GENERIC_FILE "generic/GBDT.c"
-#else
-
-#include "GBDT_internal.h"
-#include "GBDT_internal.c"
-
-// note that each one of the functions to find the best split is a subset of the next.
-// first we have one that can only evaluate a single feature, using the logic in lua to control the
-// features
-// then we have one that can go over a shard of features, following the feature parallelism
-// introduced by the lua logic
-// and finally we have one that performs the feature parallelism itself in the special case of
-// dense tensors
-// these functions are provided for completeness and to test in case the logic is to be changed
-
-// finds the best split for a given node and feature
-static int nn_(gb_findBestFeatureSplit)(lua_State *L) {
- THLongTensor *exampleIds = luaT_checkudata(L, 1, "torch.LongTensor");
- const int dataset_index = 2;
- if (!lua_isnumber(L, 3))
- return LUA_HANDLE_ERROR_STR(L, "third argument should be an integer");
- long feature_id = lua_tointeger(L, 3);
- if (!lua_isnumber(L, 4))
- return LUA_HANDLE_ERROR_STR(L, "fourth argument should be an integer");
- long minLeafSize = lua_tointeger(L, 4);
- // Since minLeafSize == 1 corresponds to each sample in its own leaf, any value below it doesn't
- // make sense
- if (minLeafSize < 1)
- minLeafSize = 1;
- THTensor *grad = luaT_checkudata(L, 5, torch_Tensor);
- THTensor *hess = luaT_checkudata(L, 6, torch_Tensor);
-
- if (!THLongTensor_isContiguous(exampleIds))
- return LUA_HANDLE_ERROR_STR(L, "exampleIds has to be contiguous");
- if (!THTensor_(isContiguous)(grad))
- return LUA_HANDLE_ERROR_STR(L, "grad has to be contiguous");
- if (!THTensor_(isContiguous)(hess))
- return LUA_HANDLE_ERROR_STR(L, "hessian has to be contiguous");
-
- // initializes the static data
- nn_(GBInitialization) initialization_data;
- nn_(gb_initialize)(L, &initialization_data, exampleIds, grad, hess, dataset_index);
-
- // initializes the dynamic data
- GBRunData run_data;
- gb_create_run_data(&run_data, minLeafSize);
-
- // finds the best state possible for the split
- nn_(GBBestState) bs;
- nn_(gb_find_best_feature_split)(L, &initialization_data, &bs, feature_id, &run_data);
-
- lua_pop(L, lua_gettop(L) - initialization_data.splitInfo_index);
-
-  // fills the table with the best split found; the lua logic above does everything else
- // if no state was found, returns nil
- if (bs.valid_state == 0) {
- lua_pop(L, 1);
- lua_pushnil(L);
- }
- else {
- nn_(gb_internal_split_info)(L, &bs, initialization_data.splitInfo_index);
- }
-
- gb_destroy_run_data(&run_data);
-
- return 1;
-}
-
-// finds the best split for a given node and shard of features
-// this is more efficient than calling the previous one multiple times
-static int nn_(gb_findBestSplit)(lua_State *L) {
- THLongTensor *exampleIds = luaT_checkudata(L, 1, "torch.LongTensor");
- const int dataset_index = 2;
- THLongTensor *feature_ids = luaT_checkudata(L, 3, "torch.LongTensor");
- if (!lua_isnumber(L, 4))
- return LUA_HANDLE_ERROR_STR(L, "fourth argument should be an integer");
- long minLeafSize = lua_tointeger(L, 4);
- // Since minLeafSize == 1 corresponds to each sample in its own leaf, any value below it doesn't
- // make sense
- if (minLeafSize < 1)
- minLeafSize = 1;
- if (!lua_isnumber(L, 5))
- return LUA_HANDLE_ERROR_STR(L, "fifth argument should be an integer");
- long shardId = lua_tointeger(L, 5);
- if (!lua_isnumber(L, 6))
- return LUA_HANDLE_ERROR_STR(L, "sixth argument should be an integer");
- long nShard = lua_tointeger(L, 6);
- THTensor *grad = luaT_checkudata(L, 7, torch_Tensor);
- THTensor *hess = luaT_checkudata(L, 8, torch_Tensor);
-
- if (!THLongTensor_isContiguous(exampleIds))
- return LUA_HANDLE_ERROR_STR(L, "exampleIds has to be contiguous");
- if (!THTensor_(isContiguous)(grad))
- return LUA_HANDLE_ERROR_STR(L, "grad has to be contiguous");
- if (!THTensor_(isContiguous)(hess))
- return LUA_HANDLE_ERROR_STR(L, "hessian has to be contiguous");
-
- // initializes the static data
- nn_(GBInitialization) initialization_data;
- nn_(gb_initialize)(L, &initialization_data, exampleIds, grad, hess, dataset_index);
-
- // initializes the dynamic data
- GBRunData run_data;
- gb_create_run_data(&run_data, minLeafSize);
-
- // initializes to evaluate all the features in this shard
- nn_(GBBestState) global_bs;
- global_bs.valid_state = 0;
- long n_features = THLongTensor_size(feature_ids, 0);
- if (!THLongTensor_isContiguous(feature_ids))
- return LUA_HANDLE_ERROR_STR(L, "feature_ids must be contiguous");
- long *feature_ids_data = THLongTensor_data(feature_ids);
-
- // for every feature
- for (long i = 0; i < n_features; i++) {
- long feature_id = feature_ids_data[i];
- // if we are responsible for it
- if (nShard <= 1 || (feature_id % nShard) + 1 == shardId) {
- // finds the best state possible for the split
- nn_(GBBestState) bs;
- nn_(gb_find_best_feature_split)(L, &initialization_data, &bs, feature_id, &run_data);
-
- // if it's valid and better than one we found before, saves it
- if (bs.valid_state) {
- if (global_bs.valid_state == 0 || bs.gain < global_bs.gain) {
- global_bs = bs;
- }
- }
- }
- }
-
- lua_pop(L, lua_gettop(L) - initialization_data.splitInfo_index);
-
-  // fills the table with the best split found; the lua logic above does everything else
- // if no state was found, returns nil
- if (global_bs.valid_state == 0) {
- lua_pop(L, 1);
- lua_pushnil(L);
- }
- else {
- nn_(gb_internal_split_info)(L, &global_bs, initialization_data.splitInfo_index);
- }
-
- gb_destroy_run_data(&run_data);
-
- return 1;
-}
-
-// all the info we have to pass to the worker threads so that they can do their jobs
-// note that we do not pass the lua state since it isn't required. we perform direct C parallelism
-// instead of using lua's parallelism like with the previous version
-typedef struct {
- nn_(GBInitialization) *initialization_data;
- GBRunData *run_data;
- long *index;
- nn_(GBBestState) *global_bs;
- long n_features;
- long *feature_ids_data;
- pthread_mutex_t *mutex;
- THLongTensor *exampleIds;
- THTensor *input;
- THLongTensor **sorted_ids_per_feature;
-} nn_(ThreadInfo);
-
-// loops over all the features in parallel and finds the best global split
-static void* nn_(thread_worker)(void *arg) {
- nn_(ThreadInfo) *info = (nn_(ThreadInfo) *)arg;
-
- while (1) {
- pthread_mutex_lock(info->mutex);
- long index = (*info->index);
- (*info->index)++;
- pthread_mutex_unlock(info->mutex);
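-    // the shared index behaves as a dynamic work queue: each thread claims the next
-    // unclaimed feature, which load-balances features of uneven cost across threads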
-
- if (index >= info->n_features)
- break;
-
-    // performs part of steps (1) and (2) of gb_find_best_feature_split using pre-loaded data,
-    // without having to access the lua state
- long feature_id = info->feature_ids_data[index];
- THLongTensor *exampleIdsWithFeature_ret = info->exampleIds;
- THLongTensor *featureExampleIds = info->sorted_ids_per_feature[index];
- nn_(GBInitialization) *initialization_data = info->initialization_data;
- GBRunData *run_data = info->run_data;
-
- // performs steps (3) and (4) of gb_find_best_feature_split since (1) and (2) were already
- // performed before
- nn_(GBBestState) bs;
- nn_(gb_internal_create)(initialization_data->grad, initialization_data->hess,
- exampleIdsWithFeature_ret, &bs.state);
- nn_(gb_internal_get_best_split_special)(&bs, featureExampleIds, run_data->exampleMap,
- info->input, run_data->minLeafSize, feature_id);
-
- // saves to the global state if it's better
- if (bs.valid_state) {
- pthread_mutex_lock(info->mutex);
- if (info->global_bs->valid_state == 0 || bs.gain < info->global_bs->gain) {
- (*info->global_bs) = bs;
- }
- pthread_mutex_unlock(info->mutex);
- }
- }
-
- return NULL;
-}
-
-// finds the global best split by doing feature parallelism directly in C
-static int nn_(gb_findBestSplitFP)(lua_State *L) {
- THLongTensor *exampleIds = luaT_checkudata(L, 1, "torch.LongTensor");
- const int dataset_index = 2;
- THLongTensor *feature_ids = luaT_checkudata(L, 3, "torch.LongTensor");
- if (!lua_isnumber(L, 4))
- return LUA_HANDLE_ERROR_STR(L, "fourth argument should be an integer");
- long minLeafSize = lua_tointeger(L, 4);
- THTensor *grad = luaT_checkudata(L, 5, torch_Tensor);
- THTensor *hess = luaT_checkudata(L, 6, torch_Tensor);
- if (!lua_isnumber(L, 7))
- return LUA_HANDLE_ERROR_STR(L, "seventh argument should be an integer");
- long nThread = lua_tointeger(L, 7);
-
- if (!THLongTensor_isContiguous(exampleIds))
- return LUA_HANDLE_ERROR_STR(L, "exampleIds has to be contiguous");
- if (!THTensor_(isContiguous)(grad))
- return LUA_HANDLE_ERROR_STR(L, "grad has to be contiguous");
- if (!THTensor_(isContiguous)(hess))
- return LUA_HANDLE_ERROR_STR(L, "hessian has to be contiguous");
-
- pthread_mutex_t mutex;
- pthread_mutex_init(&mutex, NULL);
-
- // initializes the static data
- nn_(GBInitialization) initialization_data;
- nn_(gb_initialize)(L, &initialization_data, exampleIds, grad, hess, dataset_index);
-
- // initializes the dynamic data
- GBRunData run_data;
- gb_create_run_data(&run_data, minLeafSize);
-
- // initializes to evaluate all the features
- nn_(GBBestState) global_bs;
- global_bs.valid_state = 0;
- long n_features = THLongTensor_size(feature_ids, 0);
- if (!THLongTensor_isContiguous(feature_ids))
- return LUA_HANDLE_ERROR_STR(L, "feature_ids must be contiguous");
- long *feature_ids_data = THLongTensor_data(feature_ids);
-
- THTensor *input = luaT_checkudata(L, initialization_data.input_index, torch_Tensor);
-
- // performs step (1) of gb_find_best_feature_split so that we don't have to pass the lua state
- THLongTensor *sorted_ids_per_feature[n_features];
- for (long i = 0; i < n_features; i++) {
- long feature_id = feature_ids_data[i];
- lua_pushvalue(L, initialization_data.getSortedFeature_index);
- lua_pushvalue(L, initialization_data.dataset_index);
- lua_pushinteger(L, feature_id);
- lua_call(L, 2, 1);
-
- THLongTensor *featureExampleIds = luaT_checkudata(L, -1, "torch.LongTensor");
- sorted_ids_per_feature[i] = featureExampleIds;
- }
-
-  // performs step (2) of gb_find_best_feature_split since it's the same for all features when the
- // data is dense
- long exampleIds_size = THLongTensor_size(initialization_data.exampleIds, 0);
- long *exampleIds_data = THLongTensor_data(initialization_data.exampleIds);
-
- int ret;
- kh_resize(long, run_data.exampleMap, exampleIds_size*8);
- for (long i = 0; i < exampleIds_size; i++)
- kh_put(long, run_data.exampleMap, exampleIds_data[i], &ret);
-
- // saves the info for the threads
- long index = 0;
- nn_(ThreadInfo) info;
- info.initialization_data = &initialization_data;
- info.run_data = &run_data;
- info.index = &index;
- info.global_bs = &global_bs;
- info.n_features = n_features;
- info.feature_ids_data = feature_ids_data;
- info.mutex = &mutex;
- info.exampleIds = exampleIds;
- info.input = input;
- info.sorted_ids_per_feature = sorted_ids_per_feature;
-
- pthread_t threads[nThread];
-
- // let the threads run like crazy over the features to find the minimum
- for (long i = 0; i < nThread; i++) {
- int ret = pthread_create(&threads[i], NULL, nn_(thread_worker), &info);
- if (ret)
-      return LUA_HANDLE_ERROR_STR(L, "failed to create thread");
- }
-
- for (long i = 0; i < nThread; i++) {
- int ret = pthread_join(threads[i], NULL);
- if (ret)
- return LUA_HANDLE_ERROR_STR(L, "failed to join thread");
- }
-
- lua_pop(L, lua_gettop(L) - initialization_data.splitInfo_index);
-
-  // fills the table with the best split found; the lua logic above does everything else
- // if no state was found, returns nil
- if (global_bs.valid_state == 0) {
- lua_pop(L, 1);
- lua_pushnil(L);
- }
- else {
- nn_(gb_internal_split_info)(L, &global_bs, initialization_data.splitInfo_index);
- }
-
- gb_destroy_run_data(&run_data);
- pthread_mutex_destroy(&mutex);
-
- return 1;
-}
-
-// performs an efficient branch of the current examples based on a split info provided
-static int nn_(gb_branch)(lua_State *L) {
- if (!lua_istable(L, 1))
- return LUA_HANDLE_ERROR_STR(L, "first argument must be a table");
- THTensor *input = luaT_checkudata(L, 2, torch_Tensor);
- THLongTensor *exampleIds = luaT_checkudata(L, 3, "torch.LongTensor");
-
- // gets direct access to the dataset
- long n_exampleIds = THLongTensor_size(exampleIds, 0);
- long *exampleIds_data = THLongTensor_data(exampleIds);
- long n_features = THTensor_(size)(input, 1);
- real *input_data = THTensor_(data)(input);
-
- // creates the tensors to be returned
- luaT_pushudata(L, THLongTensor_new(), "torch.LongTensor");
- luaT_pushudata(L, THLongTensor_new(), "torch.LongTensor");
- THLongTensor *leftExampleIds = luaT_checkudata(L, 4, "torch.LongTensor");
- THLongTensor *rightExampleIds = luaT_checkudata(L, 5, "torch.LongTensor");
- THLongTensor_resize1d(leftExampleIds, n_exampleIds);
-
- // gets direct access to the examples
- THLongTensor *splitExampleIds = leftExampleIds;
- long *splitExampleIds_data = THLongTensor_data(splitExampleIds);
-
- // gets the split info
- lua_pushstring(L, "splitId");
- lua_rawget(L, 1);
- const long splitId = lua_tointeger(L, -1);
- lua_pushstring(L, "splitValue");
- lua_rawget(L, 1);
- const real splitValue = lua_tonumber(L, -1);
- lua_pop(L, 2);
-
- long leftIdx = 0, rightIdx = 0;
-
- // goes over all the samples dividing them into the two sides
- for (long i = 0; i < n_exampleIds; i++) {
- long exampleId = exampleIds_data[i];
- real val = input_data[(exampleId-1) * n_features + (splitId - 1)];
- if (val <= splitValue) {
- leftIdx++;
- splitExampleIds_data[leftIdx-1] = exampleId;
- }
- else {
- rightIdx++;
- splitExampleIds_data[n_exampleIds - rightIdx + 1 - 1] = exampleId;
- }
- }
-
-  // once done, the two returned tensors are just narrowed views over the same buffer. this is
-  // more efficient than allocating 2 separate tensors since we didn't know in advance where the
-  // split would land (how many examples go to each side), only that the total count is constant
- THLongTensor_narrow(rightExampleIds, splitExampleIds, 0, n_exampleIds-rightIdx+1-1, rightIdx);
- THLongTensor_narrow(leftExampleIds, splitExampleIds, 0, 0, leftIdx);
- return 2;
-}
-
-static const struct luaL_Reg nn_(GBDT__) [] = {
- {"GBDT_findBestFeatureSplit", nn_(gb_findBestFeatureSplit)},
- {"GBDT_findBestSplit", nn_(gb_findBestSplit)},
- {"GBDT_findBestSplitFP", nn_(gb_findBestSplitFP)},
- {"GBDT_branch", nn_(gb_branch)},
- {NULL, NULL}
-};
-
-static void nn_(GBDT_init)(lua_State *L)
-{
- luaT_pushmetatable(L, torch_Tensor);
- luaT_registeratname(L, nn_(GBDT__), "nn");
- lua_pop(L,1);
-}
-
-#endif
diff --git a/contrib/lua-torch/decisiontree/generic/GBDT_internal.c b/contrib/lua-torch/decisiontree/generic/GBDT_internal.c
deleted file mode 100644
index 739aabf25..000000000
--- a/contrib/lua-torch/decisiontree/generic/GBDT_internal.c
+++ /dev/null
@@ -1,312 +0,0 @@
-// initializes the optimization structure based on the arguments provided, either filling directly
-// or making calls to lua to load some kind of data
-static void nn_(gb_initialize)(lua_State *L, nn_(GBInitialization) *initialization_data,
- THLongTensor *exampleIds, THTensor *grad, THTensor *hess, int dataset_index) {
- initialization_data->dataset_index = dataset_index;
- initialization_data->exampleIds = exampleIds;
- initialization_data->grad = grad;
- initialization_data->hess = hess;
-
- lua_newtable(L);
- initialization_data->splitInfo_index = lua_gettop(L);
-
- lua_pushstring(L, "input");
- lua_gettable(L, dataset_index);
- initialization_data->input_index = lua_gettop(L);
-
- lua_pushstring(L, "getSortedFeature");
- lua_gettable(L, dataset_index);
- initialization_data->getSortedFeature_index = lua_gettop(L);
-}
-
-// initializes a state that will be passed to the optimizer
-static void nn_(gb_internal_create)(THTensor *grad, THTensor *hessian,
- THLongTensor *exampleIds, nn_(GBState)* s) {
- long *exampleIds_data = THLongTensor_data(exampleIds);
- long n_examples = THLongTensor_size(exampleIds, 0);
- accreal leftGradientSum = 0;
- accreal leftHessianSum = 0;
-
- real *grad_data = THTensor_(data)(grad);
- real *hessian_data = THTensor_(data)(hessian);
-
- // only sums the relevant gradients and hessians
- for (long i = 0; i < n_examples; i++) {
- long exampleId = exampleIds_data[i]-1;
- leftGradientSum += grad_data[exampleId];
- leftHessianSum += hessian_data[exampleId];
- }
-
- // we move data from the left branch to the right branch
- s->rightGradientSum = 0;
- s->rightHessianSum = 1;
- s->nExampleInRightBranch = 0;
- s->leftGradientSum = leftGradientSum;
- s->leftHessianSum = leftHessianSum + 1;
- s->nExampleInLeftBranch = n_examples;
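-  // (note that both hessian sums carry a +1 offset; this looks like a smoothing/regularization
-  // constant baked into the split scoring -- an assumption, as it isn't documented here)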
-
- // stores the loss in parent for efficiency
- real lossInParent = computeGradientBoostLoss(s->leftGradientSum + s->rightGradientSum,
- s->leftHessianSum + s->rightHessianSum);
- s->lossInParent = lossInParent;
-
- // caches the direct pointers to the data for efficiency
- s->grad_data = grad_data;
- s->hessian_data = hessian_data;
-}
-
-// computes the gain obtained by performing the split
-static real nn_(computeSplitGain)(nn_(GBState) *s) {
- real lossInLeftBranch = computeGradientBoostLoss(s->leftGradientSum, s->leftHessianSum);
- real lossInRightBranch = computeGradientBoostLoss(s->rightGradientSum, s->rightHessianSum);
- return lossInLeftBranch + lossInRightBranch - s->lossInParent;
-}
-
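-// (computeGradientBoostLoss is defined in GBDT_common.h, not shown here; presumably the usual
-// second-order objective of the form -gradSum^2 / hessianSum, so a more negative gain means a
-// better split -- which is why the callers keep the smallest gain)
-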
-// uses the state information to build the table required by the lua library about the best split
-static void nn_(gb_internal_split_info)(lua_State *L, nn_(GBBestState) *bs, int res) {
- long feature_id = bs->feature_id;
- real feature_value = bs->feature_value;
- real gain = bs->gain;
- nn_(GBState) *s = &bs->state;
- lua_pushstring(L, "splitGain");
- lua_pushnumber(L, gain);
- lua_rawset(L, res);
- lua_pushstring(L, "splitId");
- lua_pushinteger(L, feature_id);
- lua_rawset(L, res);
- lua_pushstring(L, "splitValue");
- lua_pushnumber(L, feature_value);
- lua_rawset(L, res);
- lua_pushstring(L, "leftChildSize");
- lua_pushinteger(L, s->nExampleInLeftBranch);
- lua_rawset(L, res);
- lua_pushstring(L, "rightChildSize");
- lua_pushinteger(L, s->nExampleInRightBranch);
- lua_rawset(L, res);
- lua_pushstring(L, "leftGradient");
- lua_pushnumber(L, s->leftGradientSum);
- lua_rawset(L, res);
- lua_pushstring(L, "rightGradient");
- lua_pushnumber(L, s->rightGradientSum);
- lua_rawset(L, res);
- lua_pushstring(L, "leftHessian");
- lua_pushnumber(L, s->leftHessianSum);
- lua_rawset(L, res);
- lua_pushstring(L, "rightHessian");
- lua_pushnumber(L, s->rightHessianSum);
- lua_rawset(L, res);
-}
-
-// core of the computation, where we loop over all the relevant samples looking for the best split
-// we can find
-static void nn_(gb_internal_get_best_split)(lua_State *L, nn_(GBBestState) *bs,
- THLongTensor *featureExampleIds, khash_t(long)* exampleMap, int input_table_index,
- long minLeafSize, long feature_id) {
- nn_(GBState) current_state;
- nn_(GBState) best_state;
- current_state = bs->state;
-
- real best_gain = INFINITY;
- real best_value = 0;
-
-  // if the data is dense, pre-loads direct access to it; sparse (table) inputs are
-  // looked up lazily inside the loop below
-  THTensor *input = NULL;
-  real *input_data = NULL;
-  long n_features = 0;
-  if (!lua_istable(L, input_table_index)) {
-    input = luaT_checkudata(L, input_table_index, torch_Tensor);
-    input_data = THTensor_(data)(input);
-    n_features = THTensor_(size)(input, 1);
-  }
-
- long stride = featureExampleIds->stride[0];
- long *featureExampleIds_data = THLongTensor_data(featureExampleIds);
-
- khiter_t k;
-
- real previousSplitValue = 0;
- // for each example with the given feature and from large to small value...
- for (long i = THLongTensor_size(featureExampleIds, 0)-1; i >= 0; i--) {
- long exampleId = featureExampleIds_data[i * stride];
-
- // checks if the sample is in the list of ones that have to be evaluated by this node
- k = kh_get(long, exampleMap, exampleId);
- if (k != kh_end(exampleMap)) {
- long exampleIdx = exampleId;
-
- // gets the split value, depending on whether the input is sparse or dense
- real splitValue;
- if (input_data) {
- splitValue = input_data[(exampleId-1) * n_features + feature_id-1];
- }
- else {
- lua_pushinteger(L, exampleId);
- lua_gettable(L, input_table_index);
- lua_pushinteger(L, feature_id);
- lua_gettable(L, -2);
- splitValue = lua_tonumber(L, -1);
- lua_pop(L, 2);
- }
-
- // performs one update of the state, moving a sample from the left branch to the right
- real gradient = current_state.grad_data[exampleIdx-1];
- real hessian = current_state.hessian_data[exampleIdx-1];
- current_state.leftGradientSum -= gradient;
- current_state.rightGradientSum += gradient;
- current_state.leftHessianSum -= hessian;
- current_state.rightHessianSum += hessian;
- current_state.nExampleInLeftBranch--;
- current_state.nExampleInRightBranch++;
-
- // since we remove from the left, once this becomes true, it stays true forever
- // hence we stop the loop
- if (current_state.nExampleInLeftBranch < minLeafSize)
- break;
-
- if (current_state.nExampleInRightBranch >= minLeafSize) {
- // if the values are equal between the steps, it doesn't make sense to evaluate the score
- // since we won't be able to separate the two
- if (previousSplitValue != splitValue) {
- // computes the gain **without including the parent** since it doesn't change as we move
- // examples between branches
- real lossInLeftBranch = computeGradientBoostLoss(current_state.leftGradientSum, current_state.leftHessianSum);
- real lossInRightBranch = computeGradientBoostLoss(current_state.rightGradientSum, current_state.rightHessianSum);
- real current_gain = lossInLeftBranch + lossInRightBranch;
- if (current_gain < best_gain) {
- best_gain = current_gain;
- best_value = splitValue;
- best_state = current_state;
- }
- }
- }
- previousSplitValue = splitValue;
- }
- }
-
- // if there is a valid gain, then marks the state as valid and fills the meta-info
- if (!isfinite(best_gain)) {
- bs->valid_state = 0;
- }
- else {
- bs->valid_state = 1;
- bs->state = best_state;
- bs->feature_id = feature_id;
- bs->gain = nn_(computeSplitGain)(&bs->state);
- bs->feature_value = best_value;
- }
-}
-
-// exactly like the previous version, but with direct access to the data for efficiency. since
-// the dense case doesn't touch the lua state at all, it can be evaluated from worker threads
-static void nn_(gb_internal_get_best_split_special)(nn_(GBBestState) *bs,
- THLongTensor *featureExampleIds, khash_t(long)* exampleMap, THTensor *input, long minLeafSize,
- long feature_id) {
- nn_(GBState) current_state;
- nn_(GBState) best_state;
- current_state = bs->state;
-
- real best_gain = INFINITY;
- real best_value = 0;
-
- real *input_data = NULL;
- long n_features = 0;
- input_data = THTensor_(data)(input);
- n_features = THTensor_(size)(input, 1);
-
- long stride = featureExampleIds->stride[0];
- long *featureExampleIds_data = THLongTensor_data(featureExampleIds);
-
- khiter_t k;
-
- real previousSplitValue = 0;
- for (long i = THLongTensor_size(featureExampleIds, 0)-1; i >= 0; i--) {
- long exampleId = featureExampleIds_data[i * stride];
-
- k = kh_get(long, exampleMap, exampleId);
- if (k != kh_end(exampleMap)) {
- long exampleIdx = exampleId;
-
- // THIS is the main part that changes. seems crazy to have a special case just for this, but
- // since there are a **lot** of samples to be evaluated, the "if" in the previous case can
- // become expensive
- real splitValue;
- splitValue = input_data[(exampleId-1) * n_features + feature_id-1];
-
- real gradient = current_state.grad_data[exampleIdx-1];
- real hessian = current_state.hessian_data[exampleIdx-1];
- current_state.leftGradientSum -= gradient;
- current_state.rightGradientSum += gradient;
- current_state.leftHessianSum -= hessian;
- current_state.rightHessianSum += hessian;
- current_state.nExampleInLeftBranch--;
- current_state.nExampleInRightBranch++;
-
- // since we remove from the left, once this becomes true, it stays true forever
- // hence we stop the loop
- if (current_state.nExampleInLeftBranch < minLeafSize)
- break;
-
-      // this check keeps failing until at least minLeafSize examples have moved to the right
-      // branch (nExampleInRightBranch starts at 0 and grows by one per pass)
- if (current_state.nExampleInRightBranch >= minLeafSize) {
- if (previousSplitValue != splitValue) {
- real lossInLeftBranch = computeGradientBoostLoss(current_state.leftGradientSum, current_state.leftHessianSum);
- real lossInRightBranch = computeGradientBoostLoss(current_state.rightGradientSum, current_state.rightHessianSum);
- real current_gain = lossInLeftBranch + lossInRightBranch;
- if (current_gain < best_gain) {
- best_gain = current_gain;
- best_value = splitValue;
- best_state = current_state;
- }
- }
- }
- previousSplitValue = splitValue;
- }
- }
-
- if (!isfinite(best_gain)) {
- bs->valid_state = 0;
- }
- else {
- bs->valid_state = 1;
- bs->state = best_state;
- bs->feature_id = feature_id;
- bs->gain = nn_(computeSplitGain)(&bs->state);
- bs->feature_value = best_value;
- }
-}
-
-// core of the computation to find the split for a given feature and is divided in 4 steps
-static void nn_(gb_find_best_feature_split)(lua_State *L,
- nn_(GBInitialization) *initialization_data, nn_(GBBestState) *bs, long feature_id,
- GBRunData *run_data) {
-
- // 1) loads the examples in the dataset ordered by their feature value
- lua_pushvalue(L, initialization_data->getSortedFeature_index);
- lua_pushvalue(L, initialization_data->dataset_index);
- lua_pushinteger(L, feature_id);
- lua_call(L, 2, 1);
-
- THLongTensor *featureExampleIds = luaT_checkudata(L, -1, "torch.LongTensor");
-
- // 2) processes the data to find the intersection between the examples in the dataset and the
- // examples the current node has to evaluate
- THLongTensor *exampleIdsWithFeature_ret = gb_internal_prepare(L, initialization_data->exampleIds,
- run_data->exampleIdsWithFeature_cache, initialization_data->input_index, feature_id,
- run_data->exampleMap);
- if (!exampleIdsWithFeature_ret) {
- bs->valid_state = 0;
- return;
- }
-
- // 3) creates a new state to be used by the optimizer
- nn_(gb_internal_create)(initialization_data->grad, initialization_data->hess,
- exampleIdsWithFeature_ret, &bs->state);
-
- // 4) optimize away!
- nn_(gb_internal_get_best_split)(L, bs, featureExampleIds, run_data->exampleMap,
- initialization_data->input_index, run_data->minLeafSize, feature_id);
-}
diff --git a/contrib/lua-torch/decisiontree/generic/GBDT_internal.h b/contrib/lua-torch/decisiontree/generic/GBDT_internal.h
deleted file mode 100644
index 7119365cf..000000000
--- a/contrib/lua-torch/decisiontree/generic/GBDT_internal.h
+++ /dev/null
@@ -1,34 +0,0 @@
-// representation of a state used while searching for the best split
-typedef struct {
- real leftGradientSum, rightGradientSum;
- real leftHessianSum, rightHessianSum;
- real lossInParent;
- long nExampleInLeftBranch, nExampleInRightBranch;
- real *grad_data, *hessian_data;
-} nn_(GBState);
-
-// representation for the best state found for a given feature
-typedef struct {
- nn_(GBState) state;
- real gain;
- long feature_id;
- real feature_value;
- int valid_state;
-} nn_(GBBestState);
-
-// full data that must be initialized before calling the optimizer
-typedef struct {
- // *_index represent positions on the lua stack
- int dataset_index;
- int splitInfo_index;
- int input_index;
- // position of the dataset's function to return the samples ordered for a given feature
- int getSortedFeature_index;
-
- // samples that this node has to evaluate
- THLongTensor *exampleIds;
-
- // cached gradient and hessian for all data
- THTensor *grad;
- THTensor *hess;
-} nn_(GBInitialization);
diff --git a/contrib/lua-torch/decisiontree/generic/LogitBoostCriterion.c b/contrib/lua-torch/decisiontree/generic/LogitBoostCriterion.c
deleted file mode 100644
index f2ea1ef09..000000000
--- a/contrib/lua-torch/decisiontree/generic/LogitBoostCriterion.c
+++ /dev/null
@@ -1,90 +0,0 @@
-#ifndef TH_GENERIC_FILE
-#define TH_GENERIC_FILE "generic/LogitBoostCriterion.c"
-#else
-
-#define EPS 1e-12
-
-static int nn_(LogitBoostCriterion_updateOutput)(lua_State *L)
-{
- THTensor *input = luaT_checkudata(L, 1, torch_Tensor);
- THTensor *target = luaT_checkudata(L, 2, torch_Tensor);
- THTensor *output = luaT_checkudata(L, 3, torch_Tensor);
- int sizeAverage = lua_toboolean(L, 4);
-
- if (THTensor_(nElement)(input) != THTensor_(nElement)(target)) {
- luaL_error(L, "inconsistent input and target size");
- }
- THTensor_(resize1d)(output, 1);
-
- real sum = 0;
-
- TH_TENSOR_APPLY2(real, input, real, target,
- real x = *input_data;
- real y = *target_data;
- // math.log(1 + math.exp(target[i] <= 0 and input[i] or -input[i]))
- sum += log(1 + exp(y <= 0 ? x : -x));
- );
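-  // i.e. the logistic loss log(1 + exp(-sign(y) * x)), with targets encoded by their sign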
-
- if (sizeAverage)
- sum /= THTensor_(nElement)(input);
-
- THTensor_(set1d)(output, 0, sum);
- return 0;
-}
-
-static int nn_(LogitBoostCriterion_updateGradInput)(lua_State *L)
-{
- THTensor *input = luaT_checkudata(L, 1, torch_Tensor);
- THTensor *target = luaT_checkudata(L, 2, torch_Tensor);
- THTensor *gradInput = luaT_checkudata(L, 3, torch_Tensor);
-
- if (THTensor_(nElement)(input) != THTensor_(nElement)(target)) {
- luaL_error(L, "inconsistent input and target size");
- }
- THTensor_(resizeAs)(gradInput, input);
-
- TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
- real x = *input_data;
- real y = *target_data;
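-    // numerically stable sigmoid: each branch only ever exponentiates a non-positive value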
- real p = (x >= 0) ? (1 / (1 + exp(-x))) : (1 - 1 / (1 + exp(x)));
- *gradInput_data = (y <= 0) ? p : (p - 1);
- );
-
- return 0;
-}
-
-static int nn_(LogitBoostCriterion_updateHessInput)(lua_State *L)
-{
- THTensor *input = luaT_checkudata(L, 1, torch_Tensor);
- THTensor *target = luaT_checkudata(L, 2, torch_Tensor);
- THTensor *hessInput = luaT_checkudata(L, 3, torch_Tensor);
-
- if (THTensor_(nElement)(input) != THTensor_(nElement)(target)) {
- luaL_error(L, "inconsistent input and target size");
- }
- THTensor_(resizeAs)(hessInput, input);
-
- TH_TENSOR_APPLY3(real, hessInput, real, input, real, target,
- real x = *input_data;
- real p = (x >= 0) ? (1 / (1 + exp(-x))) : (1 - 1 / (1 + exp(x)));
- *hessInput_data = p * (1.0 - p);
- );
-
- return 0;
-}
-
-static const struct luaL_Reg nn_(LogitBoostCriterion__) [] = {
- {"LogitBoostCriterion_updateOutput", nn_(LogitBoostCriterion_updateOutput)},
- {"LogitBoostCriterion_updateGradInput", nn_(LogitBoostCriterion_updateGradInput)},
- {"LogitBoostCriterion_updateHessInput", nn_(LogitBoostCriterion_updateHessInput)},
- {NULL, NULL}
-};
-
-static void nn_(LogitBoostCriterion_init)(lua_State *L)
-{
- luaT_pushmetatable(L, torch_Tensor);
- luaT_registeratname(L, nn_(LogitBoostCriterion__), "nn");
- lua_pop(L,1);
-}
-
-#endif
diff --git a/contrib/lua-torch/decisiontree/generic/S2D.c b/contrib/lua-torch/decisiontree/generic/S2D.c
deleted file mode 100644
index 2392ee7c8..000000000
--- a/contrib/lua-torch/decisiontree/generic/S2D.c
+++ /dev/null
@@ -1,90 +0,0 @@
-#ifndef TH_GENERIC_FILE
-#define TH_GENERIC_FILE "generic/S2D.c"
-#else
-
-static int nn_(S2D_computeOutput)(lua_State *L) {
- THTensor *output = luaT_checkudata(L, 1, torch_Tensor);
- const int keys_index = 2;
- const int values_index = 3;
- const int masks_index = 4;
-
-  if (!lua_istable(L, keys_index))
-    return LUA_HANDLE_ERROR_STR(L, "expected position 2 to be a table");
-  if (!lua_istable(L, values_index))
-    return LUA_HANDLE_ERROR_STR(L, "expected position 3 to be a table");
-  if (!lua_istable(L, masks_index))
-    return LUA_HANDLE_ERROR_STR(L, "expected position 4 to be a table");
-
-
- THLongTensor *features = luaT_checkudata(L, 5, "torch.LongTensor");
-
- const int original_top = lua_gettop(L);
-
- long outputsize = THLongTensor_size(features, 0);
- long batch_size = lua_objlen(L, keys_index);
-
- // initializes output
- THTensor_(resize2d)(output, batch_size, outputsize);
- THTensor_(zero)(output);
- real *output_data = THTensor_(data)(output);
-
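-  // each sample's sparse (key, value) pairs get scattered into row i of the dense output;
-  // keys whose mask byte is 0 are skipped, leaving those entries at zero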
- // iterates over samples
- lua_pushnil(L);
- const int local_top = lua_gettop(L);
- while (lua_next(L, keys_index) != 0) {
- // gets data corresponding to the current sample
- long i = lua_tointeger(L, -2)-1;
- real *current_output_data = &output_data[i * outputsize];
- THLongTensor *keys = luaT_checkudata(L, -1, "torch.LongTensor");
- lua_rawgeti(L, values_index, i+1);
- THTensor *values = luaT_checkudata(L, -1, torch_Tensor);
- lua_rawgeti(L, masks_index, i+1);
- THByteTensor *mask = luaT_checkudata(L, -1, "torch.ByteTensor");
-
- long n_keys = THLongTensor_size(keys, 0);
- long n_values = THTensor_(size)(values, 0);
-
- // quick safety check
- if (n_keys != n_values)
- return LUA_HANDLE_ERROR_STR(L, "keys and values have to have the same size");
-
- // gets the direct memory pointers
- long *keys_data = THLongTensor_data(keys);
- real *values_data = THTensor_(data)(values);
- unsigned char *mask_data = THByteTensor_data(mask);
-
- // for each value in the sparse input...
- for (long j = 0; j < n_keys; j++) {
- // loads the value and key
- real current_value = values_data[j];
- long current_key = keys_data[j];
- unsigned char current_mask = mask_data[j];
-
- // if the feature is present in the map
- if (current_mask)
- // saves in the given position
- current_output_data[current_key-1] = current_value;
- }
-    // cleans up the trash we create while iterating over keys to keep the stack from overflowing
- lua_pop(L, lua_gettop(L) - local_top);
- }
-
- // cleans up the trash we added to the stack
- lua_pop(L, lua_gettop(L) - original_top);
-
- return 0;
-}
-
-static const struct luaL_Reg nn_(S2D__) [] = {
- {"S2D_computeOutput", nn_(S2D_computeOutput)},
- {NULL, NULL}
-};
-
-static void nn_(S2D_init)(lua_State *L)
-{
- luaT_pushmetatable(L, torch_Tensor);
- luaT_registeratname(L, nn_(S2D__), "nn");
- lua_pop(L,1);
-}
-
-#endif
diff --git a/contrib/lua-torch/decisiontree/hash_map.c b/contrib/lua-torch/decisiontree/hash_map.c
deleted file mode 100644
index 2c679e207..000000000
--- a/contrib/lua-torch/decisiontree/hash_map.c
+++ /dev/null
@@ -1,445 +0,0 @@
-#include "utils.h"
-#include "hash_map.h"
-#include "internal_hash_map.h"
-#include <pthread.h>
-
-hash_map_t hash_map_init(void) {
- return kh_init(long);
-}
-
-void hash_map_destroy(hash_map_t h_) {
- internal_hash_map_t h = (internal_hash_map_t) h_;
- kh_destroy(long, h);
-}
-
-void hash_map_clear(hash_map_t h_) {
- internal_hash_map_t h = (internal_hash_map_t) h_;
- kh_clear(long, h);
-}
-
-int hash_map_put(hash_map_t h_, long key, long val) {
- internal_hash_map_t h = (internal_hash_map_t) h_;
- int ret;
- khiter_t k = kh_put(long, h, key, &ret);
- ret = (ret >= 0);
- if (ret)
- kh_value(h, k) = val;
- return ret;
-}
-
-int hash_map_put_tensor(hash_map_t h_, THLongTensor *keys_, THLongTensor *vals_) {
- long *keys = THLongTensor_data(keys_);
- long *vals = THLongTensor_data(vals_);
- long size = get_tensor_size(keys_, Long);
- for (long i = 0; i < size; i++)
- if (!hash_map_put(h_, keys[i], vals[i]))
- return 0;
- return 1;
-}
-
-int hash_map_fill(hash_map_t h_, long key, long *counter) {
- internal_hash_map_t h = (internal_hash_map_t) h_;
- khiter_t k = kh_get(long, h, key);
- if (k == kh_end(h))
- return hash_map_put(h_, key, ++(*counter));
- return 1;
-}
-
-int hash_map_fill_tensor(hash_map_t h_, THLongTensor *keys_, long *counter) {
- long *keys = THLongTensor_data(keys_);
- long size = get_tensor_size(keys_, Long);
- for (long i = 0; i < size; i++)
- if (!hash_map_fill(h_, keys[i], counter))
- return 0;
- return 1;
-}
-
-int hash_map_get(hash_map_t h_, long key, long* val) {
- internal_hash_map_t h = (internal_hash_map_t) h_;
- khiter_t k = kh_get(long, h, key);
- if (k == kh_end(h))
- return 0;
- *val = kh_value(h, k);
- return 1;
-}
-
-void hash_map_get_tensor(hash_map_t h_, THLongTensor *keys_, THLongTensor *vals_, THByteTensor *mask_) {
- long *keys = THLongTensor_data(keys_);
-  long *vals = THLongTensor_data(vals_);
- unsigned char *mask = THByteTensor_data(mask_);
- long size = get_tensor_size(keys_, Long);
- for (long i = 0; i < size; i++)
- mask[i] = hash_map_get(h_, keys[i], &vals[i]);
-}
-
-void hash_map_del(hash_map_t h_, long key) {
- internal_hash_map_t h = (internal_hash_map_t) h_;
- khiter_t k = kh_get(long, h, key);
- if (k != kh_end(h))
- kh_del(long, h, k);
-}
-
-void hash_map_del_tensor(hash_map_t h_, THLongTensor *keys_) {
- long *keys = THLongTensor_data(keys_);
- long size = get_tensor_size(keys_, Long);
- for (long i = 0; i < size; i++)
- hash_map_del(h_, keys[i]);
-}
-
-size_t hash_map_size(hash_map_t h_) {
- internal_hash_map_t h = (internal_hash_map_t) h_;
- return kh_size(h);
-}
-
-void hash_map_to_tensor(hash_map_t h_, THLongTensor *keys_, THLongTensor *vals_) {
- internal_hash_map_t h = (internal_hash_map_t) h_;
- long *keys = THLongTensor_data(keys_);
- long *vals = THLongTensor_data(vals_);
- long key, val, i = 0;
- kh_foreach(h, key, val, {
- keys[i] = key;
- vals[i] = val;
- i++;
- });
-}
-
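-// illustrative sketch (not from the original file): typical use of the plain C API above
-//
-//   hash_map_t h = hash_map_init();
-//   hash_map_put(h, 42, 7);           /* map key 42 -> value 7 */
-//   long val;
-//   if (hash_map_get(h, 42, &val))    /* sets val to 7; returns 1 on hit, 0 on miss */
-//     { /* ... */ }
-//   hash_map_del(h, 42);
-//   hash_map_destroy(h);
-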
-static void autolock(hash_map_lua_t *h) {
- if (h->autolock) {
- pthread_mutex_lock(&h->mutex);
- }
-}
-
-static void autounlock(hash_map_lua_t *h) {
- if (h->autolock) {
- pthread_mutex_unlock(&h->mutex);
- }
-}
-
-int hash_map_autolock_on_lua(lua_State *L) {
- hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
- h->autolock = 1;
- return 0;
-}
-
-int hash_map_autolock_off_lua(lua_State *L) {
- hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
- h->autolock = 0;
- return 0;
-}
-
-int hash_map_init_lua(lua_State *L) {
- hash_map_lua_t **hp = (hash_map_lua_t**)lua_newuserdata(L, sizeof(hash_map_lua_t*));
- *hp = (hash_map_lua_t*)malloc(sizeof(hash_map_lua_t));
- hash_map_lua_t *h = *hp;
- h->refcount = 1;
- h->counter = 0;
- h->autolock = 0;
- h->h = hash_map_init();
-
- pthread_mutexattr_t mutex_attr;
- pthread_mutexattr_init(&mutex_attr);
- pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_RECURSIVE);
- pthread_mutex_init(&h->mutex, &mutex_attr);
-
- luaL_getmetatable(L, "dt.HashMap");
- lua_setmetatable(L, -2);
- return 1;
-}
-
-int hash_map_gc_lua(lua_State *L) {
- hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
- if (THAtomicDecrementRef(&h->refcount)) {
- pthread_mutex_destroy(&h->mutex);
- hash_map_destroy(h->h);
- free(h);
- }
- return 0;
-}
-
-int hash_map_retain_lua(lua_State *L) {
- hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
- THAtomicIncrementRef(&h->refcount);
- return 0;
-}
-
-int hash_map_metatablename_lua(lua_State *L) {
- lua_pushstring(L, "dt.HashMap");
- return 1;
-}
-
-int hash_map_clear_lua(lua_State *L) {
- hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
- autolock(h);
- hash_map_clear(h->h);
- autounlock(h);
- return 0;
-}
-
-int hash_map_put_lua(lua_State *L) {
- hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
- int ret;
-#if LUA_VERSION_NUM <= 501
-#define lua_isinteger lua_isnumber
-#endif
- if (lua_isinteger(L, 2)) {
- if (!lua_isinteger(L, 3))
- return LUA_HANDLE_ERROR_STR(L, "second parameter is not a number");
- long key = lua_tointeger(L, 2);
- long val = lua_tointeger(L, 3);
- autolock(h);
- ret = hash_map_put(h->h, key, val);
- autounlock(h);
- }
- else {
- THLongTensor *keys = (THLongTensor *)luaT_checkudata(L, 2, "torch.LongTensor");
- THLongTensor *vals = (THLongTensor *)luaT_checkudata(L, 3, "torch.LongTensor");
- check_tensor(L, keys, THLongTensor);
- check_tensor(L, vals, THLongTensor);
- check_tensors(L, keys, vals);
- autolock(h);
- ret = hash_map_put_tensor(h->h, keys, vals);
- autounlock(h);
- }
- if (!ret)
- return LUA_HANDLE_ERROR_STR(L, "failed to put into hash map");
- return 0;
-}
-
-int hash_map_fill_lua(lua_State *L) {
- hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
- int ret;
- if (lua_isinteger(L, 2)) {
- long key = lua_tointeger(L, 2);
- autolock(h);
- ret = hash_map_fill(h->h, key, &h->counter);
- autounlock(h);
- }
- else {
- THLongTensor *keys = (THLongTensor *)luaT_checkudata(L, 2, "torch.LongTensor");
- check_tensor(L, keys, THLongTensor);
- autolock(h);
- ret = hash_map_fill_tensor(h->h, keys, &h->counter);
- autounlock(h);
- }
- if (!ret)
- return LUA_HANDLE_ERROR_STR(L, "failed to fill into hash map");
- return 0;
-}
-
-int hash_map_adjust_counter_lua(lua_State *L) {
- hash_map_lua_t *h_ = *(hash_map_lua_t**)lua_touserdata(L, 1);
- internal_hash_map_t h = (internal_hash_map_t) h_->h;
-
- long val;
- kh_foreach_value(h, val, {
- if (val >= h_->counter)
- h_->counter = val;
- });
- return 0;
-}
-
-int hash_map_set_counter_lua(lua_State *L) {
- hash_map_lua_t *h_ = *(hash_map_lua_t**)lua_touserdata(L, 1);
- h_->counter = lua_tointeger(L, 2);
- return 0;
-}
-
-int hash_map_get_counter_lua(lua_State *L) {
- hash_map_lua_t *h_ = *(hash_map_lua_t**)lua_touserdata(L, 1);
- lua_pushinteger(L, h_->counter);
- return 1;
-}
-
-static int hash_map_get_tensor_lua(lua_State *L, hash_map_lua_t *h, int inplace) {
- THLongTensor *keys = (THLongTensor *)luaT_checkudata(L, 2, "torch.LongTensor");
- check_tensor(L, keys, THLongTensor);
- THLongTensor *vals = inplace ? keys : NULL;
- THByteTensor *mask = NULL;
-
- int maskIdx = inplace ? 3 : 4;
-
- if (!inplace) {
- if (lua_gettop(L) < 3) {
- vals = THLongTensor_new();
- } else {
- vals = (THLongTensor *)luaT_checkudata(L, 3, "torch.LongTensor");
- check_tensor(L, vals, THLongTensor);
- }
- }
-
- if (lua_gettop(L) < maskIdx) {
- mask = THByteTensor_new();
- } else {
- mask = (THByteTensor *)luaT_checkudata(L, maskIdx, "torch.ByteTensor");
- check_tensor(L, mask, THByteTensor);
- }
-
- int n_dim = THLongTensor_nDimension(keys);
- THLongStorage *st = THLongStorage_newWithSize1(n_dim);
- for (int i = 0; i < n_dim; i++) {
- THLongStorage_set(st, i, THLongTensor_size(keys, i));
- }
- THByteTensor_resize(mask, st, NULL);
- if (!inplace) THLongTensor_resize(vals, st, NULL);
- THLongStorage_free(st);
-
- autolock(h);
- hash_map_get_tensor(h->h, keys, vals, mask);
- autounlock(h);
-
- if (!inplace && lua_gettop(L) < 3)
- luaT_pushudata(L, vals, "torch.LongTensor");
- if (lua_gettop(L) < maskIdx)
- luaT_pushudata(L, mask, "torch.ByteTensor");
-
- return 2;
-}
-
-static int hash_map_get_table_lua(lua_State *L, hash_map_lua_t *h, int inplace) {
- const int kidx = 2;
- const int vidx = inplace ? 2 : 3;
- const int midx = inplace ? 3 : 4;
- const int narg = lua_gettop(L);
-
- if (inplace) {
- if (narg < 3) {
- LUA_HANDLE_ERROR_STR(L, "HashMap.getInplace requires two arguments.");
- }
- } else {
- if (narg < 4) {
- LUA_HANDLE_ERROR_STR(L, "HashMap.get requires three arguments.");
- }
- }
-
- int count = push_table_contents(L, kidx);
- verify_push_table_contents(L, vidx, count);
- verify_push_table_contents(L, midx, count);
-
- THLongTensor *keys;
- THLongTensor *vals;
- THByteTensor *mask;
- for (int i = count - 1; i >= 0; i--) {
- int maskIdx = i - count;
- int valIdx = maskIdx - count;
- int keyIdx = inplace ? valIdx : (valIdx - count);
-
- keys = (THLongTensor *)luaT_checkudata(L, keyIdx, "torch.LongTensor");
- check_tensor(L, keys, THLongTensor);
- if (inplace) {
- vals = keys;
- } else {
- vals = (THLongTensor *)luaT_checkudata(L, valIdx, "torch.LongTensor");
- }
- mask = (THByteTensor *)luaT_checkudata(L, maskIdx, "torch.ByteTensor");
-
- int n_dim = THLongTensor_nDimension(keys);
- THLongStorage *st = THLongStorage_newWithSize1(n_dim);
-      for (int d = 0; d < n_dim; d++) {
-         THLongStorage_set(st, d, THLongTensor_size(keys, d));
-      }
- THByteTensor_resize(mask, st, NULL);
- THLongTensor_resize(vals, st, NULL);
- THLongStorage_free(st);
-
- autolock(h);
- hash_map_get_tensor(h->h, keys, vals, mask);
- autounlock(h);
- }
- lua_pop(L, (narg - 1) * count);
- return 2;
-}
-
-int hash_map_get_lua(lua_State *L) {
- hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
- if (lua_isinteger(L, 2)) {
- long key = lua_tointeger(L, 2);
- long val;
- autolock(h);
- int ret = hash_map_get(h->h, key, &val);
- autounlock(h);
- if (ret) {
- lua_pushinteger(L, val);
- lua_pushinteger(L, 1);
- }
- else {
- lua_pushinteger(L, 0);
- lua_pushinteger(L, 0);
- }
- } else if (lua_istable(L, 2)) {
- return hash_map_get_table_lua(L, h, 0);
- } else {
- return hash_map_get_tensor_lua(L, h, 0);
- }
- return 2;
-}
-
-int hash_map_get_inplace_lua(lua_State *L) {
- hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
- if (lua_isinteger(L, 2)) {
- LUA_HANDLE_ERROR_STR(L, "HashMap.getInplace does not support integer arguments.");
- } else if (lua_istable(L, 2)) {
- return hash_map_get_table_lua(L, h, 1);
- } else {
- return hash_map_get_tensor_lua(L, h, 1);
- }
- return 2;
-}
-
-int hash_map_del_lua(lua_State *L) {
- hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
- if (lua_isinteger(L, 2)) {
- long key = lua_tointeger(L, 2);
- autolock(h);
- hash_map_del(h->h, key);
- autounlock(h);
- }
- else {
- THLongTensor *keys = (THLongTensor *)luaT_checkudata(L, 2, "torch.LongTensor");
- autolock(h);
- hash_map_del_tensor(h->h, keys);
- autounlock(h);
- }
- return 0;
-}
-
-int hash_map_size_lua(lua_State *L) {
- hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
- long size = hash_map_size(h->h);
- lua_pushinteger(L, size);
- return 1;
-}
-
-int hash_map_to_tensor_lua(lua_State *L) {
- hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
- THLongTensor *keys, *vals;
-
- if (lua_gettop(L) < 2) {
- keys = THLongTensor_new();
- }
- else {
- keys = (THLongTensor *)luaT_checkudata(L, 2, "torch.LongTensor");
- check_tensor(L, keys, THLongTensor);
- }
-
- if (lua_gettop(L) < 3) {
- vals = THLongTensor_new();
- }
- else {
- vals = (THLongTensor *)luaT_checkudata(L, 3, "torch.LongTensor");
- check_tensor(L, vals, THLongTensor);
- }
-
- size_t size = hash_map_size(h->h);
- THLongTensor_resize1d(keys, size);
- THLongTensor_resize1d(vals, size);
-
- autolock(h);
- hash_map_to_tensor(h->h, keys, vals);
- autounlock(h);
-
- if (lua_gettop(L) < 2)
- luaT_pushudata(L, keys, "torch.LongTensor");
- if (lua_gettop(L) < 3)
- luaT_pushudata(L, vals, "torch.LongTensor");
- return 2;
-}
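-
-/* Lua-side usage sketch (illustrative only; it assumes the constructor is
- * exposed as dt.HashMap and the methods are registered as in init.c below):
- *
- *   local h = dt.HashMap()
- *   h:put(1, 10)                        -- scalar key/value pair
- *   local val, found = h:get(1)         -- val == 10, found == 1
- *   h:put(torch.LongTensor{2,3}, torch.LongTensor{20,30})
- *   print(h:size())                     -- 3
- *   local keys, vals = h:toTensors()    -- dump contents to two LongTensors
- */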
diff --git a/contrib/lua-torch/decisiontree/hash_map.h b/contrib/lua-torch/decisiontree/hash_map.h
deleted file mode 100644
index 5b215e4ca..000000000
--- a/contrib/lua-torch/decisiontree/hash_map.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#include "luaT.h"
-#include "TH.h"
-
-typedef void* hash_map_t;
-
-hash_map_t hash_map_init(void);
-void hash_map_destroy(hash_map_t);
-void hash_map_clear(hash_map_t);
-int hash_map_put(hash_map_t, long key, long val);
-int hash_map_put_tensor(hash_map_t, THLongTensor *keys_, THLongTensor *vals_);
-int hash_map_fill(hash_map_t, long key, long *counter);
-int hash_map_fill_tensor(hash_map_t, THLongTensor *keys_, long *counter);
-int hash_map_get(hash_map_t, long key, long *val);
-void hash_map_get_tensor(hash_map_t, THLongTensor *keys_, THLongTensor *vals_, THByteTensor *mask_);
-void hash_map_del(hash_map_t, long key);
-void hash_map_del_tensor(hash_map_t, THLongTensor *keys_);
-size_t hash_map_size(hash_map_t);
-void hash_map_to_tensor(hash_map_t, THLongTensor *keys_, THLongTensor *vals_);
-
-int hash_map_autolock_on_lua(lua_State *L);
-int hash_map_autolock_off_lua(lua_State *L);
-int hash_map_init_lua(lua_State *L);
-int hash_map_gc_lua(lua_State *L);
-int hash_map_retain_lua(lua_State *L);
-int hash_map_metatablename_lua(lua_State *L);
-int hash_map_clear_lua(lua_State *L);
-int hash_map_put_lua(lua_State *L);
-int hash_map_fill_lua(lua_State *L);
-int hash_map_adjust_counter_lua(lua_State *L);
-int hash_map_set_counter_lua(lua_State *L);
-int hash_map_get_counter_lua(lua_State *L);
-int hash_map_get_lua(lua_State *L);
-int hash_map_get_inplace_lua(lua_State *L);
-int hash_map_del_lua(lua_State *L);
-int hash_map_size_lua(lua_State *L);
-int hash_map_to_tensor_lua(lua_State *L);
diff --git a/contrib/lua-torch/decisiontree/init.c b/contrib/lua-torch/decisiontree/init.c
deleted file mode 100644
index 276241e8e..000000000
--- a/contrib/lua-torch/decisiontree/init.c
+++ /dev/null
@@ -1,77 +0,0 @@
-#include "TH.h"
-#include "luaT.h"
-
-#ifdef _OPENMP
-#include "omp.h"
-#endif
-
-#include "error.h"
-#include "hash_map.h"
-
-#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
-#define torch_Tensor TH_CONCAT_STRING_3(torch., Real, Tensor)
-#define nn_(NAME) TH_CONCAT_3(nn_, Real, NAME)
-
-#include "generic/LogitBoostCriterion.c"
-#include "THGenerateFloatTypes.h"
-
-#include "generic/DFD.c"
-#include "THGenerateFloatTypes.h"
-
-#include "generic/S2D.c"
-#include "THGenerateFloatTypes.h"
-
-#include "generic/CartTree.c"
-#include "THGenerateFloatTypes.h"
-
-#include "GBDT_common.h"
-#include "generic/GBDT.c"
-#include "THGenerateFloatTypes.h"
-
-static const struct luaL_Reg decisiontree_hash_map_routines[] = {
- {"__gc", hash_map_gc_lua},
- {"retain", hash_map_retain_lua},
- {"metatablename", hash_map_metatablename_lua},
- {"clear", hash_map_clear_lua},
- {"put", hash_map_put_lua},
- {"fill", hash_map_fill_lua},
- {"adjustCounter", hash_map_adjust_counter_lua},
- {"getCounter", hash_map_get_counter_lua},
- {"setCounter", hash_map_set_counter_lua},
- {"get", hash_map_get_lua},
- {"getInplace", hash_map_get_inplace_lua},
- {"del", hash_map_del_lua},
- {"size", hash_map_size_lua},
- {"safe", hash_map_autolock_on_lua},
- {"unsafe", hash_map_autolock_off_lua},
- {"toTensors", hash_map_to_tensor_lua},
- {"new", hash_map_init_lua},
- {NULL, NULL}
-};
-
-DLL_EXPORT int luaopen_libdecisiontree(lua_State *L)
-{
- // HashMap
- luaL_newmetatable(L, "dt.HashMap");
- lua_pushstring(L, "__index");
- lua_pushvalue(L, -2);
- lua_settable(L, -3);
- luaT_setfuncs(L, decisiontree_hash_map_routines, 0);
-
- nn_FloatLogitBoostCriterion_init(L);
- nn_DoubleLogitBoostCriterion_init(L);
-
- nn_FloatDFD_init(L);
- nn_DoubleDFD_init(L);
-
- nn_FloatS2D_init(L);
- nn_DoubleS2D_init(L);
-
- nn_FloatCT_init(L);
- nn_DoubleCT_init(L);
-
- nn_FloatGBDT_init(L);
- nn_DoubleGBDT_init(L);
-
- return 1;
-}
diff --git a/contrib/lua-torch/decisiontree/init.lua b/contrib/lua-torch/decisiontree/init.lua
deleted file mode 100644
index 26f790b60..000000000
--- a/contrib/lua-torch/decisiontree/init.lua
+++ /dev/null
@@ -1,70 +0,0 @@
-require 'paths'
---require 'xlua'
-require 'string'
-require 'os'
---require 'sys'
-require 'nn'
-
--- these requires return local variables; code that needs them will
--- re-require them later. Requiring here just ensures they are loaded.
-require 'moses'
-
-unpack = unpack or table.unpack
-
-local dt = require 'decisiontree._env'
-
--- c lib:
-require "paths"
-paths.require 'libdecisiontree'
-
-dt.HashMap = torch.getmetatable("dt.HashMap").new
-
-dt.EPSILON = 1e-6
-
--- experimental Tensor-like container
-require 'decisiontree.SparseTensor'
-
--- functions
-require 'decisiontree.math'
-require 'decisiontree.utils'
-
--- for multi-threading
---require 'decisiontree.WorkPool'
-
--- abstract classes
-require 'decisiontree.DecisionTree'
-require 'decisiontree.DecisionForest'
-require 'decisiontree.DecisionForestTrainer'
-require 'decisiontree.TreeState'
-
--- for CaRTree inference
-require 'decisiontree.CartNode'
-require 'decisiontree.CartTree'
-
--- Criterions (extended with updateHessInput and backward2)
-require 'decisiontree.MSECriterion'
-require 'decisiontree.LogitBoostCriterion'
-
--- Used by both RandomForestTrainer and GradientBoostTrainer
-require 'decisiontree.CartTrainer'
-
--- Used by CartTrainer
-require 'decisiontree.DataSet'
-
--- Random Forest Training
-require 'decisiontree.RandomForestTrainer'
-require 'decisiontree.GiniState' -- TreeState subclass
-
--- Gradient Boosted Decision Tree Training
-require 'decisiontree.GradientBoostTrainer'
-require 'decisiontree.GradientBoostState' -- TreeState subclass
-
--- unit tests and benchmarks
---require 'decisiontree.test'
---require 'decisiontree.benchmark'
-
--- nn.Module
-require 'decisiontree.DFD'
-require 'decisiontree.Sparse2Dense'
-
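--- Usage sketch (illustrative; mirrors the patterns used in test.lua below):
---
---   local dt = require 'decisiontree'
---   local trainSet = dt.DataSet(inputs, targets)
---   local trainer = dt.RandomForestTrainer{
---      nTree=14, activeRatio=0.5, featureBaggingSize=5,
---      maxLeafNodes=50, minLeafSize=10
---   }
---   local forest = trainer:train(trainSet, trainSet.featureIds)
---   local score = forest:score(trainSet.input[1])
-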
-return dt
diff --git a/contrib/lua-torch/decisiontree/internal_hash_map.h b/contrib/lua-torch/decisiontree/internal_hash_map.h
deleted file mode 100644
index bc8c523ef..000000000
--- a/contrib/lua-torch/decisiontree/internal_hash_map.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#include "khash.h"
-#include <pthread.h>
-
-KHASH_MAP_INIT_INT64(long, long)
-typedef khash_t(long)* internal_hash_map_t;
-
-typedef struct {
- hash_map_t h;
- int refcount;
- pthread_mutex_t mutex;
- int autolock;
- long counter;
-} hash_map_lua_t;
diff --git a/contrib/lua-torch/decisiontree/khash.h b/contrib/lua-torch/decisiontree/khash.h
deleted file mode 100644
index 20e994063..000000000
--- a/contrib/lua-torch/decisiontree/khash.h
+++ /dev/null
@@ -1,627 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/*
- An example:
-
-#include "khash.h"
-KHASH_MAP_INIT_INT(32, char)
-int main() {
- int ret, is_missing;
- khiter_t k;
- khash_t(32) *h = kh_init(32);
- k = kh_put(32, h, 5, &ret);
- kh_value(h, k) = 10;
- k = kh_get(32, h, 10);
- is_missing = (k == kh_end(h));
- k = kh_get(32, h, 5);
- kh_del(32, h, k);
- for (k = kh_begin(h); k != kh_end(h); ++k)
- if (kh_exist(h, k)) kh_value(h, k) = 1;
- kh_destroy(32, h);
- return 0;
-}
-*/
-
-/*
- 2013-05-02 (0.2.8):
-
- * Use quadratic probing. When the capacity is a power of 2, the stepping
-   function i*(i+1)/2 is guaranteed to traverse each bucket. It has better
-   cache performance than double hashing and is more robust than linear probing.
-
-   In theory, double hashing should be more robust than quadratic probing.
-   However, my implementation is probably not suited to large hash tables,
-   because the second hash function is closely tied to the first, which
-   reduces the effectiveness of double hashing.
-
- Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
-
- 2011-12-29 (0.2.7):
-
- * Minor code clean up; no actual effect.
-
- 2011-09-16 (0.2.6):
-
- * The capacity is a power of 2. This seems to dramatically improve the
-   speed for simple keys. Thanks to Zilong Tan for the suggestion. Reference:
-
- - http://code.google.com/p/ulib/
- - http://nothings.org/computer/judy/
-
- * Allow optional use of linear probing, which usually has better
-   performance for random input. Double hashing is still the default as it
- is more robust to certain non-random input.
-
- * Added Wang's integer hash function (not used by default). This hash
- function is more robust to certain non-random input.
-
- 2011-02-14 (0.2.5):
-
- * Allow declaring global functions.
-
- 2009-09-26 (0.2.4):
-
- * Improve portability
-
- 2008-09-19 (0.2.3):
-
- * Corrected the example
- * Improved interfaces
-
- 2008-09-11 (0.2.2):
-
- * Improved speed a little in kh_put()
-
- 2008-09-10 (0.2.1):
-
- * Added kh_clear()
- * Fixed a compiling error
-
- 2008-09-02 (0.2.0):
-
- * Changed to token concatenation which increases flexibility.
-
- 2008-08-31 (0.1.2):
-
- * Fixed a bug in kh_get(), which has not been tested previously.
-
- 2008-08-31 (0.1.1):
-
- * Added destructor
-*/
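-
-/* Why the quadratic step above visits every bucket (a short justification of
-   the 0.2.8 note): with n_buckets = 2^m, probe i inspects bucket
-   (h + i*(i+1)/2) mod 2^m. The triangular numbers i*(i+1)/2 are pairwise
-   distinct modulo 2^m for i = 0 .. 2^m - 1, so the first 2^m probes cover
-   every bucket exactly once before the sequence repeats. */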
-
-
-#ifndef __AC_KHASH_H
-#define __AC_KHASH_H
-
-/*!
- @header
-
- Generic hash table library.
- */
-
-#define AC_VERSION_KHASH_H "0.2.8"
-
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-
-/* compiler specific configuration */
-
-#if UINT_MAX == 0xffffffffu
-typedef unsigned int khint32_t;
-#elif ULONG_MAX == 0xffffffffu
-typedef unsigned long khint32_t;
-#endif
-
-#if ULONG_MAX == ULLONG_MAX
-typedef unsigned long khint64_t;
-#else
-typedef unsigned long long khint64_t;
-#endif
-
-#ifndef kh_inline
-#ifdef _MSC_VER
-#define kh_inline __inline
-#else
-#define kh_inline inline
-#endif
-#endif /* kh_inline */
-
-#ifndef klib_unused
-#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
-#define klib_unused __attribute__ ((__unused__))
-#else
-#define klib_unused
-#endif
-#endif /* klib_unused */
-
-typedef khint32_t khint_t;
-typedef khint_t khiter_t;
-
-#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
-#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
-#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
-#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
-#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
-#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
-#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
-
-#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
-
-#ifndef kroundup32
-#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
-#endif
-
-#ifndef kcalloc
-#define kcalloc(N,Z) calloc(N,Z)
-#endif
-#ifndef kmalloc
-#define kmalloc(Z) malloc(Z)
-#endif
-#ifndef krealloc
-#define krealloc(P,Z) realloc(P,Z)
-#endif
-#ifndef kfree
-#define kfree(P) free(P)
-#endif
-
-static const double __ac_HASH_UPPER = 0.77;
-
-#define __KHASH_TYPE(name, khkey_t, khval_t) \
- typedef struct kh_##name##_s { \
- khint_t n_buckets, size, n_occupied, upper_bound; \
- khint32_t *flags; \
- khkey_t *keys; \
- khval_t *vals; \
- } kh_##name##_t;
-
-#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
- extern kh_##name##_t *kh_init_##name(void); \
- extern void kh_destroy_##name(kh_##name##_t *h); \
- extern void kh_clear_##name(kh_##name##_t *h); \
- extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
- extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
- extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
- extern void kh_del_##name(kh_##name##_t *h, khint_t x);
-
-#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
- SCOPE kh_##name##_t *kh_init_##name(void) { \
- return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
- } \
- SCOPE void kh_destroy_##name(kh_##name##_t *h) \
- { \
- if (h) { \
- kfree((void *)h->keys); kfree(h->flags); \
- kfree((void *)h->vals); \
- kfree(h); \
- } \
- } \
- SCOPE void kh_clear_##name(kh_##name##_t *h) \
- { \
- if (h && h->flags) { \
- memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
- h->size = h->n_occupied = 0; \
- } \
- } \
- SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
- { \
- if (h->n_buckets) { \
- khint_t k, i, last, mask, step = 0; \
- mask = h->n_buckets - 1; \
- k = __hash_func(key); i = k & mask; \
- last = i; \
- while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
- i = (i + (++step)) & mask; \
- if (i == last) return h->n_buckets; \
- } \
- return __ac_iseither(h->flags, i)? h->n_buckets : i; \
- } else return 0; \
- } \
- SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
- { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
- khint32_t *new_flags = 0; \
- khint_t j = 1; \
- { \
- kroundup32(new_n_buckets); \
- if (new_n_buckets < 4) new_n_buckets = 4; \
- if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
- else { /* hash table size to be changed (shrink or expand); rehash */ \
- new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
- if (!new_flags) return -1; \
- memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
- if (h->n_buckets < new_n_buckets) { /* expand */ \
- khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
- if (!new_keys) { kfree(new_flags); return -1; } \
- h->keys = new_keys; \
- if (kh_is_map) { \
- khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
- if (!new_vals) { kfree(new_flags); return -1; } \
- h->vals = new_vals; \
- } \
- } /* otherwise shrink */ \
- } \
- } \
- if (j) { /* rehashing is needed */ \
- for (j = 0; j != h->n_buckets; ++j) { \
- if (__ac_iseither(h->flags, j) == 0) { \
- khkey_t key = h->keys[j]; \
- khval_t val; \
- khint_t new_mask; \
- new_mask = new_n_buckets - 1; \
- if (kh_is_map) val = h->vals[j]; \
- __ac_set_isdel_true(h->flags, j); \
- while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
- khint_t k, i, step = 0; \
- k = __hash_func(key); \
- i = k & new_mask; \
- while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
- __ac_set_isempty_false(new_flags, i); \
- if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
- { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
- if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
- __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
- } else { /* write the element and jump out of the loop */ \
- h->keys[i] = key; \
- if (kh_is_map) h->vals[i] = val; \
- break; \
- } \
- } \
- } \
- } \
- if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
- h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
- if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
- } \
- kfree(h->flags); /* free the working space */ \
- h->flags = new_flags; \
- h->n_buckets = new_n_buckets; \
- h->n_occupied = h->size; \
- h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
- } \
- return 0; \
- } \
- SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
- { \
- khint_t x; \
- if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
- if (h->n_buckets > (h->size<<1)) { \
- if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
- *ret = -1; return h->n_buckets; \
- } \
- } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
- *ret = -1; return h->n_buckets; \
- } \
-   } /* TODO: implement automatic shrinking; resize() already supports shrinking */ \
- { \
- khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
- x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
- if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
- else { \
- last = i; \
- while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
- if (__ac_isdel(h->flags, i)) site = i; \
- i = (i + (++step)) & mask; \
- if (i == last) { x = site; break; } \
- } \
- if (x == h->n_buckets) { \
- if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
- else x = i; \
- } \
- } \
- } \
- if (__ac_isempty(h->flags, x)) { /* not present at all */ \
- h->keys[x] = key; \
- __ac_set_isboth_false(h->flags, x); \
- ++h->size; ++h->n_occupied; \
- *ret = 1; \
- } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
- h->keys[x] = key; \
- __ac_set_isboth_false(h->flags, x); \
- ++h->size; \
- *ret = 2; \
- } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
- return x; \
- } \
- SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
- { \
- if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
- __ac_set_isdel_true(h->flags, x); \
- --h->size; \
- } \
- }
-
-#define KHASH_DECLARE(name, khkey_t, khval_t) \
- __KHASH_TYPE(name, khkey_t, khval_t) \
- __KHASH_PROTOTYPES(name, khkey_t, khval_t)
-
-#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
- __KHASH_TYPE(name, khkey_t, khval_t) \
- __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
-
-#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
- KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
-
-/* --- BEGIN OF HASH FUNCTIONS --- */
-
-/*! @function
- @abstract Integer hash function
- @param key The integer [khint32_t]
- @return The hash value [khint_t]
- */
-#define kh_int_hash_func(key) (khint32_t)(key)
-/*! @function
- @abstract Integer comparison function
- */
-#define kh_int_hash_equal(a, b) ((a) == (b))
-/*! @function
- @abstract 64-bit integer hash function
- @param key The integer [khint64_t]
- @return The hash value [khint_t]
- */
-#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
-/*! @function
- @abstract 64-bit integer comparison function
- */
-#define kh_int64_hash_equal(a, b) ((a) == (b))
-/*! @function
- @abstract const char* hash function
- @param s Pointer to a null terminated string
- @return The hash value
- */
-static kh_inline khint_t __ac_X31_hash_string(const char *s)
-{
- khint_t h = (khint_t)*s;
- if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
- return h;
-}
-/*! @function
- @abstract Another interface to const char* hash function
- @param key Pointer to a null terminated string [const char*]
- @return The hash value [khint_t]
- */
-#define kh_str_hash_func(key) __ac_X31_hash_string(key)
-/*! @function
- @abstract Const char* comparison function
- */
-#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
-
-static kh_inline khint_t __ac_Wang_hash(khint_t key)
-{
- key += ~(key << 15);
- key ^= (key >> 10);
- key += (key << 3);
- key ^= (key >> 6);
- key += ~(key << 11);
- key ^= (key >> 16);
- return key;
-}
-#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key)
-
-/* --- END OF HASH FUNCTIONS --- */
-
-/* Other convenient macros... */
-
-/*!
- @abstract Type of the hash table.
- @param name Name of the hash table [symbol]
- */
-#define khash_t(name) kh_##name##_t
-
-/*! @function
- @abstract Initiate a hash table.
- @param name Name of the hash table [symbol]
- @return Pointer to the hash table [khash_t(name)*]
- */
-#define kh_init(name) kh_init_##name()
-
-/*! @function
- @abstract Destroy a hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- */
-#define kh_destroy(name, h) kh_destroy_##name(h)
-
-/*! @function
- @abstract Reset a hash table without deallocating memory.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- */
-#define kh_clear(name, h) kh_clear_##name(h)
-
-/*! @function
- @abstract Resize a hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- @param s New size [khint_t]
- */
-#define kh_resize(name, h, s) kh_resize_##name(h, s)
-
-/*! @function
- @abstract Insert a key to the hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- @param k Key [type of keys]
- @param r Extra return code: -1 if the operation failed;
- 0 if the key is present in the hash table;
- 1 if the bucket is empty (never used); 2 if the element in
- the bucket has been deleted [int*]
- @return Iterator to the inserted element [khint_t]
- */
-#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
-
-/*! @function
- @abstract Retrieve a key from the hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- @param k Key [type of keys]
- @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
- */
-#define kh_get(name, h, k) kh_get_##name(h, k)
-
-/*! @function
- @abstract Remove a key from the hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- @param k Iterator to the element to be deleted [khint_t]
- */
-#define kh_del(name, h, k) kh_del_##name(h, k)
-
-/*! @function
- @abstract Test whether a bucket contains data.
- @param h Pointer to the hash table [khash_t(name)*]
- @param x Iterator to the bucket [khint_t]
- @return 1 if containing data; 0 otherwise [int]
- */
-#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
-
-/*! @function
- @abstract Get key given an iterator
- @param h Pointer to the hash table [khash_t(name)*]
- @param x Iterator to the bucket [khint_t]
- @return Key [type of keys]
- */
-#define kh_key(h, x) ((h)->keys[x])
-
-/*! @function
- @abstract Get value given an iterator
- @param h Pointer to the hash table [khash_t(name)*]
- @param x Iterator to the bucket [khint_t]
- @return Value [type of values]
- @discussion For hash sets, calling this results in a segfault.
- */
-#define kh_val(h, x) ((h)->vals[x])
-
-/*! @function
- @abstract Alias of kh_val()
- */
-#define kh_value(h, x) ((h)->vals[x])
-
-/*! @function
- @abstract Get the start iterator
- @param h Pointer to the hash table [khash_t(name)*]
- @return The start iterator [khint_t]
- */
-#define kh_begin(h) (khint_t)(0)
-
-/*! @function
- @abstract Get the end iterator
- @param h Pointer to the hash table [khash_t(name)*]
- @return The end iterator [khint_t]
- */
-#define kh_end(h) ((h)->n_buckets)
-
-/*! @function
- @abstract Get the number of elements in the hash table
- @param h Pointer to the hash table [khash_t(name)*]
- @return Number of elements in the hash table [khint_t]
- */
-#define kh_size(h) ((h)->size)
-
-/*! @function
- @abstract Get the number of buckets in the hash table
- @param h Pointer to the hash table [khash_t(name)*]
- @return Number of buckets in the hash table [khint_t]
- */
-#define kh_n_buckets(h) ((h)->n_buckets)
-
-/*! @function
- @abstract Iterate over the entries in the hash table
- @param h Pointer to the hash table [khash_t(name)*]
- @param kvar Variable to which key will be assigned
- @param vvar Variable to which value will be assigned
- @param code Block of code to execute
- */
-#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
- for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
- if (!kh_exist(h,__i)) continue; \
- (kvar) = kh_key(h,__i); \
- (vvar) = kh_val(h,__i); \
- code; \
- } }
-
-/*! @function
- @abstract Iterate over the values in the hash table
- @param h Pointer to the hash table [khash_t(name)*]
- @param vvar Variable to which value will be assigned
- @param code Block of code to execute
- */
-#define kh_foreach_value(h, vvar, code) { khint_t __i; \
- for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
- if (!kh_exist(h,__i)) continue; \
- (vvar) = kh_val(h,__i); \
- code; \
- } }
-
-/* More convenient interfaces */
-
-/*! @function
- @abstract Instantiate a hash set containing integer keys
- @param name Name of the hash table [symbol]
- */
-#define KHASH_SET_INIT_INT(name) \
- KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash map containing integer keys
- @param name Name of the hash table [symbol]
- @param khval_t Type of values [type]
- */
-#define KHASH_MAP_INIT_INT(name, khval_t) \
- KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash set containing 64-bit integer keys
- @param name Name of the hash table [symbol]
- */
-#define KHASH_SET_INIT_INT64(name) \
- KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash map containing 64-bit integer keys
- @param name Name of the hash table [symbol]
- @param khval_t Type of values [type]
- */
-#define KHASH_MAP_INIT_INT64(name, khval_t) \
- KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
-
-typedef const char *kh_cstr_t;
-/*! @function
- @abstract Instantiate a hash set containing const char* keys
- @param name Name of the hash table [symbol]
- */
-#define KHASH_SET_INIT_STR(name) \
- KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash map containing const char* keys
- @param name Name of the hash table [symbol]
- @param khval_t Type of values [type]
- */
-#define KHASH_MAP_INIT_STR(name, khval_t) \
- KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
-
-#endif /* __AC_KHASH_H */
diff --git a/contrib/lua-torch/decisiontree/math.lua b/contrib/lua-torch/decisiontree/math.lua
deleted file mode 100644
index eb71b31ed..000000000
--- a/contrib/lua-torch/decisiontree/math.lua
+++ /dev/null
@@ -1,84 +0,0 @@
-local dt = require "decisiontree._env"
-
-local PSEUDOCOUNT = 1.0
-local MIN_LOGISTIC = 1E-8
-local MAX_LOGISTIC = 1.0 - MIN_LOGISTIC
-
--- Count the number of examples of each class (classes are read from inputset.target)
-function dt.uniquecounts(counts, inputset, nclass)
- counts = counts or inputset.input.new()
- nclass = nclass or inputset.target:max()
- counts:resize(nclass):zero()
-
- inputset.target:apply(function(c) counts[c] = counts[c] + 1 end)
- return counts
-end
-
--- Entropy is the sum of -p(x)log(p(x)) across all the different possible results
-local counts, logprobs
-function dt.entropy(inputset, nclass)
- local dt = require 'decisiontree'
- counts = dt.uniquecounts(counts, inputset, nclass)
- -- convert counts to categorical probabilities
- counts:add(0.0000001) -- prevent NaN
- counts:div(counts:sum())
-
- logprobs = logprobs or counts.new()
- logprobs:resize(counts:size())
- logprobs:log(counts):div(math.log(2)) -- log2(x)
-
- counts:cmul(logprobs)
-
- return -counts:sum()
-end
-
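--- Worked example (illustrative): with a 50/50 split over two classes the
--- probabilities are {0.5, 0.5}, so entropy = -(0.5*log2(0.5) + 0.5*log2(0.5))
--- = 1 bit, while a pure node ({1.0, 0.0}) has entropy close to 0.
-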
--- Compute and return the probability of a positive label.
-function dt.probabilityPositive(nPositive, nTotal)
-   return (nPositive + PSEUDOCOUNT) / (nTotal + 2.0 * PSEUDOCOUNT)
-end
-
--- Ref. https://en.wikipedia.org/wiki/Logit
--- Calculates the logit of a probability.
--- Logit represents the log-odds. Probabilities transformed to logit 'space' can be combined linearly.
-function dt.logit(p)
- assert(p >= 0.0 and p <= 1.0, "Expecting probability for arg 1")
- local truncatedP = math.max(MIN_LOGISTIC, math.min(MAX_LOGISTIC, p))
- return math.log(truncatedP / (1.0 - truncatedP))
-end
-
-function dt.logistic(x)
- return (x >= 0) and (1 / (1 + math.exp(-x))) or (1 - 1 / (1 + math.exp(x)))
-end
-
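--- Note (illustrative): logit and logistic are inverses on the truncated
--- range, e.g. logit(0.5) = 0 and logistic(0) = 0.5, so
--- logistic(logit(p)) == p for any p in [MIN_LOGISTIC, MAX_LOGISTIC].
-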
-function dt.computeGradientBoostLoss(gradient, hessian)
- return -gradient * gradient / hessian
-end
-
-function dt.computeNewtonScore(gradient, hessian)
-   return -0.5 * gradient / hessian
-end
-
--- Calculates the logit score for a Node in a Decision Tree based on the probability of a positive label.
--- params: number of positive examples and total number of examples.
-function dt.calculateLogitScore(nPositive, nTotal)
- local dt = require 'decisiontree'
- return dt.logit(dt.probabilityPositive(nPositive, nTotal))
-end
-
--- Compute and return the Gini impurity score based on an input contingency table.
-function dt.computeGini(leftCount, positiveLeftCount, rightCount, positiveRightCount)
-   assert(torch.type(leftCount) == 'number', 'Expecting total number of examples falling into the left branch.')
-   assert(torch.type(positiveLeftCount) == 'number', 'Expecting total number of positive examples falling into the left branch.')
- assert(torch.type(rightCount) == 'number', 'Expecting total number of examples falling into the right branch.')
- assert(torch.type(positiveRightCount) == 'number', 'Expecting total number of positive examples falling into the right branch.')
-
- local total = leftCount + rightCount
-
- local pPositiveLeft = leftCount == 0 and 0 or (positiveLeftCount / leftCount)
- local leftGini = pPositiveLeft * (1.0 - pPositiveLeft)
-
- local pPositiveRight = rightCount == 0 and 0 or (positiveRightCount / rightCount)
- local rightGini = pPositiveRight * (1.0 - pPositiveRight)
-
- return (leftCount * leftGini + rightCount * rightGini) / total
-end \ No newline at end of file
diff --git a/contrib/lua-torch/decisiontree/rocks/decisiontree-scm-1.rockspec b/contrib/lua-torch/decisiontree/rocks/decisiontree-scm-1.rockspec
deleted file mode 100644
index d5d916275..000000000
--- a/contrib/lua-torch/decisiontree/rocks/decisiontree-scm-1.rockspec
+++ /dev/null
@@ -1,40 +0,0 @@
-package = "decisiontree"
-version = "scm-1"
-
-source = {
- url = "git://github.com/Twitter/decisiontree",
- tag = "master"
-}
-
-description = {
- summary = "Decision trees for Torch by Twitter",
- detailed = [[
- Classification and regression trees (CART).
-      Gradient boosted decision trees (GBDT).
- ]],
- homepage = "https://github.com/Twitter/decisiontree",
- license = "BSD"
-}
-
-dependencies = {
- "torch >= 7.0",
- "moses >= 1.3.1",
- "xlua >= 1.0",
- "image >= 1.0",
- "luafilesystem >= 1.6.2",
- "sys >= 1.1",
- "paths >= 1.0",
- "ipc >= 1.0",
- "nn >= 1.0"
-}
-
-build = {
- type = "command",
- build_command = [[
-cmake -E make_directory build;
-cd build;
-cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" -DCMAKE_C_FLAGS=-fPIC -DCMAKE_CXX_FLAGS=-fPIC;
-$(MAKE)
- ]],
- install_command = "cd build && $(MAKE) install"
-}
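-
--- Build sketch (an assumption, not part of the rockspec itself): from the
--- package root, "luarocks make rocks/decisiontree-scm-1.rockspec" would run
--- the build_command above and then the install_command.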
diff --git a/contrib/lua-torch/decisiontree/test.lua b/contrib/lua-torch/decisiontree/test.lua
deleted file mode 100644
index 80510a4f6..000000000
--- a/contrib/lua-torch/decisiontree/test.lua
+++ /dev/null
@@ -1,817 +0,0 @@
-local dt = require "decisiontree._env"
-
-local dttest = {}
-local nloop = 50
-local epsilon = 0.000001
-local mytester
-
---e.g. usage: th -e "dt = require 'decisiontree'; dt.test()"
-
--- test 99% accuracy
-local function testAccuracy(cartTree, name, dataset, minacc)
- assert(torch.isTypeOf(dataset, 'dt.DataSet'))
- minacc = minacc or 0.99
- local output = torch.Tensor(dataset:size())
- local target, input = dataset.target, dataset.input
-
- for i=1,dataset:size() do
- local stack = {}
- local score = cartTree:score(input[i], stack)
- output[i] = score >= 0 and 1 or 0
-
- if dt.VERBOSE and torch.type(cartTree) == 'dt.CartTree' and target[i] ~= output[i] then
-         print(cartTree:stackToString(stack, input[i]))
- print(i, score, target[i], output[i])
- end
- end
-
- local accuracy = torch.eq(target, output):float():mean()
- mytester:assert(accuracy >= minacc, name .. ": insufficient accuracy: " .. accuracy .. " < " .. minacc)
-end
-
-function dttest.SparseTensor()
- local keys = torch.LongTensor{1,5,6,10}
- local values = torch.randn(keys:size(1))
- local st = torch.SparseTensor(keys, values)
-
- mytester:assert(st[1] == values[1])
- mytester:assert(st[5] == values[2])
- mytester:assert(st[6] == values[3])
- mytester:assert(st[10] == values[4])
-
- mytester:assert(st[2] == nil)
-
- st:buildIndex()
-
- mytester:assert(st[1] == values[1])
- mytester:assert(st[5] == values[2])
- mytester:assert(st[6] == values[3])
- mytester:assert(st[10] == values[4])
-
- mytester:assert(st[2] == nil)
-
- -- test empty sparse tensor
-
- local est = torch.SparseTensor()
-end
-
-function dttest.GiniState()
- local featureId = 2
- local minLeafSize = 0
-
- local input = torch.Tensor({{0,1,0},{0,2,0},{0,3,0}})
- local target = torch.Tensor({1, 1, 1})
- local dataset = dt.DataSet(input, target, 3)
-
- local splitInfo1 = {_id=1}
- local splitInfo2 = {_id=2, leftChildSize = 1, rightChildSize = 2, splitGain = 0}
- local splitInfo3 = {_id=3, leftChildSize = 2, rightChildSize = 1, splitGain = -1}
-
- local exampleIds = torch.LongTensor{1,2,3}
-
- local treeState = dt.GiniState(exampleIds)
-
- function treeState.computeSplitInfo(self, splitFeatureId, splitFeatureValue)
- if splitFeatureId == featureId and splitFeatureValue == 2 then
- return splitInfo2
- elseif splitFeatureId == featureId and splitFeatureValue == 3 then
- return splitInfo3
- else
- error("Unhandled computeSplitInfo call "..splitFeatureId.." "..splitFeatureValue)
- end
- end
-
- local splitInfo = treeState:findBestFeatureSplit(dataset, featureId, minLeafSize)
- mytester:assert(splitInfo._id == splitInfo3._id)
-end
-
-function dttest.CartTree()
-
- local splitFeatureId = 100
- local splitFeatureValue = 1.0
-
- local function getBinaryCartTreeRootNode()
-
- local leftNodeScore = 0.2
- local rightNodeScore = 0.4
-
- local rootNode = dt.CartNode()
- rootNode.nodeId = 0
- rootNode.score = 0.5
- rootNode.splitFeatureId = splitFeatureId
-      rootNode.splitFeatureValue = splitFeatureValue
-
- local leftChild = dt.CartNode()
- leftChild.score = leftNodeScore
- leftChild.nodeId = 1
-
- local rightChild = dt.CartNode()
- rightChild.score = rightNodeScore
- rightChild.nodeId = 2
-
- rootNode.leftChild = leftChild
- rootNode.rightChild = rightChild
-
- return rootNode
- end
-
- local function testScoreCartTreeBranchLeftIfMissing()
- local rootNode = getBinaryCartTreeRootNode()
-
- local cartTree = dt.CartTree(rootNode)
-
- local continuousFeatures = torch.SparseTensor()
-
- local score, nodeId = cartTree:score(continuousFeatures)
-
- mytester:assert(math.abs(rootNode.leftChild.score - score) < epsilon)
- mytester:assert(rootNode.leftChild.nodeId == nodeId)
- end
-
- local function testBranchRightWithFeature()
- local rootNode = getBinaryCartTreeRootNode()
-
- local cartTree = dt.CartTree(rootNode)
-
- local continuousFeatures = torch.zeros(100)
- continuousFeatures[splitFeatureId] = splitFeatureValue
-
- local score, nodeId = cartTree:score(continuousFeatures)
-
- mytester:assert(math.abs(rootNode.rightChild.score - score) < epsilon)
- mytester:assert(rootNode.rightChild.nodeId == nodeId)
- end
-
- local function testMissingRightNode()
- local rootNode = getBinaryCartTreeRootNode()
-
- rootNode.rightChild = nil
-
- local cartTree = dt.CartTree(rootNode)
-
- local continuousFeatures = torch.Tensor()
-
- local score, nodeId = cartTree:score(continuousFeatures)
-
- mytester:assert(math.abs(rootNode.leftChild.score - score) < epsilon)
- mytester:assert(rootNode.leftChild.nodeId == nodeId)
- end
-
- local function testMissingLeftNode()
- local rootNode = getBinaryCartTreeRootNode()
-
- rootNode.leftChild = nil
-
- local cartTree = dt.CartTree(rootNode)
-
- local continuousFeatures = torch.Tensor()
-
- local score, nodeId = cartTree:score(continuousFeatures)
-
- mytester:assert(math.abs(rootNode.rightChild.score - score) < epsilon)
- mytester:assert(rootNode.rightChild.nodeId == nodeId)
- end
-
- local function testMissingAllChildren()
- local rootNode = getBinaryCartTreeRootNode()
-
- rootNode.leftChild = nil
- rootNode.rightChild = nil
-
- local cartTree = dt.CartTree(rootNode)
-
- local continuousFeatures = torch.Tensor()
-
- local score, nodeId = cartTree:score(continuousFeatures)
-
- mytester:assert(math.abs(rootNode.score - score) < epsilon)
- mytester:assert(rootNode.nodeId == nodeId)
- end
-
- local function testScoreCartTreeBranchRandomlyRight()
-      local rootNode = getBinaryCartTreeRootNode()
-
-      -- Force Branch Right
-      local cartTree = dt.CartTree(rootNode, function() return false end)
-
- local continuousFeatures = torch.SparseTensor()
-
- local score, nodeId = cartTree:score(continuousFeatures)
-
- mytester:assert(math.abs(rootNode.rightChild.score - score) < epsilon)
- mytester:assert(rootNode.rightChild.nodeId == nodeId)
- end
-
- local function testScoreCartTreeBranchRandomlyLeft()
-      local rootNode = getBinaryCartTreeRootNode()
-
-      -- Force Branch Left
-      local cartTree = dt.CartTree(rootNode, function() return true end)
-
- local continuousFeatures = torch.SparseTensor()
-
- local score, nodeId = cartTree:score(continuousFeatures)
-
- mytester:assert(math.abs(rootNode.leftChild.score - score) < epsilon)
- mytester:assert(rootNode.leftChild.nodeId == nodeId)
- end
-
- testScoreCartTreeBranchLeftIfMissing()
- testBranchRightWithFeature()
- testMissingRightNode()
- testMissingLeftNode()
- testMissingAllChildren()
- testScoreCartTreeBranchRandomlyRight()
- testScoreCartTreeBranchRandomlyLeft()
-
-end
-
-function dttest.TreeState_branch()
- local _ = require 'moses'
- local binFeatureId = 1
- local featureId = 2
-
- local input = {
- torch.SparseTensor(torch.LongTensor{binFeatureId},torch.Tensor{1}),
- torch.SparseTensor(torch.LongTensor{binFeatureId,featureId},torch.Tensor{1,1}),
- torch.SparseTensor(torch.LongTensor{binFeatureId,featureId},torch.Tensor{0,2}),
- torch.SparseTensor(torch.LongTensor{binFeatureId,featureId},torch.Tensor{0,3})
- }
- local target = torch.LongTensor(4):fill(1)
-
- local dataset = dt.DataSet(input, target)
-
- local treeState = dt.TreeState(torch.LongTensor():range(1,4))
- local splitInfo = {splitId = binFeatureId, splitValue = 1}
-
- local function testBranchBinaryFeature()
- splitInfo = {splitId = binFeatureId, splitValue = 1}
- local leftBranch, rightBranch = treeState:branch(splitInfo, dataset)
- mytester:assert(leftBranch ~= nil and rightBranch ~= nil)
-
- mytester:assert(2 == leftBranch:size())
- mytester:assert(leftBranch:contains(3))
- mytester:assert(leftBranch:contains(4))
-
- mytester:assert(2 == rightBranch:size())
- mytester:assert(rightBranch:contains(1))
- mytester:assert(rightBranch:contains(2))
- end
-
- local function testBranchContinuousFeature()
- local splitValue = 2
- splitInfo = {splitId = featureId, splitValue = splitValue}
-
- local leftBranch, rightBranch = treeState:branch(splitInfo, dataset)
- mytester:assert(leftBranch ~= nil and rightBranch ~= nil)
-
- mytester:assert(1 == leftBranch:size())
- mytester:assert(leftBranch:contains(2))
-
- mytester:assert(2 == rightBranch:size())
- mytester:assert(rightBranch:contains(3))
- mytester:assert(rightBranch:contains(4))
- end
-
- testBranchBinaryFeature()
- testBranchContinuousFeature()
-
-end
-
-function dttest.DecisionForest()
- -- Create test decision forest, each forest has only a single node, and returns score == score of root node.
-
- local function createCartTreeWithSingleNode(score)
- local cartNode = dt.CartNode()
- cartNode.score = score
- return dt.CartTree(cartNode)
- end
-
- local function getTestDecisionForest()
- local cartTrees = {
- createCartTreeWithSingleNode(1),
- createCartTreeWithSingleNode(2),
- createCartTreeWithSingleNode(3)
- }
- local weight = torch.Tensor{10,20,30}
- local bias = 0.5
-
- return dt.DecisionForest(cartTrees, weight, bias)
- end
-
- local function testScoreDecisionForest()
- local df = getTestDecisionForest()
- local continuousFeatures = torch.SparseTensor()
-
-      local expectedResult = 1.0 * 10.0 + 2.0 * 20.0 + 3.0 * 30.0 + 0.5
- local result = df:score(continuousFeatures)
-
- mytester:assert(math.abs(expectedResult - result) < epsilon)
- end
-
- testScoreDecisionForest()
-end
-
-function dttest.CartTrainer()
- local minLeafSize, maxLeafNodes = 1, 1000
- local nExample = 100
-
- -- 1. dense dataset
- local trainSet, validSet, clusterExamples, inputs, targets = dt.getDenseDummyData(nExample)
-
- -- assert that the dataset is valid
- for clusterId, exampleIds in ipairs(clusterExamples) do
- local exampleIdx = torch.LongTensor(exampleIds)
- local input = inputs:index(1,exampleIdx)
- local target = targets:index(1,exampleIdx)
- assert(input:std(1):mean() < 0.05)
- end
-
- local cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNodes)
- local treeState = dt.GiniState(trainSet:getExampleIds())
- local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
-
-   mytester:assert(nleaf == nExample) -- with minLeafSize = 1 and maxLeafNodes = inf, dense inputs yield one leaf per example
- testAccuracy(cartTree, "dense train single-thread first", trainSet, 0.99)
- testAccuracy(cartTree, "dense valid single-thread first", validSet, 0.7) -- they don't generalize very well..
-
- local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
- testAccuracy(cartTree, "dense single-thread second", trainSet)
-
- -- test feature parallelization
- local nThread = 2
- cartTrainer:featureParallel(nThread)
- local treeState = dt.GiniState(trainSet:getExampleIds())
- local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
- testAccuracy(cartTree, "dense feature-parallel", trainSet)
-
- -- 2. sparse-dense dataset
- local trainSet, validSet, clusterExamples, inputs, targets = dt.getSparseDummyData(nExample, nil, 10, nil, nil, 10)
-
- -- assert that the dataset is valid
- for clusterId, exampleIds in ipairs(clusterExamples) do
- local input = torch.Tensor(#exampleIds, 10):zero()
- for i, exampleId in ipairs(exampleIds) do
- input[i]:indexCopy(1, inputs[exampleId].keys, inputs[exampleId].values)
- end
- assert(input:std(1):mean() < 0.05)
- end
-
- local cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNodes)
- local treeState = dt.GiniState(trainSet:getExampleIds())
- local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
-
-   mytester:assert(nleaf == nExample) -- every example here has all 10 features, so this behaves like the dense case
- testAccuracy(cartTree, "sparse-dense train single-thread first", trainSet, 0.99)
-
- local shuffle = torch.LongTensor():randperm(10)
- for i, input in ipairs(inputs) do
- input.keys = input.keys:index(1, shuffle)
- input.values = input.values:index(1, shuffle)
- input._map = nil
- end
- testAccuracy(cartTree, "sparse-dense shuffled keys train single-thread first", trainSet, 0.99)
- testAccuracy(cartTree, "sparse-dense valid single-thread first", validSet, 0.8)
-
- -- 3. sparse dataset
- local trainSet, validSet = dt.getSparseDummyData(nExample, 2, 10, nil, nil, 9)
-
- local cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNodes)
- local treeState = dt.GiniState(trainSet:getExampleIds())
- local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
- cartTree.branchleft = function() return true end
-
-   mytester:assert(nleaf < nExample) -- with sparse inputs, examples missing the split feature are dropped, so there are fewer leaves than examples
- testAccuracy(cartTree, "sparse train single-thread first", trainSet, 0.9) -- the TreeBrancher drops examples with missing features, making it difficult to overfit
- testAccuracy(cartTree, "sparse valid single-thread first", validSet, 0.8)
-end
-
-function dttest.GradientBoostTrainer()
- local nExample = 100
- local trainSet, validSet = dt.getSparseDummyData(nExample, 2, 10, nil, nil, 9)
-
- local maxLeafNode, minLeafSize = nExample/2, nExample/10
- local loss = nn.LogitBoostCriterion(false)
-
- local cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNode)
-
- local opt = {
- lossFunction=loss,
- treeTrainer=cartTrainer,
- shrinkage=0.1,
- downsampleRatio=6,
- featureBaggingSize=-1,
- nTree=14,
- evalFreq=8,
- earlyStop=0 -- no early-stopping
- }
-
- -- test single-thread
- local trainer = dt.GradientBoostTrainer(opt)
- local decisionForest = trainer:train(trainSet, trainSet.featureIds, validSet)
-
- mytester:assert(#decisionForest.trees == opt.nTree)
- testAccuracy(decisionForest, "sparse train single-thread first", trainSet, 0.98)
- testAccuracy(decisionForest, "sparse valid single-thread first", validSet, 0.95)
-
- -- test stateless
- local decisionForest = trainer:train(trainSet, trainSet.featureIds, validSet)
-
- mytester:assert(#decisionForest.trees == opt.nTree)
- testAccuracy(decisionForest, "sparse train single-thread second", trainSet, 0.98)
- testAccuracy(decisionForest, "sparse valid single-thread second", validSet, 0.95)
-
- -- test feature-parallel
- local nThread = 2
- cartTrainer:featureParallel(nThread)
-
- local trainer = dt.GradientBoostTrainer(opt)
- local decisionForest = trainer:train(trainSet, trainSet.featureIds, validSet)
-
- mytester:assert(#decisionForest.trees == opt.nTree)
- testAccuracy(decisionForest, "sparse train feature-parallel first", trainSet, 0.98)
- testAccuracy(decisionForest, "sparse valid feature-parallel first", validSet, 0.95)
-
-end
-
-function dttest.RandomForestTrainer()
- local nExample = 100
- local trainSet, validSet = dt.getSparseDummyData(nExample, 2, 10, nil, nil, 9)
-
- local opt = {
- activeRatio=0.5,
- featureBaggingSize=5,
- nTree=14,
- maxLeafNodes=nExample/2,
- minLeafSize=nExample/10,
- }
-
- local trainer = dt.RandomForestTrainer(opt)
-
- local decisionForest = trainer:train(trainSet, trainSet.featureIds)
- mytester:assert(#decisionForest.trees == opt.nTree)
- testAccuracy(decisionForest, "sparse train single-thread first", trainSet, 0.98)
- testAccuracy(decisionForest, "sparse valid single-thread first", validSet, 0.95)
-
- -- test stateless
- local decisionForest = trainer:train(trainSet, trainSet.featureIds)
-
- mytester:assert(#decisionForest.trees == opt.nTree)
- testAccuracy(decisionForest, "sparse train single-thread second", trainSet, 0.98)
- testAccuracy(decisionForest, "sparse valid single-thread second", validSet, 0.95)
-
- -- test tree-parallel
- local nThread = 2
- trainer:treeParallel(nThread)
-
- local trainer = dt.RandomForestTrainer(opt)
- local decisionForest = trainer:train(trainSet, trainSet.featureIds)
-
- mytester:assert(#decisionForest.trees == opt.nTree)
- testAccuracy(decisionForest, "sparse train tree-parallel first", trainSet, 0.98)
- testAccuracy(decisionForest, "sparse valid tree-parallel first", validSet, 0.95)
-end
-
-function dttest.WorkPool()
- local nThread = 2
- local wp = dt.WorkPool(nThread)
-
- -- 1. some easy tests
- local store = {key='nick',value=7}
- wp:update('storeKeyValue', store)
-
- wp:update('require', {libname='decisiontree', varname='dt'})
-
- local bias = 2
- local obj = nn.MSECriterion()
- wp:update('require', {libname='decisiontree', varname='dt'})
- wp:writeup('execute', function(store) return bias + obj:updateOutput(torch.Tensor{1},torch.Tensor{1}) + store.nick end)
-
- local taskname, res = wp:read()
- mytester:assert(taskname == 'execute')
- mytester:assert(res == 9)
-
- -- 2. trying to reproduce a difficult error
- local trainSet, validSet = dt.getSparseDummyData()
-
- -- setup worker store (each worker will have its own copy)
- local store = {
- trainSet=trainSet,
- minLeafSize=2
- }
- wp:update('storeKeysValues', store)
-
- -- arguments/upvalues
- local treeState = dt.GiniState(trainSet:getExampleIds())
- local shardId = 1
- local nShard = nThread
- local featureIds = trainSet.featureIds
-
- local task = function(store, args)
- assert(store.trainSet)
- assert(store.minLeafSize)
-
- local bestSplit = args.treeState:findBestSplit(store.trainSet, args.featureIds, store.minLeafSize, args.shardId, args.nShard)
- return bestSplit
- end
- local args = {treeState=treeState,featureIds=featureIds,shardId=shardId,nShard=nShard}
- wp:writeup("execute", {func=task,args=args})
-
- local taskname, bestSplit = wp:read()
- mytester:assert(taskname == 'execute')
- mytester:assert(torch.type(bestSplit) == 'table')
-
- -- closure
- local task = function(store)
- assert(store.trainSet)
- assert(store.minLeafSize)
-
- local bestSplit = treeState:findBestSplit(store.trainSet, featureIds, store.minLeafSize, shardId, nShard)
- return bestSplit
- end
- wp:writeup("execute", task)
-
- local taskname, bestSplit = wp:read()
- mytester:assert(taskname == 'execute')
- mytester:assert(torch.type(bestSplit) == 'table')
-
- local task = function(store, args)
- assert(store.trainSet)
- assert(torch.isTypeOf(treeState, 'dt.TreeState'), torch.type(treeState))
-
- local bestSplit = treeState:findBestSplit(store.trainSet, featureIds, store.minLeafSize, shardId, nShard)
- return bestSplit
- end
- local args = {featureIds=featureIds,shardId=shardId,nShard=nShard}
- wp:writeup("execute", {func=task,args=args})
-
-
- local taskname, bestSplit = wp:read()
- mytester:assert(taskname == 'execute')
- mytester:assert(torch.type(bestSplit) == 'table')
-
- wp:terminate()
-end
-
-function dttest.Sparse2Dense()
- local batchsize = 4
- local minFeatureId, maxFeatureId = 10, 100
- local input = {{},{}}
- for i=1,batchsize do
- local inputsize = math.random(5,10)
- input[1][i] = torch.LongTensor(inputsize):random(minFeatureId,maxFeatureId)
- input[2][i] = torch.Tensor(inputsize):uniform(0,1)
- end
- local s2d = nn.Sparse2Dense(torch.LongTensor():range(minFeatureId,maxFeatureId))
- -- test 2d forward
- local output = s2d:forward(input)
- local output2 = torch.Tensor(batchsize, maxFeatureId-minFeatureId+1):zero()
- local featureMap = {}
- local j = 0
- for i=minFeatureId,maxFeatureId do
- j = j + 1
- featureMap[i] = j
- end
- for i=1,batchsize do
- local keys, values = input[1][i], input[2][i]
- for j=1,keys:size(1) do
- output2[{i,featureMap[keys[j] ]}] = values[j]
- end
- end
- mytester:assertTensorEq(output, output2, 0.000001)
- -- test 1d forward
- local input = {input[1][batchsize], input[2][batchsize]}
- local output = s2d:forward(input)
- mytester:assertTensorEq(output, output2[batchsize], 0.000001)
-end
-
-function dttest.Sparse2DenseDouble()
- local batchsize = 4
- local minFeatureId, maxFeatureId = 10, 100
- local input = {{},{}}
- for i=1,batchsize do
- local inputsize = math.random(5,10)
- input[1][i] = torch.LongTensor(inputsize):random(minFeatureId,maxFeatureId)
- input[2][i] = torch.Tensor(inputsize):uniform(0,1):double()
- end
- local s2d = nn.Sparse2Dense(torch.LongTensor():range(minFeatureId,maxFeatureId))
- s2d:double()
- -- test 2d forward
- local output = s2d:forward(input)
- local output2 = torch.Tensor(batchsize, maxFeatureId-minFeatureId+1):zero():double()
- local featureMap = {}
- local j = 0
- for i=minFeatureId,maxFeatureId do
- j = j + 1
- featureMap[i] = j
- end
- for i=1,batchsize do
- local keys, values = input[1][i], input[2][i]
- for j=1,keys:size(1) do
- output2[{i,featureMap[keys[j] ]}] = values[j]
- end
- end
- mytester:assertTensorEq(output, output2, 0.000001)
- -- test 1d forward
- local input = {input[1][batchsize], input[2][batchsize]}
- local output = s2d:forward(input)
- mytester:assertTensorEq(output, output2[batchsize], 0.000001)
-end
-
-function dttest.LogitBoostCriterion()
- local input = torch.randn(10)
- local target = torch.LongTensor(10):random(0,1):type(torch.type(input))
-
- local lb = nn.LogitBoostCriterion(false)
- local loss = lb:updateOutput(input, target)
-
- local loss2 = 0
- for i=1,10 do
- loss2 = loss2 + math.log(1 + math.exp(target[i] <= 0 and input[i] or -input[i]))
- end
- mytester:assert(math.abs(loss - loss2) < 0.00001)
-
- local gradInput = lb:updateGradInput(input, target)
- local gradInput2 = gradInput:clone():zero()
- for i=1,10 do
- local p = dt.logistic(input[i])
- gradInput2[i] = (target[i] <= 0) and p or (p - 1)
- end
- mytester:assertTensorEq(gradInput, gradInput2, 0.000001)
-
- local hessInput = lb:updateHessInput(input, target)
- local hessInput2 = hessInput:clone():zero()
- for i=1,10 do
- local p = dt.logistic(input[i])
- hessInput2[i] = p * (1.0 - p)
- end
- mytester:assertTensorEq(hessInput, hessInput2, 0.000001)
-end
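-
--- The expected values above follow from the logistic loss: with
--- p = dt.logistic(x), the per-example loss is log(1 + exp(-x)) for a
--- positive target and log(1 + exp(x)) otherwise, so dL/dx = p - 1 or p
--- respectively, and d2L/dx2 = p*(1 - p) in both cases, matching
--- gradInput2 and hessInput2.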
-
-function dttest.DFD()
- local nExample = 100
- local batchsize = 4
- local inputsize = 10
-
- -- train Random Forest
- local trainSet, validSet, clusterExamples, inputs, targets = dt.getDenseDummyData(nExample, nil, inputsize)
- local opt = {
- activeRatio=0.5,
- featureBaggingSize=5,
- nTree=4,
- maxLeafNodes=nExample/2,
- minLeafSize=nExample/10,
- }
- local trainer = dt.RandomForestTrainer(opt)
- local df = trainer:train(trainSet, trainSet.featureIds)
- mytester:assert(#df.trees == opt.nTree)
-
- local dfd = nn.DFD(df)
- dfd = nn.DFD(dfd:getReconstructionInfo())
- local dfd2 = nn.DFD(dfd:getReconstructionInfo(), true)
- local input = validSet.input:sub(1,batchsize)
- local output = dfd:forward(input)
- local output2 = dfd2:forward(input)
-
- local _ = require 'moses'
-
- local function hasKey(keys,key)
- local found = false
- keys:apply(function(x)
- if x == key then
- found = true
- end
- end)
- return found
- end
-
- for i=1,batchsize do
- local nodes = {}
- local keys = output[1][i]
- local keys2 = output2[1][i]
- for j,tree in ipairs(df.trees) do
- local stack = {}
- tree:score(input[i], stack)
- mytester:assert(hasKey(keys2, stack[#stack]._nodeId))
-
- for k,node in ipairs(stack) do
- if k > 1 then
- assert(node._nodeId)
- mytester:assert(hasKey(keys, node._nodeId), string.format("missing key=%d in %s", node._nodeId, tostring(keys)))
- table.insert(nodes, node._nodeId)
- end
- end
- end
- mytester:assert(#nodes == keys:size(1))
- mytester:assert(#df.trees == keys2:size(1))
- end
-end
-
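--- Same as the DFD test above, but with both the module and the input cast to double precision.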
-function dttest.DFDDouble()
- local nExample = 100
- local batchsize = 4
- local inputsize = 10
-
- -- train Random Forest
- local trainSet, validSet, clusterExamples, inputs, targets = dt.getDenseDummyData(nExample, nil, inputsize)
- local opt = {
- activeRatio=0.5,
- featureBaggingSize=5,
- nTree=4,
- maxLeafNodes=nExample/2,
- minLeafSize=nExample/10,
- }
- local trainer = dt.RandomForestTrainer(opt)
- local df = trainer:train(trainSet, trainSet.featureIds)
- mytester:assert(#df.trees == opt.nTree)
-
- local dfd = nn.DFD(df)
- dfd:double()
- dfd = nn.DFD(dfd:getReconstructionInfo())
- local dfd2 = nn.DFD(dfd:getReconstructionInfo(), true)
- local input = validSet.input:sub(1,batchsize):double()
- local output = dfd:forward(input)
- local output2 = dfd2:forward(input)
-
- local _ = require 'moses'
-
- local function hasKey(keys,key)
- local found = false
- keys:apply(function(x)
- if x == key then
- found = true
- end
- end)
- return found
- end
-
- for i=1,batchsize do
- local nodes = {}
- local keys = output[1][i]
- local keys2 = output2[1][i]
- for j,tree in ipairs(df.trees) do
- local stack = {}
- tree:score(input[i], stack)
- mytester:assert(hasKey(keys2, stack[#stack]._nodeId))
-
- for k,node in ipairs(stack) do
- if k > 1 then
- assert(node._nodeId)
- mytester:assert(hasKey(keys, node._nodeId), string.format("missing key=%d in %s", node._nodeId, tostring(keys)))
- table.insert(nodes, node._nodeId)
- end
- end
- end
- mytester:assert(#nodes == keys:size(1))
- mytester:assert(#df.trees == keys2:size(1))
- end
-end
-
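--- dt.uniquecounts counts the number of occurrences of each target class in a dataset.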
-function dttest.uniquecounts() -- DEPRECATED
- local target = torch.LongTensor(100):random(1,3)
- local input = torch.Tensor()
- local inputset = {input=input, target=target}
-
- local counts = dt.uniquecounts(nil, inputset, 3)
-
- mytester:assert(counts:sum() == 100)
- mytester:assert(counts:nElement() == 3)
-
- local res = torch.Tensor(3):zero()
- target:apply(function(t) res[t] = res[t] + 1 end)
-
- mytester:assertTensorEq(counts, res)
-end
-
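--- dt.entropy measures label impurity: the full mixed set should score highest,
--- a purer subset lower, and a single-class subset (near) zero.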
-function dttest.entropy() -- DEPRECATED
- -- 2 clusters with a bit overlap between classes:
- local input = torch.Tensor(100,2)
- input:narrow(1,1,50):normal(-1,.01)
- input:narrow(1,51,50):normal(2,.01)
-
- local target = torch.LongTensor(100):fill(3)
- target:narrow(1,1,45):fill(1)
- target:narrow(1,56,45):fill(2)
-
- local inputset = {input=input, target=target}
-
- -- test entropy()
- local fullent = dt.entropy(inputset)
-
- local halfset = {input=input:narrow(1,1,50), target=target:narrow(1,1,50)}
- local halfent = dt.entropy(halfset)
-
- local perfectset = {input=input:narrow(1,56,45), target=target:narrow(1,56,45)}
- local perfectent = dt.entropy(perfectset)
-
- mytester:assert(fullent > halfent)
- mytester:assert(halfent > perfectent)
- mytester:assert(perfectent < 0.0000001 and perfectent >= 0)
-end
-
-function dt.test(tests)
- math.randomseed(os.time())
- mytester = torch.Tester()
- mytester:add(dttest)
- mytester:run(tests)
-end
diff --git a/contrib/lua-torch/decisiontree/utils.h b/contrib/lua-torch/decisiontree/utils.h
deleted file mode 100644
index 8a0196a58..000000000
--- a/contrib/lua-torch/decisiontree/utils.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#include "error.h"
-
-#define check_tensors(L, a, b) \
- do { \
- if ((a)->nDimension != (b)->nDimension) \
- return LUA_HANDLE_ERROR_STR((L), "different tensor dimensions"); \
- for (int __local__var = 0; __local__var < (a)->nDimension; __local__var++) \
- if ((a)->size[__local__var] != (b)->size[__local__var]) \
- return LUA_HANDLE_ERROR_STR((L), "different tensor sizes"); \
- } while (0)
-
-#define check_tensor(L, t, type) \
- do { \
- if (!type##_isContiguous(t)) \
- return LUA_HANDLE_ERROR_STR((L), "tensor should be contiguous"); \
- } while (0)
-
-#define get_tensor_size(t, type) \
- (TH##type##Tensor_nElement(t))
-
-#define get_tensor(L, idx, type) \
- (TH##type##Tensor *)luaT_checkudata(L, idx, "torch." #type "Tensor")
-
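-/* Pushes every element of the array part of the table at stack index `arg`
- * onto the Lua stack and returns how many elements were pushed. */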
-static int push_table_contents(lua_State *L, int arg)
-{
- int size = 0;
- while(1) {
- lua_checkstack(L, 1);
- lua_rawgeti(L, arg, size + 1);
- if (lua_isnil(L, -1)) {
- lua_pop(L, 1);
- break;
- }
- size++;
- }
- return size;
-}
-
-#define verify_push_table_contents(L, idx, count) do { \
- int __tmp_count = push_table_contents(L, idx); \
- if (__tmp_count != count) { \
- lua_pop(L, __tmp_count); \
- LUA_HANDLE_ERROR_STR(L, "Table sizes do not match"); \
- } \
- } while(0)
diff --git a/contrib/lua-torch/decisiontree/utils.lua b/contrib/lua-torch/decisiontree/utils.lua
deleted file mode 100644
index c32c3d08b..000000000
--- a/contrib/lua-torch/decisiontree/utils.lua
+++ /dev/null
@@ -1,125 +0,0 @@
-local dt = require "decisiontree._env"
-
--- returns a buffer table local to a thread (not serialized)
-function dt.getBufferTable(name)
- local dt = require 'decisiontree'
- assert(torch.type(name) == 'string')
- dt.buffer = dt.buffer or {}
- dt.buffer[name] = dt.buffer[name] or {}
- return dt.buffer[name]
-end
-
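--- Generates a synthetic binary-classification dataset of torch.SparseTensor inputs:
--- examples are sampled around nCluster cluster centers (label = cluster index mod 2),
--- and `overlap` is the probability of flipping an example's label.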
-function dt.getSparseDummyData(nExample, nCluster, nFeature, overlap, nValid, nActive)
- local dt = require 'decisiontree'
- if torch.type(nExample) == 'table' then
- local opt = nExample
- nExample = opt.nExample
- nCluster = opt.nCluster
- nFeature = opt.nFeature
- overlap = opt.overlap
- nValid = opt.nValid
- nActive = opt.nActive
- end
- nExample = nExample or 100 -- training set size
- nCluster = nCluster or 10
- assert(nCluster >= 2)
- nFeature = math.max(2, nFeature or 10)
- overlap = overlap or 0
- nValid = nValid or nExample/10 -- validation set size
- nActive = nActive or math.max(2, nFeature / 2)
-
- -- sample nCluster centers
- local clusterCenter = torch.rand(nCluster, nFeature)
- local clusterLabel = torch.LongTensor(nCluster)
- local clusterExamples = {}
- for i=1,nCluster do
- clusterCenter[i]:add(i)
- clusterLabel[i] = i % 2
- clusterExamples[i] = {}
- end
-
- local sparseCenter = torch.Tensor()
-
- local shuffle = torch.LongTensor()
-
-   -- build dataset in sparse format
- local inputs = {}
- local targets = torch.Tensor(nExample+nValid)
- for i=1,nExample+nValid do
- local clusterIdx = torch.random(1,nCluster)
- table.insert(clusterExamples[clusterIdx], i)
-
- shuffle:randperm(nFeature)
- local keys = torch.LongTensor(nActive):copy(shuffle:narrow(1,1,nActive))
- sparseCenter:index(clusterCenter[clusterIdx], 1, keys)
- local stdiv = i <= nExample and 100 or 1000
- local values = torch.randn(nActive):div(stdiv):add(sparseCenter)
-
- table.insert(inputs, torch.SparseTensor(keys, values))
-
- local label = clusterLabel[clusterIdx]
- if math.random() < overlap then
- targets[i] = label == 1 and 0 or 1
- else
- targets[i] = label
- end
- end
-
- local _ = require 'moses'
- local validSet = dt.DataSet(_.slice(inputs, nExample+1, nExample+nValid), targets:narrow(1,nExample+1,nValid))
- local trainSet = dt.DataSet(_.slice(inputs, 1, nExample), targets:narrow(1,1,nExample))
-
- return trainSet, validSet, clusterExamples, inputs, targets
-end
-
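--- Dense counterpart of getSparseDummyData: same cluster construction, but inputs are
--- rows of a dense torch.Tensor instead of torch.SparseTensor key/value pairs.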
-function dt.getDenseDummyData(nExample, nCluster, nFeature, overlap, nValid)
- local dt = require 'decisiontree'
- if torch.type(nExample) == 'table' then
- local opt = nExample
- nExample = opt.nExample
- nCluster = opt.nCluster
- nFeature = opt.nFeature
- overlap = opt.overlap
- nValid = opt.nValid
- end
- nExample = nExample or 100 -- training set size
- nCluster = nCluster or 10
- assert(nCluster >= 2)
- nFeature = math.max(2, nFeature or 10)
- overlap = overlap or 0
- nValid = nValid or nExample/10 -- validation set size
-
- -- sample nCluster centers
- local clusterCenter = torch.rand(nCluster, nFeature)
- local clusterLabel = torch.LongTensor(nCluster)
- local clusterExamples = {}
- for i=1,nCluster do
- clusterCenter[i]:add(i)
- clusterLabel[i] = i % 2
- clusterExamples[i] = {}
- end
-
-   -- build dataset in dense format
- local inputs = torch.Tensor(nExample+nValid, nFeature)
- local targets = torch.Tensor(nExample+nValid)
- for i=1,nExample+nValid do
- local clusterIdx = torch.random(1,nCluster)
- table.insert(clusterExamples[clusterIdx], i)
-
- local stdiv = i <= nExample and 100 or 1000
- inputs[i]:normal():div(stdiv):add(clusterCenter[clusterIdx])
-
- local label = clusterLabel[clusterIdx]
- if math.random() < overlap then
- targets[i] = label == 1 and 0 or 1
- else
- targets[i] = label
- end
- end
-
- local _ = require 'moses'
- local validSet = dt.DataSet(inputs:narrow(1,nExample+1,nValid), targets:narrow(1,nExample+1,nValid))
- local trainSet = dt.DataSet(inputs:narrow(1,1,nExample), targets:narrow(1,1,nExample))
-
- return trainSet, validSet, clusterExamples, inputs, targets
-end