[Feature] Add torch-decisiontree package
author    Vsevolod Stakhov <vsevolod@highsecure.ru>  Sun, 5 Nov 2017 14:34:22 +0000 (14:34 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>  Sun, 5 Nov 2017 14:34:22 +0000 (14:34 +0000)
https://github.com/twitter/torch-decisiontree

This project implements random forests and gradient boosted decision trees
(GBDT). Both are ensemble learners that combine many decision trees into a
forest: a random forest trains its trees independently on bagged samples of
the data, while GBDT grows trees sequentially, fitting each new tree to the
gradient of the loss of the ensemble built so far.
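
For reference, here is a minimal (untested) usage sketch against the API added
in this commit. Class names and constructor arguments follow the sources below;
the use of nn.MSECriterion assumes the package's MSECriterion.lua extends it
with the backward2 method that the trainer calls:

    local dt = require 'decisiontree'
    local nn = require 'nn'

    -- toy dense data: 100 train / 20 validation examples, 10 features each
    local trainSet = dt.DataSet(torch.randn(100, 10), torch.randn(100))
    local validSet = dt.DataSet(torch.randn(20, 10), torch.randn(20))

    local trainer = dt.GradientBoostTrainer{
       treeTrainer = dt.CartTrainer(trainSet, 5, 8), -- minLeafSize=5, maxLeafNodes=8
       lossFunction = nn.MSECriterion(),
       shrinkage = 0.1,          -- weight given to each new tree
       downsampleRatio = 0.8,    -- row bagging per tree
       featureBaggingSize = -1,  -- non-positive: defaults to sqrt(#features)
       nTree = 14,
       evalFreq = 7,             -- validate every 7 trees
       earlyStop = 2
    }

    local featureIds = torch.LongTensor():range(1, 10)
    local forest = trainer:train(trainSet, featureIds, validSet, true)
    print(forest:score(torch.randn(10)))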

46 files changed:
CMakeLists.txt
contrib/torch/decisiontree/CMakeLists.txt [new file with mode: 0644]
contrib/torch/decisiontree/CartNode.lua [new file with mode: 0644]
contrib/torch/decisiontree/CartTrainer.lua [new file with mode: 0644]
contrib/torch/decisiontree/CartTree.lua [new file with mode: 0644]
contrib/torch/decisiontree/DFD.lua [new file with mode: 0644]
contrib/torch/decisiontree/DataSet.lua [new file with mode: 0644]
contrib/torch/decisiontree/DecisionForest.lua [new file with mode: 0644]
contrib/torch/decisiontree/DecisionForestTrainer.lua [new file with mode: 0644]
contrib/torch/decisiontree/DecisionTree.lua [new file with mode: 0644]
contrib/torch/decisiontree/GBDT_common.h [new file with mode: 0644]
contrib/torch/decisiontree/GiniState.lua [new file with mode: 0644]
contrib/torch/decisiontree/GradientBoostState.lua [new file with mode: 0644]
contrib/torch/decisiontree/GradientBoostTrainer.lua [new file with mode: 0644]
contrib/torch/decisiontree/LICENSE [new file with mode: 0644]
contrib/torch/decisiontree/LogitBoostCriterion.lua [new file with mode: 0644]
contrib/torch/decisiontree/MSECriterion.lua [new file with mode: 0644]
contrib/torch/decisiontree/README.md [new file with mode: 0644]
contrib/torch/decisiontree/RandomForestTrainer.lua [new file with mode: 0644]
contrib/torch/decisiontree/Sparse2Dense.lua [new file with mode: 0644]
contrib/torch/decisiontree/SparseTensor.lua [new file with mode: 0644]
contrib/torch/decisiontree/TreeState.lua [new file with mode: 0644]
contrib/torch/decisiontree/WorkPool.lua [new file with mode: 0644]
contrib/torch/decisiontree/_env.lua [new file with mode: 0644]
contrib/torch/decisiontree/benchmark.lua [new file with mode: 0644]
contrib/torch/decisiontree/doc/benchmark.md [new file with mode: 0644]
contrib/torch/decisiontree/error.h [new file with mode: 0644]
contrib/torch/decisiontree/generic/CartTree.c [new file with mode: 0644]
contrib/torch/decisiontree/generic/DFD.c [new file with mode: 0644]
contrib/torch/decisiontree/generic/GBDT.c [new file with mode: 0644]
contrib/torch/decisiontree/generic/GBDT_internal.c [new file with mode: 0644]
contrib/torch/decisiontree/generic/GBDT_internal.h [new file with mode: 0644]
contrib/torch/decisiontree/generic/LogitBoostCriterion.c [new file with mode: 0644]
contrib/torch/decisiontree/generic/S2D.c [new file with mode: 0644]
contrib/torch/decisiontree/hash_map.c [new file with mode: 0644]
contrib/torch/decisiontree/hash_map.h [new file with mode: 0644]
contrib/torch/decisiontree/init.c [new file with mode: 0644]
contrib/torch/decisiontree/init.lua [new file with mode: 0644]
contrib/torch/decisiontree/internal_hash_map.h [new file with mode: 0644]
contrib/torch/decisiontree/khash.h [new file with mode: 0644]
contrib/torch/decisiontree/math.lua [new file with mode: 0644]
contrib/torch/decisiontree/rocks/decisiontree-scm-1.rockspec [new file with mode: 0644]
contrib/torch/decisiontree/test.lua [new file with mode: 0644]
contrib/torch/decisiontree/utils.h [new file with mode: 0644]
contrib/torch/decisiontree/utils.lua [new file with mode: 0644]
contrib/torch/torch7/cmake/TorchPackage.cmake

index 8256f5f3977d74762d6b9d1dacbb324868b862ba..0376739b867d45efa5ecfc603407798f37a2fcc1 100644 (file)
@@ -1284,6 +1284,7 @@ IF(ENABLE_TORCH MATCHES "ON")
                ADD_SUBDIRECTORY(contrib/torch/paths)
                ADD_SUBDIRECTORY(contrib/torch/torch7)
                ADD_SUBDIRECTORY(contrib/torch/nn)
+               ADD_SUBDIRECTORY(contrib/torch/decisiontree)
                SET(WITH_TORCH 1)
        ELSE()
                MESSAGE(FATAL_ERROR "Cannot enable torch without luajit")
diff --git a/contrib/torch/decisiontree/CMakeLists.txt b/contrib/torch/decisiontree/CMakeLists.txt
new file mode 100644 (file)
index 0000000..6434edd
--- /dev/null
@@ -0,0 +1,54 @@
+SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR})
+
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
+CMAKE_POLICY(VERSION 2.6)
+
+SET(src
+   init.c
+   hash_map.c
+)
+SET(luasrc
+   _env.lua
+   benchmark.lua
+   CartNode.lua
+   CartTrainer.lua
+   CartTree.lua
+   DataSet.lua
+   DecisionForest.lua
+   DecisionForestTrainer.lua
+   DecisionTree.lua
+   DFD.lua
+   GiniState.lua
+   GradientBoostState.lua
+   GradientBoostTrainer.lua
+   init.lua
+   LogitBoostCriterion.lua
+   math.lua
+   MSECriterion.lua
+   RandomForestTrainer.lua
+   Sparse2Dense.lua
+   SparseTensor.lua
+   test.lua
+   TreeState.lua
+   utils.lua
+   WorkPool.lua
+)
+
+IF (WITH_OPENMP)
+   FIND_PACKAGE(OpenMP)
+   IF(OPENMP_FOUND)
+      MESSAGE(STATUS "Compiling with OpenMP support")
+      SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+      SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+      SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+   ENDIF(OPENMP_FOUND)
+ENDIF (WITH_OPENMP)
+
+ADD_TORCH_PACKAGE(decisiontree "${src}" "${luasrc}" "A decision tree library, for Torch")
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+### Torch packaging assumes the library prefix is "lib"
+SET_TARGET_PROPERTIES(decisiontree PROPERTIES
+        PREFIX "lib"
+        IMPORT_PREFIX "lib")
+TARGET_LINK_LIBRARIES(decisiontree ${TH_LIBRARIES})
+INSTALL(TARGETS decisiontree DESTINATION ${RSPAMD_LIBDIR})
diff --git a/contrib/torch/decisiontree/CartNode.lua b/contrib/torch/decisiontree/CartNode.lua
new file mode 100644 (file)
index 0000000..e6a8500
--- /dev/null
@@ -0,0 +1,42 @@
+local dt = require 'decisiontree._env'
+local CartNode = torch.class("dt.CartNode", dt)
+
+function CartNode:__init(nodeId, leftChild, rightChild, splitFeatureId, splitFeatureValue, score, splitGain)
+   self.nodeId = nodeId or 0
+   self.leftChild = leftChild
+   self.rightChild = rightChild
+   self.splitFeatureId = splitFeatureId or -1
+   self.splitFeatureValue = splitFeatureValue or 0
+   self.score = score or 0
+   self.splitGain = splitGain
+end
+
+function CartNode:__tostring__()
+   return self:recursivetostring()
+end
+
+function CartNode:recursivetostring(indent)
+   indent = indent or ' '
+
+   -- Is this a leaf node?
+   local res = ''
+   if not (self.leftChild or self.rightChild) then
+      res = res .. self.score .. '\n'
+   else
+      -- Print the criteria
+      res = res .. 'input[' .. self.splitFeatureId .. '] < ' .. self.splitFeatureValue .. '?\n'
+
+      -- Print the branches
+      if self.leftChild then
+         res = res .. indent .. 'True->' .. self.leftChild:recursivetostring(indent .. '  ')
+      end
+      if self.rightChild then
+         res = res .. indent .. 'False->' .. self.rightChild:recursivetostring(indent .. '  ')
+      end
+   end
+   return res
+end
+
+function CartNode:clone()
+   return CartNode(self.nodeId, self.leftChild, self.rightChild, self.splitFeatureId, self.splitFeatureValue, self.score, self.splitGain)
+end
diff --git a/contrib/torch/decisiontree/CartTrainer.lua b/contrib/torch/decisiontree/CartTrainer.lua
new file mode 100644 (file)
index 0000000..63ae6c1
--- /dev/null
@@ -0,0 +1,180 @@
+local dt = require "decisiontree._env"
+local _ = require "moses"
+
+local CartTrainer = torch.class("dt.CartTrainer", dt)
+
+-- Generic CART trainer
+function CartTrainer:__init(dataset, minLeafSize, maxLeafNodes)
+   assert(torch.isTypeOf(dataset, 'dt.DataSet'))
+   self.dataset = dataset
+   self.minLeafSize = assert(minLeafSize) -- min examples per leaf
+   self.maxLeafNodes = assert(maxLeafNodes) -- max leaf nodes in tree
+
+   -- by default, single thread
+   self.parallelMode = 'singlethread'
+end
+
+function CartTrainer:train(rootTreeState, activeFeatures)
+   assert(torch.isTypeOf(rootTreeState, 'dt.TreeState'))
+   assert(torch.isTensor(activeFeatures))
+   local root = dt.CartNode()
+   root.nodeId = 0
+   root.score = rootTreeState:score(self.dataset)
+
+   local nleaf = 1
+
+   -- TODO : nodeparallel: parallelize here. The queue is a workqueue.
+   local queue = {}
+   table.insert(queue, 1, {cartNode=root, treeState=rootTreeState})
+
+   while #queue > 0 and nleaf < self.maxLeafNodes do
+      local treeGrowerArgs = table.remove(queue, #queue)
+      local currentTreeState = treeGrowerArgs.treeState
+
+      -- Note: if minLeafSize = 1 and maxLeafNode = inf, then each example will be its own leaf...
+      if self:hasEnoughTrainingExamplesToSplit(currentTreeState.exampleIds:size(1)) then
+         nleaf = self:processNode(nleaf, queue, treeGrowerArgs.cartNode, currentTreeState, activeFeatures)
+      end
+   end
+
+   -- CartTree with random branching (when feature is missing)
+   local branchleft = function() return math.random() < 0.5 end
+   return dt.CartTree(root, branchleft), nleaf
+end
+
+function CartTrainer:processNode(nleaf, queue, node, treeState, activeFeatures)
+   local bestSplit
+   if self.parallelMode == 'singlethread' then
+      bestSplit = self:findBestSplitForAllFeatures(treeState, activeFeatures)
+   elseif self.parallelMode == 'featureparallel' then
+      bestSplit = self:findBestSplitForAllFeaturesFP(treeState, activeFeatures)
+   else
+      error("Unrecognized parallel mode: " .. self.parallelMode)
+   end
+
+   if bestSplit then
+      local leftTreeState, rightTreeState = treeState:branch(bestSplit, self.dataset)
+      assert(bestSplit.leftChildSize + bestSplit.rightChildSize == leftTreeState.exampleIds:size(1) + rightTreeState.exampleIds:size(1), "The left and right subtrees don't match the split found!")
+      self:setValuesAndCreateChildrenForNode(node, bestSplit, leftTreeState, rightTreeState, nleaf)
+
+      table.insert(queue, 1, {cartNode=node.leftChild, treeState=leftTreeState})
+      table.insert(queue, 1, {cartNode=node.rightChild, treeState=rightTreeState})
+
+      return nleaf + 1
+   end
+
+   return nleaf
+end
+
+function CartTrainer:findBestSplitForAllFeatures(treeState, activeFeatures)
+   local timer = torch.Timer()
+   local bestSplit = treeState:findBestSplit(self.dataset, activeFeatures, self.minLeafSize, -1, -1)
+
+   if bestSplit then
+      assert(torch.type(bestSplit) == 'table')
+   end
+
+   if dt.PROFILE then
+      print("findBestSplitForAllFeatures time="..timer:time().real)
+   end
+   return bestSplit
+end
+
+-- Updates the parentNode with the bestSplit information by creating left/right child nodes.
+function CartTrainer:setValuesAndCreateChildrenForNode(parentNode, bestSplit, leftState, rightState, nleaf)
+   assert(torch.isTypeOf(parentNode, 'dt.CartNode'))
+   assert(torch.type(bestSplit) == 'table')
+   assert(torch.isTypeOf(leftState, 'dt.TreeState'))
+   assert(torch.isTypeOf(rightState, 'dt.TreeState'))
+   assert(torch.type(nleaf) == 'number')
+
+   local leftChild = dt.CartNode()
+   leftChild.score = leftState:score(self.dataset)
+   leftChild.nodeId = 2 * nleaf - 1
+
+   local rightChild = dt.CartNode()
+   rightChild.score = rightState:score(self.dataset)
+   rightChild.nodeId = 2 * nleaf
+
+   parentNode.splitFeatureId = bestSplit.splitId
+   parentNode.splitFeatureValue = bestSplit.splitValue
+   parentNode.leftChild = leftChild
+   parentNode.rightChild = rightChild
+   parentNode.splitGain = bestSplit.splitGain
+end
+
+-- We minimally need 2 * N examples in the parent to satisfy >= N examples per child
+function CartTrainer:hasEnoughTrainingExamplesToSplit(count)
+   return count >= 2 * self.minLeafSize
+end
+
+-- call before training to enable feature-parallelization
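+-- e.g. (hypothetical sketch):
+--   local trainer = dt.CartTrainer(dataset, minLeafSize, maxLeafNodes)
+--   trainer:featureParallel(4) -- a number is wrapped in a dt.WorkPool with that many threads
+--   local tree, nleaf = trainer:train(rootTreeState, activeFeatures)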
+function CartTrainer:featureParallel(workPool)
+   assert(self.parallelMode == 'singlethread', self.parallelMode)
+   self.parallelMode = 'featureparallel'
+   self.workPool = torch.type(workPool) == 'number' and dt.WorkPool(workPool) or workPool
+   assert(torch.isTypeOf(self.workPool, 'dt.WorkPool'))
+
+   -- this deletes all SparseTensor hash maps so that they aren't serialized
+   self.dataset:deleteIndex()
+
+   -- require the dt package
+   self.workPool:update('require', {libname='decisiontree',varname='dt'})
+   -- setup worker store (each worker will have its own copy)
+   local store = {
+      dataset=self.dataset,
+      minLeafSize=self.minLeafSize
+   }
+   self.workPool:update('storeKeysValues', store)
+end
+
+-- feature parallel
+function CartTrainer:findBestSplitForAllFeaturesFP(treeState, activeFeatures)
+   local timer = torch.Timer()
+   local bestSplit
+   if treeState.findBestSplitFP then
+      bestSplit = treeState:findBestSplitFP(self.dataset, activeFeatures, self.minLeafSize, self.workPool.nThread)
+   end
+
+   if not bestSplit then
+      for i=1,self.workPool.nThread do
+         -- upvalues
+         local treeState = treeState
+         local shardId = i
+         local nShard = self.workPool.nThread
+         local featureIds = activeFeatures
+         -- closure
+         local task = function(store)
+            assert(store.dataset)
+            assert(store.minLeafSize)
+            if treeState.threadInitialize then
+               treeState:threadInitialize()
+            end
+
+            local bestSplit = treeState:findBestSplit(store.dataset, featureIds, store.minLeafSize, shardId, nShard)
+            return bestSplit
+         end
+
+         self.workPool:writeup('execute', task)
+      end
+
+      for i=1,self.workPool.nThread do
+         local taskname, candidateSplit = self.workPool:read()
+         assert(taskname == 'execute')
+         if candidateSplit then
+            if ((not bestSplit) or candidateSplit.splitGain < bestSplit.splitGain) then
+               bestSplit = candidateSplit
+            end
+         end
+      end
+   end
+
+   if bestSplit then
+      assert(torch.type(bestSplit) == 'table')
+   end
+
+   if dt.PROFILE then
+      print("findBestSplitForAllFeaturesFP time="..timer:time().real)
+   end
+   return bestSplit
+end
diff --git a/contrib/torch/decisiontree/CartTree.lua b/contrib/torch/decisiontree/CartTree.lua
new file mode 100644 (file)
index 0000000..c74dfda
--- /dev/null
@@ -0,0 +1,90 @@
+local _ = require "moses"
+local dt = require 'decisiontree._env'
+
+-- CART (classification-regression decision tree).
+-- The example is always branched to the left when the splitting feature is missing.
+local CartTree = torch.class("dt.CartTree", "dt.DecisionTree", dt)
+
+function CartTree:__init(root, branchleft)
+   assert(torch.isTypeOf(root, 'dt.CartNode'))
+   self.root = root
+   self.branchleft = branchleft or function() return true end
+end
+
+-- TODO optimize this
+function CartTree:score(input, stack, optimized)
+   if optimized == true and stack == nil and torch.isTensor(input) and input.isContiguous and input:isContiguous() and input:nDimension() == 2 then
+      return input.nn.CartTreeFastScore(input, self.root, input.new())
+   end
+   return self:recursivescore(self.root, input, stack)
+end
+
+-- Continuous: if input[node.splitFeatureId] < node.splitFeatureValue then leftNode else rightNode
+-- Binary: if input[node.splitFeatureId] == 0 then leftNode else rightNode
+-- when stack is provided, it is returned as the third argument containing the stack of nodes from root to leaf
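+-- e.g. a node with splitFeatureId=3 and splitFeatureValue=0.5 sends input[3]=0.2
+-- left and input[3]=0.7 right; a missing input[3] falls back to self.branchleft()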
+function CartTree:recursivescore(node, input, stack)
+   assert(torch.isTypeOf(node, 'dt.CartNode'))
+
+   if stack then
+      stack = torch.type(stack) == 'table' and stack or {}
+      table.insert(stack, node)
+   end
+
+   if not (node.leftChild or node.rightChild) then
+      return node.score, node.nodeId, stack
+   elseif not node.leftChild then
+      return self:recursivescore(node.rightChild, input, stack)
+   elseif not node.rightChild then
+      return self:recursivescore(node.leftChild, input, stack)
+   end
+
+   local splitId = node.splitFeatureId
+   local splitVal = node.splitFeatureValue
+
+   if input[splitId] then -- if has key
+      local featureVal = input[splitId]
+      local nextNode = featureVal < splitVal and node.leftChild or node.rightChild
+      return self:recursivescore(nextNode, input, stack)
+   end
+
+   -- if feature is missing, branch left
+   local nextNode = self.branchleft() and node.leftChild or node.rightChild
+   return self:recursivescore(nextNode, input, stack)
+end
+
+function CartTree:__tostring__()
+   return self.root:recursivetostring()
+end
+
+-- expects a stack returned by score
+function CartTree:stackToString(stack, input)
+   assert(torch.type(stack) == 'table')
+   assert(torch.isTypeOf(stack[1], 'dt.CartNode'))
+
+   local res = 'Stack nodes from root to leaf\n'
+   for i,node in ipairs(stack) do
+      if not (node.leftChild or node.rightChild) then
+         res = res .. "score="..node.score .. '\n'
+      else
+         local istr = ''
+         if input then
+            istr = '=' .. (input[node.splitFeatureId] or 'nil')
+         end
+         res = res .. 'input[' .. node.splitFeatureId .. ']' .. istr ..' < ' .. node.splitFeatureValue .. ' ? '
+         res = res .. '(' .. ((node.leftChild and node.rightChild) and 'LR' or node.leftChild and 'L' or node.rightChild and 'R' or 'WAT?') .. ') '
+         if node.leftChild == stack[i+1] then
+            res = res .. 'Left\n'
+         elseif node.rightChild == stack[i+1] then
+            res = res .. 'Right\n'
+         else
+            error"stackToString error"
+         end
+      end
+   end
+   return res .. #stack .. " nodes"
+end
+
+function CartTree:clone()
+   return CartTree(self.root:clone(), self.branchleft)
+end
+
diff --git a/contrib/torch/decisiontree/DFD.lua b/contrib/torch/decisiontree/DFD.lua
new file mode 100644 (file)
index 0000000..e474621
--- /dev/null
@@ -0,0 +1,182 @@
+-- nn.DFD: Decision Forest Discretizer
+-- Takes a dense input and outputs a sparse output.
+-- Each node in the forest is its own feature.
+-- When a node is traversed, its corresponding feature takes on a value of 1.
+-- For all non-traversed nodes, the feature is 0.
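+-- e.g. (sketch): given a trained dt.DecisionForest forest,
+--   local dfd = nn.DFD(forest)
+--   local output = dfd:forward(input) -- input: batchsize x inputsize tensor
+-- the sparse output pairs each example's visited node ids (keys) with values of 1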
+local DFD, parent = torch.class("nn.DFD", "nn.Module")
+
+-- TODO: add :type, as the default will convert the long tensors
+function DFD:__init(df, onlyLastNode)
+   parent.__init(self)
+   if torch.type(df) == 'table' then
+      self:reconstructFromInfo(df)
+   else
+      assert(torch.type(df) == 'dt.DecisionForest')
+
+      self.rootIds = torch.LongTensor()
+      -- nodeId of left and right child nodes
+      self.leftChild = torch.LongTensor()
+      self.rightChild = torch.LongTensor()
+      -- index and value of the feature that splits this node
+      self.splitFeatureId = torch.LongTensor()
+      self.splitFeatureValue = torch.Tensor()
+      -- initialize state given df
+      self:convertForest2Tensors(df)
+      self:clearState()
+   end
+   self.onlyLastNode = onlyLastNode
+   self.nTrees = self.rootIds:size(1)
+end
+
+-- converts a DecisionForest to efficient tensor representation
+function DFD:convertForest2Tensors(df)
+   self.rootIds:resize(#df.trees)
+
+   -- nodeId will map to featureId
+   local nodeId = 0
+   -- sets nodeIds of all subnodes
+   -- and measures the maximum depth over all trees
+   local function recursiveTree(node, depth)
+      depth = (depth or 0) + 1
+      local rdepth = depth
+      nodeId = nodeId + 1
+      node._nodeId = nodeId
+
+      if node.leftChild then
+         rdepth = math.max(rdepth, recursiveTree(node.leftChild, depth))
+      end
+      if node.rightChild then
+         rdepth = math.max(rdepth, recursiveTree(node.rightChild, depth))
+      end
+      return rdepth
+   end
+
+   -- sum over trees of max depth
+   self.depth = 0
+   for i,tree in ipairs(df.trees) do
+      assert(torch.isTypeOf(tree.root, 'dt.CartNode'))
+      self.depth = self.depth + recursiveTree(tree.root)
+   end
+   -- remove roots from depth
+   self.depth = self.depth - self.rootIds:size(1)
+
+   -- total number of nodes in all trees
+   self.nNode = nodeId
+
+   -- nodeId of left and right child nodes
+   self.leftChild:resize(self.nNode):fill(-1)
+   self.rightChild:resize(self.nNode):fill(-1)
+   -- index and value of the feature that splits this node
+   self.splitFeatureId:resize(self.nNode):fill(-1)
+   self.splitFeatureValue:resize(self.nNode):fill(-1)
+
+   -- aggregates CartNode attributes to an efficient tensor representation
+   local function recursiveTree2(node)
+      local nodeId = assert(node._nodeId)
+      assert(self.splitFeatureId[nodeId] == -1)
+
+      if node.leftChild then
+         self.leftChild[nodeId] = assert(node.leftChild._nodeId)
+         recursiveTree2(node.leftChild)
+      else
+         self.leftChild[nodeId] = 0
+      end
+
+      if node.rightChild then
+         self.rightChild[nodeId] = assert(node.rightChild._nodeId)
+         recursiveTree2(node.rightChild)
+      else
+         self.rightChild[nodeId] = 0
+      end
+
+      -- each node splits the dataset on a feature id-value pair
+      self.splitFeatureId[nodeId] = assert(node.splitFeatureId)
+      self.splitFeatureValue[nodeId] = assert(node.splitFeatureValue)
+   end
+
+   for i,tree in ipairs(df.trees) do
+      self.rootIds[i] = assert(tree.root._nodeId)
+      recursiveTree2(tree.root)
+   end
+
+   assert(self.leftChild:min() >= 0)
+   assert(self.rightChild:min() >= 0)
+end
+
+-- input is a batchsize x inputsize tensor
+function DFD:updateOutput(input)
+   assert(torch.isTensor(input))
+   assert(input:dim() == 2)
+   input = input:contiguous()
+
+   local batchsize, inputsize = input:size(1), input:size(2)
+   local size = self.onlyLastNode and self.nTrees or self.depth
+
+   -- each sample's output keys are resized to maxdepth, which is the maximum size they can take on
+   self.outputkeys = self.outputkeys or torch.LongTensor()
+   self.outputkeys:resize(batchsize, size)
+   -- values are 1
+   self.outputvalues = self.outputvalues or input.new()
+   self.outputvalues:resize(batchsize, size):fill(1)
+
+   self.output = input.nn.DFD_computeOutput(self.outputkeys, self.outputvalues, self.rootIds, self.leftChild, self.rightChild, self.splitFeatureId, self.splitFeatureValue, input, self.onlyLastNode)
+   return self.output
+end
+
+function DFD:type(type, tensorCache)
+   if type then
+      local info = self:getReconstructionInfo()
+      for k, v in pairs(info) do
+         if torch.type(v) ~= 'torch.LongTensor' then
+            info[k] = nil
+         end
+      end
+      parent.type(self, type, tensorCache)
+      self:reconstructFromInfo(info)
+      return self
+   else
+      return parent.type(self)
+   end
+end
+
+function DFD:updateGradInput()
+   error"Not Implemented"
+end
+
+function DFD:clearState()
+   self.output = {{},{}}
+   self.taskbuffer = {}
+   self.outputkeys = nil
+   self.outputvalues = nil
+   self._range = nil
+   self._indices = nil
+   self._mask = nil
+end
+
+function DFD:reconstructFromInfo(DFDinfo)
+   for k,v in pairs(DFDinfo) do
+      self[k] = v
+   end
+   assert(self.leftChild:nDimension() == 1)
+   assert(self.rightChild:nDimension() == 1)
+   assert(self.leftChild:size(1) == self.nNode)
+   assert(self.rightChild:size(1) == self.nNode)
+   assert(self.leftChild:min() >= 0)
+   assert(self.rightChild:min() >= 0)
+   assert(self.splitFeatureId:nDimension() == 1)
+   assert(self.splitFeatureValue:nDimension() == 1)
+   assert(self.splitFeatureId:size(1) == self.splitFeatureValue:size(1))
+end
+
+function DFD:getReconstructionInfo()
+   local DFDinfo = {
+      nNode = self.nNode,
+      rootIds = self.rootIds,
+      leftChild = self.leftChild,
+      rightChild = self.rightChild,
+      splitFeatureId = self.splitFeatureId,
+      splitFeatureValue = self.splitFeatureValue,
+      depth = self.depth
+   }
+   return DFDinfo
+end
diff --git a/contrib/torch/decisiontree/DataSet.lua b/contrib/torch/decisiontree/DataSet.lua
new file mode 100644 (file)
index 0000000..505ec86
--- /dev/null
@@ -0,0 +1,164 @@
+local dt = require "decisiontree._env"
+local ipc = require 'libipc'
+
+local DataSet = torch.class("dt.DataSet", dt)
+
+function DataSet:__init(input, target, nThreads)
+   if torch.type(input) == 'table' then
+      assert(torch.isTypeOf(input[1], 'torch.SparseTensor'))
+   else
+      assert(torch.isTensor(input))
+   end
+   self.input = input
+   assert(torch.isTensor(target))
+   self.target = target
+   self.nThreads = nThreads or 1
+
+   self.sortedFeatureValues, self.featureIds = self:sortFeatureValues(input)
+end
+
+-- group examples by featureId. For each featureId, sort examples by featureValue (ascending order)
+-- returns a table mapping each featureId to a torch.LongTensor of exampleIds
+-- sorted by feature value, e.g. {featureId=torch.LongTensor{3,1,2}}
+function DataSet:sortFeatureValues(inputs)
+   local isSparse = torch.typename(inputs[1]):match('torch.*SparseTensor')
+   assert(isSparse or torch.isTensor(inputs))
+
+   local featureIds = torch.LongTensor()
+   local dataset = {} -- TODO use tds.Hash (will require SparseTensor to be userdata)
+   if isSparse then
+      local proto = inputs[1].values
+      -- get list of featureIds
+      local featureMap = {}
+      for i,input in ipairs(inputs) do
+         input.keys:apply(function(key)
+            featureMap[key] = (featureMap[key] or 0) + 1
+         end)
+      end
+      local _ = require "moses"
+      featureIds = featureIds.new(_.keys(featureMap))
+      local featureCounts = torch.LongTensor(featureIds:size(1))
+      for i=1,featureIds:size(1) do
+         featureCounts[i] = featureMap[featureIds[i]]
+      end
+
+      for i=1,featureIds:size(1) do
+         local featureId = featureIds[i]
+         local featureCount = featureCounts[i]
+         dataset[featureId] = {
+            values=proto.new(featureCount),
+            examples=torch.LongTensor(featureCount),
+            i=0
+         }
+      end
+
+      for exampleId,input in ipairs(inputs) do
+         local sparseIdx = 0
+         input.keys:apply(function(key)
+            sparseIdx = sparseIdx + 1
+            local f = dataset[key]
+            f.i = f.i + 1
+            f.values[f.i] = input.values[sparseIdx]
+            f.examples[f.i] = exampleId
+         end)
+      end
+
+      local sortVal, sortIdx = proto.new(), torch.LongTensor()
+      for featureId,f in pairs(dataset) do
+         assert(f.values:size(1) == f.i)
+         sortVal:sort(sortIdx, f.values, 1, false)
+
+         local sortedExampleIds = torch.LongTensor(f.i)
+         sortedExampleIds:index(f.examples, 1, sortIdx)
+
+         dataset[featureId] = sortedExampleIds
+      end
+   else
+      assert(torch.isTensor(inputs))
+      featureIds:range(1,inputs:size(2))
+
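+      -- sort each feature column in parallel: the workqueue feeds
+      -- (featureId, column) pairs to self.nThreads workers, each of which
+      -- returns the exampleIds in ascending order of feature value; a nil
+      -- message tells a worker to exit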
+      local wq = ipc.workqueue()
+      for i=1,inputs:size(2) do
+         wq:write({i, inputs:select(2, i)})
+      end
+      for i=1,self.nThreads do
+         wq:write(nil)
+      end
+
+      ipc.map(self.nThreads, function(wq)
+         while true do
+            local data = wq:read()
+            if data == nil then break end
+            local featureId = data[1]
+            local values = data[2]
+            local sortFeatureValues, sortExampleIds = values:sort(1, false)
+            sortFeatureValues = nil
+            wq:write({featureId, sortExampleIds})
+            collectgarbage()
+         end
+      end, wq)
+
+      for _=1,inputs:size(2) do
+         local data = wq:read()
+         local featureId = data[1]
+         local sortedFeatureExampleIds = data[2]
+         dataset[featureId] = sortedFeatureExampleIds
+      end
+   end
+
+   return dataset, featureIds
+end
+
+function DataSet:getSortedFeature(featureId)
+   assert(self.sortedFeatureValues)
+   return self.sortedFeatureValues[featureId]
+end
+
+function DataSet:size()
+   return self.target:size(1)
+end
+
+function DataSet:getExampleIds()
+   if not self.exampleIds then
+      self.exampleIds = torch.LongTensor():range(1,self:size())
+   end
+   return self.exampleIds
+end
+
+function DataSet:countPositive(exampleIds)
+   assert(torch.type(exampleIds) == 'torch.LongTensor')
+   local dt = require 'decisiontree'
+   local buffer = dt.getBufferTable('DataSet')
+   buffer.tensor = buffer.tensor or self.target.new()
+   buffer.tensor:index(self.target, 1, exampleIds)
+   local nPositive = 0
+   buffer.tensor:apply(function(x)
+      if x > 0 then nPositive = nPositive + 1 end
+   end)
+   return nPositive
+end
+
+function DataSet:initScore()
+   self.score = self.score or torch.Tensor()
+   self.score:resize(self:size()):fill(0)
+end
+
+function DataSet:buildIndex()
+   if torch.type(self.input) == 'table' then
+      for exampleId,input in ipairs(self.input) do
+         if torch.isTypeOf(input, 'torch.SparseTensor') then
+            input:buildIndex()
+         end
+      end
+   end
+end
+
+function DataSet:deleteIndex()
+   if torch.type(self.input) == 'table' then
+      for exampleId,input in ipairs(self.input) do
+         if torch.isTypeOf(input, 'torch.SparseTensor') then
+            input:deleteIndex()
+         end
+      end
+   end
+end
diff --git a/contrib/torch/decisiontree/DecisionForest.lua b/contrib/torch/decisiontree/DecisionForest.lua
new file mode 100644 (file)
index 0000000..cac748e
--- /dev/null
@@ -0,0 +1,82 @@
+local dt = require "decisiontree._env"
+
+-- Decision forest that ensembles a bag of decision trees.
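+-- score(input) returns bias + sum_i weight[i] * trees[i]:score(input)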
+local DecisionForest = torch.class("dt.DecisionForest", "dt.DecisionTree", dt)
+
+function DecisionForest:__init(trees, weight, bias)
+   assert(torch.type(trees) == 'table')
+   self.trees = trees
+   if #trees == 0 then
+      self.weight = weight or torch.Tensor()
+      assert(torch.isTensor(self.weight))
+      assert(self.weight:nElement() == 0)
+   else
+      assert(torch.isTypeOf(trees[1], 'dt.DecisionTree'))
+      self.weight = weight or torch.Tensor(#trees):fill(1)
+      assert(torch.isTensor(self.weight))
+      assert(self.weight:dim() == 1)
+      assert(self.weight:min() >= 0, "Expecting positive weights")
+      assert(#trees == self.weight:size(1))
+   end
+
+   self.bias = bias or 0
+   assert(torch.type(self.bias) == 'number')
+end
+
+function DecisionForest:score(input, incrementalId)
+   assert(torch.isTensor(input))
+
+   local buffer = {}
+   if incrementalId then
+      self.buffers = self.buffers or {}
+      self.buffers[incrementalId] = self.buffers[incrementalId] or {}
+      buffer = self.buffers[incrementalId]
+   end
+   buffer.initialCounter = buffer.initialCounter or 0
+
+   -- TODO: score in parallel
+   local output
+   if torch.isTensor(input) and input.isContiguous and input:isContiguous() and input:nDimension() == 2 then
+      buffer.output = buffer.output or input.new()
+      output = buffer.output
+      assert(output:nElement() == 0 or output:size(1) == input:size(1))
+      if output:nElement() == 0 then
+         output:resize(input:size(1)):fill(self.bias)
+      end
+      for i,tree in ipairs(self.trees) do
+         if i > buffer.initialCounter then
+            local score = tree:score(input, nil, true)
+            output:add(self.weight[i], score)
+         end
+      end
+   else
+      output = buffer.output or self.bias
+      for i,tree in ipairs(self.trees) do
+         if i > buffer.initialCounter then
+            output = output + tree:score(input) * self.weight[i]
+         end
+      end
+      buffer.output = output
+   end
+
+   buffer.initialCounter = #self.trees
+
+   return output
+end
+
+function DecisionForest:add(tree, weight)
+   assert(torch.type(weight) == 'number')
+   assert(weight > 0)
+   table.insert(self.trees, tree)
+   self.weight:resize(#self.trees)
+   self.weight[#self.trees] = weight
+   return self
+end
+
+function DecisionForest:clone()
+   local trees = {}
+   for i, tree in ipairs(self.trees) do
+      trees[i] = tree:clone()
+   end
+   return DecisionForest(trees, self.weight:clone(), self.bias)
+end
diff --git a/contrib/torch/decisiontree/DecisionForestTrainer.lua b/contrib/torch/decisiontree/DecisionForestTrainer.lua
new file mode 100644 (file)
index 0000000..fc90367
--- /dev/null
@@ -0,0 +1,22 @@
+local dt = require "decisiontree._env"
+
+-- Interface for all decisionForestTrainers
+local DFT = torch.class("dt.DecisionForestTrainer", dt)
+
+-- Train a DecisionForest with examples, a table of valid featureIds and a dataset (i.e. sortedExamplesByFeatureId)
+function DFT:train(examples, validFeatureIds, dataset)
+   assert(torch.type(examples) == "table")
+   assert(torch.isTypeOf(examples[1], "dt.LabeledExample"))
+
+   assert(torch.type(validFeatureIds) == 'table')
+
+   assert(torch.type(dataset) == 'table')
+   for k,v in pairs(dataset) do
+      assert(torch.type(v) == 'table')
+      assert(torch.isTypeOf(v[1], 'dt.LabeledExample'))
+      break
+   end
+   -- dataset is a table mapping featureIds to sorted lists of LabeledExamples
+   -- e.g. {featureId={example1,example2,example3}}
+   error"Not Implemented"
+end
diff --git a/contrib/torch/decisiontree/DecisionTree.lua b/contrib/torch/decisiontree/DecisionTree.lua
new file mode 100644 (file)
index 0000000..c61bc37
--- /dev/null
@@ -0,0 +1,12 @@
+local dt = require "decisiontree._env"
+
+-- An interface for decision trees.
+local DecisionTree = torch.class("dt.DecisionTree", dt)
+
+-- Score an input example and return the prediction score.
+-- input is a Tensor or SparseTensor
+-- return prediction score and nodeId
+function DecisionTree:score(input)
+   error"Not Implemented"
+   return score, nodeId
+end
diff --git a/contrib/torch/decisiontree/GBDT_common.h b/contrib/torch/decisiontree/GBDT_common.h
new file mode 100644 (file)
index 0000000..eb99370
--- /dev/null
@@ -0,0 +1,106 @@
+#include "khash.h"
+#include <pthread.h>
+
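+// Loss of a node under the second-order (Newton) approximation: with gradient
+// sum g and hessian sum h, the optimal leaf weight is w* = -g/h, at which the
+// approximate loss is -g*g/(2h); the 1/2 factor is presumably dropped because
+// only comparisons between candidate splits matter.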
+#define computeGradientBoostLoss(g, h) (-(g)*(g)/(h))
+
+// we use khash to make iteration faster than lua tables
+KHASH_SET_INIT_INT64(long)
+
+// defines the data we need for running an instance of the algorithm, plus its constructor/destructor
+typedef struct {
+  khash_t(long)* exampleMap;
+  THLongTensor *exampleIdsWithFeature_cache;
+  long minLeafSize;
+} GBRunData;
+
+
+// allocates data that cannot be shared between threads
+static void gb_local_create_run_data(GBRunData *run_data) {
+  run_data->exampleMap = kh_init(long);
+  run_data->exampleIdsWithFeature_cache = THLongTensor_new();
+}
+
+static void gb_create_run_data(GBRunData *run_data, int minLeafSize) {
+  gb_local_create_run_data(run_data);
+  run_data->minLeafSize = minLeafSize;
+}
+
+static void gb_destroy_run_data(GBRunData *run_data) {
+  THLongTensor_free(run_data->exampleIdsWithFeature_cache);
+  kh_destroy(long, run_data->exampleMap);
+}
+
+// initializes the data required by the optimizer for the given feature.
+static THLongTensor *gb_internal_prepare(lua_State *L, THLongTensor *exampleIds,
+    THLongTensor *exampleIdsWithFeature_cache, int input_index, long feature_id,
+    khash_t(long)* exampleMap) {
+  long *exampleIds_data = THLongTensor_data(exampleIds);
+  long exampleIds_size = THLongTensor_size(exampleIds, 0);
+
+  int ret = 0;
+
+  // if the input is a table, then we have a sparse dataset
+  if (lua_istable(L, input_index)) {
+    if (exampleIds_size == 0) {
+      return NULL;
+    }
+    else {
+      // loops over the examples' ids that this node has to evaluate and, if they have the feature
+      // we're looking for, marks them as present and stores them in the order provided by the
+      // dataset
+      THLongTensor_resize1d(exampleIdsWithFeature_cache, exampleIds_size);
+      kh_clear(long, exampleMap);
+      kh_resize(long, exampleMap, exampleIds_size*8);
+      long *exampleIdsWithFeature_data = THLongTensor_data(exampleIdsWithFeature_cache);
+      long j = 0;
+      // for each sample to be evaluated
+      for (long i = 0; i < exampleIds_size; i++) {
+        // gets the representation for the example
+        lua_pushinteger(L, exampleIds_data[i]);
+        lua_gettable(L, input_index);
+
+        // builds the index, which happens only once per thread for efficiency
+        lua_pushstring(L, "buildIndex");
+        lua_gettable(L, -2);
+        lua_pushvalue(L, -2);
+        lua_call(L, 1, 0);
+
+        // tries to get the feature for this sample
+        lua_pushinteger(L, feature_id);
+        lua_gettable(L, -2);
+        // if present, then...
+        if (!lua_isnil(L, -1)) {
+          // saves the example
+          exampleIdsWithFeature_data[j] = exampleIds_data[i];
+          j++;
+
+          // marks it as present in the hash table
+          kh_put(long, exampleMap, exampleIds_data[i], &ret);
+        }
+
+        lua_pop(L, 2);
+      }
+
+      // resizes to fit only the samples that have the feature
+      THLongTensor_resize1d(exampleIdsWithFeature_cache, j);
+      kh_resize(long, exampleMap, j*8);
+      return exampleIdsWithFeature_cache;
+    }
+  }
+  else {
+    // if the input isn't a table, then it's dense and we cannot have exampleIds missing, so it
+    // depends on feature_id
+    // since exampleIds is fixed between calls and this is going to store the same values to the
+    // same position, we can cache it between calls
+    if (kh_size(exampleMap) == 0) {
+      kh_resize(long, exampleMap, exampleIds_size*8);
+      for (long i = 0; i < exampleIds_size; i++) {
+        kh_put(long, exampleMap, exampleIds_data[i], &ret);
+      }
+    }
+    // notice that we just return the given tensor of ids instead of copying it. the rest of the
+    // code handles this transparently
+    return exampleIds;
+  }
+}
+
diff --git a/contrib/torch/decisiontree/GiniState.lua b/contrib/torch/decisiontree/GiniState.lua
new file mode 100644 (file)
index 0000000..6dfed28
--- /dev/null
@@ -0,0 +1,54 @@
+local dt = require 'decisiontree._env'
+
+-- used by RandomForestTrainer
+local GiniState, parent = torch.class("dt.GiniState", "dt.TreeState", dt)
+
+function GiniState:__init(exampleIds)
+   parent.__init(self, exampleIds)
+   self.nPositiveInLeftBranch = 0
+   self.nPositiveInRightBranch = 0
+end
+
+function GiniState:score(dataset)
+   local dt = require 'decisiontree'
+   local nPositive = dataset:countPositive(self.exampleIds)
+   return dt.calculateLogitScore(nPositive, self.exampleIds:size(1))
+end
+
+function GiniState:initialize(exampleIdsWithFeature, dataset)
+   assert(torch.type(exampleIdsWithFeature) == 'torch.LongTensor')
+   assert(torch.isTypeOf(dataset, 'dt.DataSet'))
+   self.nPositiveInLeftBranch = dataset:countPositive(exampleIdsWithFeature)
+   self.nPositiveInRightBranch = 0
+
+   self.nExampleInLeftBranch = exampleIdsWithFeature:size(1)
+   self.nExampleInRightBranch = 0
+end
+
+function GiniState:update(exampleId, dataset)
+   assert(torch.type(exampleId) == 'number')
+   assert(torch.isTypeOf(dataset, 'dt.DataSet'))
+   if dataset.target[exampleId] > 0 then
+      self.nPositiveInLeftBranch = self.nPositiveInLeftBranch - 1
+      self.nPositiveInRightBranch = self.nPositiveInRightBranch + 1
+   end
+
+   self.nExampleInLeftBranch = self.nExampleInLeftBranch - 1
+   self.nExampleInRightBranch = self.nExampleInRightBranch + 1
+end
+
+function GiniState:computeSplitInfo(splitFeatureId, splitFeatureValue)
+   local dt = require 'decisiontree'
+   local gini = dt.computeGini(self.nExampleInLeftBranch, self.nPositiveInLeftBranch, self.nExampleInRightBranch, self.nPositiveInRightBranch)
+   local splitInfo = {
+      splitId = assert(splitFeatureId),
+      splitValue = assert(splitFeatureValue),
+      leftChildSize = assert(self.nExampleInLeftBranch),
+      leftPositiveCount = assert(self.nPositiveInLeftBranch),
+      rightChildSize = assert(self.nExampleInRightBranch),
+      rightPositiveCount = assert(self.nPositiveInRightBranch),
+      gini = assert(gini),
+      splitGain = gini
+   }
+   return splitInfo
+end
\ No newline at end of file
diff --git a/contrib/torch/decisiontree/GradientBoostState.lua b/contrib/torch/decisiontree/GradientBoostState.lua
new file mode 100644 (file)
index 0000000..f268f3d
--- /dev/null
@@ -0,0 +1,57 @@
+local dt = require 'decisiontree._env'
+
+local GradientBoostState, parent = torch.class("dt.GradientBoostState", "dt.TreeState", dt)
+
+function GradientBoostState:__init(exampleIds, gradInput, hessInput)
+   parent.__init(self, exampleIds)
+   self.gradInput = gradInput
+   self.hessInput = hessInput
+end
+
+function GradientBoostState:score(dataset)
+   local dt = require 'decisiontree'
+   local gradInput = self.gradInput:index(1, self.exampleIds)
+   local hessInput = self.hessInput:index(1, self.exampleIds)
+   return dt.computeNewtonScore(gradInput:sum(), hessInput:sum())
+end
+
+-- calls _branch and encapsulates the left and right exampleIds into a pair of TreeStates
+function GradientBoostState:branch(splitInfo, dataset)
+   local leftExampleIds, rightExampleIds = self:_branch(splitInfo, dataset)
+   return self.new(leftExampleIds, self.gradInput, self.hessInput), self.new(rightExampleIds, self.gradInput, self.hessInput)
+end
+
+-- Partitions self given a splitInfo table, producing a pair of exampleIds corresponding to the left and right subtrees.
+function GradientBoostState:_branch(splitInfo, dataset)
+   local input = dataset.input
+   -- if the input is dense, we can use the optimized version
+   if torch.isTensor(input) and input.isContiguous and input:isContiguous() and input:nDimension() == 2 then
+      return input.nn.GBDT_branch(splitInfo, input, self.exampleIds)
+   end
+   return parent._branch(self, splitInfo, dataset)
+end
+
+-- The following methods are supersets of each other. They can be commented out
+-- to fall back to the Lua implementation with only the core left optimized.
+
+-- THIS ONE CANNOT BE COMMENTED OUT
+function GradientBoostState:findBestFeatureSplit(dataset, featureId, minLeafSize)
+   local ret = self.hessInput.nn.GBDT_findBestFeatureSplit(self.exampleIds, dataset, featureId, minLeafSize, self.gradInput, self.hessInput)
+   return ret
+end
+
+-- finds the best split of examples in treeState among featureIds
+function GradientBoostState:findBestSplit(dataset, featureIds, minLeafSize, shardId, nShard)
+   local ret = self.hessInput.nn.GBDT_findBestSplit(self.exampleIds, dataset, featureIds, minLeafSize, shardId, nShard, self.gradInput, self.hessInput)
+   return ret
+end
+
+-- finds the best split like the previous one, but performs feature parallelism. Note that the
+-- optimization is only applied if the input is dense
+function GradientBoostState:findBestSplitFP(dataset, featureIds, minLeafSize, nThread)
+   local input = dataset.input
+   if torch.isTensor(input) and input.isContiguous and input:isContiguous() and input:nDimension() == 2 then
+      local ret = self.hessInput.nn.GBDT_findBestSplitFP(self.exampleIds, dataset, featureIds, minLeafSize, self.gradInput, self.hessInput, nThread)
+      return ret
+   end
+end
diff --git a/contrib/torch/decisiontree/GradientBoostTrainer.lua b/contrib/torch/decisiontree/GradientBoostTrainer.lua
new file mode 100644 (file)
index 0000000..54d1ba8
--- /dev/null
@@ -0,0 +1,244 @@
+local dt = require "decisiontree._env"
+
+-- Gradient boosted decision tree trainer
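+-- Expected opt fields, per the assertions below (values here are only a sketch):
+--   {treeTrainer=dt.CartTrainer(...), lossFunction=criterion, shrinkage=0.1,
+--    downsampleRatio=0.8, nTree=50, evalFreq=10, earlyStop=2,
+--    featureBaggingSize=-1, decisionForest=nil, useInitBias=nil}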
+local GradientBoostTrainer = torch.class("dt.GradientBoostTrainer", "dt.DecisionForestTrainer", dt)
+
+function GradientBoostTrainer:__init(opt)
+   assert(torch.type(opt) == 'table')
+
+   assert(torch.isTypeOf(opt.treeTrainer, 'dt.CartTrainer'))
+   self.treeTrainer = opt.treeTrainer
+
+   assert(torch.isTypeOf(opt.lossFunction, 'nn.Criterion'))
+   self.lossFunction = opt.lossFunction
+
+   assert(torch.type(opt.shrinkage) == 'number')
+   assert(opt.shrinkage > 0)
+   self.shrinkage = opt.shrinkage
+
+   assert(torch.type(opt.downsampleRatio) == 'number')
+   assert(opt.downsampleRatio > 0)
+   self.downsampleRatio = opt.downsampleRatio
+
+   assert(torch.type(opt.nTree) == 'number')
+   assert(opt.nTree > 0)
+   self.nTree = opt.nTree
+
+   opt.evalFreq = opt.evalFreq or -1
+   assert(torch.type(opt.evalFreq) == 'number')
+   assert(torch.round(opt.evalFreq) == opt.evalFreq)
+   self.evalFreq = opt.evalFreq
+
+   -- when non-positive, no early-stopping
+   opt.earlyStop = opt.earlyStop or (opt.evalFreq - 1)
+   assert(torch.type(opt.earlyStop) == 'number')
+   self.earlyStop = opt.earlyStop
+
+   -- when non-positive, defaults to sqrt(#feature)
+   assert(torch.type(opt.featureBaggingSize) == 'number')
+   self.featureBaggingSize = opt.featureBaggingSize
+
+   if opt.decisionForest then
+      assert(torch.isTypeOf(opt.decisionForest, 'dt.DecisionForest'))
+   end
+   self.decisionForest = opt.decisionForest
+
+   self.useInitBias = opt.useInitBias
+end
+
+function GradientBoostTrainer:computeBias(trainSet, verbose)
+   assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
+
+   if verbose then print("Use new bias generated from the training examples.") end
+
+   return -0.5 * self.gradInput:sum() / self.hessInput:sum()
+end
+
+
+function GradientBoostTrainer:initialize(trainSet, verbose)
+   assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
+
+   trainSet:initScore()
+   self.gradInput, self.hessInput = self.lossFunction:backward2(trainSet.score, trainSet.target)
+
+   -- used for early-stopping (see validate())
+   self.stopCount = 0
+   self.prevTrainLoss = math.huge
+   self.prevTestLoss = math.huge
+
+   if verbose then print("Processing initial decision forest") end
+
+   local decisionForest, bias
+
+   if self.decisionForest then
+      local bias = self.useInitBias and self.decisionForest.bias or self:computeBias(trainSet, verbose)
+
+      decisionForest = dt.DecisionForest(self.decisionForest.trees, self.decisionForest.weight, bias)
+
+      local input = trainSet.input
+      local score = trainSet.score
+      if torch.isTensor(input) and input.isContiguous and input:isContiguous() then
+         score:copy(decisionForest:score(input))
+      else
+         score:resize(trainSet:size())
+         for exampleId=1,trainSet:size() do
+            score[exampleId] = decisionForest:score(input[exampleId])
+         end
+      end
+   else
+      local bias = self:computeBias(trainSet, verbose)
+      decisionForest = dt.DecisionForest({}, torch.Tensor(), bias)
+
+      trainSet.score:fill(bias)
+   end
+
+   if verbose then print("Finish loading initial decision forest") end
+
+   return decisionForest
+end
+
+-- Trains a decision forest of boosted decision trees.
+-- trainSet provides the training examples; validSet is used for cross-validation and early-stopping.
+function GradientBoostTrainer:train(trainSet, featureIds, validSet, verbose)
+   assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
+   assert(torch.type(featureIds) == 'torch.LongTensor')
+   assert(torch.isTypeOf(validSet, 'dt.DataSet'))
+
+   local decisionForest = self:initialize(trainSet, verbose)
+   local bestDecisionForest
+
+   if verbose then print(string.format("Get %d featureIds.", featureIds:size(1))) end
+
+   local baggingSize = self.featureBaggingSize > 0 and self.featureBaggingSize or torch.round(math.sqrt(featureIds:size(1)))
+   local trainExampleIds = trainSet:getExampleIds()
+   local baggingIndices, activeFeatures
+   local treeExampleIds
+
+   local timer = torch.Timer()
+
+   for treeId = 1,self.nTree do
+      timer:reset()
+      if verbose then print(string.format("Begin processing tree number %d of %d", treeId, self.nTree)) end
+
+      -- Get active features
+      activeFeatures = activeFeatures or torch.LongTensor()
+      if baggingSize < featureIds:size(1) then
+         if verbose then print(string.format("Tree %d: Bagging %d from %d features", treeId, baggingSize, featureIds:size(1))) end
+
+         baggingIndices = baggingIndices or torch.LongTensor()
+         baggingIndices:randperm(featureIds:size(1))
+         activeFeatures:index(featureIds, 1, baggingIndices:narrow(1,1,baggingSize))
+      else
+         activeFeatures = featureIds
+      end
+
+      -- Get data samples
+      if self.downsampleRatio < 0.99 then
+         local sampleSize = torch.round(trainSet:size() * self.downsampleRatio)
+
+         if verbose then print(string.format("Tree %d: Downsampling %d of %d samples", treeId, sampleSize, trainSet:size())) end
+
+         baggingIndices = baggingIndices or torch.LongTensor()
+         baggingIndices:randperm(trainSet:size())
+
+         treeExampleIds = treeExampleIds or torch.LongTensor()
+         treeExampleIds:index(trainExampleIds, 1, baggingIndices:narrow(1,1,sampleSize))
+      else
+         treeExampleIds = trainExampleIds
+      end
+
+      if verbose then print(string.format("Tree %d: training CART tree", treeId)) end
+
+      local rootTreeState = dt.GradientBoostState(treeExampleIds, self.gradInput, self.hessInput)
+      local cartTree = self.treeTrainer:train(rootTreeState, activeFeatures)
+
+      if verbose then print(string.format("Tree %d: finished training CART tree in %f seconds", treeId, timer:time().real)) end
+
+      decisionForest:add(cartTree, self.shrinkage)
+
+      -- update score
+      local predictionScore
+      local input = trainSet.input
+      if torch.isTensor(input) and input:isContiguous() then
+         predictionScore = cartTree:score(trainSet.input, nil, true)
+      else
+         local size = trainSet:size()
+         predictionScore = torch.Tensor(size)
+         for exampleId=1,size do
+            predictionScore[exampleId] = cartTree:score(trainSet.input[exampleId])
+         end
+      end
+      trainSet.score:add(self.shrinkage, predictionScore)
+      self.gradInput, self.hessInput = self.lossFunction:backward2(trainSet.score, trainSet.target)
+
+      if verbose then print(string.format("Tree %d: training complete in %f seconds", treeId, timer:time().real)) end
+
+      -- cross-validation/early-stopping
+      if self.evalFreq > 0 and treeId % self.evalFreq == 0 then
+         timer:reset()
+         local stop, validLoss
+         stop, validLoss, bestDecisionForest = self:validate(trainSet, validSet, decisionForest, bestDecisionForest)
+         if dt.PROFILE then print("validate tree time: "..timer:time().real) end
+         if verbose then print(string.format("Loss: valid=%7.4f", validLoss)) end
+         if stop then
+            if verbose then print(string.format("GBDT early stopped on tree %d", treeId)) end
+            break
+         end
+
+      end
+   end
+
+   return bestDecisionForest or decisionForest
+end
+
+function GradientBoostTrainer:validate(trainSet, validSet, decisionForest, bestDecisionForest)
+   assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
+   assert(torch.isTypeOf(validSet, 'dt.DataSet'))
+   assert(torch.isTypeOf(decisionForest, 'dt.DecisionForest'))
+   assert(not bestDecisionForest or torch.isTypeOf(bestDecisionForest, 'dt.DecisionForest'))
+
+   -- buffer
+   local buffer = dt.getBufferTable('GradientBoost')
+   buffer.tensor = buffer.tensor or trainSet.score.new()
+   local score = buffer.tensor
+
+   -- per thread loss function (tensors are shared)
+   local lossname = torch.typename(self.lossFunction)
+   buffer[lossname] = buffer[lossname] or self.lossFunction:clone()
+   local lossFunction = buffer[lossname]
+
+   -- TODO batch this for large datasets
+   local input = validSet.input
+   if torch.isTensor(input) and input.isContiguous and input:isContiguous() then
+      score = decisionForest:score(input, 'val')
+   else
+      score:resize(validSet:size())
+      for exampleId=1,validSet:size() do
+         score[exampleId] = decisionForest:score(input[exampleId], 'val')
+      end
+   end
+   local validLoss = lossFunction:forward(score, validSet.target)
+
+   -- early stop is not enabled when earlyStop=0
+   local stop = false
+   if self.earlyStop > 0 then
+      -- Track test loss and detect early stop
+      if self.prevTestLoss - validLoss < 0 then
+         self.stopCount = self.stopCount + 1
+      else
+         bestDecisionForest = decisionForest:clone()
+         self.stopCount = 0
+      end
+
+      stop = self.stopCount >= self.earlyStop
+   end
+
+   self.prevTestLoss = validLoss
+
+   return stop, validLoss, bestDecisionForest
+end
+
+function GradientBoostTrainer:getName()
+   return string.format(
+      "gbdt-dRatio-%s-maxLeaf-%s-minExample-%s-nTree-%s-shrinkage-%s",
+      self.downsampleRatio, self.treeTrainer.maxLeafNodes, self.treeTrainer.minLeafSize, self.nTree, self.shrinkage
+   )
+end
diff --git a/contrib/torch/decisiontree/LICENSE b/contrib/torch/decisiontree/LICENSE
new file mode 100644 (file)
index 0000000..8dada3e
--- /dev/null
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/contrib/torch/decisiontree/LogitBoostCriterion.lua b/contrib/torch/decisiontree/LogitBoostCriterion.lua
new file mode 100644 (file)
index 0000000..5b9eb60
--- /dev/null
@@ -0,0 +1,45 @@
+local dt = require "decisiontree._env"
+
+-- Ref: slide 17 in https://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf
+
+-- equivalent to nn.Sigmoid() + nn.BCECriterion()
+local LogitBoostCriterion, parent = torch.class("nn.LogitBoostCriterion", "nn.Criterion")
+
+function LogitBoostCriterion:__init(sizeAverage)
+   parent.__init(self)
+   self.sizeAverage = sizeAverage
+   self.hessInput = self.gradInput.new()
+   self._output = torch.Tensor()
+end
+
+function LogitBoostCriterion:updateOutput(input, target)
+   input.nn.LogitBoostCriterion_updateOutput(input, target, self._output, self.sizeAverage)
+   self.output = self._output[1]
+   return self.output
+end
+
+function LogitBoostCriterion:updateGradInput(input, target)
+   input.nn.LogitBoostCriterion_updateGradInput(input, target, self.gradInput)
+   return self.gradInput
+end
+
+function LogitBoostCriterion:updateHessInput(input, target)
+   input.nn.LogitBoostCriterion_updateHessInput(input, target, self.hessInput)
+   return self.hessInput
+end
+
+-- returns gradInput and hessInput
+function LogitBoostCriterion:backward2(input, target)
+   return self:updateGradInput(input, target), self:updateHessInput(input, target)
+end
+
+local gradWrapper = function(input, target, grad)
+   input.nn.LogitBoostCriterion_updateGradInput(input, target, grad)
+end
+local hessianWrapper = function(input, target, hessian)
+   input.nn.LogitBoostCriterion_updateHessInput(input, target, hessian)
+end
+
+function LogitBoostCriterion:getWrappers()
+   return gradWrapper, hessianWrapper
+end
diff --git a/contrib/torch/decisiontree/MSECriterion.lua b/contrib/torch/decisiontree/MSECriterion.lua
new file mode 100644 (file)
index 0000000..948c1a1
--- /dev/null
@@ -0,0 +1,13 @@
+local dt = require "decisiontree._env"
+
+-- the squared error (x - t)^2 has a constant second derivative of 2 w.r.t. x
+function nn.MSECriterion.updateHessInput(self, input, target)
+   self.hessInput = self.hessInput or input.new()
+   self.hessInput:resize(input:size()):fill(2)
+   return self.hessInput
+end
+
+-- returns gradInput and hessInput
+function nn.MSECriterion.backward2(self, input, target)
+   return self:updateGradInput(input, target), self:updateHessInput(input, target)
+end
+
diff --git a/contrib/torch/decisiontree/README.md b/contrib/torch/decisiontree/README.md
new file mode 100644 (file)
index 0000000..db4622a
--- /dev/null
@@ -0,0 +1,386 @@
+# Torch decision tree library
+
+```lua
+local dt = require 'decisiontree'
+```
+
+This project implements random forests and gradient boosted decision trees (GBDT).
+The latter uses gradient tree boosting.
+Both use ensemble learning to produce ensembles of decision trees (that is, forests).
+
+## `nn.DFD`
+
+One practical application for decision forests is to *discretize* an input feature space into a richer output feature space.
+The `nn.DFD` module can be used as a decision forest discretizer (DFD):
+
+```lua
+local dfd = nn.DFD(df, onlyLastNode)
+```
+
+where `df` is a `dt.DecisionForest` instance or the table returned by the method `getReconstructionInfo()` on another `nn.DFD` module, and `onlyLastNode` is a boolean indicating that the module should return only the id of the last node visited on each tree (by default it outputs all traversed nodes except for the roots).
+The `nn.DFD` module requires dense `input` tensors.
+Sparse `input` tensors (tables of tensors) are not supported.
+The `output` returned by a call to `updateOutput` is a batch of sparse tensors.
+It is a table where `output[1]` and `output[2]` are, respectively, lists of key and value tensors:
+
+```lua
+{
+  { [torch.LongTensor], ... , [torch.LongTensor] },
+  { [torch.Tensor], ... , [torch.Tensor] }
+}
+```
+
+This module doesn't support CUDA.
+
+### Example
+As a concrete example, let us first train a Random Forest on a dummy dense dataset:
+
+```lua
+local nExample = 100
+local batchsize = 2
+local inputsize = 10
+
+-- train Random Forest
+local trainSet = dt.getDenseDummyData(nExample, nil, inputsize)
+local opt = {
+   activeRatio=0.5,
+   featureBaggingSize=5,
+   nTree=4,
+   maxLeafNodes=nExample/2,
+   minLeafSize=nExample/10,
+}
+local trainer = dt.RandomForestTrainer(opt)
+local df = trainer:train(trainSet, trainSet.featureIds)
+assert(#df.trees == opt.nTree)
+```
+
+Now that we have `df`, a `dt.DecisionForest` instance, we can use it to initialize `nn.DFD`:
+
+```lua
+local dfd = nn.DFD(df)
+```
+
+The `dfd` instance holds no reference to `df`; instead, it extracts the relevant attributes from `df`.
+These attributes are stored in tensors for batching and efficiency.
+
+We can discretize a hypothetical `input` by calling `forward`:
+```lua
+local input = trainSet.input:sub(1,batchsize)
+local output = dfd:forward(input)
+```
+
+The resulting output is a table consisting of two tables: keys and values.
+The keys and values tables each contain `batchsize` tensors:
+
+```lua
+print(output)
+{
+  1 :
+    {
+      1 : LongTensor - size: 14
+      2 : LongTensor - size: 16
+      3 : LongTensor - size: 15
+      4 : LongTensor - size: 13
+    }
+  2 :
+    {
+      1 : DoubleTensor - size: 14
+      2 : DoubleTensor - size: 16
+      3 : DoubleTensor - size: 15
+      4 : DoubleTensor - size: 13
+    }
+}
+```
+
+An example's feature keys (`LongTensor`) and corresponding values (`DoubleTensor`) have the same number of elements.
+Each example has a variable number of key-value pairs representing the nodes traversed in the tree.
+The output feature space has one dimension (that is, one possible feature key) per node in the forest.
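+
+The sparse output uses the same `{keys, values}` batched format consumed by the `nn.Sparse2Dense` module from this package, so it can be densified as in the following sketch (the node-ID upper bound is an assumption for illustration):
+
+```lua
+local nNode = 60 -- assumption: an upper bound on the node IDs the forest can emit
+local s2d = nn.Sparse2Dense(torch.LongTensor():range(1, nNode))
+local dense = s2d:forward(output) -- batchsize x nNode dense tensor
+```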
+
+## `torch.SparseTensor`
+
+Suppose you have a set of `keys` mapped to `values`:
+```lua
+local keys = torch.LongTensor{1,3,4,7,2}
+local values = torch.Tensor{0.1,0.3,0.4,0.7,0.2}
+```
+
+You can use a `SparseTensor` to encapsulate these into a read-only tensor:
+
+```lua
+local st = torch.SparseTensor(keys, values)
+```
+
+The _decisiontree_ library uses `SparseTensors` to simulate the `__index` method of the `torch.Tensor`.
+For example, one can obtain the value associated with key 3 of the above `st` instance:
+
+```lua
+local value = st[3]
+assert(value == 0.3)
+```
+
+When a key is missing, `nil` is returned instead:
+
+```lua
+local value = st[5]
+assert(value == nil)
+```
+
+The default implementation of this kind of indexing is slow (it uses a sequential scan of the `keys` tensor).
+To speed up indexing, one can call the `buildIndex()` method beforehand:
+
+```lua
+st:buildIndex()
+```
+
+The `buildIndex()` method creates a hash map (a Lua table) from keys to their corresponding indices in the `values` tensor.
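+
+For illustration, a minimal sketch of what the index changes (`_map` is the internal Lua table built by `buildIndex()`, shown here for exposition only):
+
+```lua
+-- after st:buildIndex(), st[key] reduces to a hash-map lookup instead of a linear scan:
+--   local idx = st._map[key]
+--   return idx and st.values[idx] or nil
+assert(st[3] == 0.3) -- same result as before, but O(1) on average
+```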
+
+## `dt.DataSet`
+
+The `CartTrainer`, `RandomForestTrainer` and `GradientBoostTrainer` require that data sets be encapsulated into a `DataSet`.
+Suppose you have a dataset of dense inputs and targets:
+
+```lua
+local nExample = 10
+local nFeature = 5
+local input = torch.randn(nExample, nFeature)
+local target = torch.Tensor(nExample):random(0,1)
+```
+
+These can be encapsulated into a `DataSet` object:
+
+```lua
+local dataset = dt.DataSet(input, target)
+```
+
+Now suppose you have a dataset where the `input` is a table of `SparseTensor` instances:
+
+```lua
+local input = {}
+for i=1,nExample do
+   local nKeyVal = math.random(2,nFeature)
+   local keys = torch.LongTensor(nKeyVal):random(1,nFeature)
+   local values = torch.randn(nKeyVal)
+   input[i] = torch.SparseTensor(keys, values)
+end
+```
+
+You can still use a `DataSet` to encapsulate the sparse dataset:
+
+```lua
+local dataset = dt.DataSet(input, target)
+```
+
+The main purpose of the `DataSet` class is to sort each feature by value.
+This is captured by the `sortFeatureValues(input)` method, which is called in the constructor:
+
+```lua
+local sortedFeatureValues, featureIds = self:sortFeatureValues(input)
+```
+
+The `featureIds` is a `torch.LongTensor` of all available feature IDs.
+For a dense `input` tensor, this is just `torch.LongTensor():range(1,input:size(2))`.
+But for a sparse `input` tensor, the `featureIds` tensor only contains the feature IDs present in the dataset.
+
+The resulting `sortedFeatureValues` is a table mapping `featureIds` to `exampleIds` sorted by `featureValues`.
+For each `featureId`, examples are sorted by `featureValue` in ascending order.
+For example, the table might look like: `{featureId=exampleIds}` where `exampleIds={1,3,2}`.
+
+The `CartTrainer` accesses the `sortedFeatureValues` tensor by calling `getSortedFeature(featureId)`:
+
+```lua
+local exampleIdsWithFeature = dataset:getSortedFeature(featureId)
+```
+
+The ability to access example IDs sorted by feature value, given a feature ID, is the main purpose of the `DataSet`.
+The `CartTrainer` relies on these sorted lists to find the best way to split a set of examples between two tree nodes.
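+
+For intuition, sorting a single feature in the dense case amounts to the following sketch (not the actual implementation):
+
+```lua
+local featureId = 1
+-- sort the column of values for this feature; for dense inputs,
+-- the returned sort indices are exactly the example IDs
+local sortedValues, sortedExampleIds = input:select(2, featureId):sort()
+```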
+
+## `dt.CartTrainer`
+
+```lua
+local trainer = dt.CartTrainer(dataset, minLeafSize, maxLeafNodes)
+```
+
+The `CartTrainer` is used by the `RandomForestTrainer` and `GradientBoostTrainer` to train individual trees.
+CART stands for classification and regression trees.
+However, only binary classifiers are unit tested.
+
+The constructor takes the following arguments:
+
+ * `dataset` is a `dt.DataSet` instance representing the training set.
+ * `minLeafSize` is the minimum number of examples per leaf node in a tree. The larger the value, the more regularization.
+ * `maxLeafNodes` is the maximum number of leaf nodes in the tree. The lower the value, the more regularization.
+
+Training is initiated by calling the `train()` method:
+
+```lua
+local trainSet = dt.DataSet(input, target)
+local rootTreeState = dt.GiniState(trainSet:getExampleIds())
+local activeFeatures = trainSet.featureIds
+local tree = trainer:train(rootTreeState, activeFeatures)
+```
+
+The resulting `tree` is a `CartTree` instance.
+The `rootTreeState` is a `TreeState` instance like `GiniState` (used by `RandomForestTrainer`) or `GradientBoostState` (used by `GradientBoostTrainer`).
+The `activeFeatures` is a `LongTensor` of the feature IDs used to build the tree.
+Every other feature ID is ignored during training. This is useful for feature bagging.
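+
+For example, a bagged subset of `activeFeatures` can be sampled as follows (a sketch mirroring what `RandomForestTrainer` does internally):
+
+```lua
+local baggingSize = 5
+local baggingIndices = torch.LongTensor(baggingSize):random(1, trainSet.featureIds:size(1))
+local activeFeatures = trainSet.featureIds:index(1, baggingIndices)
+local tree = trainer:train(rootTreeState, activeFeatures)
+```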
+
+By default, the `CartTrainer` runs in a single thread.
+The `featureParallel(nThread)` method can be called before calling `train()` to parallelize training using `nThread` workers:
+
+```lua
+local nThread = 3
+trainer:featureParallel(nThread)
+trainer:train(rootTreeState, activeFeatures)
+```
+
+Feature parallelization assigns a set of feature IDs to each thread.
+
+The `CartTrainer` can be used as a stand-alone tree trainer.
+But it is recommended to use it within the context of a `RandomForestTrainer` or `GradientBoostTrainer` instead.
+The latter typically generalize better.
+
+## RandomForestTrainer
+
+The `RandomForestTrainer` is used to train a random forest:
+
+```lua
+local nExample = trainSet:size()
+local opt = {
+   activeRatio=0.5,
+   featureBaggingSize=5,
+   nTree=14,
+   maxLeafNodes=nExample/2,
+   minLeafSize=nExample/10,
+}
+local trainer = dt.RandomForestTrainer(opt)
+local forest = trainer:train(trainSet, trainSet.featureIds)
+```
+
+The returned `forest` is a `DecisionForest` instance.
+A `DecisionForest` has a similar interface to the `CartTree`.
+Indeed, they both sub-class the `DecisionTree` abstract class.
+
+The constructor takes a single `opt` table argument, which contains the actual arguments:
+
+ * `activeRatio` is the ratio of active examples per tree. This is used for bootstrap sampling.
+ * `featureBaggingSize` is the number of features per tree. This is also used for feature bagging.
+ * `nTree` is the number of trees to be trained.
+ * `maxLeafNodes` and `minLeafSize` are passed to the underlying `CartTrainer` constructor (controls regularization).
+
+Internally, the `RandomForestTrainer` passes a `GiniState` to the `CartTrainer:train()` method.
+
+Training can be parallelized by calling `treeParallel(nThread)`:
+
+```lua
+local nThread = 3
+trainer:treeParallel(nThread)
+local forest = trainer:train(trainSet, trainSet.featureIds)
+```
+
+Each tree is then trained in its own worker thread.
+
+## GradientBoostTrainer
+
+References:
+ * A. [Boosted Tree presentation](https://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf)
+
+Gradient boosted decision trees (GBDT) can be trained as follows:
+```lua
+local nExample = trainSet:size()
+local maxLeafNode, minLeafSize = nExample/2, nExample/10
+local cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNode)
+
+local opt = {
+  lossFunction=nn.LogitBoostCriterion(false),
+  treeTrainer=cartTrainer,
+  shrinkage=0.1,
+  downsampleRatio=0.8,
+  featureBaggingSize=-1,
+  nTree=14,
+  evalFreq=8,
+  earlyStop=0
+}
+
+local trainer = dt.GradientBoostTrainer(opt)
+local forest = trainer:train(trainSet, trainSet.featureIds, validSet)
+```
+
+The above code snippet uses the `LogitBoostCriterion` outlined in reference A.
+It is used for training binary classification trees.
+
+The returned `forest` is a `DecisionForest` instance.
+A `DecisionForest` has a similar interface to the `CartTree`.
+Indeed, they both sub-class the `DecisionTree` abstract class.
+
+The constructor takes a single `opt` table argument, which contains the actual arguments:
+
+ * `lossFunction` is an `nn.Criterion` instance extended with the `updateHessInput(input, target)` and `backward2(input, target)` methods. The former returns the hessian with respect to the `input`; the latter returns both the gradient and the hessian.
+ * `treeTrainer` is a `CartTrainer` instance. Its `featureParallel()` method can be called to implement feature parallelization.
+ * `shrinkage` is the weight of each additional tree.
+ * `downsampleRatio` is the ratio of examples to be sampled for each tree. Used for bootstrap sampling.
+ * `featureBaggingSize` is the number of features to sample per tree. Used for feature bagging. `-1` defaults to `torch.round(math.sqrt(featureIds:size(1)))`.
+ * `nTree` is the maximum number of trees.
+ * `evalFreq` is the number of epochs between calls to `validate()` for cross-validation and early-stopping.
+ * `earlyStop` is the maximum number of epochs to wait for early-stopping.
+
+Internally, the `GradientBoostTrainer` passes a `GradientBoostState` to the `CartTrainer:train()` method.
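+
+Once trained, the returned forest is used for inference like a `CartTree`; a minimal sketch, assuming `validSet` holds sparse inputs:
+
+```lua
+local input = validSet.input[1] -- one validation example
+local score = forest:score(input) -- combines the scores of the individual trees
+```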
+
+## TreeState
+
+An abstract class that holds the state of a subtree during decision tree training.
+It also manages the state of candidate splits.
+
+```lua
+local treeState = dt.TreeState(exampleIds)
+```
+
+The `exampleIds` argument is a `LongTensor` containing the example IDs that make up the sub-tree.
+
+## GiniState
+
+A `TreeState` subclass used internally by the `RandomForestTrainer`.
+Uses Gini impurity to determine how to split trees.
+
+```lua
+local treeState = dt.GiniState(exampleIds)
+```
+
+The `exampleIds` argument is a `LongTensor` containing the example IDs that make up the sub-tree.
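+
+For reference, a minimal sketch of the binary Gini impurity this state evaluates when scoring candidate splits:
+
+```lua
+-- impurity of a node where a fraction p of the examples is positive
+local function gini(p) return 1 - p*p - (1-p)*(1-p) end
+assert(gini(0.5) == 0.5) -- maximally impure
+assert(gini(1.0) == 0)   -- pure node
+```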
+
+## GradientBoostState
+
+A `TreeState` subclass used internally by the `GradientBoostTrainer`.
+It implements the GBDT splitting algorithm, which uses a loss function.
+
+```lua
+local treeState = dt.GradientBoostState(exampleIds, lossFunction)
+```
+
+The `exampleIds` argument is a `LongTensor` containing the example IDs that make up the sub-tree.
+The `lossFunction` is an `nn.Criterion` instance (see `GradientBoostTrainer`).
+
+## WorkPool
+
+Utility class that simplifies construction of a pool of daemon threads with which to execute tasks in parallel.
+
+```lua
+local workpool = dt.WorkPool(nThread)
+```
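+
+A minimal usage sketch based on the methods defined in `WorkPool.lua` (`update`, `writeup`, `read` and `terminate`):
+
+```lua
+-- make the decisiontree package available in every worker thread
+workpool:update('require', {libname='decisiontree', varname='dt'})
+-- execute a task (a function with upvalues) on some worker
+workpool:writeup('execute', function(store) return 40 + 2 end)
+local taskname, result = workpool:read()
+assert(taskname == 'execute' and result == 42)
+workpool:terminate() -- shut down the daemon threads
+```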
+
+## CartTree
+
+Implements a trained CART decision tree:
+
+```lua
+local tree = nn.CartTree(rootNode)
+```
+
+The `rootNode` is a `CartNode` instance.
+Each `CartNode` contains pointers to left and right branches, which are themselves `CartNode` instances.
+
+For inference, use the `score(input)` method:
+
+```lua
+local score = tree:score(input)
+```
diff --git a/contrib/torch/decisiontree/RandomForestTrainer.lua b/contrib/torch/decisiontree/RandomForestTrainer.lua
new file mode 100644 (file)
index 0000000..41040b2
--- /dev/null
@@ -0,0 +1,159 @@
+local dt = require "decisiontree._env"
+
+local RandomForestTrainer = torch.class("dt.RandomForestTrainer", dt)
+
+function RandomForestTrainer:__init(opt)
+   assert(torch.type(opt.nTree) == 'number')
+   assert(opt.nTree > 0)
+   self.nTree = opt.nTree
+   -- max number of leaf nodes per tree
+   assert(torch.type(opt.maxLeafNodes) == 'number')
+   assert(opt.maxLeafNodes > 0)
+   self.maxLeafNodes = opt.maxLeafNodes
+   -- min number of examples per leaf
+   assert(torch.type(opt.minLeafSize) == 'number')
+   assert(opt.minLeafSize > 0)
+   self.minLeafSize = opt.minLeafSize
+
+   -- when non-positive, defaults to sqrt(#feature)
+   assert(torch.type(opt.featureBaggingSize) == 'number')
+   self.featureBaggingSize = opt.featureBaggingSize
+
+   assert(torch.type(opt.activeRatio) == 'number')
+   assert(opt.activeRatio > 0)
+   self.activeRatio = opt.activeRatio
+
+   -- default parallelization is singlethread
+   self.parallelMode = 'singlethread'
+end
+
+-- Train a DecisionForest
+function RandomForestTrainer:train(trainSet, featureIds, verbose)
+   assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
+   assert(torch.type(featureIds) == 'torch.LongTensor')
+
+   if verbose then print(string.format("Begin training Decision Forest with %d trees", self.nTree)) end
+
+   local weight = torch.Tensor(self.nTree):fill(1 / self.nTree) -- RF uses uniform weights
+
+   local trees
+   if self.parallelMode == 'singlethread' then
+      trees = self:trainTrees(trainSet, featureIds, verbose)
+   elseif self.parallelMode == 'treeparallel' then
+      trainSet:deleteIndex() -- prevents serialization bottleneck
+      trees = self:trainTreesTP(trainSet, featureIds, verbose)
+   else
+      error("Unrecognized parallel mode: " .. self.parallelMode)
+   end
+
+   if verbose then print(string.format("Successfully trained %d trees", #trees)) end
+
+   -- set bias
+   local bias = 0
+   for i, tree in ipairs(trees) do
+      bias = bias + tree.root.score * weight[i]
+   end
+
+   return dt.DecisionForest(trees, weight, bias)
+end
+
+function RandomForestTrainer:trainTrees(trainSet, featureIds, verbose)
+
+   -- the same CartTrainer will be used for each tree
+   local cartTrainer = dt.CartTrainer(trainSet, self.minLeafSize, self.maxLeafNodes)
+
+   local trees = {}
+   for treeId=1,self.nTree do
+      -- Train a CartTree
+      local tree = self.trainTree(cartTrainer, featureIds, self.featureBaggingSize, self.activeRatio, treeId, verbose)
+      table.insert(trees, tree)
+   end
+   return trees
+end
+
+-- static function that returns a cartTree
+function RandomForestTrainer.trainTree(cartTrainer, featureIds, baggingSize, activeRatio, treeId, verbose)
+   assert(torch.isTypeOf(cartTrainer, 'dt.CartTrainer'))
+   assert(torch.type(featureIds) == 'torch.LongTensor')
+   local baggingSize = baggingSize > 0 and baggingSize or torch.round(math.sqrt(featureIds:size(1)))
+
+   if verbose then
+      print(string.format("Tree %d: Creating features bootstrap sample with baggingSize %d, nFeatures %d", treeId, baggingSize, featureIds:size(1)))
+   end
+
+   local trainSet = cartTrainer.dataset
+
+   -- sample bootstrap features
+   local baggingIndices = torch.LongTensor(baggingSize):random(1,featureIds:size(1))
+   local activeFeatures = featureIds:index(1, baggingIndices)
+
+   -- sample bootstrap examples
+   local sampleSize = torch.round(trainSet:size() * activeRatio)
+   if verbose then print(string.format("Creating bootstrap sample of size %d", sampleSize)) end
+
+   baggingIndices:resize(sampleSize):random(1,trainSet:size())
+   local bootStrapExampleIds = torch.LongTensor()
+   bootStrapExampleIds:index(trainSet:getExampleIds(), 1, baggingIndices)
+
+   local cartTree = cartTrainer:train(dt.GiniState(bootStrapExampleIds), activeFeatures)
+
+   if verbose then print(string.format("Complete processing tree number %d", treeId)) end
+
+   return cartTree
+end
+
+-- call before training to enable tree-level parallelization
+function RandomForestTrainer:treeParallel(workPool)
+   assert(self.parallelMode == 'singlethread', self.parallelMode)
+   self.parallelMode = 'treeparallel'
+   self.workPool = torch.type(workPool) == 'number' and dt.WorkPool(workPool) or workPool
+   assert(torch.isTypeOf(self.workPool, 'dt.WorkPool'))
+
+   -- require the dt package
+   self.workPool:update('require', {libname='decisiontree',varname='dt'})
+end
+
+-- TP is for tree parallel (not toilet paper)
+function RandomForestTrainer:trainTreesTP(trainSet, featureIds, verbose)
+   assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
+   assert(torch.type(featureIds) == 'torch.LongTensor')
+   local minLeafSize = self.minLeafSize
+   local maxLeafNodes = self.maxLeafNodes
+
+   -- setup worker store (each worker will have its own cartTrainer)
+   self.workPool:updateup('execute', function(store)
+      local dt = require 'decisiontree'
+
+      store.cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNodes)
+      store.featureIds = featureIds
+   end)
+
+   for treeId=1,self.nTree do
+      -- upvalues
+      local baggingSize = self.featureBaggingSize
+      local activeRatio = self.activeRatio
+      -- task closure that will be executed in worker-thread
+      local function trainTreeTask(store)
+         local dt = require 'decisiontree'
+         return dt.RandomForestTrainer.trainTree(store.cartTrainer, store.featureIds, baggingSize, activeRatio, treeId, verbose)
+      end
+      self.workPool:writeup('execute', trainTreeTask)
+   end
+
+   local trees = {}
+   for treeId=1,self.nTree do
+      local taskname, tree = self.workPool:read()
+      assert(taskname=='execute')
+      assert(torch.isTypeOf(tree, 'dt.CartTree'))
+      table.insert(trees, tree)
+   end
+   return trees
+end
+
+function RandomForestTrainer:getName()
+   return string.format(
+      "randomforest-aRatio-%4.2f-maxLeaf-%d-minExample-%d-nTree-%d",
+      self.activeRatio, self.maxLeafNodes, self.minLeafSize, self.nTree
+   )
+end
+
diff --git a/contrib/torch/decisiontree/Sparse2Dense.lua b/contrib/torch/decisiontree/Sparse2Dense.lua
new file mode 100644 (file)
index 0000000..4e5b79d
--- /dev/null
@@ -0,0 +1,88 @@
+local S2D, parent = torch.class("nn.Sparse2Dense", "nn.Module")
+local dt = require 'decisiontree._env'
+
+function S2D:__init(features)
+   parent.__init(self)
+   if torch.type(features) == 'table' then
+      assert(#features > 0)
+      features = torch.LongTensor(features)
+   end
+   assert(torch.isTensor(features))
+   self.features = features
+   self.featureMap = nil
+   self.masks = {}
+   self.mappedKeys = {}
+end
+
+function S2D:updateOutput(input)
+   if not self.featureMap then
+      self.featureMap = dt.HashMap()
+      self.featureMap:fill(self.features)
+   end
+   local batched, keys, values
+   if torch.isTensor(input[1]) then
+      keys = {input[1]}
+      values = {input[2]}
+      batched = false
+   else
+      keys = input[1]
+      values = input[2]
+      batched = true
+   end
+   assert(#keys == #values)
+
+   local masks = self.masks
+   local mappedKeys = self.mappedKeys
+   local nKeys = #keys
+   local nMasks = #masks
+   if nMasks < nKeys then
+      for i=nMasks+1,nKeys do
+         masks[i] = torch.ByteTensor()
+         mappedKeys[i] = torch.LongTensor()
+      end
+   elseif nMasks > nKeys then
+      for i=nKeys+1,nMasks do
+         masks[i] = nil
+         mappedKeys[i] = nil
+      end
+   end
+
+   self.featureMap:get(keys, mappedKeys, masks)
+   self.output = self.output or torch.Tensor():type(self._type)
+   self.output.nn.S2D_computeOutput(self.output, mappedKeys, values, masks, self.features)
+   if not batched then
+      self.output = self.output:view(-1)
+   end
+   return self.output
+end
+
+function S2D:type(type, tensorCache)
+   if type then
+      local features = self.features
+      self.features = nil
+      parent.type(self, type, tensorCache)
+      self.features = features
+      return self
+   else
+      return parent.type(self)
+   end
+end
+
+function S2D:updateGradInput(input, gradOutput)
+   error"Not Implemented"
+end
+
+function S2D:reset()
+   parent.reset(self)
+   self.featureMap = nil
+end
+
+function S2D:write(file)
+   self.featureMap = nil
+   parent.write(self, file)
+end
+
+function S2D:read(file)
+   self.featureMap = nil
+   parent.read(self, file)
+end
diff --git a/contrib/torch/decisiontree/SparseTensor.lua b/contrib/torch/decisiontree/SparseTensor.lua
new file mode 100644 (file)
index 0000000..4c620e6
--- /dev/null
@@ -0,0 +1,54 @@
+
+local SparseTensor = torch.class("torch.SparseTensor")
+
+function SparseTensor:__init(keys, values)
+   if keys and values then
+      assert(torch.typename(keys):find('torch%..*LongTensor'))
+      assert(torch.isTensor(values))
+      assert(keys:nElement() == values:nElement(), "Expecting key and value tensors of same size")
+      self.keys = keys
+      self.values = values
+   elseif not (keys or values) then
+      self.keys = torch.LongTensor()
+      self.values = torch.Tensor()
+   else
+      error"Expecting zero or two args"
+   end
+end
+
+function SparseTensor:buildIndex(overwrite)
+   if self._map and not overwrite then return end
+   assert(self.keys and self.keys:dim() == 1)
+   assert(self.values and self.values:dim() == 1)
+   -- hash table
+   self._map = {}
+   for i=1,self.keys:size(1) do
+      self._map[self.keys[i]] = i
+   end
+end
+
+function SparseTensor:deleteIndex()
+   self._map = nil
+end
+
+local __index = SparseTensor.__index
+function SparseTensor:__index(key)
+   if key == nil then
+      error"Attempt to index using a nil key"
+   elseif torch.type(key) ~= 'number' then
+      return __index(self, key)
+   end
+
+   if self._map then
+      assert(torch.type(self._map) == 'table')
+      local idx = self._map[key]
+      return idx and self.values[idx] or nil
+   elseif self.keys:nElement() > 0 then
+      for i=1,self.keys:size(1) do
+         if self.keys[i] == key then
+            return self.values[i]
+         end
+      end
+   end
+   return nil
+end
\ No newline at end of file
diff --git a/contrib/torch/decisiontree/TreeState.lua b/contrib/torch/decisiontree/TreeState.lua
new file mode 100644 (file)
index 0000000..3928649
--- /dev/null
@@ -0,0 +1,191 @@
+local dt = require "decisiontree._env"
+
+local TreeState = torch.class("dt.TreeState", dt)
+
+-- Holds the state of a subtree during decision tree training.
+-- Also, manages the state of candidate splits
+function TreeState:__init(exampleIds)
+   assert(torch.type(exampleIds) == 'torch.LongTensor')
+   self.exampleIds = exampleIds
+
+   self.nExampleInLeftBranch = 0
+   self.nExampleInRightBranch = 0
+end
+
+-- computes and returns the score of the node based on its examples
+function TreeState:score(dataset)
+   error"NotImplemented"
+end
+
+
+-- Initializes the split-state-updater. Initially all examples are in the left branch.
+-- exampleIdsWithFeature is list of examples to split (those having a particular feature)
+function TreeState:initialize(exampleIdsWithFeature, dataset)
+   error"NotImplemented"
+end
+
+-- Update the split state. This call has the effect of shifting the example from the left to the right branch.
+function TreeState:update(exampleId, dataset)
+   error"NotImplemented"
+end
+
+-- Computes the SplitInfo determined by the current split state
+-- @param splitFeatureId the feature id of the split feature
+-- @param splitFeatureValue the feature value of the split feature
+-- @return the SplitInfo determined by the current split state
+function TreeState:computeSplitInfo(splitFeatureId, splitFeatureValue)
+   error"NotImplemented"
+end
+
+-- bottleneck
+function TreeState:findBestFeatureSplit(dataset, featureId, minLeafSize)
+   local dt = require "decisiontree"
+   assert(torch.isTypeOf(dataset, 'dt.DataSet'))
+   assert(torch.type(featureId) == 'number')
+   assert(torch.type(minLeafSize) == 'number')
+
+   -- all dataset example having this feature, sorted by value
+   local featureExampleIds = dataset:getSortedFeature(featureId)
+
+   local buffer = dt.getBufferTable('TreeState')
+   buffer.longtensor = buffer.longtensor or torch.LongTensor()
+   local exampleIdsWithFeature = buffer.longtensor
+
+   -- map and tensor of examples containing feature:
+   local exampleMap = {}
+   local getExampleFeatureValue
+
+   local j = 0
+   if torch.type(dataset.input) == 'table' then
+      exampleIdsWithFeature:resize(self.exampleIds:size())
+      self.exampleIds:apply(function(exampleId)
+         local input = dataset.input[exampleId]
+         input:buildIndex()-- only builds index first time
+         if input[featureId] then
+            j = j + 1
+            exampleIdsWithFeature[j] = exampleId
+            exampleMap[exampleId] = j
+         end
+      end)
+      if j == 0 then
+         return
+      end
+      exampleIdsWithFeature:resize(j)
+      getExampleFeatureValue = function(exampleId) return dataset.input[exampleId][featureId] end
+   else
+      exampleIdsWithFeature = self.exampleIds
+      self.exampleIds:apply(function(exampleId)
+         j = j + 1
+         exampleMap[exampleId] = j
+      end)
+      local featureValues = dataset.input:select(2,featureId)
+      getExampleFeatureValue = function(exampleId) return featureValues[exampleId] end
+   end
+
+
+   self:initialize(exampleIdsWithFeature, dataset)
+
+   -- bottleneck
+   local bestSplit, previousSplitValue, _tictoc
+   for i=featureExampleIds:size(1),1,-1 do -- loop over examples sorted (desc) by feature value
+      local exampleId = featureExampleIds[i]
+
+      local exampleIdx = exampleMap[exampleId]
+      if exampleIdx then
+         local splitValue = getExampleFeatureValue(exampleId)
+
+         if previousSplitValue and math.abs(splitValue - previousSplitValue) > dt.EPSILON then
+            local splitInfo = self:computeSplitInfo(featureId, previousSplitValue, _tictoc)
+            if (splitInfo.leftChildSize >= minLeafSize) and (splitInfo.rightChildSize >= minLeafSize) then
+
+               if (not bestSplit) or (splitInfo.splitGain < bestSplit.splitGain) then
+                  _tictoc = bestSplit or {} -- reuse table
+                  bestSplit = splitInfo
+               end
+
+            end
+         end
+
+         previousSplitValue = splitValue
+
+         -- bottleneck
+         self:update(exampleId, dataset, exampleIdx)
+      end
+   end
+
+   return bestSplit
+end
+
+-- finds the best split of examples in treeState among featureIds
+function TreeState:findBestSplit(dataset, featureIds, minLeafSize, shardId, nShard)
+   assert(torch.isTypeOf(dataset, 'dt.DataSet'))
+   assert(torch.type(featureIds) == 'torch.LongTensor')
+   assert(torch.type(minLeafSize) == 'number')
+   assert(torch.type(shardId) == 'number')
+   assert(torch.type(nShard) == 'number')
+
+   local bestSplit
+   for i=1,featureIds:size(1) do
+      local featureId = featureIds[i]
+      if (nShard <= 1) or ( (featureId % nShard) + 1 == shardId ) then -- feature sharded
+         local splitCandidate = self:findBestFeatureSplit(dataset, featureId, minLeafSize)
+         if splitCandidate and ((not bestSplit) or (splitCandidate.splitGain < bestSplit.splitGain)) then
+            bestSplit = splitCandidate
+         end
+      end
+   end
+
+   return bestSplit
+end
+
+-- Partitions self given a splitInfo table, producing a pair of exampleIds corresponding to the left and right subtrees.
+function TreeState:_branch(splitInfo, dataset)
+   local leftIdx, rightIdx = 0, 0
+   local nExample = self.exampleIds:size(1)
+   local splitExampleIds = torch.LongTensor(nExample)
+
+
+   for i=1,self.exampleIds:size(1) do
+      local exampleId = self.exampleIds[i]
+      local input = dataset.input[exampleId]
+      local val = input[splitInfo.splitId]
+      -- Note: when the feature is not present in the example, the example is dropped from all sub-trees.
+      -- Which means that for most sparse data, a tree cannot reach 100% accuracy...
+      if val then
+         if val < splitInfo.splitValue then
+            leftIdx = leftIdx + 1
+            splitExampleIds[leftIdx] = exampleId
+         else
+            rightIdx = rightIdx + 1
+            splitExampleIds[nExample-rightIdx+1] = exampleId
+         end
+      end
+   end
+
+   local leftExampleIds = splitExampleIds:narrow(1,1,leftIdx)
+   local rightExampleIds = splitExampleIds:narrow(1,nExample-rightIdx+1,rightIdx)
+
+   assert(leftExampleIds:size(1) + rightExampleIds:size(1) <= self.exampleIds:size(1), "Left and right branches contain more data than the parent!")
+   return leftExampleIds, rightExampleIds
+end
+
+-- calls _branch and encapsulates the left and right exampleIds into a TreeStates
+function TreeState:branch(splitInfo, dataset)
+   local leftExampleIds, rightExampleIds = self:_branch(splitInfo, dataset)
+   return self.new(leftExampleIds), self.new(rightExampleIds)
+end
+
+function TreeState:size()
+   return self.exampleIds:size(1)
+end
+
+function TreeState:contains(exampleId)
+   local found = false
+   self.exampleIds:apply(function(x)
+      if x == exampleId then
+         found = true
+      end
+   end)
+   return found
+end
+
diff --git a/contrib/torch/decisiontree/WorkPool.lua b/contrib/torch/decisiontree/WorkPool.lua
new file mode 100644 (file)
index 0000000..8f47372
--- /dev/null
@@ -0,0 +1,156 @@
+local dt = require "decisiontree._env"
+
+-- Utility to simplify construction of a pool of daemon threads with which to execute tasks in parallel.
+local WorkPool = torch.class("dt.WorkPool", dt)
+
+function WorkPool:__init(nThread)
+   self.nThread = nThread or 16
+   assert(torch.type(self.nThread) == 'number')
+   assert(self.nThread > 0)
+
+   self:initialize()
+end
+
+function WorkPool:initialize()
+   local ipc = require 'libipc'
+   self.queuename = os.tmpname()
+   self.queue = ipc.workqueue(self.queuename)
+   self.queues = {}
+   for i=1,self.nThread do
+      self.queues[i] = ipc.workqueue(self.queuename.."/"..i)
+   end
+
+   -- spawn thread workers
+   ipc.map(self.nThread, function(queuename, nThread, myId)
+      assert(nThread)
+      assert(myId)
+      local ipc = require 'libipc'
+
+      -- Open the queue by name (the main thread already created it)
+      local mainqueue = ipc.workqueue(queuename)
+      local workqueue = ipc.workqueue(queuename.."/"..myId)
+
+      local taskname, task
+
+      local store = {}
+      local queue = mainqueue
+
+      repeat
+         local msg = queue:read()
+         assert(torch.type(msg) == 'table')
+         taskname, task = unpack(msg)
+         if taskname == nil then
+            break
+         elseif torch.type(taskname) ~= 'string' then
+            error("Expecting taskname string. Got "..torch.type(taskname))
+         elseif taskname == 'storeKeyValue' then
+            assert(torch.type(task) == 'table')
+            assert(queue == workqueue)
+            store[task.key] = task.value
+            queue:write({taskname})
+         elseif taskname == 'storeKeysValues' then
+            assert(torch.type(task) == 'table')
+            assert(queue == workqueue)
+            for key,value in pairs(task) do
+               store[key] = value
+            end
+            queue:write({taskname})
+         elseif taskname == 'require' then
+            assert(torch.type(task) == 'table')
+            assert(torch.type(task.libname) == 'string')
+            assert(torch.type(task.varname) == 'string')
+            _G[task.varname] = require(task.libname)
+            assert(queue == workqueue)
+            queue:write({taskname})
+         elseif taskname == 'storeReset' then
+            store = {}
+            mainqueue:write({taskname})
+         elseif taskname == 'echo' then
+            mainqueue:write({taskname, task})
+         elseif taskname == 'readWorkerQueue' then
+            queue = workqueue
+         elseif taskname == 'readMainQueue' then
+            queue = mainqueue
+         elseif taskname == 'execute' then
+            if torch.type(task) == 'table' then
+               assert(task.func and task.args)
+               queue:write({taskname, task.func(store, task.args, myId)})
+            else
+               assert(torch.type(task) == 'function')
+               queue:write({taskname, task(store, myId)})
+            end
+         else
+            error("Unknown taskname: "..taskname)
+         end
+      until taskname == nil
+   end, self.queuename, self.nThread)
+
+end
+
+-- Terminates all daemon threads.
+function WorkPool:terminate()
+   for i=1,self.nThread do
+      self.queue:write({})
+   end
+end
+
+-- this function is used to update the store of data in each worker thread
+function WorkPool:_update(taskname, task, upval)
+   assert(torch.type(taskname) == 'string')
+   local _ = require 'moses'
+   assert(_.contains({'storeKeyValue','storeKeysValues','require','execute'}, taskname))
+   assert(torch.type(task) == 'table' or torch.type(task) == 'function')
+
+   -- tell the workers to read their individual queue
+   for i=1,self.nThread do
+      self.queue:write({'readWorkerQueue'})
+   end
+
+   -- write to individual worker queues
+   for i=1,self.nThread do
+      if upval then
+         self.queues[i]:writeup({taskname, task})
+      else
+         self.queues[i]:write({taskname, task})
+      end
+   end
+
+   -- TODO use ipc.mutex:barrier(nThread+1)
+   -- barrier: make sure that every worker has completed task by reading their queue
+   for i=1,self.nThread do
+      assert(self.queues[i]:read()[1] == taskname)
+   end
+
+   -- finally, tell them to read the main queue
+   for i=1,self.nThread do
+      self.queues[i]:write({'readMainQueue'})
+   end
+end
+
+function WorkPool:update(taskname, task)
+   return self:_update(taskname, task, false)
+end
+
+function WorkPool:updateup(taskname, task)
+   return self:_update(taskname, task, true)
+end
+
+function WorkPool:write(taskname, task)
+   assert(torch.type(taskname) == 'string')
+   assert(taskname ~= 'storeKeyValue' and taskname ~= 'storeKeysValues')
+   self.queue:write({taskname, task})
+end
+
+function WorkPool:writeup(taskname, task)
+   assert(torch.type(taskname) == 'string')
+   assert(taskname ~= 'storeKeyValue' and taskname ~= 'storeKeysValues')
+   self.queue:writeup({taskname, task})
+end
+
+function WorkPool:read()
+   local res = self.queue:read()
+   assert(torch.type(res) == 'table')
+   assert(torch.type(res[1]) == 'string')
+   return unpack(res)
+end
+
diff --git a/contrib/torch/decisiontree/_env.lua b/contrib/torch/decisiontree/_env.lua
new file mode 100644 (file)
index 0000000..a927701
--- /dev/null
@@ -0,0 +1,5 @@
+
+-- https://github.com/torch/torch7/issues/525
+
+local dl = {}
+return dl
\ No newline at end of file
diff --git a/contrib/torch/decisiontree/benchmark.lua b/contrib/torch/decisiontree/benchmark.lua
new file mode 100644 (file)
index 0000000..2b6a03d
--- /dev/null
@@ -0,0 +1,171 @@
+local dt = require "decisiontree._env"
+
+local bm = {}
+function bm.CartTrainer(opt)
+   local timer = torch.Timer()
+   local trainSet, validSet = dt.getSparseDummyData(opt)
+   print(string.format("CartTrainer: sparse dataset create: %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
+
+   local cartTrainer = dt.CartTrainer(trainSet, opt.minLeafSize, opt.maxLeafNodes)
+   local treeState = dt.GiniState(trainSet:getExampleIds())
+   timer:reset()
+   local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
+   print(string.format("CartTrainer: train single-thread : %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
+
+   timer:reset()
+   cartTrainer:featureParallel(opt.nThread)
+   print(string.format("CartTrainer: setup feature-parallel : %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
+   timer:reset()
+   local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
+   print(string.format("CartTrainer: train feature-parallel : %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
+end
+
+function bm.GradientBoostState(opt)
+   local trainSet, validSet = dt.getSparseDummyData(opt)
+
+   trainSet:initScore()
+
+   local treeState = dt.GradientBoostState(trainSet:getExampleIds(), nn.LogitBoostCriterion(false))
+
+   local timer = torch.Timer() -- first step also calls SparseTensor:buildIndex()
+   treeState:findBestSplit(trainSet, trainSet.featureIds, 10, 1, 3)
+   print(string.format("GradientBoostState: findBestSplit (first) : %f sec", timer:time().real))
+
+   timer:reset()
+   treeState:findBestSplit(trainSet, trainSet.featureIds, 10, 1, 3)
+   print(string.format("GradientBoostState: findBestSplit (second) : %f sec", timer:time().real))
+
+end
+
+local function file_exists(name)
+   local f=io.open(name,"r")
+   if f~=nil then io.close(f) return true else return false end
+end
+
+function bm.GradientBoostTrainer(opt)
+   local trainSet, validSet
+   if file_exists("/tmp/train.bin") and file_exists("/tmp/valid.bin") then
+      trainSet = torch.load("/tmp/train.bin")
+      validSet = torch.load("/tmp/valid.bin")
+   else
+      if opt.sparse then
+         trainSet, validSet = dt.getSparseDummyData(opt)
+      else
+         trainSet, validSet = dt.getDenseDummyData(opt)
+      end
+      torch.save("/tmp/train.bin", trainSet)
+      torch.save("/tmp/valid.bin", validSet)
+   end
+
+   local cartTrainer = dt.CartTrainer(trainSet, opt.minLeafSize, opt.maxLeafNodes)
+   opt.lossFunction = nn.LogitBoostCriterion(false)
+   opt.treeTrainer = cartTrainer
+   local forestTrainer = dt.GradientBoostTrainer(opt)
+
+   local timer = torch.Timer()
+   local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds, validSet)
+   local time = timer:time().real
+   print(string.format("GradientBoostTrainer: train single-thread : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
+
+   cartTrainer:featureParallel(opt.nThread)
+   timer:reset()
+   local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds, validSet)
+   local time = timer:time().real
+   print(string.format("GradientBoostTrainer: train feature-parallel : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
+end
+
+function bm.RandomForestTrainer(opt)
+   local trainSet, validSet = dt.getSparseDummyData(opt)
+
+   local forestTrainer = dt.RandomForestTrainer(opt)
+   local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
+
+   local timer = torch.Timer()
+   local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
+   local time = timer:time().real
+   print(string.format("RandomForestTrainer: train single-thread : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
+
+   timer:reset()
+   forestTrainer:treeParallel(opt.nThread)
+   print(string.format("RandomForestTrainer: setup tree-parallel : %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
+
+   timer:reset()
+   local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
+   local time = timer:time().real
+   print(string.format("RandomForestTrainer: train tree-parallel : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
+end
+
+function bm.DFD(opt)
+   local _ = require 'moses'
+   local opt = _.clone(opt)
+   opt.nExample = 200
+   local trainSet, validSet = dt.getDenseDummyData(opt)
+
+   local forestTrainer = dt.RandomForestTrainer(opt)
+   forestTrainer:treeParallel(opt.nThread)
+   local timer = torch.Timer()
+   local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
+   local time = timer:time().real
+   print(string.format("DFD: train random forest in parallel : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
+
+
+   -- benchmark nn.DFD
+   local input = trainSet.input:sub(1,opt.batchsize)
+   local dfd = nn.DFD(decisionForest)
+   dfd:forward(input)
+   timer:reset()
+   for i=1,opt.nloop do
+      dfd:forward(input)
+   end
+   print(string.format("DFD: updateOutput : %f samples/sec; %f sec", opt.nloop*opt.batchsize/timer:time().real, timer:time().real))
+end
+
+function bm.Sparse2Dense(opt)
+   local _ = require 'moses'
+   local opt = _.clone(opt)
+   opt.nExample = opt.batchsize
+   local trainSet = dt.getSparseDummyData(opt)
+
+   local input = {{},{}}
+   for i=1,opt.batchsize do
+      input[1][i] = trainSet.input[i].keys
+      input[2][i] = trainSet.input[i].values
+   end
+   assert(#input[1] == opt.batchsize)
+
+   -- benchmark nn.Sparse2Dense
+   local s2d = nn.Sparse2Dense(torch.LongTensor():range(1,opt.nFeature))
+   s2d:forward(input)
+   local timer = torch.Timer()
+   for i=1,opt.nloop do
+      s2d:forward(input)
+   end
+   print(string.format("Sparse2Dense: updateOutput : %f samples/sec; %f sec", opt.nloop*opt.batchsize/timer:time().real, timer:time().real))
+end
+
+function dt.benchmark(benchmarks, opt2)
+   local opt = {
+      nExample=10000, nCluster=2, nFeature=1000, overlap=0, nValid=100,        -- getSparseDummyData
+      nTree=20, featureBaggingSize=-1, sparse=true,                           -- GradientBoostTrainer and RandomForestTrainer
+      nThread=2, shrinkage=0.1, downsampleRatio=0.1, evalFreq=5, earlyStop=0, -- GradientBoostTrainer
+      activeRatio=0.5,                                                      -- RandomForestTrainer
+      batchsize=32, nloop=10
+   }
+
+   local _ = require 'moses'
+   benchmarks = benchmarks or _.keys(bm)
+   assert(torch.type(benchmarks) == 'table')
+   for i,benchmark in ipairs(benchmarks) do
+      local opt1 = _.clone(opt)
+      for key, value in pairs(opt2 or {}) do
+         opt1[key] = value
+      end
+      opt1.nActive = opt1.nActive or torch.round(opt1.nFeature/10)
+      opt1.maxLeafNodes = opt1.maxLeafNodes or (opt1.nExample/10)
+      opt1.minLeafSize = opt1.minLeafSize or (opt1.nExample/100)
+
+      assert(torch.type(benchmark) == 'string', benchmark)
+      assert(bm[benchmark], benchmark)
+      bm[benchmark](opt1)
+   end
+end
diff --git a/contrib/torch/decisiontree/doc/benchmark.md b/contrib/torch/decisiontree/doc/benchmark.md
new file mode 100644 (file)
index 0000000..cb8f905
--- /dev/null
@@ -0,0 +1,291 @@
+# Benchmarks
+
+This file outlines the roadmap (and corresponding benchmarks) of optimizations and refactorings over time.
+
+## Baseline
+
+The baseline implementation is very slow.
+We converted the Twitter decision tree library (used internally) from Java to Lua.
+The objective was to replicate the GBDT and Random Forest implementations as is (more or less).
+The Java library is very good and reasonably fast. The same code in Lua is slow.
+The point of this Lua baseline was not to obtain the same computational performance as the Java library.
+Instead, we wanted the training and inference algorithms of the Lua lib to match those of the Java lib.
+As such, the training/validation error of the baseline Lua lib should match that of the Java lib.
+The unit tests seem to validate this claim as both training/validation set performance is unit tested.
+We also used the conversion exercise as a way to learn about decision tree implementation (our background is deep learning).
+That being said, the baseline performance is terrible:
+
+```
+th -e "dt = require 'decisiontree'; dt.benchmark()"
+CartTrainer: sparse dataset create: 2963.192386 samples/sec; 0.337479 sec
+CartTrainer: train single-thread : 14.165438 samples/sec; 70.594361 sec
+CartTrainer: setup feature-parallel : 5.129034 samples/sec; 194.968478 sec
+CartTrainer: train feature-parallel : 9.736592 samples/sec; 102.705344 sec
+```
+
+The original Java lib had approximately 43 classes.
+The baseline has about 24.
+This reduction is due to obvious merging of classes, but also to conversions of classes to functions.
+The next patches continue this process of reducing the number of classes.
+
+## Patch 1 (complete):
+
+This patch further reduces the number of classes, but adds the DataSet class.
+The code is much simpler to read. Examples are batched.
+
+ * [x] examples are batched in dt.DataSet: {input, target, score}
+ * [x] deprecate dt.LabeledExample
+ * [x] list of examples are replaced with torch.LongTensors of exampleIds
+ * [x] merge TreeBrancher into TreeState
+ * [x] merge BestSplitFinder and SplitStateUpdater into TreeState
+ * [x] TreeState subclasses: GradientBoostState and GiniState
+
+```
+th -e "dt = require 'decisiontree'; dt.benchmark()"
+CartTrainer: sparse dataset create: 3597.392294 samples/sec; 0.277984 sec
+CartTrainer: train single-thread : 35.763255 samples/sec; 27.961663 sec
+CartTrainer: setup feature-parallel : 36759.250495 samples/sec; 0.027220 sec
+CartTrainer: train feature-parallel : 72.523658 samples/sec; 13.788606 sec
+```
+
+The setup time for feature-parallelization is most improved.
+The run-time for feature-parallel is also about half that of single-thread.
+Since it is using 2 threads, that means the parallelization is working quite well.
+
+We also added benchmarks for the `RandomForestTrainer` and `GradientBoostTrainer`:
+
+```
+GradientBoostTrainer: train single-thread : 599.895105 samples/sec; 0.083348 sec/tree, 1.666958 sec
+GradientBoostTrainer: train feature-parallel : 974.235273 samples/sec; 0.051322 sec/tree, 1.026446 sec
+RandomForestTrainer: train single-thread : 134.781044 samples/sec; 0.370972 sec/tree, 7.419441 sec
+RandomForestTrainer: setup tree-parallel : 73341.097064 samples/sec; 0.013649 sec
+RandomForestTrainer: train tree-parallel : 262.975891 samples/sec; 0.190131 sec/tree, 3.802630 sec
+```
+
+Looks good.
+
+## Patch 2 (complete):
+
+ * [x] dt.LossFunction -> nn.Criterion (LogitBoost is done, missing MSE)
+ * [x] use SparseTensor:buildIndex() to accelerate TreeState:findBestSplit()
+ * [x] benchmarks use 10000 instead of 1000 examples
+
+The benchmarks indicate good improvements. Most improvements were made possible by the use of `buildIndex`:
+
+```
+th -e "dt = require 'decisiontree'; dt.benchmark()"
+GradientBoostState: findBestSplit (first) : 11.415645 sec
+GradientBoostState: findBestSplit (second) : 11.246336 sec
+CartTrainer: sparse dataset create: 3284.803629 samples/sec; 3.044327 sec
+CartTrainer: train single-thread : 239.544758 samples/sec; 41.745858 sec
+CartTrainer: setup feature-parallel : 10996.443063 samples/sec; 0.909390 sec
+CartTrainer: train feature-parallel : 473.888592 samples/sec; 21.102011 sec
+RandomForestTrainer: train single-thread : 892.985186 samples/sec; 0.559920 sec/tree, 11.198394 sec
+RandomForestTrainer: setup tree-parallel : 176806.252266 samples/sec; 0.056569 sec
+RandomForestTrainer: train tree-parallel : 1377.849291 samples/sec; 0.362884 sec/tree, 7.257688 sec
+GradientBoostTrainer: train single-thread : 2685.485128 samples/sec; 0.186186 sec/tree, 3.723722 sec
+GradientBoostTrainer: train feature-parallel : 3712.313215 samples/sec; 0.134687 sec/tree, 2.693738 sec
+```
+
+The main bottleneck now is in serializing the SparseTensor hash maps. We temporarily overcame this bottleneck by
+deleting indexes when calling `CartTrainer:featureParallel()` and `RandomForestTrainer:treeParallel()`.
+In this way, the indexes are recreated in each thread. Ideally, we would use a C hash map so that a pointer
+could be serialized instead, but `tds.Hash` does not serialize well. For now, we use Lua tables instead.
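+
+For illustration, the index lifecycle looks roughly like this (`buildIndex` and the
+index deletion are the operations described above; the constructor and the `get`
+and `deleteIndex` signatures are assumptions, not a verbatim API):
+
+```lua
+local dt = require 'decisiontree'
+
+-- a sparse row: feature ids and their values
+local st = torch.SparseTensor(torch.LongTensor{1, 4, 9}, torch.Tensor{0.1, 0.2, 0.3})
+
+st:buildIndex()      -- key -> position hash map; findBestSplit lookups become O(1)
+local v = st:get(4)  -- indexed lookup, returns 0.2
+st:deleteIndex()     -- drop the index before serialization to worker threads,
+                     -- which then rebuild it locally
+```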
+
+This is the benchmark for `GradientBoostTrainer` on a large dataset of dense inputs:
+
+```
+th -e "dt = require 'decisiontree'; dt.benchmark({'GradientBoostTrainer'}, {nExample=100000, sparse=false, nFeature=836, nTree=5, downsampleRatio=1, minLeafSize=1000, maxLeafNodes=8})"
+GradientBoostTrainer: train single-thread : 152.463989 samples/sec; 131.178517 sec/tree, 655.892584 sec
+GradientBoostTrainer: train feature-parallel : 224.288488 samples/sec; 89.170872 sec/tree, 445.854358 sec
+[tw-mbp-nleonard decisiontree]$ th -e "dt = require 'decisiontree'; dt.benchmark({'GradientBoostTrainer'}, {nExample=100000, sparse=false, nFeature=836, nTree=5, downsampleRatio=1, minLeafSize=1000, maxLeafNodes=8,nThread=4})"
+GradientBoostTrainer: train single-thread : 163.836896 samples/sec; 122.072625 sec/tree, 610.363126 sec
+GradientBoostTrainer: train feature-parallel : 407.981442 samples/sec; 49.021838 sec/tree, 245.109188 sec
+```
+
+## Patch 3:
+
+This patch optimizes GBDT training for large datasets consisting of dense inputs. The benchmarks:
+
+```
+th -e "dt = require 'decisiontree'; dt.benchmark({'GradientBoostTrainer'}, {nExample=100000, sparse=false, nFeature=836, nTree=5, downsampleRatio=1, minLeafSize=1000, maxLeafNodes=8})"
+GradientBoostTrainer: train single-thread : 547.553407 samples/sec; 36.526117 sec/tree, 182.630587 sec
+GradientBoostTrainer: train feature-parallel : 792.964678 samples/sec; 25.221804 sec/tree, 126.109022 sec
+[tw-mbp-nleonard decisiontree]$ th -e "dt = require 'decisiontree'; dt.benchmark({'GradientBoostTrainer'}, {nExample=100000, sparse=false, nFeature=836, nTree=5, downsampleRatio=1, minLeafSize=1000, maxLeafNodes=8,nThread=4})"
+GradientBoostTrainer: train single-thread : 555.793759 samples/sec; 35.984571 sec/tree, 179.922855 sec
+GradientBoostTrainer: train feature-parallel : 1289.977846 samples/sec; 15.504142 sec/tree, 77.520711 sec
+```
+
+For 1, 2 and 4 threads, the speedups of patch 3 over patch 2 are respectively: 3.39, 3.53, and 3.18.
+For this patch, the multi-threading speedup of 2 and 4 threads over a single thread are respectively: 1.42 and 2.33.
+Improvements over the previous patch were obtained by optimizing two aspects:
+
+  1. Optimizing `TreeState.findBestFeatureSplit` for dense datasets (for example: `if dense, then ...`);
+  2. Removing `assert` clauses in `GradientBoostState.update`. The `update` method is called for every (example, feature), making it a major bottleneck.
+
+Converting the `update` to C could lead to further optimizations.
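+
+Schematically, the hot loop that motivated this change looks as follows (not the
+library's literal code; the field names mirror the split-search state that is
+later moved to C in `generic/GBDT_internal.h`):
+
+```lua
+-- schematic stand-in for the real class
+local GradientBoostState = {}
+GradientBoostState.__index = GradientBoostState
+
+-- called once per (example, feature): moves one example from the left branch
+-- to the right and updates the running gradient/hessian statistics.
+-- the assert()s that used to guard the arguments were removed: at one call
+-- per (example, feature) pair, even cheap asserts dominate the profile.
+function GradientBoostState.update(self, gradient, hessian)
+   self.leftGradientSum  = self.leftGradientSum  - gradient
+   self.rightGradientSum = self.rightGradientSum + gradient
+   self.leftHessianSum   = self.leftHessianSum   - hessian
+   self.rightHessianSum  = self.rightHessianSum  + hessian
+   self.nExampleInLeftBranch  = self.nExampleInLeftBranch  - 1
+   self.nExampleInRightBranch = self.nExampleInRightBranch + 1
+end
+```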
+
+This patch also improves the benchmark on sparse datasets:
+```
+$ th -e "dt = require 'decisiontree'; dt.benchmark()"
+RandomForestTrainer: train single-thread : 1121.311196 samples/sec; 0.445907 sec/tree, 8.918131 sec
+RandomForestTrainer: setup tree-parallel : 168773.323354 samples/sec; 0.059256 sec
+RandomForestTrainer: train tree-parallel : 1701.280938 samples/sec; 0.293896 sec/tree, 5.877924 sec
+GradientBoostState: findBestSplit (first) : 8.250646 sec
+GradientBoostState: findBestSplit (second) : 7.952077 sec
+GradientBoostTrainer: train single-thread : 3355.248596 samples/sec; 0.149020 sec/tree, 2.980405 sec
+GradientBoostTrainer: train feature-parallel : 4399.133369 samples/sec; 0.113659 sec/tree, 2.273175 sec
+CartTrainer: sparse dataset create: 3428.105601 samples/sec; 2.917069 sec
+CartTrainer: train single-thread : 282.172416 samples/sec; 35.439331 sec
+CartTrainer: setup feature-parallel : 9455.440801 samples/sec; 1.057598 sec
+CartTrainer: train feature-parallel : 594.054049 samples/sec; 16.833491 sec
+DFD: train random forest in parallel : 346.831378 samples/sec; 0.288325 sec/tree, 5.766491 sec
+DFD: updateOutput : 831.105546 samples/sec; 0.038509 sec
+```
+
+## Patch 4:
+
+This patch improves `nn.DFD` from
+
+```
+th -e "dt = require 'decisiontree'; dt.benchmark({'DFD'}, {nTree=500,maxLeafNodes=8,minLeafSize=1})"
+DFD: train random forest in parallel : 10.527251 samples/sec; 0.037997 sec/tree, 18.998313 sec
+DFD: updateOutput : 32.442950 samples/sec; 9.863472 sec
+```
+
+to
+
+```
+th -e "dt = require 'decisiontree'; dt.benchmark({'DFD'}, {nTree=500,maxLeafNodes=8,minLeafSize=1})"
+DFD: train random forest in parallel : 10.839547 samples/sec; 0.036902 sec/tree, 18.450956 sec
+DFD: updateOutput : 359.158353 samples/sec; 0.890975 sec
+Sparse2Dense: updateOutput : 15395.648952 samples/sec; 0.020791 sec
+```
+
+That is a 10x speedup for `nn.DFD`.
+
+The patch also adds a benchmark for `nn.Sparse2Dense`:
+
+```
+th -e "dt = require 'decisiontree'; dt.benchmark({'Sparse2Dense'}, {nTree=500,maxLeafNodes=8,minLeafSize=1})"
+Sparse2Dense: updateOutput : 17158.126406 samples/sec; 0.018653 sec
+```
+
+Indeed, `nn.Sparse2Dense` is not the bottleneck; `nn.DFD` is.
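+
+For context, `nn.DFD` (decision forest discretizer) maps each input row through
+every tree of a trained forest and emits the ids of the traversed nodes as sparse
+features. A hedged usage sketch, with `forest` standing in for a trained
+`dt.DecisionForest` and the constructor/forward conventions assumed:
+
+```lua
+local dt = require 'decisiontree'
+
+local input = torch.randn(16, 1300)   -- batchsize x nFeature, as in the benchmarks
+local dfd = nn.DFD(forest)            -- 'forest': a trained dt.DecisionForest
+
+-- output: sparse node-id features, one entry per node traversed per tree
+local sparseNodeFeatures = dfd:forward(input)
+```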
+
+## Patch 5:
+
+This patch improves `nn.DFD` inference from
+
+```
+for i in `seq 3`; do th -e "dt = require 'decisiontree'; dt.benchmark({'DFD'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,batchsize=16,nActive=1200,nFeature=1300,nloop=100})"; done
+DFD: train random forest in parallel : 8.452295 samples/sec; 0.047324 sec/tree, 23.662212 sec
+DFD: updateOutput : 176.617872 samples/sec; 9.059109 sec
+DFD: train random forest in parallel : 8.350019 samples/sec; 0.047904 sec/tree, 23.952042 sec
+DFD: updateOutput : 183.508204 samples/sec; 8.718962 sec
+DFD: train random forest in parallel : 8.525779 samples/sec; 0.046917 sec/tree, 23.458266 sec
+DFD: updateOutput : 178.877077 samples/sec; 8.944692 sec
+```
+
+to
+
+```
+for i in `seq 3`; do th -e "dt = require 'decisiontree'; dt.benchmark({'DFD'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,batchsize=16,nActive=1200,nFeature=1300,nloop=100})"; done
+DFD: train random forest in parallel : 8.434502 samples/sec; 0.047424 sec/tree, 23.712129 sec
+DFD: updateOutput : 6479.597179 samples/sec; 0.246933 sec
+DFD: train random forest in parallel : 8.334543 samples/sec; 0.047993 sec/tree, 23.996518 sec
+DFD: updateOutput : 6663.641184 samples/sec; 0.240114 sec
+DFD: train random forest in parallel : 8.353265 samples/sec; 0.047885 sec/tree, 23.942735 sec
+DFD: updateOutput : 6882.607456 samples/sec; 0.232475 sec
+```
+
+That is a 37x speedup for `nn.DFD`.
+
+## Patch 6:
+
+This patch improves `nn.DFD` from the previous result to
+
+```
+for i in `seq 5`; do th -e "dt = require 'decisiontree'; dt.benchmark({'DFD'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,batchsize=16,nActive=1200,nFeature=1300,nloop=10000})"; done
+DFD: train random forest in parallel : 8.353504 samples/sec; 0.047884 sec/tree, 23.942050 sec
+DFD: updateOutput : 91967.342339 samples/sec; 1.739753 sec
+DFD: train random forest in parallel : 8.528141 samples/sec; 0.046904 sec/tree, 23.451770 sec
+DFD: updateOutput : 91405.321702 samples/sec; 1.750451 sec
+DFD: train random forest in parallel : 8.184562 samples/sec; 0.048872 sec/tree, 24.436250 sec
+DFD: updateOutput : 91623.388867 samples/sec; 1.746284 sec
+DFD: train random forest in parallel : 8.779561 samples/sec; 0.045560 sec/tree, 22.780182 sec
+DFD: updateOutput : 93914.242852 samples/sec; 1.703686 sec
+DFD: train random forest in parallel : 8.636201 samples/sec; 0.046317 sec/tree, 23.158330 sec
+DFD: updateOutput : 94092.241963 samples/sec; 1.700465 sec
+```
+
+That is another 13.8x speedup.
+
+## Patch 7:
+
+This patch improves `nn.Sparse2Dense` computation from
+
+```
+for i in `seq 3`; do th -e "dt = require 'decisiontree'; torch.setdefaulttensortype('torch.FloatTensor'); dt.benchmark({'Sparse2Dense'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,nFeature=1500,nActive=1300,nloop=1000})"; done
+Sparse2Dense: updateOutput : 1103.570777 samples/sec; 28.996786 sec
+Sparse2Dense: updateOutput : 1092.064331 samples/sec; 29.302309 sec
+Sparse2Dense: updateOutput : 1036.963572 samples/sec; 30.859334 sec
+```
+
+to
+
+```
+for i in `seq 3`; do th -e "dt = require 'decisiontree'; torch.setdefaulttensortype('torch.FloatTensor'); dt.benchmark({'Sparse2Dense'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,nFeature=1500,nActive=1300,nloop=1000})"; done
+Sparse2Dense: updateOutput : 62995.834470 samples/sec; 0.507978 sec
+Sparse2Dense: updateOutput : 62471.568253 samples/sec; 0.512242 sec
+Sparse2Dense: updateOutput : 62965.099331 samples/sec; 0.508226 sec
+```
+
+This represents a speedup of about 57x.
+
+## Patch 8:
+
+This patch improves `nn.Sparse2Dense` from the previous result to
+
+```
+for i in `seq 3`; do th -e "dt = require 'decisiontree'; torch.setdefaulttensortype('torch.FloatTensor'); dt.benchmark({'Sparse2Dense'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,nFeature=1500,nActive=1300,nloop=1000})"; done
+Sparse2Dense: updateOutput : 124268.079914 samples/sec; 0.257515 sec
+Sparse2Dense: updateOutput : 114750.039542 samples/sec; 0.278873 sec
+Sparse2Dense: updateOutput : 122863.314766 samples/sec; 0.260458 sec
+```
+
+which corresponds to another 1.95x speedup.
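+
+`nn.Sparse2Dense` itself does what its name says: given the list of feature ids to
+materialize, it scatters sparse (key, value) pairs into a dense tensor (the kernel
+is `generic/S2D.c`). A hedged sketch, with the forward input layout assumed:
+
+```lua
+local dt = require 'decisiontree'
+
+local features = torch.LongTensor{10, 20, 30}  -- columns of the dense output
+local s2d = nn.Sparse2Dense(features)
+
+-- one sparse example: feature 20 -> 0.5, feature 30 -> 1.0; keys not listed in
+-- 'features' are simply skipped (the C kernel masks them out)
+local keys   = torch.LongTensor{20, 30}
+local values = torch.Tensor{0.5, 1.0}
+local dense  = s2d:forward({keys, values})     -- 1 x 3 row: {0, 0.5, 1.0}
+```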
+
+## Patch 9:
+
+This patch moves the core of GBDT training, which used to be a big bottleneck, to C. It also
+performs small optimizations across the board (faster scoring, faster branching, ...) that provide a
+little more performance.
+
+The original commit had this performance:
+
+```
+th -e "dt = require 'decisiontree'; torch.setdefaulttensortype('torch.FloatTensor'); dt.benchmark({'GradientBoostTrainer'}, {nExample=100000, sparse=false, nFeature=836, nTree=5, downsampleRatio=1, minLeafSize=1000, maxLeafNodes=8})"
+GradientBoostTrainer: train single-thread : 500.414666 samples/sec; 39.966854 sec/tree, 199.834271 sec
+GradientBoostTrainer: train feature-parallel : 1227.228044 samples/sec; 16.296890 sec/tree, 81.484448 sec (4 threads)
+GradientBoostTrainer: train feature-parallel : 1385.926280 samples/sec; 14.430782 sec/tree, 72.153910 sec (8 threads)
+```
+
+and the new version has
+
+```
+GradientBoostTrainer: train single-thread : 15285.644631 samples/sec; 1.308417 sec/tree, 6.542086 sec
+GradientBoostTrainer: train feature-parallel : 43170.435932 samples/sec; 0.463280 sec/tree, 2.316400 sec (4 threads)
+GradientBoostTrainer: train feature-parallel : 50062.681239 samples/sec; 0.399499 sec/tree, 1.997496 sec (8 threads)
+```
+
+That represents a speedup of about 30.5x over the baseline for 1 thread and 36.1x for 8 threads.
+Note that the performance doesn't increase much as we increase the number of threads: we use
+feature parallelism, and due to bagging only a small number of features (29 in this case) is
+evaluated per node, so with 8 threads each thread gets only about 4 features' worth of work.
+If we disable bagging, then we have the following result with 8 threads and the new code:
+
+```
+GradientBoostTrainer: train single-thread : 590.823965 samples/sec; 33.851030 sec/tree, 169.255152 sec
+GradientBoostTrainer: train feature-parallel : 3232.188576 samples/sec; 6.187758 sec/tree, 30.938789 sec
+```
+
+So processing 836 features now is much faster than processing 29 was before.
diff --git a/contrib/torch/decisiontree/error.h b/contrib/torch/decisiontree/error.h
new file mode 100644 (file)
index 0000000..18df3c9
--- /dev/null
@@ -0,0 +1,24 @@
+#ifndef _ERROR_H_
+#define _ERROR_H_
+
+#include "luaT.h"
+#include <string.h>
+
+static inline int _lua_error(lua_State *L, int ret, const char* file, int line) {
+   int pos_ret = ret >= 0 ? ret : -ret;
+   return luaL_error(L, "ERROR: (%s, %d): (%d, %s)\n", file, line, pos_ret, strerror(pos_ret));
+}
+
+static inline int _lua_error_str(lua_State *L, const char *str, const char* file, int line) {
+   return luaL_error(L, "ERROR: (%s, %d): (%s)\n", file, line, str);
+}
+
+static inline int _lua_error_str_str(lua_State *L, const char *str, const char* file, int line, const char *extra) {
+   return luaL_error(L, "ERROR: (%s, %d): (%s: %s)\n", file, line, str, extra);
+}
+
+#define LUA_HANDLE_ERROR(L, ret) _lua_error(L, ret, __FILE__, __LINE__)
+#define LUA_HANDLE_ERROR_STR(L, str) _lua_error_str(L, str, __FILE__, __LINE__)
+#define LUA_HANDLE_ERROR_STR_STR(L, str, extra) _lua_error_str_str(L, str, __FILE__, __LINE__, extra)
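+
+// Example (illustrative), as used by the callers in generic/GBDT.c:
+//   if (!THTensor_(isContiguous)(grad))
+//     return LUA_HANDLE_ERROR_STR(L, "grad has to be contiguous");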
+
+#endif
diff --git a/contrib/torch/decisiontree/generic/CartTree.c b/contrib/torch/decisiontree/generic/CartTree.c
new file mode 100644 (file)
index 0000000..eb29fcf
--- /dev/null
@@ -0,0 +1,88 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/CartTree.c"
+#else
+
+static int nn_(tree_fast_score)(lua_State *L) {
+  THTensor *input = luaT_checkudata(L, 1, torch_Tensor);
+  THTensor *score = luaT_checkudata(L, 3, torch_Tensor);
+  long n_samples = THTensor_(size)(input, 0);
+  long n_features = THTensor_(size)(input, 1);
+  THTensor_(resize1d)(score, n_samples);
+  real *input_data = THTensor_(data)(input);
+  real *score_data = THTensor_(data)(score);
+
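+  // interns the node-table keys once and records their absolute stack slots:
+  // the three arguments occupy slots 1-3, so these five strings land at 4-8,
+  // and each loop iteration can reuse them via lua_pushvalue + lua_rawget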
+  lua_pushstring(L, "leftChild");
+  const int left_child_string = 4;
+  lua_pushstring(L, "rightChild");
+  const int right_child_string = 5;
+  lua_pushstring(L, "score");
+  const int score_string = 6;
+  lua_pushstring(L, "splitFeatureId");
+  const int id_string = 7;
+  lua_pushstring(L, "splitFeatureValue");
+  const int value_string = 8;
+
+  const int original_top = lua_gettop(L);
+  for (long i = 0; i < n_samples; i++) {
+    int node = 2;
+    while (1) {
+      int current_top = lua_gettop(L);
+      lua_pushvalue(L, left_child_string);
+      lua_rawget(L, node);
+      lua_pushvalue(L, right_child_string);
+      lua_rawget(L, node);
+      if (lua_isnil(L, -2) && lua_isnil(L, -1)) {
+        lua_pushvalue(L, score_string);
+        lua_rawget(L, node);
+        score_data[i] = lua_tonumber(L, -1);
+        break;
+      }
+      if (lua_isnil(L, -2)) {
+        // go to right
+        node = current_top + 2;
+        continue;
+      }
+      if (lua_isnil(L, -1)) {
+        // go to left
+        node = current_top + 1;
+        continue;
+      }
+      lua_pushvalue(L, id_string);
+      lua_rawget(L, node);
+      lua_pushvalue(L, value_string);
+      lua_rawget(L, node);
+      long feature_id = lua_tointeger(L, -2);
+      real feature_value = lua_tonumber(L, -1);
+
+      real current_value = input_data[i * n_features + (feature_id-1)];
+      if (current_value < feature_value) {
+        // go to left
+        node = current_top + 1;
+      }
+      else {
+        // go to right
+        node = current_top + 2;
+      }
+    }
+    lua_pop(L, lua_gettop(L) - original_top);
+  }
+
+  lua_pop(L, 5);
+
+  lua_pushvalue(L, 3);
+  return 1;
+}
+
+static const struct luaL_Reg nn_(CT__) [] = {
+  {"CartTreeFastScore", nn_(tree_fast_score)},
+  {NULL, NULL}
+};
+
+static void nn_(CT_init)(lua_State *L)
+{
+  luaT_pushmetatable(L, torch_Tensor);
+  luaT_registeratname(L, nn_(CT__), "nn");
+  lua_pop(L,1);
+}
+
+#endif
diff --git a/contrib/torch/decisiontree/generic/DFD.c b/contrib/torch/decisiontree/generic/DFD.c
new file mode 100644 (file)
index 0000000..599c4d7
--- /dev/null
@@ -0,0 +1,157 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/DFD.c"
+#else
+
+static int nn_(DFD_computeOutput)(lua_State *L) {
+  THLongTensor *outputkeys =       luaT_checkudata(L, 1, "torch.LongTensor");
+  THTensor *outputvalues =         luaT_checkudata(L, 2, torch_Tensor);
+  THLongTensor *root_ids =         luaT_checkudata(L, 3, "torch.LongTensor");
+  THLongTensor *left_child =       luaT_checkudata(L, 4, "torch.LongTensor");
+  THLongTensor *right_child =      luaT_checkudata(L, 5, "torch.LongTensor");
+  THLongTensor *split_feature_id = luaT_checkudata(L, 6, "torch.LongTensor");
+  THTensor *split_feature_value =  luaT_checkudata(L, 7, torch_Tensor);
+  THTensor *input =                luaT_checkudata(L, 8, torch_Tensor);
+  char only_last_node =            lua_toboolean(L, 9);
+
+  // gets some important sizes from the input
+  long batch_size = THTensor_(size)(input, 0);
+  long input_size = THTensor_(size)(input, 1);
+  long roots_size = THLongTensor_size(root_ids, 0);
+  long depth = THLongTensor_size(outputkeys, 1);
+
+  // keeps track of the number of nodes traversed in the trees by each sample.
+  // each traversed node maps to an output feature having a value of 1
+  long outputsize[batch_size];
+  for (long i = 0; i < batch_size; i++)
+    outputsize[i] = 0;
+
+  // gets direct pointers to the memory of each tensor for efficiency
+  long *root_ids_data = THLongTensor_data(root_ids);
+  long *left_child_data = THLongTensor_data(left_child);
+  long *right_child_data = THLongTensor_data(right_child);
+  real *split_feature_value_data = THTensor_(data)(split_feature_value);
+  long *split_feature_id_data = THLongTensor_data(split_feature_id);
+  long *outputkeys_data = THLongTensor_data(outputkeys);
+  real *input_data = THTensor_(data)(input);
+
+  // for each sample in the batch
+  for (long sample_index = 0; sample_index < batch_size; sample_index++) {
+    // gets pointers to the direct memory associated with each sample for efficiency
+    const long outputkeys_offset = sample_index * depth;
+    const long input_offset = sample_index * input_size;
+    long *local_outputkeys_data = &outputkeys_data[outputkeys_offset];
+    real *local_input_data = &input_data[input_offset];
+
+    // for each tree in the forest
+    for (long i = 0; i < roots_size; i++) {
+      int root = 1;
+      long node_id = root_ids_data[i];
+
+      // traverses the whole tree keeping track of which nodes were seen
+      while (1) {
+        if (root) {
+          // root nodes aren't added to output because they are always traversed
+          root = 0;
+        }
+        else if (!only_last_node) {
+          // records the traversed node as an output feature for this sample
+          // and bumps the per-sample output size
+          long output_index = outputsize[sample_index];
+          outputsize[sample_index]++;
+          local_outputkeys_data[output_index] = node_id;
+        }
+
+        // gets the left and right children; non-positive values represent a missing node
+        long left_id = left_child_data[node_id-1];
+        long right_id = right_child_data[node_id-1];
+
+        if (left_id <= 0 && right_id <= 0) {
+          if (only_last_node) {
+            long output_index = outputsize[sample_index];
+            outputsize[sample_index]++;
+            local_outputkeys_data[output_index] = node_id;
+          }
+          // if no children, stops
+          break;
+        }
+        else if (left_id <= 0) {
+          // if no left child, traverses right node
+          node_id = right_id;
+        }
+        else if (right_id <= 0) {
+          // if no right child, traverses left node
+          node_id = left_id;
+        }
+        else {
+          // if both left and right children, finds the direction for this sample
+          // first get the reference from the node
+          real split_value = split_feature_value_data[node_id-1];
+          long split_id = split_feature_id_data[node_id-1]-1;
+
+          // then gets the value of the sample
+          real node_value = local_input_data[split_id];
+          // and branches
+          if (node_value < split_value)
+            node_id = left_id;
+          else
+            node_id = right_id;
+        }
+      }
+    }
+  }
+
+  // now that we know which nodes were traversed for each sample, we can create the sparse output:
+  // for each sample, a (keys, values) pair narrowed to the nodes it actually visited
+  THTensor *input_feature = THTensor_(new)();
+  THLongTensor *indices = THLongTensor_new();
+
+  // pushes the return table with 2 children tables
+  lua_newtable(L);
+  lua_pushinteger(L, 1);
+  lua_newtable(L);
+  lua_pushinteger(L, 2);
+  lua_newtable(L);
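+  // stack after these pushes (top to bottom): values table, 2, keys table, 1, result table;
+  // inside the loop each pair of pushes puts the keys table at -5 and the values table at -3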
+
+  // for each sample...
+  for (long i = 0; i < batch_size; i++) {
+    long j = outputsize[i];
+    // selects the tensor lines from the dense output
+    THLongTensor_select(indices, outputkeys, 0, i);
+    THTensor_(select)(input_feature, outputvalues, 0, i);
+
+    // narrows the keys to actual number of nodes traversed and saves to the output
+    lua_pushinteger(L, i+1);
+    luaT_pushudata(L, THLongTensor_newNarrow(indices, 0, 0, j), "torch.LongTensor");
+    lua_settable(L, -5);
+
+    // and narrows the values
+    lua_pushinteger(L, i+1);
+    luaT_pushudata(L, THTensor_(newNarrow)(input_feature, 0, 0, j), torch_Tensor);
+    lua_settable(L, -3);
+  }
+
+  // pushes the two parts of the output into the output table
+  lua_settable(L, -5);
+  lua_settable(L, -3);
+
+  THLongTensor_free(indices);
+  THTensor_(free)(input_feature);
+
+  return 1;
+}
+
+static const struct luaL_Reg nn_(DFD__) [] = {
+  {"DFD_computeOutput", nn_(DFD_computeOutput)},
+  {NULL, NULL}
+};
+
+static void nn_(DFD_init)(lua_State *L)
+{
+  luaT_pushmetatable(L, torch_Tensor);
+  luaT_registeratname(L, nn_(DFD__), "nn");
+  lua_pop(L,1);
+}
+
+#endif
diff --git a/contrib/torch/decisiontree/generic/GBDT.c b/contrib/torch/decisiontree/generic/GBDT.c
new file mode 100644 (file)
index 0000000..31f5b02
--- /dev/null
@@ -0,0 +1,392 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/GBDT.c"
+#else
+
+#include "GBDT_internal.h"
+#include "GBDT_internal.c"
+
+// note that each one of the functions to find the best split is a subset of the next.
+// first we have one that can only evaluate a single feature, using the logic in lua to control the
+// features.
+// then we have one that can go over a shard of features, following the feature parallelism
+// introduced by the lua logic.
+// and finally we have one that performs the feature parallelism itself in the special case of
+// dense tensors.
+// these functions are provided for completeness and to test against in case the logic is to be changed
+
+// finds the best split for a given node and feature
+static int nn_(gb_findBestFeatureSplit)(lua_State *L) {
+  THLongTensor *exampleIds = luaT_checkudata(L, 1, "torch.LongTensor");
+  const int dataset_index = 2;
+  if (!lua_isnumber(L, 3))
+    return LUA_HANDLE_ERROR_STR(L, "third argument should be an integer");
+  long feature_id = lua_tointeger(L, 3);
+  if (!lua_isnumber(L, 4))
+    return LUA_HANDLE_ERROR_STR(L, "fourth argument should be an integer");
+  long minLeafSize = lua_tointeger(L, 4);
+  // Since minLeafSize == 1 corresponds to each sample in its own leaf, any value below it doesn't
+  // make sense
+  if (minLeafSize < 1)
+    minLeafSize = 1;
+  THTensor *grad = luaT_checkudata(L, 5, torch_Tensor);
+  THTensor *hess = luaT_checkudata(L, 6, torch_Tensor);
+
+  if (!THLongTensor_isContiguous(exampleIds))
+    return LUA_HANDLE_ERROR_STR(L, "exampleIds has to be contiguous");
+  if (!THTensor_(isContiguous)(grad))
+    return LUA_HANDLE_ERROR_STR(L, "grad has to be contiguous");
+  if (!THTensor_(isContiguous)(hess))
+    return LUA_HANDLE_ERROR_STR(L, "hessian has to be contiguous");
+
+  // initializes the static data
+  nn_(GBInitialization) initialization_data;
+  nn_(gb_initialize)(L, &initialization_data, exampleIds, grad, hess, dataset_index);
+
+  // initializes the dynamic data
+  GBRunData run_data;
+  gb_create_run_data(&run_data, minLeafSize);
+
+  // finds the best state possible for the split
+  nn_(GBBestState) bs;
+  nn_(gb_find_best_feature_split)(L, &initialization_data, &bs, feature_id, &run_data);
+
+  lua_pop(L, lua_gettop(L) - initialization_data.splitInfo_index);
+
+  // fills the table with the best split found; the lua logic above will do everything else.
+  // if no state was found, returns nil
+  if (bs.valid_state == 0) {
+    lua_pop(L, 1);
+    lua_pushnil(L);
+  }
+  else {
+    nn_(gb_internal_split_info)(L, &bs, initialization_data.splitInfo_index);
+  }
+
+  gb_destroy_run_data(&run_data);
+
+  return 1;
+}
+
+// finds the best split for a given node and shard of features
+// this is more efficient than calling the previous one multiple times
+static int nn_(gb_findBestSplit)(lua_State *L) {
+  THLongTensor *exampleIds = luaT_checkudata(L, 1, "torch.LongTensor");
+  const int dataset_index = 2;
+  THLongTensor *feature_ids = luaT_checkudata(L, 3, "torch.LongTensor");
+  if (!lua_isnumber(L, 4))
+    return LUA_HANDLE_ERROR_STR(L, "fourth argument should be an integer");
+  long minLeafSize = lua_tointeger(L, 4);
+  // Since minLeafSize == 1 corresponds to each sample in its own leaf, any value below it doesn't
+  // make sense
+  if (minLeafSize < 1)
+    minLeafSize = 1;
+  if (!lua_isnumber(L, 5))
+    return LUA_HANDLE_ERROR_STR(L, "fifth argument should be an integer");
+  long shardId = lua_tointeger(L, 5);
+  if (!lua_isnumber(L, 6))
+    return LUA_HANDLE_ERROR_STR(L, "sixth argument should be an integer");
+  long nShard = lua_tointeger(L, 6);
+  THTensor *grad = luaT_checkudata(L, 7, torch_Tensor);
+  THTensor *hess = luaT_checkudata(L, 8, torch_Tensor);
+
+  if (!THLongTensor_isContiguous(exampleIds))
+    return LUA_HANDLE_ERROR_STR(L, "exampleIds has to be contiguous");
+  if (!THTensor_(isContiguous)(grad))
+    return LUA_HANDLE_ERROR_STR(L, "grad has to be contiguous");
+  if (!THTensor_(isContiguous)(hess))
+    return LUA_HANDLE_ERROR_STR(L, "hessian has to be contiguous");
+
+  // initializes the static data
+  nn_(GBInitialization) initialization_data;
+  nn_(gb_initialize)(L, &initialization_data, exampleIds, grad, hess, dataset_index);
+
+  // initializes the dynamic data
+  GBRunData run_data;
+  gb_create_run_data(&run_data, minLeafSize);
+
+  // initializes to evaluate all the features in this shard
+  nn_(GBBestState) global_bs;
+  global_bs.valid_state = 0;
+  long n_features = THLongTensor_size(feature_ids, 0);
+  if (!THLongTensor_isContiguous(feature_ids))
+    return LUA_HANDLE_ERROR_STR(L, "feature_ids must be contiguous");
+  long *feature_ids_data = THLongTensor_data(feature_ids);
+
+  // for every feature
+  for (long i = 0; i < n_features; i++) {
+    long feature_id = feature_ids_data[i];
+    // if we are responsible for it
+    if (nShard <= 1 || (feature_id % nShard) + 1 == shardId) {
+      // finds the best state possible for the split
+      nn_(GBBestState) bs;
+      nn_(gb_find_best_feature_split)(L, &initialization_data, &bs, feature_id, &run_data);
+
+      // if it's valid and better than one we found before, saves it
+      if (bs.valid_state) {
+        if (global_bs.valid_state == 0 || bs.gain < global_bs.gain) {
+          global_bs = bs;
+        }
+      }
+    }
+  }
+
+  lua_pop(L, lua_gettop(L) - initialization_data.splitInfo_index);
+
+  // fills the table with the best split found; the lua logic above will do everything else.
+  // if no state was found, returns nil
+  if (global_bs.valid_state == 0) {
+    lua_pop(L, 1);
+    lua_pushnil(L);
+  }
+  else {
+    nn_(gb_internal_split_info)(L, &global_bs, initialization_data.splitInfo_index);
+  }
+
+  gb_destroy_run_data(&run_data);
+
+  return 1;
+}
+
+// all the info we have to pass to the slave threads so that they can do their jobs.
+// note that we do not pass the lua state since it isn't required: we perform direct C parallelism
+// instead of using lua's parallelism as in the previous version
+typedef struct {
+  nn_(GBInitialization) *initialization_data;
+  GBRunData *run_data;
+  long *index;
+  nn_(GBBestState) *global_bs;
+  long n_features;
+  long *feature_ids_data;
+  pthread_mutex_t *mutex;
+  THLongTensor *exampleIds;
+  THTensor *input;
+  THLongTensor **sorted_ids_per_feature;
+} nn_(ThreadInfo);
+
+// loops over all the features in parallel and finds the best global split
+static void* nn_(thread_worker)(void *arg) {
+  nn_(ThreadInfo) *info = (nn_(ThreadInfo) *)arg;
+
+  while (1) {
+    pthread_mutex_lock(info->mutex);
+    long index = (*info->index);
+    (*info->index)++;
+    pthread_mutex_unlock(info->mutex);
+
+    if (index >= info->n_features)
+      break;
+
+    // performs part of steps (1) and (2) of gb_find_best_feature_split, using data pre-loaded
+    // outside the threads so that the lua state doesn't have to be accessed here
+    long feature_id = info->feature_ids_data[index];
+    THLongTensor *exampleIdsWithFeature_ret = info->exampleIds;
+    THLongTensor *featureExampleIds = info->sorted_ids_per_feature[index];
+    nn_(GBInitialization) *initialization_data = info->initialization_data;
+    GBRunData *run_data = info->run_data;
+
+    // performs steps (3) and (4) of gb_find_best_feature_split since (1) and (2) were already
+    // performed before
+    nn_(GBBestState) bs;
+    nn_(gb_internal_create)(initialization_data->grad, initialization_data->hess,
+        exampleIdsWithFeature_ret, &bs.state);
+    nn_(gb_internal_get_best_split_special)(&bs, featureExampleIds, run_data->exampleMap,
+        info->input, run_data->minLeafSize, feature_id);
+
+    // saves to the global state if it's better
+    if (bs.valid_state) {
+      pthread_mutex_lock(info->mutex);
+      if (info->global_bs->valid_state == 0 || bs.gain < info->global_bs->gain) {
+        (*info->global_bs) = bs;
+      }
+      pthread_mutex_unlock(info->mutex);
+    }
+  }
+
+  return NULL;
+}
+
+// finds the global best split by doing feature parallelism directly in C
+static int nn_(gb_findBestSplitFP)(lua_State *L) {
+  THLongTensor *exampleIds = luaT_checkudata(L, 1, "torch.LongTensor");
+  const int dataset_index = 2;
+  THLongTensor *feature_ids = luaT_checkudata(L, 3, "torch.LongTensor");
+  if (!lua_isnumber(L, 4))
+    return LUA_HANDLE_ERROR_STR(L, "fourth argument should be an integer");
+  long minLeafSize = lua_tointeger(L, 4);
+  THTensor *grad = luaT_checkudata(L, 5, torch_Tensor);
+  THTensor *hess = luaT_checkudata(L, 6, torch_Tensor);
+  if (!lua_isnumber(L, 7))
+    return LUA_HANDLE_ERROR_STR(L, "seventh argument should be an integer");
+  long nThread = lua_tointeger(L, 7);
+
+  if (!THLongTensor_isContiguous(exampleIds))
+    return LUA_HANDLE_ERROR_STR(L, "exampleIds has to be contiguous");
+  if (!THTensor_(isContiguous)(grad))
+    return LUA_HANDLE_ERROR_STR(L, "grad has to be contiguous");
+  if (!THTensor_(isContiguous)(hess))
+    return LUA_HANDLE_ERROR_STR(L, "hessian has to be contiguous");
+
+  pthread_mutex_t mutex;
+  pthread_mutex_init(&mutex, NULL);
+
+  // initializes the static data
+  nn_(GBInitialization) initialization_data;
+  nn_(gb_initialize)(L, &initialization_data, exampleIds, grad, hess, dataset_index);
+
+  // initializes the dynamic data
+  GBRunData run_data;
+  gb_create_run_data(&run_data, minLeafSize);
+
+  // initializes to evaluate all the features
+  nn_(GBBestState) global_bs;
+  global_bs.valid_state = 0;
+  long n_features = THLongTensor_size(feature_ids, 0);
+  if (!THLongTensor_isContiguous(feature_ids))
+    return LUA_HANDLE_ERROR_STR(L, "feature_ids must be contiguous");
+  long *feature_ids_data = THLongTensor_data(feature_ids);
+
+  THTensor *input = luaT_checkudata(L, initialization_data.input_index, torch_Tensor);
+
+  // performs step (1) of gb_find_best_feature_split so that we don't have to pass the lua state
+  THLongTensor *sorted_ids_per_feature[n_features];
+  for (long i = 0; i < n_features; i++) {
+    long feature_id = feature_ids_data[i];
+    lua_pushvalue(L, initialization_data.getSortedFeature_index);
+    lua_pushvalue(L, initialization_data.dataset_index);
+    lua_pushinteger(L, feature_id);
+    lua_call(L, 2, 1);
+
+    THLongTensor *featureExampleIds = luaT_checkudata(L, -1, "torch.LongTensor");
+    sorted_ids_per_feature[i] = featureExampleIds;
+  }
+
+  // performs step (2) of gb_find_best_feature_split since it's the same for all features when the
+  // data is dense
+  long exampleIds_size = THLongTensor_size(initialization_data.exampleIds, 0);
+  long *exampleIds_data = THLongTensor_data(initialization_data.exampleIds);
+
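+  // pre-sizes the hash set well above the key count to keep khash's load factor
+  // low, then registers every example this node must evaluate; the worker threads
+  // only read this map, so no locking is needed once it is built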
+  int ret;
+  kh_resize(long, run_data.exampleMap, exampleIds_size*8);
+  for (long i = 0; i < exampleIds_size; i++)
+    kh_put(long, run_data.exampleMap, exampleIds_data[i], &ret);
+
+  // saves the info for the threads
+  long index = 0;
+  nn_(ThreadInfo) info;
+  info.initialization_data = &initialization_data;
+  info.run_data = &run_data;
+  info.index = &index;
+  info.global_bs = &global_bs;
+  info.n_features = n_features;
+  info.feature_ids_data = feature_ids_data;
+  info.mutex = &mutex;
+  info.exampleIds = exampleIds;
+  info.input = input;
+  info.sorted_ids_per_feature = sorted_ids_per_feature;
+
+  pthread_t threads[nThread];
+
+  // let the threads run like crazy over the features to find the minimum
+  for (long i = 0; i < nThread; i++) {
+    int ret = pthread_create(&threads[i], NULL, nn_(thread_worker), &info);
+    if (ret)
+      return LUA_HANDLE_ERROR_STR(L, "falied to create thread");
+  }
+
+  for (long i = 0; i < nThread; i++) {
+    int ret = pthread_join(threads[i], NULL);
+    if (ret)
+      return LUA_HANDLE_ERROR_STR(L, "failed to join thread");
+  }
+
+  lua_pop(L, lua_gettop(L) - initialization_data.splitInfo_index);
+
+  // fills the table with the best split found; the lua logic above will do everything else.
+  // if no state was found, returns nil
+  if (global_bs.valid_state == 0) {
+    lua_pop(L, 1);
+    lua_pushnil(L);
+  }
+  else {
+    nn_(gb_internal_split_info)(L, &global_bs, initialization_data.splitInfo_index);
+  }
+
+  gb_destroy_run_data(&run_data);
+  pthread_mutex_destroy(&mutex);
+
+  return 1;
+}
+
+// performs an efficient branching of the current examples based on the split info provided
+static int nn_(gb_branch)(lua_State *L) {
+  if (!lua_istable(L, 1))
+    return LUA_HANDLE_ERROR_STR(L, "first argument must be a table");
+  THTensor *input = luaT_checkudata(L, 2, torch_Tensor);
+  THLongTensor *exampleIds = luaT_checkudata(L, 3, "torch.LongTensor");
+
+  // gets direct access to the dataset
+  long n_exampleIds = THLongTensor_size(exampleIds, 0);
+  long *exampleIds_data = THLongTensor_data(exampleIds);
+  long n_features = THTensor_(size)(input, 1);
+  real *input_data = THTensor_(data)(input);
+
+  // creates the tensors to be returned
+  luaT_pushudata(L, THLongTensor_new(), "torch.LongTensor");
+  luaT_pushudata(L, THLongTensor_new(), "torch.LongTensor");
+  THLongTensor *leftExampleIds = luaT_checkudata(L, 4, "torch.LongTensor");
+  THLongTensor *rightExampleIds = luaT_checkudata(L, 5, "torch.LongTensor");
+  THLongTensor_resize1d(leftExampleIds, n_exampleIds);
+
+  // gets direct access to the examples
+  THLongTensor *splitExampleIds = leftExampleIds;
+  long *splitExampleIds_data = THLongTensor_data(splitExampleIds);
+
+  // gets the split info
+  lua_pushstring(L, "splitId");
+  lua_rawget(L, 1);
+  const long splitId = lua_tointeger(L, -1);
+  lua_pushstring(L, "splitValue");
+  lua_rawget(L, 1);
+  const real splitValue = lua_tonumber(L, -1);
+  lua_pop(L, 2);
+
+  long leftIdx = 0, rightIdx = 0;
+
+  // goes over all the samples dividing them into the two sides
+  for (long i = 0; i < n_exampleIds; i++) {
+    long exampleId = exampleIds_data[i];
+    real val = input_data[(exampleId-1) * n_features + (splitId - 1)];
+    if (val <= splitValue) {
+      leftIdx++;
+      splitExampleIds_data[leftIdx-1] = exampleId;
+    }
+    else {
+      rightIdx++;
+      splitExampleIds_data[n_exampleIds - rightIdx + 1 - 1] = exampleId;
+    }
+  }
+
+  // once done, the resulting tensors are just two views over the same sample base. this is more
+  // efficient than allocating 2 tensors, since we don't know in advance how many examples go to
+  // each side, only that the total is constant
+  THLongTensor_narrow(rightExampleIds, splitExampleIds, 0, n_exampleIds-rightIdx+1-1, rightIdx);
+  THLongTensor_narrow(leftExampleIds, splitExampleIds, 0, 0, leftIdx);
+  return 2;
+}
+
+static const struct luaL_Reg nn_(GBDT__) [] = {
+  {"GBDT_findBestFeatureSplit", nn_(gb_findBestFeatureSplit)},
+  {"GBDT_findBestSplit", nn_(gb_findBestSplit)},
+  {"GBDT_findBestSplitFP", nn_(gb_findBestSplitFP)},
+  {"GBDT_branch", nn_(gb_branch)},
+  {NULL, NULL}
+};
+
+static void nn_(GBDT_init)(lua_State *L)
+{
+  luaT_pushmetatable(L, torch_Tensor);
+  luaT_registeratname(L, nn_(GBDT__), "nn");
+  lua_pop(L,1);
+}
+
+#endif
diff --git a/contrib/torch/decisiontree/generic/GBDT_internal.c b/contrib/torch/decisiontree/generic/GBDT_internal.c
new file mode 100644 (file)
index 0000000..739aabf
--- /dev/null
@@ -0,0 +1,312 @@
+// initializes the optimization structure based on the arguments provided, either filling directly
+// or making calls to lua to load some kind of data
+static void nn_(gb_initialize)(lua_State *L, nn_(GBInitialization) *initialization_data,
+    THLongTensor *exampleIds, THTensor *grad, THTensor *hess, int dataset_index) {
+  initialization_data->dataset_index = dataset_index;
+  initialization_data->exampleIds = exampleIds;
+  initialization_data->grad = grad;
+  initialization_data->hess = hess;
+
+  lua_newtable(L);
+  initialization_data->splitInfo_index = lua_gettop(L);
+
+  lua_pushstring(L, "input");
+  lua_gettable(L, dataset_index);
+  initialization_data->input_index = lua_gettop(L);
+
+  lua_pushstring(L, "getSortedFeature");
+  lua_gettable(L, dataset_index);
+  initialization_data->getSortedFeature_index = lua_gettop(L);
+}
+
+// initializes a state that will be passed to the optimizer
+static void nn_(gb_internal_create)(THTensor *grad, THTensor *hessian,
+    THLongTensor *exampleIds, nn_(GBState)* s) {
+  long *exampleIds_data = THLongTensor_data(exampleIds);
+  long n_examples = THLongTensor_size(exampleIds, 0);
+  accreal leftGradientSum = 0;
+  accreal leftHessianSum = 0;
+
+  real *grad_data = THTensor_(data)(grad);
+  real *hessian_data = THTensor_(data)(hessian);
+
+  // only sums the relevant gradients and hessians
+  for (long i = 0; i < n_examples; i++) {
+    long exampleId = exampleIds_data[i]-1;
+    leftGradientSum += grad_data[exampleId];
+    leftHessianSum += hessian_data[exampleId];
+  }
+
+  // we move data from the left branch to the right branch
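+  // note: each branch's hessian sum starts with an extra +1 (presumably a
+  // smoothing constant, so neither branch contributes a zero hessian to
+  // computeGradientBoostLoss)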
+  s->rightGradientSum = 0;
+  s->rightHessianSum = 1;
+  s->nExampleInRightBranch = 0;
+  s->leftGradientSum = leftGradientSum;
+  s->leftHessianSum = leftHessianSum + 1;
+  s->nExampleInLeftBranch = n_examples;
+
+  // stores the loss in parent for efficiency
+  real lossInParent = computeGradientBoostLoss(s->leftGradientSum + s->rightGradientSum,
+      s->leftHessianSum + s->rightHessianSum);
+  s->lossInParent = lossInParent;
+
+  // caches the direct pointers to the data for efficiency
+  s->grad_data = grad_data;
+  s->hessian_data = hessian_data;
+}
+
+// computes the gain obtained by performing the split
+static real nn_(computeSplitGain)(nn_(GBState) *s) {
+  real lossInLeftBranch = computeGradientBoostLoss(s->leftGradientSum, s->leftHessianSum);
+  real lossInRightBranch = computeGradientBoostLoss(s->rightGradientSum, s->rightHessianSum);
+  return lossInLeftBranch + lossInRightBranch - s->lossInParent;
+}
+
+// uses the state information to build the table required by the lua library about the best split
+static void nn_(gb_internal_split_info)(lua_State *L, nn_(GBBestState) *bs, int res) {
+  long feature_id = bs->feature_id;
+  real feature_value = bs->feature_value;
+  real gain  = bs->gain;
+  nn_(GBState) *s = &bs->state;
+  lua_pushstring(L, "splitGain");
+  lua_pushnumber(L, gain);
+  lua_rawset(L, res);
+  lua_pushstring(L, "splitId");
+  lua_pushinteger(L, feature_id);
+  lua_rawset(L, res);
+  lua_pushstring(L, "splitValue");
+  lua_pushnumber(L, feature_value);
+  lua_rawset(L, res);
+  lua_pushstring(L, "leftChildSize");
+  lua_pushinteger(L, s->nExampleInLeftBranch);
+  lua_rawset(L, res);
+  lua_pushstring(L, "rightChildSize");
+  lua_pushinteger(L, s->nExampleInRightBranch);
+  lua_rawset(L, res);
+  lua_pushstring(L, "leftGradient");
+  lua_pushnumber(L, s->leftGradientSum);
+  lua_rawset(L, res);
+  lua_pushstring(L, "rightGradient");
+  lua_pushnumber(L, s->rightGradientSum);
+  lua_rawset(L, res);
+  lua_pushstring(L, "leftHessian");
+  lua_pushnumber(L, s->leftHessianSum);
+  lua_rawset(L, res);
+  lua_pushstring(L, "rightHessian");
+  lua_pushnumber(L, s->rightHessianSum);
+  lua_rawset(L, res);
+}
+
+// core of the computation, where we loop over all the relevant samples looking for the best split
+// we can find
+static void nn_(gb_internal_get_best_split)(lua_State *L, nn_(GBBestState) *bs,
+    THLongTensor *featureExampleIds, khash_t(long)* exampleMap, int input_table_index,
+    long minLeafSize, long feature_id) {
+  nn_(GBState) current_state;
+  nn_(GBState) best_state;
+  current_state = bs->state;
+
+  real best_gain = INFINITY;
+  real best_value = 0;
+
+  // if the data is dense, pre-loads direct access to it
+  THTensor *input = NULL;
+  real *input_data = NULL;
+  long n_features = 0;
+  if (!lua_istable(L, input_table_index)) {
+    input = luaT_checkudata(L, input_table_index, torch_Tensor);
+    input_data = THTensor_(data)(input);
+    n_features = THTensor_(size)(input, 1);
+  }
+
+  long stride = featureExampleIds->stride[0];
+  long *featureExampleIds_data = THLongTensor_data(featureExampleIds);
+
+  khiter_t k;
+
+  real previousSplitValue = 0;
+  // for each example with the given feature and from large to small value...
+  for (long i = THLongTensor_size(featureExampleIds, 0)-1; i >= 0; i--) {
+    long exampleId = featureExampleIds_data[i * stride];
+
+    // checks if the sample is in the list of ones that have to be evaluated by this node
+    k = kh_get(long, exampleMap, exampleId);
+    if (k != kh_end(exampleMap)) {
+      long exampleIdx = exampleId;
+
+      // gets the split value, depending on whether the input is sparse or dense
+      real splitValue;
+      if (input_data) {
+        splitValue = input_data[(exampleId-1) * n_features + feature_id-1];
+      }
+      else {
+        lua_pushinteger(L, exampleId);
+        lua_gettable(L, input_table_index);
+        lua_pushinteger(L, feature_id);
+        lua_gettable(L, -2);
+        splitValue = lua_tonumber(L, -1);
+        lua_pop(L, 2);
+      }
+
+      // performs one update of the state, moving a sample from the left branch to the right
+      real gradient = current_state.grad_data[exampleIdx-1];
+      real hessian = current_state.hessian_data[exampleIdx-1];
+      current_state.leftGradientSum -= gradient;
+      current_state.rightGradientSum += gradient;
+      current_state.leftHessianSum -= hessian;
+      current_state.rightHessianSum += hessian;
+      current_state.nExampleInLeftBranch--;
+      current_state.nExampleInRightBranch++;
+
+      // since we remove from the left, once this becomes true, it stays true forever
+      // hence we stop the loop
+      if (current_state.nExampleInLeftBranch < minLeafSize)
+        break;
+
+      if (current_state.nExampleInRightBranch >= minLeafSize) {
+        // if the values are equal between the steps, it doesn't make sense to evaluate the score
+        // since we won't be able to separate the two
+        if (previousSplitValue != splitValue) {
+          // computes the gain **without including the parent** since it doesn't change as we move
+          // examples between branches
+          real lossInLeftBranch = computeGradientBoostLoss(current_state.leftGradientSum, current_state.leftHessianSum);
+          real lossInRightBranch = computeGradientBoostLoss(current_state.rightGradientSum, current_state.rightHessianSum);
+          real current_gain = lossInLeftBranch + lossInRightBranch;
+          if (current_gain < best_gain) {
+            best_gain = current_gain;
+            best_value = splitValue;
+            best_state = current_state;
+          }
+        }
+      }
+      previousSplitValue = splitValue;
+    }
+  }
+
+  // if there is a valid gain, then marks the state as valid and fills the meta-info
+  if (!isfinite(best_gain)) {
+    bs->valid_state = 0;
+  }
+  else {
+    bs->valid_state = 1;
+    bs->state = best_state;
+    bs->feature_id = feature_id;
+    bs->gain = nn_(computeSplitGain)(&bs->state);
+    bs->feature_value = best_value;
+  }
+}
+
+// exactly like the previous version, but with direct access to the data for efficiency. in the
+// particular case of dense data it doesn't rely on the lua state at all, so we can evaluate it
+// without holding the lua state
+static void nn_(gb_internal_get_best_split_special)(nn_(GBBestState) *bs,
+    THLongTensor *featureExampleIds, khash_t(long)* exampleMap, THTensor *input, long minLeafSize,
+    long feature_id) {
+  nn_(GBState) current_state;
+  nn_(GBState) best_state;
+  current_state = bs->state;
+
+  real best_gain = INFINITY;
+  real best_value = 0;
+
+  real *input_data = NULL;
+  long n_features = 0;
+  input_data = THTensor_(data)(input);
+  n_features = THTensor_(size)(input, 1);
+
+  long stride = featureExampleIds->stride[0];
+  long *featureExampleIds_data = THLongTensor_data(featureExampleIds);
+
+  khiter_t k;
+
+  real previousSplitValue = 0;
+  for (long i = THLongTensor_size(featureExampleIds, 0)-1; i >= 0; i--) {
+    long exampleId = featureExampleIds_data[i * stride];
+
+    k = kh_get(long, exampleMap, exampleId);
+    if (k != kh_end(exampleMap)) {
+      long exampleIdx = exampleId;
+
+      // THIS is the main part that changes. seems crazy to have a special case just for this, but
+      // since there are a **lot** of samples to be evaluated, the "if" in the previous case can
+      // become expensive
+      real splitValue;
+      splitValue = input_data[(exampleId-1) * n_features + feature_id-1];
+
+      real gradient = current_state.grad_data[exampleIdx-1];
+      real hessian = current_state.hessian_data[exampleIdx-1];
+      current_state.leftGradientSum -= gradient;
+      current_state.rightGradientSum += gradient;
+      current_state.leftHessianSum -= hessian;
+      current_state.rightHessianSum += hessian;
+      current_state.nExampleInLeftBranch--;
+      current_state.nExampleInRightBranch++;
+
+      // since we remove from the left, once this becomes true, it stays true forever
+      // hence we stop the loop
+      if (current_state.nExampleInLeftBranch < minLeafSize)
+        break;
+
+      // nExampleInRightBranch starts at 0 and grows by one per step, so this guard keeps splits
+      // whose right branch is still too small from being evaluated
+      if (current_state.nExampleInRightBranch >= minLeafSize) {
+        if (previousSplitValue != splitValue) {
+          real lossInLeftBranch = computeGradientBoostLoss(current_state.leftGradientSum, current_state.leftHessianSum);
+          real lossInRightBranch = computeGradientBoostLoss(current_state.rightGradientSum, current_state.rightHessianSum);
+          real current_gain = lossInLeftBranch + lossInRightBranch;
+          if (current_gain < best_gain) {
+            best_gain = current_gain;
+            best_value = splitValue;
+            best_state = current_state;
+          }
+        }
+      }
+      previousSplitValue = splitValue;
+    }
+  }
+
+  if (!isfinite(best_gain)) {
+    bs->valid_state = 0;
+  }
+  else {
+    bs->valid_state = 1;
+    bs->state = best_state;
+    bs->feature_id = feature_id;
+    bs->gain = nn_(computeSplitGain)(&bs->state);
+    bs->feature_value = best_value;
+  }
+}
+
+// core of the computation to find the split for a given feature; it is divided into 4 steps
+static void nn_(gb_find_best_feature_split)(lua_State *L,
+    nn_(GBInitialization) *initialization_data, nn_(GBBestState) *bs, long feature_id,
+    GBRunData *run_data) {
+
+  // 1) loads the examples in the dataset ordered by their feature value
+  lua_pushvalue(L, initialization_data->getSortedFeature_index);
+  lua_pushvalue(L, initialization_data->dataset_index);
+  lua_pushinteger(L, feature_id);
+  lua_call(L, 2, 1);
+
+  THLongTensor *featureExampleIds = luaT_checkudata(L, -1, "torch.LongTensor");
+
+  // 2) processes the data to find the intersection between the examples in the dataset and the
+  // examples the current node has to evaluate
+  THLongTensor *exampleIdsWithFeature_ret = gb_internal_prepare(L, initialization_data->exampleIds,
+      run_data->exampleIdsWithFeature_cache, initialization_data->input_index, feature_id,
+      run_data->exampleMap);
+  if (!exampleIdsWithFeature_ret) {
+    bs->valid_state = 0;
+    return;
+  }
+
+  // 3) creates a new state to be used by the optimizer
+  nn_(gb_internal_create)(initialization_data->grad, initialization_data->hess,
+      exampleIdsWithFeature_ret, &bs->state);
+
+  // 4) optimize away!
+  nn_(gb_internal_get_best_split)(L, bs, featureExampleIds, run_data->exampleMap,
+      initialization_data->input_index, run_data->minLeafSize, feature_id);
+}
diff --git a/contrib/torch/decisiontree/generic/GBDT_internal.h b/contrib/torch/decisiontree/generic/GBDT_internal.h
new file mode 100644 (file)
index 0000000..7119365
--- /dev/null
@@ -0,0 +1,34 @@
+// representation of a state used while searching for the best split
+typedef struct {
+  real leftGradientSum, rightGradientSum;
+  real leftHessianSum, rightHessianSum;
+  real lossInParent;
+  long nExampleInLeftBranch, nExampleInRightBranch;
+  real *grad_data, *hessian_data;
+} nn_(GBState);
+
+// representation for the best state found for a given feature
+typedef struct {
+  nn_(GBState) state;
+  real gain;
+  long feature_id;
+  real feature_value;
+  int valid_state;
+} nn_(GBBestState);
+
+// full data that must be initialized before calling the optimizer
+typedef struct {
+  // *_index represent positions on the lua stack
+  int dataset_index;
+  int splitInfo_index;
+  int input_index;
+  // position of the dataset's function to return the samples ordered for a given feature
+  int getSortedFeature_index;
+
+  // samples that this node has to evaluate
+  THLongTensor *exampleIds;
+
+  // cached gradient and hessian for all data
+  THTensor *grad;
+  THTensor *hess;
+} nn_(GBInitialization);
diff --git a/contrib/torch/decisiontree/generic/LogitBoostCriterion.c b/contrib/torch/decisiontree/generic/LogitBoostCriterion.c
new file mode 100644 (file)
index 0000000..f2ea1ef
--- /dev/null
@@ -0,0 +1,90 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LogitBoostCriterion.c"
+#else
+
+#define EPS 1e-12
+
+static int nn_(LogitBoostCriterion_updateOutput)(lua_State *L)
+{
+  THTensor *input = luaT_checkudata(L, 1, torch_Tensor);
+  THTensor *target = luaT_checkudata(L, 2, torch_Tensor);
+  THTensor *output = luaT_checkudata(L, 3, torch_Tensor);
+  int sizeAverage = lua_toboolean(L, 4);
+
+  if (THTensor_(nElement)(input) != THTensor_(nElement)(target)) {
+    luaL_error(L, "inconsistent input and target size");
+  }
+  THTensor_(resize1d)(output, 1);
+
+  real sum = 0;
+
+  TH_TENSOR_APPLY2(real, input, real, target,
+    real x = *input_data;
+    real y = *target_data;
+    // math.log(1 + math.exp(target[i] <= 0 and input[i] or -input[i]))
+    sum += log(1 + exp(y <= 0 ? x : -x));
+  );
+
+  if (sizeAverage)
+    sum /= THTensor_(nElement)(input);
+
+  THTensor_(set1d)(output, 0, sum);
+  return 0;
+}
+
+static int nn_(LogitBoostCriterion_updateGradInput)(lua_State *L)
+{
+  THTensor *input = luaT_checkudata(L, 1, torch_Tensor);
+  THTensor *target = luaT_checkudata(L, 2, torch_Tensor);
+  THTensor *gradInput = luaT_checkudata(L, 3, torch_Tensor);
+
+  if (THTensor_(nElement)(input) != THTensor_(nElement)(target)) {
+    luaL_error(L, "inconsistent input and target size");
+  }
+  THTensor_(resizeAs)(gradInput, input);
+
+  TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+    real x = *input_data;
+    real y = *target_data;
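+    // p = sigmoid(x), arranged so that exp() only ever receives a non-positive
+    // argument and cannot overflow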
+    real p = (x >= 0) ? (1 / (1 + exp(-x))) : (1 - 1 / (1 + exp(x)));
+    *gradInput_data = (y <= 0) ? p : (p - 1);
+  );
+
+  return 0;
+}
+
+static int nn_(LogitBoostCriterion_updateHessInput)(lua_State *L)
+{
+  THTensor *input = luaT_checkudata(L, 1, torch_Tensor);
+  THTensor *target = luaT_checkudata(L, 2, torch_Tensor);
+  THTensor *hessInput = luaT_checkudata(L, 3, torch_Tensor);
+
+  if (THTensor_(nElement)(input) != THTensor_(nElement)(target)) {
+    luaL_error(L, "inconsistent input and target size");
+  }
+  THTensor_(resizeAs)(hessInput, input);
+
+  TH_TENSOR_APPLY3(real, hessInput, real, input, real, target,
+    real x = *input_data;
+    real p = (x >= 0) ? (1 / (1 + exp(-x))) : (1 - 1 / (1 + exp(x)));
+    *hessInput_data = p * (1.0 - p);
+  );
+
+  return 0;
+}
+
+static const struct luaL_Reg nn_(LogitBoostCriterion__) [] = {
+  {"LogitBoostCriterion_updateOutput", nn_(LogitBoostCriterion_updateOutput)},
+  {"LogitBoostCriterion_updateGradInput", nn_(LogitBoostCriterion_updateGradInput)},
+  {"LogitBoostCriterion_updateHessInput", nn_(LogitBoostCriterion_updateHessInput)},
+  {NULL, NULL}
+};
+
+static void nn_(LogitBoostCriterion_init)(lua_State *L)
+{
+  luaT_pushmetatable(L, torch_Tensor);
+  luaT_registeratname(L, nn_(LogitBoostCriterion__), "nn");
+  lua_pop(L,1);
+}
+
+#endif
diff --git a/contrib/torch/decisiontree/generic/S2D.c b/contrib/torch/decisiontree/generic/S2D.c
new file mode 100644 (file)
index 0000000..2392ee7
--- /dev/null
@@ -0,0 +1,90 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/S2D.c"
+#else
+
+static int nn_(S2D_computeOutput)(lua_State *L) {
+  THTensor *output = luaT_checkudata(L, 1, torch_Tensor);
+  const int keys_index = 2;
+  const int values_index = 3;
+  const int masks_index = 4;
+
+  if (!lua_istable(L, keys_index))
+    return LUA_HANDLE_ERROR_STR(L, "expected position 2 to be a table");
+  if (!lua_istable(L, values_index))
+    return LUA_HANDLE_ERROR_STR(L, "expected position 3 to be a table");
+  if (!lua_istable(L, masks_index))
+    return LUA_HANDLE_ERROR_STR(L, "expected position 4 to be a table");
+
+
+  THLongTensor *features = luaT_checkudata(L, 5, "torch.LongTensor");
+
+  const int original_top = lua_gettop(L);
+
+  long outputsize = THLongTensor_size(features, 0);
+  long batch_size = lua_objlen(L, keys_index);
+
+  // initializes output
+  THTensor_(resize2d)(output, batch_size, outputsize);
+  THTensor_(zero)(output);
+  real *output_data = THTensor_(data)(output);
+
+  // iterates over samples
+  lua_pushnil(L);
+  const int local_top = lua_gettop(L);
+  while (lua_next(L, keys_index) != 0) {
+    // gets data corresponding to the current sample
+    long i = lua_tointeger(L, -2)-1;
+    real *current_output_data = &output_data[i * outputsize];
+    THLongTensor *keys = luaT_checkudata(L, -1, "torch.LongTensor");
+    lua_rawgeti(L, values_index, i+1);
+    THTensor *values = luaT_checkudata(L, -1, torch_Tensor);
+    lua_rawgeti(L, masks_index, i+1);
+    THByteTensor *mask = luaT_checkudata(L, -1, "torch.ByteTensor");
+
+    long n_keys = THLongTensor_size(keys, 0);
+    long n_values = THTensor_(size)(values, 0);
+
+    // quick safety check
+    if (n_keys != n_values)
+      return LUA_HANDLE_ERROR_STR(L, "keys and values have to have the same size");
+
+    // gets the direct memory pointers
+    long *keys_data = THLongTensor_data(keys);
+    real *values_data = THTensor_(data)(values);
+    unsigned char *mask_data = THByteTensor_data(mask);
+
+    // for each value in the sparse input...
+    for (long j = 0; j < n_keys; j++) {
+      // loads the value and key
+      real current_value = values_data[j];
+      long current_key = keys_data[j];
+      unsigned char current_mask = mask_data[j];
+
+      // if the feature is present in the map
+      if (current_mask)
+        // saves in the given position
+        current_output_data[current_key-1] = current_value;
+    }
+    // pops everything this iteration pushed (keeping the key for lua_next)
+    // so the Lua stack does not grow as we iterate
+    lua_pop(L, lua_gettop(L) - local_top);
+  }
+
+  // removes anything left on the stack above the original top
+  lua_pop(L, lua_gettop(L) - original_top);
+
+  return 0;
+}
+
+static const struct luaL_Reg nn_(S2D__) [] = {
+  {"S2D_computeOutput", nn_(S2D_computeOutput)},
+  {NULL, NULL}
+};
+
+static void nn_(S2D_init)(lua_State *L)
+{
+  luaT_pushmetatable(L, torch_Tensor);
+  luaT_registeratname(L, nn_(S2D__), "nn");
+  lua_pop(L,1);
+}
+
+#endif
diff --git a/contrib/torch/decisiontree/hash_map.c b/contrib/torch/decisiontree/hash_map.c
new file mode 100644 (file)
index 0000000..2c679e2
--- /dev/null
@@ -0,0 +1,445 @@
+#include "utils.h"
+#include "hash_map.h"
+#include "internal_hash_map.h"
+#include <pthread.h>
+
+hash_map_t hash_map_init(void) {
+   return kh_init(long);
+}
+
+void hash_map_destroy(hash_map_t h_) {
+   internal_hash_map_t h = (internal_hash_map_t) h_;
+   kh_destroy(long, h);
+}
+
+void hash_map_clear(hash_map_t h_) {
+   internal_hash_map_t h = (internal_hash_map_t) h_;
+   kh_clear(long, h);
+}
+
+int hash_map_put(hash_map_t h_, long key, long val) {
+   internal_hash_map_t h = (internal_hash_map_t) h_;
+   int ret;
+   khiter_t k = kh_put(long, h, key, &ret);
+   ret = (ret >= 0);
+   if (ret)
+      kh_value(h, k) = val;
+   return ret;
+}
+
+int hash_map_put_tensor(hash_map_t h_, THLongTensor *keys_, THLongTensor *vals_) {
+   long *keys = THLongTensor_data(keys_);
+   long *vals = THLongTensor_data(vals_);
+   long size = get_tensor_size(keys_, Long);
+   for (long i = 0; i < size; i++)
+      if (!hash_map_put(h_, keys[i], vals[i]))
+         return 0;
+   return 1;
+}
+
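+/* Ensures `key` is present in the map: if absent, it is inserted with the
+   next value of `*counter`, so successive fills build a dense remapping
+   from arbitrary keys to consecutive ids. */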
+int hash_map_fill(hash_map_t h_, long key, long *counter) {
+   internal_hash_map_t h = (internal_hash_map_t) h_;
+   khiter_t k = kh_get(long, h, key);
+   if (k == kh_end(h))
+      return hash_map_put(h_, key, ++(*counter));
+   return 1;
+}
+
+int hash_map_fill_tensor(hash_map_t h_, THLongTensor *keys_, long *counter) {
+   long *keys = THLongTensor_data(keys_);
+   long size = get_tensor_size(keys_, Long);
+   for (long i = 0; i < size; i++)
+      if (!hash_map_fill(h_, keys[i], counter))
+         return 0;
+   return 1;
+}
+
+int hash_map_get(hash_map_t h_, long key, long* val) {
+   internal_hash_map_t h = (internal_hash_map_t) h_;
+   khiter_t k = kh_get(long, h, key);
+   if (k == kh_end(h))
+      return 0;
+   *val = kh_value(h, k);
+   return 1;
+}
+
+void hash_map_get_tensor(hash_map_t h_, THLongTensor *keys_, THLongTensor *vals_, THByteTensor *mask_) {
+   long *keys = THLongTensor_data(keys_);
+   long *vals = THLongTensor_data(vals_);
+   unsigned char *mask = THByteTensor_data(mask_);
+   long size = get_tensor_size(keys_, Long);
+   for (long i = 0; i < size; i++)
+      mask[i] = hash_map_get(h_, keys[i], &vals[i]);
+}
+
+void hash_map_del(hash_map_t h_, long key) {
+   internal_hash_map_t h = (internal_hash_map_t) h_;
+   khiter_t k = kh_get(long, h, key);
+   if (k != kh_end(h))
+      kh_del(long, h, k);
+}
+
+void hash_map_del_tensor(hash_map_t h_, THLongTensor *keys_) {
+   long *keys = THLongTensor_data(keys_);
+   long size = get_tensor_size(keys_, Long);
+   for (long i = 0; i < size; i++)
+      hash_map_del(h_, keys[i]);
+}
+
+size_t hash_map_size(hash_map_t h_) {
+   internal_hash_map_t h = (internal_hash_map_t) h_;
+   return kh_size(h);
+}
+
+void hash_map_to_tensor(hash_map_t h_, THLongTensor *keys_, THLongTensor *vals_) {
+   internal_hash_map_t h = (internal_hash_map_t) h_;
+   long *keys = THLongTensor_data(keys_);
+   long *vals = THLongTensor_data(vals_);
+   long key, val, i = 0;
+   kh_foreach(h, key, val, {
+      keys[i] = key;
+      vals[i] = val;
+      i++;
+   });
+}
+
+static void autolock(hash_map_lua_t *h) {
+   if (h->autolock) {
+      pthread_mutex_lock(&h->mutex);
+   }
+}
+
+static void autounlock(hash_map_lua_t *h) {
+   if (h->autolock) {
+      pthread_mutex_unlock(&h->mutex);
+   }
+}
+
+int hash_map_autolock_on_lua(lua_State *L) {
+   hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+   h->autolock = 1;
+   return 0;
+}
+
+int hash_map_autolock_off_lua(lua_State *L) {
+   hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+   h->autolock = 0;
+   return 0;
+}
+
+int hash_map_init_lua(lua_State *L) {
+   hash_map_lua_t **hp = (hash_map_lua_t**)lua_newuserdata(L, sizeof(hash_map_lua_t*));
+   *hp = (hash_map_lua_t*)malloc(sizeof(hash_map_lua_t));
+   hash_map_lua_t *h = *hp;
+   h->refcount = 1;
+   h->counter = 0;
+   h->autolock = 0;
+   h->h = hash_map_init();
+
+   pthread_mutexattr_t mutex_attr;
+   pthread_mutexattr_init(&mutex_attr);
+   pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_RECURSIVE);
+   pthread_mutex_init(&h->mutex, &mutex_attr);
+
+   luaL_getmetatable(L, "dt.HashMap");
+   lua_setmetatable(L, -2);
+   return 1;
+}
+
+int hash_map_gc_lua(lua_State *L) {
+   hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+   if (THAtomicDecrementRef(&h->refcount)) {
+      pthread_mutex_destroy(&h->mutex);
+      hash_map_destroy(h->h);
+      free(h);
+   }
+   return 0;
+}
+
+int hash_map_retain_lua(lua_State *L) {
+   hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+   THAtomicIncrementRef(&h->refcount);
+   return 0;
+}
+
+int hash_map_metatablename_lua(lua_State *L) {
+   lua_pushstring(L, "dt.HashMap");
+   return 1;
+}
+
+int hash_map_clear_lua(lua_State *L) {
+   hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+   autolock(h);
+   hash_map_clear(h->h);
+   autounlock(h);
+   return 0;
+}
+
+int hash_map_put_lua(lua_State *L) {
+   hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+   int ret;
+#if LUA_VERSION_NUM <= 501
+#define lua_isinteger lua_isnumber
+#endif
+   if (lua_isinteger(L, 2)) {
+      if (!lua_isinteger(L, 3))
+         return LUA_HANDLE_ERROR_STR(L, "second parameter is not a number");
+      long key = lua_tointeger(L, 2);
+      long val = lua_tointeger(L, 3);
+      autolock(h);
+      ret = hash_map_put(h->h, key, val);
+      autounlock(h);
+   }
+   else {
+      THLongTensor *keys = (THLongTensor *)luaT_checkudata(L, 2, "torch.LongTensor");
+      THLongTensor *vals = (THLongTensor *)luaT_checkudata(L, 3, "torch.LongTensor");
+      check_tensor(L, keys, THLongTensor);
+      check_tensor(L, vals, THLongTensor);
+      check_tensors(L, keys, vals);
+      autolock(h);
+      ret = hash_map_put_tensor(h->h, keys, vals);
+      autounlock(h);
+   }
+   if (!ret)
+      return LUA_HANDLE_ERROR_STR(L, "failed to put into hash map");
+   return 0;
+}
+
+int hash_map_fill_lua(lua_State *L) {
+   hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+   int ret;
+   if (lua_isinteger(L, 2)) {
+      long key = lua_tointeger(L, 2);
+      autolock(h);
+      ret = hash_map_fill(h->h, key, &h->counter);
+      autounlock(h);
+   }
+   else {
+      THLongTensor *keys = (THLongTensor *)luaT_checkudata(L, 2, "torch.LongTensor");
+      check_tensor(L, keys, THLongTensor);
+      autolock(h);
+      ret = hash_map_fill_tensor(h->h, keys, &h->counter);
+      autounlock(h);
+   }
+   if (!ret)
+      return LUA_HANDLE_ERROR_STR(L, "failed to fill into hash map");
+   return 0;
+}
+
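+/* Raises the running counter to the largest value stored in the map so that
+   subsequent fill() calls hand out ids that do not collide with it. */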
+int hash_map_adjust_counter_lua(lua_State *L) {
+   hash_map_lua_t *h_ = *(hash_map_lua_t**)lua_touserdata(L, 1);
+   internal_hash_map_t h = (internal_hash_map_t) h_->h;
+
+   long val;
+   kh_foreach_value(h, val, {
+      if (val >= h_->counter)
+         h_->counter = val;
+   });
+   return 0;
+}
+
+int hash_map_set_counter_lua(lua_State *L) {
+   hash_map_lua_t *h_ = *(hash_map_lua_t**)lua_touserdata(L, 1);
+   h_->counter = lua_tointeger(L, 2);
+   return 0;
+}
+
+int hash_map_get_counter_lua(lua_State *L) {
+   hash_map_lua_t *h_ = *(hash_map_lua_t**)lua_touserdata(L, 1);
+   lua_pushinteger(L, h_->counter);
+   return 1;
+}
+
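+/* Tensor-mode lookup: reads the keys tensor (arg 2) and writes hits into a
+   vals tensor (the keys tensor itself when inplace) plus a hit/miss byte
+   mask; vals and mask are created and returned when the caller omits them. */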
+static int hash_map_get_tensor_lua(lua_State *L, hash_map_lua_t *h, int inplace) {
+    THLongTensor *keys = (THLongTensor *)luaT_checkudata(L, 2, "torch.LongTensor");
+    check_tensor(L, keys, THLongTensor);
+    THLongTensor *vals = inplace ? keys : NULL;
+    THByteTensor *mask = NULL;
+
+    int maskIdx = inplace ? 3 : 4;
+
+    if (!inplace) {
+       if (lua_gettop(L) < 3) {
+          vals = THLongTensor_new();
+       } else {
+          vals = (THLongTensor *)luaT_checkudata(L, 3, "torch.LongTensor");
+          check_tensor(L, vals, THLongTensor);
+       }
+    }
+
+    if (lua_gettop(L) < maskIdx) {
+        mask = THByteTensor_new();
+    } else {
+        mask = (THByteTensor *)luaT_checkudata(L, maskIdx, "torch.ByteTensor");
+        check_tensor(L, mask, THByteTensor);
+    }
+
+    int n_dim = THLongTensor_nDimension(keys);
+    THLongStorage *st = THLongStorage_newWithSize1(n_dim);
+    for (int i = 0; i < n_dim; i++) {
+        THLongStorage_set(st, i, THLongTensor_size(keys, i));
+    }
+    THByteTensor_resize(mask, st, NULL);
+    if (!inplace) THLongTensor_resize(vals, st, NULL);
+    THLongStorage_free(st);
+
+    autolock(h);
+    hash_map_get_tensor(h->h, keys, vals, mask);
+    autounlock(h);
+
+    if (!inplace && lua_gettop(L) < 3)
+        luaT_pushudata(L, vals, "torch.LongTensor");
+    if (lua_gettop(L) < maskIdx)
+        luaT_pushudata(L, mask, "torch.ByteTensor");
+
+    return 2;
+}
+
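+/* Table-mode lookup: expects parallel tables of key, value and mask tensors
+   (key tensors double as value tensors when inplace) and applies the
+   tensor-mode lookup to each aligned triple. */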
+static int hash_map_get_table_lua(lua_State *L, hash_map_lua_t *h, int inplace) {
+    const int kidx = 2;
+    const int vidx = inplace ? 2 : 3;
+    const int midx = inplace ? 3 : 4;
+    const int narg = lua_gettop(L);
+
+    if (inplace) {
+        if (narg < 3) {
+            LUA_HANDLE_ERROR_STR(L, "HashMap.getInplace requires two arguments.");
+        }
+    } else {
+        if (narg < 4) {
+            LUA_HANDLE_ERROR_STR(L, "HashMap.get requires three arguments.");
+        }
+    }
+
+    int count = push_table_contents(L, kidx);
+    verify_push_table_contents(L, vidx, count);
+    verify_push_table_contents(L, midx, count);
+
+    THLongTensor *keys;
+    THLongTensor *vals;
+    THByteTensor *mask;
+    for (int i = count - 1; i >= 0; i--) {
+        int maskIdx = i - count;
+        int valIdx = maskIdx - count;
+        int keyIdx = inplace ? valIdx : (valIdx - count);
+
+        keys = (THLongTensor *)luaT_checkudata(L, keyIdx, "torch.LongTensor");
+        check_tensor(L, keys, THLongTensor);
+        if (inplace) {
+            vals = keys;
+        } else {
+            vals = (THLongTensor *)luaT_checkudata(L, valIdx, "torch.LongTensor");
+        }
+        mask = (THByteTensor *)luaT_checkudata(L, maskIdx, "torch.ByteTensor");
+
+        int n_dim = THLongTensor_nDimension(keys);
+        THLongStorage *st = THLongStorage_newWithSize1(n_dim);
+        for (int d = 0; d < n_dim; d++) {
+            THLongStorage_set(st, d, THLongTensor_size(keys, d));
+        }
+        THByteTensor_resize(mask, st, NULL);
+        THLongTensor_resize(vals, st, NULL);
+        THLongStorage_free(st);
+
+        autolock(h);
+        hash_map_get_tensor(h->h, keys, vals, mask);
+        autounlock(h);
+    }
+    lua_pop(L, (narg - 1) * count);
+    return 2;
+}
+
+int hash_map_get_lua(lua_State *L) {
+   hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+   if (lua_isinteger(L, 2)) {
+      long key = lua_tointeger(L, 2);
+      long val;
+      autolock(h);
+      int ret = hash_map_get(h->h, key, &val);
+      autounlock(h);
+      if (ret) {
+         lua_pushinteger(L, val);
+         lua_pushinteger(L, 1);
+      }
+      else {
+         lua_pushinteger(L, 0);
+         lua_pushinteger(L, 0);
+      }
+   } else if (lua_istable(L, 2)) {
+      return hash_map_get_table_lua(L, h, 0);
+   } else {
+      return hash_map_get_tensor_lua(L, h, 0);
+   }
+   return 2;
+}
+
+int hash_map_get_inplace_lua(lua_State *L) {
+   hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+   if (lua_isinteger(L, 2)) {
+      LUA_HANDLE_ERROR_STR(L, "HashMap.getInplace does not support integer arguments.");
+   } else if (lua_istable(L, 2)) {
+      return hash_map_get_table_lua(L, h, 1);
+   } else {
+      return hash_map_get_tensor_lua(L, h, 1);
+   }
+   return 2;
+}
+
+int hash_map_del_lua(lua_State *L) {
+   hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+   if (lua_isinteger(L, 2)) {
+      long key = lua_tointeger(L, 2);
+      autolock(h);
+      hash_map_del(h->h, key);
+      autounlock(h);
+   }
+   else {
+      THLongTensor *keys = (THLongTensor *)luaT_checkudata(L, 2, "torch.LongTensor");
+      autolock(h);
+      hash_map_del_tensor(h->h, keys);
+      autounlock(h);
+   }
+   return 0;
+}
+
+int hash_map_size_lua(lua_State *L) {
+   hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+   long size = hash_map_size(h->h);
+   lua_pushinteger(L, size);
+   return 1;
+}
+
+int hash_map_to_tensor_lua(lua_State *L) {
+   hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+   THLongTensor *keys, *vals;
+
+   if (lua_gettop(L) < 2) {
+      keys = THLongTensor_new();
+   }
+   else {
+      keys = (THLongTensor *)luaT_checkudata(L, 2, "torch.LongTensor");
+      check_tensor(L, keys, THLongTensor);
+   }
+
+   if (lua_gettop(L) < 3) {
+      vals = THLongTensor_new();
+   }
+   else {
+      vals = (THLongTensor *)luaT_checkudata(L, 3, "torch.LongTensor");
+      check_tensor(L, vals, THLongTensor);
+   }
+
+   size_t size = hash_map_size(h->h);
+   THLongTensor_resize1d(keys, size);
+   THLongTensor_resize1d(vals, size);
+
+   autolock(h);
+   hash_map_to_tensor(h->h, keys, vals);
+   autounlock(h);
+
+   if (lua_gettop(L) < 2)
+      luaT_pushudata(L, keys, "torch.LongTensor");
+   if (lua_gettop(L) < 3)
+      luaT_pushudata(L, vals, "torch.LongTensor");
+   return 2;
+}
diff --git a/contrib/torch/decisiontree/hash_map.h b/contrib/torch/decisiontree/hash_map.h
new file mode 100644 (file)
index 0000000..5b215e4
--- /dev/null
@@ -0,0 +1,36 @@
+#include "luaT.h"
+#include "TH.h"
+
+typedef void* hash_map_t;
+
+hash_map_t hash_map_init(void);
+void hash_map_destroy(hash_map_t);
+void hash_map_clear(hash_map_t);
+int hash_map_put(hash_map_t, long key, long val);
+int hash_map_put_tensor(hash_map_t, THLongTensor *keys_, THLongTensor *vals_);
+int hash_map_fill(hash_map_t, long key, long *counter);
+int hash_map_fill_tensor(hash_map_t, THLongTensor *keys_, long *counter);
+int hash_map_get(hash_map_t, long key, long *val);
+void hash_map_get_tensor(hash_map_t, THLongTensor *keys_, THLongTensor *vals_, THByteTensor *mask_);
+void hash_map_del(hash_map_t, long key);
+void hash_map_del_tensor(hash_map_t, THLongTensor *keys_);
+size_t hash_map_size(hash_map_t);
+void hash_map_to_tensor(hash_map_t, THLongTensor *keys_, THLongTensor *vals_);
+
+int hash_map_autolock_on_lua(lua_State *L);
+int hash_map_autolock_off_lua(lua_State *L);
+int hash_map_init_lua(lua_State *L);
+int hash_map_gc_lua(lua_State *L);
+int hash_map_retain_lua(lua_State *L);
+int hash_map_metatablename_lua(lua_State *L);
+int hash_map_clear_lua(lua_State *L);
+int hash_map_put_lua(lua_State *L);
+int hash_map_fill_lua(lua_State *L);
+int hash_map_adjust_counter_lua(lua_State *L);
+int hash_map_set_counter_lua(lua_State *L);
+int hash_map_get_counter_lua(lua_State *L);
+int hash_map_get_lua(lua_State *L);
+int hash_map_get_inplace_lua(lua_State *L);
+int hash_map_del_lua(lua_State *L);
+int hash_map_size_lua(lua_State *L);
+int hash_map_to_tensor_lua(lua_State *L);
diff --git a/contrib/torch/decisiontree/init.c b/contrib/torch/decisiontree/init.c
new file mode 100644 (file)
index 0000000..276241e
--- /dev/null
@@ -0,0 +1,77 @@
+#include "TH.h"
+#include "luaT.h"
+
+#ifdef _OPENMP
+#include "omp.h"
+#endif
+
+#include "error.h"
+#include "hash_map.h"
+
+#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
+#define torch_Tensor TH_CONCAT_STRING_3(torch., Real, Tensor)
+#define nn_(NAME) TH_CONCAT_3(nn_, Real, NAME)
+
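+/* Each generic source below is compiled once per floating point type by
+   THGenerateFloatTypes.h, so every nn_(NAME) expands to nn_FloatNAME and
+   nn_DoubleNAME, matching the init calls in luaopen_libdecisiontree. */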
+#include "generic/LogitBoostCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/DFD.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/S2D.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/CartTree.c"
+#include "THGenerateFloatTypes.h"
+
+#include "GBDT_common.h"
+#include "generic/GBDT.c"
+#include "THGenerateFloatTypes.h"
+
+static const struct luaL_Reg decisiontree_hash_map_routines[] = {
+   {"__gc", hash_map_gc_lua},
+   {"retain", hash_map_retain_lua},
+   {"metatablename", hash_map_metatablename_lua},
+   {"clear", hash_map_clear_lua},
+   {"put", hash_map_put_lua},
+   {"fill", hash_map_fill_lua},
+   {"adjustCounter", hash_map_adjust_counter_lua},
+   {"getCounter", hash_map_get_counter_lua},
+   {"setCounter", hash_map_set_counter_lua},
+   {"get", hash_map_get_lua},
+   {"getInplace", hash_map_get_inplace_lua},
+   {"del", hash_map_del_lua},
+   {"size", hash_map_size_lua},
+   {"safe", hash_map_autolock_on_lua},
+   {"unsafe", hash_map_autolock_off_lua},
+   {"toTensors", hash_map_to_tensor_lua},
+   {"new", hash_map_init_lua},
+   {NULL, NULL}
+};
+
+DLL_EXPORT int luaopen_libdecisiontree(lua_State *L)
+{
+  // HashMap
+  luaL_newmetatable(L, "dt.HashMap");
+  lua_pushstring(L, "__index");
+  lua_pushvalue(L, -2);
+  lua_settable(L, -3);
+  luaT_setfuncs(L, decisiontree_hash_map_routines, 0);
+
+  nn_FloatLogitBoostCriterion_init(L);
+  nn_DoubleLogitBoostCriterion_init(L);
+
+  nn_FloatDFD_init(L);
+  nn_DoubleDFD_init(L);
+
+  nn_FloatS2D_init(L);
+  nn_DoubleS2D_init(L);
+
+  nn_FloatCT_init(L);
+  nn_DoubleCT_init(L);
+
+  nn_FloatGBDT_init(L);
+  nn_DoubleGBDT_init(L);
+
+  return 1;
+}
diff --git a/contrib/torch/decisiontree/init.lua b/contrib/torch/decisiontree/init.lua
new file mode 100644 (file)
index 0000000..fdaf1b5
--- /dev/null
@@ -0,0 +1,72 @@
+require 'paths'
+require 'xlua'
+require 'string'
+require 'os'
+require 'sys'
+require 'image'
+require 'lfs'
+require 'nn'
+
+-- these modules return local tables, so we re-require them wherever they
+-- are needed; requiring them here just makes sure they are loaded up front.
+require 'moses'
+
+unpack = unpack or table.unpack
+
+local dt = require 'decisiontree._env'
+
+-- c lib:
+require "paths"
+paths.require 'libdecisiontree'
+
+dt.HashMap = torch.getmetatable("dt.HashMap").new
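+-- A usage sketch (illustrative values):
+--   local hm = dt.HashMap()
+--   hm:put(torch.LongTensor{1,2}, torch.LongTensor{10,20})
+--   local vals, mask = hm:get(torch.LongTensor{2,3})
+--   -- vals[1] == 20 and mask[1] == 1; mask[2] == 0 since key 3 is absent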
+
+dt.EPSILON = 1e-6
+
+-- experimental Tensor-like container
+require 'decisiontree.SparseTensor'
+
+-- functions
+require 'decisiontree.math'
+require 'decisiontree.utils'
+
+-- for multi-threading
+require 'decisiontree.WorkPool'
+
+-- abstract classes
+require 'decisiontree.DecisionTree'
+require 'decisiontree.DecisionForest'
+require 'decisiontree.DecisionForestTrainer'
+require 'decisiontree.TreeState'
+
+-- for CaRTree inference
+require 'decisiontree.CartNode'
+require 'decisiontree.CartTree'
+
+-- Criterions (extended with updateHessInput and backward2)
+require 'decisiontree.MSECriterion'
+require 'decisiontree.LogitBoostCriterion'
+
+-- Used by both RandomForestTrainer and GradientBoostTrainer
+require 'decisiontree.CartTrainer'
+
+-- Used by CartTrainer
+require 'decisiontree.DataSet'
+
+-- Random Forest Training
+require 'decisiontree.RandomForestTrainer'
+require 'decisiontree.GiniState' -- TreeState subclass
+
+-- Gradient Boosted Decision Tree Training
+require 'decisiontree.GradientBoostTrainer'
+require 'decisiontree.GradientBoostState' -- TreeState subclass
+
+-- unit tests and benchmarks
+require 'decisiontree.test'
+require 'decisiontree.benchmark'
+
+-- nn.Module
+require 'decisiontree.DFD'
+require 'decisiontree.Sparse2Dense'
+
+return dt
diff --git a/contrib/torch/decisiontree/internal_hash_map.h b/contrib/torch/decisiontree/internal_hash_map.h
new file mode 100644 (file)
index 0000000..bc8c523
--- /dev/null
@@ -0,0 +1,13 @@
+#include "khash.h"
+#include <pthread.h>
+
+KHASH_MAP_INIT_INT64(long, long)
+typedef khash_t(long)* internal_hash_map_t;
+
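+/* Lua-facing wrapper around the raw khash map: a refcount shared through
+   retain/__gc, a recursive mutex with an autolock flag (toggled by the Lua
+   safe()/unsafe() methods), and the running counter used by fill(). */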
+typedef struct {
+   hash_map_t h;
+   int refcount;
+   pthread_mutex_t mutex;
+   int autolock;
+   long counter;
+} hash_map_lua_t;
diff --git a/contrib/torch/decisiontree/khash.h b/contrib/torch/decisiontree/khash.h
new file mode 100644 (file)
index 0000000..20e9940
--- /dev/null
@@ -0,0 +1,627 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+   int ret, is_missing;
+   khiter_t k;
+   khash_t(32) *h = kh_init(32);
+   k = kh_put(32, h, 5, &ret);
+   kh_value(h, k) = 10;
+   k = kh_get(32, h, 10);
+   is_missing = (k == kh_end(h));
+   k = kh_get(32, h, 5);
+   kh_del(32, h, k);
+   for (k = kh_begin(h); k != kh_end(h); ++k)
+      if (kh_exist(h, k)) kh_value(h, k) = 1;
+   kh_destroy(32, h);
+   return 0;
+}
+*/
+
+/*
+  2013-05-02 (0.2.8):
+
+   * Use quadratic probing. When the capacity is a power of 2, the stepping
+     function i*(i+1)/2 is guaranteed to traverse each bucket. It is better
+     than double hashing on cache performance and is more robust than linear
+     probing.
+
+     In theory, double hashing should be more robust than quadratic probing.
+     However, my implementation is probably not suited to large hash tables,
+     because the second hash function is closely tied to the first hash
+     function, which reduces the effectiveness of double hashing.
+
+   Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
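+
+     (For a power-of-two capacity m, the offsets i*(i+1)/2 mod m for
+     i = 0..m-1 are a permutation of 0..m-1, which is why every bucket
+     gets visited.)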
+
+  2011-12-29 (0.2.7):
+
+    * Minor code clean up; no actual effect.
+
+  2011-09-16 (0.2.6):
+
+   * The capacity is a power of 2. This seems to dramatically improve the
+     speed for simple keys. Thanks to Zilong Tan for the suggestion. Reference:
+
+      - http://code.google.com/p/ulib/
+      - http://nothings.org/computer/judy/
+
+   * Optionally allow linear probing, which usually performs better for
+     random input. Double hashing is still the default as it is more robust
+     to certain non-random input.
+
+   * Added Wang's integer hash function (not used by default). This hash
+     function is more robust to certain non-random input.
+
+  2011-02-14 (0.2.5):
+
+    * Allow declaring global functions.
+
+  2009-09-26 (0.2.4):
+
+    * Improve portability
+
+  2008-09-19 (0.2.3):
+
+   * Corrected the example
+   * Improved interfaces
+
+  2008-09-11 (0.2.2):
+
+   * Improved speed a little in kh_put()
+
+  2008-09-10 (0.2.1):
+
+   * Added kh_clear()
+   * Fixed a compiling error
+
+  2008-09-02 (0.2.0):
+
+   * Changed to token concatenation which increases flexibility.
+
+  2008-08-31 (0.1.2):
+
+   * Fixed a bug in kh_get(), which has not been tested previously.
+
+  2008-08-31 (0.1.1):
+
+   * Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+  @header
+
+  Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.8"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+/* compiler specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifndef kh_inline
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+#endif /* kh_inline */
+
+#ifndef klib_unused
+#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
+#define klib_unused __attribute__ ((__unused__))
+#else
+#define klib_unused
+#endif
+#endif /* klib_unused */
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kcalloc
+#define kcalloc(N,Z) calloc(N,Z)
+#endif
+#ifndef kmalloc
+#define kmalloc(Z) malloc(Z)
+#endif
+#ifndef krealloc
+#define krealloc(P,Z) realloc(P,Z)
+#endif
+#ifndef kfree
+#define kfree(P) free(P)
+#endif
+
+static const double __ac_HASH_UPPER = 0.77;
+
+#define __KHASH_TYPE(name, khkey_t, khval_t) \
+   typedef struct kh_##name##_s { \
+      khint_t n_buckets, size, n_occupied, upper_bound; \
+      khint32_t *flags; \
+      khkey_t *keys; \
+      khval_t *vals; \
+   } kh_##name##_t;
+
+#define __KHASH_PROTOTYPES(name, khkey_t, khval_t)                \
+   extern kh_##name##_t *kh_init_##name(void);                    \
+   extern void kh_destroy_##name(kh_##name##_t *h);               \
+   extern void kh_clear_##name(kh_##name##_t *h);                 \
+   extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key);   \
+   extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+   extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+   extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+   SCOPE kh_##name##_t *kh_init_##name(void) {                    \
+      return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t));      \
+   }                                                  \
+   SCOPE void kh_destroy_##name(kh_##name##_t *h)                 \
+   {                                                  \
+      if (h) {                                        \
+         kfree((void *)h->keys); kfree(h->flags);              \
+         kfree((void *)h->vals);                            \
+         kfree(h);                                       \
+      }                                               \
+   }                                                  \
+   SCOPE void kh_clear_##name(kh_##name##_t *h)                \
+   {                                                  \
+      if (h && h->flags) {                               \
+         memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
+         h->size = h->n_occupied = 0;                       \
+      }                                               \
+   }                                                  \
+   SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key)  \
+   {                                                  \
+      if (h->n_buckets) {                                   \
+         khint_t k, i, last, mask, step = 0; \
+         mask = h->n_buckets - 1;                           \
+         k = __hash_func(key); i = k & mask;                   \
+         last = i; \
+         while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+            i = (i + (++step)) & mask; \
+            if (i == last) return h->n_buckets;                \
+         }                                            \
+         return __ac_iseither(h->flags, i)? h->n_buckets : i;     \
+      } else return 0;                                   \
+   }                                                  \
+   SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+   { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+      khint32_t *new_flags = 0;                             \
+      khint_t j = 1;                                     \
+      {                                               \
+         kroundup32(new_n_buckets);                            \
+         if (new_n_buckets < 4) new_n_buckets = 4;             \
+         if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
+         else { /* hash table size to be changed (shrink or expand); rehash */ \
+            new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t));  \
+            if (!new_flags) return -1;                      \
+            memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+            if (h->n_buckets < new_n_buckets) { /* expand */      \
+               khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+               if (!new_keys) { kfree(new_flags); return -1; }    \
+               h->keys = new_keys;                          \
+               if (kh_is_map) {                          \
+                  khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+                  if (!new_vals) { kfree(new_flags); return -1; } \
+                  h->vals = new_vals;                       \
+               }                                      \
+            } /* otherwise shrink */                        \
+         }                                            \
+      }                                               \
+      if (j) { /* rehashing is needed */                       \
+         for (j = 0; j != h->n_buckets; ++j) {                 \
+            if (__ac_iseither(h->flags, j) == 0) {             \
+               khkey_t key = h->keys[j];                    \
+               khval_t val;                              \
+               khint_t new_mask;                         \
+               new_mask = new_n_buckets - 1;                   \
+               if (kh_is_map) val = h->vals[j];             \
+               __ac_set_isdel_true(h->flags, j);               \
+               while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+                  khint_t k, i, step = 0; \
+                  k = __hash_func(key);                     \
+                  i = k & new_mask;                      \
+                  while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
+                  __ac_set_isempty_false(new_flags, i);        \
+                  if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+                     { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+                     if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+                     __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+                  } else { /* write the element and jump out of the loop */ \
+                     h->keys[i] = key;                   \
+                     if (kh_is_map) h->vals[i] = val;       \
+                     break;                              \
+                  }                                   \
+               }                                      \
+            }                                         \
+         }                                            \
+         if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+            h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+            if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+         }                                            \
+         kfree(h->flags); /* free the working space */            \
+         h->flags = new_flags;                              \
+         h->n_buckets = new_n_buckets;                      \
+         h->n_occupied = h->size;                           \
+         h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+      }                                               \
+      return 0;                                          \
+   }                                                  \
+   SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+   {                                                  \
+      khint_t x;                                         \
+      if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+         if (h->n_buckets > (h->size<<1)) {                    \
+            if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
+               *ret = -1; return h->n_buckets;                 \
+            }                                         \
+         } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
+            *ret = -1; return h->n_buckets;                    \
+         }                                            \
+      } /* TODO: implement automatic shrinking; resize() already supports it */ \
+      {                                               \
+         khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
+         x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+         if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
+         else {                                          \
+            last = i; \
+            while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+               if (__ac_isdel(h->flags, i)) site = i;          \
+               i = (i + (++step)) & mask; \
+               if (i == last) { x = site; break; }             \
+            }                                         \
+            if (x == h->n_buckets) {                        \
+               if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+               else x = i;                               \
+            }                                         \
+         }                                            \
+      }                                               \
+      if (__ac_isempty(h->flags, x)) { /* not present at all */      \
+         h->keys[x] = key;                               \
+         __ac_set_isboth_false(h->flags, x);                   \
+         ++h->size; ++h->n_occupied;                           \
+         *ret = 1;                                       \
+      } else if (__ac_isdel(h->flags, x)) { /* deleted */            \
+         h->keys[x] = key;                               \
+         __ac_set_isboth_false(h->flags, x);                   \
+         ++h->size;                                      \
+         *ret = 2;                                       \
+      } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+      return x;                                          \
+   }                                                  \
+   SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)          \
+   {                                                  \
+      if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {        \
+         __ac_set_isdel_true(h->flags, x);                     \
+         --h->size;                                      \
+      }                                               \
+   }
+
+#define KHASH_DECLARE(name, khkey_t, khval_t)                     \
+   __KHASH_TYPE(name, khkey_t, khval_t)                        \
+   __KHASH_PROTOTYPES(name, khkey_t, khval_t)
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+   __KHASH_TYPE(name, khkey_t, khval_t)                        \
+   __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+   KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+  @abstract     Integer hash function
+  @param  key   The integer [khint32_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+  @abstract     Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     64-bit integer hash function
+  @param  key   The integer [khint64_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+  @abstract     64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     const char* hash function
+  @param  s     Pointer to a null terminated string
+  @return       The hash value
+ */
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
+{
+   khint_t h = (khint_t)*s;
+   if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
+   return h;
+}
+/*! @function
+  @abstract     Another interface to const char* hash function
+  @param  key   Pointer to a null terminated string [const char*]
+  @return       The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+  @abstract     Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+static kh_inline khint_t __ac_Wang_hash(khint_t key)
+{
+    key += ~(key << 15);
+    key ^=  (key >> 10);
+    key +=  (key << 3);
+    key ^=  (key >> 6);
+    key += ~(key << 11);
+    key ^=  (key >> 16);
+    return key;
+}
+#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+  @abstract Type of the hash table.
+  @param  name  Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+  @abstract     Initiate a hash table.
+  @param  name  Name of the hash table [symbol]
+  @return       Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+  @abstract     Destroy a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+  @abstract     Reset a hash table without deallocating memory.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+  @abstract     Resize a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  s     New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+  @abstract     Insert a key to the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @param  r     Extra return code: -1 if the operation failed;
+                0 if the key is present in the hash table;
+                1 if the bucket is empty (never used); 2 if the element in
+            the bucket has been deleted [int*]
+  @return       Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+  @abstract     Retrieve a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+  @abstract     Remove a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+  @abstract     Test whether a bucket contains data.
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+  @abstract     Get key given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+  @abstract     Get value given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Value [type of values]
+  @discussion   For hash sets, calling this results in a segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Get the start iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+  @abstract     Get the end iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Get the number of elements in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+  @abstract     Get the number of buckets in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Iterate over the entries in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  kvar  Variable to which key will be assigned
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i;      \
+   for (__i = kh_begin(h); __i != kh_end(h); ++__i) {    \
+      if (!kh_exist(h,__i)) continue;                 \
+      (kvar) = kh_key(h,__i);                      \
+      (vvar) = kh_val(h,__i);                      \
+      code;                                  \
+   } }
+
+/*! @function
+  @abstract     Iterate over the values in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach_value(h, vvar, code) { khint_t __i;      \
+   for (__i = kh_begin(h); __i != kh_end(h); ++__i) {    \
+      if (!kh_exist(h,__i)) continue;                 \
+      (vvar) = kh_val(h,__i);                      \
+      code;                                  \
+   } }
+
+/* More convenient interfaces */
+
+/*! @function
+  @abstract     Instantiate a hash set containing integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name)                            \
+   KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t)                      \
+   KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash set containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name)                             \
+   KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t)                       \
+   KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+  @abstract     Instantiate a hash set containing const char* keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name)                            \
+   KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing const char* keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t)                      \
+   KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/contrib/torch/decisiontree/math.lua b/contrib/torch/decisiontree/math.lua
new file mode 100644 (file)
index 0000000..eb71b31
--- /dev/null
@@ -0,0 +1,84 @@
+local dt = require "decisiontree._env"
+
+local PSEUDOCOUNT = 1.0
+local MIN_LOGISTIC = 1E-8
+local MAX_LOGISTIC = 1.0 - MIN_LOGISTIC
+
+-- Create counts of possible results (last column of each row is the result)
+function dt.uniquecounts(counts, inputset, nclass)
+   counts = counts or inputset.input.new()
+   nclass = nclass or inputset.target:max()
+   counts:resize(nclass):zero()
+
+   inputset.target:apply(function(c) counts[c] = counts[c] + 1 end)
+   return counts
+end
+
+-- Entropy is the sum of -p(x)log(p(x)) across all the different possible results
+local counts, logprobs
+function dt.entropy(inputset, nclass)
+   local dt = require 'decisiontree'
+   counts = dt.uniquecounts(counts, inputset, nclass)
+   -- convert counts to categorical probabilities
+   counts:add(0.0000001) -- prevent NaN
+   counts:div(counts:sum())
+
+   logprobs = logprobs or counts.new()
+   logprobs:resize(counts:size())
+   logprobs:log(counts):div(math.log(2)) -- log2(x)
+
+   counts:cmul(logprobs)
+
+   return -counts:sum()
+end
+
+-- Compute and return the probability of positive label.
+function dt.probabilityPositive(nPositive, nTotal)
+   return (nPositive + PSEUDOCOUNT) / (nTotal + 2.0 * PSEUDOCOUNT);
+end
+
+-- Ref. https://en.wikipedia.org/wiki/Logit
+-- Calculates logit of the probability.
+-- Logit represents the log-odds. Probabilities transformed to logit 'space' can be combined linearly.
+function dt.logit(p)
+   assert(p >= 0.0 and p <= 1.0, "Expecting probability for arg 1")
+   local truncatedP = math.max(MIN_LOGISTIC, math.min(MAX_LOGISTIC, p))
+   return math.log(truncatedP / (1.0 - truncatedP))
+end
+
+function dt.logistic(x)
+   return (x >= 0) and (1 / (1 + math.exp(-x))) or (1 - 1 / (1 + math.exp(x)))
+end
+
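+-- For example, dt.logistic(0) == 0.5 and dt.logit(0.5) == 0; on
+-- [MIN_LOGISTIC, MAX_LOGISTIC] the two functions are mutual inverses.
+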
+function dt.computeGradientBoostLoss(gradient, hessian)
+   return -gradient * gradient / hessian
+end
+
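+-- Newton-style leaf score: minimizing the local second-order approximation
+-- g*w + 0.5*h*w^2 of the loss gives w = -g/h; this implementation applies
+-- an additional 0.5 factor to that step.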
+function dt.computeNewtonScore(gradient, hessian)
+   return -0.5 * gradient / hessian;
+end
+
+-- Calculates the logit score for a Node in a Decision Tree based on the probability of a positive label.
+-- params: number of positive examples and total number of examples.
+function dt.calculateLogitScore(nPositive, nTotal)
+   local dt = require 'decisiontree'
+   return dt.logit(dt.probabilityPositive(nPositive, nTotal))
+end
+
+-- Compute and return the Gini impurity score based on an input contingency table.
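+-- Each branch contributes count * p * (1 - p), i.e. half of the usual
+-- binary Gini impurity 2p(1-p); the result is their size-weighted average,
+-- and the constant factor does not change which split minimizes the score.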
+function dt.computeGini(leftCount, positiveLeftCount, rightCount, positiveRightCount)
+   assert(torch.type(leftCount) == 'number', 'Expecting total number of examples falling into the left branch.')
+   assert(torch.type(positiveLeftCount) == 'number', 'Expecting total number of positive examples falling into the left branch.')
+   assert(torch.type(rightCount) == 'number', 'Expecting total number of examples falling into the right branch.')
+   assert(torch.type(positiveRightCount) == 'number', 'Expecting total number of positive examples falling into the right branch.')
+
+   local total = leftCount + rightCount
+
+   local pPositiveLeft = leftCount == 0 and 0 or (positiveLeftCount / leftCount)
+   local leftGini = pPositiveLeft * (1.0 - pPositiveLeft)
+
+   local pPositiveRight = rightCount == 0 and 0 or (positiveRightCount / rightCount)
+   local rightGini = pPositiveRight * (1.0 - pPositiveRight)
+
+   return (leftCount * leftGini + rightCount * rightGini) / total
+end
\ No newline at end of file
diff --git a/contrib/torch/decisiontree/rocks/decisiontree-scm-1.rockspec b/contrib/torch/decisiontree/rocks/decisiontree-scm-1.rockspec
new file mode 100644 (file)
index 0000000..d5d9162
--- /dev/null
@@ -0,0 +1,40 @@
+package = "decisiontree"
+version = "scm-1"
+
+source = {
+   url = "git://github.com/Twitter/decisiontree",
+   tag = "master"
+}
+
+description = {
+   summary = "Decision trees for Torch by Twitter",
+   detailed = [[
+   Classification and regression trees (CART).
+   Gradient boosted decision trees (GBDT).
+   ]],
+   homepage = "https://github.com/Twitter/decisiontree",
+   license = "BSD"
+}
+
+dependencies = {
+   "torch >= 7.0",
+   "moses >= 1.3.1",
+   "xlua >= 1.0",
+   "image >= 1.0",
+   "luafilesystem >= 1.6.2",
+   "sys >= 1.1",
+   "paths >= 1.0",
+   "ipc >= 1.0",
+   "nn >= 1.0"
+}
+
+build = {
+   type = "command",
+   build_command = [[
+cmake -E make_directory build;
+cd build;
+cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" -DCMAKE_C_FLAGS=-fPIC -DCMAKE_CXX_FLAGS=-fPIC;
+$(MAKE)
+   ]],
+   install_command = "cd build && $(MAKE) install"
+}
diff --git a/contrib/torch/decisiontree/test.lua b/contrib/torch/decisiontree/test.lua
new file mode 100644 (file)
index 0000000..80510a4
--- /dev/null
@@ -0,0 +1,817 @@
+local dt = require "decisiontree._env"
+
+local dttest = {}
+local nloop = 50
+local epsilon = 0.000001
+local mytester
+
+--e.g. usage: th -e "dt = require 'decisiontree'; dt.test()"
+
+-- test 99% accuracy
+local function testAccuracy(cartTree, name, dataset, minacc)
+   assert(torch.isTypeOf(dataset, 'dt.DataSet'))
+   minacc = minacc or 0.99
+   local output = torch.Tensor(dataset:size())
+   local target, input = dataset.target, dataset.input
+
+   for i=1,dataset:size() do
+      local stack = {}
+      local score = cartTree:score(input[i], stack)
+      output[i] = score >= 0 and 1 or 0
+
+      if dt.VERBOSE and torch.type(cartTree) == 'dt.CartTree' and target[i] ~= output[i] then
+         print(cartTree:stackToString(stack, input[i]))
+         print(i, score, target[i], output[i])
+      end
+   end
+
+   local accuracy = torch.eq(target, output):float():mean()
+   mytester:assert(accuracy >= minacc, name .. ": insufficient accuracy: " .. accuracy .. " < " .. minacc)
+end
+
+function dttest.SparseTensor()
+   local keys = torch.LongTensor{1,5,6,10}
+   local values = torch.randn(keys:size(1))
+   local st = torch.SparseTensor(keys, values)
+
+   mytester:assert(st[1] == values[1])
+   mytester:assert(st[5] == values[2])
+   mytester:assert(st[6] == values[3])
+   mytester:assert(st[10] == values[4])
+
+   mytester:assert(st[2] == nil)
+
+   st:buildIndex()
+
+   mytester:assert(st[1] == values[1])
+   mytester:assert(st[5] == values[2])
+   mytester:assert(st[6] == values[3])
+   mytester:assert(st[10] == values[4])
+
+   mytester:assert(st[2] == nil)
+
+   -- test empty sparse tensor
+
+   local est = torch.SparseTensor()
+end
+
+function dttest.GiniState()
+   local featureId = 2
+   local minLeafSize = 0
+
+   local input = torch.Tensor({{0,1,0},{0,2,0},{0,3,0}})
+   local target = torch.Tensor({1, 1, 1})
+   local dataset = dt.DataSet(input, target, 3)
+
+   local splitInfo1 = {_id=1}
+   local splitInfo2 = {_id=2, leftChildSize = 1, rightChildSize = 2, splitGain = 0}
+   local splitInfo3 = {_id=3, leftChildSize = 2, rightChildSize = 1, splitGain = -1}
+
+   local exampleIds = torch.LongTensor{1,2,3}
+
+   local treeState = dt.GiniState(exampleIds)
+
+   function treeState.computeSplitInfo(self, splitFeatureId, splitFeatureValue)
+      if splitFeatureId  == featureId and splitFeatureValue == 2 then
+         return splitInfo2
+      elseif splitFeatureId  == featureId and splitFeatureValue == 3 then
+         return splitInfo3
+      else
+         error("Unhandled computeSplitInfo call "..splitFeatureId.." "..splitFeatureValue)
+      end
+   end
+
+   local splitInfo = treeState:findBestFeatureSplit(dataset, featureId, minLeafSize)
+   mytester:assert(splitInfo._id == splitInfo3._id)
+end
+
+function dttest.CartTree()
+
+   local splitFeatureId = 100
+   local splitFeatureValue = 1.0
+
+   local function getBinaryCartTreeRootNode()
+
+      local leftNodeScore = 0.2
+      local rightNodeScore = 0.4
+
+      local rootNode = dt.CartNode()
+      rootNode.nodeId = 0
+      rootNode.score = 0.5
+      rootNode.splitFeatureId = splitFeatureId
+      rootNode.splitFeatureValue = splitFeatureValue
+
+      local leftChild = dt.CartNode()
+      leftChild.score = leftNodeScore
+      leftChild.nodeId = 1
+
+      local rightChild = dt.CartNode()
+      rightChild.score = rightNodeScore
+      rightChild.nodeId = 2
+
+      rootNode.leftChild = leftChild
+      rootNode.rightChild = rightChild
+
+      return rootNode
+   end
+
+   local function testScoreCartTreeBranchLeftIfMissing()
+      local rootNode = getBinaryCartTreeRootNode()
+
+      local cartTree = dt.CartTree(rootNode)
+
+      local continuousFeatures = torch.SparseTensor()
+
+      local score, nodeId = cartTree:score(continuousFeatures)
+
+      mytester:assert(math.abs(rootNode.leftChild.score - score) < epsilon)
+      mytester:assert(rootNode.leftChild.nodeId == nodeId)
+   end
+
+   local function testBranchRightWithFeature()
+      local rootNode = getBinaryCartTreeRootNode()
+
+      local cartTree = dt.CartTree(rootNode)
+
+      local continuousFeatures = torch.zeros(100)
+      continuousFeatures[splitFeatureId] = splitFeatureValue
+
+      local score, nodeId = cartTree:score(continuousFeatures)
+
+      mytester:assert(math.abs(rootNode.rightChild.score - score) < epsilon)
+      mytester:assert(rootNode.rightChild.nodeId == nodeId)
+   end
+
+   local function testMissingRightNode()
+      local rootNode = getBinaryCartTreeRootNode()
+
+      rootNode.rightChild = nil
+
+      local cartTree = dt.CartTree(rootNode)
+
+      local continuousFeatures = torch.Tensor()
+
+      local score, nodeId = cartTree:score(continuousFeatures)
+
+      mytester:assert(math.abs(rootNode.leftChild.score - score) < epsilon)
+      mytester:assert(rootNode.leftChild.nodeId == nodeId)
+   end
+
+   local function testMissingLeftNode()
+      local rootNode = getBinaryCartTreeRootNode()
+
+      rootNode.leftChild = nil
+
+      local cartTree = dt.CartTree(rootNode)
+
+      local continuousFeatures = torch.Tensor()
+
+      local score, nodeId = cartTree:score(continuousFeatures)
+
+      mytester:assert(math.abs(rootNode.rightChild.score - score) < epsilon)
+      mytester:assert(rootNode.rightChild.nodeId == nodeId)
+   end
+
+   local function testMissingAllChildren()
+      local rootNode = getBinaryCartTreeRootNode()
+
+      rootNode.leftChild = nil
+      rootNode.rightChild = nil
+
+      local cartTree = dt.CartTree(rootNode)
+
+      local continuousFeatures = torch.Tensor()
+
+      local score, nodeId = cartTree:score(continuousFeatures)
+
+      mytester:assert(math.abs(rootNode.score - score) < epsilon)
+      mytester:assert(rootNode.nodeId == nodeId)
+   end
+
+   local function testScoreCartTreeBranchRandomlyRight()
+      local rootNode = getBinaryCartTreeRootNode();
+
+      -- Force Branch Right
+      local cartTree = dt.CartTree(rootNode, function() return false end);
+
+      local continuousFeatures = torch.SparseTensor()
+
+      local score, nodeId = cartTree:score(continuousFeatures)
+
+      mytester:assert(math.abs(rootNode.rightChild.score - score) < epsilon)
+      mytester:assert(rootNode.rightChild.nodeId == nodeId)
+   end
+
+   local function testScoreCartTreeBranchRandomlyLeft()
+      local rootNode = getBinaryCartTreeRootNode()
+
+      -- Force Branch Left
+      local cartTree = dt.CartTree(rootNode, function() return true end)
+
+      local continuousFeatures = torch.SparseTensor()
+
+      local score, nodeId = cartTree:score(continuousFeatures)
+
+      mytester:assert(math.abs(rootNode.leftChild.score - score) < epsilon)
+      mytester:assert(rootNode.leftChild.nodeId == nodeId)
+   end
+
+   testScoreCartTreeBranchLeftIfMissing()
+   testBranchRightWithFeature()
+   testMissingRightNode()
+   testMissingLeftNode()
+   testMissingAllChildren()
+   testScoreCartTreeBranchRandomlyRight()
+   testScoreCartTreeBranchRandomlyLeft()
+
+end
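+
+-- The tests above pin down the branching contract of CartTree:score(): a
+-- present feature value is compared against splitFeatureValue (>= branches
+-- right), a missing feature defers to the optional branchleft callback, and a
+-- missing child forces the remaining branch. A minimal sketch under those
+-- assumptions:
+--
+--   local root = dt.CartNode()
+--   root.splitFeatureId, root.splitFeatureValue = 100, 1.0
+--   root.leftChild, root.rightChild = dt.CartNode(), dt.CartNode()
+--   local tree = dt.CartTree(root, function() return true end)
+--   tree:score(torch.SparseTensor()) -- feature 100 missing: scores left child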
+
+function dttest.TreeState_branch()
+   local _ = require 'moses'
+   local binFeatureId = 1
+   local featureId = 2
+
+   local input = {
+      torch.SparseTensor(torch.LongTensor{binFeatureId},torch.Tensor{1}),
+      torch.SparseTensor(torch.LongTensor{binFeatureId,featureId},torch.Tensor{1,1}),
+      torch.SparseTensor(torch.LongTensor{binFeatureId,featureId},torch.Tensor{0,2}),
+      torch.SparseTensor(torch.LongTensor{binFeatureId,featureId},torch.Tensor{0,3})
+   }
+   local target = torch.LongTensor(4):fill(1)
+
+   local dataset = dt.DataSet(input, target)
+
+   local treeState = dt.TreeState(torch.LongTensor():range(1,4))
+   local splitInfo = {splitId = binFeatureId, splitValue = 1}
+
+   local function testBranchBinaryFeature()
+      splitInfo = {splitId = binFeatureId, splitValue = 1}
+      local leftBranch, rightBranch = treeState:branch(splitInfo, dataset)
+      mytester:assert(leftBranch ~= nil and rightBranch ~= nil)
+
+      mytester:assert(2 == leftBranch:size())
+      mytester:assert(leftBranch:contains(3))
+      mytester:assert(leftBranch:contains(4))
+
+      mytester:assert(2 == rightBranch:size())
+      mytester:assert(rightBranch:contains(1))
+      mytester:assert(rightBranch:contains(2))
+   end
+
+   local function testBranchContinuousFeature()
+      local splitValue = 2
+      splitInfo = {splitId = featureId, splitValue = splitValue}
+
+      local leftBranch, rightBranch = treeState:branch(splitInfo, dataset)
+      mytester:assert(leftBranch ~= nil and rightBranch ~= nil)
+
+      mytester:assert(1 == leftBranch:size())
+      mytester:assert(leftBranch:contains(2))
+
+      mytester:assert(2 == rightBranch:size())
+      mytester:assert(rightBranch:contains(3))
+      mytester:assert(rightBranch:contains(4))
+   end
+
+   testBranchBinaryFeature()
+   testBranchContinuousFeature()
+
+end
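+
+-- The expectations above pin down TreeState:branch(): examples with
+-- featureValue < splitValue go left, those with featureValue >= splitValue go
+-- right, and examples missing the split feature entirely (example 1 in the
+-- continuous case) land in neither branch.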
+
+function dttest.DecisionForest()
+   -- Create a test decision forest; each tree has a single root node, so tree:score() returns that root's score.
+
+   local function createCartTreeWithSingleNode(score)
+      local cartNode = dt.CartNode()
+      cartNode.score = score
+      return dt.CartTree(cartNode)
+   end
+
+   local function getTestDecisionForest()
+      local cartTrees = {
+         createCartTreeWithSingleNode(1),
+         createCartTreeWithSingleNode(2),
+         createCartTreeWithSingleNode(3)
+      }
+      local weight = torch.Tensor{10,20,30}
+      local bias = 0.5
+
+      return dt.DecisionForest(cartTrees, weight, bias)
+   end
+
+   local function testScoreDecisionForest()
+      local df = getTestDecisionForest()
+      local continuousFeatures = torch.SparseTensor()
+
+      local expectedResult = 1.0 * 10.0 + 2.0 * 20.0 + 3.0 * 30.0 + 0.5
+      local result = df:score(continuousFeatures)
+
+      mytester:assert(math.abs(expectedResult - result) < epsilon)
+   end
+
+   testScoreDecisionForest()
+end
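+
+-- As the expected result above shows, a forest's score is the weighted sum of
+-- its trees' scores plus the bias. A minimal sketch using only the calls
+-- exercised in this test:
+--
+--   local node = dt.CartNode()
+--   node.score = 1
+--   local forest = dt.DecisionForest({dt.CartTree(node)}, torch.Tensor{10}, 0.5)
+--   forest:score(torch.SparseTensor()) -- 1 * 10 + 0.5 = 10.5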
+
+function dttest.CartTrainer()
+   local minLeafSize, maxLeafNodes = 1, 1000
+   local nExample = 100
+
+   -- 1. dense dataset
+   local trainSet, validSet, clusterExamples, inputs, targets = dt.getDenseDummyData(nExample)
+
+   -- assert that the dataset is valid
+   for clusterId, exampleIds in ipairs(clusterExamples) do
+      local exampleIdx = torch.LongTensor(exampleIds)
+      local input = inputs:index(1,exampleIdx)
+      local target = targets:index(1,exampleIdx)
+      assert(input:std(1):mean() < 0.05)
+   end
+
+   local cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNodes)
+   local treeState = dt.GiniState(trainSet:getExampleIds())
+   local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
+
+   mytester:assert(nleaf == nExample) -- with dense inputs, minLeafSize = 1 and a large maxLeafNodes, every example ends up in its own leaf
+   testAccuracy(cartTree, "dense train single-thread first", trainSet, 0.99)
+   testAccuracy(cartTree, "dense valid single-thread first", validSet, 0.7) -- single trees don't generalize very well
+
+   local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
+   testAccuracy(cartTree, "dense single-thread second", trainSet)
+
+   -- test feature parallelization
+   local nThread = 2
+   cartTrainer:featureParallel(nThread)
+   local treeState = dt.GiniState(trainSet:getExampleIds())
+   local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
+   testAccuracy(cartTree, "dense feature-parallel", trainSet)
+
+   -- 2. sparse-dense dataset
+   local trainSet, validSet, clusterExamples, inputs, targets = dt.getSparseDummyData(nExample, nil, 10, nil, nil, 10)
+
+   -- assert that the dataset is valid
+   for clusterId, exampleIds in ipairs(clusterExamples) do
+      local input = torch.Tensor(#exampleIds, 10):zero()
+      for i, exampleId in ipairs(exampleIds) do
+         input[i]:indexCopy(1, inputs[exampleId].keys, inputs[exampleId].values)
+      end
+      assert(input:std(1):mean() < 0.05)
+   end
+
+   local cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNodes)
+   local treeState = dt.GiniState(trainSet:getExampleIds())
+   local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
+
+   mytester:assert(nleaf == nExample) -- all 10 features are active per example, so these sparse inputs behave like dense ones and every example gets its own leaf
+   testAccuracy(cartTree, "sparse-dense train single-thread first", trainSet, 0.99)
+
+   local shuffle = torch.LongTensor():randperm(10)
+   for i, input in ipairs(inputs) do
+      input.keys = input.keys:index(1, shuffle)
+      input.values = input.values:index(1, shuffle)
+      input._map = nil
+   end
+   testAccuracy(cartTree, "sparse-dense shuffled keys train single-thread first", trainSet, 0.99)
+   testAccuracy(cartTree, "sparse-dense valid single-thread first", validSet, 0.8)
+
+   -- 3. sparse dataset
+   local trainSet, validSet = dt.getSparseDummyData(nExample, 2, 10, nil, nil, 9)
+
+   local cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNodes)
+   local treeState = dt.GiniState(trainSet:getExampleIds())
+   local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
+   cartTree.branchleft = function() return true end
+
+   mytester:assert(nleaf < nExample) -- unlike the dense cases above, truly sparse inputs produce fewer leaves than examples
+   testAccuracy(cartTree, "sparse train single-thread first", trainSet, 0.9) -- the TreeBrancher drops examples with missing features, making it difficult to overfit
+   testAccuracy(cartTree, "sparse valid single-thread first", validSet, 0.8)
+end
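+
+-- The single-tree training flow exercised above, condensed into a sketch (the
+-- dummy-data helpers are defined in utils.lua):
+--
+--   local trainSet = dt.getDenseDummyData(100)
+--   local trainer = dt.CartTrainer(trainSet, 1, 1000) -- minLeafSize, maxLeafNodes
+--   local state = dt.GiniState(trainSet:getExampleIds())
+--   local tree, nleaf = trainer:train(state, trainSet.featureIds)
+--   trainer:featureParallel(2) -- optional: shard features across 2 threads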
+
+function dttest.GradientBoostTrainer()
+   local nExample = 100
+   local trainSet, validSet = dt.getSparseDummyData(nExample, 2, 10, nil, nil, 9)
+
+   local maxLeafNode, minLeafSize = nExample/2, nExample/10
+   local loss = nn.LogitBoostCriterion(false)
+
+   local cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNode)
+
+   local opt = {
+      lossFunction=loss,
+      treeTrainer=cartTrainer,
+      shrinkage=0.1,
+      downsampleRatio=6,
+      featureBaggingSize=-1,
+      nTree=14,
+      evalFreq=8,
+      earlyStop=0 -- no early-stopping
+   }
+
+   -- test single-thread
+   local trainer = dt.GradientBoostTrainer(opt)
+   local decisionForest = trainer:train(trainSet, trainSet.featureIds, validSet)
+
+   mytester:assert(#decisionForest.trees == opt.nTree)
+   testAccuracy(decisionForest, "sparse train single-thread first", trainSet, 0.98)
+   testAccuracy(decisionForest, "sparse valid single-thread first", validSet, 0.95)
+
+   -- test stateless
+   local decisionForest = trainer:train(trainSet, trainSet.featureIds, validSet)
+
+   mytester:assert(#decisionForest.trees == opt.nTree)
+   testAccuracy(decisionForest, "sparse train single-thread second", trainSet, 0.98)
+   testAccuracy(decisionForest, "sparse valid single-thread second", validSet, 0.95)
+
+   -- test feature-parallel
+   local nThread = 2
+   cartTrainer:featureParallel(nThread)
+
+   local trainer = dt.GradientBoostTrainer(opt)
+   local decisionForest = trainer:train(trainSet, trainSet.featureIds, validSet)
+
+   mytester:assert(#decisionForest.trees == opt.nTree)
+   testAccuracy(decisionForest, "sparse train feature-parallel first", trainSet, 0.98)
+   testAccuracy(decisionForest, "sparse valid feature-parallel first", validSet, 0.95)
+
+end
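+
+-- Boosted training as exercised above, condensed into a sketch (option values
+-- are the test's, not tuned recommendations; featureParallel() acts on the
+-- shared CartTrainer referenced by opt.treeTrainer, which is why the second
+-- GradientBoostTrainer above trains feature-parallel):
+--
+--   local trainer = dt.GradientBoostTrainer{
+--      lossFunction=nn.LogitBoostCriterion(false),
+--      treeTrainer=dt.CartTrainer(trainSet, minLeafSize, maxLeafNode),
+--      shrinkage=0.1, downsampleRatio=6, featureBaggingSize=-1,
+--      nTree=14, evalFreq=8, earlyStop=0,
+--   }
+--   local forest = trainer:train(trainSet, trainSet.featureIds, validSet)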
+
+function dttest.RandomForestTrainer()
+   local nExample = 100
+   local trainSet, validSet = dt.getSparseDummyData(nExample, 2, 10, nil, nil, 9)
+
+   local opt = {
+      activeRatio=0.5,
+      featureBaggingSize=5,
+      nTree=14,
+      maxLeafNodes=nExample/2,
+      minLeafSize=nExample/10,
+   }
+
+   local trainer = dt.RandomForestTrainer(opt)
+
+   local decisionForest = trainer:train(trainSet, trainSet.featureIds)
+   mytester:assert(#decisionForest.trees == opt.nTree)
+   testAccuracy(decisionForest, "sparse train single-thread first", trainSet, 0.98)
+   testAccuracy(decisionForest, "sparse valid single-thread first", validSet, 0.95)
+
+   -- test stateless
+   local decisionForest = trainer:train(trainSet, trainSet.featureIds)
+
+   mytester:assert(#decisionForest.trees == opt.nTree)
+   testAccuracy(decisionForest, "sparse train single-thread second", trainSet, 0.98)
+   testAccuracy(decisionForest, "sparse valid single-thread second", validSet, 0.95)
+
+   -- test tree-parallel
+   local nThread = 2
+   trainer:treeParallel(nThread)
+
+   local decisionForest = trainer:train(trainSet, trainSet.featureIds)
+
+   mytester:assert(#decisionForest.trees == opt.nTree)
+   testAccuracy(decisionForest, "sparse train tree-parallel first", trainSet, 0.98)
+   testAccuracy(decisionForest, "sparse valid tree-parallel first", validSet, 0.95)
+end
+
+function dttest.WorkPool()
+   local nThread = 2
+   local wp = dt.WorkPool(nThread)
+
+   -- 1. some easy tests
+   local store = {key='nick',value=7}
+   wp:update('storeKeyValue', store)
+
+   wp:update('require', {libname='decisiontree', varname='dt'})
+
+   local bias = 2
+   local obj = nn.MSECriterion()
+   wp:update('require', {libname='decisiontree', varname='dt'})
+   wp:writeup('execute', function(store) return bias + obj:updateOutput(torch.Tensor{1},torch.Tensor{1}) + store.nick end)
+
+   local taskname, res = wp:read()
+   mytester:assert(taskname == 'execute')
+   mytester:assert(res == 9)
+
+   -- 2. trying to reproduce a difficult error
+   local trainSet, validSet = dt.getSparseDummyData()
+
+   -- setup worker store (each worker will have its own copy)
+   local store = {
+      trainSet=trainSet,
+      minLeafSize=2
+   }
+   wp:update('storeKeysValues', store)
+
+   -- arguments/upvalues
+   local treeState = dt.GiniState(trainSet:getExampleIds())
+   local shardId = 1
+   local nShard = nThread
+   local featureIds = trainSet.featureIds
+
+   local task = function(store, args)
+      assert(store.trainSet)
+      assert(store.minLeafSize)
+
+      local bestSplit = args.treeState:findBestSplit(store.trainSet, args.featureIds, store.minLeafSize, args.shardId, args.nShard)
+      return bestSplit
+   end
+   local args = {treeState=treeState,featureIds=featureIds,shardId=shardId,nShard=nShard}
+   wp:writeup("execute", {func=task,args=args})
+
+   local taskname, bestSplit = wp:read()
+   mytester:assert(taskname == 'execute')
+   mytester:assert(torch.type(bestSplit) == 'table')
+
+   -- closure
+   local task = function(store)
+      assert(store.trainSet)
+      assert(store.minLeafSize)
+
+      local bestSplit = treeState:findBestSplit(store.trainSet, featureIds, store.minLeafSize, shardId, nShard)
+      return bestSplit
+   end
+   wp:writeup("execute", task)
+
+   local taskname, bestSplit = wp:read()
+   mytester:assert(taskname == 'execute')
+   mytester:assert(torch.type(bestSplit) == 'table')
+
+   local task = function(store, args)
+      assert(store.trainSet)
+      assert(torch.isTypeOf(treeState, 'dt.TreeState'), torch.type(treeState))
+
+      local bestSplit = treeState:findBestSplit(store.trainSet, featureIds, store.minLeafSize, shardId, nShard)
+      return bestSplit
+   end
+   local args = {featureIds=featureIds,shardId=shardId,nShard=nShard}
+   wp:writeup("execute", {func=task,args=args})
+
+   local taskname, bestSplit = wp:read()
+   mytester:assert(taskname == 'execute')
+   mytester:assert(torch.type(bestSplit) == 'table')
+
+   wp:terminate()
+end
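+
+-- The WorkPool protocol exercised above, as a minimal round trip: update()
+-- broadcasts state to every worker, writeup() queues a task, read() blocks
+-- for the next (taskname, result) pair, and terminate() shuts the pool down.
+--
+--   local wp = dt.WorkPool(2)
+--   wp:update('storeKeyValue', {key='k', value=1})
+--   wp:writeup('execute', function(store) return store.k + 1 end)
+--   local taskname, res = wp:read() -- 'execute', 2
+--   wp:terminate()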
+
+function dttest.Sparse2Dense()
+   local batchsize = 4
+   local minFeatureId, maxFeatureId = 10, 100
+   local input = {{},{}}
+   for i=1,batchsize do
+      local inputsize = math.random(5,10)
+      input[1][i] = torch.LongTensor(inputsize):random(minFeatureId,maxFeatureId)
+      input[2][i] = torch.Tensor(inputsize):uniform(0,1)
+   end
+   local s2d = nn.Sparse2Dense(torch.LongTensor():range(minFeatureId,maxFeatureId))
+   -- test 2d forward
+   local output = s2d:forward(input)
+   local output2 = torch.Tensor(batchsize, maxFeatureId-minFeatureId+1):zero()
+   local featureMap = {}
+   local j = 0
+   for i=minFeatureId,maxFeatureId do
+      j = j + 1
+      featureMap[i] = j
+   end
+   for i=1,batchsize do
+      local keys, values = input[1][i], input[2][i]
+      for j=1,keys:size(1) do
+         output2[{i,featureMap[keys[j] ]}] = values[j]
+      end
+   end
+   mytester:assertTensorEq(output, output2, 0.000001)
+   -- test 1d forward
+   local input = {input[1][batchsize], input[2][batchsize]}
+   local output = s2d:forward(input)
+   mytester:assertTensorEq(output, output2[batchsize], 0.000001)
+end
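+
+-- nn.Sparse2Dense maps sparse {keys, values} pairs onto dense rows indexed by
+-- the feature ids passed at construction; a minimal sketch under that
+-- assumption:
+--
+--   local s2d = nn.Sparse2Dense(torch.LongTensor{10, 11, 12})
+--   local row = s2d:forward({torch.LongTensor{11}, torch.Tensor{0.5}})
+--   -- row is {0, 0.5, 0}: feature 11 lands in column 2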
+
+function dttest.Sparse2DenseDouble()
+   local batchsize = 4
+   local minFeatureId, maxFeatureId = 10, 100
+   local input = {{},{}}
+   for i=1,batchsize do
+      local inputsize = math.random(5,10)
+      input[1][i] = torch.LongTensor(inputsize):random(minFeatureId,maxFeatureId)
+      input[2][i] = torch.Tensor(inputsize):uniform(0,1):double()
+   end
+   local s2d = nn.Sparse2Dense(torch.LongTensor():range(minFeatureId,maxFeatureId))
+   s2d:double()
+   -- test 2d forward
+   local output = s2d:forward(input)
+   local output2 = torch.Tensor(batchsize, maxFeatureId-minFeatureId+1):zero():double()
+   local featureMap = {}
+   local j = 0
+   for i=minFeatureId,maxFeatureId do
+      j = j + 1
+      featureMap[i] = j
+   end
+   for i=1,batchsize do
+      local keys, values = input[1][i], input[2][i]
+      for j=1,keys:size(1) do
+         output2[{i,featureMap[keys[j] ]}] = values[j]
+      end
+   end
+   mytester:assertTensorEq(output, output2, 0.000001)
+   -- test 1d forward
+   local input = {input[1][batchsize], input[2][batchsize]}
+   local output = s2d:forward(input)
+   mytester:assertTensorEq(output, output2[batchsize], 0.000001)
+end
+
+function dttest.LogitBoostCriterion()
+   local input = torch.randn(10)
+   local target = torch.LongTensor(10):random(0,1):type(torch.type(input))
+
+   local lb = nn.LogitBoostCriterion(false)
+   local loss = lb:updateOutput(input, target)
+
+   local loss2 = 0
+   for i=1,10 do
+      loss2 = loss2 + math.log(1 + math.exp(target[i] <= 0 and input[i] or -input[i]))
+   end
+   mytester:assert(math.abs(loss - loss2) < 0.00001)
+
+   local gradInput = lb:updateGradInput(input, target)
+   local gradInput2 = gradInput:clone():zero()
+   for i=1,10 do
+      local p = dt.logistic(input[i])
+      gradInput2[i] = (target[i] <= 0) and p or (p - 1)
+   end
+   mytester:assertTensorEq(gradInput, gradInput2, 0.000001)
+
+   local hessInput = lb:updateHessInput(input, target)
+   local hessInput2 = hessInput:clone():zero()
+   for i=1,10 do
+      local p = dt.logistic(input[i])
+      hessInput2[i] = p * (1.0 - p)
+   end
+   mytester:assertTensorEq(hessInput, hessInput2, 0.000001)
+end
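+
+-- The three checks above mirror the LogitBoost definitions for target y in
+-- {0,1} and score f, with p = logistic(f): loss = log(1 + exp(f)) when y <= 0
+-- and log(1 + exp(-f)) otherwise; gradient = p when y <= 0, else p - 1;
+-- hessian = p * (1 - p).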
+
+function dttest.DFD()
+   local nExample = 100
+   local batchsize = 4
+   local inputsize = 10
+
+   -- train Random Forest
+   local trainSet, validSet, clusterExamples, inputs, targets = dt.getDenseDummyData(nExample, nil, inputsize)
+   local opt = {
+      activeRatio=0.5,
+      featureBaggingSize=5,
+      nTree=4,
+      maxLeafNodes=nExample/2,
+      minLeafSize=nExample/10,
+   }
+   local trainer = dt.RandomForestTrainer(opt)
+   local df = trainer:train(trainSet, trainSet.featureIds)
+   mytester:assert(#df.trees == opt.nTree)
+
+   local dfd = nn.DFD(df)
+   dfd = nn.DFD(dfd:getReconstructionInfo())
+   local dfd2 = nn.DFD(dfd:getReconstructionInfo(), true)
+   local input = validSet.input:sub(1,batchsize)
+   local output = dfd:forward(input)
+   local output2 = dfd2:forward(input)
+
+   local _ = require 'moses'
+
+   local function hasKey(keys,key)
+      local found = false
+      keys:apply(function(x)
+         if x == key then
+            found = true
+         end
+      end)
+      return found
+   end
+
+   for i=1,batchsize do
+      local nodes = {}
+      local keys = output[1][i]
+      local keys2 = output2[1][i]
+      for j,tree in ipairs(df.trees) do
+         local stack = {}
+         tree:score(input[i], stack)
+         mytester:assert(hasKey(keys2, stack[#stack]._nodeId))
+
+         for k,node in ipairs(stack) do
+            if k > 1 then
+               assert(node._nodeId)
+               mytester:assert(hasKey(keys, node._nodeId), string.format("missing key=%d in %s", node._nodeId, tostring(keys)))
+               table.insert(nodes, node._nodeId)
+            end
+         end
+      end
+      mytester:assert(#nodes == keys:size(1))
+      mytester:assert(#df.trees == keys2:size(1))
+   end
+end
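+
+-- As the assertions above establish: nn.DFD(df) emits one key per non-root
+-- node visited across all trees, while passing true as the second argument
+-- (dfd2) emits only the terminal node of each tree, i.e. one key per tree.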
+
+function dttest.DFDDouble()
+   local nExample = 100
+   local batchsize = 4
+   local inputsize = 10
+
+   -- train Random Forest
+   local trainSet, validSet, clusterExamples, inputs, targets = dt.getDenseDummyData(nExample, nil, inputsize)
+   local opt = {
+      activeRatio=0.5,
+      featureBaggingSize=5,
+      nTree=4,
+      maxLeafNodes=nExample/2,
+      minLeafSize=nExample/10,
+   }
+   local trainer = dt.RandomForestTrainer(opt)
+   local df = trainer:train(trainSet, trainSet.featureIds)
+   mytester:assert(#df.trees == opt.nTree)
+
+   local dfd = nn.DFD(df)
+   dfd:double()
+   dfd = nn.DFD(dfd:getReconstructionInfo())
+   local dfd2 = nn.DFD(dfd:getReconstructionInfo(), true)
+   local input = validSet.input:sub(1,batchsize):double()
+   local output = dfd:forward(input)
+   local output2 = dfd2:forward(input)
+
+   local _ = require 'moses'
+
+   local function hasKey(keys,key)
+      local found = false
+      keys:apply(function(x)
+         if x == key then
+            found = true
+         end
+      end)
+      return found
+   end
+
+   for i=1,batchsize do
+      local nodes = {}
+      local keys = output[1][i]
+      local keys2 = output2[1][i]
+      for j,tree in ipairs(df.trees) do
+         local stack = {}
+         tree:score(input[i], stack)
+         mytester:assert(hasKey(keys2, stack[#stack]._nodeId))
+
+         for k,node in ipairs(stack) do
+            if k > 1 then
+               assert(node._nodeId)
+               mytester:assert(hasKey(keys, node._nodeId), string.format("missing key=%d in %s", node._nodeId, tostring(keys)))
+               table.insert(nodes, node._nodeId)
+            end
+         end
+      end
+      mytester:assert(#nodes == keys:size(1))
+      mytester:assert(#df.trees == keys2:size(1))
+   end
+end
+
+function dttest.uniquecounts() -- DEPRECATED
+   local target = torch.LongTensor(100):random(1,3)
+   local input = torch.Tensor()
+   local inputset = {input=input, target=target}
+
+   local counts = dt.uniquecounts(nil, inputset, 3)
+
+   mytester:assert(counts:sum() == 100)
+   mytester:assert(counts:nElement() == 3)
+
+   local res = torch.Tensor(3):zero()
+   target:apply(function(t) res[t] = res[t] + 1 end)
+
+   mytester:assertTensorEq(counts, res, 0.000001)
+end
+
+function dttest.entropy() -- DEPRECATED
+   -- 2 clusters with a bit overlap between classes:
+   local input = torch.Tensor(100,2)
+   input:narrow(1,1,50):normal(-1,.01)
+   input:narrow(1,51,50):normal(2,.01)
+
+   local target = torch.LongTensor(100):fill(3)
+   target:narrow(1,1,45):fill(1)
+   target:narrow(1,56,45):fill(2)
+
+   local inputset = {input=input, target=target}
+
+   -- test entropy()
+   local fullent = dt.entropy(inputset)
+
+   local halfset = {input=input:narrow(1,1,50), target=target:narrow(1,1,50)}
+   local halfent = dt.entropy(halfset)
+
+   local perfectset = {input=input:narrow(1,56,45), target=target:narrow(1,56,45)}
+   local perfectent = dt.entropy(perfectset)
+
+   mytester:assert(fullent > halfent)
+   mytester:assert(halfent > perfectent)
+   mytester:assert(perfectent < 0.0000001 and perfectent >= 0)
+end
+
+function dt.test(tests)
+   math.randomseed(os.time())
+   mytester = torch.Tester()
+   mytester:add(dttest)
+   mytester:run(tests)
+end
diff --git a/contrib/torch/decisiontree/utils.h b/contrib/torch/decisiontree/utils.h
new file mode 100644 (file)
index 0000000..8a0196a
--- /dev/null
@@ -0,0 +1,45 @@
+#include "error.h"
+
+#define check_tensors(L, a, b) \
+   do { \
+      if ((a)->nDimension != (b)->nDimension) \
+         return LUA_HANDLE_ERROR_STR((L), "different tensor dimensions"); \
+      for (int __local__var = 0; __local__var < (a)->nDimension; __local__var++) \
+         if ((a)->size[__local__var] != (b)->size[__local__var]) \
+            return LUA_HANDLE_ERROR_STR((L), "different tensor sizes"); \
+   } while (0)
+
+#define check_tensor(L, t, type) \
+   do { \
+      if (!type##_isContiguous(t)) \
+         return LUA_HANDLE_ERROR_STR((L), "tensor should be contiguous"); \
+   } while (0)
+
+#define get_tensor_size(t, type)                \
+    (TH##type##Tensor_nElement(t))
+
+#define get_tensor(L, idx, type)                                        \
+    (TH##type##Tensor *)luaT_checkudata(L, idx, "torch." #type "Tensor")
+
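+/* Pushes the array part of the table at stack index `arg` onto the Lua stack,
+ * one element per slot (stopping at the first nil), and returns the number of
+ * values pushed; the pushed values are intentionally left on the stack. */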
+static int push_table_contents(lua_State *L, int arg)
+{
+    int size = 0;
+    while(1) {
+        lua_checkstack(L, 1);
+        lua_rawgeti(L, arg, size + 1);
+        if (lua_isnil(L, -1)) {
+            lua_pop(L, 1);
+            break;
+        }
+        size++;
+    }
+    return size;
+}
+
+#define verify_push_table_contents(L, idx, count)  do {             \
+        int __tmp_count = push_table_contents(L, idx);              \
+        if (__tmp_count != count) {                                 \
+            lua_pop(L, __tmp_count);                                \
+            LUA_HANDLE_ERROR_STR(L, "Table sizes do not match");    \
+        }                                                           \
+    } while(0)
diff --git a/contrib/torch/decisiontree/utils.lua b/contrib/torch/decisiontree/utils.lua
new file mode 100644 (file)
index 0000000..c32c3d0
--- /dev/null
@@ -0,0 +1,125 @@
+local dt = require "decisiontree._env"
+
+-- returns a buffer table local to a thread (not serialized)
+function dt.getBufferTable(name)
+   local dt = require 'decisiontree'
+   assert(torch.type(name) == 'string')
+   dt.buffer = dt.buffer or {}
+   dt.buffer[name] = dt.buffer[name] or {}
+   return dt.buffer[name]
+end
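+
+-- Example (a sketch): per-thread scratch space that persists across calls
+-- without being captured by serialization:
+--
+--   local buffer = dt.getBufferTable('CartTrainer')
+--   buffer.exampleIds = buffer.exampleIds or torch.LongTensor()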
+
+function dt.getSparseDummyData(nExample, nCluster, nFeature, overlap, nValid, nActive)
+   local dt = require 'decisiontree'
+   if torch.type(nExample) == 'table' then
+      local opt = nExample
+      nExample = opt.nExample
+      nCluster = opt.nCluster
+      nFeature = opt.nFeature
+      overlap = opt.overlap
+      nValid = opt.nValid
+      nActive = opt.nActive
+   end
+   nExample = nExample or 100 -- training set size
+   nCluster = nCluster or 10
+   assert(nCluster >= 2)
+   nFeature = math.max(2, nFeature or 10)
+   overlap = overlap or 0
+   nValid = nValid or nExample/10 -- validation set size
+   nActive = nActive or math.max(2, nFeature / 2)
+
+   -- sample nCluster centers
+   local clusterCenter = torch.rand(nCluster, nFeature)
+   local clusterLabel = torch.LongTensor(nCluster)
+   local clusterExamples = {}
+   for i=1,nCluster do
+      clusterCenter[i]:add(i)
+      clusterLabel[i] = i % 2
+      clusterExamples[i] = {}
+   end
+
+   local sparseCenter = torch.Tensor()
+
+   local shuffle = torch.LongTensor()
+
+   -- build dataset in sparse (keys/values) format
+   local inputs = {}
+   local targets = torch.Tensor(nExample+nValid)
+   for i=1,nExample+nValid do
+      local clusterIdx = torch.random(1,nCluster)
+      table.insert(clusterExamples[clusterIdx], i)
+
+      shuffle:randperm(nFeature)
+      local keys = torch.LongTensor(nActive):copy(shuffle:narrow(1,1,nActive))
+      sparseCenter:index(clusterCenter[clusterIdx], 1, keys)
+      local stdiv = i <= nExample and 100 or 1000
+      local values = torch.randn(nActive):div(stdiv):add(sparseCenter)
+
+      table.insert(inputs, torch.SparseTensor(keys, values))
+
+      local label = clusterLabel[clusterIdx]
+      if math.random() < overlap then
+         targets[i] = label == 1 and 0 or 1
+      else
+         targets[i] = label
+      end
+   end
+
+   local _ = require 'moses'
+   local validSet = dt.DataSet(_.slice(inputs, nExample+1, nExample+nValid), targets:narrow(1,nExample+1,nValid))
+   local trainSet = dt.DataSet(_.slice(inputs, 1, nExample), targets:narrow(1,1,nExample))
+
+   return trainSet, validSet, clusterExamples, inputs, targets
+end
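+
+-- Example call matching dttest.CartTrainer above: 100 examples, 2 clusters,
+-- 10 features, no label overlap, default validation size, and 9 of the 10
+-- features active per example (so each example misses one feature):
+--
+--   local trainSet, validSet = dt.getSparseDummyData(100, 2, 10, nil, nil, 9)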
+
+function dt.getDenseDummyData(nExample, nCluster, nFeature, overlap, nValid)
+   local dt = require 'decisiontree'
+   if torch.type(nExample) == 'table' then
+      local opt = nExample
+      nExample = opt.nExample
+      nCluster = opt.nCluster
+      nFeature = opt.nFeature
+      overlap = opt.overlap
+      nValid = opt.nValid
+   end
+   nExample = nExample or 100 -- training set size
+   nCluster = nCluster or 10
+   assert(nCluster >= 2)
+   nFeature = math.max(2, nFeature or 10)
+   overlap = overlap or 0
+   nValid = nValid or nExample/10 -- validation set size
+
+   -- sample nCluster centers
+   local clusterCenter = torch.rand(nCluster, nFeature)
+   local clusterLabel = torch.LongTensor(nCluster)
+   local clusterExamples = {}
+   for i=1,nCluster do
+      clusterCenter[i]:add(i)
+      clusterLabel[i] = i % 2
+      clusterExamples[i] = {}
+   end
+
+   -- build dataset in dense format
+   local inputs = torch.Tensor(nExample+nValid, nFeature)
+   local targets = torch.Tensor(nExample+nValid)
+   for i=1,nExample+nValid do
+      local clusterIdx = torch.random(1,nCluster)
+      table.insert(clusterExamples[clusterIdx], i)
+
+      local stdiv = i <= nExample and 100 or 1000
+      inputs[i]:normal():div(stdiv):add(clusterCenter[clusterIdx])
+
+      local label = clusterLabel[clusterIdx]
+      if math.random() < overlap then
+         targets[i] = label == 1 and 0 or 1
+      else
+         targets[i] = label
+      end
+   end
+
+   local _ = require 'moses'
+   local validSet = dt.DataSet(inputs:narrow(1,nExample+1,nValid), targets:narrow(1,nExample+1,nValid))
+   local trainSet = dt.DataSet(inputs:narrow(1,1,nExample), targets:narrow(1,1,nExample))
+
+   return trainSet, validSet, clusterExamples, inputs, targets
+end
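+
+-- Example (a sketch): the defaults yield a 100-example training DataSet of 10
+-- dense features in 10 clusters, plus a 10-example validation DataSet:
+--
+--   local trainSet, validSet = dt.getDenseDummyData(100)
+--   local ids = trainSet:getExampleIds() -- LongTensor of 100 example ids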
index 241dd195b2e324e369876b4c3122c906ce9f2d77..9ffdcf788abc10feca463b6f9b05a8f8d363d142 100644 (file)
@@ -19,6 +19,10 @@ ENDMACRO()
 
 MACRO(ADD_TORCH_PACKAGE package src luasrc)
   INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+  INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/contrib/torch/torch7/lib/TH)
+  INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/contrib/torch/torch7/lib/luaT)
+  INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/contrib/torch/torch7/lib/TH)
+  INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/contrib/torch/torch7/lib/luaT)
   INCLUDE_DIRECTORIES(${Torch_LUA_INCLUDE_DIR})
 
  ### C/C++ sources