path: root/contrib/lua-torch/decisiontree
author    Vsevolod Stakhov <vsevolod@highsecure.ru>  2018-05-23 18:14:15 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru>  2018-05-23 18:14:15 +0100
commit    714eb56e1760fdfb26afccde92664d3a2f1e8435 (patch)
tree      84d1399acbb92f852b4bd64f9ea5412680b0c6ab /contrib/lua-torch/decisiontree
parent    220a51ff68013dd668a45b78c60a7b8bfc10f074 (diff)
download  rspamd-714eb56e1760fdfb26afccde92664d3a2f1e8435.tar.gz
          rspamd-714eb56e1760fdfb26afccde92664d3a2f1e8435.zip
[Minor] Move lua contrib libraries to lua- prefix
Diffstat (limited to 'contrib/lua-torch/decisiontree')
-rw-r--r--  contrib/lua-torch/decisiontree/CMakeLists.txt  54
-rw-r--r--  contrib/lua-torch/decisiontree/CartNode.lua  42
-rw-r--r--  contrib/lua-torch/decisiontree/CartTrainer.lua  180
-rw-r--r--  contrib/lua-torch/decisiontree/CartTree.lua  90
-rw-r--r--  contrib/lua-torch/decisiontree/DFD.lua  182
-rw-r--r--  contrib/lua-torch/decisiontree/DataSet.lua  142
-rw-r--r--  contrib/lua-torch/decisiontree/DecisionForest.lua  82
-rw-r--r--  contrib/lua-torch/decisiontree/DecisionForestTrainer.lua  22
-rw-r--r--  contrib/lua-torch/decisiontree/DecisionTree.lua  12
-rw-r--r--  contrib/lua-torch/decisiontree/GBDT_common.h  106
-rw-r--r--  contrib/lua-torch/decisiontree/GiniState.lua  54
-rw-r--r--  contrib/lua-torch/decisiontree/GradientBoostState.lua  57
-rw-r--r--  contrib/lua-torch/decisiontree/GradientBoostTrainer.lua  244
-rw-r--r--  contrib/lua-torch/decisiontree/LICENSE  201
-rw-r--r--  contrib/lua-torch/decisiontree/LogitBoostCriterion.lua  45
-rw-r--r--  contrib/lua-torch/decisiontree/MSECriterion.lua  13
-rw-r--r--  contrib/lua-torch/decisiontree/README.md  386
-rw-r--r--  contrib/lua-torch/decisiontree/RandomForestTrainer.lua  159
-rw-r--r--  contrib/lua-torch/decisiontree/Sparse2Dense.lua  88
-rw-r--r--  contrib/lua-torch/decisiontree/SparseTensor.lua  54
-rw-r--r--  contrib/lua-torch/decisiontree/TreeState.lua  191
-rw-r--r--  contrib/lua-torch/decisiontree/WorkPool.lua  156
-rw-r--r--  contrib/lua-torch/decisiontree/_env.lua  5
-rw-r--r--  contrib/lua-torch/decisiontree/benchmark.lua  171
-rw-r--r--  contrib/lua-torch/decisiontree/doc/benchmark.md  291
-rw-r--r--  contrib/lua-torch/decisiontree/error.h  24
-rw-r--r--  contrib/lua-torch/decisiontree/generic/CartTree.c  88
-rw-r--r--  contrib/lua-torch/decisiontree/generic/DFD.c  157
-rw-r--r--  contrib/lua-torch/decisiontree/generic/GBDT.c  392
-rw-r--r--  contrib/lua-torch/decisiontree/generic/GBDT_internal.c  312
-rw-r--r--  contrib/lua-torch/decisiontree/generic/GBDT_internal.h  34
-rw-r--r--  contrib/lua-torch/decisiontree/generic/LogitBoostCriterion.c  90
-rw-r--r--  contrib/lua-torch/decisiontree/generic/S2D.c  90
-rw-r--r--  contrib/lua-torch/decisiontree/hash_map.c  445
-rw-r--r--  contrib/lua-torch/decisiontree/hash_map.h  36
-rw-r--r--  contrib/lua-torch/decisiontree/init.c  77
-rw-r--r--  contrib/lua-torch/decisiontree/init.lua  70
-rw-r--r--  contrib/lua-torch/decisiontree/internal_hash_map.h  13
-rw-r--r--  contrib/lua-torch/decisiontree/khash.h  627
-rw-r--r--  contrib/lua-torch/decisiontree/math.lua  84
-rw-r--r--  contrib/lua-torch/decisiontree/rocks/decisiontree-scm-1.rockspec  40
-rw-r--r--  contrib/lua-torch/decisiontree/test.lua  817
-rw-r--r--  contrib/lua-torch/decisiontree/utils.h  45
-rw-r--r--  contrib/lua-torch/decisiontree/utils.lua  125
44 files changed, 6593 insertions, 0 deletions
diff --git a/contrib/lua-torch/decisiontree/CMakeLists.txt b/contrib/lua-torch/decisiontree/CMakeLists.txt
new file mode 100644
index 000000000..6434eddd9
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/CMakeLists.txt
@@ -0,0 +1,54 @@
+SET(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR})
+
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
+CMAKE_POLICY(VERSION 2.6)
+
+SET(src
+ init.c
+ hash_map.c
+)
+SET(luasrc
+ _env.lua
+ benchmark.lua
+ CartNode.lua
+ CartTrainer.lua
+ CartTree.lua
+ DataSet.lua
+ DecisionForest.lua
+ DecisionForestTrainer.lua
+ DecisionTree.lua
+ DFD.lua
+ GiniState.lua
+ GradientBoostState.lua
+ GradientBoostTrainer.lua
+ init.lua
+ LogitBoostCriterion.lua
+ math.lua
+ MSECriterion.lua
+ RandomForestTrainer.lua
+ Sparse2Dense.lua
+ SparseTensor.lua
+ test.lua
+ TreeState.lua
+ utils.lua
+ WorkPool.lua
+)
+
+IF (WITH_OPENMP)
+ FIND_PACKAGE(OpenMP)
+ IF(OPENMP_FOUND)
+ MESSAGE(STATUS "Compiling with OpenMP support")
+ SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+ SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+ ENDIF(OPENMP_FOUND)
+ENDIF (WITH_OPENMP)
+
+ADD_TORCH_PACKAGE(decisiontree "${src}" "${luasrc}" "A decision tree library, for Torch")
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+### Torch packages expect the library prefix to be "lib"
+SET_TARGET_PROPERTIES(decisiontree PROPERTIES
+ PREFIX "lib"
+ IMPORT_PREFIX "lib")
+TARGET_LINK_LIBRARIES(decisiontree ${TH_LIBRARIES})
+INSTALL(TARGETS decisiontree DESTINATION ${RSPAMD_LIBDIR})
diff --git a/contrib/lua-torch/decisiontree/CartNode.lua b/contrib/lua-torch/decisiontree/CartNode.lua
new file mode 100644
index 000000000..e6a85006c
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/CartNode.lua
@@ -0,0 +1,42 @@
+local dt = require 'decisiontree._env'
+local CartNode = torch.class("dt.CartNode", dt)
+
+function CartNode:__init(nodeId, leftChild, rightChild, splitFeatureId, splitFeatureValue, score, splitGain)
+ self.nodeId = nodeId or 0
+ self.leftChild = leftChild
+ self.rightChild = rightChild
+ self.splitFeatureId = splitFeatureId or -1
+ self.splitFeatureValue = splitFeatureValue or 0
+ self.score = score or 0
+ self.splitGain = splitGain
+end
+
+function CartNode:__tostring__()
+ return self:recursivetostring()
+end
+
+function CartNode:recursivetostring(indent)
+ indent = indent or ' '
+
+ -- Is this a leaf node?
+ local res = ''
+ if not (self.leftChild or self.rightChild) then
+ res = res .. self.score .. '\n'
+ else
+ -- Print the criteria
+ res = res .. 'input[' .. self.splitFeatureId .. '] <' .. self.splitFeatureValue .. '?\n'
+
+ -- Print the branches
+ if self.leftChild then
+ res = res .. indent .. 'True->' .. self.leftChild:recursivetostring(indent .. ' ')
+ end
+ if self.rightChild then
+ res = res .. indent .. 'False->' .. self.rightChild:recursivetostring(indent .. ' ')
+ end
+ end
+ return res
+end
+
+function CartNode:clone()
+ return CartNode(self.nodeId, self.leftChild, self.rightChild, self.splitFeatureId, self.splitFeatureValue, self.score, self.splitGain)
+end
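
For reference, a minimal sketch of how CartNode composes into a tree (not part of the diff; it assumes the decisiontree package is built and loadable):

    local dt = require 'decisiontree'

    -- a two-leaf stump built by hand: leaves carry scores, the root carries the split
    local leftLeaf = dt.CartNode(1, nil, nil, -1, 0, 0.9)    -- leaf with score 0.9
    local rightLeaf = dt.CartNode(2, nil, nil, -1, 0, -0.3)  -- leaf with score -0.3
    local root = dt.CartNode(0, leftLeaf, rightLeaf, 5, 2.5, 0) -- split on input[5] < 2.5
    print(root) -- __tostring__ prints the branch structure recursively
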
diff --git a/contrib/lua-torch/decisiontree/CartTrainer.lua b/contrib/lua-torch/decisiontree/CartTrainer.lua
new file mode 100644
index 000000000..63ae6c148
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/CartTrainer.lua
@@ -0,0 +1,180 @@
+local dt = require "decisiontree._env"
+local _ = require "moses"
+
+local CartTrainer = torch.class("dt.CartTrainer", dt)
+
+-- Generic CART trainer
+function CartTrainer:__init(dataset, minLeafSize, maxLeafNodes)
+ assert(torch.isTypeOf(dataset, 'dt.DataSet'))
+ self.dataset = dataset
+ self.minLeafSize = assert(minLeafSize) -- min examples per leaf
+ self.maxLeafNodes = assert(maxLeafNodes) -- max leaf nodes in tree
+
+ -- by default, single thread
+ self.parallelMode = 'singlethread'
+end
+
+function CartTrainer:train(rootTreeState, activeFeatures)
+ assert(torch.isTypeOf(rootTreeState, 'dt.TreeState'))
+ assert(torch.isTensor(activeFeatures))
+ local root = dt.CartNode()
+ root.id = 0
+ root.score = rootTreeState:score(self.dataset)
+
+ local nleaf = 1
+
+ -- TODO : nodeparallel: parallelize here. The queue is a workqueue.
+ local queue = {}
+ table.insert(queue, 1, {cartNode=root, treeState=rootTreeState})
+
+ while #queue > 0 and nleaf < self.maxLeafNodes do
+ local treeGrowerArgs = table.remove(queue, #queue)
+ local currentTreeState = treeGrowerArgs.treeState
+
+ -- Note: if minLeafSize = 1 and maxLeafNode = inf, then each example will be its own leaf...
+ if self:hasEnoughTrainingExamplesToSplit(currentTreeState.exampleIds:size(1)) then
+ nleaf = self:processNode(nleaf, queue, treeGrowerArgs.cartNode, currentTreeState, activeFeatures)
+ end
+ end
+
+ -- CartTree with random branching (when feature is missing)
+ local branchleft = function() return math.random() < 0.5 end
+ return dt.CartTree(root, branchleft), nleaf
+end
+
+function CartTrainer:processNode(nleaf, queue, node, treeState, activeFeatures)
+ local bestSplit
+ if self.parallelMode == 'singlethread' then
+ bestSplit = self:findBestSplitForAllFeatures(treeState, activeFeatures)
+ elseif self.parallelMode == 'featureparallel' then
+ bestSplit = self:findBestSplitForAllFeaturesFP(treeState, activeFeatures)
+ else
+ error("Unrecognized parallel mode: " .. self.parallelMode)
+ end
+
+ if bestSplit then
+ local leftTreeState, rightTreeState = treeState:branch(bestSplit, self.dataset)
+ assert(bestSplit.leftChildSize + bestSplit.rightChildSize == leftTreeState.exampleIds:size(1) + rightTreeState.exampleIds:size(1), "The left and right subtrees don't match the split found!")
+ self:setValuesAndCreateChildrenForNode(node, bestSplit, leftTreeState, rightTreeState, nleaf)
+
+ table.insert(queue, 1, {cartNode=node.leftChild, treeState=leftTreeState})
+ table.insert(queue, 1, {cartNode=node.rightChild, treeState=rightTreeState})
+
+ return nleaf + 1
+ end
+
+ return nleaf
+end
+
+function CartTrainer:findBestSplitForAllFeatures(treeState, activeFeatures)
+ local timer = torch.Timer()
+ local bestSplit = treeState:findBestSplit(self.dataset, activeFeatures, self.minLeafSize, -1, -1)
+
+ if bestSplit then
+ assert(torch.type(bestSplit) == 'table')
+ end
+
+ if dt.PROFILE then
+ print("findBestSplitForAllFeatures time="..timer:time().real)
+ end
+ return bestSplit
+end
+
+-- Updates the parentNode with the bestSplit information by creating left/right child nodes.
+function CartTrainer:setValuesAndCreateChildrenForNode(parentNode, bestSplit, leftState, rightState, nleaf)
+ assert(torch.isTypeOf(parentNode, 'dt.CartNode'))
+ assert(torch.type(bestSplit) == 'table')
+ assert(torch.isTypeOf(leftState, 'dt.TreeState'))
+ assert(torch.isTypeOf(rightState, 'dt.TreeState'))
+ assert(torch.type(nleaf) == 'number')
+
+ local leftChild = dt.CartNode()
+ leftChild.score = leftState:score(self.dataset)
+ leftChild.nodeId = 2 * nleaf - 1
+
+ local rightChild = dt.CartNode()
+ rightChild.score = rightState:score(self.dataset)
+ rightChild.nodeId = 2 * nleaf
+
+ parentNode.splitFeatureId = bestSplit.splitId
+ parentNode.splitFeatureValue = bestSplit.splitValue
+ parentNode.leftChild = leftChild
+ parentNode.rightChild = rightChild
+ parentNode.splitGain = bestSplit.splitGain
+end
+
+-- We need at least 2 * minLeafSize examples in the parent to satisfy >= minLeafSize examples per child
+function CartTrainer:hasEnoughTrainingExamplesToSplit(count)
+ return count >= 2 * self.minLeafSize
+end
+
+-- call before training to enable feature-parallelization
+function CartTrainer:featureParallel(workPool)
+ assert(self.parallelMode == 'singlethread', self.parallelMode)
+ self.parallelMode = 'featureparallel'
+ self.workPool = torch.type(workPool) == 'number' and dt.WorkPool(workPool) or workPool
+ assert(torch.isTypeOf(self.workPool, 'dt.WorkPool'))
+
+ -- this deletes all SparseTensor hash maps so that they aren't serialized
+ self.dataset:deleteIndex()
+
+ -- require the dt package
+ self.workPool:update('require', {libname='decisiontree',varname='dt'})
+ -- setup worker store (each worker will have its own copy)
+ local store = {
+ dataset=self.dataset,
+ minLeafSize=self.minLeafSize
+ }
+ self.workPool:update('storeKeysValues', store)
+end
+
+-- feature parallel
+function CartTrainer:findBestSplitForAllFeaturesFP(treeState, activeFeatures)
+ local timer = torch.Timer()
+ local bestSplit
+ if treeState.findBestSplitFP then
+ bestSplit = treeState:findBestSplitFP(self.dataset, activeFeatures, self.minLeafSize, self.workPool.nThread)
+ end
+
+ if not bestSplit then
+ for i=1,self.workPool.nThread do
+ -- upvalues
+ local treeState = treeState
+ local shardId = i
+ local nShard = self.workPool.nThread
+ local featureIds = activeFeatures
+ -- closure
+ local task = function(store)
+ assert(store.dataset)
+ assert(store.minLeafSize)
+ if treeState.threadInitialize then
+ treeState:threadInitialize()
+ end
+
+ local bestSplit = treeState:findBestSplit(store.dataset, featureIds, store.minLeafSize, shardId, nShard)
+ return bestSplit
+ end
+
+ self.workPool:writeup('execute', task)
+ end
+
+ for i=1,self.workPool.nThread do
+ local taskname, candidateSplit = self.workPool:read()
+ assert(taskname == 'execute')
+ if candidateSplit then
+ if ((not bestSplit) or candidateSplit.splitGain < bestSplit.splitGain) then
+ bestSplit = candidateSplit
+ end
+ end
+ end
+ end
+
+ if bestSplit then
+ assert(torch.type(bestSplit) == 'table')
+ end
+
+ if dt.PROFILE then
+ print("findBestSplitForAllFeaturesFP time="..timer:time().real)
+ end
+ return bestSplit
+end
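
A minimal single-threaded training sketch, under stated assumptions: a dense binary-classification dataset, and the generic TreeState:findBestSplit provided by TreeState.lua (listed in the diffstat above but not shown here). The values are illustrative:

    local dt = require 'decisiontree'

    local input = torch.rand(100, 10)             -- 100 examples, 10 dense features
    local target = torch.Tensor(100):random(0, 1) -- binary targets
    local dataset = dt.DataSet(input, target)

    local trainer = dt.CartTrainer(dataset, 5, 16) -- minLeafSize=5, maxLeafNodes=16
    local state = dt.GiniState(dataset:getExampleIds())
    local activeFeatures = torch.LongTensor():range(1, 10)
    local tree, nleaf = trainer:train(state, activeFeatures)
    print(nleaf)
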
diff --git a/contrib/lua-torch/decisiontree/CartTree.lua b/contrib/lua-torch/decisiontree/CartTree.lua
new file mode 100644
index 000000000..c74dfda9e
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/CartTree.lua
@@ -0,0 +1,90 @@
+local _ = require "moses"
+local dt = require 'decisiontree._env'
+
+-- CART (classification-regression decision tree).
+-- The example is always branched to the left when the splitting feature is missing.
+local CartTree = torch.class("dt.CartTree", "dt.DecisionTree", dt)
+
+function CartTree:__init(root, branchleft)
+ assert(torch.isTypeOf(root, 'dt.CartNode'))
+ self.root = root
+ self.branchleft = branchleft or function() return true end
+end
+
+-- TODO optimize this
+function CartTree:score(input, stack, optimized)
+ if optimized == true and stack == nil and torch.isTensor(input) and input.isContiguous and input:isContiguous() and input:nDimension() == 2 then
+ return input.nn.CartTreeFastScore(input, self.root, input.new())
+ end
+ return self:recursivescore(self.root, input, stack)
+end
+
+-- Continuous: if input[node.splitFeatureId] < node.splitFeatureValue then leftNode else rightNode
+-- Binary: if input[node.splitFeatureId] == 0 then leftNode else rightNode
+-- when stack is provided, it is returned as the third argument containing the stack of nodes from root to leaf
+function CartTree:recursivescore(node, input, stack)
+ assert(torch.isTypeOf(node, 'dt.CartNode'))
+
+ if stack then
+ stack = torch.type(stack) == 'table' and stack or {}
+ table.insert(stack, node)
+ end
+
+ if not (node.leftChild or node.rightChild) then
+ return node.score, node.nodeId, stack
+ elseif not node.leftChild then
+ return self:recursivescore(node.rightChild, input, stack)
+ elseif not node.rightChild then
+ return self:recursivescore(node.leftChild, input, stack)
+ end
+
+ local splitId = node.splitFeatureId
+ local splitVal = node.splitFeatureValue
+
+ if input[splitId] then -- if has key
+ local featureVal = input[splitId]
+ local nextNode = featureVal < splitVal and node.leftChild or node.rightChild
+ return self:recursivescore(nextNode, input, stack)
+ end
+
+ -- if feature is missing, branch left
+ local nextNode = self.branchleft() and node.leftChild or node.rightChild
+ return self:recursivescore(nextNode, input, stack)
+end
+
+function CartTree:__tostring__()
+ return self.root:recursivetostring()
+end
+
+-- expects a stack returned by score
+function CartTree:stackToString(stack, input)
+ assert(torch.type(stack) == 'table')
+ assert(torch.isTypeOf(stack[1], 'dt.CartNode'))
+
+ local res = 'Stack nodes from root to leaf\n'
+ for i,node in ipairs(stack) do
+ if not (node.leftChild or node.rightChild) then
+ res = res .. "score="..node.score .. '\n'
+ else
+ local istr = ''
+ if input then
+ istr = '=' .. (input[node.splitFeatureId] or 'nil')
+ end
+ res = res .. 'input[' .. node.splitFeatureId .. ']' .. istr ..' < ' .. node.splitFeatureValue .. ' ? '
+ res = res .. '(' .. ((node.leftChild and node.rightChild) and 'LR' or node.leftChild and 'L' or node.rightChild and 'R' or 'WAT?') .. ') '
+ if node.leftChild == stack[i+1] then
+ res = res .. 'Left\n'
+ elseif node.rightChild == stack[i+1] then
+ res = res .. 'Right\n'
+ else
+ error"stackToString error"
+ end
+ end
+ end
+ return res .. #stack .. " nodes"
+end
+
+function CartTree:clone()
+ return CartTree(self.root:clone(), self.branchleft)
+end
+
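A scoring sketch (assumptions: torch.SparseTensor(keys, values) is the constructor provided by SparseTensor.lua from this diff, and `tree` is a dt.CartTree such as the one returned by CartTrainer:train):

    local input = torch.SparseTensor(torch.LongTensor{5}, torch.Tensor{1.7})
    local score, nodeId, stack = tree:score(input, {})
    print(score, nodeId)
    print(tree:stackToString(stack, input)) -- root-to-leaf path taken by this input
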
diff --git a/contrib/lua-torch/decisiontree/DFD.lua b/contrib/lua-torch/decisiontree/DFD.lua
new file mode 100644
index 000000000..e4746212a
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/DFD.lua
@@ -0,0 +1,182 @@
+-- nn.DFD: Decision Forest Discretizer
+-- Takes a dense input and outputs a sparse output.
+-- Each node in the forest is its own feature.
+-- When a node is traversed, its corresponding feature takes on a value of 1.
+-- For all non-traversed nodes, the feature is 0.
+local DFD, parent = torch.class("nn.DFD", "nn.Module")
+
+-- TODO: add :type, as the default will convert the long tensors
+function DFD:__init(df, onlyLastNode)
+ parent.__init(self)
+ if torch.type(df) == 'table' then
+ self:reconstructFromInfo(df)
+ else
+ assert(torch.type(df) == 'dt.DecisionForest')
+
+ self.rootIds = torch.LongTensor()
+ -- nodeId of left and right child nodes
+ self.leftChild = torch.LongTensor()
+ self.rightChild = torch.LongTensor()
+ -- index and value of the feature that splits this node
+ self.splitFeatureId = torch.LongTensor()
+ self.splitFeatureValue = torch.Tensor()
+ -- initialize state given df
+ self:convertForest2Tensors(df)
+ self:clearState()
+ end
+ self.onlyLastNode = onlyLastNode
+ self.nTrees = self.rootIds:size(1)
+end
+
+-- converts a DecisionForest to efficient tensor representation
+function DFD:convertForest2Tensors(df)
+ self.rootIds:resize(#df.trees)
+
+ -- nodeId will map to featureId
+ local nodeId = 0
+ -- sets nodeIds of all subnodes
+ -- and measures the maximum depth over all trees
+ local function recursiveTree(node, depth)
+ depth = (depth or 0) + 1
+ local rdepth = depth
+ nodeId = nodeId + 1
+ node._nodeId = nodeId
+
+ if node.leftChild then
+ rdepth = math.max(rdepth, recursiveTree(node.leftChild, depth))
+ end
+ if node.rightChild then
+ rdepth = math.max(rdepth, recursiveTree(node.rightChild, depth))
+ end
+ return rdepth
+ end
+
+ -- sum over trees of max depth
+ self.depth = 0
+ for i,tree in ipairs(df.trees) do
+ assert(torch.isTypeOf(tree.root, 'dt.CartNode'))
+ self.depth = self.depth + recursiveTree(tree.root)
+ end
+ -- remove roots from depth
+ self.depth = self.depth - self.rootIds:size(1)
+
+ -- total number of nodes in all trees
+ self.nNode = nodeId
+
+ -- nodeId of left and right child nodes
+ self.leftChild:resize(self.nNode):fill(-1)
+ self.rightChild:resize(self.nNode):fill(-1)
+ -- index and value of the feature that splits this node
+ self.splitFeatureId:resize(self.nNode):fill(-1)
+ self.splitFeatureValue:resize(self.nNode):fill(-1)
+
+ -- aggregates CartNode attributes to an efficient tensor representation
+ local function recursiveTree2(node)
+ local nodeId = assert(node._nodeId)
+ assert(self.splitFeatureId[nodeId] == -1)
+
+ if node.leftChild then
+ self.leftChild[nodeId] = assert(node.leftChild._nodeId)
+ recursiveTree2(node.leftChild)
+ else
+ self.leftChild[nodeId] = 0
+ end
+
+ if node.rightChild then
+ self.rightChild[nodeId] = assert(node.rightChild._nodeId)
+ recursiveTree2(node.rightChild)
+ else
+ self.rightChild[nodeId] = 0
+ end
+
+ -- each node splits the dataset on a feature id-value pair
+ self.splitFeatureId[nodeId] = assert(node.splitFeatureId)
+ self.splitFeatureValue[nodeId] = assert(node.splitFeatureValue)
+ end
+
+ for i,tree in ipairs(df.trees) do
+ self.rootIds[i] = assert(tree.root._nodeId)
+ recursiveTree2(tree.root)
+ end
+
+ assert(self.leftChild:min() >= 0)
+ assert(self.rightChild:min() >= 0)
+end
+
+-- input is a batchsize x inputsize tensor
+function DFD:updateOutput(input)
+ assert(torch.isTensor(input))
+ assert(input:dim() == 2)
+ input = input:contiguous()
+
+ local batchsize, inputsize = input:size(1), input:size(2)
+ local size = self.onlyLastNode and self.nTrees or self.depth
+
+ -- each example's output keys are resized to the maximum size they can take on (sum of tree depths, or nTrees if onlyLastNode)
+ self.outputkeys = self.outputkeys or torch.LongTensor()
+ self.outputkeys:resize(batchsize, size)
+ -- values are 1
+ self.outputvalues = self.outputvalues or input.new()
+ self.outputvalues:resize(batchsize, size):fill(1)
+
+ self.output = input.nn.DFD_computeOutput(self.outputkeys, self.outputvalues, self.rootIds, self.leftChild, self.rightChild, self.splitFeatureId, self.splitFeatureValue, input, self.onlyLastNode)
+ return self.output
+end
+
+function DFD:type(type, tensorCache)
+ if type then
+ local info = self:getReconstructionInfo()
+ for k, v in pairs(info) do
+ if torch.type(v) ~= 'torch.LongTensor' then
+ info[k] = nil
+ end
+ end
+ parent.type(self, type, tensorCache)
+ self:reconstructFromInfo(info)
+ return self
+ else
+ return parent.type(self)
+ end
+end
+
+function DFD:updateGradInput()
+ error"Not Implemented"
+end
+
+function DFD:clearState()
+ self.output = {{},{}}
+ self.taskbuffer = {}
+ self.outputkeys = nil
+ self.outputvalues = nil
+ self._range = nil
+ self._indices = nil
+ self._mask = nil
+end
+
+function DFD:reconstructFromInfo(DFDinfo)
+ for k,v in pairs(DFDinfo) do
+ self[k] = v
+ end
+ assert(self.leftChild:nDimension() == 1)
+ assert(self.rightChild:nDimension() == 1)
+ assert(self.leftChild:size(1) == self.nNode)
+ assert(self.rightChild:size(1) == self.nNode)
+ assert(self.leftChild:min() >= 0)
+ assert(self.rightChild:min() >= 0)
+ assert(self.splitFeatureId:nDimension() == 1)
+ assert(self.splitFeatureValue:nDimension() == 1)
+ assert(self.splitFeatureId:size(1) == self.splitFeatureValue:size(1))
+end
+
+function DFD:getReconstructionInfo()
+ local DFDinfo = {
+ nNode = self.nNode,
+ rootIds = self.rootIds,
+ leftChild = self.leftChild,
+ rightChild = self.rightChild,
+ splitFeatureId = self.splitFeatureId,
+ splitFeatureValue = self.splitFeatureValue,
+ depth = self.depth
+ }
+ return DFDinfo
+end
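
A discretization sketch (it assumes `forest` is a trained dt.DecisionForest); per clearState above, the output is a pair {keys, values}:

    local dfd = nn.DFD(forest)
    local batch = torch.rand(32, 10) -- batchsize x inputsize, dense
    local output = dfd:forward(batch)
    local keys, values = output[1], output[2]
    -- keys hold the ids of the nodes traversed by each example; values are all 1
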
diff --git a/contrib/lua-torch/decisiontree/DataSet.lua b/contrib/lua-torch/decisiontree/DataSet.lua
new file mode 100644
index 000000000..15058a7c6
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/DataSet.lua
@@ -0,0 +1,142 @@
+local dt = require "decisiontree._env"
+
+local DataSet = torch.class("dt.DataSet", dt)
+
+function DataSet:__init(input, target, nThreads)
+ if torch.type(input) == 'table' then
+ assert(torch.isTypeOf(input[1], 'torch.SparseTensor'))
+ else
+ assert(torch.isTensor(input))
+ end
+ self.input = input
+ assert(torch.isTensor(target))
+ self.target = target
+ self.nThreads = nThreads or 1
+
+ self.sortedFeatureValues, self.featureIds = self:sortFeatureValues(input)
+end
+
+-- group examples by featureId. For each featureId, sort examples by featureValue (ascending order)
+-- returns a table mapping featureIds to sorted lists of exampleIds
+-- e.g. {featureId={example1,example2,example3}}
+function DataSet:sortFeatureValues(inputs)
+ local isSparse = torch.typename(inputs[1]):match('torch.*SparseTensor')
+ assert(isSparse or torch.isTensor(inputs))
+
+ local featureIds = torch.LongTensor()
+ local dataset = {} -- TODO use tds.Hash (will require SparseTensor to be userdata)
+ if isSparse then
+ local proto = inputs[1].values
+ -- get list of featureIds
+ local featureMap = {}
+ for i,input in ipairs(inputs) do
+ input.keys:apply(function(key)
+ featureMap[key] = (featureMap[key] or 0) + 1
+ end)
+ end
+ local _ = require "moses"
+ featureIds = featureIds.new(_.keys(featureMap))
+ local featureCounts = torch.LongTensor(featureIds:size(1))
+ for i=1,featureIds:size(1) do
+ featureCounts[i] = featureMap[featureIds[i]]
+ end
+
+ for i=1,featureIds:size(1) do
+ local featureId = featureIds[i]
+ local featureCount = featureCounts[i]
+ dataset[featureId] = {
+ values=proto.new(featureCount),
+ examples=torch.LongTensor(featureCount),
+ i=0
+ }
+ end
+
+ for exampleId,input in ipairs(inputs) do
+ local sparseIdx = 0
+ input.keys:apply(function(key)
+ sparseIdx = sparseIdx + 1
+ local f = dataset[key]
+ f.i = f.i + 1
+ f.values[f.i] = input.values[sparseIdx]
+ f.examples[f.i] = exampleId
+ end)
+ end
+
+ local sortVal, sortIdx = proto.new(), torch.LongTensor()
+ for featureId,f in pairs(dataset) do
+ assert(f.values:size(1) == f.i)
+ sortVal:sort(sortIdx, f.values, 1, false)
+
+ local sortedExampleIds = torch.LongTensor(f.i)
+ sortedExampleIds:index(f.examples, 1, sortIdx)
+
+ dataset[featureId] = sortedExampleIds
+ end
+ else
+ assert(torch.isTensor(inputs))
+ featureIds:range(1,inputs:size(2))
+
+ for i=1,inputs:size(2) do
+ local featureId = i
+ local values = inputs:select(2, i)
+ local _, sortedFeatureExampleIds = values:sort(1, false)
+ dataset[featureId] = sortedFeatureExampleIds
+ end
+ end
+
+ return dataset, featureIds
+end
+
+function DataSet:getSortedFeature(featureId)
+ assert(self.sortedFeatureValues)
+ return self.sortedFeatureValues[featureId]
+end
+
+function DataSet:size()
+ return self.target:size(1)
+end
+
+function DataSet:getExampleIds()
+ if not self.exampleIds then
+ self.exampleIds = torch.LongTensor():range(1,self:size())
+ end
+ return self.exampleIds
+end
+
+function DataSet:countPositive(exampleIds)
+ assert(torch.type(exampleIds) == 'torch.LongTensor')
+ local dt = require 'decisiontree'
+ local buffer = dt.getBufferTable('DataSet')
+ buffer.tensor = buffer.tensor or self.target.new()
+ buffer.tensor:index(self.target, 1, exampleIds)
+ local nPositive = 0
+ buffer.tensor:apply(function(x)
+ if x > 0 then nPositive = nPositive + 1 end
+ end)
+ return nPositive
+end
+
+function DataSet:initScore()
+ self.score = self.score or torch.Tensor()
+ self.score:resize(self:size()):fill(0)
+end
+
+function DataSet:buildIndex()
+ if torch.type(self.input) == 'table' then
+ for exampleId,input in ipairs(self.input) do
+ if torch.isTypeOf(input, 'torch.SparseTensor') then
+ input:buildIndex()
+ end
+ end
+ end
+end
+
+function DataSet:deleteIndex()
+ if torch.type(self.input) == 'table' then
+ for exampleId,input in ipairs(self.input) do
+ if torch.isTypeOf(input, 'torch.SparseTensor') then
+ input:deleteIndex()
+ end
+ end
+ end
+end
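
A small sketch (not part of the diff) of what DataSet precomputes: for each feature, the exampleIds sorted by ascending feature value:

    local dt = require 'decisiontree'

    local input = torch.Tensor{{0.3}, {0.1}, {0.2}} -- 3 examples, 1 feature
    local target = torch.Tensor{1, 0, 1}
    local dataset = dt.DataSet(input, target)
    print(dataset:getSortedFeature(1)) -- 2, 3, 1: examples ordered by feature value
    print(dataset:countPositive(dataset:getExampleIds())) -- 2 targets are > 0
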
diff --git a/contrib/lua-torch/decisiontree/DecisionForest.lua b/contrib/lua-torch/decisiontree/DecisionForest.lua
new file mode 100644
index 000000000..cac748e7e
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/DecisionForest.lua
@@ -0,0 +1,82 @@
+local dt = require "decisiontree._env"
+
+-- Decision forest that ensembles a bag of decision trees.
+local DecisionForest = torch.class("dt.DecisionForest", "dt.DecisionTree", dt)
+
+function DecisionForest:__init(trees, weight, bias)
+ assert(torch.type(trees) == 'table')
+ self.trees = trees
+ if #trees == 0 then
+ self.weight = weight or torch.Tensor()
+ assert(torch.isTensor(self.weight))
+ assert(self.weight:nElement() == 0)
+ else
+ assert(torch.isTypeOf(trees[1], 'dt.DecisionTree'))
+ self.weight = weight or torch.Tensor(#trees):fill(1)
+ assert(torch.isTensor(self.weight))
+ assert(self.weight:dim() == 1)
+ assert(self.weight:min() >= 0, "Expecting positive weights")
+ assert(#trees == self.weight:size(1))
+ end
+
+ self.bias = bias or 0
+ assert(torch.type(self.bias) == 'number')
+end
+
+function DecisionForest:score(input, incrementalId)
+ assert(torch.isTensor(input))
+
+ local buffer = {}
+ if incrementalId then
+ self.buffers = self.buffers or {}
+ self.buffers[incrementalId] = self.buffers[incrementalId] or {}
+ buffer = self.buffers[incrementalId]
+ end
+ buffer.initialCounter = buffer.initialCounter or 0
+
+ -- TODO: score in parallel
+ local output
+ if torch.isTensor(input) and input.isContiguous and input:isContiguous() and input:nDimension() == 2 then
+ buffer.output = buffer.output or input.new()
+ output = buffer.output
+ assert(output:nElement() == 0 or output:size(1) == input:size(1))
+ if output:nElement() == 0 then
+ output:resize(input:size(1)):fill(self.bias)
+ end
+ for i,tree in ipairs(self.trees) do
+ if i > buffer.initialCounter then
+ local score = tree:score(input, nil, true)
+ output:add(self.weight[i], score)
+ end
+ end
+ else
+ output = buffer.output or self.bias
+ for i,tree in ipairs(self.trees) do
+ if i > buffer.initialCounter then
+ output = output + tree:score(input) * self.weight[i]
+ end
+ end
+ buffer.output = output
+ end
+
+ buffer.initialCounter = #self.trees
+
+ return output
+end
+
+function DecisionForest:add(tree, weight)
+ assert(torch.type(weight) == 'number')
+ assert(weight > 0)
+ table.insert(self.trees, tree)
+ self.weight:resize(#self.trees)
+ self.weight[#self.trees] = weight
+ return self
+end
+
+function DecisionForest:clone()
+ local trees = {}
+ for i, tree in ipairs(self.trees) do
+ trees[i] = tree:clone()
+ end
+ return DecisionForest(trees, self.weight:clone(), self.bias)
+end
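
The ensemble is additive: prediction = bias + sum_i weight[i] * tree_i(input). A usage sketch, assuming `tree` is any dt.DecisionTree:

    local forest = dt.DecisionForest({}, torch.Tensor(), 0.5) -- empty forest, bias 0.5
    forest:add(tree, 0.1) -- the weight plays the role of shrinkage in GBDT
    local prediction = forest:score(torch.rand(10))
    -- passing an incrementalId caches per-tree contributions between calls
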
diff --git a/contrib/lua-torch/decisiontree/DecisionForestTrainer.lua b/contrib/lua-torch/decisiontree/DecisionForestTrainer.lua
new file mode 100644
index 000000000..fc903678b
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/DecisionForestTrainer.lua
@@ -0,0 +1,22 @@
+local dt = require "decisiontree._env"
+
+-- Interface for all decisionForestTrainers
+local DFT = torch.class("dt.DecisionForestTrainer", dt)
+
+-- Train a DecisionForest with examples, a table of valid featureIds and a dataset (i.e. sortedExamplesByFeatureId)
+function DFT:train(examples, validFeatureIds, dataset)
+ assert(torch.type(examples) == "table")
+ assert(torch.isTypeOf(examples[1], "dt.LabeledExample"))
+
+ assert(torch.type(validFeatureIds) == 'table')
+
+ assert(torch.type(dataset) == 'table')
+ for k,v in pairs(dataset) do
+ assert(torch.type(v) == 'table')
+ assert(torch.isTypeOf(v[1], 'dt.LabeledExample'))
+ break
+ end
+ -- dataset is a table mapping featureIds to sorted lists of LabeledExamples
+ -- e.g. {featureId={example1,example2,example3}}
+ error"Not Implemented"
+end
diff --git a/contrib/lua-torch/decisiontree/DecisionTree.lua b/contrib/lua-torch/decisiontree/DecisionTree.lua
new file mode 100644
index 000000000..c61bc3757
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/DecisionTree.lua
@@ -0,0 +1,12 @@
+local dt = require "decisiontree._env"
+
+-- An interface for decision trees.
+local DecisionTree = torch.class("dt.DecisionTree", dt)
+
+-- Score an input example and return the prediction score.
+-- input is a Tensor or SparseTensor
+-- return prediction score and nodeId
+function DecisionTree:score(input)
+ error"Not Implemented"
+ return score, nodeId
+end
diff --git a/contrib/lua-torch/decisiontree/GBDT_common.h b/contrib/lua-torch/decisiontree/GBDT_common.h
new file mode 100644
index 000000000..eb993702d
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/GBDT_common.h
@@ -0,0 +1,106 @@
+#include "khash.h"
+#include <pthread.h>
+
+#define computeGradientBoostLoss(g, h) (-(g)*(g)/(h))
+
+// we use khash to make iteration faster than lua tables
+KHASH_SET_INIT_INT64(long)
+
+// defines the data we need for running an instance of the algorithm, plus its constructor/destructor
+typedef struct {
+ khash_t(long)* exampleMap;
+ THLongTensor *exampleIdsWithFeature_cache;
+ long minLeafSize;
+} GBRunData;
+
+
+// allocates data that cannot be shared between threads
+static void gb_local_create_run_data(GBRunData *run_data) {
+ run_data->exampleMap = kh_init(long);
+ run_data->exampleIdsWithFeature_cache = THLongTensor_new();
+}
+
+static void gb_create_run_data(GBRunData *run_data, int minLeafSize) {
+ gb_local_create_run_data(run_data);
+ run_data->minLeafSize = minLeafSize;
+}
+
+static void gb_destroy_run_data(GBRunData *run_data) {
+ THLongTensor_free(run_data->exampleIdsWithFeature_cache);
+ kh_destroy(long, run_data->exampleMap);
+}
+
+// initializes the data required by the optimizer for the given feature.
+static THLongTensor *gb_internal_prepare(lua_State *L, THLongTensor *exampleIds,
+ THLongTensor *exampleIdsWithFeature_cache, int input_index, long feature_id,
+ khash_t(long)* exampleMap) {
+ long *exampleIds_data = THLongTensor_data(exampleIds);
+ long exampleIds_size = THLongTensor_size(exampleIds, 0);
+
+ int ret = 0;
+
+ // if the input is a table, then we have a sparse dataset
+ if (lua_istable(L, input_index)) {
+ if (exampleIds_size == 0) {
+ return NULL;
+ }
+ else {
+ // loops over the examples' ids that this node has to evaluate and, if they have the feature
+ // we're looking for, marks them as present and stores them in the order provided by the
+ // dataset
+ THLongTensor_resize1d(exampleIdsWithFeature_cache, exampleIds_size);
+ kh_clear(long, exampleMap);
+ kh_resize(long, exampleMap, exampleIds_size*8);
+ long *exampleIdsWithFeature_data = THLongTensor_data(exampleIdsWithFeature_cache);
+ long j = 0;
+ // for each sample to be evaluated
+ for (long i = 0; i < exampleIds_size; i++) {
+ // gets the representation for the example
+ lua_pushinteger(L, exampleIds_data[i]);
+ lua_gettable(L, input_index);
+
+ // builds the index, which happens only once per thread for efficiency
+ lua_pushstring(L, "buildIndex");
+ lua_gettable(L, -2);
+ lua_pushvalue(L, -2);
+ lua_call(L, 1, 0);
+
+ // tries to get the feature for this sample
+ lua_pushinteger(L, feature_id);
+ lua_gettable(L, -2);
+ // if present, then...
+ if (!lua_isnil(L, -1)) {
+ // saves the example
+ exampleIdsWithFeature_data[j] = exampleIds_data[i];
+ j++;
+
+ // marks it as present in the hash table
+ kh_put(long, exampleMap, exampleIds_data[i], &ret);
+ }
+
+ lua_pop(L, 2);
+ }
+
+ // resizes to fit only the samples that have the feature
+ THLongTensor_resize1d(exampleIdsWithFeature_cache, j);
+ kh_resize(long, exampleMap, j*8);
+ return exampleIdsWithFeature_cache;
+ }
+ }
+ else {
+ // if the input isn't a table, then it's dense and no example can be missing the feature,
+ // so the result doesn't depend on feature_id
+ // since exampleIds is fixed between calls and this is going to store the same values to the
+ // same position, we can cache it between calls
+ if (kh_size(exampleMap) == 0) {
+ kh_resize(long, exampleMap, exampleIds_size*8);
+ for (long i = 0; i < exampleIds_size; i++) {
+ kh_put(long, exampleMap, exampleIds_data[i], &ret);
+ }
+ }
+ // notice that we just return the given tensor of ids instead of copying it. the rest of the
+ // code handles this transparently
+ return exampleIds;
+ }
+}
+
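A note on computeGradientBoostLoss above: for a leaf aggregating gradient g and hessian h, the second-order objective g*w + 0.5*h*w^2 is minimized by the Newton step w* = -g/h, with minimum -0.5*g^2/h; the macro drops the constant 0.5 since only comparisons between candidate splits matter. In Lua terms (a sketch, not part of the diff):

    local function newtonScore(g, h) return -g / h end
    local function gradientBoostLoss(g, h) return -(g * g) / h end -- smaller is better
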
diff --git a/contrib/lua-torch/decisiontree/GiniState.lua b/contrib/lua-torch/decisiontree/GiniState.lua
new file mode 100644
index 000000000..6dfed2845
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/GiniState.lua
@@ -0,0 +1,54 @@
+local dt = require 'decisiontree._env'
+
+-- used by RandomForestTrainer
+local GiniState, parent = torch.class("dt.GiniState", "dt.TreeState", dt)
+
+function GiniState:__init(exampleIds)
+ parent.__init(self, exampleIds)
+ self.nPositiveInLeftBranch = 0
+ self.nPositiveInRightBranch = 0
+end
+
+function GiniState:score(dataset)
+ local dt = require 'decisiontree'
+ local nPositive = dataset:countPositive(self.exampleIds)
+ return dt.calculateLogitScore(nPositive, self.exampleIds:size(1))
+end
+
+function GiniState:initialize(exampleIdsWithFeature, dataset)
+ assert(torch.type(exampleIdsWithFeature) == 'torch.LongTensor')
+ assert(torch.isTypeOf(dataset, 'dt.DataSet'))
+ self.nPositiveInLeftBranch = dataset:countPositive(exampleIdsWithFeature)
+ self.nPositiveInRightBranch = 0
+
+ self.nExampleInLeftBranch = exampleIdsWithFeature:size(1)
+ self.nExampleInRightBranch = 0
+end
+
+function GiniState:update(exampleId, dataset)
+ assert(torch.type(exampleId) == 'number')
+ assert(torch.isTypeOf(dataset, 'dt.DataSet'))
+ if dataset.target[exampleId] > 0 then
+ self.nPositiveInLeftBranch = self.nPositiveInLeftBranch - 1
+ self.nPositiveInRightBranch = self.nPositiveInRightBranch + 1
+ end
+
+ self.nExampleInLeftBranch = self.nExampleInLeftBranch - 1
+ self.nExampleInRightBranch = self.nExampleInRightBranch + 1
+end
+
+function GiniState:computeSplitInfo(splitFeatureId, splitFeatureValue)
+ local dt = require 'decisiontree'
+ local gini = dt.computeGini(self.nExampleInLeftBranch, self.nPositiveInLeftBranch, self.nExampleInRightBranch, self.nPositiveInRightBranch)
+ local splitInfo = {
+ splitId = assert(splitFeatureId),
+ splitValue = assert(splitFeatureValue),
+ leftChildSize = assert(self.nExampleInLeftBranch),
+ leftPositiveCount = assert(self.nPositiveInLeftBranch),
+ rightChildSize = assert(self.nExampleInRightBranch),
+ rightPositiveCount = assert(self.nPositiveInRightBranch),
+ gini = assert(gini),
+ splitGain = gini
+ }
+ return splitInfo
+end
\ No newline at end of file
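
For reference, a standard weighted-Gini formulation consistent with how computeSplitInfo consumes it. This is an assumption: dt.computeGini itself lives in math.lua, which this diff adds but which is not shown here:

    -- sketch of a weighted Gini impurity; lower is better, matching the
    -- "candidateSplit.splitGain < bestSplit.splitGain" comparison in CartTrainer
    local function impurity(n, nPositive)
       local p = nPositive / n
       return p * (1 - p)
    end
    local function computeGini(nLeft, posLeft, nRight, posRight)
       local n = nLeft + nRight
       return (nLeft * impurity(nLeft, posLeft) + nRight * impurity(nRight, posRight)) / n
    end
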
diff --git a/contrib/lua-torch/decisiontree/GradientBoostState.lua b/contrib/lua-torch/decisiontree/GradientBoostState.lua
new file mode 100644
index 000000000..f268f3da8
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/GradientBoostState.lua
@@ -0,0 +1,57 @@
+local dt = require 'decisiontree._env'
+
+local GradientBoostState, parent = torch.class("dt.GradientBoostState", "dt.TreeState", dt)
+
+function GradientBoostState:__init(exampleIds, gradInput, hessInput)
+ parent.__init(self, exampleIds)
+ self.gradInput = gradInput
+ self.hessInput = hessInput
+end
+
+function GradientBoostState:score(dataset)
+ local dt = require 'decisiontree'
+ local gradInput = self.gradInput:index(1, self.exampleIds)
+ local hessInput = self.hessInput:index(1, self.exampleIds)
+ return dt.computeNewtonScore(gradInput:sum(), hessInput:sum())
+end
+
+-- calls _branch and encapsulates the left and right exampleIds into a pair of TreeStates
+function GradientBoostState:branch(splitInfo, dataset)
+ local leftExampleIds, rightExampleIds = self:_branch(splitInfo, dataset)
+ return self.new(leftExampleIds, self.gradInput, self.hessInput), self.new(rightExampleIds, self.gradInput, self.hessInput)
+end
+
+-- Partitions self given a splitInfo table, producing a pair of exampleIds corresponding to the left and right subtrees.
+function GradientBoostState:_branch(splitInfo, dataset)
+ local input = dataset.input
+ -- if the input is dense, we can use the optimized version
+ if torch.isTensor(input) and input.isContiguous and input:isContiguous() and input:nDimension() == 2 then
+ return input.nn.GBDT_branch(splitInfo, input, self.exampleIds)
+ end
+ return parent._branch(self, splitInfo, dataset)
+end
+
+-- The following methods are supersets of each other. You can comment them out
+-- to fall back to the Lua implementation, keeping only the optimized core.
+
+-- THIS ONE CANNOT BE COMMENTED OUT
+function GradientBoostState:findBestFeatureSplit(dataset, featureId, minLeafSize)
+ local ret = self.hessInput.nn.GBDT_findBestFeatureSplit(self.exampleIds, dataset, featureId, minLeafSize, self.gradInput, self.hessInput)
+ return ret
+end
+
+-- finds the best split of examples in treeState among featureIds
+function GradientBoostState:findBestSplit(dataset, featureIds, minLeafSize, shardId, nShard)
+ local ret = self.hessInput.nn.GBDT_findBestSplit(self.exampleIds, dataset, featureIds, minLeafSize, shardId, nShard, self.gradInput, self.hessInput)
+ return ret
+end
+
+-- finds the best split like the previous one, but performs feature parallelism. Note that the
+-- optimization is only applied if the input is dense
+function GradientBoostState:findBestSplitFP(dataset, featureIds, minLeafSize, nThread)
+ local input = dataset.input
+ if torch.isTensor(input) and input.isContiguous and input:isContiguous() and input:nDimension() == 2 then
+ local ret = self.hessInput.nn.GBDT_findBestSplitFP(self.exampleIds, dataset, featureIds, minLeafSize, self.gradInput, self.hessInput, nThread)
+ return ret
+ end
+end
diff --git a/contrib/lua-torch/decisiontree/GradientBoostTrainer.lua b/contrib/lua-torch/decisiontree/GradientBoostTrainer.lua
new file mode 100644
index 000000000..51299b109
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/GradientBoostTrainer.lua
@@ -0,0 +1,244 @@
+local dt = require "decisiontree._env"
+
+-- Gradient boosted decision tree trainer
+local GradientBoostTrainer = torch.class("dt.GradientBoostTrainer", "dt.DecisionForestTrainer", dt)
+
+function GradientBoostTrainer:__init(opt)
+ assert(torch.type(opt) == 'table')
+
+ assert(torch.isTypeOf(opt.treeTrainer, 'dt.CartTrainer'))
+ self.treeTrainer = opt.treeTrainer
+
+ assert(torch.isTypeOf(opt.lossFunction, 'nn.Criterion'))
+ self.lossFunction = opt.lossFunction
+
+ assert(torch.type(opt.shrinkage) == 'number')
+ assert(opt.shrinkage > 0)
+ self.shrinkage = opt.shrinkage
+
+ assert(torch.type(opt.downsampleRatio) == 'number')
+ assert(opt.downsampleRatio > 0)
+ self.downsampleRatio = opt.downsampleRatio
+
+ assert(torch.type(opt.nTree) == 'number')
+ assert(opt.nTree > 0)
+ self.nTree = opt.nTree
+
+ opt.evalFreq = opt.evalFreq or -1
+ assert(torch.type(opt.evalFreq) == 'number')
+ assert(torch.round(opt.evalFreq) == opt.evalFreq)
+ self.evalFreq = opt.evalFreq
+
+ -- when non-positive, no early-stopping
+ opt.earlyStop = opt.earlyStop or (opt.evalFreq - 1)
+ assert(torch.type(opt.earlyStop) == 'number')
+ self.earlyStop = opt.earlyStop
+
+ -- when non-positive, defaults to sqrt(#feature)
+ assert(torch.type(opt.featureBaggingSize) == 'number')
+ self.featureBaggingSize = opt.featureBaggingSize
+
+ if opt.decisionForest then
+ assert(torch.isTypeOf(opt.decisionForest, 'dt.DecisionForest'))
+ end
+ self.decisionForest = opt.decisionForest
+
+ self.useInitBias = opt.useInitBias
+end
+
+function GradientBoostTrainer:computeBias(trainSet, verbose)
+ assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
+
+ if verbose then print("Use new bias generated from the training examples.") end
+
+ return -0.5 * self.gradInput:sum() / self.hessInput:sum()
+end
+
+
+function GradientBoostTrainer:initialize(trainSet, verbose)
+ assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
+
+ trainSet:initScore()
+ self.gradInput, self.hessInput = self.lossFunction:backward2(trainSet.score, trainSet.target)
+
+ -- used for early-stopping (see validate())
+ self.stopCount = 0
+ self.prevTrainLoss = math.huge
+ self.prevTestLoss = math.huge
+
+ if verbose then print("Processing initial decision forest") end
+
+ local decisionForest, bias
+
+ if self.decisionForest then
+ local bias = self.useInitBias and self.decisionForest.bias or self:computeBias(trainSet, verbose)
+
+ decisionForest = dt.DecisionForest(self.decisionForest.trees, self.decisionForest.weight, bias)
+
+ local input = trainSet.input
+ local score = trainSet.score
+ if torch.isTensor(input) and input.isContiguous and input:isContiguous() then
+ score:copy(decisionForest:score(input))
+ else
+ score:resize(trainSet:size())
+ for exampleId=1,trainSet:size() do
+ score[exampleId] = decisionForest:score(input[exampleId])
+ end
+ end
+ else
+ local bias = self:computeBias(trainSet, verbose)
+ decisionForest = dt.DecisionForest({}, torch.Tensor(), bias)
+
+ trainSet.score:fill(bias)
+ end
+
+ if verbose then print("Finish loading initial decision forest") end
+
+ return decisionForest
+end
+
+-- Trains a decision forest of boosted decision trees.
+-- trainSet contains the training examples; validSet is used for cross-validation and early-stopping.
+function GradientBoostTrainer:train(trainSet, featureIds, validSet, verbose)
+ assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
+ assert(torch.type(featureIds) == 'torch.LongTensor')
+ assert(torch.isTypeOf(validSet, 'dt.DataSet'))
+
+ local decisionForest = self:initialize(trainSet, verbose)
+ local bestDecisionForest
+
+ if verbose then print(string.format("Get %d featureIds.", featureIds:size(1))) end
+
+ local baggingSize = self.featureBaggingSize > 0 and self.featureBaggingSize or torch.round(math.sqrt(featureIds:size(1)))
+ local trainExampleIds = trainSet:getExampleIds()
+ local baggingIndices, activeFeatures
+ local treeExampleIds
+
+ local timer = torch.Timer()
+
+ for treeId = 1,self.nTree do
+ timer:reset()
+ if verbose then print(string.format("Begin processing tree number %d of %d", treeId, self.nTree)) end
+
+ -- Get active features
+ activeFeatures = activeFeatures or torch.LongTensor()
+ if baggingSize < featureIds:size(1) then
+ if verbose then print(string.format("Tree %d: Bagging %d from %d features", treeId, baggingSize, featureIds:size(1))) end
+
+ baggingIndices = baggingIndices or torch.LongTensor()
+ baggingIndices:randperm(featureIds:size(1))
+ activeFeatures:index(featureIds, 1, baggingIndices:narrow(1,1,baggingSize))
+ else
+ activeFeatures = featureIds
+ end
+
+ -- Get data samples
+ if self.downsampleRatio < 0.99 then
+ local sampleSize = torch.round(trainSet:size() * self.downsampleRatio)
+
+ if verbose then print(string.format("Tree %d: Downsampling %d of %d samples", treeId, sampleSize, trainSet:size())) end
+
+ baggingIndices = baggingIndices or torch.LongTensor()
+ baggingIndices:randperm(trainSet:size())
+
+ treeExampleIds = treeExampleIds or torch.LongTensor()
+ treeExampleIds:index(trainExampleIds, 1, baggingIndices:narrow(1,1,sampleSize))
+ else
+ treeExampleIds = trainExampleIds
+ end
+
+ if verbose then print(string.format("Tree %d: training CART tree", treeId)) end
+
+ local rootTreeState = dt.GradientBoostState(treeExampleIds, self.gradInput, self.hessInput)
+ local cartTree = self.treeTrainer:train(rootTreeState, activeFeatures)
+
+ if verbose then print(string.format("Tree %d: finished training CART tree in %f seconds", treeId, timer:time().real)) end
+
+ decisionForest:add(cartTree, self.shrinkage)
+
+ -- update score
+ local predictionScore
+ local input = trainSet.input
+ if torch.isTensor(input) and input:isContiguous() then
+ predictionScore = cartTree:score(trainSet.input, nil, true)
+ else
+ local size = trainSet:size()
+ predictionScore = torch.Tensor(size)
+ for exampleId=1,size do
+ predictionScore[exampleId] = cartTree:score(trainSet.input[exampleId])
+ end
+ end
+ trainSet.score:add(self.shrinkage, predictionScore)
+ self.gradInput, self.hessInput = self.lossFunction:backward2(trainSet.score, trainSet.target)
+
+ if verbose then print(string.format("Tree %d: training complete in %f seconds", treeId, timer:time().real)) end
+
+ -- cross-validation/early-stopping
+ if self.evalFreq > 0 and treeId % self.evalFreq == 0 then
+ timer:reset()
+ local stop, validLoss, bestDecisionForest = self:validate(trainSet, validSet, decisionForest, bestDecisionForest)
+ if dt.PROFILE then print("validate tree time: "..timer:time().real) end
+ if verbose then print(string.format("Loss: train=%7.4f, valid=%7.4f", 0, validLoss)) end
+ if stop then
+ if verbose then print(string.format("GBDT early stopped on tree %d", treeId)) end
+ break
+ end
+
+ end
+ end
+
+ return bestDecisionForest or decisionForest
+end
+
+function dt.GradientBoostTrainer:validate(trainSet, validSet, decisionForest, bestDecisionForest)
+ assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
+ assert(torch.isTypeOf(validSet, 'dt.DataSet'))
+ assert(torch.isTypeOf(decisionForest, 'dt.DecisionForest'))
+ assert(not bestDecisionForest or torch.isTypeOf(bestDecisionForest, 'dt.DecisionForest'))
+
+ -- buffer
+ local buffer = dt.getBufferTable('GradientBoost')
+ buffer.tensor = buffer.tensor or trainSet.score.new()
+ local score = buffer.tensor
+
+ -- per thread loss function (tensors are shared)
+ local lossname = torch.typename(self.lossFunction)
+ buffer[lossname] = buffer[lossname] or self.lossFunction:clone()
+ local lossFunction = buffer[lossname]
+
+ -- TODO batch this for large datasets
+ local input = validSet.input
+ if torch.isTensor(input) and input.isContiguous and input:isContiguous() then
+ score = decisionForest:score(input, 'val')
+ else
+ score:resize(validSet:size())
+ for exampleId=1,validSet:size() do
+ score[exampleId] = decisionForest:score(input[exampleId], 'val')
+ end
+ end
+ local validLoss = lossFunction:forward(score, validSet.target)
+
+ -- early stop is not enabled when earlyStop=0
+ local stop = false
+ if self.earlyStop > 0 then
+ -- Track test loss and detect early stop
+ if self.prevTestLoss - validLoss < 0 then
+ self.stopCount = self.stopCount + 1
+ else
+ bestDecisionForest = decisionForest:clone()
+ self.stopCount = 0
+ end
+
+ stop = self.stopCount >= self.earlyStop
+ end
+
+ self.prevTestLoss = validLoss
+
+ return stop, validLoss, bestDecisionForest
+end
+
+function GradientBoostTrainer:getName()
+ return string.format(
+ "gbdt-dRatio-%s-maxLeaf-%s-minExample-%s-nTree-%s-shrinkage-%s",
+ self.downsampleRatio, self.maxLeafNodes, self.minLeafSize, self.nTree, self.shrinkage
+ )
+end
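
An end-to-end sketch wiring the pieces above together (not part of the diff). The parameter names follow the asserts in __init; the values are illustrative, not tuned:

    local dt = require 'decisiontree'

    local trainSet = dt.DataSet(torch.rand(1000, 20), torch.Tensor(1000):random(0, 1))
    local validSet = dt.DataSet(torch.rand(200, 20), torch.Tensor(200):random(0, 1))

    local trainer = dt.GradientBoostTrainer{
       treeTrainer = dt.CartTrainer(trainSet, 10, 32), -- minLeafSize=10, maxLeafNodes=32
       lossFunction = nn.LogitBoostCriterion(false),
       shrinkage = 0.1,
       downsampleRatio = 0.8,
       nTree = 50,
       evalFreq = 5,
       earlyStop = 4,
       featureBaggingSize = -1, -- non-positive: defaults to sqrt(#features)
    }
    local featureIds = torch.LongTensor():range(1, 20)
    local forest = trainer:train(trainSet, featureIds, validSet, true)
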
diff --git a/contrib/lua-torch/decisiontree/LICENSE b/contrib/lua-torch/decisiontree/LICENSE
new file mode 100644
index 000000000..8dada3eda
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright {yyyy} {name of copyright owner}
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/contrib/lua-torch/decisiontree/LogitBoostCriterion.lua b/contrib/lua-torch/decisiontree/LogitBoostCriterion.lua
new file mode 100644
index 000000000..5b9eb6028
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/LogitBoostCriterion.lua
@@ -0,0 +1,45 @@
+local dt = require "decisiontree._env"
+
+-- Ref: slide 17 in https://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf
+
+-- equivalent to nn.Sigmoid() + nn.BCECriterion()
+local LogitBoostCriterion, parent = torch.class("nn.LogitBoostCriterion", "nn.Criterion")
+
+function LogitBoostCriterion:__init(sizeAverage)
+ parent.__init(self)
+ self.sizeAverage = sizeAverage
+ self.hessInput = self.gradInput.new()
+ self._output = torch.Tensor()
+end
+
+function LogitBoostCriterion:updateOutput(input, target)
+ input.nn.LogitBoostCriterion_updateOutput(input, target, self._output, self.sizeAverage)
+ self.output = self._output[1]
+ return self.output
+end
+
+function LogitBoostCriterion:updateGradInput(input, target)
+ input.nn.LogitBoostCriterion_updateGradInput(input, target, self.gradInput)
+ return self.gradInput
+end
+
+function LogitBoostCriterion:updateHessInput(input, target)
+ input.nn.LogitBoostCriterion_updateHessInput(input, target, self.hessInput)
+ return self.hessInput
+end
+
+-- returns gradInput and hessInput
+function LogitBoostCriterion:backward2(input, target)
+ return self:updateGradInput(input, target), self:updateHessInput(input, target)
+end
+
+local gradWrapper = function(input, target, grad)
+ input.nn.LogitBoostCriterion_updateGradInput(input, target, grad)
+end
+local hessianWrapper = function(input, target, hessian)
+ input.nn.LogitBoostCriterion_updateHessInput(input, target, hessian)
+end
+
+function LogitBoostCriterion:getWrappers()
+ return gradWrapper, hessianWrapper
+end
diff --git a/contrib/lua-torch/decisiontree/MSECriterion.lua b/contrib/lua-torch/decisiontree/MSECriterion.lua
new file mode 100644
index 000000000..948c1a17e
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/MSECriterion.lua
@@ -0,0 +1,13 @@
+local dt = require "decisiontree._env"
+
+function nn.MSECriterion.updateHessInput(self, input, target)
+ self.hessInput = self.hessInput or input.new()
+ self.hessInput:resize(input:size()):fill(2)
+ return self.hessInput
+end
+
+-- returns gradInput and hessInput
+function nn.MSECriterion.backward2(self, input, target)
+ return self:updateGradInput(input, target), self:updateHessInput(input, target)
+end
+
diff --git a/contrib/lua-torch/decisiontree/README.md b/contrib/lua-torch/decisiontree/README.md
new file mode 100644
index 000000000..db4622add
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/README.md
@@ -0,0 +1,386 @@
+# Torch decision tree library
+
+```lua
+local dt = require 'decisiontree'
+```
+
+This project implements random forests and gradient boosted decision trees (GBDT).
+Both use ensemble learning to produce ensembles of decision trees (that is, forests).
+
+## `nn.DFD`
+
+One practical application for decision forests is to *discretize* an input feature space into a richer output feature space.
+The `nn.DFD` Module can be used as a decision forest discretizer (DFD):
+
+```lua
+local dfd = nn.DFD(df, onlyLastNode)
+```
+
+where `df` is a `dt.DecisionForest` instance or the table returned by the method `getReconstructionInfo()` on another `nn.DFD` module, and `onlyLastNode` is a boolean indicating that the module should return only the ID of the last node visited in each tree (by default, it outputs all traversed nodes except the roots).
+The `nn.DFD` module requires dense `input` tensors.
+Sparse `input` tensors (tables of tensors) are not supported.
+The `output` returned by a call to `updateOutput` is a batch of sparse tensors.
+This `output` is a table where `output[1]` and `output[2]` are, respectively, lists of key and value tensors:
+
+```lua
+{
+ { [torch.LongTensor], ... , [torch.LongTensor] },
+ { [torch.Tensor], ... , [torch.Tensor] }
+}
+```
+
+This module doesn't support CUDA.
+
+### Example
+As a concrete example, let us first train a Random Forest on a dummy dense dataset:
+
+```lua
+local nExample = 100
+local batchsize = 2
+local inputsize = 10
+
+-- train Random Forest
+local trainSet = dt.getDenseDummyData(nExample, nil, inputsize)
+local opt = {
+ activeRatio=0.5,
+ featureBaggingSize=5,
+ nTree=4,
+ maxLeafNodes=nExample/2,
+ minLeafSize=nExample/10,
+}
+local trainer = dt.RandomForestTrainer(opt)
+local df = trainer:train(trainSet, trainSet.featureIds)
+assert(#df.trees == opt.nTree)
+```
+
+Now that we have `df`, a `dt.DecisionForest` instance, we can use it to initialize `nn.DFD`:
+
+```lua
+local dfd = nn.DFD(df)
+```
+
+The `dfd` instance holds no reference to `df`; instead, it extracts the relevant attributes from `df`.
+These attributes are stored in tensors for batching and efficiency.
+
+We can discretize a hypothetical `input` by calling `forward`:
+```lua
+local input = trainSet.input:sub(1,batchsize)
+local output = dfd:forward(input)
+```
+
+The resulting output is a table consisting of two tables: keys and values.
+The keys and values tables each contain `batchsize` tensors:
+
+```lua
+print(output)
+{
+ 1 :
+ {
+ 1 : LongTensor - size: 14
+ 2 : LongTensor - size: 16
+ 3 : LongTensor - size: 15
+ 4 : LongTensor - size: 13
+ }
+ 2 :
+ {
+ 1 : DoubleTensor - size: 14
+ 2 : DoubleTensor - size: 16
+ 3 : DoubleTensor - size: 15
+ 4 : DoubleTensor - size: 13
+ }
+}
+```
+
+An example's feature keys (`LongTensor`) and corresponding values (`DoubleTensor`) have the same number of elements.
+Each example has a variable number of key-value pairs representing the nodes it traversed in each tree.
+The output feature space has one dimension (that is, one possible feature key) per node in the forest.
+
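+If a dense representation is needed downstream, the sparse `{keys, values}` output can be densified with the `nn.Sparse2Dense` module included in this library. A minimal sketch, assuming `nNode` is an upper bound on the node IDs emitted by the forest (this count is an assumption, not something computed above):
+
+```lua
+local nNode = 100 -- assumption: covers all node ids used as feature keys
+local s2d = nn.Sparse2Dense(torch.LongTensor():range(1, nNode))
+local dense = s2d:forward(output) -- batchsize x nNode dense tensor
+```
+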
+## `torch.SparseTensor`
+
+Suppose you have a set of `keys` mapped to `values`:
+```lua
+local keys = torch.LongTensor{1,3,4,7,2}
+local values = torch.Tensor{0.1,0.3,0.4,0.7,0.2}
+```
+
+You can use a `SparseTensor` to encapsulate these into a read-only tensor:
+
+```lua
+local st = torch.SparseTensor(keys, values)
+```
+
+The _decisiontree_ library uses `SparseTensors` to simulate the `__index` method of the `torch.Tensor`.
+For example, one can obtain the value associated to key 3 of the above `st` instance:
+
+```lua
+local value = st[3]
+assert(value == 0.3)
+```
+
+When a key-value pair is missing, `nil` is returned instead:
+
+```lua
+local value = st[5]
+assert(value == nil)
+```
+
+The default implementation of this kind of indexing is slow (it uses a sequential scan of the `keys` tensor).
+To speed up indexing, one can call the `buildIndex()` method beforehand:
+
+```lua
+st:buildIndex()
+```
+
+The `buildIndex()` method creates a hash map (a Lua table) of keys to their corresponding indices in the `values` tensor.
+
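+To illustrate, a minimal sketch of what `buildIndex()` constructs for the `st` above (mirroring the `SparseTensor:buildIndex()` implementation in this library):
+
+```lua
+-- keys = {1,3,4,7,2}  =>  _map = { [1]=1, [3]=2, [4]=3, [7]=4, [2]=5 }
+st:buildIndex()
+assert(st[7] == 0.7) -- now a single hash lookup instead of a linear scan
+```
+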
+## `dt.DataSet`
+
+The `CartTrainer`, `RandomForestTrainer` and `GradientBoostTrainer` require that data sets be encapsulated into a `DataSet`.
+Suppose you have a dataset of dense inputs and targets:
+
+```lua
+local nExample = 10
+local nFeature = 5
+local input = torch.randn(nExample, nFeature)
+local target = torch.Tensor(nExample):random(0,1)
+```
+
+these can be encapsulated into a `DataSet` object:
+
+```lua
+local dataset = dt.DataSet(input, target)
+```
+
+Now suppose you have a dataset where the `input` is a table of `SparseTensor` instances:
+
+```lua
+local input = {}
+for i=1,nExample do
+ local nKeyVal = math.random(2,nFeature)
+ local keys = torch.LongTensor(nKeyVal):random(1,nFeature)
+ local values = torch.randn(nKeyVal)
+ input[i] = torch.SparseTensor(keys, values)
+end
+```
+
+You can still use a `DataSet` to encapsulate the sparse dataset:
+
+```lua
+local dataset = dt.DataSet(input, target)
+```
+
+The main purpose of the `DataSet` class is to sort each feature by value.
+This is captured by the `sortFeatureValues(input)` method, which is called in the constructor:
+
+```lua
+local sortedFeatureValues, featureIds = self:sortFeatureValues(input)
+```
+
+The `featureIds` is a `torch.LongTensor` of all available feature IDs.
+For a dense `input` tensor, this is just `torch.LongTensor():range(1,input:size(2))`.
+But for a sparse `input` tensor, the `featureIds` tensor only contains the feature IDs present in the dataset.
+
+The resulting `sortedFeatureValues` is a table mapping `featureIds` to `exampleIds` sorted by `featureValues`.
+For each `featureId`, examples are sorted by `featureValue` in ascending order.
+For example, the table might look like `{featureId=exampleIds}`, where `exampleIds={1,3,2}`.
+
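+A minimal concrete sketch, assuming a single hypothetical feature (ID 2) observed in three examples with values 0.2 (example 1), 0.5 (example 3) and 0.9 (example 2):
+
+```lua
+-- examples sorted ascending by the value of feature 2: 1 (0.2), 3 (0.5), 2 (0.9)
+local sortedFeatureValues = { [2] = torch.LongTensor{1, 3, 2} }
+```
+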
+The `CartTrainer` accesses the `sortedFeatureValues` tensor by calling `getSortedFeature(featureId)`:
+
+```lua
+local exampleIdsWithFeature = dataset:getSortedFeature(featureId)
+```
+
+The ability to access examples IDs sorted by feature value, given a feature ID, is the main purpose of the `DataSet`.
+The `CartTrainer` relies on these sorted lists to find the best way to split a set of examples between two tree nodes.
+
+## `dt.CartTrainer`
+
+```lua
+local trainer = dt.CartTrainer(dataset, minLeafSize, maxLeafNodes)
+```
+
+The `CartTrainer` is used by the `RandomForestTrainer` and `GradientBoostTrainer` to train individual trees.
+CART stands for classification and regression trees.
+However, only binary classifiers are unit tested.
+
+The constructor takes the following arguments:
+
+ * `dataset` is a `dt.DataSet` instance representing the training set.
+ * `minLeafSize` is the minimum number of examples per leaf node in a tree. The larger the value, the more regularization.
+ * `maxLeafNodes` is the maximum number of leaf nodes in the tree. The lower the value, the more regularization.
+
+Training is initiated by calling the `train()` method:
+
+```lua
+local trainSet = dt.DataSet(input, target)
+local rootTreeState = dt.GiniState(trainSet:getExampleIds())
+local activeFeatures = trainSet.featureIds
+local tree = trainer:train(rootTreeState, activeFeatures)
+```
+
+The resulting `tree` is a `CartTree` instance.
+The `rootTreeState` is a `TreeState` instance like `GiniState` (used by `RandomForestTrainer`) or `GradientBoostState` (used by `GradientBoostTrainer`).
+The `activeFeatures` is a `LongTensor` of the feature IDs used to build the tree.
+Every other feature ID is ignored during training. This is useful for feature bagging (see the sketch below).
+
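+A minimal feature-bagging sketch, mirroring what `RandomForestTrainer.trainTree` does internally (the bagging size of 5 is arbitrary):
+
+```lua
+-- sample 5 feature IDs (with replacement) to use as activeFeatures
+local baggingIndices = torch.LongTensor(5):random(1, trainSet.featureIds:size(1))
+local activeFeatures = trainSet.featureIds:index(1, baggingIndices)
+```
+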
+By default, the `CartTrainer` runs in a single thread.
+The `featureParallel(nThread)` method can be called before calling `train()` to parallelize training using `nThread` workers:
+
+```lua
+local nThread = 3
+trainer:featureParallel(nThread)
+trainer:train(rootTreeState, activeFeatures)
+```
+
+Feature parallelization assigns a set of feature IDs to each thread.
+
+The `CartTrainer` can be used as a stand-alone tree trainer.
+However, it is recommended to use it within a `RandomForestTrainer` or `GradientBoostTrainer` instead, as the resulting forests typically generalize better.
+
+## RandomForestTrainer
+
+The `RandomForestTrainer` is used to train a random forest:
+
+```lua
+local nExample = trainSet:size()
+local opt = {
+ activeRatio=0.5,
+ featureBaggingSize=5,
+ nTree=14,
+ maxLeafNodes=nExample/2,
+ minLeafSize=nExample/10,
+}
+local trainer = dt.RandomForestTrainer(opt)
+local forest = trainer:train(trainSet, trainSet.featureIds)
+```
+
+The returned `forest` is a `DecisionForest` instance.
+A `DecisionForest` has a similar interface to the `CartTree`.
+Indeed, they both sub-class the `DecisionTree` abstract class.
+
+The constructor takes a single `opt` table argument, which contains the actual arguments:
+
+ * `activeRatio` is the ratio of active examples per tree. This is used for bootstrap sampling.
+ * `featureBaggingSize` is the number of features per tree. This is used for feature bagging.
+ * `nTree` is the number of trees to be trained.
+ * `maxLeafNodes` and `minLeafSize` are passed to the underlying `CartTrainer` constructor (controls regularization).
+
+Internally, the `RandomForestTrainer` passes a `GiniState` to the `CartTrainer:train()` method.
+
+Training can be parallelized by calling `treeParallel(nThread)`:
+
+```lua
+local nThread = 3
+trainer:treeParallel(nThread)
+local forest = trainer:train(trainSet, trainSet.featureIds)
+```
+
+Each tree is then trained in its own worker thread.
+
+## GradientBoostTrainer
+
+References:
+ * A. [Boosted Tree presentation](https://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf)
+
+Gradient boosted decision trees (GBDT) can be trained as follows:
+```lua
+local nExample = trainSet:size()
+local maxLeafNode, minLeafSize = nExample/2, nExample/10
+local cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNode)
+
+local opt = {
+ lossFunction=nn.LogitBoostCriterion(false),
+ treeTrainer=cartTrainer,
+ shrinkage=0.1,
+ downsampleRatio=0.8,
+ featureBaggingSize=-1,
+ nTree=14,
+ evalFreq=8,
+ earlyStop=0
+}
+
+local trainer = dt.GradientBoostTrainer(opt)
+local forest = trainer:train(trainSet, trainSet.featureIds, validSet)
+```
+
+The above code snippet uses the `LogitBoostCriterion` outlined in reference A.
+It is used for training binary classification trees.
+
+The returned `forest` is a `DecisionForest` instance.
+A `DecisionForest` has a similar interface to the `CartTree`.
+Indeed, they both sub-class the `DecisionTree` abstract class.
+
+The constructor takes a single `opt` table argument, which contains the actual arguments:
+
+ * `lossFunction` is a `nn.Criterion` instance extended with the `updateHessInput(input, target)` and `backward2(input, target)` methods, which return the Hessian with respect to the `input`.
+ * `treeTrainer` is a `CartTrainer` instance. Its `featureParallel()` method can be called to implement feature parallelization.
+ * `shrinkage` is the weight of each additional tree.
+ * `downsampleRatio` is the ratio of examples to be sampled for each tree. Used for bootstrap sampling.
+ * `featureBaggingSize` is the number of features to sample per tree. Used for feature bagging. `-1` defaults to `torch.round(math.sqrt(featureIds:size(1)))`
+ * `nTree` is the maximum number of trees.
+ * `evalFreq` is the number of epochs between calls to `validate()` for cross-validation and early-stopping.
+ * `earlyStop` is the maximum number of epochs to wait for early-stopping.
+
+Internally, the `GradientBoostTrainer` passes a `GradientBoostState` to the `CartTrainer:train()` method.
+
+## TreeState
+
+An abstract class that holds the state of a subtree during decision tree training.
+It also manages the state of candidate splits.
+
+```lua
+local treeState = dt.TreeState(exampleIds)
+```
+
+The `exampleIds` argument is a `LongTensor` containing the example IDs that make up the sub-tree.
+
+## GiniState
+
+A `TreeState` subclass used internally by the `RandomForestTrainer`.
+It uses Gini impurity to determine how to split nodes.
+
+```lua
+local treeState = dt.GiniState(exampleIds)
+```
+
+The `exampleIds` argument is a `LongTensor` containing the example IDs that make up the sub-tree.
+
+## GradientBoostState
+
+A `TreeState` subclass used internally by the `GradientBoostTrainer`.
+It implements the GBDT splitting algorithm, which uses a loss function.
+
+```lua
+local treeState = dt.GradientBoostState(exampleIds, lossFunction)
+```
+
+The `exampleIds` argument is a `LongTensor` containing the example IDs that make up the sub-tree.
+The `lossFunction` is an `nn.Criterion` instance (see `GradientBoostTrainer`).
+
+
+## WorkPool
+
+Utility class that simplifies construction of a pool of daemon threads with which to execute tasks in parallel.
+
+```lua
+local workpool = dt.WorkPool(nThread)
+```
+
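+A minimal usage sketch based on the methods `WorkPool` defines (`update`, `writeup`, `read`, `terminate`); the task function is purely illustrative:
+
+```lua
+-- make the decisiontree package available in every worker thread
+workpool:update('require', {libname='decisiontree', varname='dt'})
+
+-- run an illustrative task in a worker; it receives the worker's store and id
+workpool:writeup('execute', function(store, myId)
+   return myId * 2 -- any serializable result
+end)
+
+local taskname, result = workpool:read() -- returns the taskname and the result
+assert(taskname == 'execute')
+
+workpool:terminate() -- shut down the daemon threads
+```
+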
+## CartTree
+
+Implements a trained CART decision tree:
+
+```lua
+local tree = nn.CartTree(rootNode)
+```
+
+The `rootNode` is a `CartNode` instance.
+Each `CartNode` contains pointers to left and right branches, which are themselves `CartNode` instances.
+
+For inference, use the `score(input)` method:
+
+```lua
+local score = tree:score(input)
+```
diff --git a/contrib/lua-torch/decisiontree/RandomForestTrainer.lua b/contrib/lua-torch/decisiontree/RandomForestTrainer.lua
new file mode 100644
index 000000000..41040b25b
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/RandomForestTrainer.lua
@@ -0,0 +1,159 @@
+local dt = require "decisiontree._env"
+
+local RandomForestTrainer = torch.class("dt.RandomForestTrainer", dt)
+
+function RandomForestTrainer:__init(opt)
+ assert(torch.type(opt.nTree) == 'number')
+ assert(opt.nTree > 0)
+ self.nTree = opt.nTree
+ -- max number of leaf nodes per tree
+ assert(torch.type(opt.maxLeafNodes) == 'number')
+ assert(opt.maxLeafNodes > 0)
+ self.maxLeafNodes = opt.maxLeafNodes
+ -- min number of examples per leaf
+ assert(torch.type(opt.minLeafSize) == 'number')
+ assert(opt.minLeafSize > 0)
+ self.minLeafSize = opt.minLeafSize
+
+ -- when non-positive, defaults to sqrt(#feature)
+ assert(torch.type(opt.featureBaggingSize) == 'number')
+ self.featureBaggingSize = opt.featureBaggingSize
+
+ assert(torch.type(opt.activeRatio) == 'number')
+ assert(opt.activeRatio > 0)
+ self.activeRatio = opt.activeRatio
+
+ -- default parallelization is singlethread
+ self.parallelMode = 'singlethread'
+end
+
+-- Train a DecisionForest
+function RandomForestTrainer:train(trainSet, featureIds, verbose)
+ assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
+ assert(torch.type(featureIds) == 'torch.LongTensor')
+
+ if verbose then print(string.format("Begin training Decision Forest with %d trees", self.nTree)) end
+
+ local weight = torch.Tensor(self.nTree):fill(1 / self.nTree) -- RF uses uniform weights
+
+ local trees
+ if self.parallelMode == 'singlethread' then
+ trees = self:trainTrees(trainSet, featureIds, verbose)
+ elseif self.parallelMode == 'treeparallel' then
+ trainSet:deleteIndex() -- prevents serialization bottleneck
+ trees = self:trainTreesTP(trainSet, featureIds, verbose)
+ else
+ error("Unrecognized parallel mode: " .. self.parallelMode)
+ end
+
+ if verbose then print(string.format("Successfully trained %d trees", #trees)) end
+
+ -- set bias
+ local bias = 0;
+ for i, tree in ipairs(trees) do
+ bias = bias + tree.root.score * weight[i]
+ end
+
+ return dt.DecisionForest(trees, weight, bias)
+end
+
+function RandomForestTrainer:trainTrees(trainSet, featureIds, verbose)
+
+ -- the same CartTrainer will be used for each tree
+ local cartTrainer = dt.CartTrainer(trainSet, self.minLeafSize, self.maxLeafNodes)
+
+ local trees = {}
+ for treeId=1,self.nTree do
+ -- Train a CartTree
+ local tree = self.trainTree(cartTrainer, featureIds, self.featureBaggingSize, self.activeRatio, treeId, verbose)
+ table.insert(trees, tree)
+ end
+ return trees
+end
+
+-- static function that returns a cartTree
+function RandomForestTrainer.trainTree(cartTrainer, featureIds, baggingSize, activeRatio, treeId, verbose)
+ assert(torch.isTypeOf(cartTrainer, 'dt.CartTrainer'))
+ assert(torch.type(featureIds) == 'torch.LongTensor')
+ local baggingSize = baggingSize > 0 and baggingSize or torch.round(math.sqrt(featureIds:size(1)))
+
+ if verbose then
+ print(string.format("Tree %d: Creating features bootstrap sample with baggingSize %d, nFeatures %d", treeId, baggingSize, featureIds:size(1)))
+ end
+
+ local trainSet = cartTrainer.dataset
+
+ -- sample bootstrap features
+ local baggingIndices = torch.LongTensor(baggingSize):random(1,featureIds:size(1))
+ local activeFeatures = featureIds:index(1, baggingIndices)
+
+ -- sample bootstrap examples
+ local sampleSize = torch.round(trainSet:size() * activeRatio)
+ if verbose then print(string.format("Creating bootstrap sample of size %d", sampleSize)) end
+
+ baggingIndices:resize(sampleSize):random(1,trainSet:size())
+ local bootStrapExampleIds = torch.LongTensor()
+ bootStrapExampleIds:index(trainSet:getExampleIds(), 1, baggingIndices)
+
+ local cartTree = cartTrainer:train(dt.GiniState(bootStrapExampleIds), activeFeatures)
+
+ if verbose then print(string.format("Completed processing tree number %d", treeId)) end
+
+ return cartTree
+end
+
+-- call before training to enable tree-level parallelization
+function RandomForestTrainer:treeParallel(workPool)
+ assert(self.parallelMode == 'singlethread', self.parallelMode)
+ self.parallelMode = 'treeparallel'
+ self.workPool = torch.type(workPool) == 'number' and dt.WorkPool(workPool) or workPool
+ assert(torch.isTypeOf(self.workPool, 'dt.WorkPool'))
+
+ -- require the dt package
+ self.workPool:update('require', {libname='decisiontree',varname='dt'})
+end
+
+-- TP is for tree parallel (not toilet paper)
+function RandomForestTrainer:trainTreesTP(trainSet, featureIds, verbose)
+ assert(torch.isTypeOf(trainSet, 'dt.DataSet'))
+ assert(torch.type(featureIds) == 'torch.LongTensor')
+ local minLeafSize = self.minLeafSize
+ local maxLeafNodes = self.maxLeafNodes
+
+ -- setup worker store (each worker will have its own cartTrainer)
+ self.workPool:updateup('execute', function(store)
+ local dt = require 'decisiontree'
+
+ store.cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNodes)
+ store.featureIds = featureIds
+ end)
+
+ for treeId=1,self.nTree do
+ -- upvalues
+ local baggingSize = self.featureBaggingSize
+ local activeRatio = self.activeRatio
+ -- task closure that will be executed in worker-thread
+ local function trainTreeTask(store)
+ local dt = require 'decisiontree'
+ return dt.RandomForestTrainer.trainTree(store.cartTrainer, store.featureIds, baggingSize, activeRatio, treeId, verbose)
+ end
+ self.workPool:writeup('execute', trainTreeTask)
+ end
+
+ local trees = {}
+ for treeId=1,self.nTree do
+ local taskname, tree = self.workPool:read()
+ assert(taskname=='execute')
+ assert(torch.isTypeOf(tree, 'dt.CartTree'))
+ table.insert(trees, tree)
+ end
+ return trees
+end
+
+function RandomForestTrainer:getName()
+ return string.format(
+ "randomforest-aRatio-%4.2f-maxLeaf-%d-minExample-%d-nTree-%d",
+ self.activeRatio, self.maxLeafNodes, self.minLeafSize, self.nTree
+ )
+end
+
diff --git a/contrib/lua-torch/decisiontree/Sparse2Dense.lua b/contrib/lua-torch/decisiontree/Sparse2Dense.lua
new file mode 100644
index 000000000..4e5b79d2f
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/Sparse2Dense.lua
@@ -0,0 +1,88 @@
+local S2D, parent = torch.class("nn.Sparse2Dense", "nn.Module")
+local dt = require 'decisiontree._env'
+
+function S2D:__init(features)
+ parent.__init(self)
+ if torch.type(features) == 'table' then
+ assert(#features > 0)
+ features = torch.LongTensor(features)
+ end
+ assert(torch.isTensor(features))
+ self.features = features
+ self.featureMap = nil
+ self.masks = {}
+ self.mappedKeys = {}
+end
+
+function S2D:updateOutput(input)
+ if not self.featureMap then
+ self.featureMap = dt.HashMap()
+ self.featureMap:fill(self.features)
+ end
+ local batched, keys, values
+ if torch.isTensor(input[1]) then
+ keys = {input[1]}
+ values = {input[2]}
+ batched = false
+ else
+ keys = input[1]
+ values = input[2]
+ batched = true
+ end
+ assert(#keys == #values)
+
+ local masks = self.masks
+ local mappedKeys = self.mappedKeys
+ local nKeys = #keys
+ local nMasks = #masks
+ if nMasks < nKeys then
+ for i=nMasks+1,nKeys do
+ masks[i] = torch.ByteTensor()
+ mappedKeys[i] = torch.LongTensor()
+ end
+ elseif nMasks > nKeys then
+ for i=nKeys+1,nMasks do
+ masks[i] = nil
+ mappedKeys[i] = nil
+ end
+ end
+
+ self.featureMap:get(keys, mappedKeys, masks)
+ self.output = self.output or torch.Tensor():type(self._type)
+ self.output.nn.S2D_computeOutput(self.output, mappedKeys, values, masks, self.features)
+ if not batched then
+ self.output = self.output:view(-1)
+ end
+ return self.output
+end
+
+function S2D:type(type, tensorCache)
+ if type then
+ local features = self.features
+ self.features = nil
+ parent.type(self, type, tensorCache)
+ self.features = features
+ return self
+ else
+ return parent.type(self)
+ end
+end
+
+function S2D:updateGradInput(input, gradOutput)
+ error"Not Implemented"
+end
+
+function S2D:reset()
+ parent.reset(self)
+ self.featureMap = nil
+end
+
+function S2D:write(file)
+ self.featureMap = nil
+ parent.write(self, file)
+end
+
+function S2D:read(file)
+ self.featureMap = nil
+ parent.read(self, file)
+end
diff --git a/contrib/lua-torch/decisiontree/SparseTensor.lua b/contrib/lua-torch/decisiontree/SparseTensor.lua
new file mode 100644
index 000000000..4c620e618
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/SparseTensor.lua
@@ -0,0 +1,54 @@
+
+local SparseTensor = torch.class("torch.SparseTensor")
+
+function SparseTensor:__init(keys, values)
+ if keys and values then
+ assert(torch.typename(keys):find('torch%..*LongTensor'))
+ assert(torch.isTensor(values))
+ assert(keys:nElement() == values:nElement(), "Expecting key and value tensors of same size")
+ self.keys = keys
+ self.values = values
+ elseif not (keys or values) then
+ self.keys = torch.LongTensor()
+ self.values = torch.Tensor()
+ else
+ error"Expecting zero or two args"
+ end
+end
+
+function SparseTensor:buildIndex(overwrite)
+ if self._map and not overwrite then return end
+ assert(self.keys and self.keys:dim() == 1)
+ assert(self.values and self.values:dim() == 1)
+ -- hash table
+ self._map = {}
+ for i=1,self.keys:size(1) do
+ self._map[self.keys[i]] = i
+ end
+end
+
+function SparseTensor:deleteIndex()
+ self._map = nil
+end
+
+local __index = SparseTensor.__index
+function SparseTensor:__index(key)
+ if key == nil then
+ error"Attempt to index using a nil key"
+ elseif torch.type(key) ~= 'number' then
+ return __index(self, key)
+ end
+
+ if self._map then
+ assert(torch.type(self._map) == 'table')
+ local idx = self._map[key]
+ return idx and self.values[idx] or nil
+ elseif self.keys:nElement() > 0 then
+ for i=1,self.keys:size(1) do
+ if self.keys[i] == key then
+ return self.values[i]
+ end
+ end
+ end
+ return nil
+end \ No newline at end of file
diff --git a/contrib/lua-torch/decisiontree/TreeState.lua b/contrib/lua-torch/decisiontree/TreeState.lua
new file mode 100644
index 000000000..3928649fd
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/TreeState.lua
@@ -0,0 +1,191 @@
+local dt = require "decisiontree._env"
+
+local TreeState = torch.class("dt.TreeState", dt)
+
+-- Holds the state of a subtree during decision tree training.
+-- Also, manages the state of candidate splits
+function TreeState:__init(exampleIds)
+ assert(torch.type(exampleIds) == 'torch.LongTensor')
+ self.exampleIds = exampleIds
+
+ self.nExampleInLeftBranch = 0
+ self.nExampleInRightBranch = 0
+end
+
+-- computes and returns the score of the node based on its examples
+function TreeState:score(dataset)
+ error"NotImplemented"
+end
+
+
+-- Initializes the split-state-updater. Initially all examples are in the left branch.
+-- exampleIdsWithFeature is list of examples to split (those having a particular feature)
+function TreeState:initialize(exampleIdsWithFeature, dataset)
+ error"NotImplemented"
+end
+
+-- Update the split state. This call has the effect of shifting the example from the left to the right branch.
+function TreeState:update(exampleId, dataset)
+ error"NotImplemented"
+end
+
+-- Computes the SplitInfo determined by the current split state
+-- @param splitFeatureId the feature id of the split feature
+-- @param splitFeatureValue the feature value of the split feature
+-- @return the SplitInfo determined by the current split state
+function TreeState:computeSplitInfo(splitFeatureId, splitFeatureValue)
+ error"NotImplemented"
+end
+
+-- bottleneck
+function TreeState:findBestFeatureSplit(dataset, featureId, minLeafSize)
+ local dt = require "decisiontree"
+ assert(torch.isTypeOf(dataset, 'dt.DataSet'))
+ assert(torch.type(featureId) == 'number')
+ assert(torch.type(minLeafSize) == 'number')
+
+ -- all dataset example having this feature, sorted by value
+ local featureExampleIds = dataset:getSortedFeature(featureId)
+
+ local buffer = dt.getBufferTable('TreeState')
+ buffer.longtensor = buffer.longtensor or torch.LongTensor()
+ local exampleIdsWithFeature = buffer.longtensor
+
+ -- map and tensor of examples containing feature:
+ local exampleMap = {}
+ local getExampleFeatureValue
+
+ local j = 0
+ if torch.type(dataset.input) == 'table' then
+ exampleIdsWithFeature:resize(self.exampleIds:size())
+ self.exampleIds:apply(function(exampleId)
+ local input = dataset.input[exampleId]
+ input:buildIndex()-- only builds index first time
+ if input[featureId] then
+ j = j + 1
+ exampleIdsWithFeature[j] = exampleId
+ exampleMap[exampleId] = j
+ end
+ end)
+ if j == 0 then
+ return
+ end
+ exampleIdsWithFeature:resize(j)
+ getExampleFeatureValue = function(exampleId) return dataset.input[exampleId][featureId] end
+ else
+ exampleIdsWithFeature = self.exampleIds
+ self.exampleIds:apply(function(exampleId)
+ j = j + 1
+ exampleMap[exampleId] = j
+ end)
+ local featureValues = dataset.input:select(2,featureId)
+ getExampleFeatureValue = function(exampleId) return featureValues[exampleId] end
+ end
+
+
+ self:initialize(exampleIdsWithFeature, dataset)
+
+ -- bottleneck
+ local bestSplit, previousSplitValue, _tictoc
+ for i=featureExampleIds:size(1),1,-1 do -- loop over examples sorted (desc) by feature value
+ local exampleId = featureExampleIds[i]
+
+ local exampleIdx = exampleMap[exampleId]
+ if exampleIdx then
+ local splitValue = getExampleFeatureValue(exampleId)
+
+ if previousSplitValue and math.abs(splitValue - previousSplitValue) > dt.EPSILON then
+ local splitInfo = self:computeSplitInfo(featureId, previousSplitValue, _tictoc)
+ if (splitInfo.leftChildSize >= minLeafSize) and (splitInfo.rightChildSize >= minLeafSize) then
+
+ if (not bestSplit) or (splitInfo.splitGain < bestSplit.splitGain) then
+ _tictoc = bestSplit or {} -- reuse table
+ bestSplit = splitInfo
+ end
+
+ end
+ end
+
+ previousSplitValue = splitValue
+
+ -- bottleneck
+ self:update(exampleId, dataset, exampleIdx)
+ end
+ end
+
+ return bestSplit
+end
+
+-- finds the best split of examples in treeState among featureIds
+function TreeState:findBestSplit(dataset, featureIds, minLeafSize, shardId, nShard)
+ assert(torch.isTypeOf(dataset, 'dt.DataSet'))
+ assert(torch.type(featureIds) == 'torch.LongTensor')
+ assert(torch.type(minLeafSize) == 'number')
+ assert(torch.type(shardId) == 'number')
+ assert(torch.type(nShard) == 'number')
+
+ local bestSplit
+ for i=1,featureIds:size(1) do
+ local featureId = featureIds[i]
+ if (nShard <= 1) or ( (featureId % nShard) + 1 == shardId ) then -- feature sharded
+ local splitCandidate = self:findBestFeatureSplit(dataset, featureId, minLeafSize)
+ if splitCandidate and ((not bestSplit) or (splitCandidate.splitGain < bestSplit.splitGain)) then
+ bestSplit = splitCandidate
+ end
+ end
+ end
+
+ return bestSplit
+end
+
+-- Partitions self given a splitInfo table, producing a pair of exampleIds corresponding to the left and right subtrees.
+function TreeState:_branch(splitInfo, dataset)
+ local leftIdx, rightIdx = 0, 0
+ local nExample = self.exampleIds:size(1)
+ local splitExampleIds = torch.LongTensor(nExample)
+
+
+ for i=1,self.exampleIds:size(1) do
+ local exampleId = self.exampleIds[i]
+ local input = dataset.input[exampleId]
+ local val = input[splitInfo.splitId]
+ -- Note: when the feature is not present in the example, the example is dropped from all sub-trees.
+ -- This means that for most sparse data, a tree cannot reach 100% accuracy...
+ if val then
+ if val < splitInfo.splitValue then
+ leftIdx = leftIdx + 1
+ splitExampleIds[leftIdx] = exampleId
+ else
+ rightIdx = rightIdx + 1
+ splitExampleIds[nExample-rightIdx+1] = exampleId
+ end
+ end
+ end
+
+ local leftExampleIds = splitExampleIds:narrow(1,1,leftIdx)
+ local rightExampleIds = splitExampleIds:narrow(1,nExample-rightIdx+1,rightIdx)
+
+ assert(leftExampleIds:size(1) + rightExampleIds:size(1) <= self.exampleIds:size(1), "Left and right branches contain more data than the parent!")
+ return leftExampleIds, rightExampleIds
+end
+
+-- calls _branch and encapsulates the left and right exampleIds into two TreeStates
+function TreeState:branch(splitInfo, dataset)
+ local leftExampleIds, rightExampleIds = self:_branch(splitInfo, dataset)
+ return self.new(leftExampleIds), self.new(rightExampleIds)
+end
+
+function TreeState:size()
+ return self.exampleIds:size(1)
+end
+
+function TreeState:contains(exampleId)
+ local found = false
+ self.exampleIds:apply(function(x)
+ if x == exampleId then
+ found = true
+ end
+ end)
+ return found
+end
+
diff --git a/contrib/lua-torch/decisiontree/WorkPool.lua b/contrib/lua-torch/decisiontree/WorkPool.lua
new file mode 100644
index 000000000..8f473727e
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/WorkPool.lua
@@ -0,0 +1,156 @@
+local dt = require "decisiontree._env"
+
+-- Utility to simplify construction of a pool of daemon threads with which to execute tasks in parallel.
+local WorkPool = torch.class("dt.WorkPool", dt)
+
+function WorkPool:__init(nThread)
+ self.nThread = nThread or 16
+ assert(torch.type(self.nThread) == 'number')
+ assert(self.nThread > 0)
+
+ self:initialize()
+end
+
+function WorkPool:initialize()
+ local ipc = require 'libipc'
+ self.queuename = os.tmpname()
+ self.queue = ipc.workqueue(self.queuename)
+ self.queues = {}
+ for i=1,self.nThread do
+ self.queues[i] = ipc.workqueue(self.queuename.."/"..i)
+ end
+
+ -- spawn thread workers
+ ipc.map(self.nThread, function(queuename, nThread, myId)
+ assert(nThread)
+ assert(myId)
+ local ipc = require 'libipc'
+
+ -- Open the queue by name (the main thread already created it)
+ local mainqueue = ipc.workqueue(queuename)
+ local workqueue = ipc.workqueue(queuename.."/"..myId)
+
+ local taskname, task
+
+ local store = {}
+ local queue = mainqueue
+
+ repeat
+ local msg = queue:read()
+ assert(torch.type(msg) == 'table')
+ taskname, task = unpack(msg)
+ if taskname == nil then
+ break
+ elseif torch.type(taskname) ~= 'string' then
+ error("Expecting taskname string. Got "..torch.type(taskname))
+ elseif taskname == 'storeKeyValue' then
+ assert(torch.type(task) == 'table')
+ assert(queue == workqueue)
+ store[task.key] = task.value
+ queue:write({taskname})
+ elseif taskname == 'storeKeysValues' then
+ assert(torch.type(task) == 'table')
+ assert(queue == workqueue)
+ for key,value in pairs(task) do
+ store[key] = value
+ end
+ queue:write({taskname})
+ elseif taskname == 'require' then
+ assert(torch.type(task) == 'table')
+ assert(torch.type(task.libname) == 'string')
+ assert(torch.type(task.varname) == 'string')
+ _G[task.varname] = require(task.libname)
+ assert(queue == workqueue)
+ queue:write({taskname})
+ elseif taskname == 'storeReset' then
+ store = {}
+ mainqueue:write({taskname})
+ elseif taskname == 'echo' then
+ mainqueue:write({taskname, task})
+ elseif taskname == 'readWorkerQueue' then
+ queue = workqueue
+ elseif taskname == 'readMainQueue' then
+ queue = mainqueue
+ elseif taskname == 'execute' then
+ if torch.type(task) == 'table' then
+ assert(task.func and task.args)
+ queue:write({taskname, task.func(store, task.args, myId)})
+ else
+ assert(torch.type(task) == 'function')
+ queue:write({taskname, task(store, myId)})
+ end
+ else
+ error("Unknown taskname: "..taskname)
+ end
+ until taskname == nil
+ end, self.queuename, self.nThread)
+
+end
+
+-- Terminates all daemon threads.
+function WorkPool:terminate()
+ for i=1,self.nThread do
+ self.queue:write({})
+ end
+end
+
+-- this function is used to update the store of data in each worker thread
+function WorkPool:_update(taskname, task, upval)
+ assert(torch.type(taskname) == 'string')
+ local _ = require 'moses'
+ assert(_.contains({'storeKeyValue','storeKeysValues','require','execute'}, taskname))
+ assert(torch.type(task) == 'table' or torch.type(task) == 'function')
+
+ -- tell the workers to read their individual queue
+ for i=1,self.nThread do
+ self.queue:write({'readWorkerQueue'})
+ end
+
+ -- write to individual worker queues
+ for i=1,self.nThread do
+ if upval then
+ self.queues[i]:writeup({taskname, task})
+ else
+ self.queues[i]:write({taskname, task})
+ end
+ end
+
+ -- TODO use ipc.mutex:barrier(nThread+1)
+ -- barrier: make sure that every worker has completed task by reading their queue
+ for i=1,self.nThread do
+ assert(self.queues[i]:read()[1] == taskname)
+ end
+
+ -- finally, tell them to read the main queue
+ for i=1,self.nThread do
+ self.queues[i]:write({'readMainQueue'})
+ end
+end
+
+function WorkPool:update(taskname, task)
+ return self:_update(taskname, task, false)
+end
+
+function WorkPool:updateup(taskname, task)
+ return self:_update(taskname, task, true)
+end
+
+function WorkPool:write(taskname, task)
+ assert(torch.type(taskname) == 'string')
+ assert(taskname ~= 'storeKeyValue' and taskname ~= 'storeKeysValues')
+ self.queue:write({taskname, task})
+end
+
+function WorkPool:writeup(taskname, task)
+ assert(torch.type(taskname) == 'string')
+ assert(taskname ~= 'storeKeyValue' and taskname ~= 'storeKeysValues')
+ self.queue:writeup({taskname, task})
+end
+
+function WorkPool:read()
+ local res = self.queue:read()
+ assert(torch.type(res) == 'table')
+ assert(torch.type(res[1]) == 'string')
+ return unpack(res)
+end
+
diff --git a/contrib/lua-torch/decisiontree/_env.lua b/contrib/lua-torch/decisiontree/_env.lua
new file mode 100644
index 000000000..a92770152
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/_env.lua
@@ -0,0 +1,5 @@
+
+-- https://github.com/torch/torch7/issues/525
+
+local dl = {}
+return dl \ No newline at end of file
diff --git a/contrib/lua-torch/decisiontree/benchmark.lua b/contrib/lua-torch/decisiontree/benchmark.lua
new file mode 100644
index 000000000..2b6a03dc6
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/benchmark.lua
@@ -0,0 +1,171 @@
+local dt = require "decisiontree._env"
+
+local bm = {}
+function bm.CartTrainer(opt)
+ local timer = torch.Timer()
+ local trainSet, validSet = dt.getSparseDummyData(opt)
+ print(string.format("CartTrainer: sparse dataset create: %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
+
+ local cartTrainer = dt.CartTrainer(trainSet, opt.minLeafSize, opt.maxLeafNodes)
+ local treeState = dt.GiniState(trainSet:getExampleIds())
+ timer:reset()
+ local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
+ print(string.format("CartTrainer: train single-thread : %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
+
+ timer:reset()
+ cartTrainer:featureParallel(opt.nThread)
+ print(string.format("CartTrainer: setup feature-parallel : %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
+ timer:reset()
+ local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
+ print(string.format("CartTrainer: train feature-parallel : %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
+end
+
+function bm.GradientBoostState(opt)
+ local trainSet, validSet = dt.getSparseDummyData(opt)
+
+ trainSet:initScore()
+
+ local treeState = dt.GradientBoostState(trainSet:getExampleIds(), nn.LogitBoostCriterion(false))
+
+ local timer = torch.Timer() -- first step also calls SparseTensor:buildIndex()
+ treeState:findBestSplit(trainSet, trainSet.featureIds, 10, 1, 3)
+ print(string.format("GradientBoostState: findBestSplit (first) : %f sec", timer:time().real))
+
+ timer:reset()
+ treeState:findBestSplit(trainSet, trainSet.featureIds, 10, 1, 3)
+ print(string.format("GradientBoostState: findBestSplit (second) : %f sec", timer:time().real))
+
+end
+
+local function file_exists(name)
+ local f=io.open(name,"r")
+ if f~=nil then io.close(f) return true else return false end
+end
+
+function bm.GradientBoostTrainer(opt)
+ local trainSet, validSet
+ if file_exists("/tmp/train.bin") and file_exists("/tmp/valid.bin") then
+ trainSet = torch.load("/tmp/train.bin")
+ validSet = torch.load("/tmp/valid.bin")
+ else
+ if opt.sparse then
+ trainSet, validSet = dt.getSparseDummyData(opt)
+ else
+ trainSet, validSet = dt.getDenseDummyData(opt)
+ end
+ torch.save("/tmp/train.bin", trainSet)
+ torch.save("/tmp/valid.bin", validSet)
+ end
+
+ local cartTrainer = dt.CartTrainer(trainSet, opt.minLeafSize, opt.maxLeafNodes)
+ opt.lossFunction = nn.LogitBoostCriterion(false)
+ opt.treeTrainer = cartTrainer
+ local forestTrainer = dt.GradientBoostTrainer(opt)
+
+ local timer = torch.Timer()
+ local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds, validSet)
+ local time = timer:time().real
+ print(string.format("GradientBoostTrainer: train single-thread : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
+
+ cartTrainer:featureParallel(opt.nThread)
+ timer:reset()
+ local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds, validSet)
+ local time = timer:time().real
+ print(string.format("GradientBoostTrainer: train feature-parallel : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
+end
+
+function bm.RandomForestTrainer(opt)
+ local trainSet, validSet = dt.getSparseDummyData(opt)
+
+ local forestTrainer = dt.RandomForestTrainer(opt)
+ local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
+
+ local timer = torch.Timer()
+ local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
+ local time = timer:time().real
+ print(string.format("RandomForestTrainer: train single-thread : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
+
+ timer:reset()
+ forestTrainer:treeParallel(opt.nThread)
+ print(string.format("RandomForestTrainer: setup tree-parallel : %f samples/sec; %f sec", opt.nExample/timer:time().real, timer:time().real))
+
+ timer:reset()
+ local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
+ local time = timer:time().real
+ print(string.format("RandomForestTrainer: train tree-parallel : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
+end
+
+function bm.DFD(opt)
+ local _ = require 'moses'
+ local opt = _.clone(opt)
+ opt.nExample = 200
+ local trainSet, validSet = dt.getDenseDummyData(opt)
+
+ local forestTrainer = dt.RandomForestTrainer(opt)
+ forestTrainer:treeParallel(opt.nThread)
+ local timer = torch.Timer()
+ local decisionForest = forestTrainer:train(trainSet, trainSet.featureIds)
+ local time = timer:time().real
+ print(string.format("DFD: train random forest in parallel : %f samples/sec; %f sec/tree, %f sec", opt.nExample/time, time/opt.nTree, time))
+
+
+ -- benchmark nn.DFD
+ local input = trainSet.input:sub(1,opt.batchsize)
+ local dfd = nn.DFD(decisionForest)
+ dfd:forward(input)
+ timer:reset()
+ for i=1,opt.nloop do
+ dfd:forward(input)
+ end
+ print(string.format("DFD: updateOutput : %f samples/sec; %f sec", opt.nloop*opt.batchsize/timer:time().real, timer:time().real))
+end
+
+function bm.Sparse2Dense(opt)
+ local _ = require 'moses'
+ local opt = _.clone(opt)
+ opt.nExample = opt.batchsize
+ local trainSet = dt.getSparseDummyData(opt)
+
+ local input = {{},{}}
+ for i=1,opt.batchsize do
+ input[1][i] = trainSet.input[i].keys
+ input[2][i] = trainSet.input[i].values
+ end
+ assert(#input[1] == opt.batchsize)
+
+ -- benchmark nn.Sparse2Dense
+ local s2d = nn.Sparse2Dense(torch.LongTensor():range(1,opt.nFeature))
+ s2d:forward(input)
+ local timer = torch.Timer()
+ for i=1,opt.nloop do
+ s2d:forward(input)
+ end
+ print(string.format("Sparse2Dense: updateOutput : %f samples/sec; %f sec", opt.nloop*opt.batchsize/timer:time().real, timer:time().real))
+end
+
+function dt.benchmark(benchmarks, opt2)
+ local opt = {
+ nExample=10000, nCluster=2, nFeature=1000, overlap=0, nValid=100, -- getSparseDummyData
+ nTree=20, featureBaggingSize=-1, sparse=true, -- GradientBoostTrainer and RandomForestTrainer
+ nThread=2, shrinkage=0.1, downsampleRatio=0.1, evalFreq=5, earlyStop=0, -- GradientBoostTrainer
+ activeRatio=0.5, -- RandomForestTrainer
+ batchsize=32, nloop=10
+ }
+
+ local _ = require 'moses'
+ benchmarks = benchmarks or _.keys(bm)
+ assert(torch.type(benchmarks) == 'table')
+ for i,benchmark in ipairs(benchmarks) do
+ local opt1 = _.clone(opt)
+ for key, value in pairs(opt2 or {}) do
+ opt1[key] = value
+ end
+ opt1.nActive = opt1.nActive or torch.round(opt1.nFeature/10)
+ opt1.maxLeafNodes = opt1.maxLeafNodes or (opt1.nExample/10)
+ opt1.minLeafSize = opt1.minLeafSize or (opt1.nExample/100)
+
+ assert(torch.type(benchmark) == 'string', benchmark)
+ assert(bm[benchmark], benchmark)
+ bm[benchmark](opt1)
+ end
+end
diff --git a/contrib/lua-torch/decisiontree/doc/benchmark.md b/contrib/lua-torch/decisiontree/doc/benchmark.md
new file mode 100644
index 000000000..cb8f905d6
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/doc/benchmark.md
@@ -0,0 +1,291 @@
+# Benchmarks
+
+This file outlines the roadmap (and accompanying benchmarks) of optimizations and refactorings over time.
+
+## Baseline
+
+The baseline implementation is very slow.
+We converted the Twitter decision tree library (used internally) from Java to Lua.
+The objective was to replicate the GBDT and Random Forest implementations as is (more or less).
+The Java library is very good and reasonably fast. The same code in Lua is slow.
+The point of this Lua baseline was not to obtain the same computational performance as the Java library.
+Instead, we wanted the training and inference algorithms of the Lua lib to match those of the Java lib.
+As such, the training/validation error of the baseline Lua lib should match that of the Java lib.
+The unit tests seem to validate this claim as both training/validation set performance is unit tested.
+We also used the conversion exercise as a way to learn about decision tree implementation (our background is deep learning).
+That being said, the baseline performance is terrible:
+
+```
+th -e "dt = require 'decisiontree'; dt.benchmark()"
+CartTrainer: sparse dataset create: 2963.192386 samples/sec; 0.337479 sec
+CartTrainer: train single-thread : 14.165438 samples/sec; 70.594361 sec
+CartTrainer: setup feature-parallel : 5.129034 samples/sec; 194.968478 sec
+CartTrainer: train feature-parallel : 9.736592 samples/sec; 102.705344 sec
+```
+
+The original Java lib had approximately 43 classes.
+The baseline has about 24.
+This reduction is due to the obvious merging of classes, but also to the conversion of classes into functions.
+The next patches continue this process of reducing the number of classes.
+
+## Patch 1 (complete):
+
+This patch further reduces the number of classes, but adds the DataSet class.
+The code is much simpler to read. Examples are batched.
+
+ * [x] examples are batched in dt.DataSet: {input, target, score}
+ * [x] deprecate dt.LabeledExample
+ * [x] list of examples are replaced with torch.LongTensors of exampleIds
+ * [x] merge TreeBrancher into TreeState
+ * [x] merge BestSplitFinder and SplitStateUpdater into TreeState
+ * [x] TreeState subclasses: GradientBoostState and GiniState
+
+```
+th -e "dt = require 'decisiontree'; dt.benchmark()"
+CartTrainer: sparse dataset create: 3597.392294 samples/sec; 0.277984 sec
+CartTrainer: train single-thread : 35.763255 samples/sec; 27.961663 sec
+CartTrainer: setup feature-parallel : 36759.250495 samples/sec; 0.027220 sec
+CartTrainer: train feature-parallel : 72.523658 samples/sec; 13.788606 sec
+```
+
+The setup time for feature-parallelization is the most improved.
+The run-time for feature-parallel training is also about half that of single-thread.
+Since it uses 2 threads, this means the parallelization is working quite well.
+
+We also added benchmarks for the `RandomForestTrainer` and `GradientBoostTrainer`:
+
+```
+GradientBoostTrainer: train single-thread : 599.895105 samples/sec; 0.083348 sec/tree, 1.666958 sec
+GradientBoostTrainer: train feature-parallel : 974.235273 samples/sec; 0.051322 sec/tree, 1.026446 sec
+RandomForestTrainer: train single-thread : 134.781044 samples/sec; 0.370972 sec/tree, 7.419441 sec
+RandomForestTrainer: setup tree-parallel : 73341.097064 samples/sec; 0.013649 sec
+RandomForestTrainer: train tree-parallel : 262.975891 samples/sec; 0.190131 sec/tree, 3.802630 sec
+```
+
+Looks good.
+
+## Patch 2 (complete):
+
+ * [x] dt.LossFunction -> nn.Criterion (LogitBoost is done, missing MSE)
+ * [x] use SparseTensor:buildIndex() to accelerate TreeState:findBestSplit()
+ * [x] benchmarks use 10000 instead of 1000 examples
+
+The benchmarks indicate good improvements. Most improvements were made possible by the use of `buildIndex`:
+
+```
+th -e "dt = require 'decisiontree'; dt.benchmark()"
+GradientBoostState: findBestSplit (first) : 11.415645 sec
+GradientBoostState: findBestSplit (second) : 11.246336 sec
+CartTrainer: sparse dataset create: 3284.803629 samples/sec; 3.044327 sec
+CartTrainer: train single-thread : 239.544758 samples/sec; 41.745858 sec
+CartTrainer: setup feature-parallel : 10996.443063 samples/sec; 0.909390 sec
+CartTrainer: train feature-parallel : 473.888592 samples/sec; 21.102011 sec
+RandomForestTrainer: train single-thread : 892.985186 samples/sec; 0.559920 sec/tree, 11.198394 sec
+RandomForestTrainer: setup tree-parallel : 176806.252266 samples/sec; 0.056569 sec
+RandomForestTrainer: train tree-parallel : 1377.849291 samples/sec; 0.362884 sec/tree, 7.257688 sec
+GradientBoostTrainer: train single-thread : 2685.485128 samples/sec; 0.186186 sec/tree, 3.723722 sec
+GradientBoostTrainer: train feature-parallel : 3712.313215 samples/sec; 0.134687 sec/tree, 2.693738 sec
+```
+
+The main bottleneck is now in serializing the SparseTensor hash maps. We temporarily overcame this bottleneck by
+deleting the indexes when calling `CartTrainer:featureParallel()` and `RandomForestTrainer:treeParallel()`.
+In this way, the indexes are recreated for each thread. Ideally, we would use a C hash map such that a pointer
+could be serialized instead, but `tds.Hash` does not serialize well. For now, we use Lua tables.
+
+This is the benchmark for `GradientBoostTrainer` on a large dataset of dense inputs:
+
+```
+th -e "dt = require 'decisiontree'; dt.benchmark({'GradientBoostTrainer'}, {nExample=100000, sparse=false, nFeature=836, nTree=5, downsampleRatio=1, minLeafSize=1000, maxLeafNodes=8})"
+GradientBoostTrainer: train single-thread : 152.463989 samples/sec; 131.178517 sec/tree, 655.892584 sec
+GradientBoostTrainer: train feature-parallel : 224.288488 samples/sec; 89.170872 sec/tree, 445.854358 sec
+[tw-mbp-nleonard decisiontree]$ th -e "dt = require 'decisiontree'; dt.benchmark({'GradientBoostTrainer'}, {nExample=100000, sparse=false, nFeature=836, nTree=5, downsampleRatio=1, minLeafSize=1000, maxLeafNodes=8,nThread=4})"
+GradientBoostTrainer: train single-thread : 163.836896 samples/sec; 122.072625 sec/tree, 610.363126 sec
+GradientBoostTrainer: train feature-parallel : 407.981442 samples/sec; 49.021838 sec/tree, 245.109188 sec
+```
+
+## Patch 3 :
+
+Optimize GBDT for large datasets consisting of dense inputs. The benchmarks:
+
+```
+th -e "dt = require 'decisiontree'; dt.benchmark({'GradientBoostTrainer'}, {nExample=100000, sparse=false, nFeature=836, nTree=5, downsampleRatio=1, minLeafSize=1000, maxLeafNodes=8})"
+GradientBoostTrainer: train single-thread : 547.553407 samples/sec; 36.526117 sec/tree, 182.630587 sec
+GradientBoostTrainer: train feature-parallel : 792.964678 samples/sec; 25.221804 sec/tree, 126.109022 sec
+[tw-mbp-nleonard decisiontree]$ th -e "dt = require 'decisiontree'; dt.benchmark({'GradientBoostTrainer'}, {nExample=100000, sparse=false, nFeature=836, nTree=5, downsampleRatio=1, minLeafSize=1000, maxLeafNodes=8,nThread=4})"
+GradientBoostTrainer: train single-thread : 555.793759 samples/sec; 35.984571 sec/tree, 179.922855 sec
+GradientBoostTrainer: train feature-parallel : 1289.977846 samples/sec; 15.504142 sec/tree, 77.520711 sec
+```
+
+For 1, 2 and 4 threads, the speedups of patch 3 over patch 2 are respectively: 3.39, 3.53, and 3.18.
+For this patch, the multi-threading speedups of 2 and 4 threads over a single thread are respectively 1.42 and 2.33.
+Improvements over the previous patch were obtained by optimizing two aspects:
+
+ 1. Optimizing `TreeState.findBestFeatureSplit` for dense datasets (for example: `if dense, then ...`);
+ 2. Removing `assert` clauses in `GradientBoostState.update`. The `update` method is called for every (example, feature), making it a major bottleneck.
+
+Converting the `update` to C could lead to further optimizations.
+
+This patch also improves the benchmark on sparse datasets:
+```
+$ th -e "dt = require 'decisiontree'; dt.benchmark()"
+RandomForestTrainer: train single-thread : 1121.311196 samples/sec; 0.445907 sec/tree, 8.918131 sec
+RandomForestTrainer: setup tree-parallel : 168773.323354 samples/sec; 0.059256 sec
+RandomForestTrainer: train tree-parallel : 1701.280938 samples/sec; 0.293896 sec/tree, 5.877924 sec
+GradientBoostState: findBestSplit (first) : 8.250646 sec
+GradientBoostState: findBestSplit (second) : 7.952077 sec
+GradientBoostTrainer: train single-thread : 3355.248596 samples/sec; 0.149020 sec/tree, 2.980405 sec
+GradientBoostTrainer: train feature-parallel : 4399.133369 samples/sec; 0.113659 sec/tree, 2.273175 sec
+CartTrainer: sparse dataset create: 3428.105601 samples/sec; 2.917069 sec
+CartTrainer: train single-thread : 282.172416 samples/sec; 35.439331 sec
+CartTrainer: setup feature-parallel : 9455.440801 samples/sec; 1.057598 sec
+CartTrainer: train feature-parallel : 594.054049 samples/sec; 16.833491 sec
+DFD: train random forest in parallel : 346.831378 samples/sec; 0.288325 sec/tree, 5.766491 sec
+DFD: updateOutput : 831.105546 samples/sec; 0.038509 sec
+```
+
+## Patch 4 :
+
+This patch improves `nn.DFD` from
+
+```
+th -e "dt = require 'decisiontree'; dt.benchmark({'DFD'}, {nTree=500,maxLeafNodes=8,minLeafSize=1})"
+DFD: train random forest in parallel : 10.527251 samples/sec; 0.037997 sec/tree, 18.998313 sec
+DFD: updateOutput : 32.442950 samples/sec; 9.863472 sec
+```
+
+to
+
+```
+th -e "dt = require 'decisiontree'; dt.benchmark({'DFD'}, {nTree=500,maxLeafNodes=8,minLeafSize=1})"
+DFD: train random forest in parallel : 10.839547 samples/sec; 0.036902 sec/tree, 18.450956 sec
+DFD: updateOutput : 359.158353 samples/sec; 0.890975 sec
+Sparse2Dense: updateOutput : 15395.648952 samples/sec; 0.020791 sec
+```
+
+That is a 10x speedup for `nn.DFD`.
+
+The patch also adds a benchmark for `nn.Sparse2Dense`:
+
+```
+th -e "dt = require 'decisiontree'; dt.benchmark({'Sparse2Dense'}, {nTree=500,maxLeafNodes=8,minLeafSize=1})"
+Sparse2Dense: updateOutput : 17158.126406 samples/sec; 0.018653 sec
+```
+
+Indeed, `nn.Sparse2Dense` is not the bottleneck; `nn.DFD` is.
+
+## Patch 5 :
+
+This patch improves `nn.DFD` inference from
+
+```
+for i in `seq 3`; do th -e "dt = require 'decisiontree'; dt.benchmark({'DFD'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,batchsize=16,nActive=1200,nFeature=1300,nloop=100})"; done
+DFD: train random forest in parallel : 8.452295 samples/sec; 0.047324 sec/tree, 23.662212 sec
+DFD: updateOutput : 176.617872 samples/sec; 9.059109 sec
+DFD: train random forest in parallel : 8.350019 samples/sec; 0.047904 sec/tree, 23.952042 sec
+DFD: updateOutput : 183.508204 samples/sec; 8.718962 sec
+DFD: train random forest in parallel : 8.525779 samples/sec; 0.046917 sec/tree, 23.458266 sec
+DFD: updateOutput : 178.877077 samples/sec; 8.944692 sec
+```
+
+to
+
+```
+for i in `seq 3`; do th -e "dt = require 'decisiontree'; dt.benchmark({'DFD'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,batchsize=16,nActive=1200,nFeature=1300,nloop=100})"; done
+DFD: train random forest in parallel : 8.434502 samples/sec; 0.047424 sec/tree, 23.712129 sec
+DFD: updateOutput : 6479.597179 samples/sec; 0.246933 sec
+DFD: train random forest in parallel : 8.334543 samples/sec; 0.047993 sec/tree, 23.996518 sec
+DFD: updateOutput : 6663.641184 samples/sec; 0.240114 sec
+DFD: train random forest in parallel : 8.353265 samples/sec; 0.047885 sec/tree, 23.942735 sec
+DFD: updateOutput : 6882.607456 samples/sec; 0.232475 sec
+```
+
+That is a 37x speedup for `nn.DFD`.
+
+## Patch 6:
+
+This patch improves `nn.DFD` from the previous result to
+
+```
+for i in `seq 5`; do th -e "dt = require 'decisiontree'; dt.benchmark({'DFD'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,batchsize=16,nActive=1200,nFeature=1300,nloop=10000})"; done
+DFD: train random forest in parallel : 8.353504 samples/sec; 0.047884 sec/tree, 23.942050 sec
+DFD: updateOutput : 91967.342339 samples/sec; 1.739753 sec
+DFD: train random forest in parallel : 8.528141 samples/sec; 0.046904 sec/tree, 23.451770 sec
+DFD: updateOutput : 91405.321702 samples/sec; 1.750451 sec
+DFD: train random forest in parallel : 8.184562 samples/sec; 0.048872 sec/tree, 24.436250 sec
+DFD: updateOutput : 91623.388867 samples/sec; 1.746284 sec
+DFD: train random forest in parallel : 8.779561 samples/sec; 0.045560 sec/tree, 22.780182 sec
+DFD: updateOutput : 93914.242852 samples/sec; 1.703686 sec
+DFD: train random forest in parallel : 8.636201 samples/sec; 0.046317 sec/tree, 23.158330 sec
+DFD: updateOutput : 94092.241963 samples/sec; 1.700465 sec
+```
+
+That is another 13.8x speedup.
+
+## Patch 7:
+
+This patch improves `nn.Sparse2Dense` computation from
+
+```
+for i in `seq 3`; do th -e "dt = require 'decisiontree'; torch.setdefaulttensortype('torch.FloatTensor'); dt.benchmark({'Sparse2Dense'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,nFeature=1500,nActive=1300,nloop=1000})"; done
+Sparse2Dense: updateOutput : 1103.570777 samples/sec; 28.996786 sec
+Sparse2Dense: updateOutput : 1092.064331 samples/sec; 29.302309 sec
+Sparse2Dense: updateOutput : 1036.963572 samples/sec; 30.859334 sec
+```
+
+to
+
+```
+for i in `seq 3`; do th -e "dt = require 'decisiontree'; torch.setdefaulttensortype('torch.FloatTensor'); dt.benchmark({'Sparse2Dense'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,nFeature=1500,nActive=1300,nloop=1000})"; done
+Sparse2Dense: updateOutput : 62995.834470 samples/sec; 0.507978 sec
+Sparse2Dense: updateOutput : 62471.568253 samples/sec; 0.512242 sec
+Sparse2Dense: updateOutput : 62965.099331 samples/sec; 0.508226 sec
+```
+
+This represents a speedup of about 57x.
+
+## Patch 8:
+
+This patch improves `nn.Sparse2Dense` from the previous result to
+
+```
+for i in `seq 3`; do th -e "dt = require 'decisiontree'; torch.setdefaulttensortype('torch.FloatTensor'); dt.benchmark({'Sparse2Dense'}, {nTree=500,maxLeafNodes=8,minLeafSize=1,nFeature=1500,nActive=1300,nloop=1000})"; done
+Sparse2Dense: updateOutput : 124268.079914 samples/sec; 0.257515 sec
+Sparse2Dense: updateOutput : 114750.039542 samples/sec; 0.278873 sec
+Sparse2Dense: updateOutput : 122863.314766 samples/sec; 0.260458 sec
+```
+
+which corresponds to another 1.95x speedup.
+
+## Patch 9:
+
+This patch moves the core of GBDT training, which used to be a big bottleneck, to C. It also
+performs small optimizations across the board (faster scoring, faster branching, ...) that provide
+a little more performance.
+
+The original commit had this performance:
+
+```
+th -e "dt = require 'decisiontree'; torch.setdefaulttensortype('torch.FloatTensor'); dt.benchmark({'GradientBoostTrainer'}, {nExample=100000, sparse=false, nFeature=836, nTree=5, downsampleRatio=1, minLeafSize=1000, maxLeafNodes=8})"
+GradientBoostTrainer: train single-thread : 500.414666 samples/sec; 39.966854 sec/tree, 199.834271 sec
+GradientBoostTrainer: train feature-parallel : 1227.228044 samples/sec; 16.296890 sec/tree, 81.484448 sec (4 threads)
+GradientBoostTrainer: train feature-parallel : 1385.926280 samples/sec; 14.430782 sec/tree, 72.153910 sec (8 threads)
+```
+
+and the new version has
+
+```
+GradientBoostTrainer: train single-thread : 15285.644631 samples/sec; 1.308417 sec/tree, 6.542086 sec
+GradientBoostTrainer: train feature-parallel : 43170.435932 samples/sec; 0.463280 sec/tree, 2.316400 sec (4 threads)
+GradientBoostTrainer: train feature-parallel : 50062.681239 samples/sec; 0.399499 sec/tree, 1.997496 sec (8 threads)
+```
+
+That represents a speedup over the baseline of about 30.5x for 1 thread (15285.6 / 500.4) and
+36.1x for 8 threads (50062.7 / 1385.9). Note that the performance doesn't increase much with the
+number of threads: we use feature parallelism, and due to bagging only a small number of features
+(29 in this case) is evaluated.
+If we disable bagging, then we have the following result with 8 threads and the new code:
+
+```
+GradientBoostTrainer: train single-thread : 590.823965 samples/sec; 33.851030 sec/tree, 169.255152 sec
+GradientBoostTrainer: train feature-parallel : 3232.188576 samples/sec; 6.187758 sec/tree, 30.938789 sec
+```
+
+So processing all 836 features is now much faster than processing 29 was before.
diff --git a/contrib/lua-torch/decisiontree/error.h b/contrib/lua-torch/decisiontree/error.h
new file mode 100644
index 000000000..18df3c939
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/error.h
@@ -0,0 +1,24 @@
+#ifndef _ERROR_H_
+#define _ERROR_H_
+
+#include "luaT.h"
+#include <string.h>
+
+static inline int _lua_error(lua_State *L, int ret, const char* file, int line) {
+ int pos_ret = ret >= 0 ? ret : -ret;
+ return luaL_error(L, "ERROR: (%s, %d): (%d, %s)\n", file, line, pos_ret, strerror(pos_ret));
+}
+
+static inline int _lua_error_str(lua_State *L, const char *str, const char* file, int line) {
+ return luaL_error(L, "ERROR: (%s, %d): (%s)\n", file, line, str);
+}
+
+static inline int _lua_error_str_str(lua_State *L, const char *str, const char* file, int line, const char *extra) {
+ return luaL_error(L, "ERROR: (%s, %d): (%s: %s)\n", file, line, str, extra);
+}
+
+#define LUA_HANDLE_ERROR(L, ret) _lua_error(L, ret, __FILE__, __LINE__)
+#define LUA_HANDLE_ERROR_STR(L, str) _lua_error_str(L, str, __FILE__, __LINE__)
+#define LUA_HANDLE_ERROR_STR_STR(L, str, extra) _lua_error_str_str(L, str, __FILE__, __LINE__, extra)
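+
+// usage sketch from a lua C function:
+//   int rc = pthread_create(&thread, NULL, fn, arg);
+//   if (rc) return LUA_HANDLE_ERROR(L, rc);               // errno-style code
+//   if (bad_input) return LUA_HANDLE_ERROR_STR(L, "bad"); // plain message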
+
+#endif
diff --git a/contrib/lua-torch/decisiontree/generic/CartTree.c b/contrib/lua-torch/decisiontree/generic/CartTree.c
new file mode 100644
index 000000000..eb29fcf02
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/generic/CartTree.c
@@ -0,0 +1,88 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/CartTree.c"
+#else
+
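+// scores a batch of dense examples against a CART tree stored as nested lua tables; each node is
+// expected to expose the fields read below:
+//   leftChild / rightChild - child node tables (nil on a leaf)
+//   score - prediction held by a leaf
+//   splitFeatureId / splitFeatureValue - split of an internal node
+// stack: 1 = input (n_samples x n_features), 2 = root node table, 3 = output score tensor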
+static int nn_(tree_fast_score)(lua_State *L) {
+ THTensor *input = luaT_checkudata(L, 1, torch_Tensor);
+ THTensor *score = luaT_checkudata(L, 3, torch_Tensor);
+ long n_samples = THTensor_(size)(input, 0);
+ long n_features = THTensor_(size)(input, 1);
+ THTensor_(resize1d)(score, n_samples);
+ real *input_data = THTensor_(data)(input);
+ real *score_data = THTensor_(data)(score);
+
+ lua_pushstring(L, "leftChild");
+ const int left_child_string = 4;
+ lua_pushstring(L, "rightChild");
+ const int right_child_string = 5;
+ lua_pushstring(L, "score");
+ const int score_string = 6;
+ lua_pushstring(L, "splitFeatureId");
+ const int id_string = 7;
+ lua_pushstring(L, "splitFeatureValue");
+ const int value_string = 8;
+
+ const int original_top = lua_gettop(L);
+ for (long i = 0; i < n_samples; i++) {
+ int node = 2;
+ while (1) {
+ int current_top = lua_gettop(L);
+ lua_pushvalue(L, left_child_string);
+ lua_rawget(L, node);
+ lua_pushvalue(L, right_child_string);
+ lua_rawget(L, node);
+ if (lua_isnil(L, -2) && lua_isnil(L, -1)) {
+ lua_pushvalue(L, score_string);
+ lua_rawget(L, node);
+ score_data[i] = lua_tonumber(L, -1);
+ break;
+ }
+ if (lua_isnil(L, -2)) {
+ // go to right
+ node = current_top + 2;
+ continue;
+ }
+ if (lua_isnil(L, -1)) {
+ // go to left
+ node = current_top + 1;
+ continue;
+ }
+ lua_pushvalue(L, id_string);
+ lua_rawget(L, node);
+ lua_pushvalue(L, value_string);
+ lua_rawget(L, node);
+ long feature_id = lua_tointeger(L, -2);
+ real feature_value = lua_tonumber(L, -1);
+
+ real current_value = input_data[i * n_features + (feature_id-1)];
+ if (current_value < feature_value) {
+ // go to left
+ node = current_top + 1;
+ }
+ else {
+ // go to right
+ node = current_top + 2;
+ }
+ }
+ lua_pop(L, lua_gettop(L) - original_top);
+ }
+
+ lua_pop(L, 5);
+
+ lua_pushvalue(L, 3);
+ return 1;
+}
+
+static const struct luaL_Reg nn_(CT__) [] = {
+ {"CartTreeFastScore", nn_(tree_fast_score)},
+ {NULL, NULL}
+};
+
+static void nn_(CT_init)(lua_State *L)
+{
+ luaT_pushmetatable(L, torch_Tensor);
+ luaT_registeratname(L, nn_(CT__), "nn");
+ lua_pop(L,1);
+}
+
+#endif
diff --git a/contrib/lua-torch/decisiontree/generic/DFD.c b/contrib/lua-torch/decisiontree/generic/DFD.c
new file mode 100644
index 000000000..599c4d794
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/generic/DFD.c
@@ -0,0 +1,157 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/DFD.c"
+#else
+
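+// computes the output of nn.DFD: every sample is run down every tree of the forest, and the ids
+// of the nodes it traverses (only the final leaf when only_last_node is set) become its active
+// sparse features; returns a table of two tables holding, per sample, a narrowed LongTensor of
+// node ids and the matching narrowed value tensor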
+static int nn_(DFD_computeOutput)(lua_State *L) {
+ THLongTensor *outputkeys = luaT_checkudata(L, 1, "torch.LongTensor");
+ THTensor *outputvalues = luaT_checkudata(L, 2, torch_Tensor);
+ THLongTensor *root_ids = luaT_checkudata(L, 3, "torch.LongTensor");
+ THLongTensor *left_child = luaT_checkudata(L, 4, "torch.LongTensor");
+ THLongTensor *right_child = luaT_checkudata(L, 5, "torch.LongTensor");
+ THLongTensor *split_feature_id = luaT_checkudata(L, 6, "torch.LongTensor");
+ THTensor *split_feature_value = luaT_checkudata(L, 7, torch_Tensor);
+ THTensor *input = luaT_checkudata(L, 8, torch_Tensor);
+ char only_last_node = lua_toboolean(L, 9);
+
+ // gets some important sizes from the input
+ long batch_size = THTensor_(size)(input, 0);
+ long input_size = THTensor_(size)(input, 1);
+ long roots_size = THLongTensor_size(root_ids, 0);
+ long depth = THLongTensor_size(outputkeys, 1);
+
+ // keeps track of the number of nodes traversed in the trees by each sample.
+ // each traversed node maps to an output feature having a value of 1
+ long outputsize[batch_size];
+ for (long i = 0; i < batch_size; i++)
+ outputsize[i] = 0;
+
+ // gets direct pointers to the memory of each tensor for efficiency
+ long *root_ids_data = THLongTensor_data(root_ids);
+ long *left_child_data = THLongTensor_data(left_child);
+ long *right_child_data = THLongTensor_data(right_child);
+ real *split_feature_value_data = THTensor_(data)(split_feature_value);
+ long *split_feature_id_data = THLongTensor_data(split_feature_id);
+ long *outputkeys_data = THLongTensor_data(outputkeys);
+ real *input_data = THTensor_(data)(input);
+
+ // for each sample in the batch
+ for (long sample_index = 0; sample_index < batch_size; sample_index++) {
+ // gets pointers to the direct memory associated with each sample for efficiency
+ const long outputkeys_offset = sample_index * depth;
+ const long input_offset = sample_index * input_size;
+ long *local_outputkeys_data = &outputkeys_data[outputkeys_offset];
+ real *local_input_data = &input_data[input_offset];
+
+ // for each tree in the forest
+ for (long i = 0; i < roots_size; i++) {
+ int root = 1;
+ long node_id = root_ids_data[i];
+
+ // traverses the whole tree keeping track of which nodes were seen
+ while (1) {
+ if (root) {
+ // root nodes aren't added to output because they are always traversed
+ root = 0;
+ }
+ else if (!only_last_node) {
+ // updates the outputsize for all samples traversing this node; and
+ // set the traversed node as a feature in output for exampleIds
+ long output_index = outputsize[sample_index];
+ // updates the outputsize for all samples traversing this node
+ outputsize[sample_index]++;
+ // sets the traversed node as a feature in output for exampleIds
+ local_outputkeys_data[output_index] = node_id;
+ }
+
+      // gets the left and right children. values <= 0 represent a missing node
+ long left_id = left_child_data[node_id-1];
+ long right_id = right_child_data[node_id-1];
+
+ if (left_id <= 0 && right_id <= 0) {
+ if (only_last_node) {
+ long output_index = outputsize[sample_index];
+ outputsize[sample_index]++;
+ local_outputkeys_data[output_index] = node_id;
+ }
+ // if no children, stops
+ break;
+ }
+ else if (left_id <= 0) {
+ // if no left child, traverses right node
+ node_id = right_id;
+ }
+ else if (right_id <= 0) {
+ // if no right child, traverses left node
+ node_id = left_id;
+ }
+ else {
+ // if both left and right children, finds the direction for this sample
+ // first get the reference from the node
+ real split_value = split_feature_value_data[node_id-1];
+ long split_id = split_feature_id_data[node_id-1]-1;
+
+ // then gets the value of the sample
+ real node_value = local_input_data[split_id];
+ // and branchs
+ if (node_value < split_value)
+ node_id = left_id;
+ else
+ node_id = right_id;
+ }
+ }
+ }
+ }
+
+  // now that we know which nodes were traversed for each sample, we can create the sparse output
+ // with 1 entry pair for each sample
+ THTensor *input_feature = THTensor_(new)();
+ THLongTensor *indices = THLongTensor_new();
+
+ // pushes the return table with 2 children tables
+ lua_newtable(L);
+ lua_pushinteger(L, 1);
+ lua_newtable(L);
+ lua_pushinteger(L, 2);
+ lua_newtable(L);
+
+ // for each sample...
+ for (long i = 0; i < batch_size; i++) {
+ long j = outputsize[i];
+ // selects the tensor lines from the dense output
+ THLongTensor_select(indices, outputkeys, 0, i);
+ THTensor_(select)(input_feature, outputvalues, 0, i);
+
+ // narrows the keys to actual number of nodes traversed and saves to the output
+ lua_pushinteger(L, i+1);
+ luaT_pushudata(L, THLongTensor_newNarrow(indices, 0, 0, j), "torch.LongTensor");
+ lua_settable(L, -5);
+
+ // and narrows the values
+ lua_pushinteger(L, i+1);
+ luaT_pushudata(L, THTensor_(newNarrow)(input_feature, 0, 0, j), torch_Tensor);
+ lua_settable(L, -3);
+ }
+
+ // pushes the two parts of the output into the output table
+ lua_settable(L, -5);
+ lua_settable(L, -3);
+
+ THLongTensor_free(indices);
+ THTensor_(free)(input_feature);
+
+ return 1;
+}
+
+static const struct luaL_Reg nn_(DFD__) [] = {
+ {"DFD_computeOutput", nn_(DFD_computeOutput)},
+ {NULL, NULL}
+};
+
+static void nn_(DFD_init)(lua_State *L)
+{
+ luaT_pushmetatable(L, torch_Tensor);
+ luaT_registeratname(L, nn_(DFD__), "nn");
+ lua_pop(L,1);
+}
+
+#endif
diff --git a/contrib/lua-torch/decisiontree/generic/GBDT.c b/contrib/lua-torch/decisiontree/generic/GBDT.c
new file mode 100644
index 000000000..31f5b025d
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/generic/GBDT.c
@@ -0,0 +1,392 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/GBDT.c"
+#else
+
+#include "GBDT_internal.h"
+#include "GBDT_internal.c"
+
+// note that each one of the functions to find the best split is a subset of the next.
+// first we have one that can only evaluate a single feature, using the logic in lua to control the
+// features
+// then we have one that can go over a shard of features, following the feature parallelism
+// introduced by the lua logic
+// and finally we have one that performs the feature parallelism itself in the special case of
+// dense tensors
+// all of them are kept for completeness and for testing in case the logic has to be changed
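+// from lua they are reached through the tensor metatable's "nn" field; a sketch of the dense
+// fast path (the exact call sites live in the lua trainer code):
+//   input.nn.GBDT_findBestSplitFP(exampleIds, dataset, featureIds, minLeafSize, grad, hess, nThread)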
+
+// finds the best split for a given node and feature
+static int nn_(gb_findBestFeatureSplit)(lua_State *L) {
+ THLongTensor *exampleIds = luaT_checkudata(L, 1, "torch.LongTensor");
+ const int dataset_index = 2;
+ if (!lua_isnumber(L, 3))
+ return LUA_HANDLE_ERROR_STR(L, "third argument should be an integer");
+ long feature_id = lua_tointeger(L, 3);
+ if (!lua_isnumber(L, 4))
+ return LUA_HANDLE_ERROR_STR(L, "fourth argument should be an integer");
+ long minLeafSize = lua_tointeger(L, 4);
+ // Since minLeafSize == 1 corresponds to each sample in its own leaf, any value below it doesn't
+ // make sense
+ if (minLeafSize < 1)
+ minLeafSize = 1;
+ THTensor *grad = luaT_checkudata(L, 5, torch_Tensor);
+ THTensor *hess = luaT_checkudata(L, 6, torch_Tensor);
+
+ if (!THLongTensor_isContiguous(exampleIds))
+ return LUA_HANDLE_ERROR_STR(L, "exampleIds has to be contiguous");
+ if (!THTensor_(isContiguous)(grad))
+ return LUA_HANDLE_ERROR_STR(L, "grad has to be contiguous");
+ if (!THTensor_(isContiguous)(hess))
+ return LUA_HANDLE_ERROR_STR(L, "hessian has to be contiguous");
+
+ // initializes the static data
+ nn_(GBInitialization) initialization_data;
+ nn_(gb_initialize)(L, &initialization_data, exampleIds, grad, hess, dataset_index);
+
+ // initializes the dynamic data
+ GBRunData run_data;
+ gb_create_run_data(&run_data, minLeafSize);
+
+ // finds the best state possible for the split
+ nn_(GBBestState) bs;
+ nn_(gb_find_best_feature_split)(L, &initialization_data, &bs, feature_id, &run_data);
+
+ lua_pop(L, lua_gettop(L) - initialization_data.splitInfo_index);
+
+  // fills the table with the best split found; the lua logic above will do everything else.
+ // if no state was found, returns nil
+ if (bs.valid_state == 0) {
+ lua_pop(L, 1);
+ lua_pushnil(L);
+ }
+ else {
+ nn_(gb_internal_split_info)(L, &bs, initialization_data.splitInfo_index);
+ }
+
+ gb_destroy_run_data(&run_data);
+
+ return 1;
+}
+
+// finds the best split for a given node and shard of features
+// this is more efficient than calling the previous one multiple times
+static int nn_(gb_findBestSplit)(lua_State *L) {
+ THLongTensor *exampleIds = luaT_checkudata(L, 1, "torch.LongTensor");
+ const int dataset_index = 2;
+ THLongTensor *feature_ids = luaT_checkudata(L, 3, "torch.LongTensor");
+ if (!lua_isnumber(L, 4))
+ return LUA_HANDLE_ERROR_STR(L, "fourth argument should be an integer");
+ long minLeafSize = lua_tointeger(L, 4);
+ // Since minLeafSize == 1 corresponds to each sample in its own leaf, any value below it doesn't
+ // make sense
+ if (minLeafSize < 1)
+ minLeafSize = 1;
+ if (!lua_isnumber(L, 5))
+ return LUA_HANDLE_ERROR_STR(L, "fifth argument should be an integer");
+ long shardId = lua_tointeger(L, 5);
+ if (!lua_isnumber(L, 6))
+ return LUA_HANDLE_ERROR_STR(L, "sixth argument should be an integer");
+ long nShard = lua_tointeger(L, 6);
+ THTensor *grad = luaT_checkudata(L, 7, torch_Tensor);
+ THTensor *hess = luaT_checkudata(L, 8, torch_Tensor);
+
+ if (!THLongTensor_isContiguous(exampleIds))
+ return LUA_HANDLE_ERROR_STR(L, "exampleIds has to be contiguous");
+ if (!THTensor_(isContiguous)(grad))
+ return LUA_HANDLE_ERROR_STR(L, "grad has to be contiguous");
+ if (!THTensor_(isContiguous)(hess))
+ return LUA_HANDLE_ERROR_STR(L, "hessian has to be contiguous");
+
+ // initializes the static data
+ nn_(GBInitialization) initialization_data;
+ nn_(gb_initialize)(L, &initialization_data, exampleIds, grad, hess, dataset_index);
+
+ // initializes the dynamic data
+ GBRunData run_data;
+ gb_create_run_data(&run_data, minLeafSize);
+
+ // initializes to evaluate all the features in this shard
+ nn_(GBBestState) global_bs;
+ global_bs.valid_state = 0;
+ long n_features = THLongTensor_size(feature_ids, 0);
+ if (!THLongTensor_isContiguous(feature_ids))
+ return LUA_HANDLE_ERROR_STR(L, "feature_ids must be contiguous");
+ long *feature_ids_data = THLongTensor_data(feature_ids);
+
+ // for every feature
+ for (long i = 0; i < n_features; i++) {
+ long feature_id = feature_ids_data[i];
+ // if we are responsible for it
+ if (nShard <= 1 || (feature_id % nShard) + 1 == shardId) {
+ // finds the best state possible for the split
+ nn_(GBBestState) bs;
+ nn_(gb_find_best_feature_split)(L, &initialization_data, &bs, feature_id, &run_data);
+
+ // if it's valid and better than one we found before, saves it
+ if (bs.valid_state) {
+ if (global_bs.valid_state == 0 || bs.gain < global_bs.gain) {
+ global_bs = bs;
+ }
+ }
+ }
+ }
+
+ lua_pop(L, lua_gettop(L) - initialization_data.splitInfo_index);
+
+  // fills the table with the best split found; the lua logic above will do everything else.
+ // if no state was found, returns nil
+ if (global_bs.valid_state == 0) {
+ lua_pop(L, 1);
+ lua_pushnil(L);
+ }
+ else {
+ nn_(gb_internal_split_info)(L, &global_bs, initialization_data.splitInfo_index);
+ }
+
+ gb_destroy_run_data(&run_data);
+
+ return 1;
+}
+
+// all the info we have to pass to the worker threads so that they can do their jobs
+// note that we do not pass the lua state since it isn't required. we perform direct C parallelism
+// instead of using lua's parallelism like the previous version does
+typedef struct {
+ nn_(GBInitialization) *initialization_data;
+ GBRunData *run_data;
+ long *index;
+ nn_(GBBestState) *global_bs;
+ long n_features;
+ long *feature_ids_data;
+ pthread_mutex_t *mutex;
+ THLongTensor *exampleIds;
+ THTensor *input;
+ THLongTensor **sorted_ids_per_feature;
+} nn_(ThreadInfo);
+
+// loops over all the features in parallel and finds the best global split
+static void* nn_(thread_worker)(void *arg) {
+ nn_(ThreadInfo) *info = (nn_(ThreadInfo) *)arg;
+
+ while (1) {
+ pthread_mutex_lock(info->mutex);
+ long index = (*info->index);
+ (*info->index)++;
+ pthread_mutex_unlock(info->mutex);
+
+ if (index >= info->n_features)
+ break;
+
+    // performs part of steps (1) and (2) of gb_find_best_feature_split using pre-loaded data,
+    // without having to access the lua state
+ long feature_id = info->feature_ids_data[index];
+ THLongTensor *exampleIdsWithFeature_ret = info->exampleIds;
+ THLongTensor *featureExampleIds = info->sorted_ids_per_feature[index];
+ nn_(GBInitialization) *initialization_data = info->initialization_data;
+ GBRunData *run_data = info->run_data;
+
+ // performs steps (3) and (4) of gb_find_best_feature_split since (1) and (2) were already
+ // performed before
+ nn_(GBBestState) bs;
+ nn_(gb_internal_create)(initialization_data->grad, initialization_data->hess,
+ exampleIdsWithFeature_ret, &bs.state);
+ nn_(gb_internal_get_best_split_special)(&bs, featureExampleIds, run_data->exampleMap,
+ info->input, run_data->minLeafSize, feature_id);
+
+ // saves to the global state if it's better
+ if (bs.valid_state) {
+ pthread_mutex_lock(info->mutex);
+ if (info->global_bs->valid_state == 0 || bs.gain < info->global_bs->gain) {
+ (*info->global_bs) = bs;
+ }
+ pthread_mutex_unlock(info->mutex);
+ }
+ }
+
+ return NULL;
+}
+
+// finds the global best split by doing feature parallelism directly in C
+static int nn_(gb_findBestSplitFP)(lua_State *L) {
+ THLongTensor *exampleIds = luaT_checkudata(L, 1, "torch.LongTensor");
+ const int dataset_index = 2;
+ THLongTensor *feature_ids = luaT_checkudata(L, 3, "torch.LongTensor");
+ if (!lua_isnumber(L, 4))
+ return LUA_HANDLE_ERROR_STR(L, "fourth argument should be an integer");
+ long minLeafSize = lua_tointeger(L, 4);
+ THTensor *grad = luaT_checkudata(L, 5, torch_Tensor);
+ THTensor *hess = luaT_checkudata(L, 6, torch_Tensor);
+ if (!lua_isnumber(L, 7))
+ return LUA_HANDLE_ERROR_STR(L, "seventh argument should be an integer");
+ long nThread = lua_tointeger(L, 7);
+
+ if (!THLongTensor_isContiguous(exampleIds))
+ return LUA_HANDLE_ERROR_STR(L, "exampleIds has to be contiguous");
+ if (!THTensor_(isContiguous)(grad))
+ return LUA_HANDLE_ERROR_STR(L, "grad has to be contiguous");
+ if (!THTensor_(isContiguous)(hess))
+ return LUA_HANDLE_ERROR_STR(L, "hessian has to be contiguous");
+
+ pthread_mutex_t mutex;
+ pthread_mutex_init(&mutex, NULL);
+
+ // initializes the static data
+ nn_(GBInitialization) initialization_data;
+ nn_(gb_initialize)(L, &initialization_data, exampleIds, grad, hess, dataset_index);
+
+ // initializes the dynamic data
+ GBRunData run_data;
+ gb_create_run_data(&run_data, minLeafSize);
+
+ // initializes to evaluate all the features
+ nn_(GBBestState) global_bs;
+ global_bs.valid_state = 0;
+ long n_features = THLongTensor_size(feature_ids, 0);
+ if (!THLongTensor_isContiguous(feature_ids))
+ return LUA_HANDLE_ERROR_STR(L, "feature_ids must be contiguous");
+ long *feature_ids_data = THLongTensor_data(feature_ids);
+
+ THTensor *input = luaT_checkudata(L, initialization_data.input_index, torch_Tensor);
+
+ // performs step (1) of gb_find_best_feature_split so that we don't have to pass the lua state
+ THLongTensor *sorted_ids_per_feature[n_features];
+ for (long i = 0; i < n_features; i++) {
+ long feature_id = feature_ids_data[i];
+ lua_pushvalue(L, initialization_data.getSortedFeature_index);
+ lua_pushvalue(L, initialization_data.dataset_index);
+ lua_pushinteger(L, feature_id);
+ lua_call(L, 2, 1);
+
+ THLongTensor *featureExampleIds = luaT_checkudata(L, -1, "torch.LongTensor");
+ sorted_ids_per_feature[i] = featureExampleIds;
+ }
+
+  // performs step (2) of gb_find_best_feature_split since it's the same for all features when the
+ // data is dense
+ long exampleIds_size = THLongTensor_size(initialization_data.exampleIds, 0);
+ long *exampleIds_data = THLongTensor_data(initialization_data.exampleIds);
+
+ int ret;
+ kh_resize(long, run_data.exampleMap, exampleIds_size*8);
+ for (long i = 0; i < exampleIds_size; i++)
+ kh_put(long, run_data.exampleMap, exampleIds_data[i], &ret);
+
+ // saves the info for the threads
+ long index = 0;
+ nn_(ThreadInfo) info;
+ info.initialization_data = &initialization_data;
+ info.run_data = &run_data;
+ info.index = &index;
+ info.global_bs = &global_bs;
+ info.n_features = n_features;
+ info.feature_ids_data = feature_ids_data;
+ info.mutex = &mutex;
+ info.exampleIds = exampleIds;
+ info.input = input;
+ info.sorted_ids_per_feature = sorted_ids_per_feature;
+
+ pthread_t threads[nThread];
+
+ // let the threads run like crazy over the features to find the minimum
+ for (long i = 0; i < nThread; i++) {
+ int ret = pthread_create(&threads[i], NULL, nn_(thread_worker), &info);
+ if (ret)
+ return LUA_HANDLE_ERROR_STR(L, "falied to create thread");
+ }
+
+ for (long i = 0; i < nThread; i++) {
+ int ret = pthread_join(threads[i], NULL);
+ if (ret)
+ return LUA_HANDLE_ERROR_STR(L, "failed to join thread");
+ }
+
+ lua_pop(L, lua_gettop(L) - initialization_data.splitInfo_index);
+
+  // fills the table with the best split found; the lua logic above will do everything else.
+ // if no state was found, returns nil
+ if (global_bs.valid_state == 0) {
+ lua_pop(L, 1);
+ lua_pushnil(L);
+ }
+ else {
+ nn_(gb_internal_split_info)(L, &global_bs, initialization_data.splitInfo_index);
+ }
+
+ gb_destroy_run_data(&run_data);
+ pthread_mutex_destroy(&mutex);
+
+ return 1;
+}
+
+// performs an efficient branch of the current examples based on a split info provided
+static int nn_(gb_branch)(lua_State *L) {
+ if (!lua_istable(L, 1))
+ return LUA_HANDLE_ERROR_STR(L, "first argument must be a table");
+ THTensor *input = luaT_checkudata(L, 2, torch_Tensor);
+ THLongTensor *exampleIds = luaT_checkudata(L, 3, "torch.LongTensor");
+
+ // gets direct access to the dataset
+ long n_exampleIds = THLongTensor_size(exampleIds, 0);
+ long *exampleIds_data = THLongTensor_data(exampleIds);
+ long n_features = THTensor_(size)(input, 1);
+ real *input_data = THTensor_(data)(input);
+
+ // creates the tensors to be returned
+ luaT_pushudata(L, THLongTensor_new(), "torch.LongTensor");
+ luaT_pushudata(L, THLongTensor_new(), "torch.LongTensor");
+ THLongTensor *leftExampleIds = luaT_checkudata(L, 4, "torch.LongTensor");
+ THLongTensor *rightExampleIds = luaT_checkudata(L, 5, "torch.LongTensor");
+ THLongTensor_resize1d(leftExampleIds, n_exampleIds);
+
+ // gets direct access to the examples
+ THLongTensor *splitExampleIds = leftExampleIds;
+ long *splitExampleIds_data = THLongTensor_data(splitExampleIds);
+
+ // gets the split info
+ lua_pushstring(L, "splitId");
+ lua_rawget(L, 1);
+ const long splitId = lua_tointeger(L, -1);
+ lua_pushstring(L, "splitValue");
+ lua_rawget(L, 1);
+ const real splitValue = lua_tonumber(L, -1);
+ lua_pop(L, 2);
+
+ long leftIdx = 0, rightIdx = 0;
+
+ // goes over all the samples dividing them into the two sides
+ for (long i = 0; i < n_exampleIds; i++) {
+ long exampleId = exampleIds_data[i];
+ real val = input_data[(exampleId-1) * n_features + (splitId - 1)];
+ if (val <= splitValue) {
+ leftIdx++;
+ splitExampleIds_data[leftIdx-1] = exampleId;
+ }
+ else {
+ rightIdx++;
+ splitExampleIds_data[n_exampleIds - rightIdx + 1 - 1] = exampleId;
+ }
+ }
+
+ // once done, the resulting tensors are just splits of the sample base. this is more efficient
+ // than having 2 tensors since we didn't know where the split would happen (how much to each
+ // side), but we knew that the sum would be constant
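+  // e.g. 5 ids of which 3 go left: the buffer ends up as [l1, l2, l3, r2, r1], so
+  // left = narrow(0, 3) and right = narrow(3, 2); the right side is in reverse encounter order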
+ THLongTensor_narrow(rightExampleIds, splitExampleIds, 0, n_exampleIds-rightIdx+1-1, rightIdx);
+ THLongTensor_narrow(leftExampleIds, splitExampleIds, 0, 0, leftIdx);
+ return 2;
+}
+
+static const struct luaL_Reg nn_(GBDT__) [] = {
+ {"GBDT_findBestFeatureSplit", nn_(gb_findBestFeatureSplit)},
+ {"GBDT_findBestSplit", nn_(gb_findBestSplit)},
+ {"GBDT_findBestSplitFP", nn_(gb_findBestSplitFP)},
+ {"GBDT_branch", nn_(gb_branch)},
+ {NULL, NULL}
+};
+
+static void nn_(GBDT_init)(lua_State *L)
+{
+ luaT_pushmetatable(L, torch_Tensor);
+ luaT_registeratname(L, nn_(GBDT__), "nn");
+ lua_pop(L,1);
+}
+
+#endif
diff --git a/contrib/lua-torch/decisiontree/generic/GBDT_internal.c b/contrib/lua-torch/decisiontree/generic/GBDT_internal.c
new file mode 100644
index 000000000..739aabf25
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/generic/GBDT_internal.c
@@ -0,0 +1,312 @@
+// initializes the optimization structure based on the arguments provided, either filling directly
+// or making calls to lua to load some kind of data
+static void nn_(gb_initialize)(lua_State *L, nn_(GBInitialization) *initialization_data,
+ THLongTensor *exampleIds, THTensor *grad, THTensor *hess, int dataset_index) {
+ initialization_data->dataset_index = dataset_index;
+ initialization_data->exampleIds = exampleIds;
+ initialization_data->grad = grad;
+ initialization_data->hess = hess;
+
+ lua_newtable(L);
+ initialization_data->splitInfo_index = lua_gettop(L);
+
+ lua_pushstring(L, "input");
+ lua_gettable(L, dataset_index);
+ initialization_data->input_index = lua_gettop(L);
+
+ lua_pushstring(L, "getSortedFeature");
+ lua_gettable(L, dataset_index);
+ initialization_data->getSortedFeature_index = lua_gettop(L);
+}
+
+// initializes a state that will be passed to the optimizer
+static void nn_(gb_internal_create)(THTensor *grad, THTensor *hessian,
+ THLongTensor *exampleIds, nn_(GBState)* s) {
+ long *exampleIds_data = THLongTensor_data(exampleIds);
+ long n_examples = THLongTensor_size(exampleIds, 0);
+ accreal leftGradientSum = 0;
+ accreal leftHessianSum = 0;
+
+ real *grad_data = THTensor_(data)(grad);
+ real *hessian_data = THTensor_(data)(hessian);
+
+ // only sums the relevant gradients and hessians
+ for (long i = 0; i < n_examples; i++) {
+ long exampleId = exampleIds_data[i]-1;
+ leftGradientSum += grad_data[exampleId];
+ leftHessianSum += hessian_data[exampleId];
+ }
+
+ // we move data from the left branch to the right branch
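+  // note: both hessian sums below carry a constant +1, seemingly a smoothing/regularization term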
+ s->rightGradientSum = 0;
+ s->rightHessianSum = 1;
+ s->nExampleInRightBranch = 0;
+ s->leftGradientSum = leftGradientSum;
+ s->leftHessianSum = leftHessianSum + 1;
+ s->nExampleInLeftBranch = n_examples;
+
+ // stores the loss in parent for efficiency
+ real lossInParent = computeGradientBoostLoss(s->leftGradientSum + s->rightGradientSum,
+ s->leftHessianSum + s->rightHessianSum);
+ s->lossInParent = lossInParent;
+
+ // caches the direct pointers to the data for efficiency
+ s->grad_data = grad_data;
+ s->hessian_data = hessian_data;
+}
+
+// computes the gain obtained by performing the split
+static real nn_(computeSplitGain)(nn_(GBState) *s) {
+ real lossInLeftBranch = computeGradientBoostLoss(s->leftGradientSum, s->leftHessianSum);
+ real lossInRightBranch = computeGradientBoostLoss(s->rightGradientSum, s->rightHessianSum);
+ return lossInLeftBranch + lossInRightBranch - s->lossInParent;
+}
+
+// uses the state information to build the table required by the lua library about the best split
+static void nn_(gb_internal_split_info)(lua_State *L, nn_(GBBestState) *bs, int res) {
+ long feature_id = bs->feature_id;
+ real feature_value = bs->feature_value;
+ real gain = bs->gain;
+ nn_(GBState) *s = &bs->state;
+ lua_pushstring(L, "splitGain");
+ lua_pushnumber(L, gain);
+ lua_rawset(L, res);
+ lua_pushstring(L, "splitId");
+ lua_pushinteger(L, feature_id);
+ lua_rawset(L, res);
+ lua_pushstring(L, "splitValue");
+ lua_pushnumber(L, feature_value);
+ lua_rawset(L, res);
+ lua_pushstring(L, "leftChildSize");
+ lua_pushinteger(L, s->nExampleInLeftBranch);
+ lua_rawset(L, res);
+ lua_pushstring(L, "rightChildSize");
+ lua_pushinteger(L, s->nExampleInRightBranch);
+ lua_rawset(L, res);
+ lua_pushstring(L, "leftGradient");
+ lua_pushnumber(L, s->leftGradientSum);
+ lua_rawset(L, res);
+ lua_pushstring(L, "rightGradient");
+ lua_pushnumber(L, s->rightGradientSum);
+ lua_rawset(L, res);
+ lua_pushstring(L, "leftHessian");
+ lua_pushnumber(L, s->leftHessianSum);
+ lua_rawset(L, res);
+ lua_pushstring(L, "rightHessian");
+ lua_pushnumber(L, s->rightHessianSum);
+ lua_rawset(L, res);
+}
+
+// core of the computation, where we loop over all the relevant samples looking for the best split
+// we can find
+static void nn_(gb_internal_get_best_split)(lua_State *L, nn_(GBBestState) *bs,
+ THLongTensor *featureExampleIds, khash_t(long)* exampleMap, int input_table_index,
+ long minLeafSize, long feature_id) {
+ nn_(GBState) current_state;
+ nn_(GBState) best_state;
+ current_state = bs->state;
+
+ real best_gain = INFINITY;
+ real best_value = 0;
+
+ // if the data is dense, pre-loads direct access to it
+ THTensor *input = NULL;
+ real *input_data = NULL;
+ long n_features = 0;
+  if (!lua_istable(L, input_table_index)) {
+    input = luaT_checkudata(L, input_table_index, torch_Tensor);
+    input_data = THTensor_(data)(input);
+    n_features = THTensor_(size)(input, 1);
+  }
+
+ long stride = featureExampleIds->stride[0];
+ long *featureExampleIds_data = THLongTensor_data(featureExampleIds);
+
+ khiter_t k;
+
+ real previousSplitValue = 0;
+ // for each example with the given feature and from large to small value...
+ for (long i = THLongTensor_size(featureExampleIds, 0)-1; i >= 0; i--) {
+ long exampleId = featureExampleIds_data[i * stride];
+
+ // checks if the sample is in the list of ones that have to be evaluated by this node
+ k = kh_get(long, exampleMap, exampleId);
+ if (k != kh_end(exampleMap)) {
+ long exampleIdx = exampleId;
+
+ // gets the split value, depending on whether the input is sparse or dense
+ real splitValue;
+ if (input_data) {
+ splitValue = input_data[(exampleId-1) * n_features + feature_id-1];
+ }
+ else {
+ lua_pushinteger(L, exampleId);
+ lua_gettable(L, input_table_index);
+ lua_pushinteger(L, feature_id);
+ lua_gettable(L, -2);
+ splitValue = lua_tonumber(L, -1);
+ lua_pop(L, 2);
+ }
+
+ // performs one update of the state, moving a sample from the left branch to the right
+ real gradient = current_state.grad_data[exampleIdx-1];
+ real hessian = current_state.hessian_data[exampleIdx-1];
+ current_state.leftGradientSum -= gradient;
+ current_state.rightGradientSum += gradient;
+ current_state.leftHessianSum -= hessian;
+ current_state.rightHessianSum += hessian;
+ current_state.nExampleInLeftBranch--;
+ current_state.nExampleInRightBranch++;
+
+ // since we remove from the left, once this becomes true, it stays true forever
+ // hence we stop the loop
+ if (current_state.nExampleInLeftBranch < minLeafSize)
+ break;
+
+ if (current_state.nExampleInRightBranch >= minLeafSize) {
+ // if the values are equal between the steps, it doesn't make sense to evaluate the score
+ // since we won't be able to separate the two
+ if (previousSplitValue != splitValue) {
+ // computes the gain **without including the parent** since it doesn't change as we move
+ // examples between branches
+ real lossInLeftBranch = computeGradientBoostLoss(current_state.leftGradientSum, current_state.leftHessianSum);
+ real lossInRightBranch = computeGradientBoostLoss(current_state.rightGradientSum, current_state.rightHessianSum);
+ real current_gain = lossInLeftBranch + lossInRightBranch;
+ if (current_gain < best_gain) {
+ best_gain = current_gain;
+ best_value = splitValue;
+ best_state = current_state;
+ }
+ }
+ }
+ previousSplitValue = splitValue;
+ }
+ }
+
+ // if there is a valid gain, then marks the state as valid and fills the meta-info
+ if (!isfinite(best_gain)) {
+ bs->valid_state = 0;
+ }
+ else {
+ bs->valid_state = 1;
+ bs->state = best_state;
+ bs->feature_id = feature_id;
+ bs->gain = nn_(computeSplitGain)(&bs->state);
+ bs->feature_value = best_value;
+ }
+}
+
+// exactly like the previous version, but direct access to the data for efficiency. it also doesn't
+// rely on the lua state in the particular case of dense data, so we can evaluate this without using
+// the lua state
+static void nn_(gb_internal_get_best_split_special)(nn_(GBBestState) *bs,
+ THLongTensor *featureExampleIds, khash_t(long)* exampleMap, THTensor *input, long minLeafSize,
+ long feature_id) {
+ nn_(GBState) current_state;
+ nn_(GBState) best_state;
+ current_state = bs->state;
+
+ real best_gain = INFINITY;
+ real best_value = 0;
+
+ real *input_data = NULL;
+ long n_features = 0;
+ input_data = THTensor_(data)(input);
+ n_features = THTensor_(size)(input, 1);
+
+ long stride = featureExampleIds->stride[0];
+ long *featureExampleIds_data = THLongTensor_data(featureExampleIds);
+
+ khiter_t k;
+
+ real previousSplitValue = 0;
+ for (long i = THLongTensor_size(featureExampleIds, 0)-1; i >= 0; i--) {
+ long exampleId = featureExampleIds_data[i * stride];
+
+ k = kh_get(long, exampleMap, exampleId);
+ if (k != kh_end(exampleMap)) {
+ long exampleIdx = exampleId;
+
+ // THIS is the main part that changes. seems crazy to have a special case just for this, but
+ // since there are a **lot** of samples to be evaluated, the "if" in the previous case can
+ // become expensive
+ real splitValue;
+ splitValue = input_data[(exampleId-1) * n_features + feature_id-1];
+
+ real gradient = current_state.grad_data[exampleIdx-1];
+ real hessian = current_state.hessian_data[exampleIdx-1];
+ current_state.leftGradientSum -= gradient;
+ current_state.rightGradientSum += gradient;
+ current_state.leftHessianSum -= hessian;
+ current_state.rightHessianSum += hessian;
+ current_state.nExampleInLeftBranch--;
+ current_state.nExampleInRightBranch++;
+
+ // since we remove from the left, once this becomes true, it stays true forever
+ // hence we stop the loop
+ if (current_state.nExampleInLeftBranch < minLeafSize)
+ break;
+
+      // When minLeafSize > 1 this fails in the first passes, since nExampleInRightBranch
+      // starts at 0 and grows by one per update
+ if (current_state.nExampleInRightBranch >= minLeafSize) {
+ if (previousSplitValue != splitValue) {
+ real lossInLeftBranch = computeGradientBoostLoss(current_state.leftGradientSum, current_state.leftHessianSum);
+ real lossInRightBranch = computeGradientBoostLoss(current_state.rightGradientSum, current_state.rightHessianSum);
+ real current_gain = lossInLeftBranch + lossInRightBranch;
+ if (current_gain < best_gain) {
+ best_gain = current_gain;
+ best_value = splitValue;
+ best_state = current_state;
+ }
+ }
+ }
+ previousSplitValue = splitValue;
+ }
+ }
+
+ if (!isfinite(best_gain)) {
+ bs->valid_state = 0;
+ }
+ else {
+ bs->valid_state = 1;
+ bs->state = best_state;
+ bs->feature_id = feature_id;
+ bs->gain = nn_(computeSplitGain)(&bs->state);
+ bs->feature_value = best_value;
+ }
+}
+
+// core of the computation to find the split for a given feature and is divided in 4 steps
+static void nn_(gb_find_best_feature_split)(lua_State *L,
+ nn_(GBInitialization) *initialization_data, nn_(GBBestState) *bs, long feature_id,
+ GBRunData *run_data) {
+
+ // 1) loads the examples in the dataset ordered by their feature value
+ lua_pushvalue(L, initialization_data->getSortedFeature_index);
+ lua_pushvalue(L, initialization_data->dataset_index);
+ lua_pushinteger(L, feature_id);
+ lua_call(L, 2, 1);
+
+ THLongTensor *featureExampleIds = luaT_checkudata(L, -1, "torch.LongTensor");
+
+ // 2) processes the data to find the intersection between the examples in the dataset and the
+ // examples the current node has to evaluate
+ THLongTensor *exampleIdsWithFeature_ret = gb_internal_prepare(L, initialization_data->exampleIds,
+ run_data->exampleIdsWithFeature_cache, initialization_data->input_index, feature_id,
+ run_data->exampleMap);
+ if (!exampleIdsWithFeature_ret) {
+ bs->valid_state = 0;
+ return;
+ }
+
+ // 3) creates a new state to be used by the optimizer
+ nn_(gb_internal_create)(initialization_data->grad, initialization_data->hess,
+ exampleIdsWithFeature_ret, &bs->state);
+
+ // 4) optimize away!
+ nn_(gb_internal_get_best_split)(L, bs, featureExampleIds, run_data->exampleMap,
+ initialization_data->input_index, run_data->minLeafSize, feature_id);
+}
diff --git a/contrib/lua-torch/decisiontree/generic/GBDT_internal.h b/contrib/lua-torch/decisiontree/generic/GBDT_internal.h
new file mode 100644
index 000000000..7119365cf
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/generic/GBDT_internal.h
@@ -0,0 +1,34 @@
+// representation of a state used while searching for the best split
+typedef struct {
+ real leftGradientSum, rightGradientSum;
+ real leftHessianSum, rightHessianSum;
+ real lossInParent;
+ long nExampleInLeftBranch, nExampleInRightBranch;
+ real *grad_data, *hessian_data;
+} nn_(GBState);
+
+// representation for the best state found for a given feature
+typedef struct {
+ nn_(GBState) state;
+ real gain;
+ long feature_id;
+ real feature_value;
+ int valid_state;
+} nn_(GBBestState);
+
+// full data that must be initialized before calling the optimizer
+typedef struct {
+ // *_index represent positions on the lua stack
+ int dataset_index;
+ int splitInfo_index;
+ int input_index;
+ // position of the dataset's function to return the samples ordered for a given feature
+ int getSortedFeature_index;
+
+ // samples that this node has to evaluate
+ THLongTensor *exampleIds;
+
+ // cached gradient and hessian for all data
+ THTensor *grad;
+ THTensor *hess;
+} nn_(GBInitialization);
diff --git a/contrib/lua-torch/decisiontree/generic/LogitBoostCriterion.c b/contrib/lua-torch/decisiontree/generic/LogitBoostCriterion.c
new file mode 100644
index 000000000..f2ea1ef09
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/generic/LogitBoostCriterion.c
@@ -0,0 +1,90 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LogitBoostCriterion.c"
+#else
+
+#define EPS 1e-12
+
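+// logistic loss on a raw score x with target y, where y > 0 marks the positive class:
+//   L(x, y) = log(1 + exp(-x)) if y > 0, and log(1 + exp(x)) otherwise
+// writing p = sigmoid(x), the functions below compute
+//   dL/dx = p - 1 (y > 0) or p (y <= 0), and d2L/dx2 = p * (1 - p)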
+static int nn_(LogitBoostCriterion_updateOutput)(lua_State *L)
+{
+ THTensor *input = luaT_checkudata(L, 1, torch_Tensor);
+ THTensor *target = luaT_checkudata(L, 2, torch_Tensor);
+ THTensor *output = luaT_checkudata(L, 3, torch_Tensor);
+ int sizeAverage = lua_toboolean(L, 4);
+
+ if (THTensor_(nElement)(input) != THTensor_(nElement)(target)) {
+ luaL_error(L, "inconsistent input and target size");
+ }
+ THTensor_(resize1d)(output, 1);
+
+ real sum = 0;
+
+ TH_TENSOR_APPLY2(real, input, real, target,
+ real x = *input_data;
+ real y = *target_data;
+ // math.log(1 + math.exp(target[i] <= 0 and input[i] or -input[i]))
+ sum += log(1 + exp(y <= 0 ? x : -x));
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+ return 0;
+}
+
+static int nn_(LogitBoostCriterion_updateGradInput)(lua_State *L)
+{
+ THTensor *input = luaT_checkudata(L, 1, torch_Tensor);
+ THTensor *target = luaT_checkudata(L, 2, torch_Tensor);
+ THTensor *gradInput = luaT_checkudata(L, 3, torch_Tensor);
+
+ if (THTensor_(nElement)(input) != THTensor_(nElement)(target)) {
+ luaL_error(L, "inconsistent input and target size");
+ }
+ THTensor_(resizeAs)(gradInput, input);
+
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ real x = *input_data;
+ real y = *target_data;
+ real p = (x >= 0) ? (1 / (1 + exp(-x))) : (1 - 1 / (1 + exp(x)));
+ *gradInput_data = (y <= 0) ? p : (p - 1);
+ );
+
+ return 0;
+}
+
+static int nn_(LogitBoostCriterion_updateHessInput)(lua_State *L)
+{
+ THTensor *input = luaT_checkudata(L, 1, torch_Tensor);
+ THTensor *target = luaT_checkudata(L, 2, torch_Tensor);
+ THTensor *hessInput = luaT_checkudata(L, 3, torch_Tensor);
+
+ if (THTensor_(nElement)(input) != THTensor_(nElement)(target)) {
+ luaL_error(L, "inconsistent input and target size");
+ }
+ THTensor_(resizeAs)(hessInput, input);
+
+ TH_TENSOR_APPLY3(real, hessInput, real, input, real, target,
+ real x = *input_data;
+ real p = (x >= 0) ? (1 / (1 + exp(-x))) : (1 - 1 / (1 + exp(x)));
+ *hessInput_data = p * (1.0 - p);
+ );
+
+ return 0;
+}
+
+static const struct luaL_Reg nn_(LogitBoostCriterion__) [] = {
+ {"LogitBoostCriterion_updateOutput", nn_(LogitBoostCriterion_updateOutput)},
+ {"LogitBoostCriterion_updateGradInput", nn_(LogitBoostCriterion_updateGradInput)},
+ {"LogitBoostCriterion_updateHessInput", nn_(LogitBoostCriterion_updateHessInput)},
+ {NULL, NULL}
+};
+
+static void nn_(LogitBoostCriterion_init)(lua_State *L)
+{
+ luaT_pushmetatable(L, torch_Tensor);
+ luaT_registeratname(L, nn_(LogitBoostCriterion__), "nn");
+ lua_pop(L,1);
+}
+
+#endif
diff --git a/contrib/lua-torch/decisiontree/generic/S2D.c b/contrib/lua-torch/decisiontree/generic/S2D.c
new file mode 100644
index 000000000..2392ee7c8
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/generic/S2D.c
@@ -0,0 +1,90 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/S2D.c"
+#else
+
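+// densifies a batch of sparse samples: for every (key, value) pair of sample i whose mask byte
+// is set, writes output[i][key-1] = value; pairs masked out (features absent from the feature
+// mapping) are dropped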
+static int nn_(S2D_computeOutput)(lua_State *L) {
+ THTensor *output = luaT_checkudata(L, 1, torch_Tensor);
+ const int keys_index = 2;
+ const int values_index = 3;
+ const int masks_index = 4;
+
+  if (!lua_istable(L, keys_index))
+    return LUA_HANDLE_ERROR_STR(L, "expected position 2 to be a table");
+  if (!lua_istable(L, values_index))
+    return LUA_HANDLE_ERROR_STR(L, "expected position 3 to be a table");
+  if (!lua_istable(L, masks_index))
+    return LUA_HANDLE_ERROR_STR(L, "expected position 4 to be a table");
+
+
+ THLongTensor *features = luaT_checkudata(L, 5, "torch.LongTensor");
+
+ const int original_top = lua_gettop(L);
+
+ long outputsize = THLongTensor_size(features, 0);
+ long batch_size = lua_objlen(L, keys_index);
+
+ // initializes output
+ THTensor_(resize2d)(output, batch_size, outputsize);
+ THTensor_(zero)(output);
+ real *output_data = THTensor_(data)(output);
+
+ // iterates over samples
+ lua_pushnil(L);
+ const int local_top = lua_gettop(L);
+ while (lua_next(L, keys_index) != 0) {
+ // gets data corresponding to the current sample
+ long i = lua_tointeger(L, -2)-1;
+ real *current_output_data = &output_data[i * outputsize];
+ THLongTensor *keys = luaT_checkudata(L, -1, "torch.LongTensor");
+ lua_rawgeti(L, values_index, i+1);
+ THTensor *values = luaT_checkudata(L, -1, torch_Tensor);
+ lua_rawgeti(L, masks_index, i+1);
+ THByteTensor *mask = luaT_checkudata(L, -1, "torch.ByteTensor");
+
+ long n_keys = THLongTensor_size(keys, 0);
+ long n_values = THTensor_(size)(values, 0);
+
+ // quick safety check
+ if (n_keys != n_values)
+ return LUA_HANDLE_ERROR_STR(L, "keys and values have to have the same size");
+
+ // gets the direct memory pointers
+ long *keys_data = THLongTensor_data(keys);
+ real *values_data = THTensor_(data)(values);
+ unsigned char *mask_data = THByteTensor_data(mask);
+
+ // for each value in the sparse input...
+ for (long j = 0; j < n_keys; j++) {
+ // loads the value and key
+ real current_value = values_data[j];
+ long current_key = keys_data[j];
+ unsigned char current_mask = mask_data[j];
+
+ // if the feature is present in the map
+ if (current_mask)
+ // saves in the given position
+ current_output_data[current_key-1] = current_value;
+ }
+ // cleans up the trash we create by iterating over keys to avoid it from overflowing
+ lua_pop(L, lua_gettop(L) - local_top);
+ }
+
+ // cleans up the trash we added to the stack
+ lua_pop(L, lua_gettop(L) - original_top);
+
+ return 0;
+}
+
+static const struct luaL_Reg nn_(S2D__) [] = {
+ {"S2D_computeOutput", nn_(S2D_computeOutput)},
+ {NULL, NULL}
+};
+
+static void nn_(S2D_init)(lua_State *L)
+{
+ luaT_pushmetatable(L, torch_Tensor);
+ luaT_registeratname(L, nn_(S2D__), "nn");
+ lua_pop(L,1);
+}
+
+#endif
diff --git a/contrib/lua-torch/decisiontree/hash_map.c b/contrib/lua-torch/decisiontree/hash_map.c
new file mode 100644
index 000000000..2c679e207
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/hash_map.c
@@ -0,0 +1,445 @@
+#include "utils.h"
+#include "hash_map.h"
+#include "internal_hash_map.h"
+#include <pthread.h>
+
+hash_map_t hash_map_init(void) {
+ return kh_init(long);
+}
+
+void hash_map_destroy(hash_map_t h_) {
+ internal_hash_map_t h = (internal_hash_map_t) h_;
+ kh_destroy(long, h);
+}
+
+void hash_map_clear(hash_map_t h_) {
+ internal_hash_map_t h = (internal_hash_map_t) h_;
+ kh_clear(long, h);
+}
+
+int hash_map_put(hash_map_t h_, long key, long val) {
+ internal_hash_map_t h = (internal_hash_map_t) h_;
+ int ret;
+ khiter_t k = kh_put(long, h, key, &ret);
+ ret = (ret >= 0);
+ if (ret)
+ kh_value(h, k) = val;
+ return ret;
+}
+
+int hash_map_put_tensor(hash_map_t h_, THLongTensor *keys_, THLongTensor *vals_) {
+ long *keys = THLongTensor_data(keys_);
+ long *vals = THLongTensor_data(vals_);
+ long size = get_tensor_size(keys_, Long);
+ for (long i = 0; i < size; i++)
+ if (!hash_map_put(h_, keys[i], vals[i]))
+ return 0;
+ return 1;
+}
+
+int hash_map_fill(hash_map_t h_, long key, long *counter) {
+ internal_hash_map_t h = (internal_hash_map_t) h_;
+ khiter_t k = kh_get(long, h, key);
+ if (k == kh_end(h))
+ return hash_map_put(h_, key, ++(*counter));
+ return 1;
+}
+
+int hash_map_fill_tensor(hash_map_t h_, THLongTensor *keys_, long *counter) {
+ long *keys = THLongTensor_data(keys_);
+ long size = get_tensor_size(keys_, Long);
+ for (long i = 0; i < size; i++)
+ if (!hash_map_fill(h_, keys[i], counter))
+ return 0;
+ return 1;
+}
+
+int hash_map_get(hash_map_t h_, long key, long* val) {
+ internal_hash_map_t h = (internal_hash_map_t) h_;
+ khiter_t k = kh_get(long, h, key);
+ if (k == kh_end(h))
+ return 0;
+ *val = kh_value(h, k);
+ return 1;
+}
+
+void hash_map_get_tensor(hash_map_t h_, THLongTensor *keys_, THLongTensor *vals_, THByteTensor *mask_) {
+ long *keys = THLongTensor_data(keys_);
+  long *vals = THLongTensor_data(vals_);
+ unsigned char *mask = THByteTensor_data(mask_);
+ long size = get_tensor_size(keys_, Long);
+ for (long i = 0; i < size; i++)
+ mask[i] = hash_map_get(h_, keys[i], &vals[i]);
+}
+
+void hash_map_del(hash_map_t h_, long key) {
+ internal_hash_map_t h = (internal_hash_map_t) h_;
+ khiter_t k = kh_get(long, h, key);
+ if (k != kh_end(h))
+ kh_del(long, h, k);
+}
+
+void hash_map_del_tensor(hash_map_t h_, THLongTensor *keys_) {
+ long *keys = THLongTensor_data(keys_);
+ long size = get_tensor_size(keys_, Long);
+ for (long i = 0; i < size; i++)
+ hash_map_del(h_, keys[i]);
+}
+
+size_t hash_map_size(hash_map_t h_) {
+ internal_hash_map_t h = (internal_hash_map_t) h_;
+ return kh_size(h);
+}
+
+void hash_map_to_tensor(hash_map_t h_, THLongTensor *keys_, THLongTensor *vals_) {
+ internal_hash_map_t h = (internal_hash_map_t) h_;
+ long *keys = THLongTensor_data(keys_);
+ long *vals = THLongTensor_data(vals_);
+ long key, val, i = 0;
+ kh_foreach(h, key, val, {
+ keys[i] = key;
+ vals[i] = val;
+ i++;
+ });
+}
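+
+// typical C-side usage of the map (sketch):
+//   hash_map_t h = hash_map_init();
+//   hash_map_put(h, key, val);
+//   long out;
+//   if (hash_map_get(h, key, &out)) { /* hit */ }
+//   hash_map_destroy(h);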
+
+static void autolock(hash_map_lua_t *h) {
+ if (h->autolock) {
+ pthread_mutex_lock(&h->mutex);
+ }
+}
+
+static void autounlock(hash_map_lua_t *h) {
+ if (h->autolock) {
+ pthread_mutex_unlock(&h->mutex);
+ }
+}
+
+int hash_map_autolock_on_lua(lua_State *L) {
+ hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+ h->autolock = 1;
+ return 0;
+}
+
+int hash_map_autolock_off_lua(lua_State *L) {
+ hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+ h->autolock = 0;
+ return 0;
+}
+
+int hash_map_init_lua(lua_State *L) {
+ hash_map_lua_t **hp = (hash_map_lua_t**)lua_newuserdata(L, sizeof(hash_map_lua_t*));
+ *hp = (hash_map_lua_t*)malloc(sizeof(hash_map_lua_t));
+ hash_map_lua_t *h = *hp;
+ h->refcount = 1;
+ h->counter = 0;
+ h->autolock = 0;
+ h->h = hash_map_init();
+
+ pthread_mutexattr_t mutex_attr;
+ pthread_mutexattr_init(&mutex_attr);
+ pthread_mutexattr_settype(&mutex_attr, PTHREAD_MUTEX_RECURSIVE);
+ pthread_mutex_init(&h->mutex, &mutex_attr);
+
+ luaL_getmetatable(L, "dt.HashMap");
+ lua_setmetatable(L, -2);
+ return 1;
+}
+
+int hash_map_gc_lua(lua_State *L) {
+ hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+ if (THAtomicDecrementRef(&h->refcount)) {
+ pthread_mutex_destroy(&h->mutex);
+ hash_map_destroy(h->h);
+ free(h);
+ }
+ return 0;
+}
+
+int hash_map_retain_lua(lua_State *L) {
+ hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+ THAtomicIncrementRef(&h->refcount);
+ return 0;
+}
+
+int hash_map_metatablename_lua(lua_State *L) {
+ lua_pushstring(L, "dt.HashMap");
+ return 1;
+}
+
+int hash_map_clear_lua(lua_State *L) {
+ hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+ autolock(h);
+ hash_map_clear(h->h);
+ autounlock(h);
+ return 0;
+}
+
+int hash_map_put_lua(lua_State *L) {
+ hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+ int ret;
+#if LUA_VERSION_NUM <= 501
+#define lua_isinteger lua_isnumber
+#endif
+ if (lua_isinteger(L, 2)) {
+ if (!lua_isinteger(L, 3))
+ return LUA_HANDLE_ERROR_STR(L, "second parameter is not a number");
+ long key = lua_tointeger(L, 2);
+ long val = lua_tointeger(L, 3);
+ autolock(h);
+ ret = hash_map_put(h->h, key, val);
+ autounlock(h);
+ }
+ else {
+ THLongTensor *keys = (THLongTensor *)luaT_checkudata(L, 2, "torch.LongTensor");
+ THLongTensor *vals = (THLongTensor *)luaT_checkudata(L, 3, "torch.LongTensor");
+ check_tensor(L, keys, THLongTensor);
+ check_tensor(L, vals, THLongTensor);
+ check_tensors(L, keys, vals);
+ autolock(h);
+ ret = hash_map_put_tensor(h->h, keys, vals);
+ autounlock(h);
+ }
+ if (!ret)
+ return LUA_HANDLE_ERROR_STR(L, "failed to put into hash map");
+ return 0;
+}
+
+int hash_map_fill_lua(lua_State *L) {
+ hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+ int ret;
+ if (lua_isinteger(L, 2)) {
+ long key = lua_tointeger(L, 2);
+ autolock(h);
+ ret = hash_map_fill(h->h, key, &h->counter);
+ autounlock(h);
+ }
+ else {
+ THLongTensor *keys = (THLongTensor *)luaT_checkudata(L, 2, "torch.LongTensor");
+ check_tensor(L, keys, THLongTensor);
+ autolock(h);
+ ret = hash_map_fill_tensor(h->h, keys, &h->counter);
+ autounlock(h);
+ }
+ if (!ret)
+ return LUA_HANDLE_ERROR_STR(L, "failed to fill into hash map");
+ return 0;
+}
+
+int hash_map_adjust_counter_lua(lua_State *L) {
+ hash_map_lua_t *h_ = *(hash_map_lua_t**)lua_touserdata(L, 1);
+ internal_hash_map_t h = (internal_hash_map_t) h_->h;
+
+ long val;
+ kh_foreach_value(h, val, {
+ if (val >= h_->counter)
+ h_->counter = val;
+ });
+ return 0;
+}
+
+int hash_map_set_counter_lua(lua_State *L) {
+ hash_map_lua_t *h_ = *(hash_map_lua_t**)lua_touserdata(L, 1);
+ h_->counter = lua_tointeger(L, 2);
+ return 0;
+}
+
+int hash_map_get_counter_lua(lua_State *L) {
+ hash_map_lua_t *h_ = *(hash_map_lua_t**)lua_touserdata(L, 1);
+ lua_pushinteger(L, h_->counter);
+ return 1;
+}
+
+static int hash_map_get_tensor_lua(lua_State *L, hash_map_lua_t *h, int inplace) {
+ THLongTensor *keys = (THLongTensor *)luaT_checkudata(L, 2, "torch.LongTensor");
+ check_tensor(L, keys, THLongTensor);
+ THLongTensor *vals = inplace ? keys : NULL;
+ THByteTensor *mask = NULL;
+
+ int maskIdx = inplace ? 3 : 4;
+
+ if (!inplace) {
+ if (lua_gettop(L) < 3) {
+ vals = THLongTensor_new();
+ } else {
+ vals = (THLongTensor *)luaT_checkudata(L, 3, "torch.LongTensor");
+ check_tensor(L, vals, THLongTensor);
+ }
+ }
+
+ if (lua_gettop(L) < maskIdx) {
+ mask = THByteTensor_new();
+ } else {
+ mask = (THByteTensor *)luaT_checkudata(L, maskIdx, "torch.ByteTensor");
+ check_tensor(L, mask, THByteTensor);
+ }
+
+ int n_dim = THLongTensor_nDimension(keys);
+ THLongStorage *st = THLongStorage_newWithSize1(n_dim);
+ for (int i = 0; i < n_dim; i++) {
+ THLongStorage_set(st, i, THLongTensor_size(keys, i));
+ }
+ THByteTensor_resize(mask, st, NULL);
+ if (!inplace) THLongTensor_resize(vals, st, NULL);
+ THLongStorage_free(st);
+
+ autolock(h);
+ hash_map_get_tensor(h->h, keys, vals, mask);
+ autounlock(h);
+
+ if (!inplace && lua_gettop(L) < 3)
+ luaT_pushudata(L, vals, "torch.LongTensor");
+ if (lua_gettop(L) < maskIdx)
+ luaT_pushudata(L, mask, "torch.ByteTensor");
+
+ return 2;
+}
+
+static int hash_map_get_table_lua(lua_State *L, hash_map_lua_t *h, int inplace) {
+ const int kidx = 2;
+ const int vidx = inplace ? 2 : 3;
+ const int midx = inplace ? 3 : 4;
+ const int narg = lua_gettop(L);
+
+ if (inplace) {
+ if (narg < 3) {
+ LUA_HANDLE_ERROR_STR(L, "HashMap.getInplace requires two arguments.");
+ }
+ } else {
+ if (narg < 4) {
+ LUA_HANDLE_ERROR_STR(L, "HashMap.get requires three arguments.");
+ }
+ }
+
+ int count = push_table_contents(L, kidx);
+ verify_push_table_contents(L, vidx, count);
+ verify_push_table_contents(L, midx, count);
+
+ THLongTensor *keys;
+ THLongTensor *vals;
+ THByteTensor *mask;
+ for (int i = count - 1; i >= 0; i--) {
+ int maskIdx = i - count;
+ int valIdx = maskIdx - count;
+ int keyIdx = inplace ? valIdx : (valIdx - count);
+
+ keys = (THLongTensor *)luaT_checkudata(L, keyIdx, "torch.LongTensor");
+ check_tensor(L, keys, THLongTensor);
+ if (inplace) {
+ vals = keys;
+ } else {
+ vals = (THLongTensor *)luaT_checkudata(L, valIdx, "torch.LongTensor");
+ }
+ mask = (THByteTensor *)luaT_checkudata(L, maskIdx, "torch.ByteTensor");
+
+ int n_dim = THLongTensor_nDimension(keys);
+ THLongStorage *st = THLongStorage_newWithSize1(n_dim);
+ for (int i = 0; i < n_dim; i++) {
+ THLongStorage_set(st, i, THLongTensor_size(keys, i));
+ }
+ THByteTensor_resize(mask, st, NULL);
+ THLongTensor_resize(vals, st, NULL);
+ THLongStorage_free(st);
+
+ autolock(h);
+ hash_map_get_tensor(h->h, keys, vals, mask);
+ autounlock(h);
+ }
+ lua_pop(L, (narg - 1) * count);
+ return 2;
+}
+
+int hash_map_get_lua(lua_State *L) {
+ hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+ if (lua_isinteger(L, 2)) {
+ long key = lua_tointeger(L, 2);
+ long val;
+ autolock(h);
+ int ret = hash_map_get(h->h, key, &val);
+ autounlock(h);
+ if (ret) {
+ lua_pushinteger(L, val);
+ lua_pushinteger(L, 1);
+ }
+ else {
+ lua_pushinteger(L, 0);
+ lua_pushinteger(L, 0);
+ }
+ } else if (lua_istable(L, 2)) {
+ return hash_map_get_table_lua(L, h, 0);
+ } else {
+ return hash_map_get_tensor_lua(L, h, 0);
+ }
+ return 2;
+}
+
+int hash_map_get_inplace_lua(lua_State *L) {
+ hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+ if (lua_isinteger(L, 2)) {
+ LUA_HANDLE_ERROR_STR(L, "HashMap.getInplace does not support integer arguments.");
+ } else if (lua_istable(L, 2)) {
+ return hash_map_get_table_lua(L, h, 1);
+ } else {
+ return hash_map_get_tensor_lua(L, h, 1);
+ }
+ return 2;
+}
+
+int hash_map_del_lua(lua_State *L) {
+ hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+ if (lua_isinteger(L, 2)) {
+ long key = lua_tointeger(L, 2);
+ autolock(h);
+ hash_map_del(h->h, key);
+ autounlock(h);
+ }
+ else {
+ THLongTensor *keys = (THLongTensor *)luaT_checkudata(L, 2, "torch.LongTensor");
+ autolock(h);
+ hash_map_del_tensor(h->h, keys);
+ autounlock(h);
+ }
+ return 0;
+}
+
+int hash_map_size_lua(lua_State *L) {
+ hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+ long size = hash_map_size(h->h);
+ lua_pushinteger(L, size);
+ return 1;
+}
+
+int hash_map_to_tensor_lua(lua_State *L) {
+ hash_map_lua_t *h = *(hash_map_lua_t**)lua_touserdata(L, 1);
+ THLongTensor *keys, *vals;
+
+ if (lua_gettop(L) < 2) {
+ keys = THLongTensor_new();
+ }
+ else {
+ keys = (THLongTensor *)luaT_checkudata(L, 2, "torch.LongTensor");
+ check_tensor(L, keys, THLongTensor);
+ }
+
+ if (lua_gettop(L) < 3) {
+ vals = THLongTensor_new();
+ }
+ else {
+ vals = (THLongTensor *)luaT_checkudata(L, 3, "torch.LongTensor");
+ check_tensor(L, vals, THLongTensor);
+ }
+
+ size_t size = hash_map_size(h->h);
+ THLongTensor_resize1d(keys, size);
+ THLongTensor_resize1d(vals, size);
+
+ autolock(h);
+ hash_map_to_tensor(h->h, keys, vals);
+ autounlock(h);
+
+ if (lua_gettop(L) < 2)
+ luaT_pushudata(L, keys, "torch.LongTensor");
+ if (lua_gettop(L) < 3)
+ luaT_pushudata(L, vals, "torch.LongTensor");
+ return 2;
+}
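+
+/* Illustrative usage note (not part of the original source): from Lua, the
+ * optional tensor arguments of toTensors/get may be omitted, in which case
+ * fresh tensors are allocated and returned, e.g.
+ *   local keys, vals = h:toTensors()  -- allocates and returns both LongTensors
+ *   h:toTensors(keys, vals)           -- reuses caller-supplied tensors
+ */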
diff --git a/contrib/lua-torch/decisiontree/hash_map.h b/contrib/lua-torch/decisiontree/hash_map.h
new file mode 100644
index 000000000..5b215e4ca
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/hash_map.h
@@ -0,0 +1,36 @@
+#include "luaT.h"
+#include "TH.h"
+
+typedef void* hash_map_t;
+
+hash_map_t hash_map_init(void);
+void hash_map_destroy(hash_map_t);
+void hash_map_clear(hash_map_t);
+int hash_map_put(hash_map_t, long key, long val);
+int hash_map_put_tensor(hash_map_t, THLongTensor *keys_, THLongTensor *vals_);
+int hash_map_fill(hash_map_t, long key, long *counter);
+int hash_map_fill_tensor(hash_map_t, THLongTensor *keys_, long *counter);
+int hash_map_get(hash_map_t, long key, long *val);
+void hash_map_get_tensor(hash_map_t, THLongTensor *keys_, THLongTensor *vals_, THByteTensor *mask_);
+void hash_map_del(hash_map_t, long key);
+void hash_map_del_tensor(hash_map_t, THLongTensor *keys_);
+size_t hash_map_size(hash_map_t);
+void hash_map_to_tensor(hash_map_t, THLongTensor *keys_, THLongTensor *vals_);
+
+int hash_map_autolock_on_lua(lua_State *L);
+int hash_map_autolock_off_lua(lua_State *L);
+int hash_map_init_lua(lua_State *L);
+int hash_map_gc_lua(lua_State *L);
+int hash_map_retain_lua(lua_State *L);
+int hash_map_metatablename_lua(lua_State *L);
+int hash_map_clear_lua(lua_State *L);
+int hash_map_put_lua(lua_State *L);
+int hash_map_fill_lua(lua_State *L);
+int hash_map_adjust_counter_lua(lua_State *L);
+int hash_map_set_counter_lua(lua_State *L);
+int hash_map_get_counter_lua(lua_State *L);
+int hash_map_get_lua(lua_State *L);
+int hash_map_get_inplace_lua(lua_State *L);
+int hash_map_del_lua(lua_State *L);
+int hash_map_size_lua(lua_State *L);
+int hash_map_to_tensor_lua(lua_State *L);
diff --git a/contrib/lua-torch/decisiontree/init.c b/contrib/lua-torch/decisiontree/init.c
new file mode 100644
index 000000000..276241e8e
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/init.c
@@ -0,0 +1,77 @@
+#include "TH.h"
+#include "luaT.h"
+
+#ifdef _OPENMP
+#include "omp.h"
+#endif
+
+#include "error.h"
+#include "hash_map.h"
+
+#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
+#define torch_Tensor TH_CONCAT_STRING_3(torch., Real, Tensor)
+#define nn_(NAME) TH_CONCAT_3(nn_, Real, NAME)
+
+#include "generic/LogitBoostCriterion.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/DFD.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/S2D.c"
+#include "THGenerateFloatTypes.h"
+
+#include "generic/CartTree.c"
+#include "THGenerateFloatTypes.h"
+
+#include "GBDT_common.h"
+#include "generic/GBDT.c"
+#include "THGenerateFloatTypes.h"
+
+static const struct luaL_Reg decisiontree_hash_map_routines[] = {
+ {"__gc", hash_map_gc_lua},
+ {"retain", hash_map_retain_lua},
+ {"metatablename", hash_map_metatablename_lua},
+ {"clear", hash_map_clear_lua},
+ {"put", hash_map_put_lua},
+ {"fill", hash_map_fill_lua},
+ {"adjustCounter", hash_map_adjust_counter_lua},
+ {"getCounter", hash_map_get_counter_lua},
+ {"setCounter", hash_map_set_counter_lua},
+ {"get", hash_map_get_lua},
+ {"getInplace", hash_map_get_inplace_lua},
+ {"del", hash_map_del_lua},
+ {"size", hash_map_size_lua},
+ {"safe", hash_map_autolock_on_lua},
+ {"unsafe", hash_map_autolock_off_lua},
+ {"toTensors", hash_map_to_tensor_lua},
+ {"new", hash_map_init_lua},
+ {NULL, NULL}
+};
+
+DLL_EXPORT int luaopen_libdecisiontree(lua_State *L)
+{
+ // HashMap
+ luaL_newmetatable(L, "dt.HashMap");
+ lua_pushstring(L, "__index");
+ lua_pushvalue(L, -2);
+ lua_settable(L, -3);
+ luaT_setfuncs(L, decisiontree_hash_map_routines, 0);
+
+ nn_FloatLogitBoostCriterion_init(L);
+ nn_DoubleLogitBoostCriterion_init(L);
+
+ nn_FloatDFD_init(L);
+ nn_DoubleDFD_init(L);
+
+ nn_FloatS2D_init(L);
+ nn_DoubleS2D_init(L);
+
+ nn_FloatCT_init(L);
+ nn_DoubleCT_init(L);
+
+ nn_FloatGBDT_init(L);
+ nn_DoubleGBDT_init(L);
+
+ return 1;
+}
diff --git a/contrib/lua-torch/decisiontree/init.lua b/contrib/lua-torch/decisiontree/init.lua
new file mode 100644
index 000000000..26f790b60
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/init.lua
@@ -0,0 +1,70 @@
+require 'paths'
+--require 'xlua'
+require 'string'
+require 'os'
+--require 'sys'
+require 'nn'
+
+-- these actually return local variables but we will re-require them
+-- when needed. This is just to make sure they are loaded.
+require 'moses'
+
+unpack = unpack or table.unpack
+
+local dt = require 'decisiontree._env'
+
+-- c lib:
+require "paths"
+paths.require 'libdecisiontree'
+
+dt.HashMap = torch.getmetatable("dt.HashMap").new
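+
+-- Illustrative only (usage inferred from hash_map.c and init.c in this
+-- commit; not part of the original file):
+--   local h = dt.HashMap()
+--   h:put(torch.LongTensor{1,2}, torch.LongTensor{10,20})
+--   local vals, mask = h:get(torch.LongTensor{1,3}) -- mask marks found keys
+--   assert(h:size() == 2)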
+
+dt.EPSILON = 1e-6
+
+-- experimental Tensor-like container
+require 'decisiontree.SparseTensor'
+
+-- functions
+require 'decisiontree.math'
+require 'decisiontree.utils'
+
+-- for multi-threading
+--require 'decisiontree.WorkPool'
+
+-- abstract classes
+require 'decisiontree.DecisionTree'
+require 'decisiontree.DecisionForest'
+require 'decisiontree.DecisionForestTrainer'
+require 'decisiontree.TreeState'
+
+-- for CartTree inference
+require 'decisiontree.CartNode'
+require 'decisiontree.CartTree'
+
+-- Criterions (extended with updateHessInput and backward2)
+require 'decisiontree.MSECriterion'
+require 'decisiontree.LogitBoostCriterion'
+
+-- Used by both RandomForestTrainer and GradientBoostTrainer
+require 'decisiontree.CartTrainer'
+
+-- Used by CartTrainer
+require 'decisiontree.DataSet'
+
+-- Random Forest Training
+require 'decisiontree.RandomForestTrainer'
+require 'decisiontree.GiniState' -- TreeState subclass
+
+-- Gradient Boosted Decision Tree Training
+require 'decisiontree.GradientBoostTrainer'
+require 'decisiontree.GradientBoostState' -- TreeState subclass
+
+-- unit tests and benchmarks
+--require 'decisiontree.test'
+--require 'decisiontree.benchmark'
+
+-- nn.Module
+require 'decisiontree.DFD'
+require 'decisiontree.Sparse2Dense'
+
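+-- Illustrative end-to-end sketch (not part of this file; names follow the
+-- API as exercised in test.lua): train a random forest on a dt.DataSet
+-- called trainSet, then score one example.
+--   local trainer = dt.RandomForestTrainer{nTree=4, activeRatio=0.5,
+--      featureBaggingSize=5, maxLeafNodes=50, minLeafSize=10}
+--   local forest = trainer:train(trainSet, trainSet.featureIds)
+--   local score = forest:score(trainSet.input[1])
+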
+return dt
diff --git a/contrib/lua-torch/decisiontree/internal_hash_map.h b/contrib/lua-torch/decisiontree/internal_hash_map.h
new file mode 100644
index 000000000..bc8c523ef
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/internal_hash_map.h
@@ -0,0 +1,13 @@
+#include "khash.h"
+#include <pthread.h>
+
+KHASH_MAP_INIT_INT64(long, long)
+typedef khash_t(long)* internal_hash_map_t;
+
+typedef struct {
+ hash_map_t h;
+ int refcount;
+ pthread_mutex_t mutex;
+ int autolock;
+ long counter;
+} hash_map_lua_t;
diff --git a/contrib/lua-torch/decisiontree/khash.h b/contrib/lua-torch/decisiontree/khash.h
new file mode 100644
index 000000000..20e994063
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/khash.h
@@ -0,0 +1,627 @@
+/* The MIT License
+
+ Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/*
+ An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+ int ret, is_missing;
+ khiter_t k;
+ khash_t(32) *h = kh_init(32);
+ k = kh_put(32, h, 5, &ret);
+ kh_value(h, k) = 10;
+ k = kh_get(32, h, 10);
+ is_missing = (k == kh_end(h));
+ k = kh_get(32, h, 5);
+ kh_del(32, h, k);
+ for (k = kh_begin(h); k != kh_end(h); ++k)
+ if (kh_exist(h, k)) kh_value(h, k) = 1;
+ kh_destroy(32, h);
+ return 0;
+}
+*/
+
+/*
+ 2013-05-02 (0.2.8):
+
+ * Use quadratic probing. When the capacity is a power of 2, the stepping
+ function i*(i+1)/2 is guaranteed to traverse each bucket. It has better
+ cache performance than double hashing and is more robust than linear probing.
+
+ In theory, double hashing should be more robust than quadratic probing.
+ However, my implementation is probably not for large hash tables, because
+ the second hash function is closely tied to the first hash function,
+ which reduces the effectiveness of double hashing.
+
+ Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
+
+ 2011-12-29 (0.2.7):
+
+ * Minor code clean up; no actual effect.
+
+ 2011-09-16 (0.2.6):
+
+ * The capacity is a power of 2. This seems to dramatically improve the
+ speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
+
+ - http://code.google.com/p/ulib/
+ - http://nothings.org/computer/judy/
+
+ * Optionally allow linear probing, which usually has better
+ performance for random input. Double hashing is still the default as it
+ is more robust to certain non-random input.
+
+ * Added Wang's integer hash function (not used by default). This hash
+ function is more robust to certain non-random input.
+
+ 2011-02-14 (0.2.5):
+
+ * Allow declaring global functions.
+
+ 2009-09-26 (0.2.4):
+
+ * Improve portability
+
+ 2008-09-19 (0.2.3):
+
+ * Corrected the example
+ * Improved interfaces
+
+ 2008-09-11 (0.2.2):
+
+ * Improved speed a little in kh_put()
+
+ 2008-09-10 (0.2.1):
+
+ * Added kh_clear()
+ * Fixed a compiling error
+
+ 2008-09-02 (0.2.0):
+
+ * Changed to token concatenation which increases flexibility.
+
+ 2008-08-31 (0.1.2):
+
+ * Fixed a bug in kh_get(), which has not been tested previously.
+
+ 2008-08-31 (0.1.1):
+
+ * Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+ @header
+
+ Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.8"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+/* compiler specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifndef kh_inline
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+#endif /* kh_inline */
+
+#ifndef klib_unused
+#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
+#define klib_unused __attribute__ ((__unused__))
+#else
+#define klib_unused
+#endif
+#endif /* klib_unused */
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kcalloc
+#define kcalloc(N,Z) calloc(N,Z)
+#endif
+#ifndef kmalloc
+#define kmalloc(Z) malloc(Z)
+#endif
+#ifndef krealloc
+#define krealloc(P,Z) realloc(P,Z)
+#endif
+#ifndef kfree
+#define kfree(P) free(P)
+#endif
+
+static const double __ac_HASH_UPPER = 0.77;
+
+#define __KHASH_TYPE(name, khkey_t, khval_t) \
+ typedef struct kh_##name##_s { \
+ khint_t n_buckets, size, n_occupied, upper_bound; \
+ khint32_t *flags; \
+ khkey_t *keys; \
+ khval_t *vals; \
+ } kh_##name##_t;
+
+#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
+ extern kh_##name##_t *kh_init_##name(void); \
+ extern void kh_destroy_##name(kh_##name##_t *h); \
+ extern void kh_clear_##name(kh_##name##_t *h); \
+ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
+ extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+ extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ SCOPE kh_##name##_t *kh_init_##name(void) { \
+ return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
+ } \
+ SCOPE void kh_destroy_##name(kh_##name##_t *h) \
+ { \
+ if (h) { \
+ kfree((void *)h->keys); kfree(h->flags); \
+ kfree((void *)h->vals); \
+ kfree(h); \
+ } \
+ } \
+ SCOPE void kh_clear_##name(kh_##name##_t *h) \
+ { \
+ if (h && h->flags) { \
+ memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
+ h->size = h->n_occupied = 0; \
+ } \
+ } \
+ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
+ { \
+ if (h->n_buckets) { \
+ khint_t k, i, last, mask, step = 0; \
+ mask = h->n_buckets - 1; \
+ k = __hash_func(key); i = k & mask; \
+ last = i; \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ i = (i + (++step)) & mask; \
+ if (i == last) return h->n_buckets; \
+ } \
+ return __ac_iseither(h->flags, i)? h->n_buckets : i; \
+ } else return 0; \
+ } \
+ SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+ { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+ khint32_t *new_flags = 0; \
+ khint_t j = 1; \
+ { \
+ kroundup32(new_n_buckets); \
+ if (new_n_buckets < 4) new_n_buckets = 4; \
+ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
+ else { /* hash table size to be changed (shrink or expand); rehash */ \
+ new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+ if (!new_flags) return -1; \
+ memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+ if (h->n_buckets < new_n_buckets) { /* expand */ \
+ khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (!new_keys) { kfree(new_flags); return -1; } \
+ h->keys = new_keys; \
+ if (kh_is_map) { \
+ khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+ if (!new_vals) { kfree(new_flags); return -1; } \
+ h->vals = new_vals; \
+ } \
+ } /* otherwise shrink */ \
+ } \
+ } \
+ if (j) { /* rehashing is needed */ \
+ for (j = 0; j != h->n_buckets; ++j) { \
+ if (__ac_iseither(h->flags, j) == 0) { \
+ khkey_t key = h->keys[j]; \
+ khval_t val; \
+ khint_t new_mask; \
+ new_mask = new_n_buckets - 1; \
+ if (kh_is_map) val = h->vals[j]; \
+ __ac_set_isdel_true(h->flags, j); \
+ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+ khint_t k, i, step = 0; \
+ k = __hash_func(key); \
+ i = k & new_mask; \
+ while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
+ __ac_set_isempty_false(new_flags, i); \
+ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+ __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+ } else { /* write the element and jump out of the loop */ \
+ h->keys[i] = key; \
+ if (kh_is_map) h->vals[i] = val; \
+ break; \
+ } \
+ } \
+ } \
+ } \
+ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+ h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+ if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+ } \
+ kfree(h->flags); /* free the working space */ \
+ h->flags = new_flags; \
+ h->n_buckets = new_n_buckets; \
+ h->n_occupied = h->size; \
+ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+ } \
+ return 0; \
+ } \
+ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+ { \
+ khint_t x; \
+ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+ if (h->n_buckets > (h->size<<1)) { \
+ if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
+ *ret = -1; return h->n_buckets; \
+ } \
+ } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
+ *ret = -1; return h->n_buckets; \
+ } \
+ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
+ { \
+ khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
+ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
+ else { \
+ last = i; \
+ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+ if (__ac_isdel(h->flags, i)) site = i; \
+ i = (i + (++step)) & mask; \
+ if (i == last) { x = site; break; } \
+ } \
+ if (x == h->n_buckets) { \
+ if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+ else x = i; \
+ } \
+ } \
+ } \
+ if (__ac_isempty(h->flags, x)) { /* not present at all */ \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; ++h->n_occupied; \
+ *ret = 1; \
+ } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
+ h->keys[x] = key; \
+ __ac_set_isboth_false(h->flags, x); \
+ ++h->size; \
+ *ret = 2; \
+ } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+ return x; \
+ } \
+ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
+ { \
+ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
+ __ac_set_isdel_true(h->flags, x); \
+ --h->size; \
+ } \
+ }
+
+#define KHASH_DECLARE(name, khkey_t, khval_t) \
+ __KHASH_TYPE(name, khkey_t, khval_t) \
+ __KHASH_PROTOTYPES(name, khkey_t, khval_t)
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ __KHASH_TYPE(name, khkey_t, khval_t) \
+ __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+ KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+ @abstract Integer hash function
+ @param key The integer [khint32_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+ @abstract Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract 64-bit integer hash function
+ @param key The integer [khint64_t]
+ @return The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+ @abstract 64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+ @abstract const char* hash function
+ @param s Pointer to a null terminated string
+ @return The hash value
+ */
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
+{
+ khint_t h = (khint_t)*s;
+ if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
+ return h;
+}
+/*! @function
+ @abstract Another interface to const char* hash function
+ @param key Pointer to a null terminated string [const char*]
+ @return The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+ @abstract Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+static kh_inline khint_t __ac_Wang_hash(khint_t key)
+{
+ key += ~(key << 15);
+ key ^= (key >> 10);
+ key += (key << 3);
+ key ^= (key >> 6);
+ key += ~(key << 11);
+ key ^= (key >> 16);
+ return key;
+}
+#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+ @abstract Type of the hash table.
+ @param name Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+ @abstract Initiate a hash table.
+ @param name Name of the hash table [symbol]
+ @return Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+ @abstract Destroy a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+ @abstract Reset a hash table without deallocating memory.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+ @abstract Resize a hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param s New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+ @abstract Insert a key to the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @param r Extra return code: -1 if the operation failed;
+ 0 if the key is present in the hash table;
+ 1 if the bucket is empty (never used); 2 if the element in
+ the bucket has been deleted [int*]
+ @return Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+ @abstract Retrieve a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Key [type of keys]
+ @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+ @abstract Remove a key from the hash table.
+ @param name Name of the hash table [symbol]
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param k Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+ @abstract Test whether a bucket contains data.
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return 1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+ @abstract Get key given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+ @abstract Get value given an iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param x Iterator to the bucket [khint_t]
+ @return Value [type of values]
+ @discussion For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+ @abstract Get the start iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+ @abstract Get the end iterator
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+ @abstract Get the number of elements in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+ @abstract Get the number of buckets in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @return Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/*! @function
+ @abstract Iterate over the entries in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param kvar Variable to which key will be assigned
+ @param vvar Variable to which value will be assigned
+ @param code Block of code to execute
+ */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \
+ for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
+ if (!kh_exist(h,__i)) continue; \
+ (kvar) = kh_key(h,__i); \
+ (vvar) = kh_val(h,__i); \
+ code; \
+ } }
+
+/*! @function
+ @abstract Iterate over the values in the hash table
+ @param h Pointer to the hash table [khash_t(name)*]
+ @param vvar Variable to which value will be assigned
+ @param code Block of code to execute
+ */
+#define kh_foreach_value(h, vvar, code) { khint_t __i; \
+ for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \
+ if (!kh_exist(h,__i)) continue; \
+ (vvar) = kh_val(h,__i); \
+ code; \
+ } }
+
+/* More convenient interfaces */
+
+/*! @function
+ @abstract Instantiate a hash set containing integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name) \
+ KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t) \
+ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash set containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name) \
+ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing 64-bit integer keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t) \
+ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+ @abstract Instantiate a hash set containing const char* keys
+ @param name Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name) \
+ KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+ @abstract Instantiate a hash map containing const char* keys
+ @param name Name of the hash table [symbol]
+ @param khval_t Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t) \
+ KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/contrib/lua-torch/decisiontree/math.lua b/contrib/lua-torch/decisiontree/math.lua
new file mode 100644
index 000000000..eb71b31ed
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/math.lua
@@ -0,0 +1,84 @@
+local dt = require "decisiontree._env"
+
+local PSEUDOCOUNT = 1.0
+local MIN_LOGISTIC = 1E-8
+local MAX_LOGISTIC = 1.0 - MIN_LOGISTIC
+
+-- Create counts of the possible results (class labels are read from inputset.target)
+function dt.uniquecounts(counts, inputset, nclass)
+ counts = counts or inputset.input.new()
+ nclass = nclass or inputset.target:max()
+ counts:resize(nclass):zero()
+
+ inputset.target:apply(function(c) counts[c] = counts[c] + 1 end)
+ return counts
+end
+
+-- Entropy is the sum of -p(x)log(p(x)) across all the different possible results
+local counts, logprobs
+function dt.entropy(inputset, nclass)
+ local dt = require 'decisiontree'
+ counts = dt.uniquecounts(counts, inputset, nclass)
+ -- convert counts to categorical probabilities
+ counts:add(0.0000001) -- prevent NaN
+ counts:div(counts:sum())
+
+ logprobs = logprobs or counts.new()
+ logprobs:resize(counts:size())
+ logprobs:log(counts):div(math.log(2)) -- log2(x)
+
+ counts:cmul(logprobs)
+
+ return -counts:sum()
+end
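+
+-- Illustrative check (not part of the original file): with two equally
+-- frequent classes the entropy is one bit, e.g.
+--   dt.entropy({input=torch.Tensor(), target=torch.LongTensor{1,2,1,2}}, 2)
+-- returns approximately 1.0.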
+
+-- Compute and return the probability of a positive label.
+function dt.probabilityPositive(nPositive, nTotal)
+ return (nPositive + PSEUDOCOUNT) / (nTotal + 2.0 * PSEUDOCOUNT);
+end
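+
+-- Worked example (illustrative): dt.probabilityPositive(9, 10) == 10/12,
+-- i.e. the Laplace-smoothed estimate with a pseudocount of 1 per class.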
+
+-- Ref. https://en.wikipedia.org/wiki/Logit
+-- Calculates logit of the probability.
+-- Logit represents the log-odds. Probabilities transformed to logit 'space' can be combined linearly.
+function dt.logit(p)
+ assert(p >= 0.0 and p <= 1.0, "Expecting probability for arg 1")
+ local truncatedP = math.max(MIN_LOGISTIC, math.min(MAX_LOGISTIC, p))
+ return math.log(truncatedP / (1.0 - truncatedP))
+end
+
+function dt.logistic(x)
+ return (x >= 0) and (1 / (1 + math.exp(-x))) or (1 - 1 / (1 + math.exp(x)))
+end
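+
+-- Quick sanity sketch (illustrative, not part of the original file):
+--   assert(dt.logit(0.5) == 0) -- even odds have zero log-odds
+--   assert(math.abs(dt.logistic(dt.logit(0.8)) - 0.8) < 1e-6) -- inverses on (0,1)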
+
+function dt.computeGradientBoostLoss(gradient, hessian)
+ return -gradient * gradient / hessian
+end
+
+function dt.computeNewtonScore(gradient, hessian)
+ return -0.5 * gradient / hessian;
+end
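+
+-- Worked example (illustrative): for gradient -2 and hessian 1,
+-- dt.computeNewtonScore(-2, 1) == 1.0 (note the 0.5 factor relative to the
+-- textbook Newton step -g/h) and dt.computeGradientBoostLoss(-2, 1) == -4;
+-- more negative loss values indicate better candidate splits.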
+
+-- Calculates the logit score for a Node in a Decision Tree based on the probability of a positive label.
+-- params: number of positive examples and total number of examples.
+function dt.calculateLogitScore(nPositive, nTotal)
+ local dt = require 'decisiontree'
+ return dt.logit(dt.probabilityPositive(nPositive, nTotal))
+end
+
+-- Compute and return the Gini impurity score based on an input contingency table.
+function dt.computeGini(leftCount, positiveLeftCount, rightCount, positiveRightCount)
+ assert(torch.type(leftCount) == 'number', 'Expecting total number of examples falling into the left branch.')
+ assert(torch.type(positiveLeftCount) == 'number', 'Expecting total number of positive examples falling into left branch.')
+ assert(torch.type(rightCount) == 'number', 'Expecting total number of examples falling into the right branch.')
+ assert(torch.type(positiveRightCount) == 'number', 'Expecting total number of positive examples falling into the right branch.')
+
+ local total = leftCount + rightCount
+
+ local pPositiveLeft = leftCount == 0 and 0 or (positiveLeftCount / leftCount)
+ local leftGini = pPositiveLeft * (1.0 - pPositiveLeft)
+
+ local pPositiveRight = rightCount == 0 and 0 or (positiveRightCount / rightCount)
+ local rightGini = pPositiveRight * (1.0 - pPositiveRight)
+
+ return (leftCount * leftGini + rightCount * rightGini) / total
+end
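+
+-- Worked example (illustrative, not part of the original file): sending all
+-- 50 positives left and all 50 negatives right gives
+--   dt.computeGini(50, 50, 50, 0) == 0 (a pure split),
+-- while keeping both branches at 50% positive gives
+--   dt.computeGini(50, 25, 50, 25) == 0.25, the maximum impurity.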
diff --git a/contrib/lua-torch/decisiontree/rocks/decisiontree-scm-1.rockspec b/contrib/lua-torch/decisiontree/rocks/decisiontree-scm-1.rockspec
new file mode 100644
index 000000000..d5d916275
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/rocks/decisiontree-scm-1.rockspec
@@ -0,0 +1,40 @@
+package = "decisiontree"
+version = "scm-1"
+
+source = {
+ url = "git://github.com/Twitter/decisiontree",
+ tag = "master"
+}
+
+description = {
+ summary = "Decision trees for Torch by Twitter",
+ detailed = [[
+ Classification and regression trees (CART).
+ Gradient boosted decision trees (GBDT).
+ ]],
+ homepage = "https://github.com/Twitter/decisiontree",
+ license = "BSD"
+}
+
+dependencies = {
+ "torch >= 7.0",
+ "moses >= 1.3.1",
+ "xlua >= 1.0",
+ "image >= 1.0",
+ "luafilesystem >= 1.6.2",
+ "sys >= 1.1",
+ "paths >= 1.0",
+ "ipc >= 1.0",
+ "nn >= 1.0"
+}
+
+build = {
+ type = "command",
+ build_command = [[
+cmake -E make_directory build;
+cd build;
+cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" -DCMAKE_C_FLAGS=-fPIC -DCMAKE_CXX_FLAGS=-fPIC;
+$(MAKE)
+ ]],
+ install_command = "cd build && $(MAKE) install"
+}
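+
+-- Illustrative usage (not part of the rockspec): from the package root,
+-- `luarocks make rocks/decisiontree-scm-1.rockspec` runs the cmake build
+-- defined above and installs into the current rocks tree.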
diff --git a/contrib/lua-torch/decisiontree/test.lua b/contrib/lua-torch/decisiontree/test.lua
new file mode 100644
index 000000000..80510a4f6
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/test.lua
@@ -0,0 +1,817 @@
+local dt = require "decisiontree._env"
+
+local dttest = {}
+local nloop = 50
+local epsilon = 0.000001
+local mytester
+
+--e.g. usage: th -e "dt = require 'decisiontree'; dt.test()"
+
+-- test 99% accuracy
+local function testAccuracy(cartTree, name, dataset, minacc)
+ assert(torch.isTypeOf(dataset, 'dt.DataSet'))
+ minacc = minacc or 0.99
+ local output = torch.Tensor(dataset:size())
+ local target, input = dataset.target, dataset.input
+
+ for i=1,dataset:size() do
+ local stack = {}
+ local score = cartTree:score(input[i], stack)
+ output[i] = score >= 0 and 1 or 0
+
+ if dt.VERBOSE and torch.type(cartTree) == 'dt.CartTree' and target[i] ~= output[i] then
+ print(cartTree:stackToString(stack, input[i]))
+ print(i, score, target[i], output[i])
+ end
+ end
+
+ local accuracy = torch.eq(target, output):float():mean()
+ mytester:assert(accuracy >= minacc, name .. ": insufficient accuracy: " .. accuracy .. " < " .. minacc)
+end
+
+function dttest.SparseTensor()
+ local keys = torch.LongTensor{1,5,6,10}
+ local values = torch.randn(keys:size(1))
+ local st = torch.SparseTensor(keys, values)
+
+ mytester:assert(st[1] == values[1])
+ mytester:assert(st[5] == values[2])
+ mytester:assert(st[6] == values[3])
+ mytester:assert(st[10] == values[4])
+
+ mytester:assert(st[2] == nil)
+
+ st:buildIndex()
+
+ mytester:assert(st[1] == values[1])
+ mytester:assert(st[5] == values[2])
+ mytester:assert(st[6] == values[3])
+ mytester:assert(st[10] == values[4])
+
+ mytester:assert(st[2] == nil)
+
+ -- test empty sparse tensor
+
+ local est = torch.SparseTensor()
+end
+
+function dttest.GiniState()
+ local featureId = 2
+ local minLeafSize = 0
+
+ local input = torch.Tensor({{0,1,0},{0,2,0},{0,3,0}})
+ local target = torch.Tensor({1, 1, 1})
+ local dataset = dt.DataSet(input, target, 3)
+
+ local splitInfo1 = {_id=1}
+ local splitInfo2 = {_id=2, leftChildSize = 1, rightChildSize = 2, splitGain = 0}
+ local splitInfo3 = {_id=3, leftChildSize = 2, rightChildSize = 1, splitGain = -1}
+
+ local exampleIds = torch.LongTensor{1,2,3}
+
+ local treeState = dt.GiniState(exampleIds)
+
+ function treeState.computeSplitInfo(self, splitFeatureId, splitFeatureValue)
+ if splitFeatureId == featureId and splitFeatureValue == 2 then
+ return splitInfo2
+ elseif splitFeatureId == featureId and splitFeatureValue == 3 then
+ return splitInfo3
+ else
+ error("Unhandled computeSplitInfo call "..splitFeatureId.." "..splitFeatureValue)
+ end
+ end
+
+ local splitInfo = treeState:findBestFeatureSplit(dataset, featureId, minLeafSize)
+ mytester:assert(splitInfo._id == splitInfo3._id)
+end
+
+function dttest.CartTree()
+
+ local splitFeatureId = 100
+ local splitFeatureValue = 1.0
+
+ local function getBinaryCartTreeRootNode()
+
+ local leftNodeScore = 0.2
+ local rightNodeScore = 0.4
+
+ local rootNode = dt.CartNode()
+ rootNode.nodeId = 0
+ rootNode.score = 0.5
+ rootNode.splitFeatureId = splitFeatureId
+ rootNode.splitFeatureValue = splitFeatureValue
+
+ local leftChild = dt.CartNode()
+ leftChild.score = leftNodeScore
+ leftChild.nodeId = 1
+
+ local rightChild = dt.CartNode()
+ rightChild.score = rightNodeScore
+ rightChild.nodeId = 2
+
+ rootNode.leftChild = leftChild
+ rootNode.rightChild = rightChild
+
+ return rootNode
+ end
+
+ local function testScoreCartTreeBranchLeftIfMissing()
+ local rootNode = getBinaryCartTreeRootNode()
+
+ local cartTree = dt.CartTree(rootNode)
+
+ local continuousFeatures = torch.SparseTensor()
+
+ local score, nodeId = cartTree:score(continuousFeatures)
+
+ mytester:assert(math.abs(rootNode.leftChild.score - score) < epsilon)
+ mytester:assert(rootNode.leftChild.nodeId == nodeId)
+ end
+
+ local function testBranchRightWithFeature()
+ local rootNode = getBinaryCartTreeRootNode()
+
+ local cartTree = dt.CartTree(rootNode)
+
+ local continuousFeatures = torch.zeros(100)
+ continuousFeatures[splitFeatureId] = splitFeatureValue
+
+ local score, nodeId = cartTree:score(continuousFeatures)
+
+ mytester:assert(math.abs(rootNode.rightChild.score - score) < epsilon)
+ mytester:assert(rootNode.rightChild.nodeId == nodeId)
+ end
+
+ local function testMissingRightNode()
+ local rootNode = getBinaryCartTreeRootNode()
+
+ rootNode.rightChild = nil
+
+ local cartTree = dt.CartTree(rootNode)
+
+ local continuousFeatures = torch.Tensor()
+
+ local score, nodeId = cartTree:score(continuousFeatures)
+
+ mytester:assert(math.abs(rootNode.leftChild.score - score) < epsilon)
+ mytester:assert(rootNode.leftChild.nodeId == nodeId)
+ end
+
+ local function testMissingLeftNode()
+ local rootNode = getBinaryCartTreeRootNode()
+
+ rootNode.leftChild = nil
+
+ local cartTree = dt.CartTree(rootNode)
+
+ local continuousFeatures = torch.Tensor()
+
+ local score, nodeId = cartTree:score(continuousFeatures)
+
+ mytester:assert(math.abs(rootNode.rightChild.score - score) < epsilon)
+ mytester:assert(rootNode.rightChild.nodeId == nodeId)
+ end
+
+ local function testMissingAllChildren()
+ local rootNode = getBinaryCartTreeRootNode()
+
+ rootNode.leftChild = nil
+ rootNode.rightChild = nil
+
+ local cartTree = dt.CartTree(rootNode)
+
+ local continuousFeatures = torch.Tensor()
+
+ local score, nodeId = cartTree:score(continuousFeatures)
+
+ mytester:assert(math.abs(rootNode.score - score) < epsilon)
+ mytester:assert(rootNode.nodeId == nodeId)
+ end
+
+ local function testScoreCartTreeBranchRandomlyRight()
+ local rootNode = getBinaryCartTreeRootNode();
+
+ -- Force Branch Right
+ local cartTree = dt.CartTree(rootNode, function() return false end);
+
+ local continuousFeatures = torch.SparseTensor()
+
+ local score, nodeId = cartTree:score(continuousFeatures)
+
+ mytester:assert(math.abs(rootNode.rightChild.score - score) < epsilon)
+ mytester:assert(rootNode.rightChild.nodeId == nodeId)
+ end
+
+ local function testScoreCartTreeBranchRandomlyLeft()
+ local rootNode = getBinaryCartTreeRootNode();
+
+ -- Force Branch Left
+ local cartTree = dt.CartTree(rootNode, function() return true end);
+
+ local continuousFeatures = torch.SparseTensor()
+
+ local score, nodeId = cartTree:score(continuousFeatures)
+
+ mytester:assert(math.abs(rootNode.leftChild.score - score) < epsilon)
+ mytester:assert(rootNode.leftChild.nodeId == nodeId)
+ end
+
+ testScoreCartTreeBranchLeftIfMissing()
+ testBranchRightWithFeature()
+ testMissingRightNode()
+ testMissingLeftNode()
+ testMissingAllChildren()
+ testScoreCartTreeBranchRandomlyRight()
+ testScoreCartTreeBranchRandomlyLeft()
+
+end
+
+function dttest.TreeState_branch()
+ local _ = require 'moses'
+ local binFeatureId = 1
+ local featureId = 2
+
+ local input = {
+ torch.SparseTensor(torch.LongTensor{binFeatureId},torch.Tensor{1}),
+ torch.SparseTensor(torch.LongTensor{binFeatureId,featureId},torch.Tensor{1,1}),
+ torch.SparseTensor(torch.LongTensor{binFeatureId,featureId},torch.Tensor{0,2}),
+ torch.SparseTensor(torch.LongTensor{binFeatureId,featureId},torch.Tensor{0,3})
+ }
+ local target = torch.LongTensor(4):fill(1)
+
+ local dataset = dt.DataSet(input, target)
+
+ local treeState = dt.TreeState(torch.LongTensor():range(1,4))
+ local splitInfo = {splitId = binFeatureId, splitValue = 1}
+
+ local function testBranchBinaryFeature()
+ splitInfo = {splitId = binFeatureId, splitValue = 1}
+ local leftBranch, rightBranch = treeState:branch(splitInfo, dataset)
+ mytester:assert(leftBranch ~= nil and rightBranch ~= nil)
+
+ mytester:assert(2 == leftBranch:size())
+ mytester:assert(leftBranch:contains(3))
+ mytester:assert(leftBranch:contains(4))
+
+ mytester:assert(2 == rightBranch:size())
+ mytester:assert(rightBranch:contains(1))
+ mytester:assert(rightBranch:contains(2))
+ end
+
+ local function testBranchContinuousFeature()
+ local splitValue = 2
+ splitInfo = {splitId = featureId, splitValue = splitValue}
+
+ local leftBranch, rightBranch = treeState:branch(splitInfo, dataset)
+ mytester:assert(leftBranch ~= nil and rightBranch ~= nil)
+
+ mytester:assert(1 == leftBranch:size())
+ mytester:assert(leftBranch:contains(2))
+
+ mytester:assert(2 == rightBranch:size())
+ mytester:assert(rightBranch:contains(3))
+ mytester:assert(rightBranch:contains(4))
+ end
+
+ testBranchBinaryFeature()
+ testBranchContinuousFeature()
+
+end
+
+function dttest.DecisionForest()
+ -- Create a test decision forest; each tree has a single node, so scoring returns that root node's score.
+
+ local function createCartTreeWithSingleNode(score)
+ local cartNode = dt.CartNode()
+ cartNode.score = score
+ return dt.CartTree(cartNode)
+ end
+
+ local function getTestDecisionForest()
+ local cartTrees = {
+ createCartTreeWithSingleNode(1),
+ createCartTreeWithSingleNode(2),
+ createCartTreeWithSingleNode(3)
+ }
+ local weight = torch.Tensor{10,20,30}
+ local bias = 0.5
+
+ return dt.DecisionForest(cartTrees, weight, bias)
+ end
+
+ local function testScoreDecisionForest()
+ local df = getTestDecisionForest()
+ local continuousFeatures = torch.SparseTensor()
+
+ local expectedResult = 1.0 * 10.0 + 2.0 * 20.0 + 3.0 * 30.0 + 0.5;
+ local result = df:score(continuousFeatures)
+
+ mytester:assert(math.abs(expectedResult - result) < epsilon)
+ end
+
+ testScoreDecisionForest()
+end
+
+function dttest.CartTrainer()
+ local minLeafSize, maxLeafNodes = 1, 1000
+ local nExample = 100
+
+ -- 1. dense dataset
+ local trainSet, validSet, clusterExamples, inputs, targets = dt.getDenseDummyData(nExample)
+
+ -- assert that the dataset is valid
+ for clusterId, exampleIds in ipairs(clusterExamples) do
+ local exampleIdx = torch.LongTensor(exampleIds)
+ local input = inputs:index(1,exampleIdx)
+ local target = targets:index(1,exampleIdx)
+ assert(input:std(1):mean() < 0.05)
+ end
+
+ local cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNodes)
+ local treeState = dt.GiniState(trainSet:getExampleIds())
+ local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
+
+ mytester:assert(nleaf == nExample) -- with dense inputs, minLeafSize=1 and maxLeafNodes=inf, every example gets its own leaf
+ testAccuracy(cartTree, "dense train single-thread first", trainSet, 0.99)
+ testAccuracy(cartTree, "dense valid single-thread first", validSet, 0.7) -- they don't generalize very well..
+
+ local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
+ testAccuracy(cartTree, "dense single-thread second", trainSet)
+
+ -- test feature parallelization
+ local nThread = 2
+ cartTrainer:featureParallel(nThread)
+ local treeState = dt.GiniState(trainSet:getExampleIds())
+ local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
+ testAccuracy(cartTree, "dense feature-parallel", trainSet)
+
+ -- 2. sparse-dense dataset
+ local trainSet, validSet, clusterExamples, inputs, targets = dt.getSparseDummyData(nExample, nil, 10, nil, nil, 10)
+
+ -- assert that the dataset is valid
+ for clusterId, exampleIds in ipairs(clusterExamples) do
+ local input = torch.Tensor(#exampleIds, 10):zero()
+ for i, exampleId in ipairs(exampleIds) do
+ input[i]:indexCopy(1, inputs[exampleId].keys, inputs[exampleId].values)
+ end
+ assert(input:std(1):mean() < 0.05)
+ end
+
+ local cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNodes)
+ local treeState = dt.GiniState(trainSet:getExampleIds())
+ local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
+
+ mytester:assert(nleaf == nExample) -- with dense inputs, minLeafSize=1 and maxLeafNodes=inf, every example gets its own leaf
+ testAccuracy(cartTree, "sparse-dense train single-thread first", trainSet, 0.99)
+
+ local shuffle = torch.LongTensor():randperm(10)
+ for i, input in ipairs(inputs) do
+ input.keys = input.keys:index(1, shuffle)
+ input.values = input.values:index(1, shuffle)
+ input._map = nil
+ end
+ testAccuracy(cartTree, "sparse-dense shuffled keys train single-thread first", trainSet, 0.99)
+ testAccuracy(cartTree, "sparse-dense valid single-thread first", validSet, 0.8)
+
+ -- 3. sparse dataset
+ local trainSet, validSet = dt.getSparseDummyData(nExample, 2, 10, nil, nil, 9)
+
+ local cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNodes)
+ local treeState = dt.GiniState(trainSet:getExampleIds())
+ local cartTree, nleaf = cartTrainer:train(treeState, trainSet.featureIds)
+ cartTree.branchleft = function() return true end
+
+ mytester:assert(nleaf < nExample) -- with sparse inputs, examples missing the split feature are dropped, so there are fewer leaves than examples
+ testAccuracy(cartTree, "sparse train single-thread first", trainSet, 0.9) -- the TreeBrancher drops examples with missing features, making it difficult to overfit
+ testAccuracy(cartTree, "sparse valid single-thread first", validSet, 0.8)
+end
+
+function dttest.GradientBoostTrainer()
+ local nExample = 100
+ local trainSet, validSet = dt.getSparseDummyData(nExample, 2, 10, nil, nil, 9)
+
+ local maxLeafNode, minLeafSize = nExample/2, nExample/10
+ local loss = nn.LogitBoostCriterion(false)
+
+ local cartTrainer = dt.CartTrainer(trainSet, minLeafSize, maxLeafNode)
+
+ local opt = {
+ lossFunction=loss,
+ treeTrainer=cartTrainer,
+ shrinkage=0.1,
+ downsampleRatio=6,
+ featureBaggingSize=-1,
+ nTree=14,
+ evalFreq=8,
+ earlyStop=0 -- no early-stopping
+ }
+
+ -- test single-thread
+ local trainer = dt.GradientBoostTrainer(opt)
+ local decisionForest = trainer:train(trainSet, trainSet.featureIds, validSet)
+
+ mytester:assert(#decisionForest.trees == opt.nTree)
+ testAccuracy(decisionForest, "sparse train single-thread first", trainSet, 0.98)
+ testAccuracy(decisionForest, "sparse valid single-thread first", validSet, 0.95)
+
+ -- test stateless
+ local decisionForest = trainer:train(trainSet, trainSet.featureIds, validSet)
+
+ mytester:assert(#decisionForest.trees == opt.nTree)
+ testAccuracy(decisionForest, "sparse train single-thread second", trainSet, 0.98)
+ testAccuracy(decisionForest, "sparse valid single-thread second", validSet, 0.95)
+
+ -- test feature-parallel
+ local nThread = 2
+ cartTrainer:featureParallel(nThread)
+
+ local trainer = dt.GradientBoostTrainer(opt)
+ local decisionForest = trainer:train(trainSet, trainSet.featureIds, validSet)
+
+ mytester:assert(#decisionForest.trees == opt.nTree)
+ testAccuracy(decisionForest, "sparse train feature-parallel first", trainSet, 0.98)
+ testAccuracy(decisionForest, "sparse valid feature-parallel first", validSet, 0.95)
+
+end
+
+function dttest.RandomForestTrainer()
+ local nExample = 100
+ local trainSet, validSet = dt.getSparseDummyData(nExample, 2, 10, nil, nil, 9)
+
+ local opt = {
+ activeRatio=0.5,
+ featureBaggingSize=5,
+ nTree=14,
+ maxLeafNodes=nExample/2,
+ minLeafSize=nExample/10,
+ }
+
+ local trainer = dt.RandomForestTrainer(opt)
+
+ local decisionForest = trainer:train(trainSet, trainSet.featureIds)
+ mytester:assert(#decisionForest.trees == opt.nTree)
+ testAccuracy(decisionForest, "sparse train single-thread first", trainSet, 0.98)
+ testAccuracy(decisionForest, "sparse valid single-thread first", validSet, 0.95)
+
+ -- test stateless
+ local decisionForest = trainer:train(trainSet, trainSet.featureIds)
+
+ mytester:assert(#decisionForest.trees == opt.nTree)
+ testAccuracy(decisionForest, "sparse train single-thread second", trainSet, 0.98)
+ testAccuracy(decisionForest, "sparse valid single-thread second", validSet, 0.95)
+
+ -- test tree-parallel
+ local nThread = 2
+ trainer:treeParallel(nThread)
+
+ local trainer = dt.RandomForestTrainer(opt)
+ local decisionForest = trainer:train(trainSet, trainSet.featureIds)
+
+ mytester:assert(#decisionForest.trees == opt.nTree)
+ testAccuracy(decisionForest, "sparse train tree-parallel first", trainSet, 0.98)
+ testAccuracy(decisionForest, "sparse valid tree-parallel first", validSet, 0.95)
+end
+
+function dttest.WorkPool()
+ local nThread = 2
+ local wp = dt.WorkPool(nThread)
+
+ -- 1. some easy tests
+ local store = {key='nick',value=7}
+ wp:update('storeKeyValue', store)
+
+ wp:update('require', {libname='decisiontree', varname='dt'})
+
+ local bias = 2
+ local obj = nn.MSECriterion()
+ wp:update('require', {libname='decisiontree', varname='dt'})
+ wp:writeup('execute', function(store) return bias + obj:updateOutput(torch.Tensor{1},torch.Tensor{1}) + store.nick end)
+
+ local taskname, res = wp:read()
+ mytester:assert(taskname == 'execute')
+ mytester:assert(res == 9)
+
+ -- 2. trying to reproduce a difficult error
+ local trainSet, validSet = dt.getSparseDummyData()
+
+ -- setup worker store (each worker will have its own copy)
+ local store = {
+ trainSet=trainSet,
+ minLeafSize=2
+ }
+ wp:update('storeKeysValues', store)
+
+ -- arguments/upvalues
+ local treeState = dt.GiniState(trainSet:getExampleIds())
+ local shardId = 1
+ local nShard = nThread
+ local featureIds = trainSet.featureIds
+
+ local task = function(store, args)
+ assert(store.trainSet)
+ assert(store.minLeafSize)
+
+ local bestSplit = args.treeState:findBestSplit(store.trainSet, args.featureIds, store.minLeafSize, args.shardId, args.nShard)
+ return bestSplit
+ end
+ local args = {treeState=treeState,featureIds=featureIds,shardId=shardId,nShard=nShard}
+ wp:writeup("execute", {func=task,args=args})
+
+ local taskname, bestSplit = wp:read()
+ mytester:assert(taskname == 'execute')
+ mytester:assert(torch.type(bestSplit) == 'table')
+
+ -- closure
+ local task = function(store)
+ assert(store.trainSet)
+ assert(store.minLeafSize)
+
+ local bestSplit = treeState:findBestSplit(store.trainSet, featureIds, store.minLeafSize, shardId, nShard)
+ return bestSplit
+ end
+ wp:writeup("execute", task)
+
+ local taskname, bestSplit = wp:read()
+ mytester:assert(taskname == 'execute')
+ mytester:assert(torch.type(bestSplit) == 'table')
+
+ local task = function(store, args)
+ assert(store.trainSet)
+ assert(torch.isTypeOf(treeState, 'dt.TreeState'), torch.type(treeState))
+
+ local bestSplit = treeState:findBestSplit(store.trainSet, featureIds, store.minLeafSize, shardId, nShard)
+ return bestSplit
+ end
+ local args = {featureIds=featureIds,shardId=shardId,nShard=nShard}
+ wp:writeup("execute", {func=task,args=args})
+
+
+ local taskname, bestSplit = wp:read()
+ mytester:assert(taskname == 'execute')
+ mytester:assert(torch.type(bestSplit) == 'table')
+
+ wp:terminate()
+end
+
+function dttest.Sparse2Dense()
+ local batchsize = 4
+ local minFeatureId, maxFeatureId = 10, 100
+ local input = {{},{}}
+ for i=1,batchsize do
+ local inputsize = math.random(5,10)
+ input[1][i] = torch.LongTensor(inputsize):random(minFeatureId,maxFeatureId)
+ input[2][i] = torch.Tensor(inputsize):uniform(0,1)
+ end
+ local s2d = nn.Sparse2Dense(torch.LongTensor():range(minFeatureId,maxFeatureId))
+ -- test 2d forward
+ local output = s2d:forward(input)
+ local output2 = torch.Tensor(batchsize, maxFeatureId-minFeatureId+1):zero()
+ local featureMap = {}
+ local j = 0
+ for i=minFeatureId,maxFeatureId do
+ j = j + 1
+ featureMap[i] = j
+ end
+ for i=1,batchsize do
+ local keys, values = input[1][i], input[2][i]
+ for j=1,keys:size(1) do
+ output2[{i,featureMap[keys[j] ]}] = values[j]
+ end
+ end
+ mytester:assertTensorEq(output, output2, 0.000001)
+ -- test 1d forward
+ local input = {input[1][batchsize], input[2][batchsize]}
+ local output = s2d:forward(input)
+ mytester:assertTensorEq(output, output2[batchsize], 0.000001)
+end
+
+function dttest.Sparse2DenseDouble()
+ local batchsize = 4
+ local minFeatureId, maxFeatureId = 10, 100
+ local input = {{},{}}
+ for i=1,batchsize do
+ local inputsize = math.random(5,10)
+ input[1][i] = torch.LongTensor(inputsize):random(minFeatureId,maxFeatureId)
+ input[2][i] = torch.Tensor(inputsize):uniform(0,1):double()
+ end
+ local s2d = nn.Sparse2Dense(torch.LongTensor():range(minFeatureId,maxFeatureId))
+ s2d:double()
+ -- test 2d forward
+ local output = s2d:forward(input)
+ local output2 = torch.Tensor(batchsize, maxFeatureId-minFeatureId+1):zero():double()
+ local featureMap = {}
+ local j = 0
+ for i=minFeatureId,maxFeatureId do
+ j = j + 1
+ featureMap[i] = j
+ end
+ for i=1,batchsize do
+ local keys, values = input[1][i], input[2][i]
+ for j=1,keys:size(1) do
+ output2[{i,featureMap[keys[j] ]}] = values[j]
+ end
+ end
+ mytester:assertTensorEq(output, output2, 0.000001)
+ -- test 1d forward
+ local input = {input[1][batchsize], input[2][batchsize]}
+ local output = s2d:forward(input)
+ mytester:assertTensorEq(output, output2[batchsize], 0.000001)
+end
+
+function dttest.LogitBoostCriterion()
+ local input = torch.randn(10)
+ local target = torch.LongTensor(10):random(0,1):type(torch.type(input))
+
+ local lb = nn.LogitBoostCriterion(false)
+ local loss = lb:updateOutput(input, target)
+
+ local loss2 = 0
+ for i=1,10 do
+ loss2 = loss2 + math.log(1 + math.exp(target[i] <= 0 and input[i] or -input[i]))
+ end
+ mytester:assert(math.abs(loss - loss2) < 0.00001)
+
+ local gradInput = lb:updateGradInput(input, target)
+ local gradInput2 = gradInput:clone():zero()
+ for i=1,10 do
+ local p = dt.logistic(input[i])
+ gradInput2[i] = (target[i] <= 0) and p or (p - 1)
+ end
+ mytester:assertTensorEq(gradInput, gradInput2, 0.000001)
+
+ local hessInput = lb:updateHessInput(input, target)
+ local hessInput2 = hessInput:clone():zero()
+ for i=1,10 do
+ local p = dt.logistic(input[i])
+ hessInput2[i] = p * (1.0 - p)
+ end
+ mytester:assertTensorEq(hessInput, hessInput2, 0.000001)
+end
+
+function dttest.DFD()
+ local nExample = 100
+ local batchsize = 4
+ local inputsize = 10
+
+ -- train Random Forest
+ local trainSet, validSet, clusterExamples, inputs, targets = dt.getDenseDummyData(nExample, nil, inputsize)
+ local opt = {
+ activeRatio=0.5,
+ featureBaggingSize=5,
+ nTree=4,
+ maxLeafNodes=nExample/2,
+ minLeafSize=nExample/10,
+ }
+ local trainer = dt.RandomForestTrainer(opt)
+ local df = trainer:train(trainSet, trainSet.featureIds)
+ mytester:assert(#df.trees == opt.nTree)
+
+ local dfd = nn.DFD(df)
+ dfd = nn.DFD(dfd:getReconstructionInfo())
+ local dfd2 = nn.DFD(dfd:getReconstructionInfo(), true)
+ local input = validSet.input:sub(1,batchsize)
+ local output = dfd:forward(input)
+ local output2 = dfd2:forward(input)
+
+ local _ = require 'moses'
+
+ local function hasKey(keys,key)
+ local found = false
+ keys:apply(function(x)
+ if x == key then
+ found = true
+ end
+ end)
+ return found
+ end
+
+ for i=1,batchsize do
+ local nodes = {}
+ local keys = output[1][i]
+ local keys2 = output2[1][i]
+ for j,tree in ipairs(df.trees) do
+ local stack = {}
+ tree:score(input[i], stack)
+ mytester:assert(hasKey(keys2, stack[#stack]._nodeId))
+
+ for k,node in ipairs(stack) do
+ if k > 1 then
+ assert(node._nodeId)
+ mytester:assert(hasKey(keys, node._nodeId), string.format("missing key=%d in %s", node._nodeId, tostring(keys)))
+ table.insert(nodes, node._nodeId)
+ end
+ end
+ end
+ mytester:assert(#nodes == keys:size(1))
+ mytester:assert(#df.trees == keys2:size(1))
+ end
+end
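+
+-- For reference (an informal reading of the assertions above): nn.DFD maps a
+-- dense input to sparse keys by routing it through each tree of the forest.
+-- The default module emits the ids of every visited node below the root,
+-- while constructing it with a second argument of true emits only the leaf
+-- node id reached in each tree (hence one key per tree in keys2).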
+
+function dttest.DFDDouble()
+ local nExample = 100
+ local batchsize = 4
+ local inputsize = 10
+
+ -- train Random Forest
+ local trainSet, validSet, clusterExamples, inputs, targets = dt.getDenseDummyData(nExample, nil, inputsize)
+ local opt = {
+ activeRatio=0.5,
+ featureBaggingSize=5,
+ nTree=4,
+ maxLeafNodes=nExample/2,
+ minLeafSize=nExample/10,
+ }
+ local trainer = dt.RandomForestTrainer(opt)
+ local df = trainer:train(trainSet, trainSet.featureIds)
+ mytester:assert(#df.trees == opt.nTree)
+
+ local dfd = nn.DFD(df)
+ dfd:double()
+ dfd = nn.DFD(dfd:getReconstructionInfo())
+ local dfd2 = nn.DFD(dfd:getReconstructionInfo(), true)
+ local input = validSet.input:sub(1,batchsize):double()
+ local output = dfd:forward(input)
+ local output2 = dfd2:forward(input)
+
+ local _ = require 'moses'
+
+ local function hasKey(keys,key)
+ local found = false
+ keys:apply(function(x)
+ if x == key then
+ found = true
+ end
+ end)
+ return found
+ end
+
+ for i=1,batchsize do
+ local nodes = {}
+ local keys = output[1][i]
+ local keys2 = output2[1][i]
+ for j,tree in ipairs(df.trees) do
+ local stack = {}
+ tree:score(input[i], stack)
+ mytester:assert(hasKey(keys2, stack[#stack]._nodeId))
+
+ for k,node in ipairs(stack) do
+ if k > 1 then
+ assert(node._nodeId)
+ mytester:assert(hasKey(keys, node._nodeId), string.format("missing key=%d in %s", node._nodeId, tostring(keys)))
+ table.insert(nodes, node._nodeId)
+ end
+ end
+ end
+ mytester:assert(#nodes == keys:size(1))
+ mytester:assert(#df.trees == keys2:size(1))
+ end
+end
+
+function dttest.uniquecounts() -- DEPRECATED
+ local target = torch.LongTensor(100):random(1,3)
+ local input = torch.Tensor()
+ local inputset = {input=input, target=target}
+
+ local counts = dt.uniquecounts(nil, inputset, 3)
+
+ mytester:assert(counts:sum() == 100)
+ mytester:assert(counts:nElement() == 3)
+
+ local res = torch.Tensor(3):zero()
+ target:apply(function(t) res[t] = res[t] + 1 end)
+
+ mytester:assertTensorEq(counts, res, 0.000001)
+end
+
+function dttest.entropy() -- DEPRECATED
+ -- 2 clusters with a bit overlap between classes:
+ local input = torch.Tensor(100,2)
+ input:narrow(1,1,50):normal(-1,.01)
+ input:narrow(1,51,50):normal(2,.01)
+
+ local target = torch.LongTensor(100):fill(3)
+ target:narrow(1,1,45):fill(1)
+ target:narrow(1,56,45):fill(2)
+
+ local inputset = {input=input, target=target}
+
+ -- test entropy()
+ local fullent = dt.entropy(inputset)
+
+ local halfset = {input=input:narrow(1,1,50), target=target:narrow(1,1,50)}
+ local halfent = dt.entropy(halfset)
+
+ local perfectset = {input=input:narrow(1,56,45), target=target:narrow(1,56,45)}
+ local perfectent = dt.entropy(perfectset)
+
+ mytester:assert(fullent > halfent)
+ mytester:assert(halfent > perfectent)
+ mytester:assert(perfectent < 0.0000001 and perfectent >= 0)
+end
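+
+-- For reference, dt.entropy is expected to behave like Shannon entropy over
+-- the class distribution (a sketch): H = -sum_c p_c * log(p_c), so a pure
+-- subset scores ~0 and mixing classes increases the score, as asserted above.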
+
+function dt.test(tests)
+ math.randomseed(os.time())
+ mytester = torch.Tester()
+ mytester:add(dttest)
+ mytester:run(tests)
+end
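+
+-- Typical invocation (an assumed usage sketch, mirroring other torch test
+-- suites built on torch.Tester): require 'decisiontree' and call dt.test()
+-- to run everything, or dt.test{'Sparse2Dense', 'DFD'} to run a subset.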
diff --git a/contrib/lua-torch/decisiontree/utils.h b/contrib/lua-torch/decisiontree/utils.h
new file mode 100644
index 000000000..8a0196a58
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/utils.h
@@ -0,0 +1,45 @@
+#include "error.h"
+
+#define check_tensors(L, a, b) \
+ do { \
+ if ((a)->nDimension != (b)->nDimension) \
+ return LUA_HANDLE_ERROR_STR((L), "different tensor dimensions"); \
+ for (int __local__var = 0; __local__var < (a)->nDimension; __local__var++) \
+ if ((a)->size[__local__var] != (b)->size[__local__var]) \
+ return LUA_HANDLE_ERROR_STR((L), "different tensor sizes"); \
+ } while (0)
+
+#define check_tensor(L, t, type) \
+ do { \
+ if (!type##_isContiguous(t)) \
+ return LUA_HANDLE_ERROR_STR((L), "tensor should be contiguous"); \
+ } while (0)
+
+#define get_tensor_size(t, type) \
+ (TH##type##Tensor_nElement(t))
+
+#define get_tensor(L, idx, type) \
+ (TH##type##Tensor *)luaT_checkudata(L, idx, "torch." #type "Tensor")
+
+static int push_table_contents(lua_State *L, int arg)
+{
+ int size = 0;
+ while(1) {
+ lua_checkstack(L, 1);
+ lua_rawgeti(L, arg, size + 1);
+ if (lua_isnil(L, -1)) {
+ lua_pop(L, 1);
+ break;
+ }
+ size++;
+ }
+ return size;
+}
+
+#define verify_push_table_contents(L, idx, count) do { \
+ int __tmp_count = push_table_contents(L, idx); \
+ if (__tmp_count != count) { \
+ lua_pop(L, __tmp_count); \
+ LUA_HANDLE_ERROR_STR(L, "Table sizes do not match"); \
+ } \
+ } while(0)
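+
+/* Usage sketch (illustrative only, assuming the usual TH tensor bindings):
+ *   THFloatTensor *a = get_tensor(L, 1, Float);
+ *   THFloatTensor *b = get_tensor(L, 2, Float);
+ *   check_tensors(L, a, b);               // same nDimension and sizes
+ *   check_tensor(L, a, THFloatTensor);    // contiguity check
+ *   long n = get_tensor_size(a, Float);   // element count
+ */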
diff --git a/contrib/lua-torch/decisiontree/utils.lua b/contrib/lua-torch/decisiontree/utils.lua
new file mode 100644
index 000000000..c32c3d08b
--- /dev/null
+++ b/contrib/lua-torch/decisiontree/utils.lua
@@ -0,0 +1,125 @@
+local dt = require "decisiontree._env"
+
+-- returns a buffer table local to a thread (not serialized)
+function dt.getBufferTable(name)
+ local dt = require 'decisiontree'
+ assert(torch.type(name) == 'string')
+ dt.buffer = dt.buffer or {}
+ dt.buffer[name] = dt.buffer[name] or {}
+ return dt.buffer[name]
+end
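+
+-- Usage sketch (the buffer name 'mybuffers' is illustrative): each thread
+-- gets its own table, so tensors cached in it are reused across calls
+-- without being serialized between threads.
+-- local buffer = dt.getBufferTable('mybuffers')
+-- buffer.tensor = buffer.tensor or torch.Tensor()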
+
+function dt.getSparseDummyData(nExample, nCluster, nFeature, overlap, nValid, nActive)
+ local dt = require 'decisiontree'
+ if torch.type(nExample) == 'table' then
+ local opt = nExample
+ nExample = opt.nExample
+ nCluster = opt.nCluster
+ nFeature = opt.nFeature
+ overlap = opt.overlap
+ nValid = opt.nValid
+ nActive = opt.nActive
+ end
+ nExample = nExample or 100 -- training set size
+ nCluster = nCluster or 10
+ assert(nCluster >= 2)
+ nFeature = math.max(2, nFeature or 10)
+ overlap = overlap or 0
+ nValid = nValid or nExample/10 -- validation set size
+ nActive = nActive or math.max(2, nFeature / 2)
+
+ -- sample nCluster centers
+ local clusterCenter = torch.rand(nCluster, nFeature)
+ local clusterLabel = torch.LongTensor(nCluster)
+ local clusterExamples = {}
+ for i=1,nCluster do
+ clusterCenter[i]:add(i)
+ clusterLabel[i] = i % 2
+ clusterExamples[i] = {}
+ end
+
+ local sparseCenter = torch.Tensor()
+
+ local shuffle = torch.LongTensor()
+
+ -- build dataset in sparse (keys/values) format
+ local inputs = {}
+ local targets = torch.Tensor(nExample+nValid)
+ for i=1,nExample+nValid do
+ local clusterIdx = torch.random(1,nCluster)
+ table.insert(clusterExamples[clusterIdx], i)
+
+ shuffle:randperm(nFeature)
+ local keys = torch.LongTensor(nActive):copy(shuffle:narrow(1,1,nActive))
+ sparseCenter:index(clusterCenter[clusterIdx], 1, keys)
+ local stdiv = i <= nExample and 100 or 1000
+ local values = torch.randn(nActive):div(stdiv):add(sparseCenter)
+
+ table.insert(inputs, torch.SparseTensor(keys, values))
+
+ local label = clusterLabel[clusterIdx]
+ if math.random() < overlap then
+ targets[i] = label == 1 and 0 or 1
+ else
+ targets[i] = label
+ end
+ end
+
+ local _ = require 'moses'
+ local validSet = dt.DataSet(_.slice(inputs, nExample+1, nExample+nValid), targets:narrow(1,nExample+1,nValid))
+ local trainSet = dt.DataSet(_.slice(inputs, 1, nExample), targets:narrow(1,1,nExample))
+
+ return trainSet, validSet, clusterExamples, inputs, targets
+end
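+
+-- Usage sketch: build a small sparse binary-classification problem; the
+-- function also accepts a single table of options, as handled above, e.g.
+-- local trainSet, validSet = dt.getSparseDummyData{nExample=100, nCluster=2, nFeature=10}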
+
+function dt.getDenseDummyData(nExample, nCluster, nFeature, overlap, nValid)
+ local dt = require 'decisiontree'
+ if torch.type(nExample) == 'table' then
+ local opt = nExample
+ nExample = opt.nExample
+ nCluster = opt.nCluster
+ nFeature = opt.nFeature
+ overlap = opt.overlap
+ nValid = opt.nValid
+ end
+ nExample = nExample or 100 -- training set size
+ nCluster = nCluster or 10
+ assert(nCluster >= 2)
+ nFeature = math.max(2, nFeature or 10)
+ overlap = overlap or 0
+ nValid = nValid or nExample/10 -- validation set size
+
+ -- sample nCluster centers
+ local clusterCenter = torch.rand(nCluster, nFeature)
+ local clusterLabel = torch.LongTensor(nCluster)
+ local clusterExamples = {}
+ for i=1,nCluster do
+ clusterCenter[i]:add(i)
+ clusterLabel[i] = i % 2
+ clusterExamples[i] = {}
+ end
+
+ -- build dataset in dense format
+ local inputs = torch.Tensor(nExample+nValid, nFeature)
+ local targets = torch.Tensor(nExample+nValid)
+ for i=1,nExample+nValid do
+ local clusterIdx = torch.random(1,nCluster)
+ table.insert(clusterExamples[clusterIdx], i)
+
+ local stdiv = i <= nExample and 100 or 1000
+ inputs[i]:normal():div(stdiv):add(clusterCenter[clusterIdx])
+
+ local label = clusterLabel[clusterIdx]
+ if math.random() < overlap then
+ targets[i] = label == 1 and 0 or 1
+ else
+ targets[i] = label
+ end
+ end
+
+ local _ = require 'moses'
+ local validSet = dt.DataSet(inputs:narrow(1,nExample+1,nValid), targets:narrow(1,nExample+1,nValid))
+ local trainSet = dt.DataSet(inputs:narrow(1,1,nExample), targets:narrow(1,1,nExample))
+
+ return trainSet, validSet, clusterExamples, inputs, targets
+end
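+
+-- Usage sketch: dense counterpart of getSparseDummyData; this is the call
+-- made by dttest.DFD above:
+-- local trainSet, validSet, clusterExamples, inputs, targets =
+--    dt.getDenseDummyData(100, nil, 10)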