mirrors
/
rspamd
kopia lustrzana https://github.com/vstakhov/rspamd.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
							-- nn.DFD: Decision Forest Discretizer
-- Takes a dense input and outputs a sparse output.
-- Each node in the forest is its own feature.
-- When a node is traversed, its commensurate feature takes on a value of 1.
-- For all non-traversed nodes, the feature is 0.
local DFD, parent = torch.class("nn.DFD", "nn.Module")

-- TODO: add :type, as the default will convert the long tensors
function DFD:__init(df, onlyLastNode)
   parent.__init(self)
   if torch.type(df) == 'table' then
      self:reconstructFromInfo(df)
   else
      assert(torch.type(df) == 'dt.DecisionForest')

      self.rootIds = torch.LongTensor()
      -- nodeId of left and right child nodes
      self.leftChild = torch.LongTensor()
      self.rightChild = torch.LongTensor()
      -- index and value of the feature that splits this node
      self.splitFeatureId = torch.LongTensor()
      self.splitFeatureValue = torch.Tensor()
      -- initialize state given df
      self:convertForest2Tensors(df)
      self:clearState()
   end
   self.onlyLastNode = onlyLastNode
   self.nTrees = self.rootIds:size(1)
end

-- converts a DecisionForest to efficient tensor representation
function DFD:convertForest2Tensors(df)
   self.rootIds:resize(#df.trees)

   -- nodeId will map to featureId
   local nodeId = 0
   -- sets nodeIds of all subnodes
   -- and measures the maximum depth over all trees
   local function recursiveTree(node, depth)
      depth = (depth or 0) + 1
      local rdepth = depth
      nodeId = nodeId + 1
      node._nodeId = nodeId

      if node.leftChild then
         rdepth = math.max(rdepth, recursiveTree(node.leftChild, depth))
      end
      if node.rightChild then
         rdepth = math.max(rdepth, recursiveTree(node.rightChild, depth))
      end
      return rdepth
   end

   -- sum over trees of max depth
   self.depth = 0
   for i,tree in ipairs(df.trees) do
      assert(torch.isTypeOf(tree.root, 'dt.CartNode'))
      self.depth = self.depth + recursiveTree(tree.root)
   end
   -- remove roots from depth
   self.depth = self.depth - self.rootIds:size(1)

   -- total number of nodes in all trees
   self.nNode = nodeId

   -- nodeId of left and right child nodes
   self.leftChild:resize(self.nNode):fill(-1)
   self.rightChild:resize(self.nNode):fill(-1)
   -- index and value of the feature that splits this node
   self.splitFeatureId:resize(self.nNode):fill(-1)
   self.splitFeatureValue:resize(self.nNode):fill(-1)

   -- aggregates CartNode attributes to an efficient tensor representation
   local function recursiveTree2(node)
      local nodeId = assert(node._nodeId)
      assert(self.splitFeatureId[nodeId] == -1)

      if node.leftChild then
         self.leftChild[nodeId] = assert(node.leftChild._nodeId)
         recursiveTree2(node.leftChild)
      else
      	 self.leftChild[nodeId] = 0
      end

      if node.rightChild then
         self.rightChild[nodeId] = assert(node.rightChild._nodeId)
         recursiveTree2(node.rightChild)
      else
      	 self.rightChild[nodeId] = 0
      end

      -- each node splits the dataset on a feature id-value pair
      self.splitFeatureId[nodeId] = assert(node.splitFeatureId)
      self.splitFeatureValue[nodeId] = assert(node.splitFeatureValue)
   end

   for i,tree in ipairs(df.trees) do
      self.rootIds[i] = assert(tree.root._nodeId)
      recursiveTree2(tree.root)
   end

   assert(self.leftChild:min() >= 0)
   assert(self.rightChild:min() >= 0)
end

-- input is a batchsize x inputsize tensor
function DFD:updateOutput(input)
   assert(torch.isTensor(input))
   assert(input:dim() == 2)
   input = input:contiguous()

   local batchsize, inputsize = input:size(1), input:size(2)
   local size = self.onlyLastNode and self.nTree or self.depth

   -- each sample's output keys is resized to maxdepth, which is the maximum size that it can take on
   self.outputkeys = self.outputkeys or torch.LongTensor()
   self.outputkeys:resize(batchsize, size)
   -- values are 1
   self.outputvalues = self.outputvalues or input.new()
   self.outputvalues:resize(batchsize, size):fill(1)

   self.output = input.nn.DFD_computeOutput(self.outputkeys, self.outputvalues, self.rootIds, self.leftChild, self.rightChild, self.splitFeatureId, self.splitFeatureValue, input, self.onlyLastNode)
   return self.output
end

function DFD:type(type, tensorCache)
   if type then
      local info = self:getReconstructionInfo()
      for k, v in pairs(info) do
         if torch.type(v) ~= 'torch.LongTensor' then
            info[k] = nil
         end
      end
      parent.type(self, type, tensorCache)
      self:reconstructFromInfo(info)
      return self
   else
      return parent.type(self)
   end
end

function DFD:updateGradInput()
   error"Not Implemented"
end

function DFD:clearState()
   self.output = {{},{}}
   self.taskbuffer = {}
   self.outputkeys = nil
   self.outputvalues = nil
   self._range = nil
   self._indices = nil
   self._mask = nil
end

function DFD:reconstructFromInfo(DFDinfo)
   for k,v in pairs(DFDinfo) do
      self[k] = v
   end
   assert(self.leftChild:nDimension() == 1)
   assert(self.rightChild:nDimension() == 1)
   assert(self.leftChild:size(1) == self.nNode)
   assert(self.rightChild:size(1) == self.nNode)
   assert(self.leftChild:min() >= 0)
   assert(self.rightChild:min() >= 0)
   assert(self.splitFeatureId:nDimension() == 1)
   assert(self.splitFeatureValue:nDimension() == 1)
   assert(self.splitFeatureId:size(1) == self.splitFeatureValue:size(1))
end

function DFD:getReconstructionInfo()
   local DFDinfo = {
      nNode = self.nNode,
      rootIds = self.rootIds,
      leftChild = self.leftChild,
      rightChild = self.rightChild,
      splitFeatureId = self.splitFeatureId,
      splitFeatureValue = self.splitFeatureValue,
      depth = self.depth
   }
   return DFDinfo
end