Diffstat (limited to 'contrib/lua-torch/nn/lib/THNN/generic')
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/Abs.c  28
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/AbsCriterion.c  40
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/BCECriterion.c  66
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/BatchNormalization.c  149
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/ClassNLLCriterion.c  163
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/DistKLDivCriterion.c  44
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/ELU.c  54
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/FusedRNNKernel.c  55
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/GatedLinearUnit.c  73
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/HardShrink.c  42
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/HardTanh.c  133
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/IndexLinear.c  742
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/L1Cost.c  38
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/LeakyReLU.c  57
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/Linear.c  114
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/LogSigmoid.c  36
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/LogSoftMax.c  137
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/LookupTable.c  225
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/MSECriterion.c  45
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/MarginCriterion.c  47
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/MultiLabelMarginCriterion.c  184
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/MultiMarginCriterion.c  168
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/PReLU.c  207
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/RReLU.c  132
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/Sigmoid.c  28
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SmoothL1Criterion.c  49
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SoftMarginCriterion.c  44
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SoftMax.c  150
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SoftPlus.c  47
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SoftShrink.c  42
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SparseLinear.c  564
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveAveragePooling.c  258
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveMaxPooling.c  274
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialAveragePooling.c  329
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialClassNLLCriterion.c  131
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionLocal.c  367
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMM.c  377
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMap.c  277
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialDepthWiseConvolution.c  528
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedConvolution.c  408
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedMaxPooling.c  401
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialFractionalMaxPooling.c  253
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolution.c  462
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolutionMap.c  222
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxPooling.c  44
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxUnpooling.c  234
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialReflectionPadding.c  260
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialReplicationPadding.c  260
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialSubSampling.c  302
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingBilinear.c  174
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingNearest.c  199
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/Sqrt.c  52
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/Square.c  59
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/THNN.h  1501
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/Tanh.c  49
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/TemporalConvolution.c  398
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/TemporalMaxPooling.c  283
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/TemporalRowConvolution.c  472
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/TemporalSubSampling.c  156
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/Threshold.c  64
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricAveragePooling.c  373
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolution.c  260
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolutionMM.c  628
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedConvolution.c  420
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedMaxPooling.c  515
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricFractionalMaxPooling.c  279
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricFullConvolution.c  541
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxPooling.c  50
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxUnpooling.c  373
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricReplicationPadding.c  357
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingNearest.c  226
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c  213
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/unfold.c  166
73 files changed, 17098 insertions(+), 0 deletions(-)
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Abs.c b/contrib/lua-torch/nn/lib/THNN/generic/Abs.c
new file mode 100644
index 000000000..28721ec8e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Abs.c
@@ -0,0 +1,28 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Abs.c"
+#else
+
+void THNN_(Abs_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ THTensor_(resizeAs)(output, input);
+ THTensor_(abs)(output, input);
+}
+
+void THNN_(Abs_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput)
+{
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ real z = *input_data;
+ *gradInput_data = *gradOutput_data * (z >= 0 ? 1 : -1);
+ );
+}
+
+#endif
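
For reference, the backward pass above reduces to the subgradient of |x|, with sign(0) taken as +1 (the `z >= 0 ? 1 : -1` convention). A minimal standalone sketch in plain C, with illustrative values and no TH dependencies:

```c
#include <stdio.h>
#include <math.h>

int main(void) {
  double x[4] = {-2.0, -0.5, 0.0, 3.0};
  double gradOut[4] = {1.0, 1.0, 1.0, 1.0};
  for (int i = 0; i < 4; i++) {
    double y = fabs(x[i]);                                   /* forward */
    double gradIn = gradOut[i] * (x[i] >= 0 ? 1.0 : -1.0);   /* subgradient */
    printf("x=% .1f  y=%.1f  gradIn=% .1f\n", x[i], y, gradIn);
  }
  return 0;
}
```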
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/AbsCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/AbsCriterion.c
new file mode 100644
index 000000000..9bee5de9e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/AbsCriterion.c
@@ -0,0 +1,40 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/AbsCriterion.c"
+#else
+
+void THNN_(AbsCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage)
+{
+ real sum = 0;
+ THNN_CHECK_NELEMENT(input, target);
+ TH_TENSOR_APPLY2(real, input, real, target,
+ sum += fabs(*input_data - *target_data);
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(AbsCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ *gradInput_data = (*input_data - *target_data) >= 0 ? norm : -norm;
+ );
+}
+
+#endif
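
The criterion is the L1 distance, optionally averaged over the element count; the gradient is +norm or -norm depending on the sign of (input - target). A self-contained sketch of the same arithmetic, with made-up values:

```c
#include <stdio.h>
#include <math.h>

int main(void) {
  double input[3] = {0.5, -1.0, 2.0};
  double target[3] = {1.0, -1.0, 0.0};
  int n = 3, sizeAverage = 1;
  double sum = 0, norm = sizeAverage ? 1.0 / n : 1.0;
  for (int i = 0; i < n; i++)
    sum += fabs(input[i] - target[i]);
  if (sizeAverage) sum /= n;
  printf("loss = %.4f\n", sum);                /* (0.5 + 0 + 2) / 3 */
  for (int i = 0; i < n; i++)                  /* gradient is +-norm */
    printf("grad[%d] = % .4f\n", i, (input[i] - target[i]) >= 0 ? norm : -norm);
  return 0;
}
```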
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/BCECriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/BCECriterion.c
new file mode 100644
index 000000000..637a4067e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/BCECriterion.c
@@ -0,0 +1,66 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/BCECriterion.c"
+#else
+
+#define EPS 1e-12
+
+void THNN_(BCECriterion_updateOutput)(THNNState *state, THTensor *input,
+ THTensor *target, THTensor *output,
+ bool sizeAverage, THTensor *weights)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_NELEMENT(input, weights);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+ real sum = 0;
+
+ if(weights)
+ TH_TENSOR_APPLY3(real, input, real, target, real, weights,
+ real x = *input_data;
+ real y = *target_data;
+ real w = *weights_data;
+ THAssertMsg(x >= 0. && x <= 1.,
+                  "input value should be between 0 and 1, but got %f",
+ (double) x);
+ sum -= (log(x + EPS) * y + log(1. - x + EPS) * (1. - y)) * w;
+ )
+ else
+ TH_TENSOR_APPLY2(real, input, real, target,
+ real x = *input_data;
+ real y = *target_data;
+ THAssertMsg(x >= 0. && x <= 1.,
+                  "input value should be between 0 and 1, but got %f",
+ (double) x);
+ sum -= log(x + EPS) * y + log(1. - x + EPS) * (1. - y);
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(BCECriterion_updateGradInput)(THNNState *state, THTensor *input,
+ THTensor *target, THTensor *gradInput,
+ bool sizeAverage, THTensor *weights)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_NELEMENT(input, weights);
+
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ real x = *input_data;
+ real y = *target_data;
+ *gradInput_data = - norm * (y - x) / ((1. - x + EPS) * (x + EPS));
+ );
+
+ if(weights)
+ THTensor_(cmul)(gradInput, gradInput, weights);
+}
+
+#undef EPS
+
+#endif
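
Per element the loss is -(y*log(x + EPS) + (1 - y)*log(1 - x + EPS)) and the gradient is -(y - x) / ((1 - x + EPS)*(x + EPS)), exactly the two loops above. A one-element sketch with illustrative values:

```c
#include <stdio.h>
#include <math.h>
#define EPS 1e-12

int main(void) {
  double x = 0.9, y = 1.0;  /* prediction in [0,1], binary target */
  double loss = -(y * log(x + EPS) + (1. - y) * log(1. - x + EPS));
  double grad = -(y - x) / ((1. - x + EPS) * (x + EPS));
  printf("loss = %.6f  grad = %.6f\n", loss, grad);  /* ~0.105361, ~-1.111111 */
  return 0;
}
```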
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/BatchNormalization.c b/contrib/lua-torch/nn/lib/THNN/generic/BatchNormalization.c
new file mode 100644
index 000000000..b8f462790
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/BatchNormalization.c
@@ -0,0 +1,149 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/BatchNormalization.c"
+#else
+
+void THNN_(BatchNormalization_updateOutput)(
+ THNNState *state, THTensor *input, THTensor *output,
+ THTensor *weight, THTensor *bias,
+ THTensor *running_mean, THTensor *running_var,
+ THTensor *save_mean, THTensor *save_std,
+ bool train, double momentum, double eps)
+{
+ THTensor_(resizeAs)(output, input);
+ long nInput = THTensor_(size)(input, 1);
+ long f;
+ ptrdiff_t n = THTensor_(nElement)(input) / nInput;
+
+ #pragma omp parallel for
+ for (f = 0; f < nInput; ++f) {
+ THTensor *in = THTensor_(newSelect)(input, 1, f);
+ THTensor *out = THTensor_(newSelect)(output, 1, f);
+
+ real mean, invstd;
+
+ if (train) {
+ // compute mean per input
+ accreal sum = 0;
+ TH_TENSOR_APPLY(real, in, sum += *in_data;);
+
+ mean = (real) sum / n;
+ THTensor_(set1d)(save_mean, f, (real) mean);
+
+ // compute variance per input
+ sum = 0;
+ TH_TENSOR_APPLY(real, in,
+ sum += (*in_data - mean) * (*in_data - mean););
+
+ if (sum == 0 && eps == 0.0) {
+ invstd = 0;
+ } else {
+ invstd = (real) (1 / sqrt(sum/n + eps));
+ }
+ THTensor_(set1d)(save_std, f, (real) invstd);
+
+ // update running averages
+ THTensor_(set1d)(running_mean, f,
+ (real) (momentum * mean + (1 - momentum) * THTensor_(get1d)(running_mean, f)));
+
+ accreal unbiased_var = sum / (n - 1);
+ THTensor_(set1d)(running_var, f,
+ (real) (momentum * unbiased_var + (1 - momentum) * THTensor_(get1d)(running_var, f)));
+ } else {
+ mean = THTensor_(get1d)(running_mean, f);
+ invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps);
+ }
+
+ // compute output
+ real w = weight ? THTensor_(get1d)(weight, f) : 1;
+ real b = bias ? THTensor_(get1d)(bias, f) : 0;
+
+ TH_TENSOR_APPLY2(real, in, real, out,
+ *out_data = (real) (((*in_data - mean) * invstd) * w + b););
+
+ THTensor_(free)(out);
+ THTensor_(free)(in);
+ }
+}
+
+void THNN_(BatchNormalization_backward)(
+ THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput,
+ THTensor *gradWeight, THTensor *gradBias, THTensor *weight,
+ THTensor *running_mean, THTensor *running_var,
+ THTensor *save_mean, THTensor *save_std,
+ bool train, double scale, double eps)
+{
+ THNN_CHECK_SHAPE(input, gradOutput);
+ long nInput = THTensor_(size)(input, 1);
+ long f;
+ ptrdiff_t n = THTensor_(nElement)(input) / nInput;
+
+ #pragma omp parallel for
+ for (f = 0; f < nInput; ++f) {
+ THTensor *in = THTensor_(newSelect)(input, 1, f);
+ THTensor *gradOut = THTensor_(newSelect)(gradOutput, 1, f);
+ real w = weight ? THTensor_(get1d)(weight, f) : 1;
+ real mean, invstd;
+ if (train) {
+ mean = THTensor_(get1d)(save_mean, f);
+ invstd = THTensor_(get1d)(save_std, f);
+ } else {
+ mean = THTensor_(get1d)(running_mean, f);
+ invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps);
+ }
+
+ // sum over all gradOutput in feature plane
+ accreal sum = 0;
+ TH_TENSOR_APPLY(real, gradOut, sum += *gradOut_data;);
+
+    // dot product of Q(X) and gradOutput
+ accreal dotp = 0;
+ TH_TENSOR_APPLY2(real, in, real, gradOut,
+ dotp += (*in_data - mean) * (*gradOut_data););
+
+ if (gradInput) {
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor *gradIn = THTensor_(newSelect)(gradInput, 1, f);
+
+ if (train) {
+ // when in training mode
+ // Q(X) = X - E[x] ; i.e. input centered to zero mean
+ // Y = Q(X) / σ ; i.e. BN output before weight and bias
+ // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w
+
+ // projection of gradOutput on to output scaled by std
+ real k = (real) dotp * invstd * invstd / n;
+ TH_TENSOR_APPLY2(real, gradIn, real, in,
+ *gradIn_data = (*in_data - mean) * k;);
+
+ accreal gradMean = sum / n;
+ TH_TENSOR_APPLY2(real, gradIn, real, gradOut,
+ *gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * invstd * w;);
+
+ } else {
+ // when in evaluation mode
+ // Q(X) = X - running_mean ; i.e. input centered to zero mean
+ // Y = Q(X) / running_std ; i.e. BN output before weight and bias
+        // dL/dX = dL/dY * w / running_std
+ TH_TENSOR_APPLY2(real, gradIn, real, gradOut,
+ *gradIn_data = *gradOut_data * invstd * w;);
+ }
+
+ THTensor_(free)(gradIn);
+ }
+
+ if (gradWeight) {
+ real val = THTensor_(get1d)(gradWeight, f);
+ THTensor_(set1d)(gradWeight, f, val + scale * dotp * invstd);
+ }
+
+ if (gradBias) {
+ real val = THTensor_(get1d)(gradBias, f);
+ THTensor_(set1d)(gradBias, f, val + scale * sum);
+ }
+
+ THTensor_(free)(gradOut);
+ THTensor_(free)(in);
+ }
+}
+
+#endif
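
In training mode the forward pass normalizes each feature plane with the biased variance (sum of squared deviations over n), while the running variance is updated with the unbiased estimate (over n - 1). A standalone sketch of the normalization for one plane, with made-up inputs:

```c
#include <stdio.h>
#include <math.h>

int main(void) {
  double in[4] = {1.0, 2.0, 3.0, 4.0};
  int n = 4;
  double eps = 1e-5, w = 1.0, b = 0.0;  /* weight/bias for this plane */
  double sum = 0, mean, var = 0, invstd;
  for (int i = 0; i < n; i++) sum += in[i];
  mean = sum / n;
  for (int i = 0; i < n; i++) var += (in[i] - mean) * (in[i] - mean);
  invstd = 1.0 / sqrt(var / n + eps);   /* biased variance, as in the loop above */
  for (int i = 0; i < n; i++)
    printf("out[%d] = % .4f\n", i, (in[i] - mean) * invstd * w + b);
  return 0;
}
```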
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/ClassNLLCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/ClassNLLCriterion.c
new file mode 100644
index 000000000..4cf37aeaf
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/ClassNLLCriterion.c
@@ -0,0 +1,163 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/ClassNLLCriterion.c"
+#else
+
+void THNN_(ClassNLLCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ THTensor *weights,
+ THTensor *total_weight,
+ long ignore_index)
+{
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+ THNN_CHECK_DIM_SIZE(total_weight, 1, 0, 1);
+ int n_dims = THTensor_(nDimension)(input);
+ int n_classes = THTensor_(size)(input, n_dims - 1);
+ ignore_index -= TH_INDEX_BASE;
+
+ if (THIndexTensor_(nDimension)(target) > 1) {
+ THError("multi-target not supported");
+ }
+ if (THTensor_(nDimension)(input) > 2) {
+ THError("input tensor should be 1D or 2D");
+ }
+ if (weights && THTensor_(nElement)(weights) != n_classes) {
+ THDescBuff s1 = THTensor_(sizeDesc)(weights);
+ THError("weight tensor should be defined either for all %d classes or no classes"
+ " but got weight tensor of shape: %s", n_classes, s1.str);
+ }
+
+ input = THTensor_(newContiguous)(input);
+ target = THIndexTensor_(newContiguous)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+
+ real *input_data = THTensor_(data)(input);
+ THIndex_t *target_data = THIndexTensor_(data)(target);
+ real *weights_data = weights ? THTensor_(data)(weights) : NULL;
+ real *output_data = THTensor_(data)(output);
+ real *total_weight_data = THTensor_(data)(total_weight);
+
+ output_data[0] = total_weight_data[0] = 0.0;
+
+ if (THTensor_(nDimension)(input) == 1) {
+ int cur_target = target_data[0] - TH_INDEX_BASE;
+ if (cur_target != ignore_index) {
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+ total_weight_data[0] = weights ? weights_data[cur_target] : 1.0f;
+ output_data[0] = -input_data[cur_target] * total_weight_data[0];
+ }
+ } else if (THTensor_(nDimension)(input) == 2) {
+ int batch_size = THTensor_(size)(input, 0);
+ THAssert(THIndexTensor_(size)(target, 0) == batch_size);
+
+ int n_target = THTensor_(size)(input, 1);
+
+ int i;
+ for (i = 0; i < batch_size; i++) {
+ int cur_target = target_data[i] - TH_INDEX_BASE;
+ if (cur_target != ignore_index) {
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+
+ real cur_weight = weights ? weights_data[cur_target] : 1.0f;
+ total_weight_data[0] += cur_weight;
+ output_data[0] -= input_data[i * n_target + cur_target] * cur_weight;
+ }
+ }
+ }
+
+ if (sizeAverage && total_weight_data[0]) {
+ output_data[0] /= total_weight_data[0];
+ }
+
+ if (weights) {
+ THTensor_(free)(weights);
+ }
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+}
+
+void THNN_(ClassNLLCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ THTensor *weights,
+ THTensor *total_weight,
+ long ignore_index)
+{
+ int n_dims = THTensor_(nDimension)(input);
+ int n_classes = THTensor_(size)(input, n_dims - 1);
+ ignore_index -= TH_INDEX_BASE;
+
+ if (!THTensor_(isContiguous)(gradInput)) {
+ THError("gradInput must be contiguous");
+ }
+
+ real *total_weight_data = THTensor_(data)(total_weight);
+
+ if (!(*total_weight_data > 0)) {
+ return;
+ }
+
+ if (THIndexTensor_(nDimension)(target) > 1) {
+ THError("multi-target not supported");
+ }
+
+ if (THTensor_(nDimension)(input) > 2) {
+ THError("input tensor should be 1D or 2D");
+ }
+
+ if (weights && THTensor_(nElement)(weights) != n_classes) {
+ THError("weight tensor should be defined either for all or no classes");
+ }
+
+ target = THIndexTensor_(newContiguous)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+
+ THIndex_t *target_data = THIndexTensor_(data)(target);
+ real *weights_data = weights ? THTensor_(data)(weights) : NULL;
+ real *gradInput_data = THTensor_(data)(gradInput);
+
+ if (THTensor_(nDimension)(input) == 1) {
+ int cur_target = target_data[0] - TH_INDEX_BASE;
+ if (cur_target != ignore_index) {
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+
+ gradInput_data[cur_target] =
+ (!sizeAverage && weights) ? -weights_data[cur_target] : -1;
+ }
+
+ } else if (THTensor_(nDimension)(input) == 2) {
+ int batch_size = THTensor_(size)(input, 0);
+ THAssert(THIndexTensor_(size)(target, 0) == batch_size);
+
+ int n_target = THTensor_(size)(input, 1);
+
+ int i;
+ for (i = 0; i < batch_size; i++){
+ int cur_target = target_data[i] - TH_INDEX_BASE;
+
+ if (cur_target != ignore_index) {
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+
+ gradInput_data[i * n_target + cur_target] =
+ -(weights ? weights_data[cur_target] : 1.0f);
+
+ if (sizeAverage && *total_weight_data) {
+ gradInput_data[i * n_target + cur_target] /= *total_weight_data;
+ }
+ }
+ }
+ }
+
+ THIndexTensor_(free)(target);
+ if (weights) {
+ THTensor_(free)(weights);
+ }
+}
+
+#endif
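
With sizeAverage set, the 2D case computes loss = -sum_i w[t_i] * input[i][t_i] / sum_i w[t_i] over the batch, skipping ignore_index entries. A small sketch assuming 0-based targets and illustrative log-probabilities (ignore_index handling omitted for brevity):

```c
#include <stdio.h>

int main(void) {
  double input[2][3] = {{-0.2, -1.8, -2.5}, {-1.0, -0.6, -2.0}};
  long target[2] = {0, 1};               /* 0-based class indices */
  double weights[3] = {1.0, 2.0, 1.0};   /* per-class weights */
  double total_weight = 0, output = 0;
  for (int i = 0; i < 2; i++) {
    double cw = weights[target[i]];
    total_weight += cw;
    output -= input[i][target[i]] * cw;
  }
  if (total_weight > 0) output /= total_weight;  /* sizeAverage */
  printf("loss = %.4f\n", output);               /* (0.2*1 + 0.6*2) / 3 */
  return 0;
}
```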
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/DistKLDivCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/DistKLDivCriterion.c
new file mode 100644
index 000000000..6bd6aa067
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/DistKLDivCriterion.c
@@ -0,0 +1,44 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/DistKLDivCriterion.c"
+#else
+
+void THNN_(DistKLDivCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+
+ real sum = 0;
+
+ TH_TENSOR_APPLY2(real, input, real, target,
+ sum += *target_data > 0 ? *target_data * (log(*target_data) - *input_data) : 0;
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(DistKLDivCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ *gradInput_data = *target_data > 0 ? norm * (-*target_data) : 0;
+ );
+}
+
+#endif
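
Each element contributes target * (log(target) - input) when target > 0, i.e. the input is expected to hold log-probabilities. A sketch with illustrative distributions:

```c
#include <stdio.h>
#include <math.h>

int main(void) {
  double input[3] = {log(0.2), log(0.5), log(0.3)};  /* log-probabilities */
  double target[3] = {0.1, 0.6, 0.3};                /* probabilities */
  double sum = 0;
  for (int i = 0; i < 3; i++)
    if (target[i] > 0)
      sum += target[i] * (log(target[i]) - input[i]);
  printf("KL = %.6f (always >= 0)\n", sum / 3);      /* sizeAverage over 3 */
  return 0;
}
```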
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/ELU.c b/contrib/lua-torch/nn/lib/THNN/generic/ELU.c
new file mode 100644
index 000000000..ddcfb9705
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/ELU.c
@@ -0,0 +1,54 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/ELU.c"
+#else
+
+void THNN_(ELU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal alpha_,
+ bool inplace)
+{
+ real alpha = TH_CONVERT_ACCREAL_TO_REAL(alpha_);
+ if(inplace) {
+ TH_TENSOR_APPLY(real, input,
+ if(*input_data <= 0) {
+ *input_data = (exp(*input_data) - 1) * alpha;
+ }
+ );
+ THTensor_(set)(output, input);
+ } else {
+ THTensor_(resizeAs)(output, input);
+ TH_TENSOR_APPLY2(real, input, real, output,
+ *output_data = *input_data <= 0 ? (exp(*input_data)-1)*alpha : *input_data;
+ );
+ }
+}
+
+void THNN_(ELU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output,
+ accreal alpha_,
+ bool inplace)
+{
+ real alpha = TH_CONVERT_ACCREAL_TO_REAL(alpha_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ if(inplace) {
+ TH_TENSOR_APPLY2(real, gradOutput, real, output,
+ if(*output_data <= 0) {
+ *gradOutput_data *= *output_data + alpha;
+ }
+ );
+ THTensor_(set)(gradInput, gradOutput);
+ } else {
+ THTensor_(resizeAs)(gradInput, output);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+ *gradInput_data = *output_data <= 0 ? *gradOutput_data * (*output_data + alpha) : *gradOutput_data;
+ );
+ }
+}
+
+#endif
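
The backward pass reuses the saved output rather than the input: on the negative side ELU'(x) = exp(x) = output + alpha, which is why only `output` and `alpha` appear in the gradient above. A standalone sketch with alpha = 1 and illustrative inputs:

```c
#include <stdio.h>
#include <math.h>

int main(void) {
  double alpha = 1.0;
  double xs[3] = {-2.0, 0.0, 1.5};
  for (int i = 0; i < 3; i++) {
    double x = xs[i];
    double y = x <= 0 ? (exp(x) - 1) * alpha : x;  /* forward */
    double dy = y <= 0 ? y + alpha : 1.0;          /* == exp(x) for x <= 0 */
    printf("x=% .1f  y=% .4f  dy=% .4f\n", x, y, dy);
  }
  return 0;
}
```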
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/FusedRNNKernel.c b/contrib/lua-torch/nn/lib/THNN/generic/FusedRNNKernel.c
new file mode 100644
index 000000000..30788b0a2
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/FusedRNNKernel.c
@@ -0,0 +1,55 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/FusedRNNKernel.c"
+#else
+
+void THNN_(GRUFused_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *hidden,
+ THTensor *bias1,
+ THTensor *bias2,
+ THTensor *hx,
+ THTensor *hy,
+ THTensor *storage)
+{
+ THAssertMsg(false, "Not implemented for CPU");
+}
+
+void THNN_(GRUFused_updateGradInput)(
+ THNNState *state,
+ THTensor *gradInInput,
+ THTensor *gradInHidden,
+ THTensor *gradOutput,
+ THTensor *gradInputHx,
+ THTensor *storage)
+{
+ THAssertMsg(false, "Not implemented for CPU");
+}
+
+void THNN_(LSTMFused_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *hidden,
+ THTensor *bias1,
+ THTensor *bias2,
+ THTensor *cx,
+ THTensor *hy,
+ THTensor *cy)
+{
+ THAssertMsg(false, "Not implemented for CPU");
+}
+
+void THNN_(LSTMFused_updateGradInput)(
+ THNNState *state,
+ THTensor *storage,
+ THTensor *gradInGates,
+ THTensor *prevC,
+ THTensor *cy,
+ THTensor *gradOutput,
+ THTensor *gradOutputCell,
+ THTensor *gradInputCx)
+{
+ THAssertMsg(false, "Not implemented for CPU");
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/GatedLinearUnit.c b/contrib/lua-torch/nn/lib/THNN/generic/GatedLinearUnit.c
new file mode 100644
index 000000000..274a27e3b
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/GatedLinearUnit.c
@@ -0,0 +1,73 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/GatedLinearUnit.c"
+#else
+
+void THNN_(GatedLinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int dim)
+{
+ // size output to half of input
+ dim = dim - TH_INDEX_BASE;
+ const long nIn = THTensor_(size)(input, dim);
+ THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld",
+ dim + TH_INDEX_BASE, nIn);
+
+ const long inputSize = THTensor_(size)(input, dim) / 2;
+ THLongStorage *newSizes = THTensor_(newSizeOf)(input);
+ THLongStorage_set(newSizes, dim, inputSize);
+ THTensor_(resize)(output, newSizes, NULL);
+
+ // halve tensor
+ THTensor *firstHalf = THTensor_(newNarrow)(input, dim, 0, inputSize);
+ THTensor *secondHalf = THTensor_(newNarrow)(input, dim, inputSize, inputSize);
+
+ // x = x1:cmul( sigmoid(x2) )
+ THTensor_(sigmoid)(output, secondHalf);
+ THTensor_(cmul)(output, output, firstHalf);
+
+ THLongStorage_free(newSizes);
+ THTensor_(free)(firstHalf);
+ THTensor_(free)(secondHalf);
+}
+
+void THNN_(GatedLinear_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int dim)
+{
+ // set up tensors
+ dim = dim - TH_INDEX_BASE;
+ const long nIn = THTensor_(size)(input, dim);
+ THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld",
+ dim + TH_INDEX_BASE, nIn);
+
+ THTensor_(resizeAs)(gradInput, input);
+ const long inputSize = THTensor_(size)(input, dim) / 2;
+ THTensor *firstHalf = THTensor_(newNarrow)(input, dim, 0, inputSize);
+ THTensor *secondHalf = THTensor_(newNarrow)(input, dim, inputSize, inputSize);
+ THTensor *gradInputfirstHalf = THTensor_(newNarrow)(gradInput, dim, 0, inputSize);
+ THTensor *gradInputsecondHalf = THTensor_(newNarrow)(gradInput, dim, inputSize, inputSize);
+
+ THTensor_(sigmoid)(gradInputfirstHalf, secondHalf);
+
+ TH_TENSOR_APPLY2(real, gradInputsecondHalf, real, gradInputfirstHalf,
+ real z = *gradInputfirstHalf_data;
+ *gradInputsecondHalf_data = (1. - z) * z;
+ );
+
+ THTensor_(cmul)(gradInputfirstHalf, gradInputfirstHalf, gradOutput);
+
+ THTensor_(cmul)(gradInputsecondHalf, gradInputsecondHalf, gradOutput);
+ THTensor_(cmul)(gradInputsecondHalf, gradInputsecondHalf, firstHalf);
+
+ THTensor_(free)(firstHalf);
+ THTensor_(free)(secondHalf);
+ THTensor_(free)(gradInputfirstHalf);
+ THTensor_(free)(gradInputsecondHalf);
+}
+
+#endif
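
The module splits the chosen dimension in half and computes x1 * sigmoid(x2), which is what the narrow/sigmoid/cmul sequence above does. A sketch over a flat vector with illustrative values:

```c
#include <stdio.h>
#include <math.h>

static double sigmoid(double x) { return 1.0 / (1.0 + exp(-x)); }

int main(void) {
  double in[4] = {1.0, -2.0, 0.5, 3.0};  /* x1 = {1, -2}, x2 = {0.5, 3} */
  int half = 2;
  for (int k = 0; k < half; k++)
    printf("out[%d] = %.4f\n", k, in[k] * sigmoid(in[half + k]));
  return 0;
}
```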
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/HardShrink.c b/contrib/lua-torch/nn/lib/THNN/generic/HardShrink.c
new file mode 100644
index 000000000..aaae85bac
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/HardShrink.c
@@ -0,0 +1,42 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/HardShrink.c"
+#else
+
+void THNN_(HardShrink_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal lambda_)
+{
+ real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_);
+ THTensor_(resizeAs)(output, input);
+
+ TH_TENSOR_APPLY2(real, output, real, input,
+ if (*input_data > lambda)
+ *output_data = *input_data;
+ else if (*input_data < -lambda)
+ *output_data = *input_data;
+ else
+ *output_data = 0;
+ );
+}
+
+void THNN_(HardShrink_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal lambda_)
+{
+ real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ if (*input_data > lambda || *input_data < -lambda)
+ *gradInput_data = *gradOutput_data;
+ else
+ *gradInput_data = 0;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/HardTanh.c b/contrib/lua-torch/nn/lib/THNN/generic/HardTanh.c
new file mode 100644
index 000000000..589a66e15
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/HardTanh.c
@@ -0,0 +1,133 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/HardTanh.c"
+#else
+
+void THNN_(HardTanh_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal min_val_,
+ accreal max_val_,
+ bool inplace)
+{
+ real min_val = TH_CONVERT_ACCREAL_TO_REAL(min_val_);
+ real max_val = TH_CONVERT_ACCREAL_TO_REAL(max_val_);
+ if (inplace)
+ THTensor_(set)(output, input);
+ else
+ THTensor_(resizeAs)(output, input);
+
+ if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
+ {
+ if (inplace)
+ TH_TENSOR_APPLY(real, input,
+ if (*input_data < min_val)
+ *input_data = min_val;
+ else if (*input_data > max_val)
+ *input_data = max_val;
+ );
+    else
+      TH_TENSOR_APPLY2(real, output, real, input,
+        if (*input_data < min_val)
+          *output_data = min_val;
+        else if (*input_data <= max_val)
+          *output_data = *input_data;
+        else
+          *output_data = max_val;
+      );
+ }
+ else
+ {
+ real* ptr_input = THTensor_(data)(input);
+ real* ptr_output = THTensor_(data)(output);
+ ptrdiff_t i;
+ ptrdiff_t n = THTensor_(nElement)(input);
+
+ if (inplace)
+#pragma omp parallel for private(i)
+ for (i = 0; i < n; i++)
+ {
+ if (ptr_input[i] < min_val)
+ ptr_input[i] = min_val;
+ else if (ptr_input[i] > max_val)
+ ptr_input[i] = max_val;
+ }
+ else
+#pragma omp parallel for private(i)
+ for (i = 0; i < n; i++)
+ {
+ if (ptr_input[i] < min_val)
+ ptr_output[i] = min_val;
+ else if (ptr_input[i] <= max_val)
+ ptr_output[i] = ptr_input[i];
+ else
+ ptr_output[i] = max_val;
+ }
+ }
+}
+
+void THNN_(HardTanh_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal min_val_,
+ accreal max_val_,
+ bool inplace)
+{
+ real min_val = TH_CONVERT_ACCREAL_TO_REAL(min_val_);
+ real max_val = TH_CONVERT_ACCREAL_TO_REAL(max_val_);
+
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ if (inplace)
+ THTensor_(set)(gradInput, gradOutput);
+ else
+ THTensor_(resizeAs)(gradInput, input);
+
+ if (input->nDimension == 1 ||
+ !THTensor_(isContiguous)(input) ||
+ !THTensor_(isContiguous)(gradOutput) ||
+ !THTensor_(isContiguous)(gradInput))
+ {
+ if (inplace)
+ {
+ TH_TENSOR_APPLY2(real, gradOutput, real, input,
+ if (*input_data <= min_val || *input_data >= max_val)
+ *gradOutput_data = 0;
+ );
+ }
+ else
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ if (*input_data <= min_val || *input_data >= max_val)
+ *gradInput_data = 0;
+ else
+ *gradInput_data = *gradOutput_data;
+ );
+ }
+ else
+ {
+ real* ptr_gradOutput = THTensor_(data)(gradOutput);
+ real* ptr_gradInput = THTensor_(data)(gradInput);
+ real* ptr_input = THTensor_(data)(input);
+ ptrdiff_t i;
+ ptrdiff_t n = THTensor_(nElement)(input);
+
+ if (inplace)
+#pragma omp parallel for private(i)
+ for (i = 0; i < n; i++)
+ {
+ if (ptr_input[i] <= min_val || ptr_input[i] >= max_val)
+ ptr_gradInput[i] = 0;
+ }
+ else
+#pragma omp parallel for private(i)
+ for (i = 0; i < n; i++)
+ {
+ if (ptr_input[i] <= min_val || ptr_input[i] >= max_val)
+ ptr_gradInput[i] = 0;
+ else
+ ptr_gradInput[i] = ptr_gradOutput[i];
+ }
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/IndexLinear.c b/contrib/lua-torch/nn/lib/THNN/generic/IndexLinear.c
new file mode 100644
index 000000000..42d8368ba
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/IndexLinear.c
@@ -0,0 +1,742 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/IndexLinear.c"
+#else
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+/* Threshold used to trigger multithreading */
+#ifndef THNN_SPARSE_OMP_THRESHOLD
+#define THNN_SPARSE_OMP_THRESHOLD 100000
+#endif
+
+/* Threshold used to trigger BLAS axpy call */
+#ifndef THNN_SPARSE_OUTDIM_THRESHOLD
+#define THNN_SPARSE_OUTDIM_THRESHOLD 49
+#endif
+
+/* sign MACRO */
+#ifndef THNN_INDEXLINEAR_SIGN
+#define THNN_INDEXLINEAR_SIGN(a) ( ( (a) < 0 ) ? -1 : ( (a) > 0 ) )
+#endif
+
+static bool THNN_(checkKeysValues)(THLongTensor* keys, THTensor* values)
+{
+ return THLongTensor_size(keys, 0) == THTensor_(nElement)(values)
+ && THTensor_(nDimension)(values) == 1
+ && THLongTensor_nDimension(keys) == 1;
+}
+
+void THNN_(IndexLinear_updateOutput)(
+ THNNState *state,
+ THLongTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THLongTensor *sizes,
+ THLongTensor *cumSumSizes,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *normalizedValues,
+ int train)
+{
+ /* Retrieve all the dimensions of the problem */
+ long batchSize = THLongTensor_size(sizes, 0);
+ long keysSize = THLongTensor_size(keys, 0);
+ long outDim = THTensor_(size)(bias, 0);
+ long woutDim = THTensor_(size)(weight, 1);
+ int maxNormalize = woutDim - outDim;
+ long* sizesData = THLongTensor_data(sizes);
+ long* cumSumSizesData = THLongTensor_data(cumSumSizes);
+
+ /* Define/resize the normalized values tensor if maxNormalize is > 0 */
+ real* normalizedValuesData = NULL;
+ if (maxNormalize)
+ {
+ THTensor_(resize1d)(normalizedValues, keysSize);
+ normalizedValuesData = THTensor_(data)(normalizedValues);
+ }
+
+ /* Resize the output */
+ THTensor_(resize2d)(output, batchSize, outDim);
+
+ /* Access the storage data/strides */
+ real* outputData = THTensor_(data)(output);
+ real* valuesData = THTensor_(data)(values);
+ real* weightData = THTensor_(data)(weight);
+ long weightStride0 = weight->stride[0];
+ real* biasData = THTensor_(data)(bias);
+ long* keysData = THLongTensor_data(keys);
+
+ /* Make sure these inputs are contiguous to accelerate computations */
+ THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(output), 6, "output vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous");
+ THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
+ THArgCheck(THTensor_(isContiguous)(normalizedValues), 9, "normalizedValues vector must be contiguous");
+ long i,j,k;
+
+ /* Separate cases: output dimension is == 1, or > 1
+ * This allows for some optimizations. */
+ if (outDim == 1)
+ {
+ THVector_(fill)(outputData, *biasData, batchSize);
+ if (maxNormalize)
+ {
+ /* Parallelize on the batch itself */
+#pragma omp parallel \
+ for private(i,j) \
+ firstprivate(outDim, keysOffset, \
+ weightData, keysData, \
+ valuesData, outputData, \
+ cumSumSizesData, sizesData) \
+ schedule(static) \
+ if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
+ for (j = 0; j < batchSize; j++)
+ {
+ real* loutputData = outputData + j;
+ real val = 0;
+ real absVal = 0;
+ long offset = j == 0 ? 0 : cumSumSizesData[j - 1];
+
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ long woffset = weightStride0*(keysData[offset] + keysOffset);
+ absVal = fabs(valuesData[offset]);
+ if (train)
+ {
+ if (absVal > weightData[woffset])
+ {
+ weightData[woffset] = absVal;
+ weightData[woffset+1] = 1/absVal;
+ }
+
+ /*
+ * The following can be used to scale the size of the updates
+ * depending on some rule, e.g. the frequency of a feature, ...
+ * This is used at update time.
+ * TODO: implement a smarter update scale.
+ */
+ weightData[woffset+2] = 1;
+ }
+ normalizedValuesData[offset] = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(valuesData[offset]):valuesData[offset]*weightData[woffset+1]) + weightData[woffset+3];
+ val += normalizedValuesData[offset] * weightData[woffset+maxNormalize];
+ offset++;
+ }
+ *loutputData += val;
+ }
+ }
+ else
+ {
+ /* Parallelize on the batch itself */
+#pragma omp parallel \
+ for private(i,j) \
+ firstprivate(outDim, weightData, \
+ keysData, valuesData, \
+ outputData, cumSumSizesData, \
+ sizesData) \
+ schedule(static) \
+ if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
+ for (j = 0; j < batchSize; j++)
+ {
+ long offset = j == 0 ? 0 : cumSumSizesData[j - 1];
+ real* loutputData = outputData + j;
+ real val = 0;
+
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ val += weightData[weightStride0*(keysData[offset] + keysOffset)] * valuesData[offset];
+ offset++;
+ }
+ *loutputData += val;
+ }
+ }
+ }
+ else {
+#pragma omp parallel \
+ for private(i,j,k) \
+ firstprivate(outDim, weightData, \
+ keysData, valuesData, \
+ biasData, outputData, \
+ cumSumSizesData, sizesData) \
+ schedule(static) \
+ if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
+ for (j = 0; j < batchSize; j++)
+ {
+ long offset = j == 0 ? 0 : cumSumSizesData[j - 1];
+ real val = 0;
+ real* loutputData = outputData + j*outDim;
+ real* lweightData = weightData;
+ memcpy(loutputData, biasData, outDim*sizeof(real));
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ real val;
+ long woffset = weightStride0*(keysData[offset] + keysOffset);
+ if (maxNormalize)
+ {
+ val = valuesData[offset];
+ real absVal = fabs(val);
+ if (train)
+ {
+ if (absVal > weightData[woffset])
+ {
+ weightData[woffset] = absVal;
+ weightData[woffset+1] = 1/absVal;
+ }
+
+ /*
+ * The following can be used to scale the size of the updates
+ * depending on some rule, e.g. the frequency of a feature, ...
+ * The commented section thereafter is just an example of what can be done:
+ *
+ *```
+ * weightData[woffset+2] = weightData[woffset+2]==0?1:(weightData[woffset+2] / (weightData[woffset+2] + 1));
+ * real alpha = 1;
+ * real beta = 0.01;
+ * real gamma = 1 - 0.000001;
+ * real l = weightData[woffset+2]==0?1/gamma:(weightData[woffset+2] - beta) / (alpha - beta);
+ * l = gamma*l;
+ * weightData[woffset+2] = (alpha-beta)*l + beta;
+ * ```
+ *
+ * TODO: implement a smarter update scale.
+ */
+ weightData[woffset+2] = 1;
+ }
+
+ /* Normalize + Clamp */
+ val = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(val):val*weightData[woffset+1]) + weightData[woffset+3];
+ normalizedValuesData[offset] = val;
+
+ lweightData = weightData + woffset + maxNormalize;
+ }
+ else
+ {
+ val = valuesData[offset];
+ lweightData = weightData + woffset;
+ }
+ if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+ {
+ THBlas_(axpy)(outDim, val, lweightData, 1, loutputData, 1);
+ }
+ else
+ {
+ for (k=0; k < outDim; k++)
+ {
+ loutputData[k] += lweightData[k] * val;
+ }
+ }
+ offset++;
+ }
+ }
+ }
+ return;
+}
+
+void THNN_(IndexLinear_updateParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ THLongTensor *runningKeys,
+ THLongTensor *cumSumSizes,
+ long keysOffset,
+ accreal weightDecay_,
+ accreal learningRate_)
+{
+ real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+ real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
+ /* Retrieve all the dimensions of the problem */
+ long outDim = THTensor_(size)(bias, 0);
+ long woutDim = THTensor_(size)(weight, 1);
+ int maxNormalize = woutDim - outDim;
+ long keysSize = THLongTensor_size(runningKeys, 0);
+
+ /* Access the storage data/strides */
+ real* gradWeightData = THTensor_(data)(gradWeight);
+ real* weightData = THTensor_(data)(weight);
+ long weightStride0 = weight->stride[0];
+ real* gradBiasData = THTensor_(data)(gradBias);
+ real* biasData = THTensor_(data)(bias);
+ long* keysData = THLongTensor_data(runningKeys);
+
+ /* Make sure these inputs are contiguous to accelerate computations */
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 1, "gradWeight must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradBias), 2, "gradBias vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(weight), 3, "weight matrix must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(bias), 4, "bias vector must be contiguous");
+ THArgCheck(THLongTensor_isContiguous(runningKeys), 5, "keys vector must be contiguous");
+
+ int j,k;
+ long offset = 0;
+
+ /* Update the bias first */
+ THVector_(cadd)(biasData, biasData, gradBiasData, -learningRate, outDim);
+
+ /* Separate cases: output dimension is == 1, or > 1
+ * This allows for some optimizations.
+ * No multithreading here as this could
+ * corrupt the results (hogwild style) */
+ if (outDim == 1)
+ {
+ if (maxNormalize)
+ {
+ if (weightDecay)
+ {
+ for (j = 0; j < keysSize; j++)
+ {
+ long woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
+ real lr = learningRate*weightData[woffset-2];
+ weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
+ weightData[woffset] -= gradWeightData[2*j+1]*lr - weightDecay * weightData[woffset-2] * weightData[woffset];
+ }
+ }
+ else
+ {
+ for (j = 0; j < keysSize; j++)
+ {
+ long woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
+ real lr = learningRate*weightData[woffset-2];
+ weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
+ weightData[woffset] -= gradWeightData[2*j+1]*lr;
+ }
+ }
+ }
+ else
+ {
+ if (weightDecay)
+ {
+ for (j = 0; j < keysSize; j++)
+ {
+ long woffset = weightStride0*(keysData[j] + keysOffset);
+ weightData[woffset] -= gradWeightData[j]*learningRate + weightDecay * weightData[woffset];
+ }
+ }
+ else
+ {
+ for (j = 0; j < keysSize; j++)
+ {
+ weightData[weightStride0*(keysData[j] + keysOffset)] -= gradWeightData[j]*learningRate;
+ }
+ }
+ }
+ }
+ else
+ {
+ for (j = 0; j < keysSize; j++)
+ {
+ real lr = learningRate;
+ real wd = weightDecay;
+ real* lweightData;
+ long woffset = weightStride0*(keysData[j] + keysOffset);
+ real* lgradWeightData = gradWeightData + j*outDim;
+ if (maxNormalize)
+ {
+ lgradWeightData += j*outDim;
+ /* weightData[woffset + 2] */
+ lweightData = weightData + woffset + maxNormalize - 2;
+ lr = lr*lweightData[0];
+ wd = weightDecay*lweightData[0];
+ /* weightData[woffset + 3] */
+ lweightData++;
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[0] -= lgradWeightData[k]*lweightData[k+1]*lr;
+ }
+ lweightData++;
+ lgradWeightData += outDim;
+ }
+ else
+ {
+ lweightData = weightData + woffset;
+ }
+
+ /* We do sparse weight decay.
+ * We think it makes more sense. */
+ if (weightDecay)
+ {
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[k] -= lweightData[k]*wd;
+ }
+ }
+
+ if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+ {
+ THBlas_(axpy)(outDim, -lr, lgradWeightData, 1, lweightData, 1);
+ }
+ else
+ {
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[k] -= lgradWeightData[k]*lr;
+ }
+ }
+ }
+ }
+}
+
+
+void THNN_(IndexLinear_accUpdateGradParameters)(
+ THNNState *state,
+ THLongTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THLongTensor *sizes,
+ THLongTensor *cumSumSizes,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay_,
+ accreal scale_)
+{
+ real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ /* Retrieve all the dimensions of the problem */
+ long batchSize = THLongTensor_size(sizes, 0);
+ long keysSize = THLongTensor_size(keys, 0);
+ long outDim = THTensor_(size)(bias, 0);
+ long woutDim = THTensor_(size)(weight, 1);
+ int maxNormalize = woutDim - outDim;
+ THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
+
+ /* Access the storage data/strides */
+ real* gradOutputData = THTensor_(data)(gradOutput);
+  real* valuesData = THTensor_(data)(values);
+ real* weightData = THTensor_(data)(weight);
+ real* biasData = THTensor_(data)(bias);
+ long weightStride0 = weight->stride[0];
+ long biasStride = bias->stride[0];
+ long* keysData = THLongTensor_data(keys);
+ long* sizesData = THLongTensor_data(sizes);
+
+ /* Make sure these inputs are contiguous to accelerate computations */
+ THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous");
+
+ int i,j,k;
+
+ /* Separate cases: output dimension is == 1, or > 1
+ * This allows for some optimizations.
+ * No multithreading here as this could
+ * corrupt the results (hogwild style) */
+ if (outDim == 1)
+ {
+ if (maxNormalize)
+ {
+ long offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real* lgradOutputData = gradOutputData + j;
+ *biasData -= *lgradOutputData * scale;
+ real val = *lgradOutputData * scale;
+ real* lweightData = weightData;
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ long idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
+ weightData[idx-1] -= weightData[idx]*val*weightData[idx-2];
+ weightData[idx] -= (val*valuesData[offset] - weightDecay * weightData[idx])*weightData[idx-2];
+ offset++;
+ }
+ }
+
+ offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real* lweightData = weightData;
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ long idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
+ weightData[idx-2] = 0;
+ offset++;
+ }
+ }
+ }
+ else
+ {
+ if (weightDecay)
+ {
+ long offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real* lgradOutputData = gradOutputData + j;
+ *biasData -= *lgradOutputData * scale;
+ real val = *lgradOutputData * scale;
+ real* lweightData = weightData;
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ long idx = weightStride0*(keysData[offset] + keysOffset);
+ weightData[idx] -= val * valuesData[offset] + weightData[idx] * weightDecay;
+ offset++;
+ }
+ }
+ }
+ else
+ {
+ long offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real val = gradOutputData[j] * scale;
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ weightData[(keysData[offset] + keysOffset)*weightStride0] -= val * valuesData[offset];
+ offset++;
+ }
+ *biasData -= val;
+ }
+ }
+ }
+ }
+ else {
+ long offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real val = 0;
+ real* lgradOutputData = gradOutputData + j*outDim;
+ real* lweightData = weightData;
+ THVector_(cadd)(biasData, biasData, lgradOutputData, -scale, outDim);
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ real val = valuesData[offset] * scale;
+ real wd = weightDecay;
+
+ // Max normalize case
+ if (maxNormalize)
+ {
+ lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
+ val *= lweightData[0];
+ wd *= lweightData[0];
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[1] -= lweightData[k+2]*scale*lgradOutputData[k]*lweightData[0];
+ }
+ lweightData += 2;
+ }
+ else
+ {
+ lweightData = weightData + weightStride0*(keysData[offset] + keysOffset);
+ }
+
+ /* We do sparse weight decay.
+ * We think it makes more sense. */
+ if (weightDecay)
+ {
+ if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+ {
+ THBlas_(axpy)(outDim, -wd, lweightData, 1, lweightData, 1);
+ }
+ else
+ {
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[k] -= wd * lweightData[k];
+ }
+ }
+ }
+
+ if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+ {
+ THBlas_(axpy)(outDim, -val, lgradOutputData, 1, lweightData, 1);
+ }
+ else
+ {
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[k] -= val * lgradOutputData[k];
+ }
+ }
+ offset++;
+ }
+ }
+
+ /* Max Normalize case:
+ * Reset the smart update scaling if
+ * one does it batch-wise.
+ * TODO: Decide what to do with that piece of code.
+     * NB: If the code below is uncommented, the commented code in
+     * IndexLinear:zeroGradParameters() should be uncommented as well. */
+
+ /*
+ if (maxNormalize)
+ {
+ offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real* lweightData = weightData;
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ real val = valuesData[offset] * scale;
+ real wd = weightDecay;
+
+ lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
+ lweightData[0] = 0;
+ offset++;
+ }
+ }
+ }
+ */
+ }
+ return;
+}
+
+void THNN_(IndexLinear_accGradParameters)(
+ THNNState *state,
+ THLongTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THLongTensor *sizes,
+ THLongTensor *cumSumSizes,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *valuesBuffer,
+ accreal weightDecay_,
+ accreal scale_)
+{
+ real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ /* Retrieve all the dimensions of the problem */
+ long batchSize = THLongTensor_size(sizes, 0);
+ long keysSize = THLongTensor_size(keys, 0);
+ long outDim = THTensor_(size)(bias, 0);
+ long woutDim = THTensor_(size)(weight, 1);
+  long maxNormalize = (woutDim - outDim) > 0 ? 1 : 0;
+ THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
+ long* sizesData = THLongTensor_data(sizes);
+
+  /* Compute the cumulative sizes */
+ THLongTensor* cumSizes = THLongTensor_new();
+ THLongTensor_cumsum(cumSizes, sizes, 0);
+ long* cumSizesData = THLongTensor_data(cumSizes);
+
+ /* Resize the gradWeight buffer to keep it dense.
+ * That speeds up updates A LOT assuming random mem access. */
+ THTensor_(resize2d)(gradWeight, keysSize, outDim * (maxNormalize>0?2:1));
+
+ /* Access the storage data/strides */
+ real* gradOutputData = THTensor_(data)(gradOutput);
+  real* valuesData = THTensor_(data)(values);
+ real* gradWeightData = THTensor_(data)(gradWeight);
+ real* weightData = THTensor_(data)(weight);
+ real* gradBiasData = THTensor_(data)(gradBias);
+ long gradWeightStride0 = gradWeight->stride[0];
+ long weightStride0 = weight->stride[0];
+ long* keysData = THLongTensor_data(keys);
+
+ /* Make sure these inputs are contiguous to accelerate computations */
+ THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 7, "gradWeight must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradBias), 8, "gradBias vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(weight), 9, "weight must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(bias), 10, "bias vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(valuesBuffer), 11, "valuesBuffer must be contiguous");
+
+ int i,j,k;
+
+ /* Separate cases: output dimension is == 1, or > 1
+ * This allows for some optimizations.
+ * No multithreading here as this could
+ * corrupt the results (hogwild style) */
+ if (outDim == 1)
+ {
+ for (j = 0; j < batchSize; j++)
+ {
+ long offset = j==0?0:cumSizesData[j-1];
+ real val = gradOutputData[j] * scale;
+ real* lgradWeightData = gradWeightData + offset;
+ real* lvaluesData = valuesData + offset;
+ long end = sizesData[j];
+
+ if (maxNormalize)
+ {
+ lgradWeightData += offset;
+ i = 0;
+ for(;i < end; i++)
+ {
+ lgradWeightData[2*i] = val;
+ lgradWeightData[2*i+1] = val * lvaluesData[i];
+ }
+ }
+ else
+ {
+ i = 0;
+ for(;i < end-4; i += 4)
+ {
+ lgradWeightData[i] = val * lvaluesData[i];
+ lgradWeightData[i+1] = val * lvaluesData[i+1];
+ lgradWeightData[i+2] = val * lvaluesData[i+2];
+ lgradWeightData[i+3] = val * lvaluesData[i+3];
+ }
+
+ for(; i < end; i++)
+ {
+ lgradWeightData[i] = val * lvaluesData[i];
+ }
+ }
+ *gradBiasData += val;
+ offset += end;
+ }
+ }
+ else {
+ for (j = 0; j < batchSize; j++)
+ {
+ long offset = j==0?0:cumSizesData[j-1];
+ real val = 0;
+ real* lgradOutputData = gradOutputData + j*outDim;
+ real* lgradWeightData = gradWeightData;
+ real* lweightData = weightData;
+ THVector_(cadd)(gradBiasData, gradBiasData, lgradOutputData, scale, outDim);
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ real val = valuesData[offset] * scale;
+ lgradWeightData = gradWeightData + offset*outDim;
+ if (maxNormalize)
+ {
+ lgradWeightData += offset*outDim;
+ k = 0;
+ for(;k < outDim-4; k += 4)
+ {
+ lgradWeightData[k] = lgradOutputData[k]*scale;
+ lgradWeightData[k+1] = lgradOutputData[k+1]*scale;
+ lgradWeightData[k+2] = lgradOutputData[k+2]*scale;
+ lgradWeightData[k+3] = lgradOutputData[k+3]*scale;
+ }
+
+ for(; k < outDim; k++)
+ {
+ lgradWeightData[k] = lgradOutputData[k]*scale;
+ }
+ lgradWeightData += outDim;
+ }
+ k = 0;
+ for(;k < outDim-4; k += 4)
+ {
+ lgradWeightData[k] = val * lgradOutputData[k];
+ lgradWeightData[k+1] = val * lgradOutputData[k+1];
+ lgradWeightData[k+2] = val * lgradOutputData[k+2];
+ lgradWeightData[k+3] = val * lgradOutputData[k+3];
+ }
+
+ for(; k < outDim; k++)
+ {
+ lgradWeightData[k] = val * lgradOutputData[k];
+ }
+ offset++;
+ }
+ }
+ }
+ THLongTensor_free(cumSizes);
+ return;
+}
+#endif
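
The sparse batch layout is easiest to see with concrete numbers: keys/values hold all (feature, value) pairs of the batch concatenated, sizes[j] is the entry count of sample j, and cumSumSizes gives each sample's starting offset. A sketch of the outDim == 1 forward path (no max-normalization, made-up data):

```c
#include <stdio.h>

int main(void) {
  long keys[5] = {0, 3, 1, 2, 3};              /* feature indices, concatenated */
  double values[5] = {1.0, 2.0, 0.5, 1.0, -1.0};
  long sizes[2] = {2, 3}, cumSumSizes[2] = {2, 5};
  double weight[4] = {0.1, 0.2, 0.3, 0.4};     /* one weight per feature */
  double bias = 0.5;
  for (int j = 0; j < 2; j++) {                /* batchSize == 2 */
    long offset = j == 0 ? 0 : cumSumSizes[j - 1];
    double val = 0;
    for (long i = 0; i < sizes[j]; i++, offset++)
      val += weight[keys[offset]] * values[offset];
    printf("output[%d] = %.2f\n", j, bias + val);  /* 1.40 and 0.50 */
  }
  return 0;
}
```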
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/L1Cost.c b/contrib/lua-torch/nn/lib/THNN/generic/L1Cost.c
new file mode 100644
index 000000000..53940e894
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/L1Cost.c
@@ -0,0 +1,38 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/L1Cost.c"
+#else
+
+void THNN_(L1Cost_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+ accreal sum = 0;
+
+ TH_TENSOR_APPLY(real, input,
+ sum += fabs(*input_data);
+ );
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(L1Cost_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput)
+{
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY2(real, gradInput, real, input,
+ if (*input_data > 0)
+ *gradInput_data = 1;
+ else if (*input_data < 0)
+ *gradInput_data = -1;
+ else
+ *gradInput_data = 0;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/LeakyReLU.c b/contrib/lua-torch/nn/lib/THNN/generic/LeakyReLU.c
new file mode 100644
index 000000000..074047d83
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/LeakyReLU.c
@@ -0,0 +1,57 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LeakyReLU.c"
+#else
+
+void THNN_(LeakyReLU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal negval_,
+ bool inplace)
+{
+ real negval = TH_CONVERT_ACCREAL_TO_REAL(negval_);
+ if (inplace)
+ {
+ TH_TENSOR_APPLY(real, input,
+ if (*input_data <= 0)
+ *input_data *= negval;
+ );
+ THTensor_(set)(output, input);
+ }
+ else
+ {
+ THTensor_(resizeAs)(output, input);
+ TH_TENSOR_APPLY2(real, output, real, input,
+ *output_data = *input_data > 0 ? *input_data : *input_data * negval;
+ );
+ }
+}
+
+void THNN_(LeakyReLU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal negval_,
+ bool inplace)
+{
+ real negval = TH_CONVERT_ACCREAL_TO_REAL(negval_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ if (inplace)
+ {
+ TH_TENSOR_APPLY2(real, gradOutput, real, input,
+ if (*input_data <= 0)
+ *gradOutput_data *= negval;
+ );
+ THTensor_(set)(gradInput, gradOutput);
+ }
+ else
+ {
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ *gradInput_data = *input_data > 0 ? *gradOutput_data : *gradOutput_data * negval;
+ );
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Linear.c b/contrib/lua-torch/nn/lib/THNN/generic/Linear.c
new file mode 100644
index 000000000..8c5cd115e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Linear.c
@@ -0,0 +1,114 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Linear.c"
+#else
+
+void THNN_(Linear_updateAddBuffer)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *addBuffer)
+{
+ long nframe = THTensor_(size)(input,0);
+ long nElement = THTensor_(nElement)(addBuffer);
+ if (nElement != nframe) {
+ THTensor_(resize1d)(addBuffer,nframe);
+ THTensor_(fill)(addBuffer,1.0);
+ }
+}
+
+void THNN_(Linear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *addBuffer)
+{
+ long dim = THTensor_(nDimension)(input);
+ if (dim == 1) {
+ THTensor_(resize1d)(output,THTensor_(size)(weight,0));
+ if (bias) {
+ THTensor_(copy)(output,bias);
+ }
+ else {
+ THTensor_(zero)(output);
+ }
+ THTensor_(addmv)(output,1,output,1,weight,input);
+ }
+ else if (dim == 2) {
+ long nframe = THTensor_(size)(input,0);
+ long nElement = THTensor_(nElement)(output);
+ THTensor_(resize2d)(output,nframe,THTensor_(size)(weight,0));
+ if (THTensor_(nElement)(output) != nElement) {
+ THTensor_(zero)(output);
+ }
+ THNN_(Linear_updateAddBuffer)(state,input,addBuffer);
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight,weight,0,1);
+ THTensor_(addmm)(output,0,output,1,input,tweight);
+ THTensor_(free)(tweight);
+ if (bias) {
+ THTensor_(addr)(output,1,output,1,addBuffer,bias);
+ }
+ }
+}
+
+void THNN_(Linear_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight)
+{
+ if (gradInput) {
+ long nElement = THTensor_(nElement)(gradInput);
+ THTensor_(resizeAs)(gradInput,input);
+ if (THTensor_(nElement)(gradInput) != nElement) {
+ THTensor_(zero)(gradInput);
+ }
+
+ long dim = THTensor_(nDimension)(input);
+ if (dim == 1) {
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight,weight,0,1);
+ THTensor_(addmv)(gradInput,0,gradInput,1,tweight,gradOutput);
+ THTensor_(free)(tweight);
+ }
+ else if (dim == 2) {
+ THTensor_(addmm)(gradInput,0,gradInput,1,gradOutput,weight);
+ }
+ }
+}
+
+void THNN_(Linear_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *addBuffer,
+ accreal scale_)
+{
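+  /* gradWeight += scale * gradOutput^T * input and, when a bias is present,
+     gradBias += scale * (sum of gradOutput over the batch), computed via the
+     ones vector in addBuffer. */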
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ long dim = THTensor_(nDimension)(input);
+ if (dim == 1) {
+ THTensor_(addr)(gradWeight,1,gradWeight,scale,gradOutput,input);
+ if (bias) {
+ THTensor_(cadd)(gradBias,gradBias,scale,gradOutput);
+ }
+ }
+ else if (dim == 2) {
+ THTensor *tgradOutput = THTensor_(new)();
+ THTensor_(transpose)(tgradOutput,gradOutput,0,1);
+ THTensor_(addmm)(gradWeight,1,gradWeight,scale,tgradOutput,input);
+ if (bias) {
+ THNN_(Linear_updateAddBuffer)(state,input,addBuffer);
+ THTensor_(addmv)(gradBias,1,gradBias,scale,tgradOutput,addBuffer);
+ }
+ THTensor_(free)(tgradOutput);
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/LogSigmoid.c b/contrib/lua-torch/nn/lib/THNN/generic/LogSigmoid.c
new file mode 100644
index 000000000..651d56002
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/LogSigmoid.c
@@ -0,0 +1,36 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LogSigmoid.c"
+#else
+
+void THNN_(LogSigmoid_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *buffer)
+{
+ THTensor_(resizeAs)(output, input);
+ THTensor_(resizeAs)(buffer, input);
+
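+  /* logsigmoid(x) = -log(1 + exp(-x)); buffer caches z = exp(-x) for backward. */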
+ TH_TENSOR_APPLY3(real, output, real, input, real, buffer,
+ real z = exp(-*input_data);
+ *buffer_data = z;
+ *output_data = -log(1. + z);
+ );
+}
+
+void THNN_(LogSigmoid_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *buffer)
+{
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, buffer);
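+  /* d/dx logsigmoid(x) = z / (1 + z) with z = exp(-x) saved from the forward pass. */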
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, buffer,
+ real z = *buffer_data;
+ *gradInput_data = *gradOutput_data * z / (1. + z);
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/LogSoftMax.c b/contrib/lua-torch/nn/lib/THNN/generic/LogSoftMax.c
new file mode 100644
index 000000000..a7280422b
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/LogSoftMax.c
@@ -0,0 +1,137 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LogSoftMax.c"
+#else
+
+void THNN_(LogSoftMax_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ real *input_data, *output_data;
+ ptrdiff_t nframe = 0, dim = 0, stride = 0;
+ ptrdiff_t t, d;
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ stride = 1;
+ }
+ else if (input->nDimension == 2)
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ stride = 1;
+ }
+ else if (input->nDimension == 3)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ stride = input->size[1]*input->size[2];
+ }
+ else if (input->nDimension == 4)
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ stride = input->size[2]*input->size[3];
+ }
+ else
+ THArgCheck(0, 2, "1D, 2D, 3D or 4D tensor expected");
+
+ input = THTensor_(newContiguous)(input);
+ THTensor_(resizeAs)(output, input);
+
+ real *input_data0 = THTensor_(data)(input);
+ real *output_data0 = THTensor_(data)(output);
+
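+  /* Numerically stable log-softmax over each slice of length dim:
+     out[d] = in[d] - (max + log(sum_d exp(in[d] - max))).
+     Subtracting the maximum first keeps exp() from overflowing. */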
+ accreal logsum;
+ real maxInput;
+ #pragma omp parallel for private(t, d, maxInput, logsum, input_data, output_data)
+ for (t = 0; t < stride*nframe; t++)
+ {
+ logsum = 0;
+ maxInput = -THInf;
+ input_data = input_data0 + (t/stride)*dim*stride + t % stride;
+ output_data = output_data0 + (t/stride)*dim*stride + t % stride;
+
+ for (d = 0; d < dim; d++)
+ maxInput = THMax(maxInput, input_data[d*stride]);
+
+ for (d = 0; d < dim; d++)
+ logsum += exp(input_data[d*stride] - maxInput);
+ logsum = maxInput + log(logsum);
+
+ for (d = 0; d < dim; d++)
+ output_data[d*stride] = input_data[d*stride] - logsum;
+ }
+
+ THTensor_(free)(input);
+}
+
+void THNN_(LogSoftMax_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output)
+{
+ THNN_CHECK_SHAPE(input, gradOutput);
+ real *gradInput_data, *gradOutput_data, *output_data;
+ ptrdiff_t nframe = 0, dim = 0, stride = 0;
+ ptrdiff_t t, d;
+
+ if (output->nDimension == 1)
+ {
+ nframe = 1;
+ dim = output->size[0];
+ stride = 1;
+ }
+ else if (output->nDimension == 2)
+ {
+ nframe = output->size[0];
+ dim = output->size[1];
+ stride = 1;
+ }
+ else if (output->nDimension == 3)
+ {
+ nframe = 1;
+ dim = output->size[0];
+ stride = output->size[1]*output->size[2];
+ }
+ else if (output->nDimension == 4)
+ {
+ nframe = output->size[0];
+ dim = output->size[1];
+ stride = output->size[2]*output->size[3];
+ }
+ else
+ THError("1D, 2D, 3D or 4D tensor expected");
+
+ output = THTensor_(newContiguous)(output);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ THTensor_(resizeAs)(gradInput, output);
+ real *gradInput_data0 = THTensor_(data)(gradInput);
+ real *output_data0 = THTensor_(data)(output);
+ real *gradOutput_data0 = THTensor_(data)(gradOutput);
+ accreal sum;
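+  /* gradInput[d] = gradOutput[d] - exp(output[d]) * sum_d' gradOutput[d'],
+     where exp(output[d]) recovers the softmax probability. */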
+ #pragma omp parallel for private(t, sum, d, gradInput_data, output_data, gradOutput_data)
+ for (t = 0; t < stride*nframe; t++)
+ {
+ sum = 0;
+ gradInput_data = gradInput_data0 + (t/stride)*dim*stride + t % stride;
+ output_data = output_data0 + (t/stride)*dim*stride + t % stride;
+ gradOutput_data = gradOutput_data0 + (t/stride)*dim*stride + t % stride;
+
+ for (d = 0; d < dim; d++)
+ sum += gradOutput_data[d*stride];
+
+ for (d = 0; d < dim; d++)
+ gradInput_data[d*stride] = gradOutput_data[d*stride] - exp(output_data[d*stride])*sum;
+ }
+
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(output);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/LookupTable.c b/contrib/lua-torch/nn/lib/THNN/generic/LookupTable.c
new file mode 100644
index 000000000..46bc2c3c1
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/LookupTable.c
@@ -0,0 +1,225 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LookupTable.c"
+#else
+
+static void THNN_(LookupTable_resetCount)(
+ THInteger_t *count_data,
+ THIndexTensor *input)
+{
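+  /* Two passes over the indices: first zero the counters this input touches,
+     then count how many times each index occurs. */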
+ ptrdiff_t i;
+ THIndex_t *input_data = THIndexTensor_(data)(input);
+ ptrdiff_t numel = THIndexTensor_(nElement)(input);
+
+ for (i = 0; i<numel; i++)
+ {
+ long k = input_data[i] - TH_INDEX_BASE;
+ count_data[k] = 0;
+ }
+ for (i = 0; i<numel; i++)
+ {
+ long k = input_data[i] - TH_INDEX_BASE;
+ count_data[k]++;
+ }
+}
+
+void THNN_(LookupTable_accGradParameters)(
+ THNNState *state,
+ THIndexTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THIntegerTensor *count,
+ THTensor *sorted,
+ THIndexTensor *indices,
+ bool scaleGradByFreq,
+ int paddingValue,
+ accreal ascale)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(ascale);
+ ptrdiff_t i;
+ THInteger_t *count_data = NULL;
+
+ if (scaleGradByFreq)
+ {
+ THIntegerTensor_(resize1d)(count, gradWeight->size[0]);
+ count_data = THIntegerTensor_(data)(count);
+ }
+
+ if (!THTensor_(isContiguous)(gradWeight))
+ THError("gradWeight must be contiguous");
+ if (!THIndexTensor_(isContiguous)(input))
+ THError("input must be contiguous");
+ if (THIndexTensor_(nDimension)(input) != 1 && THIndexTensor_(nDimension)(input) != 2) {
+ THDescBuff s1 = THIndexTensor_(sizeDesc)(input);
+ THError("input must be a vector or matrix, but is of shape: %s", s1.str);
+ }
+
+ THIndex_t *input_data = THIndexTensor_(data)(input);
+ ptrdiff_t numel = THIndexTensor_(nElement)(input);
+ long numw = THTensor_(size)(gradWeight, 0);
+
+ // check that inputs are all within range
+ for (i=0; i<numel; i++)
+ if (input_data[i] < TH_INDEX_BASE || input_data[i] >= numw + TH_INDEX_BASE) {
+ THError("inputs need to be in the range %ld <= input < %ld, "
+ "but got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE),
+ input_data[i]);
+ }
+
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ real *gw = THTensor_(data)(gradWeight);
+ real *go = THTensor_(data)(gradOutput);
+ long stride = THTensor_(stride)(gradWeight, 0);
+
+ if (count_data)
+ THNN_(LookupTable_resetCount)(count_data, input);
+
+#ifdef _OPENMP
+ if (numel > 1000)
+ {
+ // The strategy is to parallelize over sections of the vocabulary, so that
+ // thread 1 handles updates to gradWeight[0..nVocab/nThreads]. Every thread
+ // has to traverse the entire input, but the dominating factor is the axpy
+ // BLAS call.
+ #pragma omp parallel private(i)
+ {
+ int tid = omp_get_thread_num();
+ int nthreads = omp_get_num_threads();
+
+ long start = tid * (numw/nthreads + 1);
+ long end = start + (numw/nthreads + 1);
+ for (i=0; i<numel; i++)
+ {
+ if (input_data[i] != paddingValue)
+ {
+ long k = input_data[i] - TH_INDEX_BASE;
+ if (k >= start && k < end)
+ {
+ real scale_ = scale;
+ if (count_data) scale_ /= count_data[k];
+ THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
+ }
+ }
+ }
+ }
+
+ THTensor_(free)(gradOutput);
+ return;
+ }
+#endif
+
+ for (i=0; i<numel; i++)
+ {
+ if (input_data[i] != paddingValue)
+ {
+ long k = input_data[i] - TH_INDEX_BASE;
+ real scale_ = scale;
+ if (count_data) scale_ /= count_data[k];
+ THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
+ }
+ }
+
+ THTensor_(free)(gradOutput);
+}
+
+/*
+ * Keep the norm of weight smaller than maxNorm
+ */
+
+static void THNN_(LookupTable_renormRow)(
+ real *row_data,
+ long stride,
+ real maxNorm,
+ real normType)
+{
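+  /* Computes the normType-norm of one embedding row and rescales the row
+     toward maxNorm when the norm exceeds it; the 1e-7 guards the division. */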
+ real norm = 0;
+ real new_norm;
+ long j;
+ for (j=0; j<stride; j++)
+ {
+ if (normType == 1) {
+ norm += fabs(row_data[j]);
+ } else if (normType == 2) {
+ norm += row_data[j] * row_data[j];
+ } else {
+ norm += pow(fabs(row_data[j]), normType);
+ }
+ }
+ norm = pow(norm, 1.0 / normType);
+ if (norm > maxNorm)
+ {
+ new_norm = maxNorm / (norm + 1e-7);
+ for (j=0; j<stride; j++) {
+ row_data[j] *= new_norm;
+ }
+ }
+}
+
+static int THNN_(compare_THIndex)(const void* a, const void* b)
+{
+  const THIndex_t va = *(const THIndex_t*)a;
+  const THIndex_t vb = *(const THIndex_t*)b;
+  /* return 0 on equal keys so the comparator gives qsort a consistent ordering */
+  return (va > vb) - (va < vb);
+}
+
+void THNN_(LookupTable_renorm)(
+ THNNState *state,
+ THIndexTensor *idx,
+ THTensor *weight,
+ accreal maxNorm_,
+ accreal normType_)
+{
+ real maxNorm = TH_CONVERT_ACCREAL_TO_REAL(maxNorm_);
+ real normType = TH_CONVERT_ACCREAL_TO_REAL(normType_);
+ if (!THTensor_(isContiguous)(weight))
+ THError("weight must be contiguous");
+ if (!THIndexTensor_(isContiguous)(idx))
+ THError("input must be contiguous");
+ if (THIndexTensor_(nDimension)(idx) != 1)
+ THError("idx must be a vector");
+ if (normType <= 0)
+ THError("non-positive-norm not supported");
+
+ ptrdiff_t i;
+ THIndex_t *row_idx = THIndexTensor_(data)(idx);
+ ptrdiff_t numel = THIndexTensor_(nElement)(idx);
+
+ long numw = THTensor_(size)(weight, 0);
+ long stride = THTensor_(stride)(weight, 0);
+ real *gw = THTensor_(data)(weight);
+ for (i=0; i<numel; i++) {
+ if (row_idx[i] < TH_INDEX_BASE || row_idx[i] >= numw + TH_INDEX_BASE) {
+ THError("input need to be in the range %ld <= input < %ld, "
+ "but got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE),
+ row_idx[i]);
+ }
+ }
+ // get unique indices
+ qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex));
+ ptrdiff_t ptr = 0;
+ for (i=0; i<numel; i++)
+ if (i == 0 || row_idx[i] != row_idx[i-1])
+ row_idx[ptr++] = row_idx[i];
+ numel = ptr;
+
+#ifdef _OPENMP
+ if (numel > 1000)
+ {
+ // The strategy is to parallelize over the rows that appear in
+ // row_idx, so that thread 1 handles the rows in row_idx[0..numel/nThreads].
+ // This distributes the work evenly to each thread.
+ #pragma omp parallel for private(i)
+ for (i=0; i<numel; i++)
+ {
+ long k = row_idx[i] - TH_INDEX_BASE;
+ THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
+ }
+ return;
+ }
+#endif
+ for (i=0; i<numel; i++)
+ {
+ long k = row_idx[i] - TH_INDEX_BASE;
+ THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/MSECriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/MSECriterion.c
new file mode 100644
index 000000000..58911f6f0
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/MSECriterion.c
@@ -0,0 +1,45 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/MSECriterion.c"
+#else
+
+void THNN_(MSECriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage)
+{
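+  /* output = sum_i (input_i - target_i)^2, divided by nElement when sizeAverage. */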
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+
+ real sum = 0;
+
+ TH_TENSOR_APPLY2(real, input, real, target,
+ real z = (*input_data - *target_data);
+ sum += z*z;
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(MSECriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+
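+  /* d/dinput_i of the (optionally averaged) squared error: 2 * (input_i - target_i) [/ N]. */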
+ real norm = (sizeAverage ? 2./((real)THTensor_(nElement)(input)) : 2.);
+
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ *gradInput_data = norm * (*input_data - *target_data);
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/MarginCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/MarginCriterion.c
new file mode 100644
index 000000000..d6d9b60b9
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/MarginCriterion.c
@@ -0,0 +1,47 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/MarginCriterion.c"
+#else
+
+void THNN_(MarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ accreal margin_)
+{
+ real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_);
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+ real sum = 0;
+
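+  /* Hinge loss: sum_i max(0, margin - input_i * target_i), with targets in {-1, 1}. */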
+ TH_TENSOR_APPLY2(real, input, real, target,
+ real z = (margin - *input_data * *target_data);
+ sum += z>0 ? z : 0;
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(MarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ accreal margin_)
+{
+ real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_);
+ THNN_CHECK_NELEMENT(input, target);
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ *gradInput_data = (*input_data * *target_data) < margin ? -norm * *target_data : 0;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/MultiLabelMarginCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/MultiLabelMarginCriterion.c
new file mode 100644
index 000000000..16398c13c
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/MultiLabelMarginCriterion.c
@@ -0,0 +1,184 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/MultiLabelMarginCriterion.c"
+#else
+
+// TODO: improve error messages
+void THNN_(MultiLabelMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ THTensor *isTarget,
+ bool sizeAverage)
+{
+ real *input_data, *isTarget_data;
+ THIndex_t *target_data;
+ long nframe, dim;
+ long t, d, dt, ddt;
+ real sum;
+
+ THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2,
+ "vector or matrix expected");
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3,
+ "inconsistent target size");
+ }
+ else
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ THArgCheck((target->nDimension == 2) && (target->size[0] == nframe)
+ && (target->size[1] == dim), 3, "inconsistent target size");
+ }
+
+ THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range");
+ THArgCheck(THIndexTensor_(maxall)(target) < dim+TH_INDEX_BASE, 3, "target out of range");
+
+ target = THIndexTensor_(newContiguous)(target);
+ input = THTensor_(newContiguous)(input);
+ input_data = THTensor_(data)(input);
+ target_data = THIndexTensor_(data)(target);
+
+ THNN_resizeAs_indices(isTarget, target);
+ THTensor_(zero)(isTarget);
+ isTarget_data = THTensor_(data)(isTarget);
+
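+  /* Per frame: for every valid target class t (entries before the first
+     index < 0) and every non-target class d, accumulate
+     max(0, 1 - input[t] + input[d]). */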
+ sum = 0;
+ for (t = 0; t < nframe; t++)
+ {
+ for (ddt = 0; ddt < dim; ddt++)
+ {
+ THIndex_t target_idx = target_data[ddt] - TH_INDEX_BASE;
+ if (target_idx < 0)
+ break;
+ isTarget_data[target_idx] = 1;
+ }
+ for (dt = 0; dt < dim; dt++)
+ {
+ THIndex_t target_idx = target_data[dt] - TH_INDEX_BASE;
+ real input_target;
+ if (target_idx < 0)
+ break;
+
+ input_target = input_data[target_idx];
+ for (d = 0; d < dim; d++)
+ {
+ if (!isTarget_data[d])
+ {
+ real z = 1 - input_target + input_data[d];
+ if (z > 0)
+ sum += z;
+ }
+ }
+ }
+ input_data += dim;
+ target_data += dim;
+ isTarget_data += dim;
+ }
+
+ sum /= dim;
+ if (sizeAverage)
+ sum /= nframe;
+
+ THTensor_(set1d)(output, 0, sum);
+
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+}
+
+void THNN_(MultiLabelMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ THTensor *isTarget,
+ bool sizeAverage)
+{
+ real *input_data;
+ real *gradInput_data;
+ THIndex_t *target_data;
+ real *isTarget_data;
+ long nframe, dim;
+ long t, d, dt;
+ real g;
+
+ THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2,
+ "vector or matrix expected");
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3,
+ "inconsistent target size");
+ THArgCheck((isTarget->nDimension == 1) && (isTarget->size[0] == dim), 3,
+ "inconsistent isTarget size");
+ }
+ else
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ THArgCheck((target->nDimension == 2) && (target->size[0] == nframe)
+ && (target->size[1] == dim), 3, "inconsistent target size");
+ THArgCheck((isTarget->nDimension == 2) && (isTarget->size[0] == nframe)
+ && (isTarget->size[1] == dim), 3, "inconsistent isTarget size");
+ }
+
+ THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range");
+ THArgCheck(THIndexTensor_(maxall)(target) < dim+TH_INDEX_BASE, 3, "target out of range");
+
+ THArgCheck(THTensor_(minall)(isTarget) >= 0, 3, "isTarget out of range");
+ THArgCheck(THTensor_(maxall)(isTarget) <= 1, 3, "isTarget out of range");
+
+ target = THIndexTensor_(newContiguous)(target);
+ input = THTensor_(newContiguous)(input);
+ isTarget = THTensor_(newContiguous)(isTarget);
+ input_data = THTensor_(data)(input);
+ target_data = THIndexTensor_(data)(target);
+ isTarget_data = THTensor_(data)(isTarget);
+
+ g = sizeAverage ? ( 1./((real)(nframe*dim)) ) : ( 1./((real)dim) );
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+ gradInput_data = THTensor_(data)(gradInput);
+
+ for (t = 0; t < nframe; t++)
+ {
+ for (dt = 0; dt < dim; dt++)
+ {
+ THIndex_t target_idx = target_data[dt] - TH_INDEX_BASE;
+ real input_target;
+ if (target_idx < 0)
+ break;
+
+ input_target = input_data[target_idx];
+ for (d = 0; d < dim; d++)
+ {
+ if (!isTarget_data[d])
+ {
+ real z = 1 - input_target + input_data[d];
+ if (z > 0)
+ {
+ gradInput_data[target_idx] -= g;
+ gradInput_data[d] += g;
+ }
+ }
+ }
+ }
+ input_data += dim;
+ target_data += dim;
+ isTarget_data += dim;
+ gradInput_data += dim;
+ }
+
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+ THTensor_(free)(isTarget);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/MultiMarginCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/MultiMarginCriterion.c
new file mode 100644
index 000000000..2f8f8ff58
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/MultiMarginCriterion.c
@@ -0,0 +1,168 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/MultiMarginCriterion.c"
+#else
+
+// TODO: improve error messages
+void THNN_(MultiMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ int p,
+ THTensor *weights,
+ accreal margin_)
+{
+ real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_);
+ real *input_data, *weights_data;
+ THIndex_t *target_data;
+ long nframe, dim;
+ long t, d;
+ real sum;
+
+ THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2,
+ "vector or matrix expected");
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ }
+ else
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3,
+ "inconsistent target size");
+ }
+
+ for (t = 0; t < nframe; t++)
+ {
+ THIndex_t idx = THIndexTensor_(get1d)(target, t);
+ THArgCheck((idx >= TH_INDEX_BASE) && (idx < dim + TH_INDEX_BASE), 3,
+ "target out of range");
+ }
+
+ input = THTensor_(newContiguous)(input);
+ target = THIndexTensor_(newContiguous)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+ input_data = THTensor_(data)(input);
+ target_data = THIndexTensor_(data)(target);
+ weights_data = weights ? THTensor_(data)(weights) : NULL;
+
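+  /* Multi-class hinge: for each frame with target y, accumulate
+     max(0, margin - input[y] + input[d])^p over all d != y, optionally
+     weighted by weights[y]. */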
+ sum = 0;
+ for (t = 0; t < nframe; t++)
+ {
+ THIndex_t target_idx = target_data[t] - TH_INDEX_BASE;
+ real input_target = input_data[target_idx];
+ for (d = 0; d < dim; d++)
+ {
+ real z = margin - input_target + input_data[d];
+ if (d == target_idx)
+ continue;
+
+ if (z > 0) {
+ real h = (p==1) ? z : z*z;
+ if(weights_data)
+ h *= weights_data[target_idx];
+ sum += h;
+ }
+ }
+ input_data += dim;
+ }
+
+ sum /= dim;
+ if(sizeAverage)
+ sum /= nframe;
+
+ THTensor_(set1d)(output, 0, sum);
+
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+ if(weights)
+ THTensor_(free)(weights);
+}
+
+void THNN_(MultiMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ int p,
+ THTensor *weights,
+ accreal margin_)
+{
+ real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_);
+ real *input_data;
+ real *gradInput_data;
+ THIndex_t *target_data;
+ real *weights_data;
+ long nframe, dim;
+ long t, d;
+ real g;
+
+ THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2,
+ "vector or matrix expected");
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ }
+ else
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3,
+ "inconsistent target size");
+ }
+
+ g = (sizeAverage ? 1./((real)(nframe*dim)) : 1./((real)dim));
+
+ input = THTensor_(newContiguous)(input);
+ target = THIndexTensor_(newContiguous)(target);
+ input_data = THTensor_(data)(input);
+
+ THTensor_(resizeAs)(gradInput, input);
+ gradInput_data = THTensor_(data)(gradInput);
+
+ target_data = THIndexTensor_(data)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+ weights_data = weights ? THTensor_(data)(weights) : NULL;
+
+ for (t = 0; t < nframe; t++)
+ {
+ THIndex_t target_idx = target_data[t] - TH_INDEX_BASE;
+ real input_target = input_data[target_idx];
+ real gradInput_target = 0;
+ for (d = 0; d < dim; d++)
+ {
+ real z = margin - input_target + input_data[d];
+ if (d == target_idx)
+ continue;
+
+ if (z > 0)
+ {
+ real h = (p == 1) ? g : 2*g*z;
+ if(weights_data)
+ h *= weights_data[target_idx];
+ gradInput_target -= h;
+ gradInput_data[d] = h;
+ }
+ else
+ gradInput_data[d] = 0;
+ }
+ gradInput_data[target_idx] = gradInput_target;
+
+ input_data += dim;
+ gradInput_data += dim;
+ }
+
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+ if(weights)
+ THTensor_(free)(weights);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/PReLU.c b/contrib/lua-torch/nn/lib/THNN/generic/PReLU.c
new file mode 100644
index 000000000..488322fde
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/PReLU.c
@@ -0,0 +1,207 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/PReLU.c"
+#else
+
+void THNN_(PReLU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THIndex_t nOutputPlane)
+{
+ THTensor_(resizeAs)(output, input);
+
+ if (nOutputPlane == 0)
+ {
+ // handle shared parameter case
+ real w = *THTensor_(data)(weight);
+ TH_TENSOR_APPLY2(real, output, real, input,
+ *output_data = (*input_data > 0) ? *input_data : w*(*input_data);
+ );
+ }
+ else
+ {
+ input = THTensor_(newContiguous)(input);
+ long bs = 1, ks = 1;
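+    /* View input as bs x nOutputPlane x ks: bs is the batch size and ks the
+       number of elements per channel (product of the trailing dims). */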
+ {
+ long input_ndim = THTensor_(nDimension)(input);
+ if (input->size[input_ndim > 1] != nOutputPlane)
+ THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]);
+
+ if (input_ndim > 1) {
+ bs = input->size[0];
+ for (int d = 2; d < input_ndim; d++) {
+ ks *= input->size[d];
+ }
+ }
+ }
+
+ real *output_data = THTensor_(data)(output);
+ real *input_data = THTensor_(data)(input);
+ real *weight_data = THTensor_(data)(weight);
+ THIndex_t i, j, k;
+#pragma omp parallel for private(j,k)
+ for (i = 0; i < bs; ++i)
+ {
+ real* n_input_data = input_data + i*nOutputPlane*ks;
+ real* n_output_data = output_data + i*nOutputPlane*ks;
+ for (j = 0; j < nOutputPlane; ++j)
+ {
+ for (k = 0; k < ks; ++k)
+ n_output_data[k] = (n_input_data[k] > 0) ? n_input_data[k] : weight_data[j] * n_input_data[k];
+ n_input_data += ks;
+ n_output_data += ks;
+ }
+ }
+ THTensor_(free)(input);
+ }
+}
+
+void THNN_(PReLU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THIndex_t nOutputPlane)
+{
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+
+ if (nOutputPlane == 0)
+ {
+ real w = THTensor_(data)(weight)[0];
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ if ((*input_data) > 0)
+ *gradInput_data = *gradOutput_data;
+ else
+ *gradInput_data = w * (*gradOutput_data);
+ );
+ }
+ else
+ {
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ weight = THTensor_(newContiguous)(weight);
+ const real *input_data = THTensor_(data)(input);
+ const real *gradOutput_data = THTensor_(data)(gradOutput);
+ const real *weight_data = THTensor_(data)(weight);
+ real *gradInput_data = THTensor_(data)(gradInput);
+
+ long bs = 1, ks = 1;
+ {
+ long input_ndim = THTensor_(nDimension)(input);
+ if (input->size[input_ndim > 1] != nOutputPlane)
+ THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]);
+
+ if (input_ndim > 1) {
+ bs = input->size[0];
+ for (int d = 2; d < input_ndim; d++) {
+ ks *= input->size[d];
+ }
+ }
+ }
+
+ THIndex_t i, j, k;
+#pragma omp parallel for private(j,k)
+ for (i = 0; i < bs; ++i)
+ {
+ const real *n_input_data = input_data + i*nOutputPlane*ks;
+ const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks;
+ real *n_gradInput_data = gradInput_data + i*nOutputPlane*ks;
+
+ for (j = 0; j < nOutputPlane; ++j)
+ {
+ real w = weight_data[j];
+ for (k = 0; k < ks; ++k)
+ {
+ if (n_input_data[k] > 0)
+ n_gradInput_data[k] = n_gradOutput_data[k];
+ else
+ n_gradInput_data[k] = n_gradOutput_data[k] * w;
+ }
+ n_input_data += ks;
+ n_gradInput_data += ks;
+ n_gradOutput_data += ks;
+ }
+ }
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+ }
+}
+
+void THNN_(PReLU_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradWeight,
+ THTensor *gradWeightBuf,
+ THTensor *gradWeightBuf2,
+ THIndex_t nOutputPlane,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+
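+  /* dL/dw_j = sum over positions where input <= 0 of input * gradOutput,
+     per channel j (or a single scalar when the weight is shared). */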
+ if (nOutputPlane == 0)
+ {
+ real *gradWeight_data = THTensor_(data)(gradWeight);
+ real sum = 0;
+ TH_TENSOR_APPLY2(real, input, real, gradOutput,
+ if ((*input_data) <= 0)
+ sum += (*input_data) * (*gradOutput_data);
+ );
+ gradWeight_data[0] += scale * sum;
+ }
+ else
+ {
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 6, "gradWeight needs to be contiguous");
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ weight = THTensor_(newContiguous)(weight);
+ long bs = 1, ks = 1;
+ {
+ long input_ndim = THTensor_(nDimension)(input);
+ if (input->size[input_ndim > 1] != nOutputPlane)
+ THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]);
+
+ if (input_ndim > 1) {
+ bs = input->size[0];
+ for (int d = 2; d < input_ndim; d++) {
+ ks *= input->size[d];
+ }
+ }
+ }
+
+ const real *input_data = THTensor_(data)(input);
+ const real *gradOutput_data = THTensor_(data)(gradOutput);
+ const real *weight_data = THTensor_(data)(weight);
+ real *gradWeight_data = THTensor_(data)(gradWeight);
+
+ THIndex_t i, j, k;
+ for (i = 0; i < bs; ++i)
+ {
+ const real *n_input_data = input_data + i*nOutputPlane*ks;
+ const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks;
+
+ for (j = 0; j < nOutputPlane; ++j)
+ {
+ real sum = 0;
+ for (k = 0; k < ks; ++k)
+ if (n_input_data[k] <= 0)
+ sum += n_gradOutput_data[k] * n_input_data[k];
+ gradWeight_data[j] += scale * sum;
+ n_input_data += ks;
+ n_gradOutput_data += ks;
+ }
+ }
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/RReLU.c b/contrib/lua-torch/nn/lib/THNN/generic/RReLU.c
new file mode 100644
index 000000000..8fd46d3c2
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/RReLU.c
@@ -0,0 +1,132 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/RReLU.c"
+#else
+
+void THNN_(RReLU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *noise,
+ accreal lower_,
+ accreal upper_,
+ bool train,
+ bool inplace,
+ THGenerator *generator)
+{
+ real lower = TH_CONVERT_ACCREAL_TO_REAL(lower_);
+ real upper = TH_CONVERT_ACCREAL_TO_REAL(upper_);
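+  /* Training: sample a per-element negative slope uniformly from [lower, upper]
+     and record it in noise for backward. Evaluation: use the fixed expected
+     slope (lower + upper) / 2, i.e. plain LeakyReLU. */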
+ if (train)
+ {
+    // sample per-element negative slopes and remember them in noise for backward
+ THTensor_(resizeAs)(noise, input);
+ if (inplace)
+ {
+ TH_TENSOR_APPLY2(real, input, real, noise,
+ if (*input_data <= 0)
+ {
+ const real r = (real)THRandom_uniform(generator, lower, upper);
+ *input_data = (*input_data) * r;
+ *noise_data = r;
+ }
+ else
+ {
+ *noise_data = 1;
+ }
+ );
+ THTensor_(set)(output, input);
+ }
+ else
+ {
+ THTensor_(resizeAs)(output, input);
+ TH_TENSOR_APPLY3(real, input, real, output, real, noise,
+ if (*input_data <= 0)
+ {
+ const real r = (real)THRandom_uniform(generator, lower, upper);
+ *output_data = (*input_data) * r;
+ *noise_data = r;
+ }
+ else
+ {
+ *output_data = *input_data;
+ *noise_data = 1;
+ }
+ );
+ }
+ }
+ else
+ {
+ const real negSlope = (lower + upper) / 2;
+ if (inplace)
+ {
+ TH_TENSOR_APPLY(real, input,
+ if (*input_data <= 0)
+ {
+ *input_data = *input_data * negSlope;
+ }
+ );
+ THTensor_(set)(output, input);
+ }
+ else
+ {
+ THTensor_(resizeAs)(output, input);
+ TH_TENSOR_APPLY2(real, input, real, output,
+ const real r = (*input_data) <= 0 ? negSlope : 1;
+ *output_data = *input_data * r;
+ );
+ }
+ }
+}
+
+void THNN_(RReLU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *noise,
+ accreal lower_,
+ accreal upper_,
+ bool train,
+ bool inplace)
+{
+ real lower = TH_CONVERT_ACCREAL_TO_REAL(lower_);
+ real upper = TH_CONVERT_ACCREAL_TO_REAL(upper_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ if (train && upper - lower > 1E-6) // e.g. if upper == lower, RReLU behaves like LeakyReLU
+ {
+ // multiply the gradient by the noise tensor
+ if (inplace)
+ {
+ THTensor_(cmul)(gradOutput, gradOutput, noise);
+ THTensor_(set)(gradInput, gradOutput);
+ }
+ else
+ {
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(cmul)(gradInput, gradOutput, noise);
+ }
+ }
+ else
+ {
+ // use constant factor for negative input values
+ const real negSlope = (lower + upper) / 2;
+ if (inplace)
+ {
+ TH_TENSOR_APPLY2(real, gradOutput, real, input,
+ if (*input_data <= 0)
+ {
+ *gradOutput_data = (*gradOutput_data) * negSlope;
+ }
+ );
+ THTensor_(set)(gradInput, gradOutput);
+ }
+ else
+ {
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ *gradInput_data = (*input_data) <= 0 ? (*gradOutput_data) * negSlope : (*gradOutput_data);
+ );
+ }
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Sigmoid.c b/contrib/lua-torch/nn/lib/THNN/generic/Sigmoid.c
new file mode 100644
index 000000000..17fb2cb4d
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Sigmoid.c
@@ -0,0 +1,28 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Sigmoid.c"
+#else
+
+void THNN_(Sigmoid_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ THTensor_(sigmoid)(output, input);
+}
+
+void THNN_(Sigmoid_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output)
+{
+ THNN_CHECK_NELEMENT(output, gradOutput);
+ THTensor_(resizeAs)(gradInput, output);
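+  /* dsigmoid/dx = y * (1 - y), computed from the saved output y. */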
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+ real z = *output_data;
+ *gradInput_data = *gradOutput_data * (1. - z) * z;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SmoothL1Criterion.c b/contrib/lua-torch/nn/lib/THNN/generic/SmoothL1Criterion.c
new file mode 100644
index 000000000..d1928d11c
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SmoothL1Criterion.c
@@ -0,0 +1,49 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SmoothL1Criterion.c"
+#else
+
+void THNN_(SmoothL1Criterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+
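+  /* Huber-style loss on z = input - target: 0.5*z^2 for |z| < 1, |z| - 0.5 otherwise. */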
+ real sum = 0;
+ TH_TENSOR_APPLY2(real, input, real, target,
+ real z = fabs(*input_data - *target_data);
+ sum += z < 1 ? 0.5*z*z : z - 0.5;
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(SmoothL1Criterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ real x = *input_data - *target_data;
+ if (x < -1.)
+ *gradInput_data = - norm;
+ else if (x > 1.)
+ *gradInput_data = norm;
+ else
+ *gradInput_data = norm * x;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SoftMarginCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/SoftMarginCriterion.c
new file mode 100644
index 000000000..bac0a3b53
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SoftMarginCriterion.c
@@ -0,0 +1,44 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SoftMarginCriterion.c"
+#else
+
+void THNN_(SoftMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+
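+  /* loss = sum_i log(1 + exp(-input_i * target_i)), optionally averaged. */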
+  real sum = 0;
+
+  TH_TENSOR_APPLY2(real, input, real, target,
+    real z = log(1. + exp(-*input_data * *target_data));
+    sum += z;
+  );
+
+ if(sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(SoftMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+  TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+    real z = exp(-*target_data * *input_data);
+    *gradInput_data = -norm*(*target_data)*z/(1. + z);
+  );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SoftMax.c b/contrib/lua-torch/nn/lib/THNN/generic/SoftMax.c
new file mode 100644
index 000000000..7b60d64c2
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SoftMax.c
@@ -0,0 +1,150 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SoftMax.c"
+#else
+
+void THNN_(SoftMax_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ real *input_data, *output_data;
+ ptrdiff_t nframe = 0, dim = 0, stride = 0;
+ ptrdiff_t t;
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ stride = 1;
+ }
+ else if (input->nDimension == 2)
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ stride = 1;
+ }
+ else if (input->nDimension == 3)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ stride = input->size[1]*input->size[2];
+ }
+ else if (input->nDimension == 4)
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ stride = input->size[2]*input->size[3];
+ }
+ else
+ {
+ THArgCheck(0, 2, "1D, 2D, 3D or 4D tensor expected");
+ }
+
+ input = THTensor_(newContiguous)(input);
+ THTensor_(resizeAs)(output, input);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
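+  /* Stable softmax per slice: shift by the max before exp() so the largest
+     exponent is 0, then normalize by the sum. */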
+#pragma omp parallel for private(t)
+ for (t = 0; t < stride*nframe; t++)
+ {
+ real *input_ptr = input_data + (t/stride)*dim*stride + t % stride;
+ real *output_ptr = output_data + (t/stride)*dim*stride + t % stride;
+
+ real inputMax = -THInf;
+ accreal sum;
+
+ ptrdiff_t d;
+ for (d = 0; d < dim; d++)
+ {
+ if (input_ptr[d*stride] >= inputMax) inputMax = input_ptr[d*stride];
+ }
+
+ sum = 0;
+ for (d = 0; d < dim; d++)
+ {
+ real z = exp(input_ptr[d*stride] - inputMax);
+ output_ptr[d*stride] = z;
+ sum += z;
+ }
+
+ for (d = 0; d < dim; d++)
+ {
+ output_ptr[d*stride] *= 1/sum;
+ }
+ }
+
+ THTensor_(free)(input);
+}
+
+void THNN_(SoftMax_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output)
+{
+ THNN_CHECK_SHAPE(input, gradOutput);
+ real *gradInput_data, *gradOutput_data, *output_data;
+ ptrdiff_t nframe = 0, dim = 0, stride = 0;
+ ptrdiff_t t;
+
+ if (output->nDimension == 1)
+ {
+ nframe = 1;
+ dim = output->size[0];
+ stride = 1;
+ }
+ else if (output->nDimension == 2)
+ {
+ nframe = output->size[0];
+ dim = output->size[1];
+ stride = 1;
+ }
+ else if (output->nDimension == 3)
+ {
+ nframe = 1;
+ dim = output->size[0];
+ stride = output->size[1]*output->size[2];
+ }
+ else if (output->nDimension == 4)
+ {
+ nframe = output->size[0];
+ dim = output->size[1];
+ stride = output->size[2]*output->size[3];
+ }
+ else
+ {
+ THError("1D, 2D, 3D or 4D tensor expected");
+ }
+
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ output = THTensor_(newContiguous)(output);
+
+ THTensor_(resizeAs)(gradInput, output);
+ gradInput_data = THTensor_(data)(gradInput);
+ output_data = THTensor_(data)(output);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
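+  /* Softmax Jacobian applied to gradOutput:
+     gradInput[d] = y[d] * (gradOutput[d] - sum_d' gradOutput[d'] * y[d']). */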
+#pragma omp parallel for private(t)
+ for (t = 0; t < stride*nframe; t++)
+ {
+ real *gradInput_ptr = gradInput_data + (t/stride)*dim*stride + t % stride;
+ real *output_ptr = output_data + (t/stride)*dim*stride + t % stride;
+ real *gradOutput_ptr = gradOutput_data + (t/stride)*dim*stride + t % stride;
+
+ ptrdiff_t d;
+ accreal sum = 0;
+ for (d = 0; d < dim; d++)
+ sum += (accreal)gradOutput_ptr[d*stride] * output_ptr[d*stride];
+
+ for (d = 0; d < dim; d++)
+ gradInput_ptr[d*stride] = output_ptr[d*stride] * (gradOutput_ptr[d*stride] - sum);
+ }
+
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(output);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SoftPlus.c b/contrib/lua-torch/nn/lib/THNN/generic/SoftPlus.c
new file mode 100644
index 000000000..6491e66d6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SoftPlus.c
@@ -0,0 +1,47 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SoftPlus.c"
+#else
+
+void THNN_(SoftPlus_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal beta_,
+ accreal threshold_)
+{
+ real beta = TH_CONVERT_ACCREAL_TO_REAL(beta_);
+ real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
+ THTensor_(resizeAs)(output, input);
+
+ // f(x) = 1/beta * log(1 + exp(beta * x))
+  TH_TENSOR_APPLY2(real, output, real, input,
+ *output_data = (*input_data * beta) > threshold ? *input_data : THLog1p(exp(*input_data * beta)) / beta;
+ );
+}
+
+void THNN_(SoftPlus_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output,
+ accreal beta_,
+ accreal threshold_)
+{
+ real beta = TH_CONVERT_ACCREAL_TO_REAL(beta_);
+ real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, output);
+
+ // d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1)
+ // SINCE
+ // y = (1/k)*log(1+exp(k*x)) --> x = (1/k)*log(exp(k*y)-1)
+ // THEREFORE:
+ // d/dx(f(x)) = (exp(k*y) - 1) / exp(k*y)
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+ real z = exp(*output_data * beta);
+ *gradInput_data = (*output_data * beta) > threshold ? *gradOutput_data : *gradOutput_data * (z - 1.)/z;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SoftShrink.c b/contrib/lua-torch/nn/lib/THNN/generic/SoftShrink.c
new file mode 100644
index 000000000..e77950868
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SoftShrink.c
@@ -0,0 +1,42 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SoftShrink.c"
+#else
+
+void THNN_(SoftShrink_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal lambda_)
+{
+ real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_);
+ THTensor_(resizeAs)(output, input);
+
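+  /* Soft shrinkage: shift input toward zero by lambda, zeroing the band [-lambda, lambda]. */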
+ TH_TENSOR_APPLY2(real, output, real, input,
+ if ((*input_data) > lambda)
+ *output_data = *input_data - lambda;
+ else if ((*input_data) < -lambda)
+ *output_data = *input_data + lambda;
+ else
+ *output_data = 0;
+ );
+}
+
+void THNN_(SoftShrink_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal lambda_)
+{
+ real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ if ((*input_data) > lambda || (*input_data) < -lambda)
+ *gradInput_data = (*gradOutput_data);
+ else
+ *gradInput_data = 0;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SparseLinear.c b/contrib/lua-torch/nn/lib/THNN/generic/SparseLinear.c
new file mode 100644
index 000000000..1cf712212
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SparseLinear.c
@@ -0,0 +1,564 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SparseLinear.c"
+#else
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0])
+#define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1])
+
+static bool THNN_(checkLegacyInput)(THTensor* t)
+{
+ return t->nDimension == 3 && t->size[2] == 2;
+}
+
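+/* New-format sparse input is a COO-style nnz x 3 matrix: judging by the
+   get2d(...) - 1 reads below, each row holds (sample index, feature index,
+   value) with 1-based indices. */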
+static bool THNN_(checkInput)(THTensor* t)
+{
+ return t->nDimension == 2 && t->size[1] == 3;
+}
+
+static bool THNN_(checkSize2D)(THTensor* t, long size0, long size1)
+{
+ return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1;
+}
+
+static bool THNN_(checkSize1D)(THTensor* t, long size0)
+{
+ return t->nDimension == 1 && t->size[0] == size0;
+}
+
+static void THNN_(set1d)(THTensor *t, long x0, real value) {
+ THStorage_(set)(t->storage, t->storageOffset + x0*t->stride[0], value);
+}
+static real THNN_(get3d)(const THTensor *t, long x0, long x1, long x2) {
+ return THStorage_(get)(t->storage, t->storageOffset +
+ x0*t->stride[0] + x1*t->stride[1] + x2*t->stride[2]);
+}
+static real THNN_(get2d)(const THTensor *t, long x0, long x1) {
+ return THStorage_(get)(t->storage, t->storageOffset +
+ x0*t->stride[0] + x1*t->stride[1]);
+}
+
+void THNN_(SparseLinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias)
+{
+ long h, i, j, hp0, hp1;
+ long outDim = THTensor_(size)(weight, 0);
+ long inDim = THTensor_(size)(weight, 1);
+ long batchSize = THTensor_(size)(output, 0);
+
+ THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3");
+ THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
+ THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");
+
+ long nnz = THTensor_(size)(input, 0);
+
+ THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1);
+ THLongTensor_zero(csr);
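+  /* Build a CSR-style row-pointer array: csr[h] marks the first input row of
+     sample h. This assumes the nnz rows are grouped and ordered by sample index. */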
+
+ weight = THTensor_(newContiguous)(weight);
+
+//#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
+ for (i=0; i<nnz; i++) {
+ hp0 = (long)(THNN_(get2d)(input, i, 0)) - 1;
+ hp1 = (i+1 == nnz) ?
+ batchSize :
+ (long)(THNN_(get2d)(input, i+1, 0)) - 1;
+ if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
+ THLongTensor_set1d(csr, h+1, i+1);
+ }
+ }
+
+
+ // output = weight * input + bias
+ THTensor_(zero)(output);
+#pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000)
+ for (h = 0; h < batchSize; h++) {
+ long i_start = THLongTensor_get1d(csr, h);
+ long i_end = THLongTensor_get1d(csr, h+1);
+ for (i = i_start; i < i_end; i++) {
+ real val = THNN_(get2d)(input, i, 2);
+ if (val == 0) {
+ continue;
+ }
+
+ long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THBlas_(axpy)(outDim,
+ val,
+ COL_PTR2(weight, offset), weight->stride[0],
+ ROW_PTR2(output, h), output->stride[1]);
+ } else {
+ THError("index out of bound. updateOutput: %d not between 1 and %d",
+ offset + 1, inDim);
+ }
+ }
+ }
+
+ THTensor* output_row = THTensor_(new)();
+ for (h = 0; h < batchSize; h++) {
+ THTensor_(select)(output_row, output, 0, h);
+ THTensor_(cadd)(output_row, bias, 1.0, output_row);
+ }
+ THTensor_(free)(output_row);
+ THLongTensor_free(csr);
+ THTensor_(free)(weight);
+}
+
+void THNN_(SparseLinear_legacyUpdateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias)
+{
+ long h, i;
+ long outDim = THTensor_(size)(weight, 0);
+ long inDim = THTensor_(size)(weight, 1);
+
+ THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2");
+ THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
+ THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");
+
+ weight = THTensor_(newContiguous)(weight);
+
+ long batchSize = THTensor_(size)(input, 0);
+ long nnz = THTensor_(size)(input, 1);
+ THTensor_(resize2d)(output, batchSize, outDim);
+
+ // output = weight * input + bias
+ THTensor_(zero)(output);
+#pragma omp parallel for private(h, i) schedule(static) if ( \
+ batchSize > 1 && batchSize * nnz * outDim > 10000)
+ for (h = 0; h < batchSize; h++) {
+ for (i = 0; i < nnz; i++) {
+ real val = THNN_(get3d)(input, h, i, 1);
+ if (val == 0) {
+ continue;
+ }
+
+ long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THBlas_(axpy)(outDim,
+ val,
+ COL_PTR2(weight, offset), weight->stride[0],
+ ROW_PTR2(output, h), output->stride[1]);
+ } else {
+ THError("index out of bound. updateOutput: %d not between 1 and %d",
+ offset + 1, inDim);
+ }
+ }
+ }
+
+ THTensor* output_row = THTensor_(new)();
+ for (h = 0; h < batchSize; h++) {
+ THTensor_(select)(output_row, output, 0, h);
+ THTensor_(cadd)(output_row, bias, 1.0, output_row);
+ }
+ THTensor_(free)(output_row);
+ THTensor_(free)(weight);
+}
+
+void THNN_(SparseLinear_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay_,
+ accreal scale_)
+{
+ real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ long h, i, col, hp0, hp1;
+ long outDim = THTensor_(size)(weight, 0);
+ long inDim = THTensor_(size)(weight, 1);
+
+ THArgCheck(THNN_(checkInput)(input), 2,
+ "input must be in coo format, nnz x 3");
+ THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+ "gradWeight size wrong");
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
+ "gradBias size wrong");
+ THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
+ "gradOutput must be contiguous");
+
+ long nnz = THTensor_(size)(input, 0);
+
+ THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1);
+ THLongTensor_zero(csc);
+ weight = THTensor_(newContiguous)(weight);
+
+#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
+ for (i = 0; i < nnz; i++) {
+ hp0 = (long)(THNN_(get2d)(input, i, 1)) - 1;
+ hp1 = (i+1 == nnz) ?
+ inDim :
+ (long)(THNN_(get2d)(input, i+1, 1)) - 1;
+ if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
+ THLongTensor_set1d(csc, h+1, i+1);
+ }
+ }
+
+ // gradWeight += gradOutput * input
+#pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000)
+ for (col = 0; col < inDim; col++) {
+ long i_start = THLongTensor_get1d(csc, col);
+ long i_end = THLongTensor_get1d(csc, col+1);
+ for (i = i_start; i < i_end; i++) {
+ real val = scale * THNN_(get2d)(input, i, 2);
+
+ h = (long)(THNN_(get2d)(input, i, 0)) - 1;
+ long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THBlas_(axpy)(outDim,
+ val,
+ ROW_PTR2(gradOutput, h), gradOutput->stride[1],
+ COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
+ } else {
+        THError(
+            "index out of bounds. accGradParameters: %ld not between 1 and %ld",
+            offset + 1,
+            inDim);
+ }
+ }
+ }
+
+ // gradBias += gradOutput
+ THTensor* buf = THTensor_(new)();
+ THTensor_(sum)(buf, gradOutput, 0, 1);
+ THTensor_(cadd)(gradBias, gradBias, scale, buf);
+ THTensor_(free)(buf);
+ THLongTensor_free(csc);
+
+ if (weightDecay != 0) {
+ THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
+ }
+ THTensor_(free)(weight);
+}
+
+void THNN_(SparseLinear_legacyAccGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay_,
+ accreal scale_)
+{
+ real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ long h, i;
+ long outDim = THTensor_(size)(weight, 0);
+ long inDim = THTensor_(size)(weight, 1);
+
+ THArgCheck(THNN_(checkLegacyInput)(input), 2,
+ "input size must be batchsize x nnz x 2");
+ THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+ "gradWeight size wrong");
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
+ "gradBias size wrong");
+ THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
+ "gradOutput must be contiguous");
+
+ long batchSize = THTensor_(size)(input, 0);
+ long nnz = THTensor_(size)(input, 1);
+ THTensor_(resize2d)(gradOutput, batchSize, outDim);
+
+ // gradWeight += gradOutput * input
+#pragma omp parallel for private(h, i) schedule(static) if (\
+ batchSize * nnz * outDim > 10000)
+ for (i = 0; i < nnz; i++) {
+ for (h = 0; h < batchSize; h++) {
+ real val = scale * THNN_(get3d)(input, h, i, 1);
+ if (val == 0) {
+ continue;
+ }
+
+ long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THBlas_(axpy)(outDim,
+ val,
+ ROW_PTR2(gradOutput, h), gradOutput->stride[1],
+ COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
+ } else {
+        THError(
+            "index out of bounds. accGradParameters: %ld not between 1 and %ld",
+            offset + 1,
+            inDim);
+ }
+ }
+ }
+
+ // gradBias += gradOutput
+ THTensor* gradOutput_row = THTensor_(new)();
+ for (h = 0; h < batchSize; h++) {
+ THTensor_(select)(gradOutput_row, gradOutput, 0, h);
+ THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row);
+ }
+ THTensor_(free)(gradOutput_row);
+
+ if (weightDecay != 0) {
+ THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
+ }
+}
+
+void THNN_(SparseLinear_updateParameters)(
+ THNNState *state,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput,
+ accreal learningRate_)
+{
+ real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
+ long h, i;
+ long outDim = weight->size[0];
+ long inDim = weight->size[1];
+
+ THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+ "gradWeight size wrong");
+ THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
+ THArgCheck(THNN_(checkInput)(lastInput), 6,
+ "input must be in coo format, nnz x 3");
+
+
+ long nnz = THTensor_(size)(lastInput, 0);
+
+ // collect unique offsets of non-0 val in input
+ THTensor* offsets = THTensor_(newWithSize1d)(nnz);
+ long cnt = 0;
+ for (i = 0; i < nnz; i++) {
+ real val = THNN_(get2d)(lastInput, i, 2);
+ if (val == 0) {
+ continue;
+ }
+ long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THNN_(set1d)(offsets, cnt++, offset);
+ } else {
+      THError(
+          "index out of bounds. updateParameters: %ld not between 1 and %ld",
+          offset + 1,
+          inDim);
+ }
+ }
+  if (cnt == 0) {
+    THTensor_(free)(offsets);
+    return;
+  }
+ THTensor_(resize1d)(offsets, cnt);
+
+ THTensor* uniqueOffsets = THTensor_(new)();
+ THLongTensor* ri = THLongTensor_new();
+ THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
+ THLongTensor_free(ri);
+ THTensor_(free)(offsets);
+
+ cnt = 1;
+ real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
+ for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
+ if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
+ uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
+ }
+ }
+ THTensor_(resize1d)(uniqueOffsets, cnt);
+
+ // weight += -learningRate * gradWeight
+ THTensor_(cadd)(bias, bias, -learningRate, gradBias);
+#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
+ for (i = 0; i < cnt; i++) {
+ long offset = (long)uniqueOffsets_p[i];
+ THBlas_(axpy)(outDim,
+ -learningRate,
+ COL_PTR2(gradWeight, offset), gradWeight->stride[0],
+ COL_PTR2(weight, offset), weight->stride[0]);
+ }
+
+ THTensor_(free)(uniqueOffsets);
+}
+
+void THNN_(SparseLinear_legacyUpdateParameters)(
+ THNNState *state,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput,
+ accreal learningRate_)
+{
+ real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
+ long h, i;
+ long outDim = weight->size[0];
+ long inDim = weight->size[1];
+
+ THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+ "gradWeight size wrong");
+ THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
+ THArgCheck(THNN_(checkLegacyInput)(lastInput), 6,
+ "input size must be batchsize x nnz x 2");
+
+
+ long batchSize = THTensor_(size)(lastInput, 0);
+ long nnz = THTensor_(size)(lastInput, 1);
+
+ // collect unique offsets of non-0 val in input
+ THTensor* offsets = THTensor_(newWithSize1d)(batchSize * nnz);
+ long cnt = 0;
+ for (h = 0; h < batchSize; h++) {
+ for (i = 0; i < nnz; i++) {
+ real val = THNN_(get3d)(lastInput, h, i, 1);
+ if (val == 0 ) {
+ continue;
+ }
+ long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THNN_(set1d)(offsets, cnt++, offset);
+ } else {
+        THError(
+            "index out of bounds. updateParameters: %ld not between 1 and %ld",
+            offset + 1,
+            inDim);
+ }
+ }
+ }
+ THTensor_(resize1d)(offsets, cnt);
+
+ THTensor* uniqueOffsets = THTensor_(new)();
+ THLongTensor* ri = THLongTensor_new();
+ THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
+ THLongTensor_free(ri);
+ THTensor_(free)(offsets);
+
+ cnt = 1;
+ real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
+ for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
+ if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
+ uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
+ }
+ }
+ THTensor_(resize1d)(uniqueOffsets, cnt);
+
+ // weight += -learningRate * gradWeight
+ THTensor_(cadd)(bias, bias, -learningRate, gradBias);
+#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
+ for (i = 0; i < cnt; i++) {
+ long offset = (long)uniqueOffsets_p[i];
+ THBlas_(axpy)(outDim,
+ -learningRate,
+ COL_PTR2(gradWeight, offset), gradWeight->stride[0],
+ COL_PTR2(weight, offset), weight->stride[0]);
+ }
+
+ THTensor_(free)(uniqueOffsets);
+}
+
+void THNN_(SparseLinear_zeroGradParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput)
+{
+ long h, i, j;
+
+ long outDim = gradWeight->size[0];
+ long inDim = gradWeight->size[1];
+
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
+ THArgCheck(THNN_(checkInput)(lastInput), 4,
+ "input must be in coo format, nnz x 3");
+
+ THTensor_(zero)(gradBias);
+
+ long nnz = THTensor_(size)(lastInput, 0);
+
+#pragma omp parallel for private(i, j) schedule(static) if ( \
+ nnz * outDim > 10000)
+ for (i = 0; i < nnz; i++) {
+ if (THNN_(get2d)(lastInput, i, 2) == 0 ) {
+ continue;
+ }
+
+ long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ real* pGradWeight = COL_PTR2(gradWeight, offset);
+ if (gradWeight->stride[0] == 1) {
+ THVector_(fill)(pGradWeight, 0, outDim);
+ } else {
+ long stride = gradWeight->stride[0];
+ for (j = 0; j < outDim; ++j) {
+ pGradWeight[j * stride] = 0;
+ }
+ }
+ } else {
+      THError(
+          "index out of bounds. zeroGradParameters: %ld not between 1 and %ld",
+          offset + 1,
+          inDim);
+ }
+ }
+}
+
+void THNN_(SparseLinear_legacyZeroGradParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput)
+{
+ long h, i, j;
+
+ long outDim = gradWeight->size[0];
+ long inDim = gradWeight->size[1];
+
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
+ THArgCheck(THNN_(checkLegacyInput)(lastInput), 4,
+ "input size must be batchsize x nnz x 2");
+
+ THTensor_(zero)(gradBias);
+
+ long batchSize = THTensor_(size)(lastInput, 0);
+ long nnz = THTensor_(size)(lastInput, 1);
+
+#pragma omp parallel for private(h, i, j) schedule(static) if ( \
+ batchSize > 1 && batchSize * nnz * outDim > 10000)
+ for (h = 0; h < batchSize; h++) {
+ for (i = 0; i < nnz; i++) {
+      if (THNN_(get3d)(lastInput, h, i, 1) == 0) {
+ continue;
+ }
+
+ long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ real* pGradWeight = COL_PTR2(gradWeight, offset);
+ if (gradWeight->stride[0] == 1) {
+ THVector_(fill)(pGradWeight, 0, outDim);
+ } else {
+ long stride = gradWeight->stride[0];
+ for (j = 0; j < outDim; ++j) {
+ pGradWeight[j * stride] = 0;
+ }
+ }
+ } else {
+        THError(
+            "index out of bounds. zeroGradParameters: %ld not between 1 and %ld",
+            offset + 1,
+            inDim);
+ }
+ }
+ }
+}
+
+#undef ROW_PTR2
+#undef COL_PTR2
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveAveragePooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveAveragePooling.c
new file mode 100644
index 000000000..3675b42d7
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveAveragePooling.c
@@ -0,0 +1,258 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialAdaptiveAveragePooling.c"
+#else
+
+#define START_IND(a,b,c) (int)floor((float)(a * c) / b)
+#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b)
+// #define START_IND(a,b,c) a * c / b
+// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0
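+// START_IND/END_IND map output index a (of b outputs) onto the half-open
+// input range [floor(a*c/b), ceil((a+1)*c/b)) over c inputs, so window sizes
+// adapt and neighbouring windows may overlap when c is not a multiple of b.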
+
+static void THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ long nslices,
+ long iwidth,
+ long iheight,
+ long owidth,
+ long oheight,
+ long stridew,
+ long strideh,
+ long strided)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j;
+ for(i = 0; i < oheight; i++)
+ {
+ int y_start = START_IND(i, oheight, iheight);
+ int y_end = END_IND(i, oheight, iheight);
+ int kH = y_end-y_start;
+
+ for(j = 0; j < owidth; j++)
+ {
+
+ int x_start = START_IND(j, owidth, iwidth);
+ int x_end = END_IND(j, owidth, iwidth);
+ int kW = x_end-x_start;
+
+ /* local pointers */
+ real *ip = input_p + k*strided + y_start*strideh + x_start*stridew;
+ real *op = output_p + k*owidth*oheight + i*owidth + j;
+
+ /* compute local average: */
+ real sum = 0;
+ int x,y;
+ for(y = 0; y < kH; y++)
+ {
+ for(x = 0; x < kW; x++)
+ {
+ real val = *(ip + y*strideh + x*stridew);
+ sum += val;
+ }
+ }
+
+ /* set output to local average */
+ *op = sum / kW / kH;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialAdaptiveAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int owidth,
+ int oheight)
+{
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+
+ long istride_d;
+ long istride_h;
+ long istride_w;
+ long istride_b;
+
+ real *input_data;
+ real *output_data;
+
+
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 4)
+ {
+ istride_b = input->stride[0];
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ /* strides */
+ istride_d = input->stride[dimh-1];
+ istride_h = input->stride[dimh];
+ istride_w = input->stride[dimw];
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+ THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(input_data, output_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ istride_w,istride_h,
+ istride_d);
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(input_data+p*istride_b, output_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ istride_w,istride_h,
+ istride_d);
+ }
+ }
+}
+
+static void THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ long nslices,
+ long iwidth,
+ long iheight,
+ long owidth,
+ long oheight)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
+ real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
+
+ /* calculate average */
+ long i, j;
+ for(i = 0; i < oheight; i++)
+ {
+ int y_start = START_IND(i, oheight, iheight);
+ int y_end = END_IND(i, oheight, iheight);
+ int kH = y_end-y_start;
+
+ for(j = 0; j < owidth; j++)
+ {
+
+ int x_start = START_IND(j, owidth, iwidth);
+ int x_end = END_IND(j, owidth, iwidth);
+ int kW = x_end-x_start;
+
+ int x,y;
+ for(y = y_start; y < y_end; y++)
+ {
+ for(x = x_start; x < x_end; x++)
+ {
+ /* update gradient */
+ gradInput_p_k[y*iwidth + x] += gradOutput_p_k[i*owidth + j] / kW / kH;
+ }
+ }
+ }
+ }
+ }
+}
+
+void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput)
+{
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ int nslices;
+ int iheight;
+ int iwidth;
+ int oheight;
+ int owidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = gradOutput->size[dimh];
+ owidth = gradOutput->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
+ /* backprop */
+ if (input->nDimension == 3)
+ {
+ THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ else
+ {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
+
+#undef START_IND
+#undef END_IND \ No newline at end of file
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveMaxPooling.c
new file mode 100644
index 000000000..fff716e67
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveMaxPooling.c
@@ -0,0 +1,274 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.c"
+#else
+
+static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ THIndex_t *indx_p,
+ THIndex_t *indy_p,
+ long nslices,
+ long iwidth,
+ long iheight,
+ long owidth,
+ long oheight,
+ long stridew,
+ long strideh,
+ long strided)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j;
+ for(i = 0; i < oheight; i++)
+ {
+ int y_start = (int)floor((float)i / oheight * iheight);
+ int y_end = (int)ceil((float)(i + 1) / oheight * iheight);
+ int kH = y_end-y_start;
+
+ for(j = 0; j < owidth; j++)
+ {
+
+ int x_start = (int)floor((float)j / owidth * iwidth);
+ int x_end = (int)ceil((float)(j + 1) / owidth * iwidth);
+ int kW = x_end-x_start;
+
+ /* local pointers */
+ real *ip = input_p + k*strided + y_start*strideh + x_start*stridew;
+ real *op = output_p + k*owidth*oheight + i*owidth + j;
+ THIndex_t *indyp = indy_p + k*owidth*oheight + i*owidth + j;
+ THIndex_t *indxp = indx_p + k*owidth*oheight + i*owidth + j;
+
+ /* compute local max: */
+ long maxindex = -1;
+ real maxval = -FLT_MAX;
+ long tcntr = 0;
+ int x,y;
+ for(y = 0; y < kH; y++)
+ {
+ for(x = 0; x < kW; x++)
+ {
+ real val = *(ip + y*strideh + x*stridew);
+ if (val > maxval)
+ {
+ maxval = val;
+ maxindex = tcntr;
+ }
+ tcntr++;
+ }
+ }
+
+ /* set output to local max */
+ *op = maxval;
+
+ /* store location of max (x,y) */
+ *indyp = (maxindex / kW) + TH_INDEX_BASE;
+ *indxp = (maxindex % kW) + TH_INDEX_BASE;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int owidth,
+ int oheight)
+{
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+
+ long istride_d;
+ long istride_h;
+ long istride_w;
+ long istride_b;
+
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 4)
+ {
+ istride_b = input->stride[0];
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ /* strides */
+ istride_d = input->stride[dimh-1];
+ istride_h = input->stride[dimh];
+ istride_w = input->stride[dimw];
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nslices, oheight, owidth);
+ /* indices will contain i,j locations for each output point */
+ THIndexTensor_(resize4d)(indices, 2, nslices, oheight, owidth);
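+    /* plane 0 stores the row (y) index of each max, plane 1 the column (x)
+       index; see the frame call below, which passes the planes separately */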
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data, output_data,
+ indices_data+nslices*owidth*oheight, indices_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ istride_w,istride_h,
+ istride_d);
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+ /* indices will contain i,j locations for each output point */
+ THIndexTensor_(resize5d)(indices, 2, nbatch, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data+p*istride_b, output_data+p*nslices*owidth*oheight,
+ indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ istride_w,istride_h,
+ istride_d);
+ }
+ }
+}
+
+static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ THIndex_t *indx_p,
+ THIndex_t *indy_p,
+ long nslices,
+ long iwidth,
+ long iheight,
+ long owidth,
+ long oheight)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
+ real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
+ THIndex_t *indx_p_k = indx_p + k*owidth*oheight;
+ THIndex_t *indy_p_k = indy_p + k*owidth*oheight;
+
+ /* calculate max points */
+ long i, j;
+ for(i = 0; i < oheight; i++)
+ {
+ int y_start = (int)floor((float) i / oheight * iheight);
+ for(j = 0; j < owidth; j++)
+ {
+ int x_start = (int)floor((float) j / owidth * iwidth);
+ /* retrieve position of max */
+ long maxi = indy_p_k[i*owidth + j] - TH_INDEX_BASE + y_start;
+ long maxj = indx_p_k[i*owidth + j] - TH_INDEX_BASE + x_start;
+
+ /* update gradient */
+ gradInput_p_k[maxi*iwidth + maxj] += gradOutput_p_k[i*owidth + j];
+ }
+ }
+ }
+}
+
+void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices)
+{
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ int nslices;
+ int iheight;
+ int iwidth;
+ int oheight;
+ int owidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = gradOutput->size[dimh];
+ owidth = gradOutput->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ /* backprop */
+ if (input->nDimension == 3)
+ {
+ THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
+ indices_data+nslices*owidth*oheight, indices_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ else
+ {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
+ indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialAveragePooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAveragePooling.c
new file mode 100644
index 000000000..c063502e7
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAveragePooling.c
@@ -0,0 +1,329 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialAveragePooling.c"
+#else
+
+static inline void THNN_(SpatialAveragePooling_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ int kH, int kW, int dH, int dW, int padH, int padW,
+ bool ceil_mode) {
+
+ THArgCheck(kW > 0 && kH > 0, 5,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 8,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ THArgCheck(kW/2 >= padW && kH/2 >= padH, 2,
+ "pad should be smaller than half of kernel size, but got "
+ "padW = %d, padH = %d, kW = %d, kH = %d",
+ padW, padH, kW, kH);
+
+ long nInputPlane = input->size[dimh-1];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long outputHeight, outputWidth;
+ long nOutputPlane = nInputPlane;
+
+ if(ceil_mode)
+ {
+ outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ outputWidth = (long)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ }
+ else
+ {
+ outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ outputWidth = (long)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ }
+
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
+ }
+
+ if (outputWidth < 1 || outputHeight < 1)
+    THError("Given input size: (%ldx%ldx%ld). "
+            "Calculated output size: (%ldx%ldx%ld). Output size is too small",
+            nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+void THNN_(SpatialAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ bool ceil_mode,
+ bool count_include_pad)
+{
+ real *output_data;
+ real *input_data;
+
+ int dimw = 2;
+ int dimh = 1;
+ int dimc = 0;
+ long nbatch = 1;
+
+ long inputWidth;
+ long inputHeight;
+ long outputWidth;
+ long outputHeight;
+ long nInputPlane; // number of channels (or colors)
+
+ long k;
+
+ THNN_(SpatialAveragePooling_shapeCheck)
+ (input, NULL, kH, kW, dH, dW, padH, padW, ceil_mode);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimc++;
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+ nInputPlane = input->size[dimc];
+
+ if(ceil_mode)
+ {
+ outputWidth = (long)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ }
+ else
+ {
+ outputWidth = (long)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ }
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
+ }
+
+ if (input->nDimension == 3)
+ THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
+ else
+ THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
+ {
+ long p;
+ for(p = 0; p < nbatch; p++)
+ {
+ long xx, yy;
+ /* For all output pixels... */
+ real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
+ real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
+ long i;
+ for(i = 0; i < outputWidth*outputHeight; i++)
+ ptr_output[i] = 0;
+
+ for(yy = 0; yy < outputHeight; yy++)
+ {
+ for(xx = 0; xx < outputWidth; xx++)
+ {
+ /* Compute the mean of the input image... */
+ long hstart = yy * dH - padH;
+ long wstart = xx * dW - padW;
+ long hend = fminf(hstart + kH, inputHeight + padH);
+ long wend = fminf(wstart + kW, inputWidth + padW);
+ int pool_size = (hend - hstart) * (wend - wstart);
+ hstart = fmaxf(hstart, 0);
+ wstart = fmaxf(wstart, 0);
+ hend = fminf(hend, inputHeight);
+ wend = fminf(wend, inputWidth);
+
+ real sum = 0;
+
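+          /* pool_size counts padded cells; with count_include_pad the sum is
+             divided by the full window, otherwise only by the cells that
+             remain inside the image after clipping */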
+ int divide_factor;
+ if(count_include_pad)
+ divide_factor = pool_size;
+ else
+ divide_factor = (hend - hstart) * (wend - wstart);
+
+ long kx, ky;
+
+ for(ky = hstart; ky < hend; ky++)
+ {
+ for(kx = wstart; kx < wend; kx++)
+ sum += ptr_input[ky*inputWidth + kx];
+ }
+ /* Update output */
+ *ptr_output++ += sum/divide_factor;
+ }
+ }
+ }
+ }
+ THTensor_(free)(input);
+}
+
+void THNN_(SpatialAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ bool ceil_mode,
+ bool count_include_pad)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int dimc = 0;
+ long nbatch = 1;
+ long ndim = 3;
+
+ long inputWidth;
+ long inputHeight;
+ long outputWidth;
+ long outputHeight;
+ long nInputPlane; // number of channels (or colors)
+
+ real *gradOutput_data;
+ real *input_data, *gradInput_data;
+
+ long k;
+
+ THNN_(SpatialAveragePooling_shapeCheck)
+ (input, gradOutput, kH, kW, dH, dW, padH, padW, ceil_mode);
+
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimc++;
+ ndim = 4;
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+ nInputPlane = input->size[dimc];
+
+ if(ceil_mode)
+ {
+ outputWidth = (long)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ }
+ else
+ {
+ outputWidth = (long)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ }
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
+ }
+
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+
+ THTensor_(resizeAs)(gradInput, input);
+
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ THArgCheck(THTensor_(isContiguous)(gradInput), 4, "gradInput must be contiguous");
+
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
+ {
+ long p;
+ for(p = 0; p < nbatch; p++)
+ {
+ real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
+ long xx, yy;
+
+      real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
+
+      long i;
+      for(i = 0; i < inputWidth*inputHeight; i++)
+        ptr_gradInput[i] = 0.0;
+
+ for(yy = 0; yy < outputHeight; yy++)
+ {
+ for(xx = 0; xx < outputWidth; xx++)
+ {
+ long hstart = yy * dH - padH;
+ long wstart = xx * dW - padW;
+ long hend = fminf(hstart + kH, inputHeight + padH);
+ long wend = fminf(wstart + kW, inputWidth + padW);
+ int pool_size = (hend - hstart) * (wend - wstart);
+ hstart = fmaxf(hstart, 0);
+ wstart = fmaxf(wstart, 0);
+ hend = fminf(hend, inputHeight);
+ wend = fminf(wend, inputWidth);
+
+ real z = *ptr_gradOutput++;
+
+ int divide_factor;
+ if(count_include_pad)
+ divide_factor = pool_size;
+ else
+ divide_factor = (hend - hstart) * (wend - wstart);
+
+ long kx, ky;
+ for(ky = hstart ; ky < hend; ky++)
+ {
+ for(kx = wstart; kx < wend; kx++)
+ ptr_gradInput[ky*inputWidth + kx] += z/divide_factor;
+ }
+ }
+ }
+ }
+ }
+
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialClassNLLCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialClassNLLCriterion.c
new file mode 100644
index 000000000..d711c8590
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialClassNLLCriterion.c
@@ -0,0 +1,131 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialClassNLLCriterion.c"
+#else
+
+#define INITIAL_CHECK \
+ THArgCheck(THIndexTensor_(nDimension)(target) == 3, 3, \
+ "only batches of spatial targets supported (3D tensors)" \
+ " but got targets of dimension: %d", \
+ THIndexTensor_(nDimension)(target)); \
+ THArgCheck(THTensor_(nDimension)(input) == 4, 2, \
+ "only batches of spatial inputs supported (4D tensors), " \
+ "but got input of dimension: %d", THTensor_(nDimension)(input)); \
+ if (weights && THTensor_(nElement)(weights) != THTensor_(size)(input, 1)) { \
+ THError("weight tensor should be defined either for all or no classes"); \
+ } \
+ \
+ { \
+ long input0 = THTensor_(size)(input, 0); \
+ long input1 = THTensor_(size)(input, 1); \
+ long input2 = THTensor_(size)(input, 2); \
+ long input3 = THTensor_(size)(input, 3); \
+ long target0 = THIndexTensor_(size)(target, 0); \
+ long target1 = THIndexTensor_(size)(target, 1); \
+ long target2 = THIndexTensor_(size)(target, 2); \
+ THAssertMsg(input0 == target0 && input2 == target1 && input3 == target2, \
+ "size mismatch (got input: %ldx%ldx%ldx%ld, target: %ldx%ldx%ld)", \
+ input0, input1, input2, input3, target0, target1, target2); \
+ }
+
+void THNN_(SpatialClassNLLCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ THTensor *weights,
+ THTensor *total_weight)
+{
+ INITIAL_CHECK;
+
+ input = THTensor_(newContiguous)(input);
+ target = THIndexTensor_(newContiguous)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+
+ real *input_data = THTensor_(data)(input);
+ THIndex_t *target_data = THIndexTensor_(data)(target);
+ real *weights_data = weights ? THTensor_(data)(weights) : NULL;
+ real *output_data = THTensor_(data)(output);
+ real *total_weight_data = THTensor_(data)(total_weight);
+
+ long batch_size = THTensor_(size)(input, 0);
+ long n_classes = THTensor_(size)(input, 1);
+ long map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3);
+ long sample_size = map_size * n_classes;
+
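+  /* NLL reduction over the spatial map:
+     output = -sum over (b,h,w) of weight[target] * input[b][target][h][w],
+     where input is assumed to hold log-probabilities (e.g. LogSoftMax output) */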
+ real total_weight_acc = 0;
+ real output_acc = 0;
+ for (int b = 0; b < batch_size; b++) {
+ for (int elem = 0; elem < map_size; elem++) {
+ int cur_target = target_data[b * map_size + elem] - TH_INDEX_BASE;
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+
+ real cur_weight = weights ? weights_data[cur_target] : 1.0f;
+ total_weight_acc += cur_weight;
+ output_acc -= input_data[b * sample_size + cur_target * map_size + elem] * cur_weight;
+ }
+ }
+ *total_weight_data = total_weight_acc;
+ *output_data = output_acc;
+
+ if (sizeAverage && *total_weight_data)
+ *output_data /= *total_weight_data;
+
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+ if (weights)
+ THTensor_(free)(weights);
+}
+
+void THNN_(SpatialClassNLLCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ THTensor *weights,
+ THTensor *total_weight)
+{
+ INITIAL_CHECK;
+ THArgCheck(THTensor_(isContiguous)(gradInput), 4,
+ "gradInput must be contiguous");
+
+ real *total_weight_data = THTensor_(data)(total_weight);
+ if (*total_weight_data <= 0)
+ return;
+
+ target = THIndexTensor_(newContiguous)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+
+ THIndex_t *target_data = THIndexTensor_(data)(target);
+ real *weights_data = weights ? THTensor_(data)(weights) : NULL;
+ real *gradInput_data = THTensor_(data)(gradInput);
+
+ long batch_size = THTensor_(size)(input, 0);
+ long n_classes = THTensor_(size)(input, 1);
+ long map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3);
+ long sample_size = map_size * n_classes;
+
+ real normalize = sizeAverage ? *total_weight_data : 1.0f;
+
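+  /* the gradient is -weight[target] / normalize at each target entry and
+     zero elsewhere; gradInput is assumed to be zero-initialized by the caller */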
+ int b;
+ #pragma omp parallel for
+ for (b = 0; b < batch_size; b++) {
+ int elem;
+ for (elem = 0; elem < map_size; elem++) {
+ int cur_target = target_data[b * map_size + elem] - TH_INDEX_BASE;
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+
+ gradInput_data[b * sample_size + cur_target * map_size + elem] =
+ -(weights ? weights_data[cur_target] : 1.0f) / normalize;
+ }
+ }
+
+ THIndexTensor_(free)(target);
+ if (weights)
+ THTensor_(free)(weights);
+}
+
+#undef INITIAL_CHECK
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionLocal.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionLocal.c
new file mode 100644
index 000000000..6db5a5db9
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionLocal.c
@@ -0,0 +1,367 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialConvolutionLocal.c"
+#else
+
+static inline void THNN_(SpatialConvolutionLocal_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH,
+ int dW, int padH, int padW,
+ long inputHeight, long inputWidth,
+ long outputHeight, long outputWidth) {
+
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[2] / (kH * kW);
+ long nOutputPlane = weight->size[1];
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 3, 0, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(bias, 3, 1, outputHeight);
+ THNN_CHECK_DIM_SIZE(bias, 3, 2, outputWidth);
+ }
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+static THTensor* THNN_(view_weight_local)(THTensor *_weight)
+{
+ THTensor *weight = THTensor_(newContiguous)(_weight);
+ THArgCheck(weight->nDimension == 3 || weight->nDimension == 6, 4,
+ "weight tensor should be 3D or 6D - got %dD", weight->nDimension);
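+  // A 6D weight is assumed to be laid out as oH x oW x nOutputPlane x
+  // nInputPlane x kH x kW; fold it into the 3D view
+  // (oH*oW) x nOutputPlane x (nInputPlane*kH*kW) used by the batched GEMM.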
+ if (weight->nDimension == 6) {
+ long s1 = weight->size[0] * weight->size[1];
+ long s2 = weight->size[2];
+ long s3 = weight->size[3] * weight->size[4] * weight->size[5];
+ THTensor *old_weight = weight;
+ weight = THTensor_(newWithStorage3d)(weight->storage,
+ weight->storageOffset,
+ s1, -1, s2, -1, s3, -1);
+ THTensor_(free)(old_weight);
+ }
+ return weight;
+}
+
+static void THNN_(SpatialConvolutionLocal_updateOutput_frame)
+ (
+ THTensor *input, THTensor *output,
+ THTensor *weight, THTensor *bias, THTensor *finput,
+ int kW, int kH, int dW, int dH, int padW, int padH,
+ long nInputPlane, long inputWidth, long inputHeight,
+ long nOutputPlane, long outputWidth, long outputHeight)
+{
+ long i;
+ THTensor *output3d, *finput3d;
+
+ THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ outputWidth, outputHeight);
+
+ THTensor_(copy)(output, bias);
+
+ output3d = THTensor_(newWithStorage3d)
+ (output->storage, output->storageOffset,
+ outputHeight * outputWidth, 1,
+ nOutputPlane, outputHeight * outputWidth,
+ 1, nOutputPlane * outputHeight * outputWidth);
+
+ finput3d = THTensor_(newWithStorage3d)
+ (finput->storage, finput->storageOffset,
+ outputHeight * outputWidth, 1,
+ kW * kH * nInputPlane, outputHeight * outputWidth,
+ 1, kW * kH * nInputPlane * outputHeight * outputWidth);
+
+ // weight: oH*oW x nOutputPlane x nInputPlane*kH*kW
+ // finput3d: oH*oW x nInputPlane*kH*kW x 1
+ THTensor_(baddbmm)(output3d, 1.0, output3d, 1.0, weight, finput3d);
+ // output3d: oH*oW x nOutputPlane x 1
+
+ THTensor_(free)(output3d);
+ THTensor_(free)(finput3d);
+}
+
+void THNN_(SpatialConvolutionLocal_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight)
+{
+ weight = THNN_(view_weight_local)(weight);
+
+ THNN_(SpatialConvolutionLocal_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW,
+ inputHeight, inputWidth, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+
+ long nInputPlane = THTensor_(size)(weight, 2)/ (kW * kH);
+ long nOutputPlane = THTensor_(size)(weight, 1);
+
+ if(input->nDimension == 3)
+ {
+ THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth);
+ THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+
+ THNN_(SpatialConvolutionLocal_updateOutput_frame)
+ (input, output, weight, bias, finput,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+ THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth);
+ THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth);
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+ THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(SpatialConvolutionLocal_updateOutput_frame)
+ (input_t, output_t, weight, bias, finput_t,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+
+ THTensor_(free)(input_t);
+ THTensor_(free)(output_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+}
+
+
+static void THNN_(SpatialConvolutionLocal_updateGradInput_frame)
+ (THTensor *gradInput, THTensor *gradOutput,
+ THTensor *weight, THTensor *fgradInput,
+ int kW, int kH, int dW, int dH, int padW, int padH,
+ long nInputPlane, long inputWidth, long inputHeight,
+ long nOutputPlane, long outputWidth, long outputHeight)
+{
+ THTensor *gradOutput3d, *fgradInput3d;
+ gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset,
+ outputHeight*outputWidth, 1,
+ nOutputPlane, outputHeight*outputWidth,
+ 1, nOutputPlane*outputHeight*outputWidth);
+ fgradInput3d = THTensor_(newWithStorage3d)(fgradInput->storage, fgradInput->storageOffset,
+ outputHeight*outputWidth, 1,
+ kW*kH*nInputPlane, outputHeight*outputWidth,
+ 1, kW*kH*nInputPlane*outputHeight*outputWidth);
+ // weight: oH*oW x nInputPlane*kH*kW x nOutputPlane
+ // gradOutput3d: oH*oW x nOutputPlane x 1
+ THTensor_(baddbmm)(fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d);
+ // fgradInput3d: oH*oW x nInputPlane*kH*kW x 1
+
+ THTensor_(free)(gradOutput3d);
+ THTensor_(free)(fgradInput3d);
+
+ THTensor_(zero)(gradInput);
+
+ THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ outputWidth, outputHeight);
+
+}
+
+void THNN_(SpatialConvolutionLocal_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight)
+{
+ weight = THNN_(view_weight_local)(weight);
+
+ THNN_(SpatialConvolutionLocal_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW,
+ inputHeight, inputWidth, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ long nInputPlane = THTensor_(size)(weight,2)/(kW*kH);
+ long nOutputPlane = THTensor_(size)(weight,1);
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(resizeAs)(fgradInput, finput);
+
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 1, 2);
+
+ if(input->nDimension == 3)
+ {
+ THNN_(SpatialConvolutionLocal_updateGradInput_frame)
+ (gradInput, gradOutput, tweight,
+ fgradInput, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+ THNN_(SpatialConvolutionLocal_updateGradInput_frame)
+ (gradInput_t, gradOutput_t, tweight, fgradInput_t,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+
+ THTensor_(free)(gradInput_t);
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(fgradInput_t);
+ }
+ }
+
+ THTensor_(free)(tweight);
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+static void THNN_(SpatialConvolutionLocal_accGradParameters_frame)
+ (THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
+ THTensor *finput, real scale,
+ int kW, int kH, int dW, int dH, int padW, int padH,
+ long nInputPlane, long inputWidth, long inputHeight,
+ long nOutputPlane, long outputWidth, long outputHeight)
+{
+
+ THTensor *gradOutput3d, *finput3d;
+ gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset,
+ outputHeight*outputWidth, 1,
+ nOutputPlane, outputHeight*outputWidth,
+ 1, nOutputPlane*outputHeight*outputWidth);
+ finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset,
+ outputHeight*outputWidth, 1,
+ 1, kW*kH*nInputPlane*outputHeight*outputWidth,
+ kW*kH*nInputPlane, outputHeight*outputWidth);
+ // gradOutput3d: oH*oW x nOutputPlane x 1
+ // finput3d: oH*oW x 1 x kW*kH*nInputPlane
+ THTensor_(baddbmm)(gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d);
+ // gradWeight: oH*oW x nOutputPlane x kW*kH*nInputPlane
+
+ THTensor_(cadd)(gradBias, gradBias, scale, gradOutput);
+
+ THTensor_(free)(gradOutput3d);
+ THTensor_(free)(finput3d);
+}
+
+void THNN_(SpatialConvolutionLocal_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight,
+ accreal scale_)
+{
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ gradWeight = THNN_(view_weight_local)(gradWeight);
+
+ THNN_(SpatialConvolutionLocal_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW,
+ inputHeight, inputWidth, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ long nInputPlane = THTensor_(size)(gradWeight,2)/(kW*kH);
+ long nOutputPlane = THTensor_(size)(gradWeight,1);
+
+ if(input->nDimension == 3)
+ {
+ THNN_(SpatialConvolutionLocal_accGradParameters_frame)
+ (gradOutput, gradWeight, gradBias, finput, scale,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(SpatialConvolutionLocal_accGradParameters_frame)
+ (gradOutput_t, gradWeight, gradBias, finput_t, scale,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(gradWeight);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMM.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMM.c
new file mode 100644
index 000000000..28fea517c
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMM.c
@@ -0,0 +1,377 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialConvolutionMM.c"
+#else
+
+static inline void THNN_(SpatialConvolutionMM_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH, int dW, int padH, int padW) {
+
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THNN_ARGCHECK(weight->nDimension == 2 || weight->nDimension == 4, 5, weight,
+ "2D or 4D weight tensor expected, but got: %s");
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[1] / (kH * kW);
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[0];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ if (outputWidth < 1 || outputHeight < 1)
+    THError("Given input size: (%ld x %ld x %ld). "
+            "Calculated output size: (%ld x %ld x %ld). Output size is too small",
+            nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+static THTensor* THNN_(view_weight_MM2d)(THTensor *weight) {
+ weight = THTensor_(newContiguous)(weight);
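+  // Fold a 4D weight (nOutputPlane x nInputPlane x kH x kW) into the 2D view
+  // nOutputPlane x (nInputPlane*kH*kW) expected by the GEMM below.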
+ if (weight->nDimension == 4) {
+ long s1 = weight->size[0];
+ long s2 = weight->size[1] * weight->size[2] * weight->size[3];
+ THTensor *old_weight = weight;
+ weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset,
+ s1, -1, s2, -1);
+ THTensor_(free)(old_weight);
+ }
+ return weight;
+}
+
+static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ long nInputPlane,
+ long inputWidth,
+ long inputHeight,
+ long nOutputPlane,
+ long outputWidth,
+ long outputHeight)
+{
+ long i;
+ THTensor *output2d;
+
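+  // unfolded_copy is the im2col step: finput becomes a
+  // (nInputPlane*kH*kW) x (outputHeight*outputWidth) matrix, so the
+  // convolution reduces to the single GEMM output2d = bias + weight * finput.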
+ THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ outputWidth, outputHeight);
+
+ output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset,
+ nOutputPlane, -1,
+ outputHeight*outputWidth, -1);
+ if (bias) {
+ for(i = 0; i < nOutputPlane; i++)
+ THVector_(fill)
+ (output->storage->data + output->storageOffset + output->stride[0] * i,
+ THTensor_(get1d)(bias, i), outputHeight*outputWidth);
+ } else {
+ THTensor_(zero)(output);
+ }
+
+ THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);
+
+ THTensor_(free)(output2d);
+}
+
+void THNN_(SpatialConvolutionMM_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ weight = THNN_(view_weight_MM2d)(weight);
+
+ THNN_(SpatialConvolutionMM_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW);
+
+ input = THTensor_(newContiguous)(input);
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ long nInputPlane = input->size[dimf];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[0];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ if(input->nDimension == 3)
+ {
+ THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth);
+ THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+
+ THNN_(SpatialConvolutionMM_updateOutput_frame)
+ (input, output, weight, bias, finput,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+ THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth);
+ THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth);
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+ THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(SpatialConvolutionMM_updateOutput_frame)
+ (input_t, output_t, weight, bias, finput_t,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+
+ THTensor_(free)(input_t);
+ THTensor_(free)(output_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+}
+
+static void THNN_(SpatialConvolutionMM_updateGradInput_frame)(
+ THTensor *gradInput,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)
+ (gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2], -1);
+ THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
+ THTensor_(free)(gradOutput2d);
+
+ THTensor_(zero)(gradInput);
+
+ THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH,
+ padW, padH,
+ gradInput->size[0], gradInput->size[2], gradInput->size[1],
+ gradOutput->size[2], gradOutput->size[1]);
+}
+
+void THNN_(SpatialConvolutionMM_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ weight = THNN_(view_weight_MM2d)(weight);
+
+ THNN_(SpatialConvolutionMM_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(resizeAs)(fgradInput, finput);
+
+ // depending on the BLAS library, fgradInput (result tensor) might
+ // be left uninitialized on zero alpha, which might lead to weird behavior
+ // hence, to be safe, zero it
+ THTensor_(zero)(fgradInput);
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 0, 1);
+
+ if(input->nDimension == 3)
+ {
+ THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput,
+ tweight, fgradInput,
+ kW, kH, dW, dH, padW, padH);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+ THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t,
+ tweight, fgradInput_t,
+ kW, kH, dW, dH, padW, padH);
+
+ THTensor_(free)(gradInput_t);
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(fgradInput_t);
+ }
+ }
+
+ THTensor_(free)(tweight);
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+static void THNN_(SpatialConvolutionMM_accGradParameters_frame)(
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ real scale)
+{
+ long i;
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)
+ (gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2], -1);
+
+ THTensor *tfinput = THTensor_(new)();
+ THTensor_(transpose)(tfinput, finput, 0, 1);
+ THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, tfinput);
+ THTensor_(free)(tfinput);
+
+ if (gradBias) {
+ for(i = 0; i < gradBias->size[0]; i++)
+ {
+ long k;
+ real sum = 0;
+ real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
+ for(k = 0; k < gradOutput2d->size[1]; k++)
+ sum += data[k];
+ (gradBias->storage->data + gradBias->storageOffset)[i] += scale*sum;
+ }
+ }
+
+ THTensor_(free)(gradOutput2d);
+}
+
+void THNN_(SpatialConvolutionMM_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ accreal scale_)
+{
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ if (gradBias)
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ gradWeight = THNN_(view_weight_MM2d)(gradWeight);
+
+ THNN_(SpatialConvolutionMM_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ if(input->nDimension == 3)
+ {
+ THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight,
+ gradBias, finput, scale);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight,
+ gradBias, finput_t, scale);
+
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(gradWeight);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMap.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMap.c
new file mode 100644
index 000000000..142a03551
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMap.c
@@ -0,0 +1,277 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialConvolutionMap.c"
+#else
+
+void THNN_(SpatialConvolutionMap_updateOutput)(
+ THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias,
+ THTensor *connTable, int nInputPlane, int nOutputPlane,
+ int dW, int dH)
+{
+ THArgCheck(
+ weight != NULL && weight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == weight->size[0], 4,
+ "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
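+  // Each row k of connTable is a 1-based (inputPlane, outputPlane) pair,
+  // read below as connTable_data[k*2+0] / connTable_data[k*2+1].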
+
+ int dimw = 2;
+ int dimh = 1;
+ int dimc = 0;
+ long nbatch = 1;
+
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected");
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimc++;
+ dimw++;
+ dimh++;
+ }
+
+ const long kH = weight->size[1];
+ const long kW = weight->size[2];
+
+ THArgCheck(input->size[dimc] >= nInputPlane, 2, "invalid number of input planes");
+ THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH, 2, "input image smaller than kernel size");
+
+ const long input_w = input->size[dimw];
+ const long input_h = input->size[dimh];
+ const long output_w = (input_w - kW) / dW + 1;
+ const long output_h = (input_h - kH) / dH + 1;
+
+ if (input->nDimension == 3)
+ THTensor_(resize3d)(output, nOutputPlane, output_h, output_w);
+ else
+ THTensor_(resize4d)(output, input->size[0], nOutputPlane, output_h, output_w);
+
+ /* contiguous */
+ input = THTensor_(newContiguous)(input);
+ output = THTensor_(newContiguous)(output);
+ weight = THTensor_(newContiguous)(weight);
+ bias = bias ? THTensor_(newContiguous)(bias) : bias;
+ connTable = THTensor_(newContiguous)(connTable);
+
+ /* get raw pointers */
+ real *input_data = THTensor_(data)(input);
+ real *output_data = THTensor_(data)(output);
+ real *weight_data = THTensor_(data)(weight);
+  real *bias_data = bias ? THTensor_(data)(bias) : NULL;
+ real *connTable_data = THTensor_(data)(connTable);
+
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nOutputPlane; p++)
+ {
+ long m;
+ for (m = 0; m < nbatch; m++)
+ {
+ /* add bias */
+ real *ptr_output = output_data + p*output_w*output_h + m*nOutputPlane*output_w*output_h;
+ long j, k;
+      real z = bias_data ? bias_data[p] : 0;
+ for (j = 0; j < output_h*output_w; j++)
+ ptr_output[j] = z;
+
+ /* convolve all maps */
+ int nweight = connTable->size[0];
+ for (k = 0; k < nweight; k++)
+ {
+ /* get offsets for input/output */
+ int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
+ int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
+
+ if (o == p)
+ {
+ THTensor_(validXCorr2Dptr)(
+ output_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h,
+ 1.0,
+ input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w,
+ weight_data + k*kW*kH,
+ kH, kW,
+ dH, dW
+ );
+ }
+ }
+ }
+ }
+
+ /* clean up */
+ THTensor_(free)(input);
+ THTensor_(free)(output);
+ THTensor_(free)(weight);
+ if (bias) THTensor_(free)(bias);
+ THTensor_(free)(connTable);
+}
+
+void THNN_(SpatialConvolutionMap_updateGradInput)(
+ THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *bias,
+ THTensor *connTable, int nInputPlane, int nOutputPlane,
+ int dW, int dH)
+{
+ THArgCheck(
+ weight != NULL && weight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == weight->size[0], 5,
+ "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
+
+ /* and dims */
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ const long input_h = input->size[dimh];
+ const long input_w = input->size[dimw];
+ const long output_h = gradOutput->size[dimh];
+ const long output_w = gradOutput->size[dimw];
+ const long kH = weight->size[1];
+ const long kW = weight->size[2];
+
+ /* contiguous */
+ gradInput = THTensor_(newContiguous)(gradInput);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ weight = THTensor_(newContiguous)(weight);
+ connTable = THTensor_(newContiguous)(connTable);
+
+ /* Resize/Zero */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* get raw pointers */
+ real *gradInput_data = THTensor_(data)(gradInput);
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *weight_data = THTensor_(data)(weight);
+ real *connTable_data = THTensor_(data)(connTable);
+
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nInputPlane; p++)
+ {
+ long m;
+ for (m = 0; m < nbatch; m++)
+ {
+ long k;
+ /* backward all */
+ int nkernel = connTable->size[0];
+ for (k = 0; k < nkernel; k++)
+ {
+ int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
+ int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
+ if (i == p)
+ {
+ /* gradient to input */
+ THTensor_(fullConv2Dptr)(
+ gradInput_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, 1.0,
+ gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, output_h, output_w,
+ weight_data + k*kW*kH, kH, kW, dH, dW
+ );
+ }
+ }
+ }
+ }
+
+ /* clean up */
+ THTensor_(free)(gradInput);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+ THTensor_(free)(connTable);
+}
+
+void THNN_(SpatialConvolutionMap_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *connTable,
+ int nInputPlane,
+ int nOutputPlane,
+ int dW, int dH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THArgCheck(
+ gradWeight != NULL && gradWeight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
+ "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
+
+ /* and dims */
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ const long input_h = input->size[dimh];
+ const long input_w = input->size[dimw];
+ const long output_h = gradOutput->size[dimh];
+ const long output_w = gradOutput->size[dimw];
+ const long kH = gradWeight->size[1];
+ const long kW = gradWeight->size[2];
+
+ /* contiguous */
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+
+ /* get raw pointers */
+ real *input_data = THTensor_(data)(input);
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *gradWeight_data = THTensor_(data)(gradWeight);
+ real *gradBias_data = THTensor_(data)(gradBias);
+
+ long k;
+ /* gradients wrt bias */
+#pragma omp parallel for private(k)
+ for (k = 0; k < nOutputPlane; k++)
+ {
+ long m;
+ for (m = 0; m < nbatch; m++)
+ {
+ real *ptr_gradOutput = gradOutput_data + k*output_w*output_h + m*nOutputPlane*output_w*output_h;
+ long l;
+ for (l = 0; l < output_h*output_w; l++)
+ gradBias_data[k] += scale*ptr_gradOutput[l];
+ }
+ }
+
+ /* gradients wrt weight */
+ const int nkernel = connTable->size[0];
+#pragma omp parallel for private(k)
+ for (k = 0; k < nkernel; k++)
+ {
+ long m;
+ for (m = 0; m < nbatch; m++)
+ {
+ int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE;
+ int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE;
+
+ /* gradient to kernel */
+ THTensor_(validXCorr2DRevptr)(
+ gradWeight_data + k*kW*kH,
+ scale,
+ input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w,
+ gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h , output_h, output_w,
+ dH, dW
+ );
+ }
+ }
+
+ /* clean up */
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialDepthWiseConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDepthWiseConvolution.c
new file mode 100644
index 000000000..efb66a3e3
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDepthWiseConvolution.c
@@ -0,0 +1,528 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialDepthWiseConvolution.c"
+#else
+
+static inline void THNN_(SpatialDepthWiseConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH, int dW, int padH, int padW) {
+
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THNN_ARGCHECK(weight->nDimension == 4, 5, weight,
+ "2D or 4D weight tensor expected, but got: %s");
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 2, 0, weight->size[0]);
+ THNN_CHECK_DIM_SIZE(bias, 2, 1, weight->size[1]);
+ }
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[1];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[0];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%d x %d x %d). "
+ "Calculated output size: (%d x %d x %d). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nOutputPlane*nInputPlane,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimf, nInputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimh, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimw, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimw + 1, outputWidth);
+ }
+}
+
+static void THNN_(SpatialDepthWiseConvolution_updateOutput_frame)(
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ long nInputPlane,
+ long inputWidth,
+ long inputHeight,
+ long nOutputPlane,
+ long outputWidth,
+ long outputHeight)
+{
+ long i;
+ THTensor *output2d;
+
+ THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ outputWidth, outputHeight);
+
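+ /* finput now holds the unfolded (im2col-style) patches of the single input
+ plane, shaped (kW*kH) x (outputHeight*outputWidth); the convolution then
+ reduces to one matrix product with the (nOutputPlane x kH*kW) weight below. */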
+ output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset,
+ nOutputPlane, -1,
+ outputHeight*outputWidth, -1);
+ if (bias) {
+ for(i = 0; i < nOutputPlane; i++)
+ THVector_(fill)
+ (output->storage->data + output->storageOffset + output->stride[0] * i,
+ THTensor_(get1d)(bias, i), outputHeight*outputWidth);
+ } else {
+ THTensor_(zero)(output);
+ }
+
+ THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);
+
+ THTensor_(free)(output2d);
+}
+
+void THNN_(SpatialDepthWiseConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ long nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kH*kW) : weight->size[1];
+ long nOutputPlane = weight->size[0];
+ if (weight->nDimension == 2) {
+ THTensor_(resize4d)(weight, nOutputPlane, nInputPlane, kH, kW);
+ }
+
+ THNN_(SpatialDepthWiseConvolution_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW);
+
+ THTensor *_weight = THTensor_(newTranspose)(weight, 0, 1);
+ weight = THTensor_(newContiguous)(_weight);
+
+ THTensor *_bias = NULL;
+ if(bias) {
+ _bias = THTensor_(newTranspose)(bias, 0, 1);
+ bias = THTensor_(newContiguous)(_bias);
+ }
+
+ // resize weight
+ long s1 = weight->size[0];
+ long s2 = weight->size[1];
+ long s3 = weight->size[2] * weight->size[3];
+ weight = THTensor_(newWithStorage3d)(weight->storage, weight->storageOffset,
+ s1, -1, s2, -1, s3, -1);
+
+ input = THTensor_(newContiguous)(input);
+
+ int ndim = input->nDimension;
+
+ int batch = 1;
+ if (ndim == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ }
+
+ long inputHeight = input->size[2];
+ long inputWidth = input->size[3];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ long T = input->size[0];
+ long t;
+
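+ /* The output is laid out 5D as (batch, nInputPlane, nOutputPlane, oH, oW):
+ every input channel gets its own bank of nOutputPlane filters (a depth
+ multiplier), and the result is flattened to nInputPlane*nOutputPlane
+ channels once the loop below finishes. */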
+ THTensor_(resize5d)(output, T, nInputPlane, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize4d)(finput, T, nInputPlane, kW*kH*1, outputHeight*outputWidth);
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+ THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ long i;
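+ /* Note: this and the similar inner loops in the backward passes are nested
+ parallel regions; OpenMP serializes them by default unless nested
+ parallelism is enabled (e.g. via OMP_NESTED). */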
+#pragma omp parallel for private(i)
+ for(i = 0; i < nInputPlane; i++)
+ {
+ THTensor *weight_i = THTensor_(newSelect)(weight, 0, i);
+ THTensor *input_i = THTensor_(newNarrow)(input_t, 0, i, 1);
+ THTensor *output_i = THTensor_(newSelect)(output_t, 0, i);
+ THTensor *finput_i = THTensor_(newSelect)(finput_t, 0, i);
+ THTensor *bias_i = NULL;
+ if(bias) {
+ bias_i = THTensor_(newSelect)(bias, 0, i);
+ }
+ THNN_(SpatialDepthWiseConvolution_updateOutput_frame)
+ (input_i, output_i, weight_i, bias_i, finput_i,
+ kW, kH, dW, dH, padW, padH,
+ 1, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+
+ THTensor_(free)(input_i);
+ THTensor_(free)(weight_i);
+ THTensor_(free)(bias_i);
+ THTensor_(free)(output_i);
+ THTensor_(free)(finput_i);
+ }
+ THTensor_(free)(input_t);
+ THTensor_(free)(output_t);
+ THTensor_(free)(finput_t);
+ }
+
+ THTensor_(free)(weight);
+ THTensor_(free)(_weight);
+ THTensor_(free)(bias);
+ THTensor_(free)(_bias);
+ THTensor_(resize4d)(output, T, nInputPlane * nOutputPlane, outputHeight, outputWidth);
+
+ if (batch == 0) {
+ THTensor_(select)(output, NULL, 0, 0);
+ THTensor_(select)(input, NULL, 0, 0);
+ THTensor_(select)(finput, NULL, 0, 0);
+ }
+ THTensor_(free)(input);
+}
+
+static void THNN_(SpatialDepthWiseConvolution_updateGradInput_frame)(
+ THTensor *gradInput,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)
+ (gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2], -1);
+ THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
+ THTensor_(free)(gradOutput2d);
+
+ THTensor_(zero)(gradInput);
+
+ THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH,
+ padW, padH,
+ gradInput->size[0], gradInput->size[2], gradInput->size[1],
+ gradOutput->size[2], gradOutput->size[1]);
+}
+
+void THNN_(SpatialDepthWiseConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ long nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kH*kW) : weight->size[1];
+ long nOutputPlane = weight->size[0];
+ if (weight->nDimension == 2) {
+ THTensor_(resize4d)(weight, nOutputPlane, nInputPlane, kH, kW);
+ }
+ gradOutput = THTensor_(newWithTensor)(gradOutput);
+
+ if (input->nDimension == 3) {
+ if (gradOutput->nDimension == 3) {
+ THTensor_(resize4d)(gradOutput, nInputPlane, nOutputPlane, gradOutput->size[1], gradOutput->size[2]);
+ }
+ }
+ else
+ {
+ if (gradOutput->nDimension == 4) {
+ THTensor_(resize5d)(gradOutput, gradOutput->size[0], nInputPlane, nOutputPlane, gradOutput->size[2], gradOutput->size[3]);
+ }
+ }
+
+ THNN_(SpatialDepthWiseConvolution_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW);
+
+ THTensor *_weight = THTensor_(newTranspose)(weight, 0, 1);
+ weight = THTensor_(newContiguous)(_weight);
+
+ // resize weight
+ long s1 = weight->size[0];
+ long s2 = weight->size[1];
+ long s3 = weight->size[2] * weight->size[3];
+ weight = THTensor_(newWithStorage3d)(weight->storage, weight->storageOffset,
+ s1, -1, s2, -1, s3, -1);
+
+ input = THTensor_(newContiguous)(input);
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputHeight = input->size[2];
+ long inputWidth = input->size[3];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ long T = input->size[0];
+ long t;
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(resize4d)(fgradInput, T, nInputPlane, kW*kH*1, outputHeight*outputWidth);
+
+ // depending on the BLAS library, fgradInput (result tensor) might
+ // be left uninitialized on zero alpha, which might lead to weird behavior
+ // hence, to be safe, zero it
+ THTensor_(zero)(fgradInput);
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+ long i;
+#pragma omp parallel for private(i)
+ for(i = 0; i < nInputPlane; i++)
+ {
+ THTensor *weight_i = THTensor_(newSelect)(weight, 0, i);
+ THTensor *gradInput_i = THTensor_(newNarrow)(gradInput_t, 0, i, 1);
+ THTensor *gradOutput_i = THTensor_(newSelect)(gradOutput_t, 0, i);
+ THTensor *fgradInput_i = THTensor_(newSelect)(fgradInput_t, 0, i);
+
+ THTensor_(transpose)(weight_i, weight_i, 0, 1);
+
+ THNN_(SpatialDepthWiseConvolution_updateGradInput_frame)(gradInput_i, gradOutput_i,
+ weight_i, fgradInput_i,
+ kW, kH, dW, dH, padW, padH);
+
+ THTensor_(free)(gradInput_i);
+ THTensor_(free)(weight_i);
+ THTensor_(free)(gradOutput_i);
+ THTensor_(free)(fgradInput_i);
+ }
+
+ THTensor_(free)(gradInput_t);
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(fgradInput_t);
+ }
+
+ if (batch == 0) {
+ THTensor_(select)(gradOutput, NULL, 0, 0);
+ THTensor_(select)(input, NULL, 0, 0);
+ THTensor_(select)(gradInput, NULL, 0, 0);
+ THTensor_(select)(fgradInput, NULL, 0, 0);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+ THTensor_(free)(_weight);
+}
+
+static void THNN_(SpatialDepthWiseConvolution_accGradParameters_frame)(
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ accreal scale)
+{
+ long i;
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)
+ (gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2], -1);
+
+ THTensor_(transpose)(finput, finput, 0, 1);
+ THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput);
+ THTensor_(transpose)(finput, finput, 0, 1);
+
+ if (gradBias) {
+ for(i = 0; i < gradBias->size[0]; i++)
+ {
+ long k;
+ real sum = 0;
+ real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
+ for(k = 0; k < gradOutput2d->size[1]; k++)
+ sum += data[k];
+ (gradBias->storage->data + gradBias->storageOffset)[i] += scale*sum;
+ }
+ }
+
+ THTensor_(free)(gradOutput2d);
+}
+
+void THNN_(SpatialDepthWiseConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ accreal scale)
+{
+ long nInputPlane = gradWeight->nDimension == 2 ? gradWeight->size[1]/(kH*kW) : gradWeight->size[1];
+ long nOutputPlane = gradWeight->size[0];
+ if (gradWeight->nDimension == 2) {
+ THTensor_(resize4d)(gradWeight, nOutputPlane, nInputPlane, kH, kW);
+ }
+
+ gradOutput = THTensor_(newWithTensor)(gradOutput);
+ if (input->nDimension == 3) {
+ if (gradOutput->nDimension == 3) {
+ THTensor_(resize4d)(gradOutput, nInputPlane, nOutputPlane, gradOutput->size[1], gradOutput->size[2]);
+ }
+ }
+ else
+ {
+ if (gradOutput->nDimension == 4) {
+ THTensor_(resize5d)(gradOutput, gradOutput->size[0], nInputPlane, nOutputPlane, gradOutput->size[2], gradOutput->size[3]);
+ }
+ }
+
+ THNN_(SpatialDepthWiseConvolution_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW);
+
+ // Transpose gradWeight & gradBias
+ THTensor_(transpose)(gradWeight, NULL, 0, 1);
+ THTensor *_gradWeight;
+ _gradWeight = gradWeight;
+ gradWeight = THTensor_(newContiguous)(gradWeight);
+
+ THTensor *_gradBias = NULL;
+ if(gradBias) {
+ THTensor_(transpose)(gradBias, NULL, 0, 1);
+ _gradBias = gradBias;
+ gradBias = THTensor_(newContiguous)(gradBias);
+ }
+
+ // resize gradWeight
+ long s1 = gradWeight->size[0];
+ long s2 = gradWeight->size[1];
+ long s3 = gradWeight->size[2] * gradWeight->size[3];
+ gradWeight = THTensor_(newWithStorage3d)(gradWeight->storage, gradWeight->storageOffset,
+ s1, -1, s2, -1, s3, -1);
+
+ input = THTensor_(newContiguous)(input);
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputHeight = input->size[2];
+ long inputWidth = input->size[3];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ long T = input->size[0];
+ long t;
+ THTensor_(resize4d)(finput, T, nInputPlane, kW*kH*1, outputHeight*outputWidth);
+
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+ long i;
+#pragma omp parallel for private(i)
+ for(i = 0; i < nInputPlane; i++)
+ {
+ THTensor *finput_i = THTensor_(newSelect)(finput_t, 0, i);
+ THTensor *gradOutput_i = THTensor_(newSelect)(gradOutput_t, 0, i);
+ THTensor *gradWeight_i = THTensor_(newSelect)(gradWeight, 0, i);
+ THTensor *gradBias_i = NULL;
+ if(gradBias) {
+ gradBias_i = THTensor_(newSelect)(gradBias, 0, i);
+ }
+ THNN_(SpatialDepthWiseConvolution_accGradParameters_frame)(gradOutput_i, gradWeight_i,
+ gradBias_i, finput_i, scale);
+
+ THTensor_(free)(finput_i);
+ THTensor_(free)(gradOutput_i);
+ THTensor_(free)(gradWeight_i);
+ THTensor_(free)(gradBias_i);
+ }
+
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(finput_t);
+ }
+
+ // Copy back and transpose back
+ THTensor_(transpose)(_gradWeight, NULL, 0, 1);
+ THTensor_(resize4d)(_gradWeight, nInputPlane, nOutputPlane, kH, kW);
+ THTensor_(copy)(_gradWeight, gradWeight);
+ THTensor_(transpose)(_gradWeight, NULL, 0, 1);
+
+ if(gradBias) {
+ THTensor_(transpose)(_gradBias, NULL, 0, 1);
+ THTensor_(resize2d)(_gradBias, nInputPlane, nOutputPlane);
+ THTensor_(copy)(_gradBias, gradBias);
+ THTensor_(transpose)(_gradBias, NULL, 0, 1);
+ }
+
+ if (batch == 0) {
+ THTensor_(select)(gradOutput, NULL, 0, 0);
+ THTensor_(select)(input, NULL, 0, 0);
+ THTensor_(select)(finput, NULL, 0, 0);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(gradWeight);
+ THTensor_(free)(gradBias);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedConvolution.c
new file mode 100644
index 000000000..897cc0da4
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedConvolution.c
@@ -0,0 +1,408 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialDilatedConvolution.c"
+#else
+
+static inline void THNN_(SpatialDilatedConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH, int dW, int padH, int padW,
+ int dilationH, int dilationW) {
+
+ THNN_ARGCHECK(weight->nDimension == 4, 4, weight,
+ "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
+ "but got: %s");
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THArgCheck(dilationW > 0 && dilationH > 0, 15,
+ "dilation should be greater than zero, but got dilationH: %d, dilationW: %d",
+ dilationH, dilationW);
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[1];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[0];
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
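+ /* dilationH*(kH-1)+1 is the effective extent of the dilated kernel;
+ otherwise this is the usual floor-division convolution size formula. */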
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%ld x %ld x %ld). "
+ "Calculated output size: (%ld x %ld x %ld). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+void THNN_(SpatialDilatedConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH)
+{
+
+ THNN_(SpatialDilatedConvolution_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW,
+ dilationH, dilationW);
+
+ // Params:
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ bias = bias ? THTensor_(newContiguous)(bias) : bias;
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ }
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(zero)(output);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+ // Define a buffer of ones, for bias accumulation
+ // Note: this buffer can be shared with other modules; it only ever grows
+ // and always contains ones.
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize2d)(ones, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *output_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(output_n, output, 0, elt);
+
+ // Do Bias first:
+ // M,N,K are dims of matrix A and B
+ long m_ = nOutputPlane;
+ long n_ = outputHeight * outputWidth;
+ long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
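+ // With column-major GEMM, ('t','n', n_, m_, 1) computes the rank-1 outer
+ // product of bias (m_ x 1) with ones (1 x n_), i.e. it fills every spatial
+ // location of output plane j with bias[j].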
+ if (bias) {
+ THBlas_(gemm)(
+ 't', 'n',
+ n_, m_, k_,
+ 1,
+ THTensor_(data)(ones), k_,
+ THTensor_(data)(bias), k_,
+ 0,
+ THTensor_(data)(output_n), n_
+ );
+ } else {
+ THTensor_(zero)(output_n);
+ }
+
+ // Extract columns:
+ THNN_(im2col)(
+ THTensor_(data)(input_n),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ dilationH, dilationW,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ long m = nOutputPlane;
+ long n = columns->size[1];
+ long k = nInputPlane*kH*kW;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
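+ // In row-major terms this computes
+ // output_n (nOutputPlane x oH*oW) += weight (m x k) * columns (k x n);
+ // swapping the operand order and passing (n, m, k) makes the row-major
+ // buffers look column-major to BLAS.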
+ THBlas_(gemm)(
+ 'n', 'n',
+ n, m, k,
+ 1,
+ THTensor_(data)(columns), n,
+ THTensor_(data)(weight), k,
+ 1,
+ THTensor_(data)(output_n), n
+ );
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(output_n);
+
+ // Resize output
+ if (batch == 0) {
+ THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+ if (bias) THTensor_(free)(bias);
+}
+
+void THNN_(SpatialDilatedConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH)
+{
+ THNN_(SpatialDilatedConvolution_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW,
+ dilationH, dilationW);
+
+ // Params
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1],
+ gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
+ THTensor_(zero)(gradColumns);
+
+ // Helpers
+ THTensor *gradInput_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per sample:
+ THTensor_(select)(gradInput_n, gradInput, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ long m = nInputPlane*kW*kH;
+ long n = gradColumns->size[1];
+ long k = nOutputPlane;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
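+ // Row-major view: gradColumns (nInputPlane*kH*kW x oH*oW) =
+ // weight^T * gradOutput_n; col2im below then scatter-adds the columns
+ // back into gradInput.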
+ THBlas_(gemm)(
+ 'n', 't',
+ n, m, k,
+ 1,
+ THTensor_(data)(gradOutput_n), n,
+ THTensor_(data)(weight), m,
+ 0,
+ THTensor_(data)(gradColumns), n
+ );
+
+ // Unpack columns back into input:
+ THNN_(col2im)(
+ THTensor_(data)(gradColumns),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ dilationH, dilationW,
+ THTensor_(data)(gradInput_n)
+ );
+ }
+
+ // Free
+ THTensor_(free)(gradInput_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize output
+ if (batch == 0) {
+ THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+
+void THNN_(SpatialDilatedConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THNN_(SpatialDilatedConvolution_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW,
+ dilationH, dilationW);
+
+ // Params
+ int nInputPlane = gradWeight->size[1];
+ int nOutputPlane = gradWeight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ if (gradBias)
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0],
+ gradOutput->size[1], gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize2d)(ones, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(im2col)(
+ THTensor_(data)(input_n),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ dilationH, dilationW,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ long m = nOutputPlane;
+ long n = nInputPlane*kW*kH;
+ long k = columns->size[1];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
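+ // Row-major view: gradWeight (nOutputPlane x nInputPlane*kH*kW) +=
+ // scale * gradOutput_n (nOutputPlane x oH*oW) * columns^T.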
+ THBlas_(gemm)(
+ 't', 'n',
+ n, m, k,
+ scale,
+ THTensor_(data)(columns), k,
+ THTensor_(data)(gradOutput_n), k,
+ 1,
+ THTensor_(data)(gradWeight), n
+ );
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ long m_ = nOutputPlane;
+ long k_ = outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
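+ // gemv('t') multiplies the transposed (m_ x k_) gradOutput view by the
+ // ones vector, so gradBias[j] accumulates scale times the sum of output
+ // plane j over all spatial positions.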
+ if (gradBias) {
+ THBlas_(gemv)(
+ 't',
+ k_, m_,
+ scale,
+ THTensor_(data)(gradOutput_n), k_,
+ THTensor_(data)(ones), 1,
+ 1,
+ THTensor_(data)(gradBias), 1
+ );
+ }
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize
+ if (batch == 0) {
+ THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedMaxPooling.c
new file mode 100644
index 000000000..8f4ad13c3
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedMaxPooling.c
@@ -0,0 +1,401 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialDilatedMaxPooling.c"
+#else
+
+static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(
+ THTensor *input, THTensor *gradOutput, THIndexTensor *indices,
+ int kH, int kW, int dH, int dW, int padH, int padW,
+ int dilationH, int dilationW, bool ceil_mode) {
+
+ THArgCheck(kW > 0 && kH > 0, 5,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 8,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THArgCheck(dilationH > 0 && dilationW > 0, 12,
+ "dilation should be greater than zero, but got dilationH: %d dilationW: %d",
+ dilationH, dilationW);
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ THArgCheck(kW/2 >= padW && kH/2 >= padH, 2,
+ "pad should be at most half of kernel size, but got "
+ "padW = %d, padH = %d, kW = %d, kH = %d",
+ padW, padH, kW, kH);
+
+ long nInputPlane = input->size[dimh-1];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long outputHeight, outputWidth;
+ long nOutputPlane = nInputPlane;
+
+ if (ceil_mode)
+ {
+ outputHeight = (long)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (long)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ }
+ else
+ {
+ outputHeight = (long)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (long)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ }
+
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
+ }
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%dx%dx%d). "
+ "Calculated output size: (%dx%dx%d). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+ if (indices != NULL) {
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimw, outputWidth);
+ }
+}
+
+static void THNN_(SpatialDilatedMaxPooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ THIndex_t *ind_p,
+ long nslices,
+ long iwidth,
+ long iheight,
+ long owidth,
+ long oheight,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ int dilationW,
+ int dilationH
+ )
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j;
+ real *ip = input_p + k*iwidth*iheight;
+ for(i = 0; i < oheight; i++)
+ {
+ for(j = 0; j < owidth; j++)
+ {
+ long hstart = i * dH - padH;
+ long wstart = j * dW - padW;
+ long hend = fminf(hstart + (kH - 1) * dilationH + 1, iheight);
+ long wend = fminf(wstart + (kW - 1) * dilationW + 1, iwidth);
+ while(hstart < 0)
+ hstart += dilationH;
+ while(wstart < 0)
+ wstart += dilationW;
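+ /* hend/wend clamp the dilated window to the image; the while loops above
+ advance a negative (padded) start to the first in-bounds tap of the
+ dilation grid. */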
+
+ /* local pointers */
+ real *op = output_p + k*owidth*oheight + i*owidth + j;
+ THIndex_t *indp = ind_p + k*owidth*oheight + i*owidth + j;
+
+ /* compute local max: */
+ long maxindex = -1;
+ real maxval = -THInf;
+ long tcntr = 0;
+ long x,y;
+ for(y = hstart; y < hend; y += dilationH)
+ {
+ for(x = wstart; x < wend; x += dilationW)
+ {
+ tcntr = y*iwidth + x;
+ real val = *(ip + tcntr);
+ if (val > maxval)
+ {
+ maxval = val;
+ maxindex = tcntr;
+ }
+ }
+ }
+
+ /* set output to local max */
+ *op = maxval;
+
+ /* store location of max */
+ *indp = maxindex + TH_INDEX_BASE;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialDilatedMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ int dilationW,
+ int dilationH,
+ bool ceil_mode)
+{
+
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ long nInputPlane;
+ long inputHeight;
+ long inputWidth;
+ long outputHeight;
+ long outputWidth;
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+ THNN_(SpatialDilatedMaxPooling_shapeCheck)
+ (input, NULL, NULL, kH, kW, dH, dW,
+ padH, padW, dilationH, dilationW, ceil_mode);
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nInputPlane = input->size[dimh-1];
+ inputHeight = input->size[dimh];
+ inputWidth = input->size[dimw];
+ if (ceil_mode)
+ {
+ outputHeight = (long)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (long)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ }
+ else
+ {
+ outputHeight = (long)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (long)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ }
+
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
+ }
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize3d)(indices, nInputPlane, outputHeight, outputWidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ THNN_(SpatialDilatedMaxPooling_updateOutput_frame)
+ (input_data, output_data,
+ indices_data,
+ nInputPlane,
+ inputWidth, inputHeight,
+ outputWidth, outputHeight,
+ kW, kH, dW, dH,
+ padW, padH,
+ dilationW, dilationH
+ );
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize4d)(output, nbatch, nInputPlane, outputHeight, outputWidth);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize4d)(indices, nbatch, nInputPlane, outputHeight, outputWidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialDilatedMaxPooling_updateOutput_frame)
+ (input_data+p*nInputPlane*inputWidth*inputHeight,
+ output_data+p*nInputPlane*outputWidth*outputHeight,
+ indices_data+p*nInputPlane*outputWidth*outputHeight,
+ nInputPlane,
+ inputWidth, inputHeight,
+ outputWidth, outputHeight,
+ kW, kH, dW, dH,
+ padW, padH,
+ dilationW, dilationH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ THIndex_t *ind_p,
+ long nInputPlane,
+ long inputWidth,
+ long inputHeight,
+ long outputWidth,
+ long outputHeight,
+ int dW,
+ int dH)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nInputPlane; k++)
+ {
+ real *gradInput_p_k = gradInput_p + k*inputWidth*inputHeight;
+ real *gradOutput_p_k = gradOutput_p + k*outputWidth*outputHeight;
+ THIndex_t *ind_p_k = ind_p + k*outputWidth*outputHeight;
+
+ /* calculate max points */
+ long i, j;
+ for(i = 0; i < outputHeight; i++)
+ {
+ for(j = 0; j < outputWidth; j++)
+ {
+ /* retrieve position of max */
+ long maxp = ind_p_k[i*outputWidth + j] - TH_INDEX_BASE;
+ if (maxp != -1) {
+ /* update gradient */
+ gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j];
+ }
+ }
+ }
+ }
+}
+
+void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ int dilationW,
+ int dilationH,
+ bool ceil_mode)
+{
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ int nInputPlane;
+ int inputHeight;
+ int inputWidth;
+ int outputHeight;
+ int outputWidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ THNN_(SpatialDilatedMaxPooling_shapeCheck)
+ (input, gradOutput, indices, kH, kW, dH, dW,
+ padH, padW, dilationH, dilationW, ceil_mode);
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nInputPlane = input->size[dimh-1];
+ inputHeight = input->size[dimh];
+ inputWidth = input->size[dimw];
+ outputHeight = gradOutput->size[dimh];
+ outputWidth = gradOutput->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ /* backprop */
+ if (input->nDimension == 3)
+ {
+ THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)
+ (gradInput_data, gradOutput_data,
+ indices_data,
+ nInputPlane,
+ inputWidth, inputHeight,
+ outputWidth, outputHeight,
+ dW, dH);
+ }
+ else
+ {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)
+ (gradInput_data+p*nInputPlane*inputWidth*inputHeight,
+ gradOutput_data+p*nInputPlane*outputWidth*outputHeight,
+ indices_data+p*nInputPlane*outputWidth*outputHeight,
+ nInputPlane,
+ inputWidth, inputHeight,
+ outputWidth, outputHeight,
+ dW, dH);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialFractionalMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFractionalMaxPooling.c
new file mode 100644
index 000000000..a98954cc6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFractionalMaxPooling.c
@@ -0,0 +1,253 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialFractionalMaxPooling.c"
+#else
+
+static long* THNN_(SpatialFractionalMaxPooling_generateIntervals)(
+ real sample,
+ long inputSize,
+ long outputSize,
+ int poolSize) {
+ real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1);
+ long* sequence = (long*) THAlloc(sizeof(long) * outputSize);
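+ /* Pseudo-random intervals for fractional max pooling: with u = sample in
+ [0,1), start[i] = floor((i+u)*alpha) - floor(u*alpha), so consecutive
+ starts differ by floor(alpha) or ceil(alpha), every poolSize-wide window
+ stays in bounds, and the last window is pinned flush to the end. */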
+
+ long i;
+ for (i = 0; i < outputSize - 1; ++i) {
+ sequence[i] =
+ (long) ((i + sample) * alpha) - (long) (sample * alpha);
+ }
+ sequence[outputSize - 1] = inputSize - poolSize;
+
+ return sequence;
+}
+
+static void THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
+ real* input,
+ real* output,
+ THIndex_t* indices,
+ real* randomSamples,
+ long numPlanes,
+ long inputW, long inputH,
+ long outputW, long outputH,
+ int poolSizeW, int poolSizeH) {
+ long plane;
+#pragma omp parallel for private(plane)
+ for (plane = 0; plane < numPlanes; ++plane) {
+ /* each plane contains 2 random samples, one for W and one for H */
+ real* randomSamplesForPlane = randomSamples + plane * 2;
+
+ /* Generate interval sequence */
+ long* sequenceW =
+ THNN_(SpatialFractionalMaxPooling_generateIntervals)(
+ randomSamplesForPlane[0], inputW, outputW, poolSizeW);
+ long* sequenceH =
+ THNN_(SpatialFractionalMaxPooling_generateIntervals)(
+ randomSamplesForPlane[1], inputH, outputH, poolSizeH);
+
+ /* loop over output */
+ long h, w;
+
+ real* inputForPlane = input + plane * inputW * inputH;
+ real* outputForPlane = output + plane * outputW * outputH;
+ THIndex_t* indicesForPlane = indices + plane * outputW * outputH;
+
+ for (h = 0; h < outputH; ++h) {
+ long inputHStart = sequenceH[h];
+
+ for (w = 0; w < outputW; ++w) {
+ long inputWStart = sequenceW[w];
+
+ real maxVal = -THInf;
+ long maxIndex = -1;
+
+ long h2, w2;
+ for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) {
+ for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) {
+ THAssert(h2 >= 0 && h2 < inputH);
+ THAssert(w2 >= 0 && w2 < inputW);
+
+ long planeIndex = h2 * inputW + w2;
+ real val = inputForPlane[planeIndex];
+ if (val > maxVal) {
+ maxVal = val;
+ maxIndex = planeIndex;
+ }
+ }
+ }
+
+ THAssert(maxVal != -THInf);
+ THAssert(maxIndex != -1);
+
+ outputForPlane[h * outputW + w] = maxVal;
+ /* +1 to lua index */
+ indicesForPlane[h * outputW + w] = maxIndex + TH_INDEX_BASE;
+ }
+ }
+
+ THFree(sequenceW);
+ THFree(sequenceH);
+ }
+}
+
+void THNN_(SpatialFractionalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputW, int outputH,
+ int poolSizeW, int poolSizeH,
+ THIndexTensor *indices,
+ THTensor *randomSamples) {
+
+ long numBatch = 1;
+ int planeDim = 0;
+ int heightDim = 1;
+ int widthDim = 2;
+
+ long numInputDims = THTensor_(nDimension)(input);
+ THNN_ARGCHECK(numInputDims == 3 || numInputDims == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+ if (numInputDims == 4) {
+ numBatch = THTensor_(size)(input, 0);
+ planeDim++;
+ heightDim++;
+ widthDim++;
+ }
+
+ /* sizes */
+ long numPlanes = THTensor_(size)(input, planeDim);
+ long inputH = THTensor_(size)(input, heightDim);
+ long inputW = THTensor_(size)(input, widthDim);
+
+ THArgCheck(outputH + poolSizeH - 1 < inputH, 7,
+ "poolSizeH (%d) too large relative to input height (%ld)",
+ poolSizeH, inputH);
+ THArgCheck(outputW + poolSizeW - 1 < inputW, 6,
+ "poolSizeW (%d) too large relative to input width (%ld)",
+ poolSizeW, inputW);
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ if (numInputDims == 3) {
+ /* resize output */
+ THTensor_(resize3d)(output, numPlanes, outputH, outputW);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize3d)(indices, numPlanes, outputH, outputW);
+
+ THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
+ THTensor_(data)(input),
+ THTensor_(data)(output),
+ THIndexTensor_(data)(indices),
+ THTensor_(data)(randomSamples),
+ numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH);
+ } else {
+ THTensor_(resize4d)(output, numBatch, numPlanes, outputH, outputW);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize4d)(indices, numBatch, numPlanes, outputH, outputW);
+
+ long batch;
+#pragma omp parallel for private(batch)
+ for (batch = 0; batch < numBatch; ++batch) {
+ THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
+ THTensor_(data)(input) + batch * numPlanes * inputH * inputW,
+ THTensor_(data)(output) + batch * numPlanes * outputH * outputW,
+ THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW,
+ THTensor_(data)(randomSamples) + batch * numPlanes * 2,
+ numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
+ real* gradInput,
+ real* gradOutput,
+ THIndex_t* indices,
+ long numPlanes,
+ long inputW, long inputH,
+ long outputW, long outputH) {
+ long plane;
+#pragma omp parallel for private(plane)
+ for (plane = 0; plane < numPlanes; plane++) {
+ real* gradInputForPlane = gradInput + plane * inputW * inputH;
+ real* gradOutputForPlane = gradOutput + plane * outputW * outputH;
+ THIndex_t* indicesForPlane = indices + plane * outputW * outputH;
+
+ long h, w;
+ for (h = 0; h < outputH; ++h) {
+ for (w = 0; w < outputW; ++w) {
+ long outputIndex = h * outputW + w;
+ long index = indicesForPlane[outputIndex] - TH_INDEX_BASE;
+ THAssert(index >= 0 && index < inputW * inputH);
+
+ gradInputForPlane[index] += gradOutputForPlane[outputIndex];
+ }
+ }
+ }
+}
+
+void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int outputW, int outputH,
+ int poolSizeW, int poolSizeH,
+ THIndexTensor *indices) {
+
+ long numBatch = 1;
+ int planeDim = 0;
+ int heightDim = 1;
+ int widthDim = 2;
+
+ long numInputDims = THTensor_(nDimension)(input);
+ if (numInputDims == 4) {
+ numBatch = THTensor_(size)(input, 0);
+ planeDim = 1;
+ heightDim++;
+ widthDim++;
+ }
+
+ /* sizes */
+ long numPlanes = THTensor_(size)(input, planeDim);
+ long inputH = THTensor_(size)(input, heightDim);
+ long inputW = THTensor_(size)(input, widthDim);
+
+ THArgCheck(outputW == THTensor_(size)(gradOutput, widthDim), 3,
+ "gradOutput width unexpected");
+ THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3,
+ "gradOutput height unexpected");
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* backprop */
+ if (numInputDims == 3) {
+ THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
+ THTensor_(data)(gradInput),
+ THTensor_(data)(gradOutput),
+ THIndexTensor_(data)(indices),
+ numPlanes, inputW, inputH, outputW, outputH);
+ } else {
+ long batch;
+#pragma omp parallel for private(batch)
+ for (batch = 0; batch < numBatch; ++batch) {
+ THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
+ THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW,
+ THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW,
+ THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW,
+ numPlanes, inputW, inputH, outputW, outputH);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolution.c
new file mode 100644
index 000000000..2edc53b5a
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolution.c
@@ -0,0 +1,462 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialFullConvolution.c"
+#else
+
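+/* im2col lays the (channels x kernel_h x kernel_w) patch under every output
+ location out as one column: data_col has channels*kernel_h*kernel_w rows
+ (one per (channel, kernel offset) pair) and height_col*width_col columns
+ (one per output location); out-of-bounds padded taps are written as zeros. */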
+static void THNN_(im2col)(const real* data_im, const int channels,
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ real* data_col) {
+ const int height_col = (height + 2 * pad_h -
+ (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+ const int width_col = (width + 2 * pad_w -
+ (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+ const int channels_col = channels * kernel_h * kernel_w;
+ for (int c_col = 0; c_col < channels_col; ++c_col) {
+ int w_offset = c_col % kernel_w;
+ int h_offset = (c_col / kernel_w) % kernel_h;
+ int c_im = c_col / kernel_h / kernel_w;
+ for (int h_col = 0; h_col < height_col; ++h_col) {
+ for (int w_col = 0; w_col < width_col; ++w_col) {
+ int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
+ int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
+ data_col[(c_col * height_col + h_col) * width_col + w_col] =
+ (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
+ data_im[(c_im * height + h_im) * width + w_im] : 0;
+ }
+ }
+ }
+}
+
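+/* col2im is the adjoint of im2col: after zeroing the image it scatter-adds
+ every column entry back to its source pixel, so overlapping patches
+ accumulate rather than overwrite. */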
+static void THNN_(col2im)(const real* data_col, const int channels,
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ real* data_im) {
+ memset(data_im, 0, sizeof(real) * height * width * channels);
+ const int height_col = (height + 2 * pad_h -
+ (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+ const int width_col = (width + 2 * pad_w -
+ (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+ const int channels_col = channels * kernel_h * kernel_w;
+ for (int c_col = 0; c_col < channels_col; ++c_col) {
+ int w_offset = c_col % kernel_w;
+ int h_offset = (c_col / kernel_w) % kernel_h;
+ int c_im = c_col / kernel_h / kernel_w;
+ for (int h_col = 0; h_col < height_col; ++h_col) {
+ for (int w_col = 0; w_col < width_col; ++w_col) {
+ int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
+ int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
+ if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width)
+ data_im[(c_im * height + h_im) * width + w_im] +=
+ data_col[(c_col * height_col + h_col) * width_col + w_col];
+ }
+ }
+ }
+}
+
+static inline void THNN_(SpatialFullConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH, int dW, int padH, int padW, int adjH, int adjW) {
+
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THArgCheck(adjW < dW && adjH < dH, 15,
+ "output adjustment must be smaller than stride, but got adjH: %d adjW: %d dH: %d dW: %d",
+ adjH, adjW, dH, dW);
+ THNN_ARGCHECK(weight->nDimension == 2 || weight->nDimension == 4, 5, weight,
+ "2D or 4D weight tensor expected, but got: %s");
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]);
+ }
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[0];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[1];
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
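+ /* Inverse of the forward size formula out = (in + 2*pad - k)/stride + 1:
+ several input sizes map to the same forward output because of the floor
+ division, and adjH/adjW (checked above to be smaller than the stride)
+ select among them. */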
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%d x %d x %d). "
+ "Calculated output size: (%d x %d x %d). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+void THNN_(SpatialFullConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH)
+{
+ THNN_(SpatialFullConvolution_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, adjH, adjW);
+
+ int nInputPlane = THTensor_(size)(weight,0);
+ int nOutputPlane = THTensor_(size)(weight,1);
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ bias = bias ? THTensor_(newContiguous)(bias) : bias;
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ }
+
+ long inputHeight = input->size[2];
+ long inputWidth = input->size[3];
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+ THTensor_(zero)(columns);
+
+ // Define a buffer of ones, for bias accumulation
+ // Note: this buffer can be shared with other modules; it only ever grows
+ // and always contains ones.
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize2d)(ones, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *output_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+ for (elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(output_n, output, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = weight->size[1] * weight->size[2] * weight->size[3];
+ long n = columns->size[1];
+ long k = weight->size[0];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
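+ // Row-major view: columns (nOutputPlane*kH*kW x iH*iW) = weight^T * input_n;
+ // col2im then accumulates these columns into the larger output image,
+ // which is exactly transposed (full) convolution.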
+ THBlas_(gemm)(
+ 'n', 't',
+ n, m, k,
+ 1,
+ THTensor_(data)(input_n), n,
+ THTensor_(data)(weight), m,
+ 0,
+ THTensor_(data)(columns), n
+ );
+
+ // Unpack columns back into input:
+ THNN_(col2im)(
+ THTensor_(data)(columns),
+ nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1,
+ THTensor_(data)(output_n)
+ );
+
+ // Do Bias after:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long n_ = outputHeight * outputWidth;
+ long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ if (bias) {
+ THBlas_(gemm)(
+ 't', 'n',
+ n_, m_, k_,
+ 1,
+ THTensor_(data)(ones), k_,
+ THTensor_(data)(bias), k_,
+ 1,
+ THTensor_(data)(output_n), n_
+ );
+ }
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(output_n);
+
+ // Resize output
+ if (batch == 0) {
+ THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+ if (bias) THTensor_(free)(bias);
+}
+
+void THNN_(SpatialFullConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH)
+{
+ THNN_(SpatialFullConvolution_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, adjH, adjW);
+
+ int nInputPlane = THTensor_(size)(weight,0);
+ int nOutputPlane = THTensor_(size)(weight,1);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ weight = THTensor_(newContiguous)(weight);
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+ THTensor_(zero)(gradInput);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+ // Helpers
+ THTensor *gradInput_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+ for (elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per sample:
+ THTensor_(select)(gradInput_n, gradInput, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(im2col)(
+ THTensor_(data)(gradOutput_n),
+ nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1,
+ THTensor_(data)(gradColumns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = weight->size[0];
+ long n = gradColumns->size[1];
+ long k = weight->size[1] * weight->size[2] * weight->size[3];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 'n', 'n',
+ n, m, k,
+ 1,
+ THTensor_(data)(gradColumns), n,
+ THTensor_(data)(weight), k,
+ 0,
+ THTensor_(data)(gradInput_n), n
+ );
+ }
+
+ // Free
+ THTensor_(free)(gradInput_n);
+ THTensor_(free)(gradOutput_n);
+
+  // Resize back to 3D if the input was non-batched
+ if (batch == 0) {
+ THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+
+void THNN_(SpatialFullConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THNN_(SpatialFullConvolution_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, adjH, adjW);
+
+ int nInputPlane = THTensor_(size)(gradWeight,0);
+ int nOutputPlane = THTensor_(size)(gradWeight,1);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ if (gradBias)
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+
+  // Batch size
+ long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize2d)(ones, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+ for (elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(im2col)(
+ THTensor_(data)(gradOutput_n),
+ nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long n = columns->size[0]; // nOutputPlane * kh * kw
+ long m = input_n->size[0]; // nInputPlane
+ long k = columns->size[1]; // inputHeight * inputWidth
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 't', 'n',
+ n, m, k,
+ scale,
+ THTensor_(data)(columns), k,
+ THTensor_(data)(input_n), k,
+ 1,
+ THTensor_(data)(gradWeight), n
+ );
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long k_ = outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+ if (gradBias) {
+ THBlas_(gemv)(
+ 't',
+ k_, m_,
+ scale,
+ THTensor_(data)(gradOutput_n), k_,
+ THTensor_(data)(ones), 1,
+ 1,
+ THTensor_(data)(gradBias), 1
+ );
+ }
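+
+    /* The gemv above is just a per-plane spatial reduction; a plain-loop
+       sketch of the same update (p and s are illustrative indices, not an
+       alternative code path):
+         for (p = 0; p < nOutputPlane; p++)
+           for (s = 0; s < outputHeight*outputWidth; s++)
+             gradBias[p] += scale * gradOutput_n[p*k_ + s];           */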
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize
+ if (batch == 0) {
+ THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolutionMap.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolutionMap.c
new file mode 100644
index 000000000..6952fbe25
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolutionMap.c
@@ -0,0 +1,222 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialFullConvolutionMap.c"
+#else
+
+void THNN_(SpatialFullConvolutionMap_updateOutput)(
+ THNNState *state, THTensor *input, THTensor *output_, THTensor *weight, THTensor *bias,
+ THTensor *connTable, int nInputPlane, int nOutputPlane,
+ int dW, int dH)
+{
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
+ THArgCheck(
+ weight != NULL && weight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == weight->size[0], 4,
+ "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
+
+ const int kH = (int)weight->size[1];
+ const int kW = (int)weight->size[2];
+
+ THArgCheck(input != NULL && input->nDimension == 3, 2, "3D tensor expected");
+ THArgCheck(input->size[0] >= nInputPlane, 2, "invalid number of input planes");
+
+ THTensor_(resize3d)(
+ output_, nOutputPlane,
+ (input->size[1] - 1) * dH + kH,
+ (input->size[2] - 1) * dW + kW
+ );
+
+ /* contiguous */
+ input = THTensor_(newContiguous)(input);
+ THTensor* output = THTensor_(newContiguous)(output_);
+
+ /* get raw pointers */
+ real *input_data = THTensor_(data)(input);
+ real *output_data = THTensor_(data)(output);
+ real *weight_data = THTensor_(data)(weight);
+ real *bias_data = THTensor_(data)(bias);
+ real *connTable_data = THTensor_(data)(connTable);
+
+ /* and dims */
+ const long input_h = input->size[1];
+ const long input_w = input->size[2];
+ const long output_h = output->size[1];
+ const long output_w = output->size[2];
+ const long weight_h = weight->size[1];
+ const long weight_w = weight->size[2];
+
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nOutputPlane; p++)
+ {
+ /* add bias */
+ real *ptr_output = output_data + p*output_w*output_h;
+ long j;
+ int nweight;
+ long k;
+
+ for (j = 0; j < output_h*output_w; j++)
+ ptr_output[j] = bias_data[p];
+
+ /* convolve all maps */
+ nweight = connTable->size[0];
+ for (k = 0; k < nweight; k++)
+ {
+ /* get offsets for input/output */
+ int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
+ int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
+
+ if (o == p)
+ {
+ THTensor_(fullConv2Dptr)(
+ output_data + o*output_w*output_h,
+ 1.0,
+ input_data + i*input_w*input_h, input_h, input_w,
+ weight_data + k*weight_w*weight_h, weight_h, weight_w,
+ dH, dW
+ );
+ }
+ }
+ }
+
+ /* clean up */
+ THTensor_(free)(input);
+ THTensor_(freeCopyTo)(output, output_);
+}
+
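+/* connTable layout used throughout this file: row k holds the 1-based pair
+   { inputPlane, outputPlane } for kernel k, so connTable_data[k*2+0] - TH_INDEX_BASE
+   is the source plane index and connTable_data[k*2+1] - TH_INDEX_BASE the
+   destination plane index. */
+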
+void THNN_(SpatialFullConvolutionMap_updateGradInput)(
+ THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput_, THTensor *weight, THTensor *bias,
+ THTensor *connTable, int nInputPlane, int nOutputPlane,
+ int dW, int dH)
+{
+ THArgCheck(
+ weight != NULL && weight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == weight->size[0], 5,
+ "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
+
+ /* contiguous */
+ THTensor* gradInput = THTensor_(newContiguous)(gradInput_);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* Resize/Zero */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* get raw pointers */
+ real *gradInput_data = THTensor_(data)(gradInput);
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *weight_data = THTensor_(data)(weight);
+ real *connTable_data = THTensor_(data)(connTable);
+
+ /* and dims */
+ const long input_h = input->size[1];
+ const long input_w = input->size[2];
+ const long output_h = gradOutput->size[1];
+ const long output_w = gradOutput->size[2];
+ const long kH = weight->size[1];
+ const long kW = weight->size[2];
+
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nInputPlane; p++)
+ {
+ long k;
+ /* backward all */
+ int nkernel = connTable->size[0];
+ for (k = 0; k < nkernel; k++)
+ {
+ int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
+ int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
+ if (i == p)
+ {
+ /* gradient to input */
+ THTensor_(validXCorr2Dptr)(
+ gradInput_data + i*input_w*input_h,
+ 1.0,
+ gradOutput_data + o*output_w*output_h, output_h, output_w,
+ weight_data + k*kW*kH, kH, kW,
+ dH, dW
+ );
+ }
+ }
+ }
+
+ /* clean up */
+ THTensor_(freeCopyTo)(gradInput, gradInput_);
+ THTensor_(free)(gradOutput);
+}
+
+void THNN_(SpatialFullConvolutionMap_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *connTable,
+ int nInputPlane,
+ int nOutputPlane,
+ int dW, int dH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THArgCheck(
+ gradWeight != NULL && gradWeight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
+ "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
+
+ /* contiguous */
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* get raw pointers */
+ real *input_data = THTensor_(data)(input);
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *gradWeight_data = THTensor_(data)(gradWeight);
+ real *gradBias_data = THTensor_(data)(gradBias);
+
+ /* and dims */
+ const long input_h = input->size[1];
+ const long input_w = input->size[2];
+ const long output_h = gradOutput->size[1];
+ const long output_w = gradOutput->size[2];
+ const long weight_h = gradWeight->size[1];
+ const long weight_w = gradWeight->size[2];
+
+ /* gradients wrt bias */
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nOutputPlane; k++)
+ {
+ real *ptr_gradOutput = gradOutput_data + k*output_w*output_h;
+ long l;
+ for (l = 0; l < output_h*output_w; l++)
+ gradBias_data[k] += scale*ptr_gradOutput[l];
+ }
+
+ /* gradients wrt weight */
+ int nkernel = connTable->size[0];
+#pragma omp parallel for private(k)
+ for (k = 0; k < nkernel; k++)
+ {
+ int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE;
+ int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE;
+
+ /* gradient to kernel */
+ THTensor_(validXCorr2DRevptr)(
+ gradWeight_data + k*weight_w*weight_h,
+ scale,
+ gradOutput_data + o*output_w*output_h, output_h, output_w,
+ input_data + i*input_w*input_h, input_h, input_w,
+ dH, dW
+ );
+ }
+
+ /* clean up */
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxPooling.c
new file mode 100644
index 000000000..88aaa40e1
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxPooling.c
@@ -0,0 +1,44 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialMaxPooling.c"
+#else
+
+void THNN_(SpatialMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ bool ceil_mode)
+{
+ THNN_(SpatialDilatedMaxPooling_updateOutput)(
+ state, input, output, indices,
+ kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode
+ );
+}
+
+void THNN_(SpatialMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ bool ceil_mode)
+{
+ THNN_(SpatialDilatedMaxPooling_updateGradInput)(
+ state, input, gradOutput, gradInput, indices,
+ kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxUnpooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxUnpooling.c
new file mode 100644
index 000000000..320538686
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxUnpooling.c
@@ -0,0 +1,234 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialMaxUnpooling.c"
+#else
+
+static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p,
+ THIndex_t *ind_p,
+ int nslices,
+ int iwidth, int iheight,
+ int owidth, int oheight)
+{
+ int k;
+ int has_error = 0;
+ THIndex_t error_index;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ real *output_p_k = output_p + k*owidth*oheight;
+ real *input_p_k = input_p + k*iwidth*iheight;
+ THIndex_t *ind_p_k = ind_p + k*iwidth*iheight;
+
+ int i, j;
+ THIndex_t maxp;
+ for(i = 0; i < iheight; i++)
+ {
+ for(j = 0; j < iwidth; j++)
+ {
+ maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */
+ if(maxp<0 || maxp>=owidth*oheight){
+#pragma omp critical
+ {
+ has_error = 1;
+ error_index = maxp;
+ }
+ } else {
+ output_p_k[maxp] = input_p_k[i*iwidth + j]; /* update output */
+ }
+ }
+ }
+ }
+ if (has_error) {
+ THError("found an invalid max index %ld (output volumes are of size %dx%d)",
+ error_index, oheight, owidth);
+ }
+}
+
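+/* Index semantics assumed above: for every input element, ind_p stores the
+   flat (TH_INDEX_BASE-based) offset of its argmax inside the corresponding
+   owidth*oheight output plane, as produced by the matching max-pooling
+   forward pass; anything outside [0, owidth*oheight) is reported as an error. */
+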
+void THNN_(SpatialMaxUnpooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int owidth, int oheight)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int nbatch = 1;
+ int nslices;
+ int iheight;
+ int iwidth;
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+ THNN_CHECK_SHAPE_INDICES(input, indices);
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+
+ /* get contiguous input and indices */
+ input = THTensor_(newContiguous)(input);
+ indices = THIndexTensor_(newContiguous)(indices);
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nslices, oheight, owidth);
+ THTensor_(zero)(output);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ THNN_(SpatialMaxUnpooling_updateOutput_frame)(input_data, output_data,
+ indices_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ else
+ {
+ int p;
+
+ THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+ THTensor_(zero)(output);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialMaxUnpooling_updateOutput_frame)(
+ input_data+p*nslices*iwidth*iheight,
+ output_data+p*nslices*owidth*oheight,
+ indices_data+p*nslices*iwidth*iheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+ THIndexTensor_(free)(indices);
+}
+
+static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p,
+ THIndex_t *ind_p,
+ int nslices,
+ int iwidth, int iheight,
+ int owidth, int oheight)
+{
+ int k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
+ real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
+ THIndex_t *ind_p_k = ind_p + k*iwidth*iheight;
+
+ int i, j;
+ THIndex_t maxp;
+ for(i = 0; i < iheight; i++)
+ {
+ for(j = 0; j < iwidth; j++)
+ {
+ maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */
+ if(maxp < 0 || maxp >= owidth * oheight) {
+ THError("invalid max index %ld, owidth= %d, oheight= %d", maxp, owidth, oheight);
+ }
+ gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp]; /* update gradient */
+ }
+ }
+ }
+}
+
+void THNN_(SpatialMaxUnpooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int owidth, int oheight)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int nbatch = 1;
+ int nslices;
+ int iheight;
+ int iwidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ THNN_CHECK_SHAPE_INDICES(input, indices);
+
+ /* get contiguous gradOutput and indices */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ indices = THIndexTensor_(newContiguous)(indices);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+
+ if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){
+ THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d",
+ oheight, owidth, gradOutput->size[dimh], gradOutput->size[dimw]);
+ }
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ /* backprop */
+ if (input->nDimension == 3)
+ {
+ THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
+ indices_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ else
+ {
+ int p;
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
+ indices_data+p*nslices*iwidth*iheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+ THIndexTensor_(free)(indices);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialReflectionPadding.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialReflectionPadding.c
new file mode 100644
index 000000000..dcde660ea
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialReflectionPadding.c
@@ -0,0 +1,260 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialReflectionPadding.c"
+#else
+
+static void THNN_(SpatialReflectionPadding_updateOutput_frame)(
+ real *input_p, real *output_p,
+ long nslices,
+ long iwidth, long iheight,
+ long owidth, long oheight,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int iStartX = fmax(0, -pad_l);
+ int iStartY = fmax(0, -pad_t);
+ int oStartX = fmax(0, pad_l);
+ int oStartY = fmax(0, pad_t);
+
+ long k, ip_x, ip_y;
+#pragma omp parallel for private(k, ip_x, ip_y)
+ for (k = 0; k < nslices; k++)
+ {
+ long i, j;
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pad_l) {
+ ip_x = pad_l * 2 - j;
+ } else if (j >= pad_l && j < iwidth + pad_l) {
+ ip_x = j;
+ } else {
+ ip_x = (iwidth + pad_l - 1) * 2 - j;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < pad_t) {
+ ip_y = pad_t * 2 - i;
+ } else if (i >= pad_t && i < iheight + pad_t) {
+ ip_y = i;
+ } else {
+ ip_y = (iheight + pad_t - 1) * 2 - i;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ real *dest_p = output_p + k*owidth*oheight + i * owidth + j;
+ real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
+ *dest_p = *src_p;
+ }
+ }
+ }
+}
+
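+/* A worked example of the mirror mapping above (hypothetical sizes): with
+   iwidth = 5 and pad_l = 2, output column j first maps to
+     j < 2      -> ip_x = 2*pad_l - j                 (j=0 -> 4, j=1 -> 3)
+     2 <= j < 7 -> ip_x = j
+     j >= 7     -> ip_x = (iwidth + pad_l - 1)*2 - j  (j=7 -> 5, j=8 -> 4)
+   and the final -oStartX + iStartX shift re-bases these into input
+   coordinates, giving the reflected row [2,1,0,1,2,3,4,3,2]. */
+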
+void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+ long oheight;
+ long owidth;
+ real *input_data;
+ real *output_data;
+
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = iheight + pad_t + pad_b;
+ owidth = iwidth + pad_l + pad_r;
+
+  THArgCheck(owidth >= 1 || oheight >= 1, 2,
+             "input (H: %d, W: %d) is too small."
+ " Calculated output H: %d W: %d",
+ iheight, iwidth, oheight, owidth);
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+ THNN_(SpatialReflectionPadding_updateOutput_frame)(input_data, output_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialReflectionPadding_updateOutput_frame)(
+ input_data+p*nslices*iwidth*iheight,
+ output_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(SpatialReflectionPadding_updateGradInput_frame)(
+ real *ginput_p, real *goutput_p,
+ long nslices,
+ long iwidth, long iheight,
+ long owidth, long oheight,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int iStartX = fmax(0, -pad_l);
+ int iStartY = fmax(0, -pad_t);
+ int oStartX = fmax(0, pad_l);
+ int oStartY = fmax(0, pad_t);
+
+ long k, ip_x, ip_y;
+#pragma omp parallel for private(k, ip_x, ip_y)
+ for (k = 0; k < nslices; k++)
+ {
+ long i, j;
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pad_l) {
+ ip_x = pad_l * 2 - j;
+ } else if (j >= pad_l && j < iwidth + pad_l) {
+ ip_x = j;
+ } else {
+ ip_x = (iwidth + pad_l - 1) * 2 - j;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < pad_t) {
+ ip_y = pad_t * 2 - i;
+ } else if (i >= pad_t && i < iheight + pad_t) {
+ ip_y = i;
+ } else {
+ ip_y = (iheight + pad_t - 1) * 2 - i;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ real *src_p = goutput_p + k*owidth*oheight + i * owidth + j;
+ real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
+ *dest_p += *src_p;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+ long oheight;
+ long owidth;
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = iheight + pad_t + pad_b;
+ owidth = iwidth + pad_l + pad_r;
+
+ THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
+ "gradOutput width unexpected. Expected: %d, Got: %d",
+ owidth, THTensor_(size)(gradOutput, dimw));
+ THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
+ "gradOutput height unexpected. Expected: %d, Got: %d",
+ oheight, THTensor_(size)(gradOutput, dimh));
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* backprop */
+ if (input->nDimension == 3) {
+ THNN_(SpatialReflectionPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput),
+ THTensor_(data)(gradOutput),
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ } else {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++) {
+ THNN_(SpatialReflectionPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput) + p * nslices * iheight * iwidth,
+ THTensor_(data)(gradOutput) + p * nslices * oheight * owidth,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialReplicationPadding.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialReplicationPadding.c
new file mode 100644
index 000000000..4e318aa70
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialReplicationPadding.c
@@ -0,0 +1,260 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialReplicationPadding.c"
+#else
+
+static void THNN_(SpatialReplicationPadding_updateOutput_frame)(
+ real *input_p, real *output_p,
+ long nslices,
+ long iwidth, long iheight,
+ long owidth, long oheight,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int iStartX = fmax(0, -pad_l);
+ int iStartY = fmax(0, -pad_t);
+ int oStartX = fmax(0, pad_l);
+ int oStartY = fmax(0, pad_t);
+
+ long k, ip_x, ip_y;
+#pragma omp parallel for private(k, ip_x, ip_y)
+ for (k = 0; k < nslices; k++)
+ {
+ long i, j;
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pad_l) {
+ ip_x = pad_l;
+ } else if (j >= pad_l && j < iwidth + pad_l) {
+ ip_x = j;
+ } else {
+ ip_x = iwidth + pad_l - 1;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < pad_t) {
+ ip_y = pad_t;
+ } else if (i >= pad_t && i < iheight + pad_t) {
+ ip_y = i;
+ } else {
+ ip_y = iheight + pad_t - 1;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ real *dest_p = output_p + k*owidth*oheight + i * owidth + j;
+ real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
+ *dest_p = *src_p;
+ }
+ }
+ }
+}
+
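+/* Replication differs from reflection only in the edge rule above: pad
+   columns clamp to the nearest valid input column (pad_l on the left,
+   iwidth + pad_l - 1 on the right) instead of mirroring, so the example
+   from the reflection file (iwidth = 5, pad_l = pad_r = 2) would read
+   input columns [0,0,0,1,2,3,4,4,4]. */
+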
+void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+ long oheight;
+ long owidth;
+ real *input_data;
+ real *output_data;
+
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = iheight + pad_t + pad_b;
+ owidth = iwidth + pad_l + pad_r;
+
+  THArgCheck(owidth >= 1 || oheight >= 1, 2,
+             "input (H: %d, W: %d) is too small."
+ " Calculated output H: %d W: %d",
+ iheight, iwidth, oheight, owidth);
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+ THNN_(SpatialReplicationPadding_updateOutput_frame)(input_data, output_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialReplicationPadding_updateOutput_frame)(
+ input_data+p*nslices*iwidth*iheight,
+ output_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(SpatialReplicationPadding_updateGradInput_frame)(
+ real *ginput_p, real *goutput_p,
+ long nslices,
+ long iwidth, long iheight,
+ long owidth, long oheight,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int iStartX = fmax(0, -pad_l);
+ int iStartY = fmax(0, -pad_t);
+ int oStartX = fmax(0, pad_l);
+ int oStartY = fmax(0, pad_t);
+
+ long k, ip_x, ip_y;
+#pragma omp parallel for private(k, ip_x, ip_y)
+ for (k = 0; k < nslices; k++)
+ {
+ long i, j;
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pad_l) {
+ ip_x = pad_l;
+ } else if (j >= pad_l && j < iwidth + pad_l) {
+ ip_x = j;
+ } else {
+ ip_x = iwidth + pad_l - 1;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < pad_t) {
+ ip_y = pad_t;
+ } else if (i >= pad_t && i < iheight + pad_t) {
+ ip_y = i;
+ } else {
+ ip_y = iheight + pad_t - 1;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ real *src_p = goutput_p + k*owidth*oheight + i * owidth + j;
+ real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
+ *dest_p += *src_p;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+ long oheight;
+ long owidth;
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = iheight + pad_t + pad_b;
+ owidth = iwidth + pad_l + pad_r;
+
+ THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
+ "gradOutput width unexpected. Expected: %d, Got: %d",
+ owidth, THTensor_(size)(gradOutput, dimw));
+ THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
+ "gradOutput height unexpected. Expected: %d, Got: %d",
+ oheight, THTensor_(size)(gradOutput, dimh));
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* backprop */
+ if (input->nDimension == 3) {
+ THNN_(SpatialReplicationPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput),
+ THTensor_(data)(gradOutput),
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ } else {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++) {
+ THNN_(SpatialReplicationPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput) + p * nslices * iheight * iwidth,
+ THTensor_(data)(gradOutput) + p * nslices * oheight * owidth,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialSubSampling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialSubSampling.c
new file mode 100644
index 000000000..4c077bc64
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialSubSampling.c
@@ -0,0 +1,302 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialSubSampling.c"
+#else
+
+static inline void THNN_(SpatialSubSampling_shapeCheck)(
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *weight,
+ int kW, int kH) {
+ int ndims = input->nDimension;
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+
+ int nInputPlane = THTensor_(size)(weight, 0);
+
+ int dimw = 2;
+ int dimh = 1;
+
+ long inputWidth;
+ long inputHeight;
+
+ if (input->nDimension == 4) {
+ dimw++;
+ dimh++;
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+
+ THArgCheck(input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes");
+ THArgCheck(inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size");
+}
+
+void THNN_(SpatialSubSampling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW, int kH,
+ int dW, int dH)
+{
+ THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
+
+ real *weight_data = THTensor_(data)(weight);
+ real *bias_data = THTensor_(data)(bias);
+ real *output_data;
+ real *input_data;
+
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+
+ long inputWidth;
+ long inputHeight;
+ long outputWidth;
+ long outputHeight;
+
+ int nInputPlane = THTensor_(size)(weight,0);
+
+ long k;
+
+ THNN_(SpatialSubSampling_shapeCheck)(input, NULL, weight, kW, kH);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+ outputWidth = (inputWidth - kW) / dW + 1;
+ outputHeight = (inputHeight - kH) / dH + 1;
+
+ if (input->nDimension == 3)
+ THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
+ else
+ THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
+ {
+ long p;
+ for(p = 0; p < nbatch; p++)
+ {
+ long xx, yy;
+ /* For all output pixels... */
+ real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
+ /* Get the good mask for (k,i) (k out, i in) */
+ real the_weight = weight_data[k];
+ /* Initialize to the bias */
+ real z = bias_data[k];
+ long i;
+ for(i = 0; i < outputWidth*outputHeight; i++)
+ ptr_output[i] = z;
+
+ for(yy = 0; yy < outputHeight; yy++)
+ {
+ for(xx = 0; xx < outputWidth; xx++)
+ {
+ /* Compute the mean of the input image... */
+ real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
+ real sum = 0;
+ long kx, ky;
+
+ for(ky = 0; ky < kH; ky++)
+ {
+ for(kx = 0; kx < kW; kx++)
+ sum += ptr_input[kx];
+ ptr_input += inputWidth; /* next input line */
+ }
+ /* Update output */
+ *ptr_output++ += the_weight*sum;
+ }
+ }
+ }
+ }
+ THTensor_(free)(input);
+}
+
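+/* A small worked example for the forward pass above (hypothetical sizes):
+   inputWidth = 8, kW = 2, dW = 2 gives outputWidth = (8 - 2)/2 + 1 = 4, and
+   each output pixel is bias[k] + weight[k] * (sum of its kW*kH input window):
+   a learned per-plane affine rescaling of pooled window sums. */
+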
+void THNN_(SpatialSubSampling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW, int kH,
+ int dW, int dH)
+{
+ THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, weight, kW, kH);
+
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+
+ long inputWidth;
+ long inputHeight;
+ long outputWidth;
+ long outputHeight;
+
+ int nInputPlane = THTensor_(size)(weight,0);
+
+ real *weight_data;
+ real *gradOutput_data;
+ real *input_data, *gradInput_data;
+
+ long k;
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+ outputWidth = (inputWidth - kW) / dW + 1;
+ outputHeight = (inputHeight - kH) / dH + 1;
+
+ weight_data = THTensor_(data)(weight);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
+ input_data = THTensor_(data)(input);
+
+ THTensor_(resizeAs)(gradInput, input);
+ gradInput_data = THTensor_(data)(gradInput);
+
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
+ {
+ long p;
+ for(p = 0; p < nbatch; p++)
+ {
+ real the_weight = weight_data[k];
+ real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
+ long xx, yy;
+
+ real* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
+ long i;
+ for(i=0; i<inputWidth*inputHeight; i++)
+ ptr_gi[i] = 0.0;
+
+ for(yy = 0; yy < outputHeight; yy++)
+ {
+ for(xx = 0; xx < outputWidth; xx++)
+ {
+ real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
+ real z = *ptr_gradOutput++ * the_weight;
+ long kx, ky;
+
+ for(ky = 0; ky < kH; ky++)
+ {
+ for(kx = 0; kx < kW; kx++)
+ ptr_gradInput[kx] += z;
+ ptr_gradInput += inputWidth;
+ }
+ }
+ }
+ }
+ }
+ THTensor_(free)(gradOutput);
+}
+
+void THNN_(SpatialSubSampling_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW, int kH,
+ int dW, int dH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, gradWeight, kW, kH);
+
+ long nbatch = 1;
+ long dimw = 2;
+ long dimh = 1;
+
+ long inputWidth;
+ long inputHeight;
+ long outputWidth;
+ long outputHeight;
+
+ int nInputPlane = THTensor_(size)(gradWeight,0);
+
+ real *gradWeight_data;
+ real *gradBias_data;
+ real *gradOutput_data;
+ real *input_data;
+
+ long k;
+
+ if (input->nDimension == 4) {
+ dimw++;
+ dimh++;
+ nbatch = input->size[0];
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+ outputWidth = (inputWidth - kW) / dW + 1;
+ outputHeight = (inputHeight - kH) / dH + 1;
+
+ gradWeight_data = THTensor_(data)(gradWeight);
+ gradBias_data = THTensor_(data)(gradBias);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
+ input = THTensor_(newContiguous)(input);
+ input_data = THTensor_(data)(input);
+
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
+ {
+ long p;
+ for(p = 0; p < nbatch; p++)
+ {
+ real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
+ real sum;
+ long xx, yy;
+ long i;
+
+ sum = 0;
+ for(i = 0; i < outputWidth*outputHeight; i++)
+ sum += ptr_gradOutput[i];
+ gradBias_data[k] += scale*sum;
+
+ sum = 0;
+ for(yy = 0; yy < outputHeight; yy++)
+ {
+ for(xx = 0; xx < outputWidth; xx++)
+ {
+ real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
+ real z = *ptr_gradOutput++;
+ long kx, ky;
+
+ for(ky = 0; ky < kH; ky++)
+ {
+ for(kx = 0; kx < kW; kx++)
+ sum += z * ptr_input[kx];
+ ptr_input += inputWidth;
+ }
+ }
+ }
+ gradWeight_data[k] += scale*sum;
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingBilinear.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingBilinear.c
new file mode 100644
index 000000000..8bc487ead
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingBilinear.c
@@ -0,0 +1,174 @@
+// Adapted from interp.cpp in the Caffe utilities, by Pauline Luc
+// Originally developed by George Papandreou
+
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialUpSamplingBilinear.c"
+#else
+
+static inline void THNN_(SpatialUpSamplingBilinear_shapeCheck)
+ (THTensor *input, THTensor *gradOutput,
+ int nBatch, int nChannels,
+ int inputHeight, int inputWidth,
+ int outputHeight, int outputWidth) {
+ THArgCheck(inputHeight > 0 && inputWidth > 0
+ && outputHeight > 0 && outputWidth > 0, 2,
+ "input and output sizes should be greater than 0,"
+ " but got input (H: %d, W: %d) output (H: %d, W: %d)",
+ inputHeight, inputWidth, outputHeight, outputWidth);
+ if (input != NULL) {
+ THNN_ARGCHECK(input->nDimension == 4, 2, input,
+ "4D input tensor expected but got: %s");
+ }
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nBatch);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth);
+ }
+}
+
+void THNN_(SpatialUpSamplingBilinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputHeight,
+ int outputWidth){
+
+ int nbatch = THTensor_(size)(input, 0);
+ int channels = THTensor_(size)(input, 1);
+ int inputHeight = THTensor_(size)(input, 2);
+ int inputWidth = THTensor_(size)(input, 3);
+
+ THNN_(SpatialUpSamplingBilinear_shapeCheck)
+ (input, NULL,
+ nbatch, channels,
+ inputHeight, inputWidth,
+ outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ THTensor_(resize4d)(output,
+ THTensor_(size)(input, 0),
+ THTensor_(size)(input, 1),
+ outputHeight, outputWidth);
+ THTensor_(zero)(output);
+ real *idata = THTensor_(data)(input);
+ real *odata = THTensor_(data)(output);
+ channels = nbatch * channels;
+ THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0);
+ // special case: just copy
+ if (inputHeight == outputHeight && inputWidth == outputWidth) {
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const int h1 = h2;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const int w1 = w2;
+ const real* pos1 = &idata[h1 * inputWidth + w1];
+ real* pos2 = &odata[h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos2[0] = pos1[0];
+ pos1 += inputWidth * inputHeight;
+ pos2 += outputWidth * outputHeight;
+ }
+ }
+ }
+ return;
+ }
+  const float rheight = (outputHeight > 1) ? (float)(inputHeight - 1) / (outputHeight - 1) : 0.f;
+  const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1) / (outputWidth - 1) : 0.f;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const float h1r = rheight * h2;
+ const int h1 = h1r;
+ const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
+ const real h1lambda = h1r - h1;
+ const real h0lambda = (real)1. - h1lambda;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const float w1r = rwidth * w2;
+ const int w1 = w1r;
+ const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+ const real w1lambda = w1r - w1;
+ const real w0lambda = (real)1. - w1lambda;
+ const real* pos1 = &idata[h1 * inputWidth + w1];
+ real* pos2 = &odata[h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+        pos2[0] = h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p])
+ + h1lambda * (w0lambda * pos1[h1p * inputWidth]
+ + w1lambda * pos1[h1p * inputWidth + w1p]);
+ pos1 += inputWidth * inputHeight;
+ pos2 += outputWidth * outputHeight;
+ }
+ }
+ }
+ THTensor_(free)(input);
+}
+
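+/* Interpolation-weight sketch for the loop above (hypothetical sizes):
+   upsampling H from 2 to 4 gives rheight = (2-1)/(4-1) = 1/3, so output row
+   h2 = 2 has h1r = 2/3, h1 = 0, h1lambda = 2/3, h0lambda = 1/3: one third of
+   input row 0 plus two thirds of input row 1 (align-corners bilinear). */
+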
+void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
+ THNNState *state,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int nbatch,
+ int channels,
+ int inputHeight,
+ int inputWidth,
+ int outputHeight,
+ int outputWidth){
+
+ THNN_(SpatialUpSamplingBilinear_shapeCheck)
+ (NULL, gradOutput,
+ nbatch, channels,
+ inputHeight, inputWidth,
+ outputHeight, outputWidth);
+
+ THTensor_(resize4d)(gradInput, nbatch, channels, inputHeight, inputWidth);
+ THTensor_(zero)(gradInput);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ real *data1 = THTensor_(data)(gradInput);
+ real *data2 = THTensor_(data)(gradOutput);
+ channels = nbatch * channels;
+
+ // special case: same-size matching grids
+ if (inputHeight == outputHeight && inputWidth == outputWidth) {
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const int h1 = h2;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const int w1 = w2;
+ real* pos1 = &data1[h1 * inputWidth + w1];
+ const real* pos2 = &data2[h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos1[0] += pos2[0];
+ pos1 += inputWidth * inputHeight;
+ pos2 += outputWidth * outputHeight;
+ }
+ }
+ }
+ return;
+ }
+  const float rheight = (outputHeight > 1) ? (float)(inputHeight - 1) / (outputHeight - 1) : 0.f;
+  const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1) / (outputWidth - 1) : 0.f;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const float h1r = rheight * h2;
+ const int h1 = h1r;
+ const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
+ const real h1lambda = h1r - h1;
+ const real h0lambda = (real)1. - h1lambda;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const float w1r = rwidth * w2;
+ const int w1 = w1r;
+ const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+ const real w1lambda = w1r - w1;
+ const real w0lambda = (real)1. - w1lambda;
+ real* pos1 = &data1[h1 * inputWidth + w1];
+ const real* pos2 = &data2[h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos1[0] += h0lambda * w0lambda * pos2[0];
+ pos1[w1p] += h0lambda * w1lambda * pos2[0];
+ pos1[h1p * inputWidth] += h1lambda * w0lambda * pos2[0];
+ pos1[h1p * inputWidth + w1p] += h1lambda * w1lambda * pos2[0];
+ pos1 += inputWidth * inputHeight;
+ pos2 += outputWidth * outputHeight;
+ }
+ }
+ }
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingNearest.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingNearest.c
new file mode 100644
index 000000000..b4699ff3e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingNearest.c
@@ -0,0 +1,199 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialUpSamplingNearest.c"
+#else
+
+
+static inline void THNN_(SpatialUpSamplingNearest_shapeCheck)
+ (THTensor *input, THTensor *gradOutput,
+ int scale_factor) {
+  THArgCheck(input != NULL, 2, "3D or 4D input tensor expected but got NULL");
+ THArgCheck(scale_factor > 1, 4,
+ "scale_factor must be greater than 1, but got: %d", scale_factor);
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+ if (input->nDimension == 3) {
+ int nChannels = THTensor_(size)(input, 0);
+ int inputHeight = THTensor_(size)(input, 1);
+ int inputWidth = THTensor_(size)(input, 2);
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 3, 0, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 3, 1, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 3, 2, outputWidth);
+ }
+ } else {
+ int nBatch = THTensor_(size)(input, 0);
+ int nChannels = THTensor_(size)(input, 1);
+ int inputHeight = THTensor_(size)(input, 2);
+ int inputWidth = THTensor_(size)(input, 3);
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nBatch);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth);
+ }
+ }
+}
+
+void THNN_(SpatialUpSamplingNearest_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int scale_factor)
+{
+ THNN_(SpatialUpSamplingNearest_shapeCheck)(input, NULL, scale_factor);
+ int inputHeight = THTensor_(size)(input, input->nDimension-2);
+ int inputWidth = THTensor_(size)(input, input->nDimension-1);
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+
+ if (input->nDimension == 3) {
+ THTensor_(resize3d)(output,
+ THTensor_(size)(input, 0),
+ outputHeight, outputWidth);
+ } else {
+ THTensor_(resize4d)(output,
+ THTensor_(size)(input, 0),
+ THTensor_(size)(input, 1),
+ outputHeight, outputWidth);
+ }
+
+ int dW = scale_factor;
+ int dH = scale_factor;
+ int xDim = input->nDimension-2;
+ int yDim = input->nDimension-1;
+
+ // dims
+ int idim = input->nDimension;
+ int osz0 = output->size[0];
+ int osz1 = output->size[1];
+ int osz2 = output->size[2];
+ int osz3 = 1;
+ if (idim > 3) {
+ osz3 = output->size[3];
+ }
+
+ // get strides
+ long *is = input->stride;
+ long *os = output->stride;
+
+ // get raw pointers
+ real *pin = THTensor_(data)(input);
+ real *pout = THTensor_(data)(output);
+
+ // perform the upsampling
+ int i0, i1, i2, i3, isrc, idst;
+ int iout[4]; // Output indices
+ int iin[4]; // Input indices
+
+ for (i0 = 0; i0 < osz0; i0++) {
+ iout[0] = i0;
+ iin[0] = i0;
+ for (i1 = 0; i1 < osz1; i1++) {
+ iout[1] = i1;
+ iin[1] = i1;
+ for (i2 = 0; i2 < osz2; i2++) {
+ iout[2] = i2;
+ iin[2] = i2;
+ for (i3 = 0; i3 < osz3; i3++) {
+ iout[3] = i3;
+ iin[3] = i3;
+
+ // set the indices for the upsampled dimensions
+ iin[xDim] = iout[xDim] / dW;
+ iin[yDim] = iout[yDim] / dH;
+
+ idst = i0*os[0] + i1*os[1] + i2*os[2];
+ isrc = iin[0]*is[0] + iin[1]*is[1] + iin[2]*is[2];
+ if (idim > 3) {
+ idst += i3*os[3];
+ isrc += iin[3]*is[3];
+ }
+
+ pout[idst] = pin[isrc];
+ }
+ }
+ }
+ }
+}
+
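+/* Mapping assumed above: each output index maps back by integer division,
+   iin = iout / scale_factor, so every input pixel is replicated into a
+   scale_factor x scale_factor block (e.g. scale 2 sends output columns
+   {0,1} to input column 0 and {2,3} to column 1). */
+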
+void THNN_(SpatialUpSamplingNearest_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int scale_factor)
+{
+ THNN_(SpatialUpSamplingNearest_shapeCheck)(input, gradOutput, scale_factor);
+ THTensor_(resizeAs)(gradInput, input);
+
+ int dW = scale_factor;
+ int dH = scale_factor;
+ int xDim = gradInput->nDimension-2;
+ int yDim = gradInput->nDimension-1;
+
+ // dims
+  int idim = gradInput->nDimension; // guaranteed to be 3 or 4 by the shape check
+ int isz0 = gradInput->size[0];
+ int isz1 = gradInput->size[1];
+ int isz2 = gradInput->size[2];
+ int isz3 = 1;
+ if (idim > 3) {
+ isz3 = gradInput->size[3];
+ }
+
+ // get strides
+ long *is = gradInput->stride;
+ long *os = gradOutput->stride;
+
+ // get raw pointers
+ real *pin = THTensor_(data)(gradInput);
+ real *pout = THTensor_(data)(gradOutput);
+
+ // perform the upsampling
+ int i0, i1, i2, i3, isrc, idst, x, y;
+ int iin[4]; // Input indices
+ int iout[4]; // Output indices
+
+ THTensor_(zero)(gradInput);
+
+ for (i0 = 0; i0 < isz0; i0++) {
+ iin[0] = i0;
+ iout[0] = i0;
+ for (i1 = 0; i1 < isz1; i1++) {
+ iin[1] = i1;
+ iout[1] = i1;
+ for (i2 = 0; i2 < isz2; i2++) {
+ iin[2] = i2;
+ iout[2] = i2;
+ for (i3 = 0; i3 < isz3; i3++) {
+ iin[3] = i3;
+ iout[3] = i3;
+
+ idst = i0*is[0] + i1*is[1] + i2*is[2];
+ if (idim > 3) {
+ idst += i3*is[3];
+ }
+
+ // Now accumulate the gradients from gradOutput
+ for (y = 0; y < dH; y++) {
+ for (x = 0; x < dW; x++) {
+ iout[xDim] = dW * iin[xDim] + x;
+ iout[yDim] = dH * iin[yDim] + y;
+ isrc = iout[0]*os[0] + iout[1]*os[1] + iout[2]*os[2];
+ if (idim > 3) {
+ isrc += iout[3]*os[3];
+ }
+ pin[idst] += pout[isrc];
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Sqrt.c b/contrib/lua-torch/nn/lib/THNN/generic/Sqrt.c
new file mode 100644
index 000000000..174884e34
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Sqrt.c
@@ -0,0 +1,52 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Sqrt.c"
+#else
+
+void THNN_(Sqrt_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal eps_)
+{
+  real eps = TH_CONVERT_ACCREAL_TO_REAL(eps_); /* accepted for API compatibility; unused below */
+ THTensor_(resizeAs)(output, input);
+ THTensor_(sqrt)(output, input);
+}
+
+void THNN_(Sqrt_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output)
+{
+ THNN_CHECK_SHAPE(output, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+
+ if (output->nDimension == 1 ||
+ !THTensor_(isContiguous)(output) ||
+ !THTensor_(isContiguous)(gradOutput) ||
+ !THTensor_(isContiguous)(gradInput))
+ {
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+ *gradInput_data = (*output_data == 0.0) ? 0.0 : (0.5 * (*gradOutput_data / *output_data));
+ );
+ }
+ else
+ {
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *gradInput_data = THTensor_(data)(gradInput);
+ real *output_data = THTensor_(data)(output);
+ long i;
+#pragma omp parallel for private(i)
+ for(i = 0; i < THTensor_(nElement)(output); i++)
+ {
+ if (output_data[i] == 0.0)
+ gradInput_data[i] = 0.0;
+ else
+ gradInput_data[i] = 0.5 * (gradOutput_data[i] / output_data[i]);
+ }
+ }
+}
+
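+/* Gradient rule implemented above: d/dx sqrt(x) = 1/(2*sqrt(x)), hence
+   gradInput = gradOutput / (2*output); the output == 0 case is clamped to 0
+   since the true derivative diverges at x = 0. */
+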
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Square.c b/contrib/lua-torch/nn/lib/THNN/generic/Square.c
new file mode 100644
index 000000000..aad0a911c
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Square.c
@@ -0,0 +1,59 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Square.c"
+#else
+
+void THNN_(Square_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ THTensor_(resizeAs)(output, input);
+
+ if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
+ {
+ TH_TENSOR_APPLY2(real, output, real, input,
+ *output_data = (*input_data) * (*input_data);
+ );
+ }
+ else
+ {
+ real *output_data = THTensor_(data)(output);
+ real *input_data = THTensor_(data)(input);
+ long i;
+#pragma omp parallel for private(i)
+ for (i = 0; i < THTensor_(nElement)(input); i++)
+ output_data[i] = input_data[i]*input_data[i];
+ }
+}
+
+void THNN_(Square_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput)
+{
+ THNN_CHECK_SHAPE(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+
+ if (input->nDimension == 1 ||
+ !THTensor_(isContiguous)(input) ||
+ !THTensor_(isContiguous)(gradOutput) ||
+ !THTensor_(isContiguous)(gradInput))
+ {
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ *gradInput_data = 2.0 * (*gradOutput_data) * (*input_data);
+ );
+ }
+ else
+ {
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *gradInput_data = THTensor_(data)(gradInput);
+ real *input_data = THTensor_(data)(input);
+ long i;
+#pragma omp parallel for private(i)
+ for (i = 0; i < THTensor_(nElement)(gradInput); i++)
+ gradInput_data[i] = 2.0 * gradOutput_data[i] * input_data[i];
+ }
+}
+
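+/* Gradient rule implemented above: d/dx x^2 = 2x, hence
+   gradInput = 2 * input * gradOutput, elementwise. */
+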
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/THNN.h b/contrib/lua-torch/nn/lib/THNN/generic/THNN.h
new file mode 100644
index 000000000..76a28eb2d
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/THNN.h
@@ -0,0 +1,1501 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THNN.h"
+#else
+
+TH_API void THNN_(Abs_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output); // [OUT] Abs output
+TH_API void THNN_(Abs_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradInput); // [OUT] gradient w.r.t. input
+
+TH_API void THNN_(AbsCriterion_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // tensor with target values
+ THTensor *output, // [OUT] a one-element tensor with loss
+ bool sizeAverage); // if true, the loss will be divided by batch size
+TH_API void THNN_(AbsCriterion_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // tensor with target values
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ bool sizeAverage); // if true, the gradient will be normalized by batch size
+
+TH_API void THNN_(BCECriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ THTensor *weights); // [OPTIONAL]
+TH_API void THNN_(BCECriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ THTensor *weights); // [OPTIONAL]
+
+TH_API void THNN_(ClassNLLCriterion_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor (1D/2D)
+ THIndexTensor *target, // tensor containing indexes of target classes
+ THTensor *output, // [OUT] a one-element tensor with loss
+ bool sizeAverage, // if true, the loss will be normalized by batch size and class weights
+ THTensor *weights, // [OPTIONAL] class weights
+ THTensor *total_weight, // [BUFFER]
+ long ignore_index); // target index to ignore (loss = 0, gradInput = 0)
+TH_API void THNN_(ClassNLLCriterion_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor (1D/2D)
+ THIndexTensor *target, // tensor containing indexes of target classes
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ bool sizeAverage, // if true, the loss will be normalized by batch size and class weights
+ THTensor *weights, // [OPTIONAL] class weights
+ THTensor *total_weight, // [BUFFER]
+ long ignore_index); // target index to ignore (loss = 0, gradInput = 0)
+
+TH_API void THNN_(SpatialClassNLLCriterion_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor (4D)
+ THIndexTensor *target, // tensor containing indexes of target classes (3D)
+ THTensor *output, // [OUT] a one-element tensor with loss
+ bool sizeAverage, // if true, the loss will be normalized by batch size and class weights
+ THTensor *weights, // [OPTIONAL] class weights
+ THTensor *total_weight); // [BUFFER]
+TH_API void THNN_(SpatialClassNLLCriterion_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor (4D)
+ THIndexTensor *target, // tensor containing indexes of target classes (3D)
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ bool sizeAverage, // if true, the loss will be normalized by batch size and class weights
+ THTensor *weights, // [OPTIONAL] class weights
+ THTensor *total_weight); // [BUFFER]
+
+TH_API void THNN_(ELU_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] ELU output
+ accreal alpha, // an ELU parameter (as in paper)
+          bool inplace);            // if true, modifies the input tensor and sets the output tensor on it (no additional memory is allocated)
+TH_API void THNN_(ELU_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *output, // output from a forward pass
+ accreal alpha, // an ELU parameter (as in paper)
+ bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated)
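+// Sketch of the ELU map (the standard definition from the paper, restated
+// here for reference):
+//   f(x) = x                     if x > 0
+//   f(x) = alpha * (exp(x) - 1)  otherwise
+// so the backward pass can be computed from the saved output alone:
+// df/dx = 1 for x > 0 and output + alpha otherwise.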
+
+TH_API void THNN_(DistKLDivCriterion_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // target tensor
+ THTensor *output, // [OUT] a one-element tensor containing the loss
+ bool sizeAverage); // if true, the loss will be normalized **by total number of elements**
+TH_API void THNN_(DistKLDivCriterion_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // target tensor
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ bool sizeAverage); // if true, the loss will be normalized **by total number of elements**
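+// A sketch of the pointwise term (assuming input holds log-probabilities and
+// target holds probabilities): loss = sum target * (log(target) - input),
+// divided by the total number of elements when sizeAverage is true.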
+
+TH_API void THNN_(GatedLinear_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] output tensor, half size of input along dimension dim
+ int dim); // dimension for halving operation
+TH_API void THNN_(GatedLinear_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ int dim); // dimension for halving operation
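+// Sketch: the input is split into halves a and b along dimension dim and
+//   output = a * sigmoid(b)
+// which is why the output is half the size of the input along dim.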
+
+// HardShrink outputs 0 on the interval (-lambda; lambda) and the original value otherwise.
+TH_API void THNN_(HardShrink_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] output tensor
+ accreal lambda); // HardShrink parameter
+TH_API void THNN_(HardShrink_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ accreal lambda); // HardShrink parameter
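+// Equivalently: f(x) = x if |x| > lambda, else 0; the gradient is passed
+// through where |x| > lambda and zeroed elsewhere.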
+
+// HardTanh clamps the values to the interval [min_val; max_val].
+TH_API void THNN_(HardTanh_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] output tensor
+ accreal min_val, // lower threshold
+ accreal max_val, // upper threshold
+ bool inplace);
+TH_API void THNN_(HardTanh_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. the input
+ accreal min_val, // lower threshold
+ accreal max_val, // upper threshold
+ bool inplace);
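+// A minimal sketch: f(x) = max(min_val, min(x, max_val)); the gradient is
+// passed through only where min_val < x < max_val.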
+
+TH_API void THNN_(L1Cost_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output); // [OUT] output tensor
+TH_API void THNN_(L1Cost_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // [OPTIONAL] gradient w.r.t. module's output
+ THTensor *gradInput); // [OUT] gradient w.r.t. the input
+
+TH_API void THNN_(LeakyReLU_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // [MODIFIED] input tensor
+ THTensor *output, // [OUT] output tensor
+ accreal negval, // negative part slope
+ bool inplace); // if true, modifies the input tensor and sets the output tensor on it (no additional memory is allocated)
+TH_API void THNN_(LeakyReLU_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // [MODIFIED] gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. the input
+ accreal negval, // negative part slope
+ bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated)
+
+TH_API void THNN_(GRUFused_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *hidden,
+ THTensor *bias1, // [OPTIONAL]
+ THTensor *bias2, // [OPTIONAL]
+ THTensor *hx,
+ THTensor *output,
+ THTensor *storage);
+TH_API void THNN_(GRUFused_updateGradInput)(
+ THNNState *state,
+ THTensor *gradInInput,
+ THTensor *gradInHidden,
+ THTensor *gradOutput,
+ THTensor *gradInputHx,
+ THTensor *storage);
+
+TH_API void THNN_(LSTMFused_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *hidden,
+ THTensor *bias1, // [OPTIONAL]
+ THTensor *bias2, // [OPTIONAL]
+ THTensor *cell,
+ THTensor *output,
+ THTensor *outputCell);
+TH_API void THNN_(LSTMFused_updateGradInput)(
+ THNNState *state,
+ THTensor *storage,
+ THTensor *gradInGates,
+ THTensor *cx,
+ THTensor *cy,
+ THTensor *gradOutput,
+ THTensor *gradOutputCell,
+ THTensor *gradInputCx);
+
+TH_API void THNN_(LogSigmoid_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output, // output tensor
+ THTensor *buffer); // [BUFFER]
+TH_API void THNN_(LogSigmoid_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input
+ THTensor *gradOutput, // gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *buffer); // [BUFFER]
+
+TH_API void THNN_(LogSoftMax_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output); // [OUT] output tensor
+TH_API void THNN_(LogSoftMax_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *output); // module's output
+
+TH_API void THNN_(LookupTable_accGradParameters)(
+ THNNState *state,
+ THIndexTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THIntegerTensor *count,
+ THTensor *sorted, // [OPTIONAL]
+ THIndexTensor *indices, // [OPTIONAL]
+ bool scaleGradByFreq,
+ int paddingValue,
+ accreal scale);
+
+TH_API void THNN_(LookupTable_renorm)(
+ THNNState *state, // library's state
+ THIndexTensor *idx, // vector containing row indices (modified in function)
+ THTensor *weight, // 2D tensor whose rows will be renormalized
+ accreal maxNorm, // maximum norm
+ accreal normType); // the norm type (e.g., normType=2 selects the 2-norm)
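+// Sketch of the renorm rule, as suggested by the parameter names: each
+// indexed row w with ||w||_normType > maxNorm is rescaled in place to
+// w * maxNorm / ||w||_normType.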
+
+TH_API void THNN_(MarginCriterion_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // target tensor (should contain only 1s and -1s)
+ THTensor *output, // [OUT] a one-element tensor containing the loss
+ bool sizeAverage, // if true, the loss is normalized by **total number of elements**
+ accreal margin); // a margin that is required for the loss to be 0
+
+TH_API void THNN_(MarginCriterion_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // target tensor (should contain only 1s and -1s)
+ THTensor *gradInput, // [OUT] gradient w.r.t. module's input
+ bool sizeAverage, // if true, the gradient is normalized by **total number of elements**
+ accreal margin); // a margin that is required for the loss to be 0
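+// A minimal sketch of the hinge loss computed above:
+//   loss = sum_i max(0, margin - y_i * x_i),  y_i in {-1, 1},
+// divided by the total number of elements when sizeAverage is true.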
+
+TH_API void THNN_(SoftMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage);
+
+TH_API void THNN_(SoftMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage);
+
+TH_API void THNN_(MSECriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage);
+TH_API void THNN_(MSECriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage);
+
+TH_API void THNN_(MultiLabelMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ THTensor *isTarget,
+ bool sizeAverage);
+TH_API void THNN_(MultiLabelMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ THTensor *isTarget,
+ bool sizeAverage);
+
+TH_API void THNN_(MultiMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ int p,
+ THTensor* weights, // [OPTIONAL]
+ accreal margin);
+TH_API void THNN_(MultiMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ int p,
+ THTensor *weights, // [OPTIONAL]
+ accreal margin);
+
+TH_API void THNN_(PReLU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THIndex_t nOutputPlane);
+TH_API void THNN_(PReLU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THIndex_t nOutputPlane);
+TH_API void THNN_(PReLU_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradWeight,
+ THTensor *gradWeightBuf,
+ THTensor *gradWeightBuf2,
+ THIndex_t nOutputPlane,
+ accreal scale);
+
+TH_API void THNN_(Linear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *addBuffer);
+TH_API void THNN_(Linear_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight);
+TH_API void THNN_(Linear_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *addBuffer,
+ accreal scale);
+
+TH_API void THNN_(RReLU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *noise,
+ accreal lower,
+ accreal upper,
+ bool train,
+ bool inplace,
+ THGenerator *generator);
+TH_API void THNN_(RReLU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *noise,
+ accreal lower,
+ accreal upper,
+ bool train,
+ bool inplace);
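+// Hedged sketch of the RReLU semantics (per the randomized leaky ReLU
+// paper): in training, each negative element is scaled by a slope drawn
+// uniformly from [lower, upper], recorded in noise for the backward pass;
+// at evaluation time the fixed slope (lower + upper) / 2 is used instead.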
+
+TH_API void THNN_(Sigmoid_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output);
+TH_API void THNN_(Sigmoid_updateGradInput)(
+ THNNState *state,
+ THTensor *input, // [OPTIONAL]
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output);
+
+TH_API void THNN_(SmoothL1Criterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage);
+TH_API void THNN_(SmoothL1Criterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage);
+
+TH_API void THNN_(SoftMax_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output);
+TH_API void THNN_(SoftMax_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output);
+
+TH_API void THNN_(SoftPlus_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal beta,
+ accreal threshold);
+TH_API void THNN_(SoftPlus_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output,
+ accreal beta,
+ accreal threshold);
+
+TH_API void THNN_(SoftShrink_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal lambda);
+TH_API void THNN_(SoftShrink_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal lambda);
+
+
+TH_API void THNN_(IndexLinear_updateOutput)(
+ THNNState *state,
+ THIndexTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THIndexTensor *sizes,
+ THIndexTensor *cumSumSizes,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *normalizedValues,
+ int train);
+TH_API void THNN_(IndexLinear_accGradParameters)(
+ THNNState *state,
+ THIndexTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THIndexTensor *sizes,
+ THIndexTensor *cumSumSizes,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor* valuesBuffer,
+ accreal weightDecay,
+ accreal scale);
+TH_API void THNN_(IndexLinear_accUpdateGradParameters)(
+ THNNState *state,
+ THIndexTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THIndexTensor *sizes,
+ THIndexTensor *cumSumSizes,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay,
+ accreal scale);
+TH_API void THNN_(IndexLinear_updateParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ THIndexTensor *runningKeys,
+ THIndexTensor *cumSumSizes,
+ long keysOffset,
+ accreal weightDecay,
+ accreal learningRate);
+
+TH_API void THNN_(SparseLinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias);
+TH_API void THNN_(SparseLinear_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay,
+ accreal scale);
+TH_API void THNN_(SparseLinear_zeroGradParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput);
+TH_API void THNN_(SparseLinear_updateParameters)(
+ THNNState *state,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput,
+ accreal learningRate);
+TH_API void THNN_(SparseLinear_legacyUpdateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias);
+TH_API void THNN_(SparseLinear_legacyAccGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay,
+ accreal scale);
+TH_API void THNN_(SparseLinear_legacyZeroGradParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput);
+TH_API void THNN_(SparseLinear_legacyUpdateParameters)(
+ THNNState *state,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput,
+ accreal learningRate);
+
+TH_API void THNN_(Sqrt_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal eps);
+TH_API void THNN_(Sqrt_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output);
+
+TH_API void THNN_(Square_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output);
+TH_API void THNN_(Square_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput);
+
+TH_API void THNN_(Tanh_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output);
+TH_API void THNN_(Tanh_updateGradInput)(
+ THNNState *state,
+ THTensor *input, // [OPTIONAL]
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output);
+
+TH_API void THNN_(Threshold_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal threshold,
+ accreal val,
+ bool inplace);
+TH_API void THNN_(Threshold_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal threshold,
+ accreal val,
+ bool inplace);
+
+TH_API void THNN_(TemporalConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW, int dW,
+ int inputFrameSize,
+ int outputFrameSize);
+TH_API void THNN_(TemporalConvolution_updateGradInput)(
+ THNNState* state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW, int dW);
+TH_API void THNN_(TemporalConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW, int dW,
+ accreal scale);
+TH_API void THNN_(TemporalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW, int dW);
+TH_API void THNN_(TemporalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW, int dW);
+TH_API void THNN_(TemporalSubSampling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW, int dW,
+ int inputFrameSize);
+TH_API void THNN_(TemporalSubSampling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW, int dW);
+TH_API void THNN_(TemporalSubSampling_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW, int dW,
+ accreal scale);
+
+TH_API void THNN_(TemporalRowConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst);
+TH_API void THNN_(TemporalRowConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst);
+TH_API void THNN_(TemporalRowConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst,
+ accreal scale);
+
+TH_API void THNN_(BatchNormalization_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight, // [OPTIONAL]
+ THTensor *bias, // [OPTIONAL]
+ THTensor *running_mean,
+ THTensor *running_var,
+ THTensor *save_mean,
+ THTensor *save_std,
+ bool train,
+ double momentum,
+ double eps);
+TH_API void THNN_(BatchNormalization_backward)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput, // [OPTIONAL]
+ THTensor *gradWeight, // [OPTIONAL]
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *weight, // [OPTIONAL]
+ THTensor *running_mean,
+ THTensor *running_var,
+ THTensor *save_mean,
+ THTensor *save_std,
+ bool train,
+ double scale,
+ double eps);
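+// Sketch of the normalization (standard batch norm; save_mean and save_std
+// cache the statistics actually used so the backward pass can reuse them):
+//   y = weight * (x - mean) / sqrt(var + eps) + bias
+// where mean and var are per-channel batch statistics when train is true and
+// the running statistics otherwise; in training the running statistics are
+// updated as a momentum-weighted moving average.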
+
+TH_API void THNN_(SpatialConvolutionMap_updateOutput)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] convolution output
+ THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW)
+ THTensor *bias, // 1D bias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH); // stride
+TH_API void THNN_(SpatialConvolutionMap_updateGradInput)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW)
+ THTensor *bias, // 1D bias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH); // stride
+TH_API void THNN_(SpatialConvolutionMap_accGradParameters)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradWeight, // 3D gradWeight tensor (connTable:size(1) x kH x kW)
+ THTensor *gradBias, // 1D gradBias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH, // stride
+ accreal scale); // scaling factor
+
+TH_API void THNN_(SpatialConvolutionMM_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH);
+TH_API void THNN_(SpatialConvolutionMM_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH);
+TH_API void THNN_(SpatialConvolutionMM_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ accreal scale);
+
+TH_API void THNN_(SpatialDepthWiseConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH);
+TH_API void THNN_(SpatialDepthWiseConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH);
+TH_API void THNN_(SpatialDepthWiseConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ accreal scale);
+
+TH_API void THNN_(SpatialConvolutionLocal_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight);
+TH_API void THNN_(SpatialConvolutionLocal_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight);
+TH_API void THNN_(SpatialConvolutionLocal_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight,
+ accreal scale);
+
+TH_API void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int owidth, int oheight);
+TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices);
+
+TH_API void THNN_(SpatialAdaptiveAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int owidth, int oheight);
+TH_API void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput);
+
+TH_API void THNN_(SpatialAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode,
+ bool count_include_pad);
+TH_API void THNN_(SpatialAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode,
+ bool count_include_pad);
+
+TH_API void THNN_(SpatialFractionalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputW, int outputH,
+ int poolSizeW, int poolSizeH,
+ THIndexTensor *indices,
+ THTensor *randomSamples);
+TH_API void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int outputW, int outputH,
+ int poolSizeW, int poolSizeH,
+ THIndexTensor *indices);
+
+TH_API void THNN_(SpatialFullConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH);
+TH_API void THNN_(SpatialFullConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH);
+TH_API void THNN_(SpatialFullConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH,
+ accreal scale);
+
+TH_API void THNN_(SpatialFullConvolutionMap_updateOutput)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] convolution output
+ THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW)
+ THTensor *bias, // 1D bias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH); // stride
+TH_API void THNN_(SpatialFullConvolutionMap_updateGradInput)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW)
+ THTensor *bias, // 1D bias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH); // stride
+TH_API void THNN_(SpatialFullConvolutionMap_accGradParameters)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradWeight, // 3D gradWeight tensor (connTable:size(1) x kH x kW)
+ THTensor *gradBias, // 1D gradBias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH, // stride
+ accreal scale); // scaling factor
+
+TH_API void THNN_(SpatialDilatedConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH);
+
+TH_API void THNN_(SpatialDilatedConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH);
+
+TH_API void THNN_(SpatialDilatedConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ accreal scale);
+
+TH_API void THNN_(SpatialMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode);
+TH_API void THNN_(SpatialMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode);
+
+TH_API void THNN_(SpatialDilatedMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ bool ceil_mode);
+TH_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ bool ceil_mode);
+
+TH_API void THNN_(SpatialMaxUnpooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int owidth, int oheight);
+TH_API void THNN_(SpatialMaxUnpooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int owidth, int oheight);
+
+TH_API void THNN_(SpatialSubSampling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW, int kH,
+ int dW, int dH);
+TH_API void THNN_(SpatialSubSampling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW, int kH,
+ int dW, int dH);
+TH_API void THNN_(SpatialSubSampling_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW, int kH,
+ int dW, int dH,
+ accreal scale);
+
+TH_API void THNN_(SpatialUpSamplingNearest_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int scale_factor);
+TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int scale_factor);
+
+TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputHeight,
+ int outputWidth);
+TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
+ THNNState *state,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int nbatch,
+ int nchannels,
+ int inputHeight,
+ int inputWidth,
+ int outputHeight,
+ int outputWidth);
+
+TH_API void THNN_(unfolded_acc)(
+ THTensor *finput,
+ THTensor *input,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int nInputPlane,
+ int inputWidth, int inputHeight,
+ int outputWidth, int outputHeight);
+TH_API void THNN_(unfolded_copy)(
+ THTensor *finput,
+ THTensor *input,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int nInputPlane,
+ int inputWidth, int inputHeight,
+ int outputWidth, int outputHeight);
+
+TH_API void THNN_(VolumetricAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH);
+TH_API void THNN_(VolumetricAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH);
+
+TH_API void THNN_(VolumetricConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+TH_API void THNN_(VolumetricConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+TH_API void THNN_(VolumetricConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ accreal scale);
+
+TH_API void THNN_(VolumetricConvolutionMM_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *finput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+TH_API void THNN_(VolumetricConvolutionMM_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+TH_API void THNN_(VolumetricConvolutionMM_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *finput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ accreal scale);
+
+TH_API void THNN_(VolumetricFractionalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputT, int outputW, int outputH,
+ int poolSizeT, int poolSizeW, int poolSizeH,
+ THIndexTensor *indices,
+ THTensor *randomSamples);
+TH_API void THNN_(VolumetricFractionalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int outputT, int outputW, int outputH,
+ int poolSizeT, int poolSizeW, int poolSizeH,
+ THIndexTensor *indices);
+
+TH_API void THNN_(VolumetricFullConvolution_updateOutput)(
+ THNNState *state, // library state
+ THTensor *input, // 4D or 5D (batch) tensor
+ THTensor *output, // [OUT] volumetric convolution output
+ THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+ THTensor *bias, // [OPTIONAL] bias tensor (nOutputPlane)
+ THTensor *finput, // [BUFFER] internal columns buffer
+ THTensor *fgradInput, // [BUFFER] internal ones buffer
+ int dT, int dW, int dH, // stride of the convolution
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH); // extra output adjustment
+TH_API void THNN_(VolumetricFullConvolution_updateGradInput)(
+ THNNState *state, // library state
+ THTensor *input, // 4D or 5D (batch) tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+ THTensor *finput, // internal columns buffer
+ THTensor *fgradInput, // internal ones buffer
+ int dT, int dW, int dH, // stride
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH); // extra output adjustment
+TH_API void THNN_(VolumetricFullConvolution_accGradParameters)(
+ THNNState *state, // library state
+ THTensor *input, // 4D or 5D (batch) tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradWeight, // gradWeight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+ THTensor *gradBias, // [OPTIONAL] gradBias tensor (nOutputPlane)
+ THTensor *finput, // internal columns buffer
+ THTensor *fgradInput, // internal ones buffer
+ int dT, int dW, int dH, // stride
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH, // extra output adjustment
+ accreal scale); // scaling factor
+
+TH_API void THNN_(VolumetricDilatedConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH);
+
+TH_API void THNN_(VolumetricDilatedConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH);
+
+TH_API void THNN_(VolumetricDilatedConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH,
+ accreal scale);
+
+TH_API void THNN_(VolumetricMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ bool ceilMode);
+TH_API void THNN_(VolumetricMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ bool ceilMode);
+
+TH_API void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ int dilationT, int dilationW, int dilationH,
+ bool ceilMode);
+TH_API void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ int dilationT, int dilationW, int dilationH,
+ bool ceilMode);
+
+TH_API void THNN_(VolumetricMaxUnpooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int oT, int oW, int oH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int oT, int oW, int oH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+
+TH_API void THNN_(SpatialReflectionPadding_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b);
+
+TH_API void THNN_(SpatialReflectionPadding_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b);
+
+TH_API void THNN_(SpatialReplicationPadding_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b);
+
+TH_API void THNN_(SpatialReplicationPadding_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b);
+
+TH_API void THNN_(VolumetricReplicationPadding_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback);
+
+TH_API void THNN_(VolumetricReplicationPadding_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback);
+
+TH_API void THNN_(VolumetricUpSamplingNearest_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int scale_factor);
+TH_API void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int scale_factor);
+
+TH_API void THNN_(VolumetricUpSamplingTrilinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputDepth,
+ int outputHeight,
+ int outputWidth);
+TH_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)(
+ THNNState *state,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int nbatch,
+ int nchannels,
+ int inputDepth,
+ int inputHeight,
+ int inputWidth,
+ int outputDepth,
+ int outputHeight,
+ int outputWidth);
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Tanh.c b/contrib/lua-torch/nn/lib/THNN/generic/Tanh.c
new file mode 100644
index 000000000..ecf0708c2
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Tanh.c
@@ -0,0 +1,49 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Tanh.c"
+#else
+
+void THNN_(Tanh_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ THTensor_(tanh)(output, input);
+}
+
+void THNN_(Tanh_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output)
+{
+ THNN_CHECK_SHAPE(output, gradOutput);
+ THTensor_(resizeAs)(gradInput, output);
+
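+ /* 1D or non-contiguous tensors take the generic strided apply macro below;
+ contiguous ones use the flat OpenMP loop instead. */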
+ if (output->nDimension == 1 ||
+ !THTensor_(isContiguous)(output) ||
+ !THTensor_(isContiguous)(gradOutput) ||
+ !THTensor_(isContiguous)(gradInput))
+ {
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+ real z = *output_data; \
+ *gradInput_data = *gradOutput_data * (1. - z*z);
+ );
+ }
+ else
+ {
+ real* ptr_gradOutput = THTensor_(data)(gradOutput);
+ real* ptr_gradInput = THTensor_(data)(gradInput);
+ real* ptr_output = THTensor_(data)(output);
+ long i;
+
+#pragma omp parallel for private(i)
+ for (i = 0; i < THTensor_(nElement)(gradInput); i++)
+ {
+ real z = ptr_output[i];
+ ptr_gradInput[i] = ptr_gradOutput[i] * (1. - z*z);
+ }
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/TemporalConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/TemporalConvolution.c
new file mode 100644
index 000000000..8cfd97d85
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/TemporalConvolution.c
@@ -0,0 +1,398 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TemporalConvolution.c"
+#else
+
+static inline void THNN_(TemporalConvolution_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ int kW,
+ int dW,
+ int *inputFrameSize) {
+
+ THArgCheck(kW > 0, 9,
+ "kernel size should be greater than zero, but got kW: %d", kW);
+ THArgCheck(dW > 0, 11,
+ "stride should be greater than zero, but got dW: %d", dW);
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ if (input->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+ THNN_ARGCHECK(input->nDimension == 2 || input->nDimension == 3, 2, input,
+ "2D or 3D (batch mode) tensor expected for input, but got: %s");
+ if (inputFrameSize != NULL) {
+ THArgCheck(input->size[dimF] == *inputFrameSize, 2,
+ "invalid input frame size. Got: %d, Expected: %d",
+ input->size[dimF], *inputFrameSize);
+ }
+ THArgCheck(input->size[dimS] >= kW, 2,
+ "input sequence smaller than kernel size. Got: %d, Expected: %d",
+ input->size[dimS], kW);
+}
+
+void THNN_(TemporalConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW,
+ int dW,
+ int inputFrameSize,
+ int outputFrameSize)
+{
+ THTensor *outputWindow, *inputWindow;
+ int nInputFrame, nOutputFrame;
+ long k, i;
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ if (input->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
+ THNN_(TemporalConvolution_shapeCheck)
+ (state, input, kW, dW, &inputFrameSize);
+ input = THTensor_(newContiguous)(input);
+ outputWindow = THTensor_(new)();
+ inputWindow = THTensor_(new)();
+
+ nInputFrame = input->size[dimS];
+ nOutputFrame = (nInputFrame - kW) / dW + 1;
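+ /* valid convolution: one output frame per kW-frame window, stepped by dW */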
+
+ if (input->nDimension == 2)
+ {
+ THTensor_(resize2d)(output,
+ nOutputFrame,
+ outputFrameSize);
+
+ /* bias first */
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THTensor_(select)(outputWindow, output, 0, k);
+ THTensor_(copy)(outputWindow, bias);
+ }
+
+ /* ouch */
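+ /* Batch every outputFrameStride-th output frame into one GEMM: their input
+ windows start inputFrameStride (>= kW) frames apart, so they do not overlap
+ and each is contiguous in memory; a strided 2D view plus a single addmm then
+ produces nFrame output frames per iteration. */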
+ for(k = 0; nOutputFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputFrame -= nFrame;
+
+ THTensor_(setStorage2d)(inputWindow, input->storage,
+ input->storageOffset+k*dW*input->size[1],
+ nFrame, inputFrameStride*input->size[1],
+ kW*input->size[1], 1);
+
+ THTensor_(setStorage2d)(outputWindow, output->storage,
+ output->storageOffset + k*output->size[1],
+ nFrame, outputFrameStride*output->size[1],
+ output->size[1], 1);
+
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 0, 1);
+ THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, tweight);
+ THTensor_(free)(tweight);
+ }
+ }
+ else
+ {
+ THTensor *outputSample = THTensor_(new)();
+ THTensor *inputSample = THTensor_(new)();
+ int nBatchFrame = input->size[0];
+
+ THTensor_(resize3d)(output,
+ nBatchFrame,
+ nOutputFrame,
+ outputFrameSize);
+
+ for(i = 0; i < nBatchFrame; i++)
+ {
+ THTensor_(select)(outputSample, output, 0, i);
+ THTensor_(select)(inputSample, input, 0, i);
+ long nOutputSampleFrame = nOutputFrame;
+
+ /* bias first */
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THTensor_(select)(outputWindow, outputSample, 0, k);
+ THTensor_(copy)(outputWindow, bias);
+ }
+
+ /* ouch */
+ for(k = 0; nOutputSampleFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputSampleFrame -= nFrame;
+
+ THTensor_(setStorage2d)(inputWindow, inputSample->storage,
+ inputSample->storageOffset+k*dW*inputSample->size[1],
+ nFrame, inputFrameStride*inputSample->size[1],
+ kW*inputSample->size[1], 1);
+
+ THTensor_(setStorage2d)(outputWindow, outputSample->storage,
+ outputSample->storageOffset + k*outputSample->size[1],
+ nFrame, outputFrameStride*outputSample->size[1],
+ outputSample->size[1], 1);
+
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 0, 1);
+ THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, tweight);
+ THTensor_(free)(tweight);
+ }
+ }
+ THTensor_(free)(outputSample);
+ THTensor_(free)(inputSample);
+ }
+
+ THTensor_(free)(outputWindow);
+ THTensor_(free)(inputWindow);
+ THTensor_(free)(input);
+
+}
+
+void THNN_(TemporalConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW,
+ int dW)
+{
+ long nInputFrame;
+ long nOutputFrame;
+
+ THTensor *gradOutputWindow;
+ THTensor *gradInputWindow;
+ long k, i;
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ if (gradOutput->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THNN_(TemporalConvolution_shapeCheck)(
+ state, input, kW, dW, NULL);
+ nInputFrame = input->size[dimS];
+ nOutputFrame = gradOutput->size[dimS];
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ gradOutputWindow = THTensor_(new)();
+ gradInputWindow = THTensor_(new)();
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (gradOutput->nDimension == 2)
+ {
+ /* ouch */
+ for(k = 0; nOutputFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputFrame -= nFrame;
+
+ THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage,
+ gradOutput->storageOffset + k*gradOutput->size[1],
+ nFrame, outputFrameStride*gradOutput->size[1],
+ gradOutput->size[1], 1);
+
+ THTensor_(setStorage2d)(gradInputWindow, gradInput->storage,
+ gradInput->storageOffset+k*dW*gradInput->size[1],
+ nFrame, inputFrameStride*gradInput->size[1],
+ kW*gradInput->size[1], 1);
+
+ THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight);
+ }
+ }
+ else
+ {
+ THTensor *gradOutputSample = THTensor_(new)();
+ THTensor *gradInputSample = THTensor_(new)();
+ int nBatchFrame = input->size[0];
+
+ for(i = 0; i < nBatchFrame; i++)
+ {
+ THTensor_(select)(gradOutputSample, gradOutput, 0, i);
+ THTensor_(select)(gradInputSample, gradInput, 0, i);
+ int nOutputSampleFrame = nOutputFrame;
+
+ /* ouch */
+ for(k = 0; nOutputSampleFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputSampleFrame -= nFrame;
+
+ THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage,
+ gradOutputSample->storageOffset + k*gradOutputSample->size[1],
+ nFrame, outputFrameStride*gradOutputSample->size[1],
+ gradOutputSample->size[1], 1);
+
+ THTensor_(setStorage2d)(gradInputWindow, gradInputSample->storage,
+ gradInputSample->storageOffset+k*dW*gradInputSample->size[1],
+ nFrame, inputFrameStride*gradInputSample->size[1],
+ kW*gradInputSample->size[1], 1);
+
+ THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight);
+ }
+ }
+ THTensor_(free)(gradOutputSample);
+ THTensor_(free)(gradInputSample);
+ }
+
+ THTensor_(free)(gradOutputWindow);
+ THTensor_(free)(gradInputWindow);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(input);
+
+}
+
+void THNN_(TemporalConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW,
+ int dW,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ long nInputFrame;
+ long nOutputFrame;
+
+ THTensor *gradOutputWindow;
+ THTensor *inputWindow;
+ long k, i;
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ if (gradOutput->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+
+ THNN_(TemporalConvolution_shapeCheck)(
+ state, input, kW, dW, NULL);
+ nInputFrame = input->size[dimS];
+ nOutputFrame = gradOutput->size[dimS];
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ gradOutputWindow = THTensor_(new)();
+ inputWindow = THTensor_(new)();
+
+ if (input->nDimension == 2)
+ {
+ /* bias first */
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THTensor_(select)(gradOutputWindow, gradOutput, 0, k);
+ THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow);
+ }
+
+ /* ouch */
+ for(k = 0; nOutputFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputFrame -= nFrame;
+
+ THTensor_(setStorage2d)(inputWindow, input->storage,
+ input->storageOffset+k*dW*input->size[1],
+ nFrame, inputFrameStride*input->size[1],
+ kW*input->size[1], 1);
+
+ THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage,
+ gradOutput->storageOffset + k*gradOutput->size[1],
+ nFrame, outputFrameStride*gradOutput->size[1],
+ gradOutput->size[1], 1);
+
+ THTensor *tgradOutputWindow = THTensor_(new)();
+ THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1);
+ THTensor_(addmm)(gradWeight, 1, gradWeight, scale, tgradOutputWindow, inputWindow);
+ THTensor_(free)(tgradOutputWindow);
+ }
+ }
+ else
+ {
+ THTensor *gradOutputSample = THTensor_(new)();
+ THTensor *inputSample = THTensor_(new)();
+ int nBatchFrame = input->size[0];
+
+ for(i = 0; i < nBatchFrame; i++)
+ {
+ THTensor_(select)(gradOutputSample, gradOutput, 0, i);
+ THTensor_(select)(inputSample, input, 0, i);
+ int nOutputSampleFrame = nOutputFrame;
+
+ /* bias first */
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THTensor_(select)(gradOutputWindow, gradOutputSample, 0, k);
+ THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow);
+ }
+
+ /* ouch */
+ for(k = 0; nOutputSampleFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputSampleFrame -= nFrame;
+
+ THTensor_(setStorage2d)(inputWindow, inputSample->storage,
+ inputSample->storageOffset+k*dW*inputSample->size[1],
+ nFrame, inputFrameStride*inputSample->size[1],
+ kW*inputSample->size[1], 1);
+
+ THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage,
+ gradOutputSample->storageOffset + k*gradOutputSample->size[1],
+ nFrame, outputFrameStride*gradOutputSample->size[1],
+ gradOutputSample->size[1], 1);
+
+ THTensor *tgradOutputWindow = THTensor_(new)();
+ THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1);
+ THTensor_(addmm)(gradWeight, 1, gradWeight, scale, tgradOutputWindow, inputWindow);
+ THTensor_(free)(tgradOutputWindow);
+ }
+ }
+ THTensor_(free)(gradOutputSample);
+ THTensor_(free)(inputSample);
+ }
+
+ THTensor_(free)(gradOutputWindow);
+ THTensor_(free)(inputWindow);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(input);
+
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/TemporalMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/TemporalMaxPooling.c
new file mode 100644
index 000000000..344c1b3fd
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/TemporalMaxPooling.c
@@ -0,0 +1,283 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TemporalMaxPooling.c"
+#else
+
+static inline void THNN_(TemporalMaxPooling_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THIndexTensor *indices,
+ int kW,
+ int dW) {
+ long niframe;
+ long framesize;
+ long noframe;
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+ int ndims = input->nDimension;
+
+ if (input->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+
+ niframe = input->size[dimS];
+ framesize = input->size[dimF];
+ noframe = (niframe - kW) / dW + 1;
+
+ THArgCheck(kW > 0, 5,
+ "kernel size should be greater than zero, but got kW: %d", kW);
+ THArgCheck(dW > 0, 6,
+ "stride should be greater than zero, but got dW: %d", dW);
+
+ THNN_ARGCHECK(input->nDimension == 2 || input->nDimension == 3, 2, input,
+ "2D or 3D (batch mode) tensor expected for input, but got: %s");
+ THArgCheck(input->size[dimS] >= kW, 2,
+ "input sequence smaller than kernel size. Got: %d, Expected: %d",
+ input->size[dimS], kW);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimS, noframe);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimF, framesize);
+ }
+ if (indices != NULL) {
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndims, dimS, noframe);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndims, dimF, framesize);
+ }
+}
+
+void THNN_(TemporalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW,
+ int dW)
+{
+ long niframe;
+ long framesize;
+ long noframe;
+
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+ long t, y;
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ THNN_(TemporalMaxPooling_shapeCheck)(state, input, NULL, NULL, kW, dW);
+
+ if (input->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+
+ /* sizes */
+ niframe = input->size[dimS];
+ framesize = input->size[dimF];
+ noframe = (niframe - kW) / dW + 1;
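+ /* one pooled frame per window of kW input frames, stepped by dW */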
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ if (input->nDimension == 2)
+ {
+ /* resize output */
+ THTensor_(resize2d)(output, noframe, framesize);
+
+ /* indices will contain index locations for each output point */
+ THIndexTensor_(resize2d)(indices, noframe, framesize);
+
+ /* get raw pointers */
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ for(t = 0; t < noframe; t++)
+ {
+ real *ip = input_data + t*framesize*dW;
+ real *op = output_data + t*framesize;
+ THIndex_t *xp = indices_data + t*framesize;
+#pragma omp parallel for private(y)
+ for(y = 0; y < framesize; y++)
+ {
+ /* compute local max: */
+ long maxindex = -1;
+ real maxval = -THInf;
+ long x;
+ for(x = 0; x < kW; x++)
+ {
+ real val = ip[x*framesize+y];
+ if (val > maxval)
+ {
+ maxval = val;
+ maxindex = x;
+ }
+ }
+
+ /* set output to local max */
+ op[y] = maxval;
+ xp[y] = (THIndex_t)maxindex;
+ }
+ }
+ }
+ else
+ {
+ /* number of batch frames */
+ long nbframe = input->size[0];
+ long i;
+
+ /* resize output */
+ THTensor_(resize3d)(output, nbframe, noframe, framesize);
+
+ /* indices will contain index locations for each output point */
+ THIndexTensor_(resize3d)(indices, nbframe, noframe, framesize);
+
+ /* get raw pointers */
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ for(i = 0; i < nbframe; i++)
+ {
+ real *inputSample_data = input_data + i*niframe*framesize;
+ real *outputSample_data = output_data + i*noframe*framesize;
+ THIndex_t *indicesSample_data = indices_data + i*noframe*framesize;
+
+ for(t = 0; t < noframe; t++)
+ {
+ real *ip = inputSample_data + t*framesize*dW;
+ real *op = outputSample_data + t*framesize;
+ THIndex_t *xp = indicesSample_data + t*framesize;
+
+#pragma omp parallel for private(y)
+ for(y = 0; y < framesize; y++)
+ {
+ /* compute local max: */
+ long maxindex = -1;
+ real maxval = -THInf;
+ long x;
+ for(x = 0; x < kW; x++)
+ {
+ real val = ip[x*framesize+y];
+ if (val > maxval)
+ {
+ maxval = val;
+ maxindex = x;
+ }
+ }
+
+ /* set output to local max */
+ op[y] = maxval;
+ xp[y] = (THIndex_t)maxindex;
+ }
+ }
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+
+}
+
+void THNN_(TemporalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW,
+ int dW)
+{
+ long niframe;
+ long noframe;
+ long framesize;
+
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ long t, y;
+
+ THNN_(TemporalMaxPooling_shapeCheck)(state, input, gradOutput, indices, kW, dW);
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize and zero */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ if (input->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+ /* sizes */
+ niframe = input->size[dimS];
+ noframe = gradOutput->size[dimS];
+ framesize = gradOutput->size[dimF];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ if (input->nDimension == 2)
+ {
+ for(t = 0; t < noframe; t++)
+ {
+ real *gip = gradInput_data + t*framesize*dW;
+ real *gop = gradOutput_data + t*framesize;
+ THIndex_t *xp = indices_data + t*framesize;
+#pragma omp parallel for private(y)
+ for(y = 0; y < framesize; y++)
+ {
+ /* compute local max: */
+ long maxindex = (long)xp[y];
+ if (maxindex != -1)
+ gip[maxindex*framesize+y] += gop[y];
+ }
+ }
+ }
+ else
+ {
+ /* number of batch frames */
+ long nbframe = input->size[0];
+ long i;
+
+ for(i = 0; i < nbframe; i++)
+ {
+ real *gradInputSample_data = gradInput_data + i*niframe*framesize;
+ real *gradOutputSample_data = gradOutput_data + i*noframe*framesize;
+ THIndex_t *indicesSample_data = indices_data + i*noframe*framesize;
+
+ for(t = 0; t < noframe; t++)
+ {
+ real *gip = gradInputSample_data + t*framesize*dW;
+ real *gop = gradOutputSample_data + t*framesize;
+ THIndex_t *xp = indicesSample_data + t*framesize;
+#pragma omp parallel for private(y)
+ for(y = 0; y < framesize; y++)
+ {
+ /* compute local max: */
+ long maxindex = (long)xp[y];
+ if (maxindex != -1)
+ gip[maxindex*framesize+y] += gop[y];
+ }
+ }
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/TemporalRowConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/TemporalRowConvolution.c
new file mode 100644
index 000000000..e3ae41e22
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/TemporalRowConvolution.c
@@ -0,0 +1,472 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TemporalRowConvolution.c"
+#else
+
+static inline void THNN_(TemporalRowConvolution_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *bias,
+ int kW,
+ int dW,
+ int padW) {
+
+ THArgCheck(kW > 0, 5,
+ "kernel size should be greater than zero, but got kW: %d", kW);
+ THArgCheck(dW > 0, 6,
+ "stride should be greater than zero, but got dW: %d", dW);
+ THNN_ARGCHECK(weight->nDimension == 3, 3, weight,
+ "3D weight tensor expected, but got: %s");
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ // we're always looking at (possibly batch) x feats x seq
+ int ndim = input->nDimension;
+ int dimF = 0;
+ int dimS = 1;
+
+ if (ndim == 3) {
+ ++dimS;
+ ++dimF;
+ }
+
+ THNN_ARGCHECK(ndim == 2 || ndim == 3, 1, input,
+ "2D or 3D (batch mode) input tensor expected, but got :%s");
+
+ long inputFrameSize = weight->size[0];
+ long nInputFrame = input->size[dimS];
+ long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
+
+ if (nOutputFrame < 1) {
+ THError("Given input size: (%d x %d). "
+ "Calculated output size: (%d x %d). Output size is too small",
+ inputFrameSize, nInputFrame, inputFrameSize, nOutputFrame);
+ }
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimF, inputFrameSize);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimF, inputFrameSize);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimS, nOutputFrame);
+ }
+}
+
+static void THNN_(unfolded_acc_row)(
+ THTensor *finput,
+ THTensor *input,
+ int kW,
+ int dW,
+ int padW,
+ long inputFrameSize,
+ long nInputFrame,
+ long nOutputFrame) {
+
+ size_t c;
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
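+ /* finput layout: [inputFrameSize][kW][nOutputFrame]; note that padW is unused in this unfold */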
+
+// #pragma omp parallel for private(c)
+ for (c = 0; c < inputFrameSize; c++) {
+ size_t kw, x;
+ long long ix = 0;
+
+ for (kw = 0; kw < kW; kw++) {
+ real *src = finput_data
+ + c * (kW * nOutputFrame)
+ + kw * (nOutputFrame);
+ real *dst = input_data + c * (nInputFrame);
+
+ ix = (long long)(kw);
+ if (dW == 1) {
+ real *dst_slice = dst + (size_t)(ix);
+ THVector_(cadd)(dst_slice, dst_slice, src, 1, nOutputFrame);
+ } else {
+ for (x = 0; x < nOutputFrame; x++) {
+ real *dst_slice = dst + (size_t)(ix + x * dW);
+ THVector_(cadd)(dst_slice, dst_slice,
+ src + (size_t)(x), 1, 1);
+ }
+ }
+ }
+ }
+}
+
+static void THNN_(unfolded_copy_row)(
+ THTensor *finput,
+ THTensor *input,
+ int kW,
+ int dW,
+ int padW,
+ long inputFrameSize,
+ long nInputFrame,
+ long nOutputFrame) {
+
+ long k;
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
+
+// #pragma omp parallel for private(k)
+ for (k = 0; k < inputFrameSize * kW; k++) {
+ size_t c = k / kW;
+ size_t kw = k % kW;
+ size_t x;
+ long long ix;
+ real *dst = finput_data + c * (kW * nOutputFrame) + kw * (nOutputFrame);
+ real *src = input_data + c * (nInputFrame);
+
+ ix = (long long)(kw);
+ if (dW == 1) {
+ memcpy(dst, src+(size_t)(ix), sizeof(real) * (nOutputFrame));
+ } else {
+ for (x = 0; x < nOutputFrame; x++) {
+ memcpy(dst + (size_t)(x), src + (size_t)(ix + x * dW),
+ sizeof(real) * 1);
+ }
+ }
+ }
+}
+
+static void THNN_(TemporalRowConvolution_updateOutput_frame)(
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ int kW,
+ int dW,
+ int padW,
+ long inputFrameSize,
+ long nInputFrame,
+ long nOutputFrame) {
+
+ long i;
+
+ THTensor *output3d = THTensor_(newWithStorage3d)(
+ output->storage, output->storageOffset,
+ inputFrameSize, -1,
+ 1, -1,
+ nOutputFrame, -1);
+
+ THNN_(unfolded_copy_row)(finput, input, kW, dW, padW,
+ inputFrameSize, nInputFrame, nOutputFrame);
+
+ THTensor_(zero)(output);
+
+ if (bias != NULL) {
+ for (i = 0; i < inputFrameSize; i++)
+ THVector_(fill)
+ (output->storage->data + output->storageOffset
+ + output->stride[0] * i,
+ THTensor_(get1d)(bias, i), nOutputFrame);
+ }
+
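+ // weight: inputFrameSize x 1 x kW
+ // finput: inputFrameSize x kW x nOutputFrame
+ // output3d: inputFrameSize x 1 x nOutputFrame (batched vector-matrix product per feature)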
+ THTensor_(baddbmm)(output3d, 1, output3d, 1, weight, finput);
+
+ THTensor_(free)(output3d);
+}
+
+void THNN_(TemporalRowConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput, // unused here but needed for CUDA
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst) {
+
+ int ndim = input->nDimension;
+
+ THTensor *tinput;
+ if (!featFirst) {
+ tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
+ input = THTensor_(newContiguous)(tinput);
+ } else {
+ input = THTensor_(newContiguous)(input);
+ }
+
+ THNN_(TemporalRowConvolution_shapeCheck)(
+ state, input, NULL, weight, bias, kW, dW, padW);
+
+ long inputFrameSize = weight->size[0];
+ long nInputFrame = input->size[ndim - 1];
+ long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
+
+ if (ndim == 2) { /* non-batch mode */
+
+ THTensor_(resize3d)(finput, inputFrameSize, kW, nOutputFrame);
+ THTensor_(resize2d)(output, inputFrameSize, nOutputFrame);
+
+ THTensor_(zero)(finput);
+ THTensor_(zero)(output);
+
+ THNN_(TemporalRowConvolution_updateOutput_frame)
+ (input, output, weight, bias, finput,
+ kW, dW, padW,
+ inputFrameSize, nInputFrame, nOutputFrame);
+
+ } else {
+ long T = input->size[0];
+ long t;
+
+ THTensor_(resize4d)(finput, T, inputFrameSize, kW, nOutputFrame);
+ THTensor_(resize3d)(output, T, inputFrameSize, nOutputFrame);
+
+ THTensor_(zero)(finput);
+ THTensor_(zero)(output);
+
+#pragma omp parallel for private(t)
+ for (t = 0; t < T; t++) {
+ THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+ THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(TemporalRowConvolution_updateOutput_frame)
+ (input_t, output_t, weight, bias, finput_t,
+ kW, dW, padW, inputFrameSize, nInputFrame, nOutputFrame);
+
+ THTensor_(free)(input_t);
+ THTensor_(free)(output_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ if (!featFirst) { // NOTE: output will NOT be contiguous in this case
+ THTensor_(transpose)(output, output, ndim - 1, ndim - 2);
+ THTensor_(free)(tinput);
+ }
+
+ THTensor_(free)(input);
+}
+
+static void THNN_(TemporalRowConvolution_updateGradInput_frame)(
+ THTensor *gradInput,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ long inputFrameSize,
+ long nInputFrame,
+ long nOutputFrame) {
+
+ THTensor *gradOutput3d = THTensor_(newWithStorage3d)(
+ gradOutput->storage, gradOutput->storageOffset,
+ inputFrameSize, -1,
+ 1, -1,
+ nOutputFrame, -1);
+
+ // weight: inputFrameSize x kW x 1
+ // gradOutput3d: inputFrameSize x 1 x nOutputFrame
+ THTensor_(baddbmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput3d);
+ // fgradInput: inputFrameSize x kW x nOutputFrame
+ THTensor_(free)(gradOutput3d);
+
+ THTensor_(zero)(gradInput);
+
+ THNN_(unfolded_acc_row)(fgradInput, gradInput,
+ kW, dW, padW,
+ inputFrameSize, nInputFrame, nOutputFrame);
+}
+
+void THNN_(TemporalRowConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst) {
+
+ int ndim = input->nDimension;
+
+ THTensor *tinput, *tgradOutput;
+
+ if (!featFirst) {
+ tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
+ tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2);
+
+ input = THTensor_(newContiguous)(tinput);
+ gradOutput = THTensor_(newContiguous)(tgradOutput);
+
+ } else {
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ }
+
+ THNN_(TemporalRowConvolution_shapeCheck)(state, input, gradOutput, weight,
+ NULL, kW, dW, padW);
+
+ long inputFrameSize = weight->size[0];
+ long nInputFrame = input->size[ndim - 1];
+ long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
+
+ THTensor_(resizeAs)(fgradInput, finput);
+ THTensor_(resizeAs)(gradInput, input);
+
+ THTensor_(zero)(fgradInput);
+ THTensor_(zero)(gradInput);
+
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 1, 2);
+
+ if (ndim == 2) {
+ THNN_(TemporalRowConvolution_updateGradInput_frame)
+ (gradInput, gradOutput, tweight, fgradInput,
+ kW, dW, padW,
+ inputFrameSize, nInputFrame, nOutputFrame);
+ } else {
+ long T = input->size[0];
+ long t;
+
+#pragma omp parallel for private(t)
+ for (t = 0; t < T; t++) {
+
+ THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+ THNN_(TemporalRowConvolution_updateGradInput_frame)
+ (gradInput_t, gradOutput_t, tweight, fgradInput_t,
+ kW, dW, padW,
+ inputFrameSize, nInputFrame, nOutputFrame);
+
+ THTensor_(free)(gradInput_t);
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(fgradInput_t);
+ }
+ }
+
+ THTensor_(free)(tweight);
+
+ if (!featFirst) { // NOTE: gradInput will NOT be contiguous in this case
+
+ THTensor_(free)(tinput);
+ THTensor_(free)(tgradOutput);
+
+ THTensor_(transpose)(gradInput, gradInput, ndim - 1, ndim - 2);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+
+}
+
+static void THNN_(TemporalRowConvolution_accGradParameters_frame)(
+ THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
+ THTensor *finput, real scale) {
+
+ long i;
+ THTensor *gradOutput3d = THTensor_(newWithStorage3d)(
+ gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ 1, -1,
+ gradOutput->size[1], -1);
+
+ THTensor *tfinput = THTensor_(new)();
+ THTensor_(transpose)(tfinput, finput, 1, 2);
+ // gradOutput3d: inputFrameSize x 1 x nOutputFrame
+ // tfinput (finput transposed): inputFrameSize x nOutputFrame x kW
+ THTensor_(baddbmm)(gradWeight, 1, gradWeight, scale, gradOutput3d, tfinput);
+ // gradWeight: inputFrameSize x 1 x kW
+ THTensor_(free)(tfinput);
+
+ if (gradBias != NULL) {
+ for (i = 0; i < gradBias->size[0]; i++) {
+ long k;
+ real sum = 0;
+ real *data = gradOutput3d->storage->data
+ + gradOutput3d->storageOffset
+ + i * gradOutput3d->stride[0];
+ for (k = 0; k < gradOutput3d->size[2]; k++) {
+ sum += data[k];
+ }
+ (gradBias->storage->data + gradBias->storageOffset)[i]
+ += scale * sum;
+ }
+ }
+
+ THTensor_(free)(gradOutput3d);
+
+}
+
+void THNN_(TemporalRowConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst,
+ accreal scale_) {
+
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ int ndim = input->nDimension;
+
+ THTensor *tinput, *tgradOutput;
+
+ if (!featFirst) {
+ tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
+ tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2);
+
+ input = THTensor_(newContiguous)(tinput);
+ gradOutput = THTensor_(newContiguous)(tgradOutput);
+ } else {
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ }
+
+ THNN_(TemporalRowConvolution_shapeCheck)
+ (state, input, gradOutput, gradWeight, gradBias, kW, dW, padW);
+
+ long inputFrameSize = gradWeight->size[0];
+ long nInputFrame = input->size[ndim - 1];
+ long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
+
+ if (ndim == 2) {
+ THNN_(TemporalRowConvolution_accGradParameters_frame)(
+ gradOutput, gradWeight, gradBias, finput, scale);
+ } else {
+ long T = input->size[0];
+ long t;
+
+ for (t = 0; t < T; t++) {
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(TemporalRowConvolution_accGradParameters_frame)(
+ gradOutput_t, gradWeight, gradBias, finput_t, scale);
+
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ if (!featFirst) {
+ THTensor_(free)(tinput);
+ THTensor_(free)(tgradOutput);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/TemporalSubSampling.c b/contrib/lua-torch/nn/lib/THNN/generic/TemporalSubSampling.c
new file mode 100644
index 000000000..68f35e28a
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/TemporalSubSampling.c
@@ -0,0 +1,156 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TemporalSubSampling.c"
+#else
+
+static inline void THNN_(TemporalSubSampling_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ int kW,
+ int dW,
+ int *inputFrameSize) {
+ int nInputFrame, nOutputFrame;
+
+ THArgCheck(kW > 0, 6,
+ "kernel size should be greater than zero, but got kW: %d", kW);
+ THArgCheck(dW > 0, 7,
+ "stride should be greater than zero, but got dW: %d", dW);
+
+ THNN_ARGCHECK(input->nDimension == 2, 2, input,
+ "2D or 3D (batch mode) tensor expected for input, but got: %s");
+ if (inputFrameSize != NULL) {
+ THArgCheck( input->size[1] == *inputFrameSize, 2,
+ "invalid input frame size. Got: %d, Expected: %d",
+ input->size[1], *inputFrameSize);
+ }
+ THArgCheck( input->size[0] >= kW, 2,
+ "input sequence smaller than kernel size. Got %d, Expected: %d",
+ input->size[0], kW);
+
+ nInputFrame = input->size[0];
+ nOutputFrame = (nInputFrame - kW) / dW + 1;
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, input->nDimension, 0, nOutputFrame);
+ if (inputFrameSize != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, input->nDimension, 1, *inputFrameSize);
+ }
+ }
+}
+
+void THNN_(TemporalSubSampling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW,
+ int dW,
+ int inputFrameSize)
+{
+ THTensor *outputFrame, *inputWindow;
+ int nInputFrame, nOutputFrame;
+ long k;
+
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THArgCheck(!bias || THTensor_(isContiguous)(bias), 4, "bias must be contiguous");
+ THNN_(TemporalSubSampling_shapeCheck)(state, input, NULL, kW, dW, &inputFrameSize);
+
+ outputFrame = THTensor_(new)();
+ inputWindow = THTensor_(new)();
+
+ nInputFrame = input->size[0];
+ nOutputFrame = (nInputFrame - kW) / dW + 1;
+
+ THTensor_(resize2d)(output,
+ nOutputFrame,
+ inputFrameSize);
+
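+ /* for each output frame k: output[k] = weight .* sum(input[k*dW .. k*dW+kW-1]) + bias */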
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THTensor_(narrow)(inputWindow, input, 0, k*dW, kW);
+ THTensor_(select)(outputFrame, output, 0, k);
+ THTensor_(sum)(outputFrame, inputWindow, 0, 1);
+ THTensor_(cmul)(outputFrame, outputFrame, weight);
+ THTensor_(cadd)(outputFrame, outputFrame, 1, bias);
+ }
+
+ THTensor_(free)(outputFrame);
+ THTensor_(free)(inputWindow);
+}
+
+void THNN_(TemporalSubSampling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW,
+ int dW)
+{
+
+ THTensor *gradOutputFrame;
+ THTensor *gradInputWindow, *buffer, *kwunit;
+ long k;
+
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THNN_(TemporalSubSampling_shapeCheck)(state, input, gradOutput, kW, dW, NULL);
+
+ gradOutputFrame = THTensor_(new)();
+ gradInputWindow = THTensor_(new)();
+ buffer = THTensor_(new)();
+ kwunit = THTensor_(newWithSize1d)(kW);
+
+ THTensor_(fill)(kwunit, 1);
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ for(k = 0; k < gradOutput->size[0]; k++)
+ {
+ THTensor_(narrow)(gradInputWindow, gradInput, 0, k*dW, kW);
+ THTensor_(select)(gradOutputFrame, gradOutput, 0, k);
+ THTensor_(cmul)(buffer, weight, gradOutputFrame);
+ THTensor_(addr)(gradInputWindow, 1, gradInputWindow, 1, kwunit, buffer);
+ }
+
+ THTensor_(free)(gradOutputFrame);
+ THTensor_(free)(gradInputWindow);
+ THTensor_(free)(buffer);
+ THTensor_(free)(kwunit);
+}
+
+void THNN_(TemporalSubSampling_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW,
+ int dW,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THTensor *gradOutputFrame;
+ THTensor *inputWindow, *buffer;
+ long k;
+
+ THNN_(TemporalSubSampling_shapeCheck)(state, input, gradOutput, kW, dW, NULL);
+ gradOutputFrame = THTensor_(new)();
+ inputWindow = THTensor_(new)();
+ buffer = THTensor_(new)();
+
+ for(k = 0; k < gradOutput->size[0]; k++)
+ {
+ THTensor_(narrow)(inputWindow, input, 0, k*dW, kW);
+ THTensor_(select)(gradOutputFrame, gradOutput, 0, k);
+ THTensor_(sum)(buffer, inputWindow, 0, 1);
+ THTensor_(addcmul)(gradWeight, gradWeight, scale, buffer, gradOutputFrame);
+ THTensor_(cadd)(gradBias, gradBias, scale, gradOutputFrame);
+ }
+
+ THTensor_(free)(gradOutputFrame);
+ THTensor_(free)(inputWindow);
+ THTensor_(free)(buffer);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Threshold.c b/contrib/lua-torch/nn/lib/THNN/generic/Threshold.c
new file mode 100644
index 000000000..949c7a07c
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Threshold.c
@@ -0,0 +1,64 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Threshold.c"
+#else
+
+void THNN_(Threshold_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal threshold_,
+ accreal val_,
+ bool inplace)
+{
+ real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
+ real val = TH_CONVERT_ACCREAL_TO_REAL(val_);
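+ /* note: ReLU is the special case threshold = 0, val = 0 */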
+ if (inplace)
+ {
+ TH_TENSOR_APPLY(real, input,
+ if (*input_data <= threshold)
+ *input_data = val;
+ );
+ THTensor_(set)(output, input);
+ }
+ else
+ {
+ THTensor_(resizeAs)(output, input);
+ TH_TENSOR_APPLY2(real, output, real, input,
+ *output_data = (*input_data > threshold) ? *input_data : val;
+ );
+ }
+}
+
+void THNN_(Threshold_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal threshold_,
+ accreal val_,
+ bool inplace)
+{
+ real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
+ real val = TH_CONVERT_ACCREAL_TO_REAL(val_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ if (inplace)
+ {
+ TH_TENSOR_APPLY2(real, gradOutput, real, input,
+ if ((*input_data) <= threshold)
+ *gradOutput_data = 0;
+ );
+ THTensor_(set)(gradInput, gradOutput);
+ }
+ else
+ {
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ if ((*input_data) > threshold)
+ *gradInput_data = *gradOutput_data;
+ else
+ *gradInput_data = 0;
+ );
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricAveragePooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricAveragePooling.c
new file mode 100644
index 000000000..91c870e6f
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricAveragePooling.c
@@ -0,0 +1,373 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricAveragePooling.c"
+#else
+
+static inline void THNN_(VolumetricAveragePooling_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH) {
+ long nslices;
+ long itime;
+ long iheight;
+ long iwidth;
+ long otime;
+ long oheight;
+ long owidth;
+ int ndim = input->nDimension;
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ THArgCheck(kT > 0 && kW > 0 && kH > 0, 5,
+ "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d",
+ kT, kH, kW);
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 8,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d",
+ dT, dH, dW);
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH
+ && input->size[dimt] >= kT, 2,
+ "input image (T: %d H: %d W: %d) smaller than "
+ "kernel size (kT: %d kH: %d kW: %d)",
+ input->size[dimt], input->size[dimh], input->size[dimw],
+ kT, kH, kW);
+
+ /* sizes */
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ otime = (itime - kT) / dT + 1;
+ oheight = (iheight - kH) / dH + 1;
+ owidth = (iwidth - kW) / dW + 1;
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth);
+ }
+}
+
+static void THNN_(VolumetricAveragePooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ long nslices,
+ long itime,
+ long iwidth,
+ long iheight,
+ long otime,
+ long owidth,
+ long oheight,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j, ti;
+ for (ti = 0; ti < otime; ti++)
+ {
+ for (i = 0; i < oheight; i++)
+ {
+ for (j = 0; j < owidth; j++)
+ {
+ /* local pointers */
+ real *ip = input_p + k * itime * iwidth * iheight
+ + ti * iwidth * iheight * dT + i * iwidth * dH + j * dW;
+ real *op = output_p + k * otime * owidth * oheight
+ + ti * owidth * oheight + i * owidth + j;
+
+ /* compute local sum: */
+ real sum = 0.0;
+ int x, y, z;
+
+ for (z=0; z < kT; z++)
+ {
+ for (y = 0; y < kH; y++)
+ {
+ for (x = 0; x < kW; x++)
+ {
+ sum += *(ip + z * iwidth * iheight + y * iwidth + x);
+ }
+ }
+ }
+
+ /* set output to local average */
+ *op = sum / (kT * kW * kH);
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH)
+{
+ long nslices;
+ long itime;
+ long iheight;
+ long iwidth;
+ long otime;
+ long oheight;
+ long owidth;
+ real *input_data;
+ real *output_data;
+
+ THNN_(VolumetricAveragePooling_shapeCheck)(
+ state, input, NULL, kT, kW, kH,
+ dT, dW, dH);
+
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ otime = (itime - kT) / dT + 1;
+ oheight = (iheight - kH) / dH + 1;
+ owidth = (iwidth - kW) / dW + 1;
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ if (input->nDimension == 4) /* non-batch mode */
+ {
+ /* resize output */
+ THTensor_(resize4d)(output, nslices, otime, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+ THNN_(VolumetricAveragePooling_updateOutput_frame)(
+ input_data, output_data, nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH
+ );
+ }
+ else /* batch mode */
+ {
+ long p;
+ long nBatch = input->size[0];
+
+ long istride = nslices * itime * iwidth * iheight;
+ long ostride = nslices * otime * owidth * oheight;
+
+ /* resize output */
+ THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+ for (p=0; p < nBatch; p++)
+ {
+ THNN_(VolumetricAveragePooling_updateOutput_frame)(
+ input_data + p * istride, output_data + p * ostride, nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(VolumetricAveragePooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ long nslices,
+ long itime,
+ long iwidth,
+ long iheight,
+ long otime,
+ long owidth,
+ long oheight,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j, ti;
+ for (ti = 0; ti < otime; ti++)
+ {
+ for (i = 0; i < oheight; i++)
+ {
+ for (j = 0; j < owidth; j++)
+ {
+ /* local pointers */
+ real *ip = gradInput_p + k * itime * iwidth * iheight
+ + ti * iwidth * iheight * dT + i * iwidth * dH + j * dW;
+ real *op = gradOutput_p + k * otime * owidth * oheight
+ + ti * owidth * oheight + i * owidth + j;
+
+ /* scatter gradients out to footprint: */
+ real val = *op / (kT * kW * kH);
+ int x,y,z;
+ for (z=0; z < kT; z++)
+ {
+ for (y = 0; y < kH; y++)
+ {
+ for (x = 0; x < kW; x++)
+ {
+ *(ip + z * iwidth * iheight + y * iwidth + x) += val;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH)
+{
+ long nslices;
+ long itime;
+ long iheight;
+ long iwidth;
+ long otime;
+ long oheight;
+ long owidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ THNN_(VolumetricAveragePooling_shapeCheck)(
+ state, input, gradOutput, kT, kW, kH,
+ dT, dW, dH);
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ otime = gradOutput->size[dimt];
+ oheight = gradOutput->size[dimh];
+ owidth = gradOutput->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
+ /* backprop */
+ if (input->nDimension == 4) /* non-batch mode*/
+ {
+ THNN_(VolumetricAveragePooling_updateGradInput_frame)(
+ gradInput_data, gradOutput_data, nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH
+ );
+ }
+ else /* batch mode */
+ {
+ long p;
+ long nBatch = input->size[0];
+
+ long istride = nslices * itime * iwidth * iheight;
+ long ostride = nslices * otime * owidth * oheight;
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nBatch; p++)
+ {
+ THNN_(VolumetricAveragePooling_updateGradInput_frame)(
+ gradInput_data + p * istride, gradOutput_data + p * ostride, nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolution.c
new file mode 100644
index 000000000..be1aa82e6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolution.c
@@ -0,0 +1,260 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricConvolution.c"
+#else
+
+void THNN_(VolumetricConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput, // only used by cuda impl
+ THTensor *fgradInput, // only used by cuda impl
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
+
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (input->nDimension == 5)
+ {
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ long nOutputPlane = weight->size[0];
+ long kT = weight->size[2];
+ long kH = weight->size[3];
+ long kW = weight->size[4];
+ long inputDepth = input->size[dimt];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long outputDepth = (inputDepth - kT) / dT + 1;
+ long outputWidth = (inputWidth - kW) / dW + 1;
+ long outputHeight = (inputHeight - kH) / dH + 1;
+ THTensor *outn = THTensor_(new)();
+ long i, j;
+ if (input->nDimension == 4) /* non-batch mode */
+ {
+ THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+ /* add bias */
+ if (bias) {
+ for (i = 0; i < bias->size[0]; i++)
+ {
+ THTensor_(select)(outn, output, 0, i);
+ THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
+ }
+ } else {
+ THTensor_(zero)(output);
+ }
+
+ /* do convolutions */
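+ /* conv3Dmv flags: "V" = valid convolution (no padding), "X" = cross-correlation */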
+ THTensor_(conv3Dmv)(output, 1.0, 1.0, input, weight, dT, dH, dW, "V", "X");
+ }
+ else /* batch mode */
+ {
+ long nBatch = input->size[0];
+ THTensor_(resize5d)(output, nBatch, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor *inb = THTensor_(new)();
+ THTensor *outb = THTensor_(new)();
+
+ /* loop over batches */
+ for (j = 0; j < nBatch; j++)
+ {
+ THTensor_(select)(inb, input, 0, j);
+ THTensor_(select)(outb, output, 0, j);
+
+ /* add bias */
+ if (bias) {
+ for (i = 0; i < bias->size[0]; i++)
+ {
+ THTensor_(select)(outn, outb, 0, i);
+ THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
+ }
+ } else {
+ THTensor_(zero)(outb);
+ }
+
+ /* do convolutions */
+ THTensor_(conv3Dmv)(outb, 1.0, 1.0, inb, weight, dT, dH, dW, "V", "X");
+ }
+
+ THTensor_(free)(inb);
+ THTensor_(free)(outb);
+ }
+ THTensor_(free)(outn);
+}
+
+void THNN_(VolumetricConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput, // only used by cuda impl
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
+
+ THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for weight, but got: %s");
+
+ int nOutputPlane = (int)weight->size[0];
+
+ THNN_ARGCHECK(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3,
+ gradOutput,
+ "4D or 5D (batch mode) tensor expected for gradOutput, but got: %s");
+
+ int dimPlane = 0;
+ if (gradOutput->nDimension == 5)
+ {
+ dimPlane++;
+ }
+
+ THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1,
+ "Number of output features is not equal to nOutputPlane"
+ );
+
+ /* gradient to input */
+ THTensor *tweight = THTensor_(newTranspose)(weight, 0, 1);
+ if (gradOutput->nDimension == 4) /* non-batch mode */
+ {
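+ /* "F" = full convolution, "C" = true convolution (kernel flipped): the adjoint of the forward cross-correlation */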
+ THTensor_(conv3Dmv)(gradInput, 0.0, 1.0, gradOutput, tweight, dT, dH, dW, "F", "C");
+ }
+ else /* batch mode */
+ {
+ long nBatch = gradOutput->size[0];
+ THTensor *ginpb = THTensor_(new)();
+ THTensor *goutb = THTensor_(new)();
+ long j;
+
+ THTensor_(resize5d)(gradInput,
+ input->size[0], input->size[1], input->size[2], input->size[3], input->size[4]
+ );
+
+ /* loop over batches */
+ for (j = 0; j < nBatch; j++)
+ {
+ THTensor_(select)(ginpb, gradInput, 0, j);
+ THTensor_(select)(goutb, gradOutput, 0, j);
+ THTensor_(conv3Dmv)(ginpb, 0.0, 1.0, goutb, tweight, dT, dH, dW, "F", "C");
+ }
+ THTensor_(free)(ginpb);
+ THTensor_(free)(goutb);
+ }
+
+ THTensor_(free)(tweight);
+}
+
+void THNN_(VolumetricConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput, // only used by cuda impl
+ THTensor *fgradInput, // only used by cuda impl
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
+
+ THNN_ARGCHECK(gradWeight->nDimension == 5, 4, gradWeight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for gradWeight, but got: %s");
+
+ int nOutputPlane = (int)gradWeight->size[0];
+ if (gradBias) {
+ THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
+ "gradBias tensor has wrong size"
+ );
+ }
+
+ long k;
+ real *gradBias_data;
+ THTensor *gradOutSlice;
+ int dimPlane = 0;
+ if (gradOutput->nDimension == 5)
+ {
+ dimPlane++;
+ }
+
+ THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1,
+ "Number of output features is not equal to nOutputPlane"
+ );
+
+ if (gradOutput->nDimension == 4) /* non-batch mode */
+ {
+ /* gradient to bias */
+ if (gradBias) {
+ gradBias_data = THTensor_(data)(gradBias);
+ gradOutSlice = THTensor_(new)();
+ for (k = 0; k < nOutputPlane; k++)
+ {
+ THTensor_(select)(gradOutSlice, gradOutput, 0, k);
+ gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
+ }
+ THTensor_(free)(gradOutSlice);
+ }
+
+ /* gradient to kernels */
+ THTensor_(conv3DRevger)(gradWeight, 1.0, scale, input, gradOutput, dT, dH, dW);
+ }
+ else /* batch mode */
+ {
+ long nBatch = gradOutput->size[0];
+ THTensor *inpb = THTensor_(new)();
+ THTensor *goutb = THTensor_(new)();
+ long j;
+
+ /* loop over batches */
+ for (j = 0; j < nBatch; j++)
+ {
+ THTensor_(select)(inpb, input, 0, j);
+ THTensor_(select)(goutb, gradOutput, 0, j);
+
+ /* gradient to bias */
+ if (gradBias) {
+ gradBias_data = THTensor_(data)(gradBias);
+ gradOutSlice = THTensor_(new)();
+ for (k = 0; k < nOutputPlane; k++)
+ {
+ THTensor_(select)(gradOutSlice, goutb, 0, k);
+ gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
+ }
+ THTensor_(free)(gradOutSlice);
+ }
+
+ /* gradient to kernels */
+ THTensor_(conv3DRevger)(gradWeight, 1.0, scale, inpb, goutb, dT, dH, dW);
+ }
+ THTensor_(free)(inpb);
+ THTensor_(free)(goutb);
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolutionMM.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolutionMM.c
new file mode 100644
index 000000000..00a121db6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolutionMM.c
@@ -0,0 +1,628 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricConvolutionMM.c"
+#else
+
+static void inline THNN_(VolumetricConvolutionMM_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *bias,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH) {
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+ THArgCheck(kT > 0 && kW > 0 && kH > 0, 8,
+ "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW);
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW);
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (ndim == 5)
+ {
+ dimf++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ long nInputPlane;
+ long inputDepth;
+ long inputHeight;
+ long inputWidth;
+ long nOutputPlane;
+ long outputDepth;
+ long outputHeight;
+ long outputWidth;
+
+ nInputPlane = input->size[dimf];
+ inputDepth = input->size[dimt];
+ inputHeight = input->size[dimh];
+ inputWidth = input->size[dimw];
+ nOutputPlane = weight->size[0];
+ outputDepth = (inputDepth + 2*pT - kT) / dT + 1;
+ outputHeight = (inputHeight + 2*pH - kH) / dH + 1;
+ outputWidth = (inputWidth + 2*pW - kW) / dW + 1;
+
+ if (outputWidth < 1 || outputHeight < 1 || outputDepth < 1)
+ {
+ THError(
+ "Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+ nInputPlane, inputDepth, inputHeight, inputWidth,
+ nOutputPlane, outputDepth, outputHeight, outputWidth
+ );
+ }
+
+ THArgCheck(weight->nDimension == 2 || weight->nDimension == 5, 4,
+ "weight tensor should be 2D or 5D - got %d", weight->nDimension);
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
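+/* views a 5D weight tensor (nOutputPlane x nInputPlane x kT x kH x kW) as a 2D
+ (nOutputPlane x nInputPlane*kT*kH*kW) matrix; returns 1 if a new view was
+ allocated, in which case the caller is responsible for freeing it */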
+static int THNN_(view_weight)(THTensor **_weight)
+{
+ THTensor *weight = *_weight;
+ if (weight->nDimension == 5) {
+ long s1 = weight->size[0];
+ long s2 = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
+ *_weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, s1, -1, s2, -1);
+ return 1;
+ }
+ return 0;
+}
+
+/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */
+static void THNN_(unfolded_acc_vol)(
+ THTensor *finput,
+ THTensor *input,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int nInputPlane,
+ int inputDepth,
+ int inputWidth,
+ int inputHeight,
+ int outputDepth,
+ int outputWidth,
+ int outputHeight)
+{
+ int nip;
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
+
+//#pragma omp parallel for private(nip)
+ for (nip = 0; nip < nInputPlane; nip++)
+ {
+ int kt, kw, kh, t, y, x, it, ix, iy;
+ for (kt = 0; kt < kT; kt++)
+ {
+ for (kh = 0; kh < kH; kh++)
+ {
+ for (kw = 0; kw < kW; kw++)
+ {
+ real *src = finput_data
+ + nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth)
+ + kt * (kH*kW*outputDepth*outputHeight*outputWidth)
+ + kh * (kW*outputDepth*outputHeight*outputWidth)
+ + kw * (outputDepth*outputHeight*outputWidth);
+
+ real *dst = input_data + nip*(inputDepth*inputHeight*inputWidth);
+ if (pT > 0 || pH > 0 || pW > 0)
+ {
+ for (t = 0; t < outputDepth; t++)
+ {
+ it = t*dT - pT + kt;
+ for (y = 0; y < outputHeight; y++)
+ {
+ iy = y*dH - pH + kh;
+ for (x = 0; x < outputWidth; x++)
+ {
+ ix = x*dW - pW + kw;
+ if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth)
+ {
+ }
+ else
+ {
+ real *dst_slice = dst+it*inputHeight*inputWidth+iy*inputWidth+ix;
+ THVector_(cadd)(dst_slice, dst_slice, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ for (t = 0; t < outputDepth; t++)
+ {
+ it = t*dT + kt;
+ for (y = 0; y < outputHeight; y++)
+ {
+ iy = y*dH + kh;
+ for(x = 0; x < outputWidth; x++)
+ {
+ ix = x*dW + kw;
+ real *dst_slice = dst+it*inputHeight*inputWidth+iy*inputWidth+ix;
+ THVector_(cadd)(dst_slice, dst_slice, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static void THNN_(unfolded_copy_vol)(
+ THTensor *finput,
+ THTensor *input,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int nInputPlane,
+ int inputDepth,
+ int inputWidth,
+ int inputHeight,
+ int outputDepth,
+ int outputWidth,
+ int outputHeight)
+{
+ long k;
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
+// #pragma omp parallel for private(k)
+ for (k = 0; k < nInputPlane*kT*kH*kW; k++)
+ {
+ int nip = k / (kT*kH*kW);
+ int rest = k % (kT*kH*kW);
+ int kt = rest / (kH*kW);
+ rest = rest % (kH*kW);
+ int kh = rest / kW;
+ int kw = rest % kW;
+ int t,x,y,it,ix,iy;
+ real *dst = finput_data
+ + nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth)
+ + kt * (kH*kW*outputDepth*outputHeight*outputWidth)
+ + kh * (kW*outputDepth*outputHeight*outputWidth)
+ + kw * (outputDepth*outputHeight*outputWidth);
+ real *src = input_data + nip*(inputDepth*inputHeight*inputWidth);
+
+ if (pT > 0 || pH > 0 || pW > 0)
+ {
+ for (t = 0; t < outputDepth; t++)
+ {
+ it = t*dT - pT + kt;
+ for (y = 0; y < outputHeight; y++)
+ {
+ iy = y*dH - pH + kh;
+ for (x = 0; x < outputWidth; x++)
+ {
+ ix = x*dW - pW + kw;
+ if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth)
+ memset(dst+t*outputHeight*outputWidth+y*outputWidth+x, 0, sizeof(real)*(1));
+ else
+ memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1));
+ }
+ }
+ }
+ }
+ else
+ {
+ for (t = 0; t < outputDepth; t++)
+ {
+ it = t*dT + kt;
+ for (y = 0; y < outputHeight; y++)
+ {
+ iy = y*dH + kh;
+ for(x = 0; x < outputWidth; x++)
+ {
+ ix = x*dW + kw;
+ memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1));
+ }
+ }
+ }
+ }
+ }
+}
+
+static void THNN_(VolumetricConvolutionMM_updateOutput_frame)(
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ long nInputPlane,
+ long inputDepth,
+ long inputWidth,
+ long inputHeight,
+ long nOutputPlane,
+ long outputDepth,
+ long outputWidth,
+ long outputHeight)
+{
+ long i;
+ THTensor *output2d;
+
+ THNN_(unfolded_copy_vol)(
+ finput, input,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ nInputPlane,
+ inputDepth, inputWidth, inputHeight,
+ outputDepth, outputWidth, outputHeight
+ );
+
+ output2d = THTensor_(newWithStorage2d)(
+ output->storage, output->storageOffset, nOutputPlane, -1,
+ outputDepth*outputHeight*outputWidth, -1
+ );
+
+ if (bias) {
+ for (i = 0; i < nOutputPlane; i++)
+ {
+ THVector_(fill)(
+ output->storage->data+output->storageOffset+output->stride[0]*i,
+ THTensor_(get1d)(bias, i),
+ outputDepth*outputHeight*outputWidth
+ );
+ }
+ } else {
+ THTensor_(zero)(output);
+ }
+
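+ // weight: nOutputPlane x (nInputPlane*kT*kH*kW)
+ // finput: (nInputPlane*kT*kH*kW) x (outputDepth*outputHeight*outputWidth)
+ // output2d: nOutputPlane x (outputDepth*outputHeight*outputWidth)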
+ THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);
+
+ THTensor_(free)(output2d);
+}
+
+void THNN_(VolumetricConvolutionMM_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int dimf = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+ int freeWeight = 0;
+
+ long nInputPlane;
+ long inputDepth;
+ long inputHeight;
+ long inputWidth;
+ long nOutputPlane;
+ long outputDepth;
+ long outputHeight;
+ long outputWidth;
+
+ THNN_(VolumetricConvolutionMM_shapeCheck)(
+ state, input, NULL, weight, bias,
+ kT, kW, kH, dT, dW, dH, pT, pW, pH);
+ input = THTensor_(newContiguous)(input);
+
+ if (input->nDimension == 5)
+ {
+ dimf++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ nInputPlane = input->size[dimf];
+ inputDepth = input->size[dimt];
+ inputHeight = input->size[dimh];
+ inputWidth = input->size[dimw];
+ nOutputPlane = weight->size[0];
+ outputDepth = (inputDepth + 2*pT - kT) / dT + 1;
+ outputHeight = (inputHeight + 2*pH - kH) / dH + 1;
+ outputWidth = (inputWidth + 2*pW - kW) / dW + 1;
+
+ freeWeight = THNN_(view_weight)(&weight);
+
+ if (input->nDimension == 4)
+ {
+ THTensor_(resize2d)(finput, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
+ THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+ THNN_(VolumetricConvolutionMM_updateOutput_frame)(
+ input, output, weight, bias, finput,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ nInputPlane, inputDepth, inputWidth, inputHeight,
+ nOutputPlane, outputDepth, outputWidth, outputHeight
+ );
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+ THTensor_(resize3d)(finput, T, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
+ THTensor_(resize5d)(output, T, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+// #pragma omp parallel for private(t)
+ for (t = 0; t < T; t++)
+ {
+ THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+ THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(VolumetricConvolutionMM_updateOutput_frame)(
+ input_t, output_t, weight, bias, finput_t,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ nInputPlane, inputDepth, inputWidth, inputHeight,
+ nOutputPlane, outputDepth, outputWidth, outputHeight
+ );
+
+ THTensor_(free)(input_t);
+ THTensor_(free)(output_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ if (freeWeight)
+ THTensor_(free)(weight);
+}
+
+static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
+ THTensor *gradInput,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *fgradInput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)(
+ gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
+ );
+
+ THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
+ THTensor_(free)(gradOutput2d);
+
+ THTensor_(zero)(gradInput);
+
+ THNN_(unfolded_acc_vol)(
+ fgradInput, gradInput,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ gradInput->size[0], gradInput->size[1], gradInput->size[3], gradInput->size[2],
+ gradOutput->size[1], gradOutput->size[3], gradOutput->size[2]
+ );
+}
+
+void THNN_(VolumetricConvolutionMM_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int nOutputPlane = (int)weight->size[0];
+
+ THNN_(VolumetricConvolutionMM_shapeCheck)(
+ state, input, gradOutput, weight, NULL,
+ kT, kW, kH, dT, dW, dH, pT, pW, pH);
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ int freeWeight = THNN_(view_weight)(&weight);
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(resizeAs)(fgradInput, finput);
+ // depending on the BLAS library, fgradInput (result tensor) might
+ // be left uninitialized on zero alpha, which might lead to weird behavior
+ // hence, to be safe, zero it
+ THTensor_(zero)(fgradInput);
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 0, 1);
+
+ if (input->nDimension == 4)
+ {
+ THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
+ gradInput, gradOutput, tweight, fgradInput,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH
+ );
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+//#pragma omp parallel for private(t)
+ for (t = 0; t < T; t++)
+ {
+ THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+ THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
+ gradInput_t, gradOutput_t, tweight, fgradInput_t,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH
+ );
+
+ THTensor_(free)(gradInput_t);
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(fgradInput_t);
+ }
+ }
+
+ THTensor_(free)(tweight);
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ if (freeWeight)
+ THTensor_(free)(weight);
+}
+
+static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)(
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ real scale)
+{
+ long i;
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)(
+ gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
+ );
+
+ THTensor *tfinput = THTensor_(new)();
+ THTensor_(transpose)(tfinput, finput, 0, 1);
+ THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, tfinput);
+ THTensor_(free)(tfinput);
+
+ if (gradBias) {
+ for (i = 0; i < gradBias->size[0]; i++)
+ {
+ long k;
+ real sum = 0;
+ real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
+ for (k = 0; k < gradOutput2d->size[1]; k++)
+ sum += data[k];
+
+ (gradBias->storage->data + gradBias->storageOffset)[i] += scale * sum;
+ }
+ }
+
+ THTensor_(free)(gradOutput2d);
+}
+
+void THNN_(VolumetricConvolutionMM_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ int freeWeight;
+ int nOutputPlane = (int)gradWeight->size[0];
+
+ THNN_(VolumetricConvolutionMM_shapeCheck)(
+ state, input, gradOutput, gradWeight, gradBias,
+ kT, kW, kH, dT, dW, dH, pT, pW, pH);
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ freeWeight = THNN_(view_weight)(&gradWeight);
+
+ if (input->nDimension == 4) // non-batch mode
+ {
+ THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale);
+ }
+ else // batch mode
+ {
+ long T = input->size[0];
+ long t;
+
+ for (t = 0; t < T; t++)
+ {
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale);
+
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ if (freeWeight)
+ THTensor_(free)(gradWeight);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedConvolution.c
new file mode 100644
index 000000000..ca740f78e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedConvolution.c
@@ -0,0 +1,420 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricDilatedConvolution.c"
+#else
+
+static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kT, int kH, int kW, int dT, int dH, int dW,
+ int padT, int padH, int padW,
+ int dilationT, int dilationH, int dilationW) {
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+ THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for weight, but got: %s");
+ THArgCheck(kT > 0 && kW > 0 && kH > 0, 8,
+ "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW);
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW);
+ THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 15,
+ "dilation should be greater than zero, but got dilationT: %d, dilationH: %d, dilationW: %d",
+ dilationT, dilationH, dilationW);
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ // Params
+ int ndim = input->nDimension;
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+ int dimf = 0;
+ int dimd = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (ndim == 5) {
+ dimf++;
+ dimd++;
+ dimh++;
+ dimw++;
+ }
+
+ long inputDepth = input->size[dimd];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
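+ // effective kernel extent with dilation is dilation*(k-1)+1 along each dimension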
+
+ if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+ nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+void THNN_(VolumetricDilatedConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *columns,
+ THTensor *ones,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH)
+{
+ THNN_(VolumetricDilatedConvolution_shapeCheck)(
+ input, NULL, weight, bias,
+ kT, kH, kW, dT, dH, dW, padT, padH, padW,
+ dilationT, dilationH, dilationW);
+
+ // Params:
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ bias = bias ? THTensor_(newContiguous)(bias) : bias;
+ int batch = 1;
+ if (input->nDimension == 4) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ }
+
+ long inputDepth = input->size[2];
+ long inputHeight = input->size[3];
+ long inputWidth = input->size[4];
+ long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(zero)(output);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+
+ // Define a buffer of ones, for bias accumulation
+ // Note: this buffer can be shared with other modules, it only ever gets increased,
+ // and always contains ones.
+ if (ones->nDimension != 3 ||
+ ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *output_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(output_n, output, 0, elt);
+
+ // Do Bias first:
+ // M,N,K are dims of matrix A and B
+ long m_ = nOutputPlane;
+ long n_ = outputDepth * outputHeight * outputWidth;
+ long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ if (bias) {
+ THBlas_(gemm)(
+ 't', 'n',
+ n_, m_, k_,
+ 1,
+ THTensor_(data)(ones), k_,
+ THTensor_(data)(bias), k_,
+ 0,
+ THTensor_(data)(output_n), n_
+ );
+ } else {
+ THTensor_(zero)(output_n);
+ }
+
+ // Extract columns:
+ THNN_(vol2col)(
+ THTensor_(data)(input_n),
+ nInputPlane, inputDepth, inputHeight, inputWidth,
+ kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ dilationT, dilationH, dilationW,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ long m = nOutputPlane;
+ long n = columns->size[1];
+ long k = nInputPlane*kT*kH*kW;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 'n', 'n',
+ n, m, k,
+ 1,
+ THTensor_(data)(columns), n,
+ THTensor_(data)(weight), k,
+ 1,
+ THTensor_(data)(output_n), n
+ );
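+
+ // In row-major terms the call above computes output_n += weight * columns,
+ // with weight (nOutputPlane x nInputPlane*kT*kH*kW) and columns
+ // (nInputPlane*kT*kH*kW x outputDepth*outputHeight*outputWidth); swapping
+ // m and n compensates for gemm's column-major convention.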
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(output_n);
+
+ // Resize output
+ if (batch == 0) {
+ THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+ if (bias) THTensor_(free)(bias);
+}
+
+void THNN_(VolumetricDilatedConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH)
+{
+ THNN_(VolumetricDilatedConvolution_shapeCheck)(
+ input, gradOutput, weight, NULL,
+ kT, kH, kW, dT, dH, dW, padT, padH, padW,
+ dilationT, dilationH, dilationW);
+
+ // Params
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ weight = THTensor_(newContiguous)(weight);
+
+ int batch = 1;
+ if (input->nDimension == 4) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputDepth = input->size[2];
+ long inputWidth = input->size[4];
+ long inputHeight = input->size[3];
+ long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+ THTensor_(zero)(gradColumns);
+
+ // Helpers
+ THTensor *gradInput_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per sample:
+ THTensor_(select)(gradInput_n, gradInput, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ long m = nInputPlane*kT*kW*kH;
+ long n = gradColumns->size[1];
+ long k = nOutputPlane;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 'n', 't',
+ n, m, k,
+ 1,
+ THTensor_(data)(gradOutput_n), n,
+ THTensor_(data)(weight), m,
+ 0,
+ THTensor_(data)(gradColumns), n
+ );
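+
+ // Row-major view: gradColumns = weight^T * gradOutput_n, projecting each
+ // output gradient back onto the kernel taps; col2vol below then
+ // scatter-adds the column matrix into gradInput_n.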
+
+ // Unpack columns back into input:
+ THNN_(col2vol)(
+ THTensor_(data)(gradColumns),
+ nInputPlane, inputDepth, inputHeight, inputWidth,
+ kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ dilationT, dilationH, dilationW,
+ THTensor_(data)(gradInput_n)
+ );
+ }
+
+ // Free
+ THTensor_(free)(gradInput_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize output
+ if (batch == 0) {
+ THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+void THNN_(VolumetricDilatedConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *columns,
+ THTensor *ones,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THNN_(VolumetricDilatedConvolution_shapeCheck)(
+ input, gradOutput, gradWeight, gradBias,
+ kT, kH, kW, dT, dH, dW, padT, padH, padW,
+ dilationT, dilationH, dilationW);
+
+ // Params
+ int nInputPlane = gradWeight->size[1];
+ int nOutputPlane = gradWeight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ int batch = 1;
+ if (input->nDimension == 4) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputDepth = input->size[2];
+ long inputWidth = input->size[4];
+ long inputHeight = input->size[3];
+ long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(vol2col)(
+ THTensor_(data)(input_n),
+ nInputPlane, inputDepth, inputHeight, inputWidth,
+ kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ dilationT, dilationH, dilationW,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ long m = nOutputPlane;
+ long n = nInputPlane*kT*kW*kH;
+ long k = columns->size[1];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 't', 'n',
+ n, m, k,
+ scale,
+ THTensor_(data)(columns), k,
+ THTensor_(data)(gradOutput_n), k,
+ 1,
+ THTensor_(data)(gradWeight), n
+ );
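+
+ // Row-major view: gradWeight += scale * gradOutput_n * columns^T, the outer
+ // product of output-plane gradients with the unrolled input patches,
+ // accumulated over all output positions of this sample.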
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ long m_ = nOutputPlane;
+ long k_ = outputDepth * outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+ if (gradBias) {
+ THBlas_(gemv)(
+ 't',
+ k_, m_,
+ scale,
+ THTensor_(data)(gradOutput_n), k_,
+ THTensor_(data)(ones), 1,
+ 1,
+ THTensor_(data)(gradBias), 1
+ );
+ }
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize
+ if (batch == 0) {
+ THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedMaxPooling.c
new file mode 100644
index 000000000..66c0f9531
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedMaxPooling.c
@@ -0,0 +1,515 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricDilatedMaxPooling.c"
+#else
+
+static inline void THNN_(VolumetricDilatedMaxPooling_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ int dilationT, int dilationW, int dilationH,
+ bool ceilMode) {
+ int ndim = input->nDimension;
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+ long nslices;
+ long itime;
+ long iheight;
+ long iwidth;
+ long otime;
+ long oheight;
+ long owidth;
+
+ THArgCheck(kT > 0 && kW > 0 && kH > 0, 5,
+ "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d",
+ kT, kH, kW);
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 8,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d",
+ dT, dH, dW);
+ THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 14,
+ "dilation should be greater than 0, but got dilationT: %d dilationH: %d dilationW: %d",
+ dilationT, dilationH, dilationW);
+
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ THArgCheck(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, 2,
+ "pad should be smaller than half of kernel size, but got "
+ "kT: %d kW: %d, kH: %d, padT: %d, padW: %d, padH: %d",
+ kT, kW, kH, pT, pW, pH);
+
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ if (ceilMode)
+ {
+ otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+ oheight = (int)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+ owidth = (int)(ceil((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
+ }
+ else
+ {
+ otime = (int)(floor((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+ oheight = (int)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+ owidth = (int)(floor((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
+ }
+
+ if (pT || pW || pH)
+ {
+ // ensure that the last pooling starts inside the image
+ if ((otime - 1)*dT >= itime + pT)
+ --otime;
+ if ((oheight - 1)*dH >= iheight + pH)
+ --oheight;
+ if ((owidth - 1)*dW >= iwidth + pW)
+ --owidth;
+ }
+
+ if (otime < 1 || owidth < 1 || oheight < 1)
+ THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+ nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth);
+ }
+ if (indices != NULL) {
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimN, nslices);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimt, otime);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimh, oheight);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimw, owidth);
+ }
+}
+
+static void THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ THIndex_t *indz_p,
+ long nslices,
+ long itime,
+ long iwidth,
+ long iheight,
+ long otime,
+ long owidth,
+ long oheight,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int dilationT,
+ int dilationW,
+ int dilationH)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j, ti;
+ for (ti = 0; ti < otime; ti++)
+ {
+ for (i = 0; i < oheight; i++)
+ {
+ for (j = 0; j < owidth; j++)
+ {
+ /* local pointers */
+
+ long start_t = ti * dT - pT;
+ long start_h = i * dH - pH;
+ long start_w = j * dW - pW;
+
+ long kernel_t = fminf(kT, kT + start_t);
+ long kernel_h = fminf(kH, kH + start_h);
+ long kernel_w = fminf(kW, kW + start_w);
+
+ while(start_t < 0)
+ start_t += dilationT;
+ while(start_h < 0)
+ start_h += dilationH;
+ while(start_w < 0)
+ start_w += dilationW;
+
+ real *ip = input_p + k * itime * iwidth * iheight
+ + start_t * iwidth * iheight + start_h * iwidth + start_w;
+ real *op = output_p + k * otime * owidth * oheight
+ + ti * owidth * oheight + i * owidth + j;
+ THIndex_t *indzp = indz_p + k * otime * owidth * oheight
+ + ti * owidth * oheight + i * owidth + j;
+
+ /* compute local max: */
+ real maxval = -THInf;
+ int x,y,z;
+ int mx, my, mz;
+ mx = my = mz = -1;
+
+ for (z = 0; z < kernel_t; z++)
+ {
+ for (y = 0; y < kernel_h; y++)
+ {
+ for (x = 0; x < kernel_w; x++)
+ {
+ if ((start_t + z * dilationT < itime) && (start_h + y * dilationH < iheight) && (start_w + x * dilationW < iwidth))
+ {
+ real val = *(ip + z * dilationT * iwidth * iheight + y * dilationH * iwidth + x * dilationW);
+ if (val > maxval)
+ {
+ maxval = val;
+ // Store indices w.r.t the kernel dimension
+ mz = z + (kT - kernel_t);
+ my = y + (kH - kernel_h);
+ mx = x + (kW - kernel_w);
+ }
+ }
+ }
+ }
+ }
+
+ // set max values
+ ((unsigned char*)(indzp))[0] = mz;
+ ((unsigned char*)(indzp))[1] = my;
+ ((unsigned char*)(indzp))[2] = mx;
+ ((unsigned char*)(indzp))[3] = 0;
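+
+ /* The kernel-relative argmax offsets (t,h,w) are packed one per byte
+ into a single index word; updateGradInput unpacks them and recovers
+ the input position as window_start + offset * dilation. This packing
+ assumes every kernel dimension is < 256. */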
+
+ /* set output to local max */
+ *op = maxval;
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int dilationT,
+ int dilationW,
+ int dilationH,
+ bool ceilMode)
+{
+ long nslices;
+ long itime;
+ long iheight;
+ long iwidth;
+ long otime;
+ long oheight;
+ long owidth;
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_(VolumetricDilatedMaxPooling_shapeCheck)(
+ state, input, NULL, NULL,
+ kT, kW, kH, dT, dW, dH,
+ pT, pW, pH, dilationT, dilationW, dilationH,
+ ceilMode);
+
+ /* sizes */
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ if (ceilMode)
+ {
+ otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+ oheight = (int)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+ owidth = (int)(ceil((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
+ }
+ else
+ {
+ otime = (int)(floor((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+ oheight = (int)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+ owidth = (int)(floor((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
+ }
+
+ if (pT || pW || pH)
+ {
+ // ensure that the last pooling starts inside the image
+ if ((otime - 1)*dT >= itime + pT)
+ --otime;
+ if ((oheight - 1)*dH >= iheight + pH)
+ --oheight;
+ if ((owidth - 1)*dW >= iwidth + pW)
+ --owidth;
+ }
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ if (input->nDimension == 4) /* non-batch mode */
+ {
+ /* resize output */
+ THTensor_(resize4d)(output, nslices, otime, oheight, owidth);
+ /* indices will contain ti,i,j uchar locations packed into each index word */
+ THIndexTensor_(resize4d)(indices, nslices, otime, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
+ input_data, output_data,
+ indices_data,
+ nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ dilationT, dilationW, dilationH
+ );
+ }
+ else /* batch mode */
+ {
+ long p;
+ long nBatch = input->size[0];
+
+ long istride = nslices * itime * iwidth * iheight;
+ long ostride = nslices * otime * owidth * oheight;
+
+ /* resize output */
+ THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);
+ /* indices will contain ti,i,j locations for each output point */
+ THIndexTensor_(resize5d)(indices, nBatch, nslices, otime, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+#pragma omp parallel for private(p)
+ for (p=0; p < nBatch; p++)
+ {
+ THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
+ input_data + p * istride,
+ output_data + p * ostride,
+ indices_data + p * ostride,
+ nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ dilationT, dilationW, dilationH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ THIndex_t *indz_p,
+ long nslices,
+ long itime,
+ long iwidth,
+ long iheight,
+ long otime,
+ long owidth,
+ long oheight,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int dilationT,
+ int dilationW,
+ int dilationH)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ real *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight;
+ real *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight;
+ THIndex_t *indz_p_k = indz_p + k * otime * owidth * oheight;
+
+ /* calculate max points */
+ long ti, i, j;
+ for (ti = 0; ti < otime; ti++)
+ {
+ for (i = 0; i < oheight; i++)
+ {
+ for (j = 0; j < owidth; j++)
+ {
+ /* retrieve position of max */
+ THIndex_t * indzp = &indz_p_k[ti * oheight * owidth + i * owidth + j];
+ long maxti = ((unsigned char*)(indzp))[0] * dilationT + ti * dT - pT;
+ long maxi = ((unsigned char*)(indzp))[1] * dilationH + i * dH - pH;
+ long maxj = ((unsigned char*)(indzp))[2] * dilationW + j * dW - pW;
+
+ if (maxti != -1) {
+ /* update gradient */
+ gradInput_p_k[maxti * iheight * iwidth + maxi * iwidth + maxj] +=
+ gradOutput_p_k[ti * oheight * owidth + i * owidth + j];
+ }
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int dilationT,
+ int dilationW,
+ int dilationH,
+ bool ceilMode)
+{
+ int nslices;
+ int itime;
+ int iheight;
+ int iwidth;
+ int otime;
+ int oheight;
+ int owidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ THNN_(VolumetricDilatedMaxPooling_shapeCheck)(
+ state, input, gradOutput, indices,
+ kT, kW, kH, dT, dW, dH,
+ pT, pW, pH, dilationT, dilationW, dilationH,
+ ceilMode);
+
+ // TODO: gradOutput shape check
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ otime = gradOutput->size[dimt];
+ oheight = gradOutput->size[dimh];
+ owidth = gradOutput->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ /* backprop */
+ if (input->nDimension == 4) /* non-batch mode */
+ {
+ THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
+ gradInput_data, gradOutput_data,
+ indices_data,
+ nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ dT, dW, dH,
+ pT, pW, pH,
+ dilationT, dilationW, dilationH
+ );
+ }
+ else /* batch mode */
+ {
+ long p;
+ long nBatch = input->size[0];
+
+ long istride = nslices * itime * iwidth * iheight;
+ long ostride = nslices * otime * owidth * oheight;
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nBatch; p++)
+ {
+ THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
+ gradInput_data + p * istride,
+ gradOutput_data + p * ostride,
+ indices_data + p * ostride,
+ nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ dT, dW, dH,
+ pT, pW, pH,
+ dilationT, dilationW, dilationH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFractionalMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFractionalMaxPooling.c
new file mode 100644
index 000000000..236986bb9
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFractionalMaxPooling.c
@@ -0,0 +1,279 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricFractionalMaxPooling.c"
+#else
+
+static long* THNN_(VolumetricFractionalMaxPooling_generateIntervals)(
+ real sample,
+ long inputSize,
+ long outputSize,
+ int poolSize) {
+ real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1);
+ long* sequence = (long*) THAlloc(sizeof(long) * outputSize);
+
+ long i;
+ for (i = 0; i < outputSize - 1; ++i) {
+ sequence[i] =
+ (long) ((i + sample) * alpha) - (long) (sample * alpha);
+ }
+ sequence[outputSize - 1] = inputSize - poolSize;
+
+ return sequence;
+}
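+
+/* The random sample in [0, 1) jitters where each pooling window starts: alpha
+ spreads outputSize windows of width poolSize across the input, so consecutive
+ starts differ by floor(alpha) or ceil(alpha). E.g. inputSize=10, outputSize=5,
+ poolSize=2 gives alpha=2 and, for sample 0.3, starts {0, 2, 4, 6, 8}; the
+ last start is always pinned to inputSize - poolSize. */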
+
+static void THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)(
+ real* input,
+ real* output,
+ THIndex_t* indices,
+ real* randomSamples,
+ long numPlanes,
+ long inputT, long inputW, long inputH,
+ long outputT, long outputW, long outputH,
+ int poolSizeT, int poolSizeW, int poolSizeH) {
+ long plane;
+#pragma omp parallel for private(plane)
+ for (plane = 0; plane < numPlanes; ++plane) {
+ /* each plane contains 3 random samples, one for T, one for W, and one for H */
+ real* randomSamplesForPlane = randomSamples + plane * 3;
+
+ /* Generate interval sequence */
+ long* sequenceT =
+ THNN_(VolumetricFractionalMaxPooling_generateIntervals)(
+ randomSamplesForPlane[0], inputT, outputT, poolSizeT);
+ long* sequenceW =
+ THNN_(VolumetricFractionalMaxPooling_generateIntervals)(
+ randomSamplesForPlane[1], inputW, outputW, poolSizeW);
+ long* sequenceH =
+ THNN_(VolumetricFractionalMaxPooling_generateIntervals)(
+ randomSamplesForPlane[2], inputH, outputH, poolSizeH);
+
+ /* loop over output */
+ long h, w, t;
+
+ real* inputForPlane = input + plane * inputT * inputW * inputH;
+ real* outputForPlane = output + plane * outputT * outputW * outputH;
+ THIndex_t* indicesForPlane = indices + plane * outputT * outputW * outputH;
+
+ for (h = 0; h < outputH; ++h) {
+ long inputHStart = sequenceH[h];
+
+ for (w = 0; w < outputW; ++w) {
+ long inputWStart = sequenceW[w];
+
+ for (t = 0; t < outputT; ++t) {
+ long inputTStart = sequenceT[t];
+
+ real maxVal = -THInf;
+ long maxIndex = -1;
+
+ long h2, w2, t2;
+ for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) {
+ for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) {
+ for (t2 = inputTStart; t2 < inputTStart + poolSizeT; ++t2) {
+ THAssert(h2 >= 0 && h2 < inputH);
+ THAssert(w2 >= 0 && w2 < inputW);
+ THAssert(t2 >= 0 && t2 < inputT);
+
+ long planeIndex = h2 * inputW * inputT + w2 * inputT + t2;
+ real val = inputForPlane[planeIndex];
+ if (val > maxVal) {
+ maxVal = val;
+ maxIndex = planeIndex;
+ }
+ }
+ }
+ }
+
+ THAssert(maxVal != -THInf);
+ THAssert(maxIndex != -1);
+
+ outputForPlane[h * outputW * outputT + w * outputT + t] = maxVal;
+ /* +1 to lua index */
+ indicesForPlane[h * outputW * outputT + w * outputT + t] = maxIndex + TH_INDEX_BASE;
+ }
+ }
+ }
+
+ THFree(sequenceT);
+ THFree(sequenceW);
+ THFree(sequenceH);
+ }
+}
+
+void THNN_(VolumetricFractionalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputT, int outputW, int outputH,
+ int poolSizeT, int poolSizeW, int poolSizeH,
+ THIndexTensor *indices,
+ THTensor *randomSamples) {
+
+ long numBatch = 1;
+ int planeDim = 0;
+ int heightDim = 1;
+ int widthDim = 2;
+ int timeDim = 3;
+
+ long numInputDims = THTensor_(nDimension)(input);
+ THNN_ARGCHECK(numInputDims == 4 || numInputDims == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ if (numInputDims == 5) {
+ numBatch = THTensor_(size)(input, 0);
+ planeDim++;
+ heightDim++;
+ widthDim++;
+ timeDim++;
+ }
+
+ /* sizes */
+ long numPlanes = THTensor_(size)(input, planeDim);
+ long inputH = THTensor_(size)(input, heightDim);
+ long inputW = THTensor_(size)(input, widthDim);
+ long inputT = THTensor_(size)(input, timeDim);
+
+ THArgCheck(outputH + poolSizeH - 1 < inputH, 9,
+ "poolSizeH (%d) too large relative to input height (%ld)",
+ poolSizeH, inputH);
+ THArgCheck(outputW + poolSizeW - 1 < inputW, 8,
+ "poolSizeW (%d) too large relative to input width (%ld)",
+ poolSizeW, inputW);
+ THArgCheck(outputT + poolSizeT - 1 < inputT, 7,
+ "poolSizeT (%d) too large relative to input time (%ld)",
+ poolSizeT, inputT);
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ if (numInputDims == 4) {
+ /* resize output */
+ THTensor_(resize4d)(output, numPlanes, outputH, outputW, outputT);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize4d)(indices, numPlanes, outputH, outputW, outputT);
+
+ THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)(
+ THTensor_(data)(input),
+ THTensor_(data)(output),
+ THIndexTensor_(data)(indices),
+ THTensor_(data)(randomSamples),
+ numPlanes, inputT, inputW, inputH,
+ outputT, outputW, outputH, poolSizeT, poolSizeW, poolSizeH);
+ } else {
+ THTensor_(resize5d)(output, numBatch, numPlanes, outputH, outputW, outputT);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize5d)(indices, numBatch, numPlanes, outputH, outputW, outputT);
+
+ long batch;
+#pragma omp parallel for private(batch)
+ for (batch = 0; batch < numBatch; ++batch) {
+ THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)(
+ THTensor_(data)(input) + batch * numPlanes * inputH * inputW * inputT,
+ THTensor_(data)(output) + batch * numPlanes * outputH * outputW * outputT,
+ THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW * outputT,
+ THTensor_(data)(randomSamples) + batch * numPlanes * 3,
+ numPlanes, inputT, inputW, inputH,
+ outputT, outputW, outputH, poolSizeT, poolSizeW, poolSizeH);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)(
+ real* gradInput,
+ real* gradOutput,
+ THIndex_t* indices,
+ long numPlanes,
+ long inputT, long inputW, long inputH,
+ long outputT, long outputW, long outputH) {
+ long plane;
+#pragma omp parallel for private(plane)
+ for (plane = 0; plane < numPlanes; plane++) {
+ real* gradInputForPlane = gradInput + plane * inputT * inputW * inputH;
+ real* gradOutputForPlane = gradOutput + plane * outputT * outputW * outputH;
+ THIndex_t* indicesForPlane = indices + plane * outputT * outputW * outputH;
+
+ long h, w, t;
+ for (h = 0; h < outputH; ++h) {
+ for (w = 0; w < outputW; ++w) {
+ for (t = 0; t < outputT; ++t) {
+ long outputIndex = h * outputW * outputT + w * outputT + t;
+ long index = indicesForPlane[outputIndex] - TH_INDEX_BASE;
+ THAssert(index >= 0 && index < inputT * inputW * inputH);
+
+ gradInputForPlane[index] += gradOutputForPlane[outputIndex];
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricFractionalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int outputT, int outputW, int outputH,
+ int poolSizeT, int poolSizeW, int poolSizeH,
+ THIndexTensor *indices) {
+
+ long numBatch = 1;
+ int planeDim = 0;
+ int heightDim = 1;
+ int widthDim = 2;
+ int timeDim = 3;
+
+ long numInputDims = THTensor_(nDimension)(input);
+ if (numInputDims == 5) {
+ numBatch = THTensor_(size)(input, 0);
+ planeDim = 1;
+ heightDim++;
+ widthDim++;
+ timeDim++;
+ }
+
+ /* sizes */
+ long numPlanes = THTensor_(size)(input, planeDim);
+ long inputH = THTensor_(size)(input, heightDim);
+ long inputW = THTensor_(size)(input, widthDim);
+ long inputT = THTensor_(size)(input, timeDim);
+
+ THArgCheck(outputT == THTensor_(size)(gradOutput, timeDim), 3,
+ "gradOutput time unexpected");
+ THArgCheck(outputW == THTensor_(size)(gradOutput, widthDim), 3,
+ "gradOutput width unexpected");
+ THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3,
+ "gradOutput height unexpected");
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* backprop */
+ if (numInputDims == 4) {
+ THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)(
+ THTensor_(data)(gradInput),
+ THTensor_(data)(gradOutput),
+ THIndexTensor_(data)(indices),
+ numPlanes, inputT, inputW, inputH, outputT, outputW, outputH);
+ } else {
+ long batch;
+#pragma omp parallel for private(batch)
+ for (batch = 0; batch < numBatch; ++batch) {
+ THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)(
+ THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW * inputT,
+ THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW * outputT,
+ THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW * outputT,
+ numPlanes, inputT, inputW, inputH, outputT, outputW, outputH);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFullConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFullConvolution.c
new file mode 100644
index 000000000..c974fab50
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFullConvolution.c
@@ -0,0 +1,541 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricFullConvolution.c"
+#else
+
+static void THNN_(vol2col)(
+ const real *data_vol, const int channels,
+ const int depth, const int height, const int width,
+ const int kT, const int kH, const int kW,
+ const int pT, const int pH, const int pW,
+ const int dT, const int dH, const int dW,
+ const int dilationT, const int dilationH, const int dilationW,
+ real *data_col)
+{
+ int c, t, h, w;
+ int depth_col = (depth + 2 * pT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ int height_col = (height + 2 * pH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ int width_col = (width + 2 * pW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ int channels_col = channels * kT * kH * kW;
+ for (c = 0; c < channels_col; ++c)
+ {
+ int w_offset = c % kW;
+ int h_offset = (c / kW) % kH;
+ int t_offset = (c / kW / kH) % kT;
+ int c_vol = c / kT / kH / kW;
+ for (t = 0; t < depth_col; ++t)
+ {
+ for (h = 0; h < height_col; ++h)
+ {
+ for (w = 0; w < width_col; ++w)
+ {
+ int t_pad = t * dT - pT + t_offset * dilationT;
+ int h_pad = h * dH - pH + h_offset * dilationH;
+ int w_pad = w * dW - pW + w_offset * dilationW;
+ if (t_pad >= 0 && t_pad < depth &&
+ h_pad >= 0 && h_pad < height &&
+ w_pad >= 0 && w_pad < width)
+ data_col[((c * depth_col + t) * height_col + h) * width_col + w] =
+ data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad];
+ else
+ data_col[((c * depth_col + t) * height_col + h) * width_col + w] = 0;
+ }
+ }
+ }
+ }
+}
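+
+// vol2col unrolls every (kT x kH x kW) receptive field into one column of a
+// (channels*kT*kH*kW) x (depth_col*height_col*width_col) matrix: row c fixes a
+// (channel, kernel-tap) pair, column (t,h,w) fixes an output position, and
+// padding taps are written as zero. col2vol below is the adjoint operation:
+// it scatter-adds the column matrix back into the volume.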
+
+static void THNN_(col2vol)(
+ const real* data_col, const int channels,
+ const int depth, const int height, const int width,
+ const int kT, const int kH, const int kW,
+ const int pT, const int pH, const int pW,
+ const int dT, const int dH, const int dW,
+ const int dilationT, const int dilationH, const int dilationW,
+ real* data_vol)
+{
+ int c, t, h, w;
+ memset(data_vol, 0, sizeof(real) * depth * height * width * channels);
+ int depth_col = (depth + 2 * pT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ int height_col = (height + 2 * pH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ int width_col = (width + 2 * pW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ int channels_col = channels * kT * kH * kW;
+ for (c = 0; c < channels_col; ++c)
+ {
+ int w_offset = c % kW;
+ int h_offset = (c / kW) % kH;
+ int t_offset = (c / kW / kH) % kT;
+ int c_vol = c / kT / kH / kW;
+ for (t = 0; t < depth_col; ++t)
+ {
+ for (h = 0; h < height_col; ++h)
+ {
+ for (w = 0; w < width_col; ++w)
+ {
+ int t_pad = t * dT - pT + t_offset * dilationT;
+ int h_pad = h * dH - pH + h_offset * dilationH;
+ int w_pad = w * dW - pW + w_offset * dilationW;
+ if (t_pad >= 0 && t_pad < depth &&
+ h_pad >= 0 && h_pad < height &&
+ w_pad >= 0 && w_pad < width)
+ data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad] +=
+ data_col[((c * depth_col + t) * height_col + h) * width_col + w];
+ }
+ }
+ }
+ }
+}
+
+static inline void THNN_(VolumetricFullConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int dT, int dW, int dH, int pT, int pW, int pH,
+ int aT, int aW, int aH) {
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+ // number of input & output planes and kernel size are indirectly defined by the weight tensor
+ THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for weight, but got: %s");
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW);
+ THArgCheck(aT < dT && aW < dW && aH < dH, 15,
+ "output adjustment must be smaller than stride, but got "
+ "adjT: %d adjH: %d adjW: %d dT: %d dH: %d dW: %d",
+ aT, aH, aW, dT, dH, dW);
+
+ int ndim = input->nDimension;
+ const int nInputPlane = (int)weight->size[0];
+ const int nOutputPlane = (int)weight->size[1];
+ const int kT = (int)weight->size[2];
+ const int kH = (int)weight->size[3];
+ const int kW = (int)weight->size[4];
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]);
+ }
+
+ int dimf = 0;
+ int dimd = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (ndim == 5) {
+ dimf++;
+ dimd++;
+ dimh++;
+ dimw++;
+ }
+
+ const long inputWidth = input->size[dimw];
+ const long inputHeight = input->size[dimh];
+ const long inputDepth = input->size[dimd];
+ const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW;
+ const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
+ const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT;
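+
+ // A transposed (full) convolution inverts the forward size formula:
+ // out = (in - 1)*stride - 2*pad + kernel + adjustment, where the a*
+ // terms disambiguate input sizes that a strided forward convolution
+ // would map to the same output size. E.g. in=4, dT=2, pT=1, kT=3, aT=1
+ // gives (4-1)*2 - 2 + 3 + 1 = 8.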
+
+ if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+ nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+void THNN_(VolumetricFullConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input, // 4D or 5D (batch) tensor
+ THTensor *output,
+ THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+ THTensor *bias,
+ THTensor *finput, // internal columns buffer
+ THTensor *fgradInput, // internal ones buffer
+ int dT, int dW, int dH, // stride of the convolution
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH) // extra output adjustment
+{
+ THTensor *columns = finput;
+ THTensor *ones = fgradInput;
+
+ THNN_(VolumetricFullConvolution_shapeCheck)(
+ input, NULL, weight, bias,
+ dT, dW, dH, pT, pW, pH, aT, aW, aH);
+
+ const int nInputPlane = (int)weight->size[0];
+ const int nOutputPlane = (int)weight->size[1];
+ const int kT = (int)weight->size[2];
+ const int kH = (int)weight->size[3];
+ const int kW = (int)weight->size[4];
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ bias = bias ? THTensor_(newContiguous)(bias) : bias;
+ int batch = 1;
+ if (input->nDimension == 4)
+ {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ }
+
+ const long inputWidth = input->size[4];
+ const long inputHeight = input->size[3];
+ const long inputDepth = input->size[2];
+ const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW;
+ const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
+ const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT;
+
+ // Batch size + input planes
+ const long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+ THTensor_(zero)(columns);
+
+ // Define a buffer of ones, for bias accumulation
+ // Note: this buffer can be shared with other modules; it only ever grows,
+ // and always contains ones.
+ if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
+ {
+ // Resize plane and fill with ones...
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *output_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+ for (elt = 0; elt < batchSize; ++elt)
+ {
+ // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(output_n, output, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ const long m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
+ const long n = columns->size[1];
+ const long k = weight->size[0];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 'n', 't',
+ n, m, k,
+ 1,
+ THTensor_(data)(input_n), n,
+ THTensor_(data)(weight), m,
+ 0,
+ THTensor_(data)(columns), n
+ );
+
+ // Unpack columns back into input:
+ THNN_(col2vol)(
+ THTensor_(data)(columns),
+ nOutputPlane, outputDepth, outputHeight, outputWidth,
+ kT, kH, kW,
+ pT, pH, pW,
+ dT, dH, dW,
+ 1, 1, 1,
+ THTensor_(data)(output_n)
+ );
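+
+ // The forward pass of a full convolution mirrors the backward pass of a
+ // normal one: the GEMM above computes columns = weight^T * input_n (in
+ // row-major terms), and col2vol then scatter-adds the overlapping kernel
+ // contributions into output_n.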
+
+ // Do Bias after:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ const long m_ = nOutputPlane;
+ const long n_ = outputDepth * outputHeight * outputWidth;
+ const long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ if (bias) {
+ THBlas_(gemm)(
+ 't', 'n',
+ n_, m_, k_,
+ 1,
+ THTensor_(data)(ones), k_,
+ THTensor_(data)(bias), k_,
+ 1,
+ THTensor_(data)(output_n), n_
+ );
+ }
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(output_n);
+
+ // Resize output
+ if (batch == 0)
+ {
+ THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+ if (bias) THTensor_(free)(bias);
+}
+
+void THNN_(VolumetricFullConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput, // only used by cuda impl
+ int dT, int dW, int dH, // stride
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH) // extra output adjustment
+{
+ THTensor *gradColumns = finput;
+
+ // number of input & output planes and kernel size are indirectly defined by the weight tensor
+ THNN_(VolumetricFullConvolution_shapeCheck)(
+ input, gradOutput, weight, NULL,
+ dT, dW, dH, pT, pW, pH, aT, aW, aH);
+
+ const int nInputPlane = (int)weight->size[0];
+ const int nOutputPlane = (int)weight->size[1];
+ const int kT = (int)weight->size[2];
+ const int kH = (int)weight->size[3];
+ const int kW = (int)weight->size[4];
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ int batch = 1;
+ if (input->nDimension == 4)
+ {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ const long inputWidth = input->size[4];
+ const long inputHeight = input->size[3];
+ const long inputDepth = input->size[2];
+ const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW;
+ const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
+ const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT;
+
+ // Batch size + input planes
+ const long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
+ THTensor_(zero)(gradInput);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+
+ // Helpers
+ THTensor *gradInput_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+ for (elt = 0; elt < batchSize; ++elt)
+ {
+ // Matrix multiply per sample:
+ THTensor_(select)(gradInput_n, gradInput, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(vol2col)(
+ THTensor_(data)(gradOutput_n),
+ nOutputPlane, outputDepth, outputHeight, outputWidth,
+ kT, kH, kW,
+ pT, pH, pW,
+ dT, dH, dW,
+ 1, 1, 1,
+ THTensor_(data)(gradColumns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ const long m = weight->size[0];
+ const long n = gradColumns->size[1];
+ const long k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 'n', 'n',
+ n, m, k,
+ 1,
+ THTensor_(data)(gradColumns), n,
+ THTensor_(data)(weight), k,
+ 0,
+ THTensor_(data)(gradInput_n), n
+ );
+ }
+
+ // Free
+ THTensor_(free)(gradInput_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize output
+ if (batch == 0)
+ {
+ THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+void THNN_(VolumetricFullConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int dT, int dW, int dH, // stride
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH, // extra output adjustment
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ // number of input & output planes and kernel size are indirectly defined by the gradWeight tensor
+ THNN_(VolumetricFullConvolution_shapeCheck)(
+ input, gradOutput, gradWeight, gradBias,
+ dT, dW, dH, pT, pW, pH, aT, aW, aH);
+
+ int nInputPlane = (int)gradWeight->size[0];
+ int nOutputPlane = (int)gradWeight->size[1];
+ int kT = (int)gradWeight->size[2];
+ int kH = (int)gradWeight->size[3];
+ int kW = (int)gradWeight->size[4];
+
+ THTensor *columns = finput;
+ THTensor *ones = fgradInput;
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ if (gradBias)
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+
+ int batch = 1;
+ if (input->nDimension == 4)
+ {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ const long inputWidth = input->size[4];
+ const long inputHeight = input->size[3];
+ const long inputDepth = input->size[2];
+ const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW;
+ const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
+ const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT;
+
+ // Batch size + input planes
+ const long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
+ {
+ // Resize plane and fill with ones...
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+ for (elt = 0; elt < batchSize; ++elt)
+ {
+ // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(vol2col)(
+ THTensor_(data)(gradOutput_n), nOutputPlane,
+ outputDepth, outputHeight, outputWidth,
+ kT, kH, kW,
+ pT, pH, pW,
+ dT, dH, dW,
+ 1, 1, 1,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ const long n = columns->size[0]; // nOutputPlane * kt * kh * kw
+ const long m = input_n->size[0]; // nInputPlane
+ const long k = columns->size[1]; // inputHeight * inputWidth
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 't', 'n',
+ n, m, k,
+ scale,
+ THTensor_(data)(columns), k,
+ THTensor_(data)(input_n), k,
+ 1,
+ THTensor_(data)(gradWeight), n
+ );
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ const long m_ = nOutputPlane;
+ const long k_ = outputDepth * outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+ if (gradBias) {
+ THBlas_(gemv)(
+ 't',
+ k_, m_,
+ scale,
+ THTensor_(data)(gradOutput_n), k_,
+ THTensor_(data)(ones), 1,
+ 1,
+ THTensor_(data)(gradBias), 1
+ );
+ }
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize
+ if (batch == 0)
+ {
+ THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxPooling.c
new file mode 100644
index 000000000..a3601e0b6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxPooling.c
@@ -0,0 +1,50 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricMaxPooling.c"
+#else
+
+void THNN_(VolumetricMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ bool ceilMode)
+{
+ THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+ state, input, output, indices,
+ kT, kW, kH, dT, dW, dH,
+ pT, pW, pH, 1, 1, 1, ceilMode);
+}
+
+void THNN_(VolumetricMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ bool ceilMode)
+{
+ THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+ state, input, gradOutput, gradInput, indices,
+ kT, kW, kH, dT, dW, dH,
+ pT, pW, pH, 1, 1, 1, ceilMode);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxUnpooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxUnpooling.c
new file mode 100644
index 000000000..d9d9e5951
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxUnpooling.c
@@ -0,0 +1,373 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricMaxUnpooling.c"
+#else
+
+static inline void THNN_(VolumetricMaxUnpooling_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THIndexTensor *indices,
+ int oT,
+ int oW,
+ int oH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ THNN_CHECK_SHAPE_INDICES(input, indices);
+
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 10,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d",
+ dT, dH, dW);
+
+ int dimw = 3;
+ int dimh = 2;
+ int dimt = 1;
+ int dimn = 0;
+
+ if (input->nDimension == 5)
+ {
+ dimt++;
+ dimw++;
+ dimh++;
+ dimn++;
+ }
+ int nslices = input->size[dimn];
+
+ if (gradOutput != NULL) {
+ if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh])
+ {
+ THError(
+ "Inconsistent gradOutput size. oT= %d, oH= %d, oW= %d, gradOutput: %dx%dx%d",
+ oT, oH, oW, gradOutput->size[dimt], gradOutput->size[dimh], gradOutput->size[dimw]
+ );
+ }
+
+ THNN_CHECK_DIM_SIZE(gradOutput, input->nDimension, dimn, nslices);
+ }
+}
+
+static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ THIndex_t *ind_p,
+ int nslices,
+ int iT,
+ int iW,
+ int iH,
+ int oT,
+ int oW,
+ int oH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int k;
+ int has_error = 0;
+ THIndex_t error_index;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ int ti, i, j, maxz, maxy, maxx;
+ for (ti = 0; ti < iT; ti++)
+ {
+ for (i = 0; i < iH; i++)
+ {
+ for (j = 0; j < iW; j++)
+ {
+ int start_t = ti * dT - pT;
+ int start_h = i * dH - pH;
+ int start_w = j * dW - pW;
+
+ real *input_p_k = input_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+ THIndex_t *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+
+ maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */
+ maxy = ((unsigned char*)(ind_p_k))[1];
+ maxx = ((unsigned char*)(ind_p_k))[2];
+
+ THIndex_t idx = k*oT*oW*oH + oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx);
+ if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=oT
+ || start_h+maxy>=oH || start_w+maxx>=oW)
+ {
+#pragma omp critical
+ {
+ has_error = 1;
+ error_index = idx;
+ }
+ } else {
+ output_p[idx] = *input_p_k; /* update output */
+ }
+ }
+ }
+ }
+ }
+ if (has_error) {
+ THError(
+ "found an invalid max index %ld (output volumes are of size %dx%dx%d)",
+ error_index, oT, oH, oW
+ );
+ }
+}
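+
+/* Unpooling scatters each input value to the position its pooling switch
+ recorded: the per-byte (t,h,w) offsets packed into the index word are added
+ to the window start (ti*dT - pT, ...), so the output stays zero everywhere
+ except at the argmax locations of the preceding max pooling. */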
+
+void THNN_(VolumetricMaxUnpooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int oT,
+ int oW,
+ int oH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int dimw = 3;
+ int dimh = 2;
+ int dimt = 1;
+ int nbatch = 1;
+ int nslices;
+ int iT;
+ int iH;
+ int iW;
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+ THNN_(VolumetricMaxUnpooling_shapeCheck)(
+ state, input, NULL, indices,
+ oT, oW, oH, dT, dW, dH, pT, pW, pH);
+
+ if (input->nDimension == 5)
+ {
+ nbatch = input->size[0];
+ dimt++;
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimt-1];
+ iT = input->size[dimt];
+ iH = input->size[dimh];
+ iW = input->size[dimw];
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+ indices = THIndexTensor_(newContiguous)(indices);
+
+ /* resize output */
+ if (input->nDimension == 4)
+ {
+ THTensor_(resize4d)(output, nslices, oT, oH, oW);
+ THTensor_(zero)(output);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
+ input_data, output_data,
+ indices_data,
+ nslices,
+ iT, iW, iH,
+ oT, oW, oH,
+ dT, dW, dH, pT, pW, pH
+ );
+ }
+ else
+ {
+ int p;
+
+ THTensor_(resize5d)(output, nbatch, nslices, oT, oH, oW);
+ THTensor_(zero)(output);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
+ input_data+p*nslices*iT*iW*iH,
+ output_data+p*nslices*oT*oW*oH,
+ indices_data+p*nslices*iT*iW*iH,
+ nslices,
+ iT, iW, iH,
+ oT, oW, oH,
+ dT, dW, dH,
+ pT, pW, pH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+ THIndexTensor_(free)(indices);
+}
+
+static void THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ THIndex_t *ind_p,
+ int nslices,
+ int iT,
+ int iW,
+ int iH,
+ int oT,
+ int oW,
+ int oH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ int ti, i, j, maxz, maxy, maxx;
+ for (ti = 0; ti < iT; ti++)
+ {
+ for (i = 0; i < iH; i++)
+ {
+ for (j = 0; j < iW; j++)
+ {
+ int start_t = ti * dT - pT;
+ int start_h = i * dH - pH;
+ int start_w = j * dW - pW;
+
+ real *gradInput_p_k = gradInput_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+ THIndex_t *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+
+ maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */
+ maxy = ((unsigned char*)(ind_p_k))[1];
+ maxx = ((unsigned char*)(ind_p_k))[2];
+
+ if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0
+ || start_t+maxz>=oT || start_h+maxy>=oH || start_w+maxx>=oW)
+ {
+ THError(
+ "invalid max index z= %d, y= %d, x= %d, oT= %d, oW= %d, oH= %d",
+ start_t+maxz, start_h+maxy, start_w+maxx, oT, oW, oH
+ );
+ }
+ *gradInput_p_k = gradOutput_p[k*oT*oW*oH + oH*oW*(start_t+maxz)
+ + oW*(start_h+maxy) + (start_w+maxx)]; /* update gradient */
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricMaxUnpooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int oT,
+ int oW,
+ int oH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int dimw = 3;
+ int dimh = 2;
+ int dimt = 1;
+ int nbatch = 1;
+ int nslices;
+ int iT;
+ int iH;
+ int iW;
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ THNN_(VolumetricMaxUnpooling_shapeCheck)(
+ state, input, gradOutput, indices,
+ oT, oW, oH, dT, dW, dH, pT, pW, pH);
+
+ // TODO: check gradOutput shape
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ indices = THIndexTensor_(newContiguous)(indices);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 5)
+ {
+ nbatch = input->size[0];
+ dimt++;
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimt-1];
+ iT = input->size[dimt];
+ iH = input->size[dimh];
+ iW = input->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ /* backprop */
+ if (input->nDimension == 4)
+ {
+ THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
+ gradInput_data, gradOutput_data,
+ indices_data,
+ nslices,
+ iT, iW, iH,
+ oT, oW, oH,
+ dT, dW, dH,
+ pT, pW, pH
+ );
+ }
+ else
+ {
+ int p;
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
+ gradInput_data+p*nslices*iT*iW*iH,
+ gradOutput_data+p*nslices*oT*oW*oH,
+ indices_data+p*nslices*iT*iW*iH,
+ nslices,
+ iT, iW, iH,
+ oT, oW, oH,
+ dT, dW, dH,
+ pT, pW, pH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+ THIndexTensor_(free)(indices);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricReplicationPadding.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricReplicationPadding.c
new file mode 100644
index 000000000..4d8993ec2
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricReplicationPadding.c
@@ -0,0 +1,357 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricReplicationPadding.c"
+#else
+
+static inline void THNN_(VolumetricReplicationPadding_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback) {
+ int dimw = 3;
+ int dimh = 2;
+ int dimd = 1;
+ int dimslices = 0;
+ long nslices;
+ long idepth;
+ long iheight;
+ long iwidth;
+ long odepth;
+ long oheight;
+ long owidth;
+
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 5)
+ {
+ dimw++;
+ dimh++;
+ dimd++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ idepth = input->size[dimd];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ odepth = idepth + pfront + pback;
+ oheight = iheight + ptop + pbottom;
+ owidth = iwidth + pleft + pright;
+
+ THArgCheck(owidth >= 1 && oheight >= 1 && odepth >= 1, 2,
+ "input (D: %d, H: %d, W: %d) is too small."
+ " Calculated output D: %d, H: %d, W: %d",
+ idepth, iheight, iwidth, odepth, oheight, owidth);
+
+ if (gradOutput != NULL) {
+ THArgCheck(nslices == THTensor_(size)(gradOutput, dimslices), 3,
+ "gradOutput nslices unexpected. Expected: %d, Got: %d",
+ nslices, THTensor_(size)(gradOutput, dimslices));
+ THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
+ "gradOutput width unexpected. Expected: %d, Got: %d",
+ owidth, THTensor_(size)(gradOutput, dimw));
+ THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
+ "gradOutput height unexpected. Expected: %d, Got: %d",
+ oheight, THTensor_(size)(gradOutput, dimh));
+ THArgCheck(odepth == THTensor_(size)(gradOutput, dimd), 3,
+ "gradOutput depth unexpected. Expected: %d, Got: %d",
+ odepth, THTensor_(size)(gradOutput, dimd));
+ }
+}
+
+static void THNN_(VolumetricReplicationPadding_updateOutput_frame)(
+ real *input_p, real *output_p,
+ long nslices,
+ long iwidth, long iheight, long idepth,
+ long owidth, long oheight, long odepth,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback)
+{
+ int iStartX = fmax(0, -pleft);
+ int iStartY = fmax(0, -ptop);
+ int iStartZ = fmax(0, -pfront);
+ int oStartX = fmax(0, pleft);
+ int oStartY = fmax(0, ptop);
+ int oStartZ = fmax(0, pfront);
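+ /* the fmax(0, ...) clamps let iStart/oStart absorb negative padding
+ (cropping): with pleft < 0 the copy starts -pleft samples into the
+ input instead of writing a replicated border */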
+
+ long k, ip_x, ip_y, ip_z;
+#pragma omp parallel for private(k, ip_x, ip_y, ip_z)
+ for (k = 0; k < nslices; k++) {
+ long i, j, z;
+ for (z = 0; z < odepth; z++) {
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pleft) {
+ ip_x = pleft;
+ } else if (j >= pleft && j < iwidth + pleft) {
+ ip_x = j;
+ } else {
+ ip_x = iwidth + pleft - 1;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < ptop) {
+ ip_y = ptop;
+ } else if (i >= ptop && i < iheight + ptop) {
+ ip_y = i;
+ } else {
+ ip_y = iheight + ptop - 1;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ if (z < pfront) {
+ ip_z = pfront;
+ } else if (z >= pfront && z < idepth + pfront) {
+ ip_z = z;
+ } else {
+ ip_z = idepth + pfront - 1;
+ }
+ ip_z = ip_z - oStartZ + iStartZ;
+
+ real *dest_p = output_p + k * owidth * oheight * odepth +
+ z * owidth * oheight + i * owidth + j;
+ real *src_p = input_p + k * iwidth * iheight * idepth +
+ ip_z * iwidth * iheight + ip_y * iwidth + ip_x;
+ *dest_p = *src_p;
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricReplicationPadding_updateOutput)(THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback)
+{
+ int dimw = 3;
+ int dimh = 2;
+ int dimd = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long idepth;
+ long iheight;
+ long iwidth;
+ long odepth;
+ long oheight;
+ long owidth;
+ real *input_data;
+ real *output_data;
+
+ THNN_(VolumetricReplicationPadding_shapeCheck)(
+ state, input, NULL, pleft, pright,
+ ptop, pbottom, pfront, pback);
+
+ if (input->nDimension == 5)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimd++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ idepth = input->size[dimd];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ odepth = idepth + pfront + pback;
+ oheight = iheight + ptop + pbottom;
+ owidth = iwidth + pleft + pright;
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ /* resize output */
+ if (input->nDimension == 4)
+ {
+ THTensor_(resize4d)(output, nslices, odepth, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+ THNN_(VolumetricReplicationPadding_updateOutput_frame)(
+ input_data, output_data, nslices, iwidth, iheight, idepth,
+ owidth, oheight, odepth, pleft, pright, ptop, pbottom, pfront,
+ pback);
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize5d)(output, nbatch, nslices, odepth, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(VolumetricReplicationPadding_updateOutput_frame)(
+ input_data + p * nslices * iwidth * iheight * idepth,
+ output_data + p * nslices * owidth * oheight * odepth,
+ nslices,
+ iwidth, iheight, idepth,
+ owidth, oheight, odepth,
+ pleft, pright,
+ ptop, pbottom,
+ pfront, pback);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
+ real *ginput_p, real *goutput_p,
+ long nslices,
+ long iwidth, long iheight, long idepth,
+ long owidth, long oheight, long odepth,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback)
+{
+ int iStartX = fmax(0, -pleft);
+ int iStartY = fmax(0, -ptop);
+ int iStartZ = fmax(0, -pfront);
+ int oStartX = fmax(0, pleft);
+ int oStartY = fmax(0, ptop);
+ int oStartZ = fmax(0, pfront);
+
+ long k, ip_x, ip_y, ip_z;
+#pragma omp parallel for private(k, ip_x, ip_y, ip_z)
+ for (k = 0; k < nslices; k++) {
+ long i, j, z;
+ for (z = 0; z < odepth; z++) {
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pleft) {
+ ip_x = pleft;
+ } else if (j >= pleft && j < iwidth + pleft) {
+ ip_x = j;
+ } else {
+ ip_x = iwidth + pleft - 1;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < ptop) {
+ ip_y = ptop;
+ } else if (i >= ptop && i < iheight + ptop) {
+ ip_y = i;
+ } else {
+ ip_y = iheight + ptop - 1;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ if (z < pfront) {
+ ip_z = pfront;
+ } else if (z >= pfront && z < idepth + pfront) {
+ ip_z = z;
+ } else {
+ ip_z = idepth + pfront - 1;
+ }
+ ip_z = ip_z - oStartZ + iStartZ;
+
+ real *src_p = goutput_p + k * owidth * oheight * odepth +
+ z * owidth * oheight + i * owidth + j;
+ real *dest_p = ginput_p + k * iwidth * iheight * idepth +
+ ip_z * iwidth * iheight + ip_y * iwidth + ip_x;
+ *dest_p += *src_p;
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricReplicationPadding_updateGradInput)(THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback)
+{
+ int dimw = 3;
+ int dimh = 2;
+ int dimd = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long idepth;
+ long iheight;
+ long iwidth;
+ long odepth;
+ long oheight;
+ long owidth;
+
+ if (input->nDimension == 5)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimd++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ idepth = input->size[dimd];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ odepth = idepth + pfront + pback;
+ oheight = iheight + ptop + pbottom;
+ owidth = iwidth + pleft + pright;
+
+ THNN_(VolumetricReplicationPadding_shapeCheck)(
+ state, input, NULL, pleft, pright,
+ ptop, pbottom, pfront, pback);
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* backprop */
+ if (input->nDimension == 4) {
+ THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput),
+ THTensor_(data)(gradOutput),
+ nslices,
+ iwidth, iheight, idepth,
+ owidth, oheight, odepth,
+ pleft, pright,
+ ptop, pbottom,
+ pfront, pback);
+ } else {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++) {
+ THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput) + p * nslices * idepth * iheight * iwidth,
+ THTensor_(data)(gradOutput) + p * nslices * odepth * oheight * owidth,
+ nslices,
+ iwidth, iheight, idepth,
+ owidth, oheight, odepth,
+ pleft, pright,
+ ptop, pbottom,
+ pfront, pback);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
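
Both the forward and backward kernels above share one clamp-to-edge mapping from
an output coordinate to the input coordinate it replicates. Stripped of the
tensor plumbing, the per-axis rule reduces to the following sketch (hypothetical
helper; assumes non-negative padding, since negative padding is absorbed by the
iStart/oStart offsets in the real code):

#include <stdio.h>

/* Map output index o along one axis to the input index it replicates.
 * isize : input extent along the axis
 * pad_lo: padding added before the axis (pleft/ptop/pfront above) */
static long replicate_index(long o, long isize, long pad_lo)
{
  if (o < pad_lo)          return 0;         /* clamp to the first element */
  if (o >= isize + pad_lo) return isize - 1; /* clamp to the last element */
  return o - pad_lo;                         /* interior: plain shift */
}

int main(void)
{
  /* input of width 4 padded by 2 on each side -> output width 8:
   * prints 0 0 0 1 2 3 3 3 */
  for (long o = 0; o < 8; o++)
    printf("%ld ", replicate_index(o, 4, 2));
  printf("\n");
  return 0;
}
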
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingNearest.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingNearest.c
new file mode 100644
index 000000000..9068fb58d
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingNearest.c
@@ -0,0 +1,226 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricUpSamplingNearest.c"
+#else
+
+static inline void THNN_(VolumetricUpSamplingNearest_shapeCheck)
+ (THTensor *input, THTensor *gradOutput,
+ int scale_factor) {
+ THArgCheck(input != NULL, 2, "4D or 5D input tensor expected but got NULL");
+ THArgCheck(scale_factor > 1, 4,
+ "scale_factor must be greater than 1, but got: %d", scale_factor);
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D input tensor expected but got: %s");
+ if (input->nDimension == 4) {
+ int nChannels = THTensor_(size)(input, 0);
+ int inputDepth = THTensor_(size)(input, 1);
+ int inputHeight = THTensor_(size)(input, 2);
+ int inputWidth = THTensor_(size)(input, 3);
+ int outputDepth = inputDepth * scale_factor;
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth);
+ }
+ } else {
+ int nBatch = THTensor_(size)(input, 0);
+ int nChannels = THTensor_(size)(input, 1);
+ int inputDepth = THTensor_(size)(input, 2);
+ int inputHeight = THTensor_(size)(input, 3);
+ int inputWidth = THTensor_(size)(input, 4);
+ int outputDepth = inputDepth * scale_factor;
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nBatch);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, outputWidth);
+ }
+ }
+}
+
+void THNN_(VolumetricUpSamplingNearest_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int scale_factor)
+{
+ THNN_(VolumetricUpSamplingNearest_shapeCheck)(input, NULL, scale_factor);
+ int inputDepth = THTensor_(size)(input, input->nDimension-3);
+ int inputHeight = THTensor_(size)(input, input->nDimension-2);
+ int inputWidth = THTensor_(size)(input, input->nDimension-1);
+ int outputDepth = inputDepth * scale_factor;
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+
+ if (input->nDimension == 4) {
+ THTensor_(resize4d)(output,
+ THTensor_(size)(input, 0),
+ outputDepth, outputHeight, outputWidth);
+ } else {
+ THTensor_(resize5d)(output,
+ THTensor_(size)(input, 0),
+ THTensor_(size)(input, 1),
+ outputDepth, outputHeight, outputWidth);
+ }
+
+ int dT = scale_factor;
+ int dW = scale_factor;
+ int dH = scale_factor;
+ int xDim = input->nDimension-3;
+ int yDim = input->nDimension-2;
+ int zDim = input->nDimension-1;
+
+ // dims
+ int idim = input->nDimension;
+ int osz0 = output->size[0];
+ int osz1 = output->size[1];
+ int osz2 = output->size[2];
+ int osz3 = output->size[3];
+ int osz4 = 1;
+ if (idim > 4) {
+ osz4 = output->size[4];
+ }
+
+ // get strides
+ long *is = input->stride;
+ long *os = output->stride;
+
+ // get raw pointers
+ real *pin = THTensor_(data)(input);
+ real *pout = THTensor_(data)(output);
+
+ // perform the upsampling
+ int i0, i1, i2, i3, i4, isrc, idst;
+ int iout[5]; // Output indices
+ int iin[5]; // Input indices
+
+ for (i0 = 0; i0 < osz0; i0++) {
+ iout[0] = i0;
+ iin[0] = i0;
+ for (i1 = 0; i1 < osz1; i1++) {
+ iout[1] = i1;
+ iin[1] = i1;
+ for (i2 = 0; i2 < osz2; i2++) {
+ iout[2] = i2;
+ iin[2] = i2;
+ for (i3 = 0; i3 < osz3; i3++) {
+ iout[3] = i3;
+ iin[3] = i3;
+ for (i4 = 0; i4 < osz4; i4++) {
+ iout[4] = i4;
+ iin[4] = i4;
+
+ // set the indices for the upsampled dimensions
+ iin[xDim] = iout[xDim] / dW;
+ iin[yDim] = iout[yDim] / dH;
+ iin[zDim] = iout[zDim] / dT;
+
+ idst = i0*os[0] + i1*os[1] + i2*os[2] + i3*os[3];
+ isrc = iin[0]*is[0] + iin[1]*is[1] + iin[2]*is[2] + iin[3]*is[3];
+ if (idim > 4) {
+ idst += i4*os[4];
+ isrc += iin[4]*is[4];
+ }
+
+ pout[idst] = pin[isrc];
+ }
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int scale_factor)
+{
+ THNN_(VolumetricUpSamplingNearest_shapeCheck)(input, gradOutput, scale_factor);
+ THTensor_(resizeAs)(gradInput, input);
+
+ int dW = scale_factor;
+ int dH = scale_factor;
+ int dT = scale_factor;
+ int xDim = gradInput->nDimension-3;
+ int yDim = gradInput->nDimension-2;
+ int zDim = gradInput->nDimension-1;
+
+ // dims
+ int idim = gradInput->nDimension; // Guaranteed to be 4 or 5 by the shape check
+ int isz0 = gradInput->size[0];
+ int isz1 = gradInput->size[1];
+ int isz2 = gradInput->size[2];
+ int isz3 = gradInput->size[3];
+ int isz4 = 1;
+ if (idim > 4) {
+ isz4 = gradInput->size[4];
+ }
+
+ // get strides
+ long *is = gradInput->stride;
+ long *os = gradOutput->stride;
+
+ // get raw pointers
+ real *pin = THTensor_(data)(gradInput);
+ real *pout = THTensor_(data)(gradOutput);
+
+ // perform the upsampling
+ int i0, i1, i2, i3, i4, isrc, idst, x, y, z;
+ int iin[5]; // Input indices
+ int iout[5]; // Output indices
+
+ THTensor_(zero)(gradInput);
+
+ for (i0 = 0; i0 < isz0; i0++) {
+ iin[0] = i0;
+ iout[0] = i0;
+ for (i1 = 0; i1 < isz1; i1++) {
+ iin[1] = i1;
+ iout[1] = i1;
+ for (i2 = 0; i2 < isz2; i2++) {
+ iin[2] = i2;
+ iout[2] = i2;
+ for (i3 = 0; i3 < isz3; i3++) {
+ iin[3] = i3;
+ iout[3] = i3;
+
+ for (i4 = 0; i4 < isz4; i4++) {
+ iin[4] = i4;
+ iout[4] = i4;
+
+ idst = i0*is[0] + i1*is[1] + i2*is[2] + i3*is[3];
+ if (idim > 4) {
+ idst += i4*is[4];
+ }
+
+ // Now accumulate the gradients from gradOutput
+ for (z = 0; z < dT; z++) {
+ for (y = 0; y < dH; y++) {
+ for (x = 0; x < dW; x++) {
+ iout[xDim] = dW * iin[xDim] + x;
+ iout[yDim] = dH * iin[yDim] + y;
+ iout[zDim] = dT * iin[zDim] + z;
+ isrc = iout[0]*os[0] + iout[1]*os[1] + iout[2]*os[2] + iout[3]*os[3];
+ if (idim > 4) {
+ isrc += iout[4]*os[4];
+ }
+ pin[idst] += pout[isrc];
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+#endif
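
Both directions of the nearest-neighbour kernel above reduce to integer division
by the scale factor: output position o reads input position o / scale, and the
backward pass sums the scale outputs per axis that map to each input position.
A one-axis sketch under those assumptions (hypothetical function names):

#include <stdio.h>

/* Forward: out[o] = in[o / scale] along one axis. */
static void upsample_nearest_1d(const float *in, float *out, int isize, int scale)
{
  for (int o = 0; o < isize * scale; o++)
    out[o] = in[o / scale];
}

/* Backward: each input position accumulates the scale outputs mapped to it. */
static void upsample_nearest_grad_1d(float *gin, const float *gout, int isize, int scale)
{
  for (int i = 0; i < isize; i++) {
    gin[i] = 0.f;
    for (int s = 0; s < scale; s++)
      gin[i] += gout[i * scale + s];
  }
}

int main(void)
{
  const float in[2] = { 1.f, 2.f };
  float out[4], gin[2];
  upsample_nearest_1d(in, out, 2, 2);       /* out = 1 1 2 2 */
  upsample_nearest_grad_1d(gin, out, 2, 2); /* gin = 2 4 */
  printf("%g %g %g %g | %g %g\n", out[0], out[1], out[2], out[3], gin[0], gin[1]);
  return 0;
}
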
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c
new file mode 100644
index 000000000..f2b04dba9
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c
@@ -0,0 +1,213 @@
+// Adapted from interp.cpp from Caffe util by Pauline Luc
+// Originally developed by George Papandreou
+
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricUpSamplingTrilinear.c"
+#else
+
+static inline void THNN_(VolumetricUpSamplingTrilinear_shapeCheck)
+ (THTensor *input, THTensor *gradOutput,
+ int nBatch, int nChannels,
+ int inputDepth, int inputHeight, int inputWidth,
+ int outputDepth, int outputHeight, int outputWidth) {
+ THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0
+ && outputDepth > 0 && outputHeight > 0 && outputWidth > 0, 2,
+ "input and output sizes should be greater than 0,"
+ " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)",
+ inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth);
+ if (input != NULL) {
+ THNN_ARGCHECK(input->nDimension == 5, 2, input,
+ "5D input tensor expected but got: %s");
+ }
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nBatch);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, outputWidth);
+ }
+}
+
+void THNN_(VolumetricUpSamplingTrilinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputDepth,
+ int outputHeight,
+ int outputWidth){
+
+ int nbatch = THTensor_(size)(input, 0);
+ int channels = THTensor_(size)(input, 1);
+ int inputDepth = THTensor_(size)(input, 2);
+ int inputHeight = THTensor_(size)(input, 3);
+ int inputWidth = THTensor_(size)(input, 4);
+
+ THNN_(VolumetricUpSamplingTrilinear_shapeCheck)
+ (input, NULL,
+ nbatch, channels,
+ inputDepth, inputHeight, inputWidth,
+ outputDepth, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ THTensor_(resize5d)(output,
+ THTensor_(size)(input, 0),
+ THTensor_(size)(input, 1),
+ outputDepth, outputHeight, outputWidth);
+ THTensor_(zero)(output);
+ real *idata = THTensor_(data)(input);
+ real *odata = THTensor_(data)(output);
+ channels = nbatch * channels;
+ THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 &&
+ outputDepth > 0 && outputHeight > 0 && outputWidth > 0);
+ // special case: just copy
+ if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) {
+ for (int t2 = 0; t2 < outputDepth; ++t2) {
+ const int t1 = t2;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const int h1 = h2;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const int w1 = w2;
+ const real* pos1 = &idata[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+ real* pos2 = &odata[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos2[0] = pos1[0];
+ pos1 += inputWidth * inputHeight * inputDepth;
+ pos2 += outputWidth * outputHeight * outputDepth;
+ }
+ }
+ }
+ }
+ return;
+ }
+ const float rdepth = (outputDepth > 1) ? (float)(inputDepth - 1)/(outputDepth - 1) : 0.f;
+ const float rheight = (outputHeight > 1) ? (float)(inputHeight - 1)/(outputHeight - 1) : 0.f;
+ const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1) / (outputWidth - 1) : 0.f;
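+ /* align-corners ratios: output index o maps to the real-valued input
+ coordinate r * o, so the first and last samples of input and output
+ coincide; the integer part picks the lower neighbour and the
+ fractional part becomes the lambda blend weights below */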
+ for (int t2 = 0; t2 < outputDepth; ++t2) {
+ const float t1r = rdepth * t2;
+ const int t1 = t1r;
+ const int t1p = (t1 < inputDepth - 1) ? 1 : 0;
+ const real t1lambda = t1r - t1;
+ const real t0lambda = (real)1. - t1lambda;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const float h1r = rheight * h2;
+ const int h1 = h1r;
+ const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
+ const real h1lambda = h1r - h1;
+ const real h0lambda = (real)1. - h1lambda;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const float w1r = rwidth * w2;
+ const int w1 = w1r;
+ const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+ const real w1lambda = w1r - w1;
+ const real w0lambda = (real)1. - w1lambda;
+ const real* pos1 = &idata[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+ real* pos2 = &odata[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos2[0] = t0lambda * (h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p])
+ + h1lambda * (w0lambda * pos1[h1p * inputWidth]
+ + w1lambda * pos1[h1p * inputWidth + w1p]))
+ + t1lambda * (h0lambda * (w0lambda * pos1[t1p * inputHeight * inputWidth]
+ + w1lambda * pos1[t1p * inputHeight * inputWidth
+ + w1p])
+ + h1lambda * (w0lambda * pos1[t1p * inputHeight * inputWidth
+ + h1p * inputWidth]
+ + w1lambda * pos1[t1p * inputHeight * inputWidth
+ + h1p * inputWidth + w1p]));
+ pos1 += inputWidth * inputHeight * inputDepth;
+ pos2 += outputWidth * outputHeight * outputDepth;
+ }
+ }
+ }
+ }
+ THTensor_(free)(input);
+}
+
+void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)(
+ THNNState *state,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int nbatch,
+ int channels,
+ int inputDepth,
+ int inputHeight,
+ int inputWidth,
+ int outputDepth,
+ int outputHeight,
+ int outputWidth){
+
+ THNN_(VolumetricUpSamplingTrilinear_shapeCheck)
+ (NULL, gradOutput,
+ nbatch, channels,
+ inputDepth, inputHeight, inputWidth,
+ outputDepth, outputHeight, outputWidth);
+
+ THTensor_(resize5d)(gradInput, nbatch, channels, inputDepth, inputHeight, inputWidth);
+ THTensor_(zero)(gradInput);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ real *data1 = THTensor_(data)(gradInput);
+ real *data2 = THTensor_(data)(gradOutput);
+ channels = nbatch * channels;
+
+ // special case: same-size matching grids
+ if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) {
+ for (int t2 = 0; t2 < outputDepth; ++t2) {
+ const int t1 = t2;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const int h1 = h2;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const int w1 = w2;
+ real* pos1 = &data1[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+ const real* pos2 = &data2[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos1[0] += pos2[0];
+ pos1 += inputWidth * inputHeight * inputDepth;
+ pos2 += outputWidth * outputHeight * outputDepth;
+ }
+ }
+ }
+ }
+ return;
+ }
+ const float rdepth = (outputDepth > 1) ? (float)(inputDepth - 1)/(outputDepth - 1) : 0.f;
+ const float rheight = (outputHeight > 1) ? (float)(inputHeight - 1)/(outputHeight - 1) : 0.f;
+ const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1)/(outputWidth - 1) : 0.f;
+ for (int t2 = 0; t2 < outputDepth; ++t2) {
+ const float t1r = rdepth * t2;
+ const int t1 = t1r;
+ const int t1p = (t1 < inputDepth - 1) ? 1 : 0;
+ const real t1lambda = t1r - t1;
+ const real t0lambda = (real)1. - t1lambda;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const float h1r = rheight * h2;
+ const int h1 = h1r;
+ const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
+ const real h1lambda = h1r - h1;
+ const real h0lambda = (real)1. - h1lambda;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const float w1r = rwidth * w2;
+ const int w1 = w1r;
+ const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+ const real w1lambda = w1r - w1;
+ const real w0lambda = (real)1. - w1lambda;
+ real* pos1 = &data1[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+ const real* pos2 = &data2[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos1[0] += t0lambda * h0lambda * w0lambda * pos2[0];
+ pos1[w1p] += t0lambda * h0lambda * w1lambda * pos2[0];
+ pos1[h1p * inputWidth] += t0lambda * h1lambda * w0lambda * pos2[0];
+ pos1[h1p * inputWidth + w1p] += t0lambda * h1lambda * w1lambda * pos2[0];
+ pos1[t1p * inputHeight * inputWidth] += t1lambda * h0lambda * w0lambda * pos2[0];
+ pos1[t1p * inputHeight * inputWidth + w1p] += t1lambda * h0lambda * w1lambda * pos2[0];
+ pos1[t1p * inputHeight * inputWidth + h1p * inputWidth] += t1lambda * h1lambda * w0lambda * pos2[0];
+ pos1[t1p * inputHeight * inputWidth + h1p * inputWidth + w1p] += t1lambda * h1lambda * w1lambda * pos2[0];
+ pos1 += inputWidth * inputHeight * inputDepth;
+ pos2 += outputWidth * outputHeight * outputDepth;
+ }
+ }
+ }
+ }
+ THTensor_(free)(gradOutput);
+}
+
+#endif
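
The lambda terms above are the fractional parts of the real-valued source
coordinate; trilinear interpolation is the tensor product of three 1-D linear
interpolations, which is where the eight weighted terms in both kernels come
from. A 1-D sketch of the coefficient computation, using the same align-corners
ratio as the r{depth,height,width} values above (hypothetical helper name):

#include <stdio.h>

/* For output index o along one axis, derive the lower source index i0,
 * the offset to its upper neighbour (0 at the right border), and the
 * blend weight lambda1 of that upper neighbour. */
static void linear_coeffs(int o, int isize, int osize,
                          int *i0, int *i0p, float *lambda1)
{
  const float r = (osize > 1) ? (float)(isize - 1) / (osize - 1) : 0.f;
  const float src = r * o;          /* real-valued source coordinate */
  *i0 = (int)src;                   /* lower neighbour */
  *i0p = (*i0 < isize - 1) ? 1 : 0; /* clamped upper-neighbour offset */
  *lambda1 = src - (float)*i0;      /* weight of the upper neighbour */
}

int main(void)
{
  /* upsample a 3-sample axis to 5 samples:
   * out[o] = (1 - lambda1)*in[i0] + lambda1*in[i0 + i0p],
   * with one such weight pair per axis in the 3-D kernels above */
  for (int o = 0; o < 5; o++) {
    int i0, i0p; float l;
    linear_coeffs(o, 3, 5, &i0, &i0p, &l);
    printf("o=%d i0=%d i0p=%d lambda1=%.2f\n", o, i0, i0p, l);
  }
  return 0;
}
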
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/unfold.c b/contrib/lua-torch/nn/lib/THNN/generic/unfold.c
new file mode 100644
index 000000000..14a73b567
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/unfold.c
@@ -0,0 +1,166 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/unfold.c"
+#else
+
+/* note: unlike unfolded_copy, this accumulation cannot be parallelized as
+ finely, because overlapping kernel taps write to the same input locations */
+void THNN_(unfolded_acc)(
+ THTensor *finput,
+ THTensor *input,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ int nInputPlane,
+ int inputWidth,
+ int inputHeight,
+ int outputWidth,
+ int outputHeight)
+{
+ // This function assumes that
+ // outputHeight*dH does not overflow a long
+ // outputWidth*dW does not overflow a long
+
+ int nip;
+
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
+
+#pragma omp parallel for private(nip)
+ for(nip = 0; nip < nInputPlane; nip++)
+ {
+ int kw, kh, y, x;
+ long ix, iy;
+ for(kh = 0; kh < kH; kh++)
+ {
+ for(kw = 0; kw < kW; kw++)
+ {
+ real *src = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth);
+ real *dst = input_data + nip*((size_t)inputHeight*inputWidth);
+ if (padW > 0 || padH > 0) {
+ int lpad,rpad;
+ for(y = 0; y < outputHeight; y++) {
+ iy = (long)y*dH - padH + kh;
+ if (iy >= 0 && iy < inputHeight) {
+ if (dW==1){
+ ix = 0 - padW + kw;
+ lpad = fmaxf(0,padW-kw);
+ rpad = fmaxf(0,padW-(kW-kw-1));
+ real *dst_slice = dst+(size_t)iy*inputWidth+ix+lpad;
+ THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+lpad, 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */
+ }
+ else{
+ for (x=0; x<outputWidth; x++){
+ ix = (long)x*dW - padW + kw;
+ if (ix >= 0 && ix < inputWidth) {
+ real *dst_slice = dst+(size_t)iy*inputWidth+ix;
+ THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1);
+ }
+ }
+ }
+ }
+ }
+ } else {
+ for(y = 0; y < outputHeight; y++) {
+ iy = (long)y*dH + kh;
+ ix = 0 + kw;
+ if (dW == 1 ) {
+ real *dst_slice = dst+(size_t)iy*inputWidth+ix;
+ THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth, 1, outputWidth); /* note: THVector_add could handle 1 value better */
+ }else{
+ for(x = 0; x < outputWidth; x++) {
+ real *dst_slice = dst+(size_t)iy*inputWidth+ix+x*dW;
+ THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void THNN_(unfolded_copy)(
+ THTensor *finput,
+ THTensor *input,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ int nInputPlane,
+ int inputWidth,
+ int inputHeight,
+ int outputWidth,
+ int outputHeight)
+{
+ // This function assumes that
+ // kH*kW does not overflow an int
+ // nInputPlane*kH*kW does not overflow a long
+ // outputHeight*dH does not overflow a long
+ // outputWidth*dW does not overflow a long
+
+ long k;
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
+
+#pragma omp parallel for private(k)
+ for(k = 0; k < (long)nInputPlane*kH*kW; k++) {
+ long nip = k / (kH*kW);
+ long rest = k % (kH*kW);
+ long kh = rest / kW;
+ long kw = rest % kW;
+ int x, y;
+ long ix, iy;
+ real *dst = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth);
+ real *src = input_data + nip*((size_t)inputHeight*inputWidth);
+ if (padW > 0 || padH > 0) {
+ long lpad,rpad;
+ for(y = 0; y < outputHeight; y++) {
+ iy = (long)y*dH - padH + kh;
+ if (iy < 0 || iy >= inputHeight) {
+ memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth);
+ } else {
+ if (dW==1){
+ ix = 0 - padW + kw;
+ lpad = fmaxf(0,padW-kw);
+ rpad = fmaxf(0,padW-(kW-kw-1));
+ if (outputWidth-rpad-lpad <= 0) {
+ memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth);
+ } else {
+ if (lpad > 0) memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*lpad);
+ memcpy(dst+(size_t)y*outputWidth+lpad, src+(size_t)iy*inputWidth+ix+lpad, sizeof(real)*(outputWidth-rpad-lpad));
+ if (rpad > 0) memset(dst+(size_t)y*outputWidth + outputWidth - rpad, 0, sizeof(real)*rpad);
+ }
+ }
+ else{
+ for (x=0; x<outputWidth; x++){
+ ix = (long)x*dW - padW + kw;
+ if (ix < 0 || ix >= inputWidth)
+ memset(dst+(size_t)y*outputWidth+x, 0, sizeof(real)*1);
+ else
+ memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix, sizeof(real)*(1));
+ }
+ }
+ }
+ }
+ } else {
+ for(y = 0; y < outputHeight; y++) {
+ iy = (long)y*dH + kh;
+ ix = 0 + kw;
+ if (dW == 1)
+ memcpy(dst+(size_t)y*outputWidth, src+(size_t)iy*inputWidth+ix, sizeof(real)*outputWidth);
+ else{
+ for (x=0; x<outputWidth; x++)
+ memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix+(long)x*dW, sizeof(real)*(1));
+ }
+ }
+ }
+ }
+}
+
+#endif
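
For reference, unfolded_copy is the standard im2col transform: row (nip, kh, kw)
of finput holds, for every output location, the input pixel that kernel tap
sees, and unfolded_acc is its accumulating transpose (col2im). The unpadded
branch above is equivalent to this compact sketch (hypothetical name; no
padding, none of the memcpy/THVector fast paths):

#include <stddef.h>

/* im2col without padding: finput has nInputPlane*kH*kW rows of
 * outputHeight*outputWidth columns. */
static void im2col_nopad(const float *input, float *finput,
                         int nInputPlane, int inputHeight, int inputWidth,
                         int kH, int kW, int dH, int dW,
                         int outputHeight, int outputWidth)
{
  for (int nip = 0; nip < nInputPlane; nip++)
    for (int kh = 0; kh < kH; kh++)
      for (int kw = 0; kw < kW; kw++)
        for (int y = 0; y < outputHeight; y++)
          for (int x = 0; x < outputWidth; x++) {
            size_t row = ((size_t)nip * kH + kh) * kW + kw;
            size_t dst = row * outputHeight * outputWidth
                       + (size_t)y * outputWidth + x;
            size_t src = (size_t)nip * inputHeight * inputWidth
                       + (size_t)(y * dH + kh) * inputWidth
                       + (size_t)(x * dW + kw);
            finput[dst] = input[src];
          }
}

After this transform, a convolution becomes a single matrix multiply of the
weight matrix against finput, which is exactly how the SpatialConvolutionMM
family of kernels in this directory uses these helpers.
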